Repository: z1069614715/objectdetection_script Branch: master Commit: 02ba8c6fb2ad Files: 351 Total size: 2.2 MB Directory structure: gitextract_1c2iago4/ ├── .gitignore ├── Ultralytics-YOLO-project.md ├── bilibili-guide.md ├── cv-attention/ │ ├── A2Attention.py │ ├── BAM.py │ ├── Biformer.py │ ├── CAA.py │ ├── CBAM.py │ ├── CPCA.py │ ├── CloAttention.py │ ├── CoTAttention.py │ ├── CoordAttention.py │ ├── DAttention.py │ ├── ECA.py │ ├── ELA.py │ ├── EMA.py │ ├── EffectiveSE.py │ ├── GAM.py │ ├── GC.py │ ├── GE.py │ ├── LSKA.py │ ├── LSKBlock.py │ ├── MHSA.py │ ├── MLCA.py │ ├── MobileViTAttention.py │ ├── ParNetAttention.py │ ├── PolarizedSelfAttention.py │ ├── S2Attention.py │ ├── SE.py │ ├── SGE.py │ ├── SK.py │ ├── SequentialSelfAttention.py │ ├── ShuffleAttention.py │ ├── SimAM.py │ ├── TripletAttention.py │ └── readme.md ├── cvpr2025-deim-project.md ├── damo-yolo/ │ ├── Annotations/ │ │ └── ReadMe.md │ ├── JPEGImages/ │ │ └── ReadMe.md │ ├── readme.md │ └── voc2coco.py ├── data-offline-aug/ │ ├── object_detection_data_aug.py │ ├── readme.md │ └── segment_data_aug.py ├── mmdet-course/ │ ├── config/ │ │ ├── atss_r50_fpn_dyhead_1x_visdrone.py │ │ ├── cascade-rcnn_r50_fpn_1x_visdrone.py │ │ ├── ddq-detr-4scale_r50_8xb2-12e_visdrone.py │ │ ├── dino-4scale_r50_8xb2-12e_visdrone.py │ │ ├── faster-rcnn_r50_fpn_ciou_1x_visdrone.py │ │ ├── gfl_r50_fpn_1x_visdrone.py │ │ ├── retinanet_r50_fpn_1x_visdrone.py │ │ ├── rtmdet_tiny_8xb32-300e_visdrone.py │ │ ├── tood_r50_fpn_1x_visdrone.py │ │ └── yolox_tiny_8xb8-300e_visdrone.py │ ├── mmdet2yolo.py │ ├── readme.md │ └── yolo2coco.py ├── module-info/ │ ├── CVPR2023-SMPConv.md │ ├── CVPR2024-DCMPNet.md │ ├── CVPR2024-FADC.md │ ├── CVPR2024-PKINet.md │ ├── CVPR2024-ParameterNet.md │ ├── CVPR2024-RMT.md │ ├── CVPR2024-RepVIT.md │ ├── CVPR2024-Rewrite the Stars.md │ ├── CVPR2024-SFSConv.md │ ├── CVPR2024-TransNext.md │ ├── CVPR2024-UniRepLKNet.md │ ├── CVPR2025-BHViT.md │ ├── CVPR2025-DarkIR.md │ ├── CVPR2025-EVSSM.md │ ├── 
CVPR2025-EfficientViM.md │ ├── CVPR2025-FDConv.md │ ├── CVPR2025-GroupMamba.md │ ├── CVPR2025-LSNet.md │ ├── CVPR2025-MambaIRV2.md │ ├── CVPR2025-MambaOut.md │ ├── CVPR2025-MambaVision.md │ ├── CVPR2025-MobileMamba.md │ ├── CVPR2025-Mona.md │ ├── CVPR2025-OverLoCK.md │ ├── CVPR2025-SCSegamba.md │ ├── CVPR2025-Transformers without Normalization.md │ ├── CVPR2025-vHeat.md │ ├── ICLR2025-Pola.md │ ├── ICLR2025-ToST.md │ └── TPAMI2025-HyperYOLO.md ├── mutilmodel-project.md ├── objectdetection-tricks/ │ ├── readme.md │ ├── tricks_1.py │ ├── tricks_10.py │ ├── tricks_11.py │ ├── tricks_12.py │ ├── tricks_13.py │ ├── tricks_14.py │ ├── tricks_15.py │ ├── tricks_16.py │ ├── tricks_2.py │ ├── tricks_3.py │ ├── tricks_4.py │ ├── tricks_5.py │ ├── tricks_6.py │ ├── tricks_7.py │ ├── tricks_8.py │ └── tricks_9.py ├── readme.md ├── visdrone2019-benchmark/ │ └── readme.md ├── yolo/ │ ├── data.yaml │ ├── dataset/ │ │ ├── VOCdevkit/ │ │ │ ├── Annotations/ │ │ │ │ └── ReadMe.md │ │ │ ├── JPEGImages/ │ │ │ │ └── ReadMe.md │ │ │ └── txt/ │ │ │ └── ReadMe.md │ │ ├── split_data.py │ │ └── xml2txt.py │ └── readme.md ├── yolo-gradcam/ │ ├── README.md │ ├── yolov11_heatmap.py │ ├── yolov5_heatmap.py │ ├── yolov7_heatmap.py │ ├── yolov8_heatmap.py │ └── yolov9_heatmap.py └── yolo-improve/ ├── CAM.py ├── iou.py ├── paper.md ├── readme.md ├── rtdetr-compress.md ├── rtdetr-distill.md ├── rtdetr-project.md ├── ultralytics-yolo/ │ ├── get_COCO_metrice.py │ ├── heatmap.py │ ├── requirements.txt │ ├── train.py │ ├── val.py │ └── yolo2coco.py ├── yolov11-project.md ├── yolov5-AIFI.py ├── yolov5-AUX/ │ ├── benchmarks.py │ ├── data/ │ │ ├── Argoverse.yaml │ │ ├── GlobalWheat2020.yaml │ │ ├── ImageNet.yaml │ │ ├── Objects365.yaml │ │ ├── SKU-110K.yaml │ │ ├── VOC.yaml │ │ ├── VisDrone.yaml │ │ ├── coco.yaml │ │ ├── coco128-seg.yaml │ │ ├── coco128.yaml │ │ ├── hyps/ │ │ │ ├── hyp.Objects365.yaml │ │ │ ├── hyp.VOC.yaml │ │ │ ├── hyp.no-augmentation.yaml │ │ │ ├── hyp.scratch-high.yaml │ │ │ ├── 
hyp.scratch-low.yaml │ │ │ └── hyp.scratch-med.yaml │ │ ├── scripts/ │ │ │ ├── download_weights.sh │ │ │ ├── get_coco.sh │ │ │ ├── get_coco128.sh │ │ │ └── get_imagenet.sh │ │ └── xView.yaml │ ├── detect.py │ ├── export.py │ ├── hubconf.py │ ├── models/ │ │ ├── __init__.py │ │ ├── common.py │ │ ├── experimental.py │ │ ├── hub/ │ │ │ ├── anchors.yaml │ │ │ ├── yolov3-spp.yaml │ │ │ ├── yolov3-tiny.yaml │ │ │ ├── yolov3.yaml │ │ │ ├── yolov5-bifpn.yaml │ │ │ ├── yolov5-fpn.yaml │ │ │ ├── yolov5-p2.yaml │ │ │ ├── yolov5-p34.yaml │ │ │ ├── yolov5-p6.yaml │ │ │ ├── yolov5-p7.yaml │ │ │ ├── yolov5-panet.yaml │ │ │ ├── yolov5l6.yaml │ │ │ ├── yolov5m6.yaml │ │ │ ├── yolov5n6.yaml │ │ │ ├── yolov5s-LeakyReLU.yaml │ │ │ ├── yolov5s-ghost.yaml │ │ │ ├── yolov5s-transformer.yaml │ │ │ ├── yolov5s6.yaml │ │ │ └── yolov5x6.yaml │ │ ├── segment/ │ │ │ ├── yolov5l-seg.yaml │ │ │ ├── yolov5m-seg.yaml │ │ │ ├── yolov5n-seg.yaml │ │ │ ├── yolov5s-seg.yaml │ │ │ └── yolov5x-seg.yaml │ │ ├── tf.py │ │ ├── yolo.py │ │ ├── yolov5_aux.yaml │ │ ├── yolov5l.yaml │ │ ├── yolov5m.yaml │ │ ├── yolov5n.yaml │ │ ├── yolov5s.yaml │ │ └── yolov5x.yaml │ ├── train.py │ ├── utils/ │ │ ├── __init__.py │ │ ├── activations.py │ │ ├── augmentations.py │ │ ├── autoanchor.py │ │ ├── autobatch.py │ │ ├── aws/ │ │ │ ├── __init__.py │ │ │ ├── mime.sh │ │ │ ├── resume.py │ │ │ └── userdata.sh │ │ ├── callbacks.py │ │ ├── dataloaders.py │ │ ├── docker/ │ │ │ ├── Dockerfile │ │ │ ├── Dockerfile-arm64 │ │ │ └── Dockerfile-cpu │ │ ├── downloads.py │ │ ├── flask_rest_api/ │ │ │ ├── README.md │ │ │ ├── example_request.py │ │ │ └── restapi.py │ │ ├── general.py │ │ ├── google_app_engine/ │ │ │ ├── Dockerfile │ │ │ ├── additional_requirements.txt │ │ │ └── app.yaml │ │ ├── loggers/ │ │ │ ├── __init__.py │ │ │ ├── clearml/ │ │ │ │ ├── README.md │ │ │ │ ├── __init__.py │ │ │ │ ├── clearml_utils.py │ │ │ │ └── hpo.py │ │ │ └── comet/ │ │ │ ├── README.md │ │ │ ├── __init__.py │ │ │ ├── comet_utils.py │ │ │ ├── hpo.py │ 
│ │ └── optimizer_config.json │ │ ├── loss.py │ │ ├── metrics.py │ │ ├── plots.py │ │ ├── segment/ │ │ │ ├── __init__.py │ │ │ ├── augmentations.py │ │ │ ├── dataloaders.py │ │ │ ├── general.py │ │ │ ├── loss.py │ │ │ ├── metrics.py │ │ │ └── plots.py │ │ ├── torch_utils.py │ │ └── triton.py │ └── val.py ├── yolov5-C3RFEM.py ├── yolov5-CARAFE.py ├── yolov5-CCFM.py ├── yolov5-ContextAggregation.py ├── yolov5-CoordConv.py ├── yolov5-DBB.py ├── yolov5-DCN.py ├── yolov5-DCNV3/ │ ├── commod.py │ └── ops_dcnv3/ │ ├── functions/ │ │ ├── __init__.py │ │ └── dcnv3_func.py │ ├── make.sh │ ├── modules/ │ │ ├── __init__.py │ │ └── dcnv3.py │ ├── setup.py │ ├── src/ │ │ ├── cpu/ │ │ │ ├── dcnv3_cpu.cpp │ │ │ └── dcnv3_cpu.h │ │ ├── cuda/ │ │ │ ├── dcnv3_cuda.cu │ │ │ ├── dcnv3_cuda.h │ │ │ └── dcnv3_im2col_cuda.cuh │ │ ├── dcnv3.h │ │ └── vision.cpp │ └── test.py ├── yolov5-DSConv.py ├── yolov5-DecoupledHead.py ├── yolov5-DySnakeConv.py ├── yolov5-EVC.py ├── yolov5-FasterBlock.py ├── yolov5-GFPN/ │ ├── extra_modules.py │ └── yolov5_GFPN.yaml ├── yolov5-GOLDYOLO/ │ ├── common.py │ ├── yolo.py │ ├── yolov5n-goldyolo.yaml │ ├── yolov7-goldyolo.yaml │ └── yolov7-tiny-goldyolo.yaml ├── yolov5-NWD.py ├── yolov5-OTA/ │ └── loss.py ├── yolov5-RepNCSPELAN.py ├── yolov5-SAConv.py ├── yolov5-TSCODE.py ├── yolov5-aLRPLoss.py ├── yolov5-asf.py ├── yolov5-backbone/ │ ├── CVPR2023-EfficientViT/ │ │ └── EfficientViT.py │ ├── CVPR2024-StarNet/ │ │ └── starnet.py │ ├── ConvNextV2/ │ │ └── convnextv2.py │ ├── EMO/ │ │ └── emo.py │ ├── EfficientFormerV2/ │ │ └── EfficientFormerV2.py │ ├── EfficientViT/ │ │ └── efficientViT.py │ ├── FocalNet/ │ │ └── FocalNet.py │ ├── LSKNet/ │ │ └── lsknet.py │ ├── MobileNetV4/ │ │ └── mobilenetv4.py │ ├── NextViT/ │ │ └── NextViT.py │ ├── ODConv/ │ │ ├── od_mobilenetv2.py │ │ ├── od_resnet.py │ │ └── odconv.py │ ├── ODConvFuse/ │ │ ├── od_mobilenetv2.py │ │ ├── od_resnet.py │ │ └── odconv.py │ ├── PoolFormer/ │ │ └── poolformer.py │ ├── RIFormer/ │ │ └── 
RIFormer.py │ ├── RepViT/ │ │ └── repvit.py │ ├── SwinTransformer/ │ │ └── SwinTransformer.py │ ├── UniRepLKNet/ │ │ └── unireplknet.py │ ├── VanillaNet/ │ │ └── VanillaNet.py │ ├── fasternet/ │ │ ├── faster_cfg/ │ │ │ ├── fasternet_l.yaml │ │ │ ├── fasternet_m.yaml │ │ │ ├── fasternet_s.yaml │ │ │ ├── fasternet_t0.yaml │ │ │ ├── fasternet_t1.yaml │ │ │ └── fasternet_t2.yaml │ │ └── fasternet.py │ ├── inceptionnext/ │ │ └── inceptionnext.py │ ├── main.py │ ├── yolo.py │ └── yolov5-custom.yaml ├── yolov5-dyhead.py ├── yolov5-res2block.py ├── yolov5-softnms.py ├── yolov5v7-light.md ├── yolov7-CoordConv.py ├── yolov7-DBB.py ├── yolov7-DCN.py ├── yolov7-DCNV3.py ├── yolov7-DSConv.py ├── yolov7-DecoupledHead.py ├── yolov7-DySnakeConv.py ├── yolov7-EVC.py ├── yolov7-MPDiou.py ├── yolov7-NWD.py ├── yolov7-PConv.py ├── yolov7-RFEM.py ├── yolov7-RepNCSPELAN.py ├── yolov7-SAConv.py ├── yolov7-asf.py ├── yolov7-head/ │ ├── yolov7-tiny-5-heads.yaml │ ├── yolov7-tiny-P2.yaml │ └── yolov7-tiny-P6.yaml ├── yolov7-iou.py ├── yolov7-odconv.py ├── yolov7-slimneck.py ├── yolov7-softnms.py ├── yolov8-DCN.py ├── yolov8-compress.md ├── yolov8-distill.md ├── yolov8-erf.py ├── yolov8-objectcount.py ├── yolov8-track.py ├── yolov8.py ├── yolov8v10-project.md └── yolov9-backbone/ ├── yolo.py └── yolov9-c-custom.yaml ================================================ FILE CONTENTS ================================================ ================================================ FILE: .gitignore ================================================ # Byte-compiled / optimized / DLL files __pycache__/ *.py[cod] *$py.class # C extensions *.so # Distribution / packaging .Python build/ develop-eggs/ dist/ downloads/ eggs/ .eggs/ lib/ lib64/ parts/ sdist/ var/ wheels/ pip-wheel-metadata/ share/python-wheels/ *.egg-info/ .installed.cfg *.egg MANIFEST # PyInstaller # Usually these files are written by a python script from a template # before PyInstaller builds the exe, so as to inject date/other infos into 
it. *.manifest *.spec # Installer logs pip-log.txt pip-delete-this-directory.txt # Unit test / coverage reports htmlcov/ .tox/ .nox/ .coverage .coverage.* .cache nosetests.xml coverage.xml *.cover *.py,cover .hypothesis/ .pytest_cache/ # Translations *.mo *.pot # Django stuff: *.log local_settings.py db.sqlite3 db.sqlite3-journal # Flask stuff: instance/ .webassets-cache # Scrapy stuff: .scrapy # Sphinx documentation docs/_build/ # PyBuilder target/ # Jupyter Notebook .ipynb_checkpoints # IPython profile_default/ ipython_config.py # Profiling *.pclprof # pyenv .python-version # pipenv # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. # However, in case of collaboration, if having platform-specific dependencies or dependencies # having no cross-platform support, pipenv may install dependencies that don't work, or not # install all needed dependencies. #Pipfile.lock # PEP 582; used by e.g. github.com/David-OConnor/pyflow __pypackages__/ # Celery stuff celerybeat-schedule celerybeat.pid # SageMath parsed files *.sage.py # Environments .env .venv .idea env/ venv/ ENV/ env.bak/ venv.bak/ # Spyder project settings .spyderproject .spyproject # VSCode project settings .vscode/ # Rope project settings .ropeproject # mkdocs documentation /site mkdocs_github_authors.yaml # mypy .mypy_cache/ .dmypy.json dmypy.json # Pyre type checker .pyre/ # datasets and projects datasets/ runs/ wandb/ tests/ logs/ .DS_Store # Neural Network weights ----------------------------------------------------------------------------------------------- weights/ *.weights *.pt *.pb *.onnx *.engine *.mlmodel *.mlpackage *.torchscript *.tflite *.h5 *_saved_model/ *_web_model/ *_openvino_model/ *_paddle_model/ pnnx* # Autogenerated files for tests /ultralytics/assets/ # dataset cache *.cache ================================================ FILE: Ultralytics-YOLO-project.md ================================================ # Ultralytics-YOLO项目详细说明 1. 
本项目集成了YOLOv8、v10、v11、v12乃至前沿的YOLO26等全系列基础模型。 无论是做横向对比实验,还是纵向的版本改进,无需到处找资源,一个项目就能满足你所有的实验需求! 2. 核心代码已实现高度模块化与解耦,专为新手优化。 你完全不需要死磕底层复杂代码,只需像搭积木一样简单修改YAML配置文件,就能轻松实现各种改进模块的自由组合。 3. 面对日益内卷的YOLO赛道,简单的“缝合”已难满足毕业要求。 本项目不仅提供现成的创新方案,更配套独家“二次创新”课程,授人以渔。我们将手把手教你掌握模块设计的底层逻辑,助你从“模仿者”进阶为“创造者”,设计出独属于你的创新模块。 4. 针对有代码基础但受困于Ultralytics复杂架构的同学, 本项目引入了来自DFine、DEIM项目中成熟的“万物皆可融”架构思想。你无需纠结模块注册等信息,只需遵循我所提供的标准接口规范,即可将自定义魔改模块无缝融入YAML配置,与各类CSP变种灵活结合。 5. 实验跑通了,却不知道如何写创新点? 本项目将定期拆解高分论文,传授写作心法,教你如何将实验成果转化为逻辑严密、亮点突出的高质量学术论文,解决写作难题! 6. 毕业设计缺少高大上的展示界面? 别担心,项目会内置基于PyQt或HTML的通用可视化界面,开箱即用,完美补齐毕业论文的最后一块拼图,助你从容应对答辩! 7. 购买即享专属技术交流群, 这里有业内公认的高效答疑服务,以及志同道合的伙伴互助交流。拒绝闭门造车,让我们带你避开深坑,高效通关! ## 针对于已经入手了yolov8/yolo11项目的同学来说,如果你有以下几点需求,可以考虑追加入手! 1. 想用最新的YOLO26做实验!而且本项目支持v8、v10、11、12、26全系列版本! 2. 想深入学习改进创新的同学,本项目会附带二次创新的通用教程,手把手教你设计出属于自己的创新模块! 3. 做完实验不知道怎么写论文?本项目会定期拆解高分论文案例,教你如何把实验结果写成逻辑清晰、亮点突出的高质量学术论文 4. 想自己魔改模块的同学!本项目提供超级简单的模块注册方式,只需按照教程操作,就能轻松注册自己的模块,还能和各种CSP变种随意组合! ## 模块列表(这些模块均已在代码中注册好,只需要修改yaml可以直接实验) - ultralytics/nn/extra_modules/attention 1. ultralytics/nn/extra_modules/attention/SEAM.py 2. CVPR2021|ultralytics/nn/extra_modules/attention/ca.py 3. ICASSP2023|ultralytics/nn/extra_modules/attention/ema.py 4. ICML2021|ultralytics/nn/extra_modules/attention/simam.py 5. ICCV2023|ultralytics/nn/extra_modules/attention/lsk.py 6. WACV2024|ultralytics/nn/extra_modules/attention/DeformableLKA.py 7. ultralytics/nn/extra_modules/attention/mlca.py 8. BIBM2024|ultralytics/nn/extra_modules/attention/FSA.py 9. AAAI2025|ultralytics/nn/extra_modules/attention/CDFA.py 10. TGRS2025|ultralytics/nn/extra_modules/attention/MCA.py 11. CVPR2025|ultralytics/nn/extra_modules/attention/CASAB.py 12. NN2025|ultralytics/nn/extra_modules/attention/KSFA.py 13. TPAMI2025|ultralytics/nn/extra_modules/attention/GQL.py 14. TGRS2025|ultralytics/nn/extra_modules/attention/ACA.py 15. TGRS2025|ultralytics/nn/extra_modules/attention/DHPF.py 16. 
TGRS2025|ultralytics/nn/extra_modules/attention/ACAB.py - ultralytics/nn/extra_modules/conv_module(此部分内容教程可以看GuideVideo-MG.md中的改进模块-使用教程的第五节) 1. CVPR2021|ultralytics/nn/extra_modules/conv_module/dbb.py 2. TIP2024|ultralytics/nn/extra_modules/conv_module/deconv.py 3. ICCV2023|ultralytics/nn/extra_modules/conv_module/dynamic_snake_conv.py 4. CVPR2023|ultralytics/nn/extra_modules/conv_module/pconv.py 5. AAAI2025|ultralytics/nn/extra_modules/conv_module/psconv.py 6. CVPR2025|ultralytics/nn/extra_modules/conv_module/ShiftwiseConv.py 7. ultralytics/nn/extra_modules/conv_module/wdbb.py 8. ultralytics/nn/extra_modules/conv_module/deepdbb.py 9. ECCV2024|ultralytics/nn/extra_modules/conv_module/wtconv2d.py 10. CVPR2023|ultralytics/nn/extra_modules/conv_module/ScConv.py 11. ultralytics/nn/extra_modules/conv_module/dcnv2.py 12. CVPR2024|ultralytics/nn/extra_modules/conv_module/DilatedReparamConv.py 13. ultralytics/nn/extra_modules/conv_module/gConv.py 14. CVPR2024|ultralytics/nn/extra_modules/conv_module/IDWC.py 15. ultralytics/nn/extra_modules/conv_module/DSA.py 16. CVPR2025|ultralytics/nn/extra_modules/conv_module/FDConv.py 17. CVPR2023|ultralytics/nn/extra_modules/conv_module/dcnv3.py 18. CVPR2024|ultralytics/nn/extra_modules/conv_module/dcnv4.py 19. CVPR2024|ultralytics/nn/extra_modules/conv_module/DynamicConv.py 20. CVPR2024|ultralytics/nn/extra_modules/conv_module/FADC.py 21. CVPR2023|ultralytics/nn/extra_modules/conv_module/SMPConv.py 22. MIA2025|ultralytics/nn/extra_modules/conv_module/FourierConv.py 23. CVPR2024|ultralytics/nn/extra_modules/conv_module/SFSConv.py 24. ICCV2025|ultralytics/nn/extra_modules/conv_module/MBRConv.py 25. ICCV2025|ultralytics/nn/extra_modules/conv_module/ConvAttn.py 26. ICCV2025|ultralytics/nn/extra_modules/conv_module/Converse2D.py 27. CVPR2025|ultralytics/nn/extra_modules/conv_module/gcconv.py 28. ACCV2024|ultralytics/nn/extra_modules/conv_module/RMBC.py 29. 
CVPR2026|ultralytics/nn/extra_modules/conv_module/DEGConv.py - ultralytics/nn/extra_modules/stem 1. ultralytics/nn/extra_modules/stem/SRFD.py 2. ultralytics/nn/extra_modules/stem/LoG.py 3. ICCV2023|ultralytics/nn/extra_modules/stem/RepStem.py - ultralytics/nn/extra_modules/upsample 1. CVPR2024|ultralytics/nn/extra_modules/upsample/eucb.py 2. CVPR2024|ultralytics/nn/extra_modules/upsample/eucb_sc.py 3. ultralytics/nn/extra_modules/upsample/WaveletUnPool.py 4. ICCV2019|ultralytics/nn/extra_modules/upsample/CARAFE.py 5. ICCV2023|ultralytics/nn/extra_modules/upsample/DySample.py 6. ICCV2025|ultralytics/nn/extra_modules/upsample/Converse2D_Up.py 7. CVPR2025|ultralytics/nn/extra_modules/upsample/DSUB.py - ultralytics/nn/extra_modules/downsample 1. TIP2020|ultralytics/nn/extra_modules/downsample/gcnet.py 2. 自研模块|ultralytics/nn/extra_modules/downsample/lawds.py 3. ultralytics/nn/extra_modules/downsample/WaveletPool.py 4. ultralytics/nn/extra_modules/downsample/ADown.py 5. ultralytics/nn/extra_modules/downsample/YOLOV7Down.py 6. ultralytics/nn/extra_modules/downsample/SPDConv.py 7. ultralytics/nn/extra_modules/downsample/HWD.py 8. ultralytics/nn/extra_modules/downsample/DRFD.py 9. TGRS2025|ultralytics/nn/extra_modules/conv_module/FSConv.py - ultralytics/nn/extra_modules/module 1. AAAI2025|ultralytics/nn/extra_modules/module/APBottleneck.py 2. CVPR2025|ultralytics/nn/extra_modules/module/efficientVIM.py 3. CVPR2023|ultralytics/nn/extra_modules/module/fasterblock.py 4. CVPR2024|ultralytics/nn/extra_modules/module/starblock.py 5. ultralytics/nn/extra_modules/module/DWR.py 6. CVPR2024|ultralytics/nn/extra_modules/module/UniRepLKBlock.py 7. CVPR2025|ultralytics/nn/extra_modules/module/mambaout.py 8. AAAI2024|ultralytics/nn/extra_modules/module/DynamicFilter.py 9. ultralytics/nn/extra_modules/module/StripBlock.py 10. TGRS2024|ultralytics/nn/extra_modules/module/elgca.py 11. CVPR2024|ultralytics/nn/extra_modules/module/LEGM.py 12. 
ICCV2023|ultralytics/nn/extra_modules/module/iRMB.py 13. TPAMI2025|ultralytics/nn/extra_modules/module/MSBlock.py 14. ICLR2024|ultralytics/nn/extra_modules/module/FATBlock.py 15. CVPR2024|ultralytics/nn/extra_modules/module/MSCB.py 16. ultralytics/nn/extra_modules/module/LEGBlock.py 17. ultralytics/nn/extra_modules/module/GLSA.py 18. CVPR2025|ultralytics/nn/extra_modules/module/RCB.py 19. ECCV2024|ultralytics/nn/extra_modules/module/JDPM.py 20. CVPR2025|ultralytics/nn/extra_modules/module/vHeat.py 21. CVPR2025|ultralytics/nn/extra_modules/module/EBlock.py 22. CVPR2025|ultralytics/nn/extra_modules/module/DBlock.py 23. ECCV2024|ultralytics/nn/extra_modules/module/FMB.py 24. CVPR2024|ultralytics/nn/extra_modules/module/IDWB.py 25. ECCV2022|ultralytics/nn/extra_modules/module/LFE.py 26. AAAI2025|ultralytics/nn/extra_modules/module/FCM.py 27. CVPR2024|ultralytics/nn/extra_modules/module/RepViTBlock.py 28. CVPR2024|ultralytics/nn/extra_modules/module/PKIModule.py 29. CVPR2024|ultralytics/nn/extra_modules/module/camixer.py 30. ICCV2025|ultralytics/nn/extra_modules/module/ESC.py 31. CVPR2025|ultralytics/nn/extra_modules/module/nnWNet.py 32. TGRS2025|ultralytics/nn/extra_modules/module/ARF.py 33. AAAI2024|ultralytics/nn/extra_modules/module/CFBlock.py 34. IJCV2024|ultralytics/nn/extra_modules/module/FMA.py 35. ultralytics/nn/extra_modules/module/LWGA.py 36. TGRS2025|ultralytics/nn/extra_modules/module/CSSC.py 37. TGRS2025|ultralytics/nn/extra_modules/module/CNCM.py 38. ICCV2025|ultralytics/nn/extra_modules/module/HFRB.py 39. ICIP2025|ultralytics/nn/extra_modules/module/EVA.py 40. CVPR2025|ultralytics/nn/extra_modules/module/IEL.py 41. MICCAI2023|ultralytics/nn/extra_modules/module/MFEBlock.py 42. AAAI2026|ultralytics/nn/extra_modules/module/PartialNetBlock.py 43. TGRS2025|ultralytics/nn/extra_modules/module/DRG.py 44. ultralytics/nn/extra_modules/module/Wave2D.py 45. TGRS2025|ultralytics/nn/extra_modules/module/GLGM.py 46. 
TGRS2025|ultralytics/nn/extra_modules/module/MAC.py 47. AAAI2026|ultralytics/nn/extra_modules/module/SPJFB.py - ultralytics/nn/extra_modules/block 1. ultralytics/nn/extra_modules/block/CSPBlock.py 2. TPAMI2025|ultralytics/nn/extra_modules/block/MANet.py 3. TPAMI2024|ultralytics/nn/extra_modules/block/MetaFormer.py - ultralytics/nn/extra_modules/transformer 1. ICLR2025|ultralytics/nn/extra_modules/transformer/PolaLinearAttention.py 2. CVPR2023|ultralytics/nn/extra_modules/transformer/biformer.py 3. CVPR2023|ultralytics/nn/extra_modules/transformer/CascadedGroupAttention.py 4. CVPR2022|ultralytics/nn/extra_modules/transformer/DAttention.py 5. ICLR2022|ultralytics/nn/extra_modules/transformer/DPBAttention.py 6. CVPR2024|ultralytics/nn/extra_modules/transformer/AdaptiveSparseSA.py 7. ultralytics/nn/extra_modules/transformer/GSA.py 8. ultralytics/nn/extra_modules/transformer/RSA.py 9. ECCV2024|ultralytics/nn/extra_modules/transformer/FSSA.py 10. AAAI2025|ultralytics/nn/extra_modules/transformer/DilatedGCSA.py 11. AAAI2025|ultralytics/nn/extra_modules/transformer/DilatedMWSA.py 12. CVPR2024|ultralytics/nn/extra_modules/transformer/SHSA.py 13. IJCAI2024|ultralytics/nn/extra_modules/transformer/CTA.py 14. IJCAI2024|ultralytics/nn/extra_modules/transformer/SFA.py 15. ultralytics/nn/extra_modules/transformer/MSLA.py 16. ACMMM2025|ultralytics/nn/extra_modules/transformer/CPIA_SA.py 17. NN2025|ultralytics/nn/extra_modules/transformer/TokenSelectAttention.py 18. CVPR2025|ultralytics/nn/extra_modules/transformer/TAB.py 19. TPAMI2025|ultralytics/nn/extra_modules/transformer/LRSA.py 20. ICCV2025|ultralytics/nn/extra_modules/transformer/MALA.py 21. ICML2023|ultralytics/nn/extra_modules/transformer/MUA.py 22. ACMMM2025|ultralytics/nn/extra_modules/transformer/EGSA.py 23. ACMMM2025|ultralytics/nn/extra_modules/transformer/SWSA.py 24. AAAI2026|ultralytics/nn/extra_modules/transformer/DHOGSA.py 25. NeurIPS2025|ultralytics/nn/extra_modules/transformer/CBSA.py 26. 
TGRS2025|ultralytics/nn/extra_modules/transformer/DPWA.py 27. TIP2025|ultralytics/nn/extra_modules/transformer/DWM_MSA.py 28. CVPR2026|ultralytics/nn/extra_modules/transformer/BinaryAttention.py 29. CVPR2025|ultralytics/nn/extra_modules/transformer/wca.py - ultralytics/nn/extra_modules/mamba 1. AAAI2025|ultralytics/nn/extra_modules/mamba/SS2D.py 2. CVPR2025|ultralytics/nn/extra_modules/mamba/ASSM.py 3. CVPR2025|ultralytics/nn/extra_modules/mamba/SAVSS.py 4. CVPR2025|ultralytics/nn/extra_modules/mamba/MobileMamba/mobilemamba.py 5. CVPR2025|ultralytics/nn/extra_modules/mamba/MaIR.py 6. TGRS2025|ultralytics/nn/extra_modules/mamba/GLVSS.py 7. ICCV2025|ultralytics/nn/extra_modules/mamba/VSSD.py 8. ICCV2025|ultralytics/nn/extra_modules/mamba/TinyViM.py 9. INFFUS2025|ultralytics/nn/extra_modules/mamba/CSI.py 10. TIP2025|ultralytics/nn/extra_modules/mamba/SFMB.py 11. TGRS2025|ultralytics/nn/extra_modules/mamba/GLSS.py 12. TGRS2025|ultralytics/nn/extra_modules/mamba/GLSS2D.py 13. CVPR2026|ultralytics/nn/extra_modules/mamba/TransMixer.py - ultralytics/nn/extra_modules/mlp 1. CVPR2024|ultralytics/nn/extra_modules/mlp/ConvolutionalGLU.py 2. IJCAI2024|ultralytics/nn/extra_modules/mlp/DFFN.py 3. ICLR2024|ultralytics/nn/extra_modules/mlp/FMFFN.py 4. CVPR2024|ultralytics/nn/extra_modules/mlp/FRFN.py 5. ECCV2024|ultralytics/nn/extra_modules/mlp/EFFN.py 6. WACV2025|ultralytics/nn/extra_modules/mlp/SEFN.py 7. ICLR2025|ultralytics/nn/extra_modules/mlp/KAN.py 8. CVPR2025|ultralytics/nn/extra_modules/mlp/EDFFN.py 9. IJCV2024|ultralytics/nn/extra_modules/mlp/DML.py 10. AAAI2026|ultralytics/nn/extra_modules/mlp/DIFF.py - ultralytics/nn/extra_modules/neck 1. ultralytics/nn/extra_modules/neck/ASF.py 2. ultralytics/nn/extra_modules/neck/BiFPN.py 3. AAAI2022|ultralytics/nn/extra_modules/neck/CTrans.py 4. ultralytics/nn/extra_modules/neck/EfficientRepBiPAN.py 5. ultralytics/nn/extra_modules/neck/GFPN.py 6. ultralytics/nn/extra_modules/neck/HSFPN.py 7. 
AAAI2025|ultralytics/nn/extra_modules/neck/HS_FPN.py 8. TPAMI2025|ultralytics/nn/extra_modules/neck/HyperComputeModule.py 9. ultralytics/nn/extra_modules/neck/SlimNeck.py 10. ultralytics/nn/extra_modules/neck/GoldYOLO.py 11. ultralytics/nn/extra_modules/neck/EMBSFPN.py - ultralytics/nn/extra_modules/featurefusion 1. 自研模块|ultralytics/nn/extra_modules/featurefusion/cgfm.py 2. BMVC2024|ultralytics/nn/extra_modules/featurefusion/msga.py 3. CVPR2024|ultralytics/nn/extra_modules/featurefusion/mfm.py 4. TIP2023|ultralytics/nn/extra_modules/featurefusion/CSFCN.py 5. BIBM2024|ultralytics/nn/extra_modules/featurefusion/mpca.py 6. ACMMM2024|ultralytics/nn/extra_modules/featurefusion/wfu.py 7. CVPR2025|ultralytics/nn/extra_modules/featurefusion/GDSAFusion.py 8. ultralytics/nn/extra_modules/featurefusion/PST.py 9. TGRS2025|ultralytics/nn/extra_modules/featurefusion/MSAM.py 10. INFFUS2025|ultralytics/nn/extra_modules/featurefusion/DPCF.py 11. CVPR2025|ultralytics/nn/extra_modules/featurefusion/LCA.py 12. TGRS2025|ultralytics/nn/extra_modules/featurefusion/HFFE.py 13. TGRS2025|ultralytics/nn/extra_modules/featurefusion/MFPM.py 14. TGRS2025|ultralytics/nn/extra_modules/featurefusion/ERM.py 15. TIP2025|ultralytics/nn/extra_modules/featurefusion/CAFM.py 16. TIP2024|ultralytics/nn/extra_modules/featurefusion/CGAFusion.py 17. IF2023|ultralytics/nn/extra_modules/featurefusion/PSFM.py 18. IF2023|ultralytics/nn/extra_modules/featurefusion/SDFM.py 19. 自研模块|ultralytics/nn/extra_modules/featurefusion/DAF.py 20. 自研模块|ultralytics/nn/extra_modules/featurefusion/CIDAF.py 21. 自研模块|ultralytics/nn/extra_modules/featurefusion/WDAF.py - ultralytics/nn/extra_modules/norm 1. ICML2024|engine/extre_module/custom_nn/transformer/repbn.py 2. CVPR2025|engine/extre_module/custom_nn/transformer/dyt.py 3. engine/extre_module/custom_nn/norm/derf.py - ultralytics/nn/extra_modules/featurepreprocess 1. 
TGRS2025|ultralytics/nn/extra_modules/featurepreprocess/FAENet.py - ultralytics/nn/extra_modules/head(ultralytics/cfg/models/improve/head) 1. ultralytics/nn/extra_modules/head/LSPCD.py ## Loss 列表 #### 默认配置(兼容) - cls_loss=bce - iou_loss=ciou - iou_aux=none - cls_loss(分类损失) 1. bce 2. slide 3. ema_slide 4. focal 5. varifocal 6. qualityfocal - iou_loss(IoU主损失) 1. 基础形式: iou、giou、diou、ciou、eiou、siou、shapeiou、piou、piou2 2. Inner形式: inner_(例如:inner_diou、inner_ciou、inner_siou) 3. Focaler形式: focaler_(例如:focaler_diou、focaler_ciou、focaler_siou) 4. MPDIoU家族: mpdiou、inner_mpdiou、focaler_mpdiou 5. WiseIoU家族: wiseiou(等价wiseiou_wiou) wiseiou_ wiseiou_inner_ wiseiou_focaler_ 6. wise 可选值: iou、wiou、giou、diou、ciou、eiou、siou、shapeiou、piou、piou2、mpdiou - iou_aux(IoU辅助损失) 1. none 2. gcd 3. nwd ## 更新公告 - 20260217 1. 初版项目发布. 2. 新增使用教程、模块改进使用教程视频. - 20260228 1. 新增常见的cls和iou的损失,并直接支持在train.py里面指定,并且在训练的时候会打印目前的loss. 2. 对模型改进的yaml扩展到yolov8、yolov10、yolo11、yolo12. 3. 新增在训练过程中mAP75输出. 4. 优化detect.py中的特征图保存机制,使其可以单独保存每一个通道的特征图和总通道求和的特征图. 5. 新增毕业必备-基于web的可视化界面,支持选择模型、检测图片、检测视频,显示目标数量等等功能 6. 新增web界面的教程视频. 7. 新增注册module的教程视频. - 20260308 1. 在val.py脚本中增加auto_coco_eval指标,支持一步到位计算COCO指标,不需要再人为转换标签和对齐标签的问题! 2. 新增AAAI2026-SPJFB模块. 3. 新增TGRS2025-GLSS2D模块. 4. 新增TIP2025-CAFM模块. 5. 新增TIP2025-DWM_MSA模块. 6. 新增DynamicERF模块. 7. 新增CSP、MetaFormer、Module在yaml中的使用教程-20260307补充版的视频. 8. 修复用户反馈的bug. - 20260315 1. 新增CVPR2026-DEGConv模块。 2. 新增CVPR2026-BinaryAttention模块。 3. 新增CVPR2026-TransMixer模块。 4. 新增CVPR2025-wca模块。 5. 新增自研模块-DAF模块。 6. 新增自研模块-CIDAF模块。 7. 新增自研模块-WDAF模块。 8. 新增Neck部分内容(ASF、BIFPN、CTrans、ERepBIFPN、GFPN、HSFPN、HS-FPN、超图FPN、SlimNeck、GoldYOLO、EMBSFPN)。 9. 补全attention部分的配置文件。 10. 新增conv、attention的内容如何与CSP模块随意组合的使用教程。 11. 修复用户反馈的bug。 ================================================ FILE: bilibili-guide.md ================================================ # 魔鬼面具-哔哩哔哩视频指南 ### 必看干货系列(建议搞深度学习的小伙伴都看看,特别是图像相关) 1. [深度学习常见实验问题与实验技巧(适用于所有模型,小白初学者必看!)](https://www.bilibili.com/video/BV17j41147j8/) 2. 
[还在迷茫深度学习中的改进实验应该从哪里开始改起的同学,一定要进来看看了!用自身经验给你推荐实验顺序!](https://www.bilibili.com/video/BV1Nu4y1G7B9/) 3. [探究深度学习中预训练权重对改进和精度的影响!](https://www.bilibili.com/video/BV1FH4y1o7GL/) 4. [什么?你说你不会画模型结构图?行吧,那你进来看看吧,手把手教你画YAML结构图!](https://www.bilibili.com/video/BV1X94y1K76Z/) 5. [探究深度学习中训练中的可重现性](https://www.bilibili.com/video/BV1Nu4y1s7sc/) 6. [什么?你说你更换主干后看不懂配置文件也不懂画结构图?那你快点进来看看了!](https://www.bilibili.com/video/BV1WA4m1V7nQ/) 7. [从三个角度分析,什么条件才算是一个合格的改进专栏!](https://www.bilibili.com/video/BV1E6421g7eb/) 8. [都2024了,你写论文不会还只用p,r,map这些指标分析目标检测模型吧?](https://www.bilibili.com/video/BV1wF4m177JQ/) 9. [从简到难手把手教你画Pytorch模块内的结构图!](https://www.bilibili.com/video/BV1dC411p7H7/) 10. [深度学习论文实验中的其中一大注意点-预训练权重究竟加还是不加?](https://www.bilibili.com/video/BV1Q1421Q7Zw/) 11. [深度学习改进实验必看!基于YOLOV8的WIDER-FACE改进(轻量化+提点)实验思路讲解](https://www.bilibili.com/video/BV1QJ4m1H7DJ/) 12. [YOLOV8-硬塞注意力机制?这样做没创新!想知道注意力怎么用才有创新那赶快来看看!](https://www.bilibili.com/video/BV1bm421K7tf/) 13. [YOLOV8改进-还硬塞注意力机制?这期用注意力机制手把手给大家自研一个ContextGuideFPN!创新真的不难,需要找对方法!](https://www.bilibili.com/video/BV1Vx4y1n7hZ/) 14. [长达46分钟的肺腑之言!给以后想从事图像算法工程师、小白入门深度学习路线的总结!](https://www.bilibili.com/video/BV16y411h7T9/) 15. [提升多少才能发paper?轻量化需要看什么指标?需要轻量化到什么程度才能发paper?这期给大家一一解答!](https://www.bilibili.com/video/BV1QZ421M7gu/) 16. [深度学习实验部分常见疑问解答!(小白刚入门必看!少走弯路!少自我内耗!)](https://www.bilibili.com/video/BV1Bz421B7pC/) ``` 1. 如何衡量自己的所做的工作量够不够? 2. 为什么别人的论文说这个模块对xxx有作用,但是我自己用的时候还掉点了? 3. 提升是和什么模型相比呢 比如和yolov8这种基础模型比还是和别人提出的目前最好的模型比 4. 对比不同的模型的时候,输入尺寸,学习率,学习次数这些是否需要一致? ``` 17. [深度学习实验部分常见疑问解答二!(小白刚入门必看!少走弯路!少自我内耗!)](https://www.bilibili.com/video/BV1ZM4m1m785/) ``` 1. 为什么我用yolov8自带的coco8、coco128训练出来的效果很差? 2. 我的数据集很大,机器跑得慢,我是否可以用数据集的百分之10的数据去测试这个改进点是否有效?有效再跑整个数据集? ``` 18. [深度学习实验部分常见疑问解答三!(怎么判断模型是否收敛?模型过拟合怎么办?)](https://www.bilibili.com/video/BV11S421d76P/) 19. [YOLO系列模型训练结果详细解答!(训练过程的一些疑问,该放哪个文件运行出来的结果、参数量计算量在哪里看..等等问题)](https://www.bilibili.com/video/BV11b421J7Vx/) 20. 
[细谈目标检测中的小目标检测头和大目标检测检测头,并教懂你怎么加微小目标、极大目标检测头!](https://www.bilibili.com/video/BV1jkDWYFEwx/) 21. [深度学习炼丹必备必看必须知道的小技巧!](https://www.bilibili.com/video/BV1q3SZYsExc/) 22. [深度学习实验准备-数据集怎么选?有哪些需要注意的点?](https://www.bilibili.com/video/BV11zySYvEhs/) 23. [深度学习论文实验中新手非常容易陷入的一个误区:抱着解决xxx问题的心态去做实验](https://www.bilibili.com/video/BV1kkkvYJEHG/) 24. [小目标检测必看系列 | 除了AP-Small指标,可还有AP-VeryTiny、AP-Tiny的指标喔~手把手带你加!](https://www.bilibili.com/video/BV1CYcUeBEzY/) 25. [YOLO中的实例分割原来是这样巧妙地实现的!你在做YOLO-Seg但是又不知道的话,那你要进来看看咯~](https://www.bilibili.com/video/BV1SkP1e1EHC/) 26. [长达30分钟的吐血讲解!为什么别人的纯YOLO小目标检测能上AAAI2025,你的连个最差的都费劲!看看差距在哪里,怎么改善!](https://www.bilibili.com/video/BV14DJazTEtV) 27. [深度学习论文中的基础实验、改进实验、 消融实验、对比实验、泛化实验|这些究竟是什么?](https://www.bilibili.com/video/BV1NYKUz2E6b/) 28. [深度学习论文中的推理结果图、热力图、特征图究竟应该怎么放?需要注意什么?有什么作用?](https://www.bilibili.com/video/BV1s5gQzcEPh/) 29. [YOLO|RTDETR|我会跑Ultralytics了!但是输出的这些都怎么看呀?论文中的结果写什么呀?需要注意什么呀?](https://www.bilibili.com/video/BV1VfbVzHEGM/) ### 服务器租用系列 1. [|DAModel|竟然有一个"不需要装环境就能跑YOLO代码"的服务器平台?让我们一起来看看!](https://www.bilibili.com/video/BV1mg2SYGEGF) 2. [|DAModel|给大家准备好COCO、VOC、VisDrone、CrowdHuman、BDD100K数据集啦~YOLO格式和data.yaml都已配置好~](https://www.bilibili.com/video/BV1UV5qzuEGf) 3. [智算云扉服务器平台|0.99每小时的3090?RTX4090-48GB的显卡?已经配置好的YOLO|RTDETR环境?充值还有额外算力点?标题有限制优势说不完。](https://www.bilibili.com/video/BV11DXTYiENS) ### 必看论文分享系列 1. [有营养的必看论文分享系列一-RTMDet<考虑到精度、速度、部署的2D目标检测网络>](https://www.bilibili.com/video/BV1ab421J77G/) 2. [有营养的必看论文分享系列二-MobileNets<轻量化的开山之作>](https://www.bilibili.com/video/BV1hM4m117JW/) 3. [计算机视觉|YOLO|DETR|2025创新必看的论文之一|MetaFormer(TPAMI2024),选对Baseline是成功的第一步](https://www.bilibili.com/video/BV1W5ATetEg6/) ### 高区论文带读系列 1. [高区论文带读系列一-40分钟长视频带你分析一篇SCI1区的文章,SCI1区也不是触不可及!](https://www.bilibili.com/video/BV1JESuYxEjn/) 2. [高区论文带读系列二-学会捕捉数据集场景下的要害问题是写好文章的第一步!](https://www.bilibili.com/video/BV1XNqjYNEyg/) ### YOLO系列配置文件系列 1. [不会把多个改进整合到一个yaml配置文件里面?那来看看这个吧!从简到难手把手带你整合三个yaml](https://www.bilibili.com/video/BV15H4y1Y7a2/) 2. 
[细谈目标检测中的小目标检测头和大目标检测检测头,并教懂你怎么加微小目标、极大目标检测头!](https://www.bilibili.com/video/BV1jkDWYFEwx/) 3. [不会看YOLO的模型yaml配置文件?那你还怎么整合多个配置文件!](https://www.bilibili.com/video/BV1oiBRYnEEw/) 4. [不会把多个创新点整合到一个yaml配置文件里面?那来看看这个吧!手把手来你整合创新点!](https://www.bilibili.com/video/BV1DUBRYGE3b/) ### YOLOV5,V7-PYQT5项目讲解 1. [哔哩哔哩合集地址](https://space.bilibili.com/286900343/channel/collectiondetail?sid=917275) 2. [项目github地址](https://github.com/z1069614715/yolov7-pyqt) ### YOLOV5、V7、V8、V9、V10、V11、V12 热力图源码 1. [哔哩哔哩合集地址](https://space.bilibili.com/286900343/channel/collectiondetail?sid=1080305) 2. [项目github地址](https://github.com/z1069614715/objectdetection_script/blob/master/yolo-gradcam) ### YOLO系列模型使用教程系列 1. [YOLOV7保姆级教程](https://www.bilibili.com/video/BV1gD4y1s7zw/?spm_id_from=333.999.0.0) 2. [YOLOV5-Seg实例分割教程](https://www.bilibili.com/video/BV1nV4y1P7HQ/?spm_id_from=333.999.0.0) 3. [YOLOV5-快速上手教程](https://www.bilibili.com/video/BV1tM411a7it/?spm_id_from=333.999.0.0) 4. [YOLOV8-OBB详细教学视频(包含如何把DOTA数据集分割成小图进行训练)](https://www.bilibili.com/video/BV1xK4y117fg/) 5. [EfficientTeacher半监督-详细教学和调参注意事项](https://www.bilibili.com/video/BV1494y1v7hF/) 6. [YOLOV9保姆级别教程来啦~包含环境配置、数据集转换、训练、测试、推理环节~一看就懂!](https://www.bilibili.com/video/BV1d1421z7XW/) 7. [保姆级别YOLOV11-环境配置、 数据集介绍、训练、验证、推理 详细教学视频,看了它,跑YOLOV11 没问题~](https://www.bilibili.com/video/BV1VA11YBELB/) ### YOLOV8V11源码常见疑问解答小课堂 1. [关于配置文件中Optimizer参数为auto的时候,究竟Optimizer会怎么选用呢?](https://www.bilibili.com/video/BV1K34y1w7cZ/) 2. [best.pt究竟是根据什么指标来保存的?](https://www.bilibili.com/video/BV1jN411M7MA/) 3. [数据增强在yolov8中的应用](https://www.bilibili.com/video/BV1aQ4y1g7ah/) 4. [如何添加FPS计算代码和FPS的相关的一些疑问](https://www.bilibili.com/video/BV1Sw411g7DD/) 5. [预测框粗细颜色修改与精度小数位修改](https://www.bilibili.com/video/BV12K421a7rH/) 6. [导出改进/剪枝的onnx模型和讲解onnx-opset和onnxsim的作用](https://www.bilibili.com/video/BV1CK421e7Y3/) 7. [YOLOV8模型详细讲解(包含该如何改进YOLOV8)(刚入门小白,需要改进YOLOV8的同学必看!)](https://www.bilibili.com/video/BV1Ms421u7VH/) 8. 
[学习率变化问题](https://www.bilibili.com/video/BV1frnferEL1/) ### 目标检测干活系列 1. [深入了解目标检测中的检测头](https://www.bilibili.com/video/BV1AQ4y1j7Cr/) 2. [目标检测中的标签分配策略做了什么?分配过程中的正负样本又是什么?](https://www.bilibili.com/video/BV1Ek4aeUE2J/) ### 环境配置系列教程 1. [保姆式AUTODL-YOLO环境教程(上):从0教你如何配置VSCODE、安装新环境和CUDA和CUDNN、跑通YOLOV8、编译DCNV3](https://www.bilibili.com/video/BV1tT4y1b75q/) 2. [保姆式AUTODL-YOLO环境教程(下):从0教你如何配置VSCODE、安装新环境和CUDA和CUDNN、跑通YOLOV8、编译DCNV3](https://www.bilibili.com/video/BV1nV411Q7mA/) ### 目标检测Tricks 1. [可视化并统计目标检测中的TP,FP,FN](https://www.bilibili.com/video/BV1yM4y1d7Gp/) 2. [深度学习小实验-卷积家族(fps,flops,param)对比实验](https://www.bilibili.com/video/BV1UL411R7Qr/) 3. [yolov5中的FeatureMap可视化(热力图格式)](https://www.bilibili.com/video/BV1LV4y1R7w6/) 4. [用于yolov5和v7中的yolo格式转换coco格式的脚本.](https://www.bilibili.com/video/BV14T411s7Ts/) 5. [Segment Anything演示代码](https://www.bilibili.com/video/BV1hv4y1H7eg/) 6. [固定随机种子在同一个主机上极可能地复现结果](https://www.bilibili.com/video/BV1bh4y1n7Yc/) 7. [计算yolov5推理时间和FPS的脚本](https://www.bilibili.com/video/BV1Uu4y1C714/) 8. [计算yolov7推理时间和FPS的脚本](https://www.bilibili.com/video/BV17p4y177Pe/) 9. [深度学习小实验-YOLO-Block家族(fps,flops,param)对比实验.](https://www.bilibili.com/video/BV17H4y1V7s9/) 10. [输出YOLOV8、RTDETR各个层的计算量和参数量.](https://www.bilibili.com/video/BV1tb421b7aB/) 11. [YOLOV8-不会把PR曲线的数据保存并绘制到一张图?不用怕,手把手教程来啦~](https://www.bilibili.com/video/BV1uC41177oE/) 12. [yolov5、v7、v8、v9、v10曲线对比图、推理时间vs精度对比图绘制手把手教程!](https://www.bilibili.com/video/BV1yf421X7t5/) 13. [YOLOV8-输出每一层的图特征图尺寸和通道数.](https://www.bilibili.com/video/BV1Mz421B7xz/) 14. [YOLOV8V10V11V12更详细的输出精度结果](https://www.bilibili.com/video/BV1dBQDY6Ec5/) 15. [关于数据集的可视化脚本](https://www.bilibili.com/video/BV1k2TizGEnH/) ### MMDet系列教程 1. [一库打尽目标检测对比实验!mmdetection环境、训练、测试手把手教程!](https://www.bilibili.com/video/BV1xA4m1c7H8/) 2. [一库打尽目标检测对比实验!mmdetection参数量、计算量、FPS、绘制logs手把手教程](https://www.bilibili.com/video/BV17C41137dW/) 3. [一库打尽目标检测对比实验!mmdetection指标转换YOLO指标!](https://www.bilibili.com/video/BV1AWtCesEc6/) ### 离线数据增强教程 1. 
[目标检测数据集离线数据增强教程,包含对目标框、多种变换、天气变化等等增强!](https://www.bilibili.com/video/BV1bT421k7iq/) 2. [语义分割数据集离线数据增强教程,包含对mask、多种变换、天气变化等等增强!](https://www.bilibili.com/video/BV1xi421a7Gb/) 3. [CVPR2025-SaMam|手把手带你用以Mamba为核心的任意风格迁移网络去做数据集扩充!(一个小创新点有了!)](https://www.bilibili.com/video/BV1gWE4z4Eqq/) ### YOLO系列(YOLOV5,YOLOV7,YOLOV8)模型改进大合集 #### YOLOV5(主干系列修改V7同样也适用) 1. [添加EIOU,SIOU,ALPHA-IOU, FocalEIOU到yolov5的box_iou中](https://www.bilibili.com/video/BV1KM411b7Sz/) 2. [Wise-IoU](https://www.bilibili.com/video/BV1tG4y1N7Gk/) 3. [使用DAMO-YOLO中的GFPN替换YOLOV5中的Head](https://www.bilibili.com/video/BV1iR4y1a7bx/) 4. [使用DAMO-YOLO中的GFPN替换YOLOV5中的Head](https://www.bilibili.com/video/BV1iR4y1a7bx/) 5. [使用yolov8中的C2F模块替换yolov5中的C3模块.](https://www.bilibili.com/video/BV1rx4y1g7xt/) 6. [添加Optimal Transport Assignment到yolov5的Loss中](https://www.bilibili.com/video/BV1xD4y1J76n/) 7. [添加Deformable convolution V2到yolov5中](https://www.bilibili.com/video/BV1rT411Q76q/) 8. [添加辅助训练分支到yolov5中](https://www.bilibili.com/video/BV1Fo4y1v7bi/) 9. [添加context augmentation module到yolov5中](https://www.bilibili.com/video/BV17b411d7ef/) 10. [添加SAC到yolov5中](https://www.bilibili.com/video/BV1xD4y1u7NU/) 11. [添加CoordConv到yolov5中](https://www.bilibili.com/video/BV1ng4y1E7rS/) 12. [添加soft-nms(IoU,GIoU,DIoU,CIoU,EIoU,SIoU)到yolov5中](https://www.bilibili.com/video/BV1cM41147Ry/) 13. [添加DSConv到yolov5中](https://www.bilibili.com/video/BV1iT411a7Mi/) 14. [添加DCNV3到yolov5中.](https://www.bilibili.com/video/BV1LY411z7iE/) 15. [添加Normalized Gaussian Wasserstein Distance到yolov5中.](https://www.bilibili.com/video/BV1zY4y197UP/) 16. [添加Efficient-DecoupledHead到yolov5中](https://www.bilibili.com/video/BV1mk4y1h7us/) 17. [添加FasterNet中的Faster-Block到yolov5中](https://www.bilibili.com/video/BV1Bs4y1H7Ph/) 18. [添加Timm支持的主干到yolov5中.](https://www.bilibili.com/video/BV1Mx4y1A7jy/) 19. [添加Task-Specific Context Decoupling到yolov5中](https://www.bilibili.com/video/BV1mk4y1h7us/) 20. [添加FasterNet主干到yolov5中](https://www.bilibili.com/video/BV1ra4y1K77u/) 21. 
[添加Omni-Dimensional Dynamic Convolution主干(od_mobilenetv2,od_resnet)到yolov5中](https://www.bilibili.com/video/BV1Jk4y1v7EW/) 22. [融合Omni-Dimensional Dynamic Convolution主干(od_mobilenetv2,od_resnet)中的Conv和BN](https://www.bilibili.com/video/BV1Rs4y1N7fp/) 23. [添加轻量级上采样算子CARAFE到yolov5中](https://www.bilibili.com/video/BV1kj411c72a/) 24. [添加CFPNet中的EVC-Block到yolov5中](https://www.bilibili.com/video/BV1Pg4y1u7cM/) 25. [添加基于注意力机制的目标检测头(DYHEAD)到yolov5中](https://www.bilibili.com/video/BV1qs4y117Mx/) 26. [添加(2023年New)InceptionNeXt主干到yolov5中](https://www.bilibili.com/video/BV12v4y1H7E1/) 27. [添加aLRPLoss到yolov5中](https://www.bilibili.com/video/BV1YV4y1Z7rV/) 28. [结合Res2Net提出具有多尺度提取能力的C3模块](https://www.bilibili.com/video/BV13X4y167VB/) 29. [添加(2022年)FocalNet(transformer)主干到yolov5中](https://www.bilibili.com/video/BV1ch411L7Dk/) 30. [添加(2023年)EMO(transformer)主干到yolov5中](https://www.bilibili.com/video/BV1Dh4y1J7SV/) 31. [添加(2022年)EfficientFormerV2(transformer)主干到yolov5中](https://www.bilibili.com/video/BV1da4y1g7KT/) 32. [添加(2022年CVPR)PoolFormer(transformer)主干到yolov5中](https://www.bilibili.com/video/BV1eh411c7bz/) 33. [添加(2023年)EfficientViT(transformer)主干到yolov5中](https://www.bilibili.com/video/BV1xk4y1L7Gu/) 34. [添加ContextAggregation到yolov5中](https://www.bilibili.com/video/BV1Yk4y1s7Kx/) 35. [添加(2023年)VanillaNet主干到yolov5中](https://www.bilibili.com/video/BV1os4y1v7Du/) 36. [添加(2022年)NextViT主干到yolov5中](https://www.bilibili.com/video/BV1im4y1i7Ht/) 37. [添加(2023年)RIFormer主干到yolov5中](https://www.bilibili.com/video/BV1bW4y1X7Lo/) 38. [Scale-Aware RFE与C3结合而成的C3RFEM添加到yolov5中](https://www.bilibili.com/video/BV1Gj411D7Pf/) 39. [把重参数结构DiverseBranchBlock与C3融合成C3-DBB添加到yolov5中](https://www.bilibili.com/video/BV1sM4y177Cn/) 40. [添加(2023CVPR)EfficientViT(transformer)主干到yolov5中](https://www.bilibili.com/video/BV1xk4y1L7Gu/) 41. [添加(2023旋转目标检测SOTA)LSKNet主干到yolov5中](https://www.bilibili.com/video/BV1xk4y1L7Gu/) 42. [添加(2023最新IoU度量算法)MPDiou到yolov5中.](https://www.bilibili.com/video/BV19P41147gJ/) 43. 
[添加Yolo-Face-V2中SlideLoss的到yolov5中](https://www.bilibili.com/video/BV1W14y1i79U/) 44. [添加RepViT(transformer)主干到yolov5中](https://www.bilibili.com/video/BV1PH4y1S7mf/) 45. [利用华为2023最新GOLD-YOLO中的Gatherand-Distribute进行改进YOLOV5中的特征融合模](https://www.bilibili.com/video/BV1PH4y1S7mf/) 46. [利用动态蛇形卷积改进YOLOV5](https://www.bilibili.com/video/BV1Qu411K7Hw/) 47. [利用带有位置信息编码的AIFI自注意力机制改进YOLOV5](https://www.bilibili.com/video/BV1nu4y1h7eS/) 48. [添加UniRepLKNet主干到yolov5中](https://www.bilibili.com/video/BV1PH4y1S7mf/) 49. [添加Attentional Scale Sequence Fusion到yolov5中](https://www.bilibili.com/video/BV1PH4y1S7mf/) 50. [添加cross-scale feature-fusion到yolov5中](https://www.bilibili.com/video/BV1Tb4y1P7yd/) 51. [添加对小目标有效的BiFormer注意力机制到yolov5中](https://www.bilibili.com/video/BV15g4y1g7bM/) 52. [引入最新SOTA(YOLOV9)中的RepNCSPELAN模块](https://www.bilibili.com/video/BV17y421z73k/) #### YOLOV7 1. [添加EIOU,SIOU,ALPHA-IOU, FocalEIOU到yolov5的box_iou中](https://www.bilibili.com/video/BV1zx4y177EF/) 2. [Wise-IoU](https://www.bilibili.com/video/BV1yv4y147kf/) 3. [添加Deformable convolution V2到yolov7中](https://www.bilibili.com/video/BV17R4y1q7vr/) 4. [添加SAC到yolov7中](https://www.bilibili.com/video/BV1xD4y1u7NU/) 5. [添加CoordConv到yolov7中](https://www.bilibili.com/video/BV1K54y1g7ye/) 6. [添加soft-nms(IoU,GIoU,DIoU,CIoU,EIoU,SIoU)到yolov7中](https://www.bilibili.com/video/BV1ZY41167iC/) 7. [添加DSConv到yolov7中](https://www.bilibili.com/video/BV1724y1b7PD/) 8. [添加DCNV3到yolov7中.](https://www.bilibili.com/video/BV1mk4y1h7us/) 9. [添加Normalized Gaussian Wasserstein Distance到yolov7中](https://www.bilibili.com/video/BV1kM411H7g1/) 10. [添加具有隐式知识学习的Efficient-DecoupledHead到yolov7中](https://www.bilibili.com/video/BV1tg4y1x7ha/) 11. [添加FasterNet中的PConv到yolov7中](https://www.bilibili.com/video/BV1Z84y137oi/) 12. [添加轻量级上采样算子CARAFE到yolov7中.](https://www.bilibili.com/video/BV1yc411p7wL/) 13. [添加基于注意力机制的目标检测头(DYHEAD)到yolov7中](https://www.bilibili.com/video/BV1Ph4y1s7i9/) 14. 
[添加Omni-Dimensional Dynamic Convolution到yolov7中](https://www.bilibili.com/video/BV1vh411j71Z/) 15. [添加CFPNet中的EVC-Block到yolov7中](https://www.bilibili.com/video/BV12u4y1f7np/) 16. [P2,P6检测层在YOLOV7中的添加](https://www.bilibili.com/video/BV1LX4y1a72m/) 17. [使用VOVGSCSP轻量化yolov7的Neck](https://www.bilibili.com/video/BV14m4y147PC/) 18. [添加SwinTransformer-Tiny主干到yolov5中](https://www.bilibili.com/video/BV1WX4y1a7ea/) 19. [Scale-Aware RFE添加到yolov7中](https://www.bilibili.com/video/BV1hW4y1D7gQ/) 20. [把重参数结构DiverseBranchBlock添加到yolov7中](https://www.bilibili.com/video/BV14u411b7kL/) 21. [添加(2023最新IoU度量算法)MPDiou到yolov7中](https://www.bilibili.com/video/BV1Qh4y1r7D3/) 22. [利用华为2023最新GOLD-YOLO中的Gatherand-Distribute进行改进YOLOV7中的特征融合模块.](https://www.bilibili.com/video/BV14V411c7H1/) 23. [利用动态蛇形卷积改进YOLOV7](https://www.bilibili.com/video/BV1Wj411x7fq/) 24. [利用带有位置信息编码的AIFI自注意力机制改进YOLOV7](https://www.bilibili.com/video/BV1rj411a7s4/) 25. [添加Attentional Scale Sequence Fusion到yolov7中](https://www.bilibili.com/video/BV1PH4y1S7mf/) 26. [引入最新SOTA(YOLOV9)中的RepNCSPELAN模块](https://www.bilibili.com/video/BV1UA4m137hz/) #### YOLOV8 1. [添加EIOU,SIOU,ALPHA-IOU, FocalEIOU到yolov5,yolov8的box_iou中](https://www.bilibili.com/video/BV1PY4y1o7Hm/) 2. [Wise-IoU](https://www.bilibili.com/video/BV1De4y1N7Mb/) 3. [添加Deformable convolution V2到yolov8中](https://www.bilibili.com/video/BV1Fo4y1i7Mm/) 4. [最新~YOLOV8手把手教学配置文件添加注意力机制!一看就会!](https://www.bilibili.com/video/BV1RH4y1D7CY/) 5. [YOLOV8改进-手把手带你学会注意力机制进阶用法](https://www.bilibili.com/video/BV1ZQ4y1J7oC/) 6. [YOLOV8可视化-可视化并统计每张图的True Positive、False Positive、False Negative](https://www.bilibili.com/video/BV1RA4m1L79K/) 7. [YOLOV8-基于VisDrone的TaskAlignedAssigner任务对齐分配策略的调参实验](https://www.bilibili.com/video/BV1XJ4m1x7eJ/) 8. [YOLOV8-不会把多个改进整合到一个yaml配置文件里面?那来看看这个吧!从简到难手把手带你整合三个yaml](https://www.bilibili.com/video/BV15H4y1Y7a2/) 9. [YOLOV8下游任务系列-一步一步DEBUG保姆式带你完成目标计数](https://www.bilibili.com/video/BV17H4y1J7DD/) 10. 
[YOLOV8改进-带你分析V8的检测头并重设计10种结构轻量化检测头](https://www.bilibili.com/video/BV1cu411K7FE/) 11. [从CVPR2022-RepLKNet分析有效感受野,并提供YOLOV8可视化感受野的脚本和讲解~](https://www.bilibili.com/video/BV1Gx4y1v7ZZ/) 12. [YOLOV8-不会把PR曲线的数据保存并绘制到一张图?不用怕,手把手教程来啦~](https://www.bilibili.com/video/BV1uC41177oE/) 13. [YOLOV8应用NMS-Free效果怎么样?在Visdrone2019数据集上进行实验,效果不错!后处理时间为0.0ms!](https://www.bilibili.com/video/BV1bt421N7ob/) 14. [YOLOV8-NMSFree|更多公开数据集测试!VisDrone、VOC、PCB](https://www.bilibili.com/video/BV1nZ421x7jr/) 15. [YOLOV8模型详细讲解(包含该如何改进YOLOV8)(刚入门小白,需要改进YOLOV8的同学必看!)](https://www.bilibili.com/video/BV1Ms421u7VH/) #### YOLOV9 1. [YOLOV9-VisDrone实验对比结果来啦!YOLOV9-C模型VisDrone测试集精度为39.7!有兴趣进来看看具体啦!](https://www.bilibili.com/video/BV1Yy42187A3/) 2. [从源码分析YOLOV9比YOLOV7多了什么内容!](https://www.bilibili.com/video/BV1v1421f7rN/) 3. [YOLOV9n VS YOLOV8n,在VisDrone数据集上精度有2.4个点的提升!](https://www.bilibili.com/video/BV16m411f78L/) 4. [YOLOV9改进-更换轻量化王者MobilenetV4-Backbone](https://www.bilibili.com/video/BV1Ax4y1B7Ln/) 5. [YOLOV9改进-CVPR2024-StarNet、DRepCSPELAN](https://www.bilibili.com/video/BV1BU411o7rz/) 6. [YOLOV9改进-CVPR2023-FasterNet以及其FasterBlock、PConv的改进](https://www.bilibili.com/video/BV18y411a74y/) 7. [YOLOV9改进-DySnakeConv动态蛇形卷积、针对长条形不规则物体!](https://www.bilibili.com/video/BV1gi421S77X/) #### YOLOV11 1. [Ultralytics8.3.0沉浸式讲解-YOLOV11针对代码的详细剖析](https://www.bilibili.com/video/BV19XxxeXEma/) 2. [保姆级别YOLOV11-环境配置、 数据集介绍、训练、验证、推理 详细教学视频,看了它,跑YOLOV11 没问题~](https://www.bilibili.com/video/BV1VA11YBELB/) 3. [YOLOV11改进详细分析(改进前必看),每个部分(Backbone、Neck、Head....)有哪些地方可以改进?改进的时候要避免小白三件套!](https://www.bilibili.com/video/BV1GKCdYbEuz/) #### YOLOV13 1. [哎哟你干嘛!YOLO又又又又出新版本了,YOLOV13来了!我们来看看YOLOV13改进了什么,对正在做YOLO改进的同学有什么影响?](https://www.bilibili.com/video/BV1jqKbzGEua/) #### D-Fine-ICLR2025 1. [暴打CVPR2024-RTDETR的D-Fine究竟性能如何?我们一起来训练看看~](https://www.bilibili.com/video/BV1aE6aYHEer/) #### DEIM-CVPR2025 1. 
class DoubleAttention(nn.Module):
    """Double-attention block (A^2-Nets style) for NCHW feature maps.

    The block first gathers the whole feature map into ``c_m x c_n`` global
    descriptors and then distributes those descriptors back to every spatial
    position.

    Args:
        in_channels: Number of channels of the input feature map.
        c_m: Output channels of the feature branch ``convA``.
        c_n: Output channels of the two attention branches ``convB``/``convV``.
        reconstruct: If True, a 1x1 convolution maps the ``c_m`` channels of
            the result back to ``in_channels``.
    """

    def __init__(self, in_channels, c_m=128, c_n=128, reconstruct=True):
        super().__init__()
        self.in_channels = in_channels
        self.reconstruct = reconstruct
        self.c_m = c_m
        self.c_n = c_n
        self.convA = nn.Conv2d(in_channels, c_m, 1)
        self.convB = nn.Conv2d(in_channels, c_n, 1)
        self.convV = nn.Conv2d(in_channels, c_n, 1)
        if self.reconstruct:
            self.conv_reconstruct = nn.Conv2d(c_m, in_channels, kernel_size=1)
        self.init_weights()

    def init_weights(self):
        """Re-initialise every Conv2d / BatchNorm2d / Linear sub-module."""
        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                init.kaiming_normal_(m.weight, mode='fan_out')
                if m.bias is not None:
                    init.constant_(m.bias, 0)
            elif isinstance(m, nn.BatchNorm2d):
                init.constant_(m.weight, 1)
                init.constant_(m.bias, 0)
            elif isinstance(m, nn.Linear):
                init.normal_(m.weight, std=0.001)
                if m.bias is not None:
                    init.constant_(m.bias, 0)

    def forward(self, x):
        """Apply double attention.

        Args:
            x: Tensor of shape ``(b, in_channels, h, w)``.

        Returns:
            Tensor of shape ``(b, in_channels, h, w)`` when ``reconstruct`` is
            True, otherwise ``(b, c_m, h, w)``.
        """
        b, c, h, w = x.shape
        assert c == self.in_channels
        A = self.convA(x)  # b, c_m, h, w
        B = self.convB(x)  # b, c_n, h, w
        V = self.convV(x)  # b, c_n, h, w
        tmpA = A.reshape(b, self.c_m, -1)
        # `dim` must be explicit: for a 3-D tensor the legacy implicit choice
        # is dim 0 (the batch axis), which would normalise across samples and
        # make each output depend on the other items in the batch.
        # Attention maps: each of the c_n maps is normalised over positions.
        attention_maps = F.softmax(B.reshape(b, self.c_n, -1), dim=-1)
        # Attention vectors: each position is normalised over the c_n entries.
        attention_vectors = F.softmax(V.reshape(b, self.c_n, -1), dim=1)
        # step 1: feature gathering
        global_descriptors = torch.bmm(tmpA, attention_maps.permute(0, 2, 1))  # b, c_m, c_n
        # step 2: feature distribution
        tmpZ = global_descriptors.matmul(attention_vectors)  # b, c_m, h*w
        tmpZ = tmpZ.view(b, self.c_m, h, w)  # b, c_m, h, w
        if self.reconstruct:
            tmpZ = self.conv_reconstruct(tmpZ)
        return tmpZ
def autopad(k, p=None, d=1):  # kernel, padding, dilation
    """Return the padding that gives 'same'-shaped outputs.

    Args:
        k: Kernel size, an int or a list of ints.
        p: Explicit padding; returned unchanged when it is not None.
        d: Dilation. When greater than 1 the effective kernel size
            ``d * (k - 1) + 1`` is used to derive the padding.

    Returns:
        ``p`` if given, otherwise half of the (effective) kernel size, as an
        int or a list matching ``k``.
    """
    if d > 1:
        # A dilated kernel spans d * (k - 1) + 1 input positions.
        if isinstance(k, int):
            k = d * (k - 1) + 1
        else:
            k = [d * (size - 1) + 1 for size in k]
    if p is not None:
        return p
    if isinstance(k, int):
        return k // 2
    return [size // 2 for size in k]


class Flatten(nn.Module):
    """Collapse every dimension after the first into one."""

    def forward(self, x):
        return x.view(x.size(0), -1)


class ChannelAttention(nn.Module):
    """Channel branch: global average pool followed by an MLP.

    Args:
        channel: Number of input channels (also the MLP output width).
        reduction: The hidden width is ``channel // reduction``.
        num_layers: Number of hidden Linear/BatchNorm1d/ReLU stages.
    """

    def __init__(self, channel, reduction=16, num_layers=3):
        super().__init__()
        self.avgpool = nn.AdaptiveAvgPool2d(1)
        hidden = channel // reduction
        widths = [channel] + [hidden] * num_layers + [channel]

        self.ca = nn.Sequential()
        self.ca.add_module('flatten', Flatten())
        # One fc/bn/relu stage per consecutive pair of widths, last pair excluded.
        for i, (fan_in, fan_out) in enumerate(zip(widths[:-2], widths[1:-1])):
            self.ca.add_module('fc%d' % i, nn.Linear(fan_in, fan_out))
            self.ca.add_module('bn%d' % i, nn.BatchNorm1d(fan_out))
            self.ca.add_module('relu%d' % i, nn.ReLU())
        self.ca.add_module('last_fc', nn.Linear(widths[-2], widths[-1]))

    def forward(self, x):
        """Return per-channel logits expanded to the shape of ``x``."""
        logits = self.ca(self.avgpool(x))
        return logits[:, :, None, None].expand_as(x)


class SpatialAttention(nn.Module):
    """Spatial branch: 1x1 reduce, dilated 3x3 convolutions, 1x1 to one channel.

    Args:
        channel: Number of input channels.
        reduction: The intermediate width is ``channel // reduction``.
        num_layers: Number of dilated 3x3 Conv/BatchNorm2d/ReLU stages.
        dia_val: Dilation of the 3x3 convolutions.
    """

    def __init__(self, channel, reduction=16, num_layers=3, dia_val=2):
        super().__init__()
        mid = channel // reduction
        same_pad = autopad(3, None, dia_val)

        self.sa = nn.Sequential()
        self.sa.add_module('conv_reduce1',
                           nn.Conv2d(in_channels=channel, out_channels=mid, kernel_size=1))
        self.sa.add_module('bn_reduce1', nn.BatchNorm2d(mid))
        self.sa.add_module('relu_reduce1', nn.ReLU())
        for i in range(num_layers):
            self.sa.add_module('conv_%d' % i,
                               nn.Conv2d(in_channels=mid, out_channels=mid, kernel_size=3,
                                         padding=same_pad, dilation=dia_val))
            self.sa.add_module('bn_%d' % i, nn.BatchNorm2d(mid))
            self.sa.add_module('relu_%d' % i, nn.ReLU())
        self.sa.add_module('last_conv', nn.Conv2d(mid, 1, kernel_size=1))

    def forward(self, x):
        """Return the single-channel spatial logits expanded to the shape of ``x``."""
        return self.sa(x).expand_as(x)


class BAMBlock(nn.Module):
    """Combine the channel and spatial branches into one residual gate.

    The output is ``(1 + sigmoid(spatial + channel)) * x``.

    Args:
        channel: Number of input channels.
        reduction: Reduction ratio passed to both branches.
        dia_val: Dilation passed to the spatial branch.
    """

    def __init__(self, channel=512, reduction=16, dia_val=2):
        super().__init__()
        self.ca = ChannelAttention(channel=channel, reduction=reduction)
        self.sa = SpatialAttention(channel=channel, reduction=reduction, dia_val=dia_val)
        self.sigmoid = nn.Sigmoid()

    def init_weights(self):
        """Re-initialise Conv2d / BatchNorm2d / Linear sub-modules (not called by __init__)."""
        for module in self.modules():
            if isinstance(module, nn.Conv2d):
                init.kaiming_normal_(module.weight, mode='fan_out')
                if module.bias is not None:
                    init.constant_(module.bias, 0)
            elif isinstance(module, nn.BatchNorm2d):
                init.constant_(module.weight, 1)
                init.constant_(module.bias, 0)
            elif isinstance(module, nn.Linear):
                init.normal_(module.weight, std=0.001)
                if module.bias is not None:
                    init.constant_(module.bias, 0)

    def forward(self, x):
        """Gate ``x`` with the fused attention map; the output has the shape of ``x``."""
        _batch, _channels, _height, _width = x.size()  # fails fast unless x is 4-D
        spatial_logits = self.sa(x)
        channel_logits = self.ca(x)
        gate = self.sigmoid(spatial_logits + channel_logits)
        return (1 + gate) * x
class TopkRouting(nn.Module):
    """Top-k routing with scaled dot-product scores.

    Args:
        qk_dim: Feature dimension of query and key.
        topk: Number of keys kept for every query.
        qk_scale: Multiplicative temperature applied to the query before the
            dot product; defaults to ``qk_dim ** -0.5`` when falsy.
        param_routing: If True, query and key pass through a shared learnable
            ``nn.Linear(qk_dim, qk_dim)`` before scoring; otherwise identity.
        diff_routing: If False, query and key are detached so that no gradient
            flows through the routing scores.
    """

    def __init__(self, qk_dim, topk=4, qk_scale=None, param_routing=False, diff_routing=False):
        super().__init__()
        self.topk = topk
        self.qk_dim = qk_dim
        self.scale = qk_scale or qk_dim ** -0.5
        self.diff_routing = diff_routing
        # TODO: norm layer before/after linear?
        self.emb = nn.Linear(qk_dim, qk_dim) if param_routing else nn.Identity()
        # routing activation
        self.routing_act = nn.Softmax(dim=-1)

    def forward(self, query: Tensor, key: Tensor) -> Tuple[Tensor, Tensor]:
        """Score every query against every key and keep the top-k keys.

        Args:
            query, key: (n, p^2, c) tensors.

        Returns:
            r_weight: (n, p^2, topk) softmax over the selected logits.
            topk_index: (n, p^2, topk) indices of the selected keys.
        """
        if not self.diff_routing:
            # cut the graph so the router does not receive gradients
            query, key = query.detach(), key.detach()
        query_hat, key_hat = self.emb(query), self.emb(key)  # (n, p^2, c)
        attn_logit = (query_hat * self.scale) @ key_hat.transpose(-2, -1)  # (n, p^2, p^2)
        topk_attn_logit, topk_index = torch.topk(attn_logit, k=self.topk, dim=-1)  # (n, p^2, k), (n, p^2, k)
        r_weight = self.routing_act(topk_attn_logit)  # (n, p^2, k)

        return r_weight, topk_index
class BiLevelRoutingAttention(nn.Module):
    """Bi-level routing attention over a grid of ``n_win x n_win`` windows.

    Args:
        dim: Channel dimension of the input and of the value projection.
        n_win: Number of windows along one side (``n_win * n_win`` in total).
        num_heads: Number of attention heads; must divide ``dim`` and ``qk_dim``.
        qk_dim: Query/key dimension; defaults to ``dim`` when falsy.
        qk_scale: Attention temperature; defaults to ``qk_dim ** -0.5`` when falsy.
        kv_per_win: For ``kv_downsample_mode='ada_avgpool'/'ada_maxpool'`` only,
            output size handed to the adaptive pooling layer.
        kv_downsample_ratio: For ``'maxpool'``/``'avgpool'`` only, pooling kernel;
            a ratio <= 1 disables pooling.
        kv_downsample_kernel: Stored but not used by any implemented mode.
        kv_downsample_mode: One of ``'identity'``, ``'ada_avgpool'``,
            ``'ada_maxpool'``, ``'maxpool'``, ``'avgpool'``. ``'fracpool'`` and
            ``'conv'`` raise ``NotImplementedError``; anything else raises
            ``ValueError``.
        topk: Number of windows each query window is routed to.
        param_attention: ``'qkvo'`` (linear q/k/v and output projection) or
            ``'qkv'`` (no output projection); anything else raises ``ValueError``.
        param_routing: Extra linear layer inside the router.
        diff_routing: Whether the routing scores stay differentiable.
        soft_routing: Whether gathered key/values are multiplied by routing weights.
        side_dwconv: Kernel size of the depth-wise convolution applied to the
            values (LePE); ``<= 0`` replaces it with zeros.
        auto_pad: Pad H and W up to a multiple of ``n_win`` and crop afterwards.
    """

    def __init__(self, dim, n_win=7, num_heads=8, qk_dim=None, qk_scale=None,
                 kv_per_win=4, kv_downsample_ratio=4, kv_downsample_kernel=None, kv_downsample_mode='identity',
                 topk=4, param_attention="qkvo", param_routing=False, diff_routing=False, soft_routing=False,
                 side_dwconv=3, auto_pad=True):
        super().__init__()
        # local attention setting
        self.dim = dim
        self.n_win = n_win  # Wh, Ww
        self.num_heads = num_heads
        self.qk_dim = qk_dim or dim
        assert self.qk_dim % num_heads == 0 and self.dim % num_heads==0, 'qk_dim and dim must be divisible by num_heads!'
        self.scale = qk_scale or self.qk_dim ** -0.5

        ################ side_dwconv (i.e. LCE in ShuntedTransformer) ###########
        self.lepe = nn.Conv2d(dim, dim, kernel_size=side_dwconv, stride=1, padding=side_dwconv//2, groups=dim) if side_dwconv > 0 else \
                    lambda x: torch.zeros_like(x)

        ################ global routing setting #################
        self.topk = topk
        self.param_routing = param_routing
        self.diff_routing = diff_routing
        self.soft_routing = soft_routing
        # router
        assert not (self.param_routing and not self.diff_routing)  # cannot be with_param=True and diff_routing=False
        self.router = TopkRouting(qk_dim=self.qk_dim,
                                  qk_scale=self.scale,
                                  topk=self.topk,
                                  diff_routing=self.diff_routing,
                                  param_routing=self.param_routing)
        if self.soft_routing:  # soft routing, always differentiable (if no detach)
            mul_weight = 'soft'
        elif self.diff_routing:  # hard differentiable routing
            mul_weight = 'hard'
        else:  # hard non-differentiable routing
            mul_weight = 'none'
        self.kv_gather = KVGather(mul_weight=mul_weight)

        # qkv mapping (shared by both global routing and local attention)
        self.param_attention = param_attention
        if self.param_attention == 'qkvo':
            self.qkv = QKVLinear(self.dim, self.qk_dim)
            self.wo = nn.Linear(dim, dim)
        elif self.param_attention == 'qkv':
            self.qkv = QKVLinear(self.dim, self.qk_dim)
            self.wo = nn.Identity()
        else:
            raise ValueError(f'param_attention mode {self.param_attention} is not surpported!')

        self.kv_downsample_mode = kv_downsample_mode
        self.kv_per_win = kv_per_win
        self.kv_downsample_ratio = kv_downsample_ratio
        # NOTE: the attribute name keeps its historical spelling ("kenel") so
        # that external code reading it keeps working.
        self.kv_downsample_kenel = kv_downsample_kernel
        if self.kv_downsample_mode == 'ada_avgpool':
            assert self.kv_per_win is not None
            self.kv_down = nn.AdaptiveAvgPool2d(self.kv_per_win)
        elif self.kv_downsample_mode == 'ada_maxpool':
            assert self.kv_per_win is not None
            self.kv_down = nn.AdaptiveMaxPool2d(self.kv_per_win)
        elif self.kv_downsample_mode == 'maxpool':
            assert self.kv_downsample_ratio is not None
            self.kv_down = nn.MaxPool2d(self.kv_downsample_ratio) if self.kv_downsample_ratio > 1 else nn.Identity()
        elif self.kv_downsample_mode == 'avgpool':
            assert self.kv_downsample_ratio is not None
            self.kv_down = nn.AvgPool2d(self.kv_downsample_ratio) if self.kv_downsample_ratio > 1 else nn.Identity()
        elif self.kv_downsample_mode == 'identity':  # no kv downsampling
            self.kv_down = nn.Identity()
        elif self.kv_downsample_mode == 'fracpool':
            # TODO: fracpool
            # 1. kernel size should be input size dependent
            # 2. there is a random factor, need to avoid independent sampling for k and v
            raise NotImplementedError('fracpool policy is not implemented yet!')
        elif self.kv_downsample_mode == 'conv':
            # TODO: need to consider the case where k != v so that need two downsample modules
            raise NotImplementedError('conv policy is not implemented yet!')
        else:
            # Previously this read a misspelled attribute and therefore raised
            # AttributeError instead of the intended ValueError.
            raise ValueError(f'kv_down_sample_mode {self.kv_downsample_mode} is not surpported!')

        # softmax for local attention
        self.attn_act = nn.Softmax(dim=-1)

        self.auto_pad = auto_pad

    def forward(self, x, ret_attn_mask=False):
        """Run bi-level routing attention.

        Args:
            x: NCHW tensor (it is rearranged to NHWC internally).
            ret_attn_mask: If True, also return the routing weights, routing
                indices and attention weights.

        Returns:
            An NCHW tensor when ``ret_attn_mask`` is False. Otherwise the tuple
            ``(out, r_weight, r_idx, attn_weight)``.
            NOTE(review): in that case ``out`` is left in NHWC layout, unlike
            the default path - confirm whether callers rely on this.
        """
        x = rearrange(x, "n c h w -> n h w c")
        # NOTE: use padding for semantic segmentation
        ###################################################
        if self.auto_pad:
            N, H_in, W_in, C = x.size()

            pad_l = pad_t = 0
            pad_r = (self.n_win - W_in % self.n_win) % self.n_win
            pad_b = (self.n_win - H_in % self.n_win) % self.n_win
            x = F.pad(x, (0, 0,  # dim=-1
                          pad_l, pad_r,  # dim=-2
                          pad_t, pad_b))  # dim=-3
            _, H, W, _ = x.size()  # padded size
        else:
            N, H, W, C = x.size()
            assert H%self.n_win == 0 and W%self.n_win == 0
        ###################################################

        # patchify, (n, p^2, w, w, c), keep 2d window as we need 2d pooling to reduce kv size
        x = rearrange(x, "n (j h) (i w) c -> n (j i) h w c", j=self.n_win, i=self.n_win)

        ################# qkv projection ###################
        # q: (n, p^2, w, w, c_qk)
        # kv: (n, p^2, w, w, c_qk+c_v)
        # NOTE: separate kv if there were memory leak issue caused by gather
        q, kv = self.qkv(x)

        # pixel-wise qkv
        # q_pix: (n, p^2, w^2, c_qk)
        # kv_pix: (n, p^2, h_kv*w_kv, c_qk+c_v)
        q_pix = rearrange(q, 'n p2 h w c -> n p2 (h w) c')
        kv_pix = self.kv_down(rearrange(kv, 'n p2 h w c -> (n p2) c h w'))
        kv_pix = rearrange(kv_pix, '(n j i) c h w -> n (j i) (h w) c', j=self.n_win, i=self.n_win)

        # window-wise qk, (n, p^2, c_qk), (n, p^2, c_qk)
        q_win, k_win = q.mean([2, 3]), kv[..., 0:self.qk_dim].mean([2, 3])

        ################## side_dwconv (lepe) ##################
        # NOTE: call contiguous to avoid gradient warning when using ddp
        lepe = self.lepe(rearrange(kv[..., self.qk_dim:], 'n (j i) h w c -> n c (j h) (i w)',
                                   j=self.n_win, i=self.n_win).contiguous())
        lepe = rearrange(lepe, 'n c (j h) (i w) -> n (j h) (i w) c', j=self.n_win, i=self.n_win)

        ############ gather q dependent k/v #################
        r_weight, r_idx = self.router(q_win, k_win)  # both are (n, p^2, topk) tensors

        kv_pix_sel = self.kv_gather(r_idx=r_idx, r_weight=r_weight, kv=kv_pix)  # (n, p^2, topk, h_kv*w_kv, c_qk+c_v)
        k_pix_sel, v_pix_sel = kv_pix_sel.split([self.qk_dim, self.dim], dim=-1)
        # k_pix_sel: (n, p^2, topk, h_kv*w_kv, c_qk)
        # v_pix_sel: (n, p^2, topk, h_kv*w_kv, c_v)

        ######### do attention as normal ####################
        # keys are laid out transposed, (n*p^2, m, c_qk//m, topk*h_kv*w_kv), ready for q @ k
        k_pix_sel = rearrange(k_pix_sel, 'n p2 k w2 (m c) -> (n p2) m c (k w2)', m=self.num_heads)
        # flatten to BMLC, (n*p^2, m, topk*h_kv*w_kv, c_v//m)
        v_pix_sel = rearrange(v_pix_sel, 'n p2 k w2 (m c) -> (n p2) m (k w2) c', m=self.num_heads)
        # to BMLC tensor (n*p^2, m, w^2, c_qk//m)
        q_pix = rearrange(q_pix, 'n p2 w2 (m c) -> (n p2) m w2 c', m=self.num_heads)

        # param-free multihead attention
        # (n*p^2, m, w^2, c) @ (n*p^2, m, c, topk*h_kv*w_kv) -> (n*p^2, m, w^2, topk*h_kv*w_kv)
        attn_weight = (q_pix * self.scale) @ k_pix_sel
        attn_weight = self.attn_act(attn_weight)
        # (n*p^2, m, w^2, topk*h_kv*w_kv) @ (n*p^2, m, topk*h_kv*w_kv, c) -> (n*p^2, m, w^2, c)
        out = attn_weight @ v_pix_sel
        out = rearrange(out, '(n j i) m (h w) c -> n (j h) (i w) (m c)', j=self.n_win, i=self.n_win,
                        h=H//self.n_win, w=W//self.n_win)

        out = out + lepe
        # output linear
        out = self.wo(out)

        # NOTE: use padding for semantic segmentation
        # crop padded region (pad_r / pad_b only exist when auto_pad is set)
        if self.auto_pad and (pad_r > 0 or pad_b > 0):
            out = out[:, :H_in, :W_in, :].contiguous()

        if ret_attn_mask:
            return out, r_weight, r_idx, attn_weight
        else:
            return rearrange(out, "n h w c -> n c h w")
num_heads=8, qkv_bias=False, qk_scale=None, attn_drop=0., proj_drop=0.): super().__init__() self.num_heads = num_heads head_dim = dim // num_heads # NOTE scale factor was wrong in my original version, can set manually to be compat with prev weights self.scale = qk_scale or head_dim ** -0.5 self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias) self.attn_drop = nn.Dropout(attn_drop) self.proj = nn.Linear(dim, dim) self.proj_drop = nn.Dropout(proj_drop) def forward(self, x): """ args: x: NCHW tensor return: NCHW tensor """ _, _, H, W = x.size() x = rearrange(x, 'n c h w -> n (h w) c') ####################################### B, N, C = x.shape qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, C // self.num_heads).permute(2, 0, 3, 1, 4) q, k, v = qkv[0], qkv[1], qkv[2] # make torchscript happy (cannot use tensor as tuple) attn = (q @ k.transpose(-2, -1)) * self.scale attn = attn.softmax(dim=-1) attn = self.attn_drop(attn) x = (attn @ v).transpose(1, 2).reshape(B, N, C) x = self.proj(x) x = self.proj_drop(x) ####################################### x = rearrange(x, 'n (h w) c -> n c h w', h=H, w=W) return x class AttentionLePE(nn.Module): """ vanilla attention """ def __init__(self, dim, num_heads=8, qkv_bias=False, qk_scale=None, attn_drop=0., proj_drop=0., side_dwconv=5): super().__init__() self.num_heads = num_heads head_dim = dim // num_heads # NOTE scale factor was wrong in my original version, can set manually to be compat with prev weights self.scale = qk_scale or head_dim ** -0.5 self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias) self.attn_drop = nn.Dropout(attn_drop) self.proj = nn.Linear(dim, dim) self.proj_drop = nn.Dropout(proj_drop) self.lepe = nn.Conv2d(dim, dim, kernel_size=side_dwconv, stride=1, padding=side_dwconv//2, groups=dim) if side_dwconv > 0 else \ lambda x: torch.zeros_like(x) def forward(self, x): """ args: x: NCHW tensor return: NCHW tensor """ _, _, H, W = x.size() x = rearrange(x, 'n c h w -> n (h w) c') ####################################### B, 
N, C = x.shape qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, C // self.num_heads).permute(2, 0, 3, 1, 4) q, k, v = qkv[0], qkv[1], qkv[2] # make torchscript happy (cannot use tensor as tuple) lepe = self.lepe(rearrange(x, 'n (h w) c -> n c h w', h=H, w=W)) lepe = rearrange(lepe, 'n c h w -> n (h w) c') attn = (q @ k.transpose(-2, -1)) * self.scale attn = attn.softmax(dim=-1) attn = self.attn_drop(attn) x = (attn @ v).transpose(1, 2).reshape(B, N, C) x = x + lepe x = self.proj(x) x = self.proj_drop(x) ####################################### x = rearrange(x, 'n (h w) c -> n c h w', h=H, w=W) return x def _grid2seq(x:Tensor, region_size:Tuple[int], num_heads:int): """ Args: x: BCHW tensor region size: int num_heads: number of attention heads Return: out: rearranged x, has a shape of (bs, nhead, nregion, reg_size, head_dim) region_h, region_w: number of regions per col/row """ B, C, H, W = x.size() region_h, region_w = H//region_size[0], W//region_size[1] x = x.view(B, num_heads, C//num_heads, region_h, region_size[0], region_w, region_size[1]) x = torch.einsum('bmdhpwq->bmhwpqd', x).flatten(2, 3).flatten(-3, -2) # (bs, nhead, nregion, reg_size, head_dim) return x, region_h, region_w def _seq2grid(x:Tensor, region_h:int, region_w:int, region_size:Tuple[int]): """ Args: x: (bs, nhead, nregion, reg_size^2, head_dim) Return: x: (bs, C, H, W) """ bs, nhead, nregion, reg_size_square, head_dim = x.size() x = x.view(bs, nhead, region_h, region_w, region_size[0], region_size[1], head_dim) x = torch.einsum('bmhwpqd->bmdhpwq', x).reshape(bs, nhead*head_dim, region_h*region_size[0], region_w*region_size[1]) return x def regional_routing_attention_torch( query:Tensor, key:Tensor, value:Tensor, scale:float, region_graph:LongTensor, region_size:Tuple[int], kv_region_size:Optional[Tuple[int]]=None, auto_pad=True)->Tensor: """ Args: query, key, value: (B, C, H, W) tensor scale: the scale/temperature for dot product attention region_graph: (B, nhead, h_q*w_q, topk) tensor, topk <= 
h_k*w_k region_size: region/window size for queries, (rh, rw) key_region_size: optional, if None, key_region_size=region_size auto_pad: required to be true if the input sizes are not divisible by the region_size Return: output: (B, C, H, W) tensor attn: (bs, nhead, q_nregion, reg_size, topk*kv_region_size) attention matrix """ kv_region_size = kv_region_size or region_size bs, nhead, q_nregion, topk = region_graph.size() # Auto pad to deal with any input size q_pad_b, q_pad_r, kv_pad_b, kv_pad_r = 0, 0, 0, 0 if auto_pad: _, _, Hq, Wq = query.size() q_pad_b = (region_size[0] - Hq % region_size[0]) % region_size[0] q_pad_r = (region_size[1] - Wq % region_size[1]) % region_size[1] if (q_pad_b > 0 or q_pad_r > 0): query = F.pad(query, (0, q_pad_r, 0, q_pad_b)) # zero padding _, _, Hk, Wk = key.size() kv_pad_b = (kv_region_size[0] - Hk % kv_region_size[0]) % kv_region_size[0] kv_pad_r = (kv_region_size[1] - Wk % kv_region_size[1]) % kv_region_size[1] if (kv_pad_r > 0 or kv_pad_b > 0): key = F.pad(key, (0, kv_pad_r, 0, kv_pad_b)) # zero padding value = F.pad(value, (0, kv_pad_r, 0, kv_pad_b)) # zero padding # to sequence format, i.e. (bs, nhead, nregion, reg_size, head_dim) query, q_region_h, q_region_w = _grid2seq(query, region_size=region_size, num_heads=nhead) key, _, _ = _grid2seq(key, region_size=kv_region_size, num_heads=nhead) value, _, _ = _grid2seq(value, region_size=kv_region_size, num_heads=nhead) # gather key and values. # TODO: is seperate gathering slower than fused one (our old version) ? 
class BiLevelRoutingAttention_nchw(nn.Module):
    """Bi-Level Routing Attention that takes nchw input

    Compared to legacy version, this implementation:
    * removes unused args and components
    * uses nchw input format to avoid frequent permutation

    When the size of inputs is not divisible by the region size, there is also a numerical difference
    than legacy implementation, due to:
    * different way to pad the input feature map (padding after linear projection)
    * different pooling behavior (count_include_pad=False)

    Current implementation is more reasonable, hence we do not keep backward numerical compatibility

    Args:
        dim: number of input/output channels; must be divisible by num_heads
        num_heads: number of attention heads
        n_win: number of windows per row/col used to derive the region size
        qk_scale: attention temperature; falls back to dim ** -0.5 when falsy
        topk: number of key/value regions each query region is routed to
        side_dwconv: kernel size of the depth-wise conv added on the value path;
            a value <= 0 replaces it with a zero branch
        auto_pad: accepted for signature compatibility only; this class never
            forwards it, so the attention backend's own default applies
        attn_backend: only 'torch' is supported
    Raises:
        ValueError: if attn_backend is anything other than 'torch'
    """
    def __init__(self, dim, num_heads=8, n_win=7, qk_scale=None, topk=4, side_dwconv=3, auto_pad=False, attn_backend='torch'):
        super().__init__()
        # local attention setting
        self.dim = dim
        self.num_heads = num_heads
        assert self.dim % num_heads == 0, 'dim must be divisible by num_heads!'
        self.head_dim = self.dim // self.num_heads
        self.scale = qk_scale or self.dim ** -0.5 # NOTE: to be consistent with old models.

        ################side_dwconv (i.e. LCE in Shunted Transformer)###########
        self.lepe = nn.Conv2d(dim, dim, kernel_size=side_dwconv, stride=1, padding=side_dwconv//2, groups=dim) if side_dwconv > 0 else \
                    lambda x: torch.zeros_like(x)

        ################ regional routing setting #################
        self.topk = topk
        self.n_win = n_win  # number of windows per row/col

        ##########################################
        self.qkv_linear = nn.Conv2d(self.dim, 3*self.dim, kernel_size=1)
        self.output_linear = nn.Conv2d(self.dim, self.dim, kernel_size=1)

        if attn_backend == 'torch':
            self.attn_fn = regional_routing_attention_torch
        else:
            raise ValueError('CUDA implementation is not available yet. Please stay tuned.')

    def forward(self, x:Tensor, ret_attn_mask=False):
        """
        Args:
            x: NCHW tensor, better to be channel_last (https://pytorch.org/tutorials/intermediate/memory_format_tutorial.html)
            ret_attn_mask: when True, also return the attention matrix produced by the backend
        Return:
            NCHW tensor, or (NCHW tensor, attention matrix) when ret_attn_mask is True
        """
        _, _, H, W = x.size()
        region_size = (H//self.n_win, W//self.n_win)

        # STEP 1: linear projection
        # Call the module (not .forward) so registered forward hooks still fire.
        qkv = self.qkv_linear(x) # ncHW
        q, k, v = qkv.chunk(3, dim=1) # ncHW

        # STEP 2: region-to-region routing
        # NOTE: ceil_mode=True, count_include_pad=False = auto padding
        # NOTE: gradients backward through token-to-token attention. See Appendix A for the intuition.
        q_r = F.avg_pool2d(q.detach(), kernel_size=region_size, ceil_mode=True, count_include_pad=False)
        k_r = F.avg_pool2d(k.detach(), kernel_size=region_size, ceil_mode=True, count_include_pad=False) # nchw
        q_r = q_r.permute(0, 2, 3, 1).flatten(1, 2) # n(hw)c
        k_r = k_r.flatten(2, 3) # nc(hw)
        a_r = q_r @ k_r # n(hw)(hw), adj matrix of regional graph
        # torch.topk raises when k exceeds the axis length; with few regions
        # (e.g. n_win=1) route to every available region instead of crashing.
        n_route = min(self.topk, a_r.size(-1))
        _, idx_r = torch.topk(a_r, k=n_route, dim=-1) # n(hw)k long tensor
        idx_r = idx_r.unsqueeze_(1).expand(-1, self.num_heads, -1, -1)

        # STEP 3: token to token attention (non-parametric function)
        output, attn_mat = self.attn_fn(query=q, key=k, value=v, scale=self.scale,
                                        region_graph=idx_r, region_size=region_size)

        output = output + self.lepe(v) # ncHW
        output = self.output_linear(output) # ncHW

        if ret_attn_mask:
            return output, attn_mat

        return output
class ChannelAttention(nn.Module):
    """CBAM channel gate: a shared bottleneck MLP applied to max- and avg-pooled descriptors.

    Args:
        channel: number of input channels
        reduction: bottleneck ratio; the hidden width is channel // reduction
    """

    def __init__(self, channel, reduction=16):
        super().__init__()
        hidden = channel // reduction
        self.maxpool = nn.AdaptiveMaxPool2d(1)
        self.avgpool = nn.AdaptiveAvgPool2d(1)
        squeeze = nn.Conv2d(channel, hidden, 1, bias=False)
        excite = nn.Conv2d(hidden, channel, 1, bias=False)
        self.se = nn.Sequential(squeeze, nn.ReLU(), excite)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        """Return the per-channel gate computed from x (the gate only, not the gated input)."""
        from_max = self.se(self.maxpool(x))
        from_avg = self.se(self.avgpool(x))
        return self.sigmoid(from_max + from_avg)


class SpatialAttention(nn.Module):
    """CBAM spatial gate: a conv over the channel-wise max and mean maps.

    Args:
        kernel_size: size of the square conv kernel; padding is kernel_size // 2
    """

    def __init__(self, kernel_size=7):
        super().__init__()
        self.conv = nn.Conv2d(2, 1, kernel_size=kernel_size, padding=kernel_size // 2)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        """Return a single-channel gate map computed from x."""
        channel_max = torch.max(x, dim=1, keepdim=True)[0]
        channel_mean = torch.mean(x, dim=1, keepdim=True)
        stacked = torch.cat([channel_max, channel_mean], 1)
        return self.sigmoid(self.conv(stacked))
class CPCA_ChannelAttention(nn.Module):
    """Channel gate used by CPCA.

    A shared two-layer 1x1-conv bottleneck is applied to the average-pooled and
    the max-pooled descriptors; each result is passed through a sigmoid, and the
    two gates are summed before scaling the input.

    Args:
        input_channels: number of channels of the incoming feature map
        internal_neurons: width of the bottleneck layer
    """

    def __init__(self, input_channels, internal_neurons):
        super(CPCA_ChannelAttention, self).__init__()
        self.fc1 = nn.Conv2d(in_channels=input_channels, out_channels=internal_neurons,
                             kernel_size=1, stride=1, bias=True)
        self.fc2 = nn.Conv2d(in_channels=internal_neurons, out_channels=input_channels,
                             kernel_size=1, stride=1, bias=True)
        self.input_channels = input_channels

    def forward(self, inputs):
        """Return inputs scaled channel-wise by the summed avg/max gates."""
        gate = None
        # average branch first, then max branch
        for pool in (F.adaptive_avg_pool2d, F.adaptive_max_pool2d):
            descriptor = pool(inputs, output_size=(1, 1))
            hidden = F.relu(self.fc1(descriptor), inplace=True)
            branch = torch.sigmoid(self.fc2(hidden))
            gate = branch if gate is None else gate + branch
        gate = gate.view(-1, self.input_channels, 1, 1)
        return inputs * gate
class EfficientAttention(nn.Module):
    """Two-branch attention: gated local branches plus one pooled global branch.

    Heads are partitioned by ``group_split``. Every entry except the last owns a
    local ("high frequency") branch with its own depth-wise kernel size; the last
    entry owns the global ("low frequency") branch that attends to an
    average-pooled key/value map. A branch with zero heads is not built.

    Args:
        dim: number of input/output channels
        num_heads: total number of heads; must equal sum(group_split)
        group_split: heads per branch, local branches first, global branch last
        kernel_sizes: depth-wise kernel size of each local branch
            (len(kernel_sizes) + 1 == len(group_split))
        window_size: pooling window/stride for the global branch keys and values
        attn_drop: dropout probability on the attention weights
        proj_drop: dropout probability after the output projection
        qkv_bias: whether the 1x1 projections carry a bias
    """

    def __init__(self, dim, num_heads=8, group_split=(4, 4), kernel_sizes=(5,), window_size=4,
                 attn_drop=0., proj_drop=0., qkv_bias=True):
        super().__init__()
        # Immutable defaults in the signature; private list copies are stored so
        # the attributes stay lists and are not aliased to the caller's objects.
        group_split = list(group_split)
        kernel_sizes = list(kernel_sizes)
        assert sum(group_split) == num_heads
        assert len(kernel_sizes) + 1 == len(group_split)
        self.dim = dim
        self.num_heads = num_heads
        self.dim_head = dim // num_heads
        self.scalor = self.dim_head ** -0.5
        self.kernel_sizes = kernel_sizes
        self.window_size = window_size
        self.group_split = group_split
        convs = []
        act_blocks = []
        qkvs = []
        for kernel_size, group_head in zip(kernel_sizes, group_split):
            if group_head == 0:
                continue
            width = self.dim_head * group_head
            convs.append(nn.Conv2d(3*width, 3*width, kernel_size,
                                   1, kernel_size//2, groups=3*width))
            act_blocks.append(AttnMap(width))
            qkvs.append(nn.Conv2d(dim, 3*width, 1, 1, 0, bias=qkv_bias))
        if group_split[-1] != 0:
            self.global_q = nn.Conv2d(dim, group_split[-1]*self.dim_head, 1, 1, 0, bias=qkv_bias)
            self.global_kv = nn.Conv2d(dim, group_split[-1]*self.dim_head*2, 1, 1, 0, bias=qkv_bias)
            self.avgpool = nn.AvgPool2d(window_size, window_size) if window_size!=1 else nn.Identity()

        self.convs = nn.ModuleList(convs)
        self.act_blocks = nn.ModuleList(act_blocks)
        self.qkvs = nn.ModuleList(qkvs)
        self.proj = nn.Conv2d(dim, dim, 1, 1, 0, bias=qkv_bias)
        self.attn_drop = nn.Dropout(attn_drop)
        self.proj_drop = nn.Dropout(proj_drop)

    def high_fre_attntion(self, x: torch.Tensor, to_qkv: nn.Module, mixer: nn.Module, attn_block: nn.Module):
        '''
        Local branch: element-wise gated attention.
        x: (b c h w)
        returns: (b (m d) h w)
        '''
        b, c, h, w = x.size()
        qkv = to_qkv(x) #(b (3 m d) h w)
        qkv = mixer(qkv).reshape(b, 3, -1, h, w).transpose(0, 1).contiguous() #(3 b (m d) h w)
        q, k, v = qkv #(b (m d) h w)
        attn = attn_block(q.mul(k)).mul(self.scalor)
        attn = self.attn_drop(torch.tanh(attn))
        res = attn.mul(v) #(b (m d) h w)
        return res

    def low_fre_attention(self, x : torch.Tensor, to_q: nn.Module, to_kv: nn.Module, avgpool: nn.Module):
        '''
        Global branch: softmax attention from every pixel to the pooled map.
        x: (b c h w)
        returns: (b (m d) h w)
        '''
        b, c, h, w = x.size()

        q = to_q(x).reshape(b, -1, self.dim_head, h*w).transpose(-1, -2).contiguous() #(b m (h w) d)
        kv = to_kv(avgpool(x)) #(b (2 m d) H W)
        # Take the token count from the pooled map itself: AvgPool2d floors each
        # axis, so (h*w)//window_size**2 is wrong when h or w is not a multiple
        # of window_size.
        n_kv = kv.shape[-2] * kv.shape[-1]
        kv = kv.view(b, 2, -1, self.dim_head, n_kv).permute(1, 0, 2, 4, 3).contiguous() #(2 b m (H W) d)
        k, v = kv #(b m (H W) d)
        attn = self.scalor * q @ k.transpose(-1, -2) #(b m (h w) (H W))
        attn = self.attn_drop(attn.softmax(dim=-1))
        res = attn @ v #(b m (h w) d)
        res = res.transpose(2, 3).reshape(b, -1, h, w).contiguous()
        return res

    def forward(self, x: torch.Tensor):
        '''
        x: (b c h w)
        returns: (b c h w)
        '''
        res = []
        # The module lists hold one entry per *built* local branch, in order.
        # Iterating them directly keeps q/k/v, mixer and gate aligned even when
        # an earlier branch was skipped for having zero heads.
        for to_qkv, mixer, attn_block in zip(self.qkvs, self.convs, self.act_blocks):
            res.append(self.high_fre_attntion(x, to_qkv, mixer, attn_block))
        if self.group_split[-1] != 0:
            res.append(self.low_fre_attention(x, self.global_q, self.global_kv, self.avgpool))
        return self.proj_drop(self.proj(torch.cat(res, dim=1)))
class h_sigmoid(nn.Module):
    """Hard sigmoid: ReLU6(x + 3) / 6."""

    def __init__(self, inplace=True):
        super(h_sigmoid, self).__init__()
        self.relu = nn.ReLU6(inplace=inplace)

    def forward(self, x):
        shifted = x + 3
        return self.relu(shifted) / 6


class h_swish(nn.Module):
    """Hard swish: x * h_sigmoid(x)."""

    def __init__(self, inplace=True):
        super(h_swish, self).__init__()
        self.sigmoid = h_sigmoid(inplace=inplace)

    def forward(self, x):
        gate = self.sigmoid(x)
        return x * gate


class CoordAtt(nn.Module):
    """Coordinate attention: separate gates along height and width.

    The input is average-pooled along each spatial axis, the two strips are
    processed jointly by a shared 1x1 conv + BN + h_swish, then split again and
    turned into a height gate and a width gate that both scale the input.

    Args:
        inp: number of input (and output) channels
        reduction: channel reduction ratio; the hidden width is max(8, inp // reduction)
    """

    def __init__(self, inp, reduction=32):
        super(CoordAtt, self).__init__()
        self.pool_h = nn.AdaptiveAvgPool2d((None, 1))
        self.pool_w = nn.AdaptiveAvgPool2d((1, None))

        mip = max(8, inp // reduction)

        self.conv1 = nn.Conv2d(inp, mip, kernel_size=1, stride=1, padding=0)
        self.bn1 = nn.BatchNorm2d(mip)
        self.act = h_swish()

        self.conv_h = nn.Conv2d(mip, inp, kernel_size=1, stride=1, padding=0)
        self.conv_w = nn.Conv2d(mip, inp, kernel_size=1, stride=1, padding=0)

    def forward(self, x):
        height, width = x.size(2), x.size(3)

        strip_h = self.pool_h(x)                    # (n, c, h, 1)
        strip_w = self.pool_w(x).transpose(2, 3)    # (n, c, w, 1)

        # shared transform over the concatenated strips
        joint = torch.cat([strip_h, strip_w], dim=2)
        joint = self.act(self.bn1(self.conv1(joint)))

        strip_h, strip_w = joint.split([height, width], dim=2)
        strip_w = strip_w.transpose(2, 3)           # back to (n, mip, 1, w)

        a_h = self.conv_h(strip_h).sigmoid()
        a_w = self.conv_w(strip_w).sigmoid()

        return x * a_w * a_h
class DAttention(nn.Module):
    """Deformable attention (Vision Transformer with Deformable Attention, CVPR2022).

    Queries come from the full feature map; keys and values come from features
    sampled at reference points shifted by offsets predicted from the queries.

    NOTE: with fixed_pe=True the bias table is sized from q_size, so q_size has
    to match the feature-map size seen at runtime (original note: adjust for
    640x640 input).

    Args:
        channel: number of input/output channels
        q_size: (height, width) of the query feature map
        n_heads: number of attention heads
        n_groups: number of offset groups
        attn_drop: dropout probability on the attention weights
        proj_drop: dropout probability after the output projection
        stride: stride of the offset conv, i.e. the key/value down-sampling rate
        offset_range_factor: scale applied to the tanh-bounded offsets; a
            negative value disables that scaling and clamps positions to [-1, 1]
        use_pe: enable the positional term
        dwc_pe: positional term is a depth-wise conv on the queries
        no_off: disable offsets; keys/values come from average pooling instead
        fixed_pe: positional term is a fixed learnable bias table
        ksize: kernel size of the offset conv
        log_cpb: positional term is an MLP on log-spaced displacements
        kv_size: unused
    """

    def __init__(
        self, channel, q_size, n_heads=8, n_groups=4,
        attn_drop=0.0, proj_drop=0.0, stride=1,
        offset_range_factor=4, use_pe=True, dwc_pe=True,
        no_off=False, fixed_pe=False, ksize=3, log_cpb=False, kv_size=None
    ):
        super().__init__()
        n_head_channels = channel // n_heads
        self.dwc_pe = dwc_pe
        self.n_head_channels = n_head_channels
        self.scale = self.n_head_channels ** -0.5
        self.n_heads = n_heads
        self.q_h, self.q_w = q_size
        # self.kv_h, self.kv_w = kv_size
        self.kv_h, self.kv_w = self.q_h // stride, self.q_w // stride
        self.nc = n_head_channels * n_heads
        self.n_groups = n_groups
        self.n_group_channels = self.nc // self.n_groups
        self.n_group_heads = self.n_heads // self.n_groups
        self.use_pe = use_pe
        self.fixed_pe = fixed_pe
        self.no_off = no_off
        self.offset_range_factor = offset_range_factor
        self.ksize = ksize
        self.log_cpb = log_cpb
        self.stride = stride
        kk = self.ksize
        pad_size = kk // 2 if kk != stride else 0

        # Predicts a 2-channel (y, x) offset per sampling location, per group.
        self.conv_offset = nn.Sequential(
            nn.Conv2d(self.n_group_channels, self.n_group_channels, kk, stride, pad_size, groups=self.n_group_channels),
            LayerNormProxy(self.n_group_channels),
            nn.GELU(),
            nn.Conv2d(self.n_group_channels, 2, 1, 1, 0, bias=False)
        )
        if self.no_off:
            for m in self.conv_offset.parameters():
                m.requires_grad_(False)

        self.proj_q = nn.Conv2d(
            self.nc, self.nc,
            kernel_size=1, stride=1, padding=0
        )
        self.proj_k = nn.Conv2d(
            self.nc, self.nc,
            kernel_size=1, stride=1, padding=0
        )
        self.proj_v = nn.Conv2d(
            self.nc, self.nc,
            kernel_size=1, stride=1, padding=0
        )
        self.proj_out = nn.Conv2d(
            self.nc, self.nc,
            kernel_size=1, stride=1, padding=0
        )

        self.proj_drop = nn.Dropout(proj_drop, inplace=True)
        self.attn_drop = nn.Dropout(attn_drop, inplace=True)

        if self.use_pe and not self.no_off:
            if self.dwc_pe:
                self.rpe_table = nn.Conv2d(
                    self.nc, self.nc, kernel_size=3, stride=1, padding=1, groups=self.nc)
            elif self.fixed_pe:
                self.rpe_table = nn.Parameter(
                    torch.zeros(self.n_heads, self.q_h * self.q_w, self.kv_h * self.kv_w)
                )
                trunc_normal_(self.rpe_table, std=0.01)
            elif self.log_cpb:
                # Borrowed from Swin-V2
                self.rpe_table = nn.Sequential(
                    nn.Linear(2, 32, bias=True),
                    nn.ReLU(inplace=True),
                    nn.Linear(32, self.n_group_heads, bias=False)
                )
            else:
                self.rpe_table = nn.Parameter(
                    torch.zeros(self.n_heads, self.q_h * 2 - 1, self.q_w * 2 - 1)
                )
                trunc_normal_(self.rpe_table, std=0.01)
        else:
            self.rpe_table = None

    @torch.no_grad()
    def _get_ref_points(self, H_key, W_key, B, dtype, device):
        """Return the reference grid for sampling, normalised for grid_sample, shape (B * g, H_key, W_key, 2) in (y, x) order."""
        ref_y, ref_x = torch.meshgrid(
            torch.linspace(0.5, H_key - 0.5, H_key, dtype=dtype, device=device),
            torch.linspace(0.5, W_key - 0.5, W_key, dtype=dtype, device=device),
            indexing='ij'
        )
        ref = torch.stack((ref_y, ref_x), -1)
        ref[..., 1].div_(W_key - 1.0).mul_(2.0).sub_(1.0)
        ref[..., 0].div_(H_key - 1.0).mul_(2.0).sub_(1.0)
        ref = ref[None, ...].expand(B * self.n_groups, -1, -1, -1) # B * g H W 2

        return ref

    @torch.no_grad()
    def _get_q_grid(self, H, W, B, dtype, device):
        """Return the query pixel grid normalised to [-1, 1], shape (B * g, H, W, 2) in (y, x) order."""
        ref_y, ref_x = torch.meshgrid(
            torch.arange(0, H, dtype=dtype, device=device),
            torch.arange(0, W, dtype=dtype, device=device),
            indexing='ij'
        )
        ref = torch.stack((ref_y, ref_x), -1)
        ref[..., 1].div_(W - 1.0).mul_(2.0).sub_(1.0)
        ref[..., 0].div_(H - 1.0).mul_(2.0).sub_(1.0)
        ref = ref[None, ...].expand(B * self.n_groups, -1, -1, -1) # B * g H W 2

        return ref

    def forward(self, x):
        """
        Args:
            x: (B, C, H, W) tensor
        Return:
            (B, C, H, W) tensor
        """
        B, C, H, W = x.size()
        dtype, device = x.dtype, x.device

        q = self.proj_q(x)
        q_off = einops.rearrange(q, 'b (g c) h w -> (b g) c h w', g=self.n_groups, c=self.n_group_channels)
        offset = self.conv_offset(q_off).contiguous()  # B * g 2 Hg Wg
        Hk, Wk = offset.size(2), offset.size(3)
        n_sample = Hk * Wk

        if self.offset_range_factor >= 0 and not self.no_off:
            offset_range = torch.tensor([1.0 / (Hk - 1.0), 1.0 / (Wk - 1.0)], device=device).reshape(1, 2, 1, 1)
            offset = offset.tanh().mul(offset_range).mul(self.offset_range_factor)

        offset = einops.rearrange(offset, 'b p h w -> b h w p')
        reference = self._get_ref_points(Hk, Wk, B, dtype, device)

        if self.no_off:
            offset = offset.fill_(0.0)

        if self.offset_range_factor >= 0:
            pos = offset + reference
        else:
            pos = (offset + reference).clamp(-1., +1.)

        if self.no_off:
            x_sampled = F.avg_pool2d(x, kernel_size=self.stride, stride=self.stride)
            assert x_sampled.size(2) == Hk and x_sampled.size(3) == Wk, f"Size is {x_sampled.size()}"
        else:
            pos = pos.type(x.dtype)
            x_sampled = F.grid_sample(
                input=x.reshape(B * self.n_groups, self.n_group_channels, H, W),
                grid=pos[..., (1, 0)], # y, x -> x, y
                mode='bilinear', align_corners=True) # B * g, Cg, Hg, Wg

        x_sampled = x_sampled.reshape(B, C, 1, n_sample)

        q = q.reshape(B * self.n_heads, self.n_head_channels, H * W)
        k = self.proj_k(x_sampled).reshape(B * self.n_heads, self.n_head_channels, n_sample)
        v = self.proj_v(x_sampled).reshape(B * self.n_heads, self.n_head_channels, n_sample)

        attn = torch.einsum('b c m, b c n -> b m n', q, k) # B * h, HW, Ns
        attn = attn.mul(self.scale)

        # Only the dwc_pe branch produces a residual term. Tracking it explicitly
        # avoids referencing an unassigned name when use_pe and dwc_pe are set
        # but no_off disables the positional term (rpe_table is None then).
        residual_lepe = None

        if self.use_pe and (not self.no_off):

            if self.dwc_pe:
                residual_lepe = self.rpe_table(q.reshape(B, C, H, W)).reshape(B * self.n_heads, self.n_head_channels, H * W)
            elif self.fixed_pe:
                rpe_table = self.rpe_table
                attn_bias = rpe_table[None, ...].expand(B, -1, -1, -1)
                attn = attn + attn_bias.reshape(B * self.n_heads, H * W, n_sample)
            elif self.log_cpb:
                q_grid = self._get_q_grid(H, W, B, dtype, device)
                displacement = (q_grid.reshape(B * self.n_groups, H * W, 2).unsqueeze(2) - pos.reshape(B * self.n_groups, n_sample, 2).unsqueeze(1)).mul(4.0) # d_y, d_x [-8, +8]
                displacement = torch.sign(displacement) * torch.log2(torch.abs(displacement) + 1.0) / np.log2(8.0)
                attn_bias = self.rpe_table(displacement) # B * g, H * W, n_sample, h_g
                attn = attn + einops.rearrange(attn_bias, 'b m n h -> (b h) m n', h=self.n_group_heads)
            else:
                rpe_table = self.rpe_table
                rpe_bias = rpe_table[None, ...].expand(B, -1, -1, -1)
                q_grid = self._get_q_grid(H, W, B, dtype, device)
                displacement = (q_grid.reshape(B * self.n_groups, H * W, 2).unsqueeze(2) - pos.reshape(B * self.n_groups, n_sample, 2).unsqueeze(1)).mul(0.5)
                attn_bias = F.grid_sample(
                    input=einops.rearrange(rpe_bias, 'b (g c) h w -> (b g) c h w', c=self.n_group_heads, g=self.n_groups),
                    grid=displacement[..., (1, 0)],
                    mode='bilinear', align_corners=True) # B * g, h_g, HW, Ns

                attn_bias = attn_bias.reshape(B * self.n_heads, H * W, n_sample)
                attn = attn + attn_bias

        attn = F.softmax(attn, dim=2)
        attn = self.attn_drop(attn)

        out = torch.einsum('b m n, b c n -> b c m', attn, v)

        if residual_lepe is not None:
            out = out + residual_lepe
        out = out.reshape(B, C, H, W)

        y = self.proj_drop(self.proj_out(out))

        return y
class EMA(nn.Module):
    """Efficient multi-scale attention over channel groups.

    The channel axis is folded into the batch axis in ``factor`` groups. A 1x1
    branch (directional pooling + GroupNorm) and a 3x3 branch are cross-weighted
    through softmax-normalised global descriptors to produce one spatial gate
    per group.

    Args:
        channels: number of input channels
        factor: number of groups the channels are split into
    """

    def __init__(self, channels, factor=8):
        super(EMA, self).__init__()
        self.groups = factor
        group_channels = channels // self.groups
        assert group_channels > 0
        self.softmax = nn.Softmax(-1)
        self.agp = nn.AdaptiveAvgPool2d((1, 1))
        self.pool_h = nn.AdaptiveAvgPool2d((None, 1))
        self.pool_w = nn.AdaptiveAvgPool2d((1, None))
        self.gn = nn.GroupNorm(group_channels, group_channels)
        self.conv1x1 = nn.Conv2d(group_channels, group_channels, kernel_size=1, stride=1, padding=0)
        self.conv3x3 = nn.Conv2d(group_channels, group_channels, kernel_size=3, stride=1, padding=1)

    def forward(self, x):
        b, c, h, w = x.size()
        bg = b * self.groups
        group_channels = c // self.groups
        group_x = x.reshape(bg, -1, h, w)  # b*g,c//g,h,w

        # 1x1 branch: directional pooling -> shared conv -> per-axis gates
        strip_h = self.pool_h(group_x)
        strip_w = self.pool_w(group_x).transpose(2, 3)
        mixed = self.conv1x1(torch.cat([strip_h, strip_w], dim=2))
        gate_h, gate_w = torch.split(mixed, [h, w], dim=2)
        x1 = self.gn(group_x * gate_h.sigmoid() * gate_w.transpose(2, 3).sigmoid())

        # 3x3 branch
        x2 = self.conv3x3(group_x)

        # cross-weighting: each branch's global descriptor attends to the other branch
        desc1 = self.softmax(self.agp(x1).reshape(bg, -1, 1).transpose(1, 2))
        flat2 = x2.reshape(bg, group_channels, -1)  # b*g, c//g, hw
        desc2 = self.softmax(self.agp(x2).reshape(bg, -1, 1).transpose(1, 2))
        flat1 = x1.reshape(bg, group_channels, -1)  # b*g, c//g, hw

        weights = (torch.matmul(desc1, flat2) + torch.matmul(desc2, flat1)).reshape(bg, 1, h, w)
        return (group_x * weights.sigmoid()).reshape(b, c, h, w)
class GlobalContext(nn.Module):
    """Global context block.

    A per-image context vector is computed either by softmax-weighted spatial
    pooling (``use_attn``) or by plain mean pooling, then used to rescale the
    input (``fuse_scale``) and/or added to it (``fuse_add``) after a bottleneck MLP.

    Args:
        channels: number of input channels
        use_attn: pool with learned spatial attention instead of a plain mean
        fuse_add: add the transformed context to the input
        fuse_scale: gate the input with the transformed context
        init_last_zero: stored on the module; not read by reset_parameters
        rd_ratio, rd_channels, rd_divisor: bottleneck width controls; when
            rd_channels is None it is derived from channels * rd_ratio
        act_layer: activation used inside the MLPs
        gate_layer: activation used for the scale gate
    """

    def __init__(self, channels, use_attn=True, fuse_add=False, fuse_scale=True, init_last_zero=False,
                 rd_ratio=1./8, rd_channels=None, rd_divisor=1, act_layer=nn.ReLU, gate_layer='sigmoid'):
        super(GlobalContext, self).__init__()
        act_layer = get_act_layer(act_layer)

        self.conv_attn = None
        if use_attn:
            self.conv_attn = nn.Conv2d(channels, 1, kernel_size=1, bias=True)

        if rd_channels is None:
            rd_channels = make_divisible(channels * rd_ratio, rd_divisor, round_limit=0.)

        # additive branch is created before the scale branch
        self.mlp_add = (
            ConvMlp(channels, rd_channels, act_layer=act_layer, norm_layer=LayerNorm2d)
            if fuse_add else None
        )
        self.mlp_scale = (
            ConvMlp(channels, rd_channels, act_layer=act_layer, norm_layer=LayerNorm2d)
            if fuse_scale else None
        )

        self.gate = create_act_layer(gate_layer)
        self.init_last_zero = init_last_zero
        self.reset_parameters()

    def reset_parameters(self):
        """Re-initialise the attention conv and zero the last layer of the additive MLP."""
        attn_conv = self.conv_attn
        if attn_conv is not None:
            nn.init.kaiming_normal_(attn_conv.weight, mode='fan_in', nonlinearity='relu')
        add_branch = self.mlp_add
        if add_branch is not None:
            nn.init.zeros_(add_branch.fc2.weight)

    def forward(self, x):
        B, C, H, W = x.shape

        if self.conv_attn is None:
            context = x.mean(dim=(2, 3), keepdim=True)
        else:
            weights = self.conv_attn(x).reshape(B, 1, H * W)  # (B, 1, H * W)
            weights = F.softmax(weights, dim=-1).unsqueeze(3)  # (B, 1, H * W, 1)
            context = (x.reshape(B, C, H * W).unsqueeze(1) @ weights).view(B, C, 1, 1)

        if self.mlp_scale is not None:
            x = x * self.gate(self.mlp_scale(context))
        if self.mlp_add is not None:
            x = x + self.mlp_add(context)

        return x
= extent if extra_params: self.gather = nn.Sequential() if extent == 0: assert feat_size is not None, 'spatial feature size must be specified for global extent w/ params' self.gather.add_module( 'conv1', create_conv2d(channels, channels, kernel_size=feat_size, stride=1, depthwise=True)) if norm_layer: self.gather.add_module(f'norm1', nn.BatchNorm2d(channels)) else: assert extent % 2 == 0 num_conv = int(math.log2(extent)) for i in range(num_conv): self.gather.add_module( f'conv{i + 1}', create_conv2d(channels, channels, kernel_size=3, stride=2, depthwise=True)) if norm_layer: self.gather.add_module(f'norm{i + 1}', nn.BatchNorm2d(channels)) if i != num_conv - 1: self.gather.add_module(f'act{i + 1}', act_layer(inplace=True)) else: self.gather = None if self.extent == 0: self.gk = 0 self.gs = 0 else: assert extent % 2 == 0 self.gk = self.extent * 2 - 1 self.gs = self.extent if not rd_channels: rd_channels = make_divisible(channels * rd_ratio, rd_divisor, round_limit=0.) self.mlp = ConvMlp(channels, rd_channels, act_layer=act_layer) if use_mlp else nn.Identity() self.gate = create_act_layer(gate_layer) def forward(self, x): size = x.shape[-2:] if self.gather is not None: x_ge = self.gather(x) else: if self.extent == 0: # global extent x_ge = x.mean(dim=(2, 3), keepdims=True) if self.add_maxpool: # experimental codepath, may remove or change x_ge = 0.5 * x_ge + 0.5 * x.amax((2, 3), keepdim=True) else: x_ge = F.avg_pool2d( x, kernel_size=self.gk, stride=self.gs, padding=self.gk // 2, count_include_pad=False) if self.add_maxpool: # experimental codepath, may remove or change x_ge = 0.5 * x_ge + 0.5 * F.max_pool2d(x, kernel_size=self.gk, stride=self.gs, padding=self.gk // 2) x_ge = self.mlp(x_ge) if x_ge.shape[-1] != 1 or x_ge.shape[-2] != 1: x_ge = F.interpolate(x_ge, size=size) return x * self.gate(x_ge) if __name__ == '__main__': input=torch.randn(50,512,7,7) GE = GatherExcite(512) output=GE(input) print(output.shape) ================================================ 
class LSKA(nn.Module):
    """Large-Separable-Kernel-Attention.

    https://github.com/StevenLauHKHK/Large-Separable-Kernel-Attention/tree/main

    A large depthwise kernel is approximated by four 1-D depthwise
    convolutions (a local horizontal/vertical pair followed by a dilated
    horizontal/vertical pair) and a 1x1 convolution.  The result is used as
    an element-wise attention map on the input.

    Args:
        dim: number of input/output channels (also the group count, so every
            convolution except ``conv1`` is depthwise).
        k_size: nominal large-kernel size; must be one of the keys of
            ``_KERNEL_CONFIGS``.

    Raises:
        ValueError: if ``k_size`` is not a supported size.  Without this
            check the module would be built with no convolution layers and
            only fail later, inside ``forward``, with an ``AttributeError``.
    """

    # k_size -> (local kernel length, dilated kernel length, dilation)
    _KERNEL_CONFIGS = {
        7: (3, 3, 2),
        11: (3, 5, 2),
        23: (5, 7, 3),
        35: (5, 11, 3),
        41: (5, 13, 3),
        53: (5, 17, 3),
    }

    def __init__(self, dim, k_size=7):
        super().__init__()
        try:
            k_local, k_dilated, dilation = self._KERNEL_CONFIGS[k_size]
        except KeyError:
            supported = sorted(self._KERNEL_CONFIGS)
            raise ValueError(
                f'unsupported k_size {k_size!r}; expected one of {supported}') from None
        self.k_size = k_size

        # "Same" padding for odd kernels: (k - 1) // 2, scaled by the dilation.
        pad_local = (k_local - 1) // 2
        pad_dilated = dilation * (k_dilated - 1) // 2

        self.conv0h = nn.Conv2d(dim, dim, kernel_size=(1, k_local), stride=(1, 1),
                                padding=(0, pad_local), groups=dim)
        self.conv0v = nn.Conv2d(dim, dim, kernel_size=(k_local, 1), stride=(1, 1),
                                padding=(pad_local, 0), groups=dim)
        self.conv_spatial_h = nn.Conv2d(dim, dim, kernel_size=(1, k_dilated), stride=(1, 1),
                                        padding=(0, pad_dilated), groups=dim, dilation=dilation)
        self.conv_spatial_v = nn.Conv2d(dim, dim, kernel_size=(k_dilated, 1), stride=(1, 1),
                                        padding=(pad_dilated, 0), groups=dim, dilation=dilation)
        self.conv1 = nn.Conv2d(dim, dim, 1)

    def forward(self, x):
        """Return ``x`` multiplied element-wise by its attention map."""
        attn = self.conv0h(x)
        attn = self.conv0v(attn)
        attn = self.conv_spatial_h(attn)
        attn = self.conv_spatial_v(attn)
        attn = self.conv1(attn)
        # None of the convolutions modifies x in place, so no copy is needed.
        return x * attn
class MHSA(nn.Module):
    """Multi-head self-attention over the flattened spatial positions of a feature map.

    Args:
        n_dims: number of input/output channels.
        width, height: sizes used to build the learned position embeddings
            (only relevant when ``pos_emb=True``).
        heads: number of attention heads; channels are split evenly across them.
        pos_emb: add a learned content-position term to the attention logits.
    """

    def __init__(self, n_dims, width=14, height=14, heads=4, pos_emb=False):
        super().__init__()
        self.heads = heads

        # 1x1 convolutions produce the per-position query/key/value vectors.
        self.query = nn.Conv2d(n_dims, n_dims, kernel_size=1)
        self.key = nn.Conv2d(n_dims, n_dims, kernel_size=1)
        self.value = nn.Conv2d(n_dims, n_dims, kernel_size=1)

        self.pos = pos_emb
        if pos_emb:
            head_dim = n_dims // heads
            self.rel_h_weight = nn.Parameter(
                torch.randn([1, heads, head_dim, 1, int(height)]), requires_grad=True)
            self.rel_w_weight = nn.Parameter(
                torch.randn([1, heads, head_dim, int(width), 1]), requires_grad=True)
        self.softmax = nn.Softmax(dim=-1)

    def forward(self, x):
        """Return the attended feature map, same shape as ``x``."""
        n_batch, C, width, height = x.size()
        head_dim = C // self.heads

        # (batch, heads, head_dim, positions) for each projection.
        q = self.query(x).view(n_batch, self.heads, head_dim, -1)
        k = self.key(x).view(n_batch, self.heads, head_dim, -1)
        v = self.value(x).view(n_batch, self.heads, head_dim, -1)

        # (batch, heads, positions, positions)
        energy = q.transpose(-2, -1) @ k

        if self.pos:
            rel = (self.rel_h_weight + self.rel_w_weight).view(1, self.heads, head_dim, -1)
            content_position = rel.transpose(-2, -1) @ q
            if content_position.shape != energy.shape:
                # Crop the embedding rows to the number of positions in x.
                content_position = content_position[:, :, :energy.size(2)]
            assert (energy.shape == content_position.shape)
            energy = energy + content_position

        attention = self.softmax(energy)
        out = v @ attention.transpose(-2, -1)
        return out.view(n_batch, C, width, height)
class PreNorm(nn.Module):
    """Apply LayerNorm over the last dimension, then call the wrapped callable."""

    def __init__(self, dim, fn):
        super().__init__()
        self.ln = nn.LayerNorm(dim)
        self.fn = fn

    def forward(self, x, **kwargs):
        """Normalize ``x`` and forward it (plus any keyword arguments) to ``fn``."""
        normed = self.ln(x)
        return self.fn(normed, **kwargs)


class FeedForward(nn.Module):
    """Two-layer MLP (Linear -> SiLU -> Dropout -> Linear -> Dropout)."""

    def __init__(self, dim, mlp_dim, dropout):
        super().__init__()
        layers = [
            nn.Linear(dim, mlp_dim),
            nn.SiLU(),
            nn.Dropout(dropout),
            nn.Linear(mlp_dim, dim),
            nn.Dropout(dropout),
        ]
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        """Run ``x`` through the MLP."""
        return self.net(x)


class Attention(nn.Module):
    """Multi-head scaled dot-product self-attention over inputs laid out as ``b p n dim``.

    The output projection is skipped (identity) only for a single head whose
    width equals ``dim``.
    """

    def __init__(self, dim, heads, head_dim, dropout):
        super().__init__()
        inner_dim = heads * head_dim

        self.heads = heads
        self.scale = head_dim ** -0.5

        self.attend = nn.Softmax(dim=-1)
        # A single bias-free projection yields q, k and v concatenated.
        self.to_qkv = nn.Linear(dim, inner_dim * 3, bias=False)

        if heads == 1 and head_dim == dim:
            self.to_out = nn.Identity()
        else:
            self.to_out = nn.Sequential(
                nn.Linear(inner_dim, dim),
                nn.Dropout(dropout),
            )

    def forward(self, x):
        """Return the attended tokens, projected back to ``dim`` features."""
        q, k, v = (
            rearrange(part, 'b p n (h d) -> b p h n d', h=self.heads)
            for part in self.to_qkv(x).chunk(3, dim=-1)
        )
        scores = torch.matmul(q, k.transpose(-1, -2)) * self.scale
        weights = self.attend(scores)
        merged = rearrange(torch.matmul(weights, v), 'b p h n d -> b p n (h d)')
        return self.to_out(merged)
class ParNetAttention(nn.Module):
    """ParNet block: the sum of a 1x1 branch, a 3x3 branch and a squeeze-excite
    style gated branch, followed by SiLU.

    Args:
        channel: number of input/output channels.
    """

    def __init__(self, channel=512):
        super().__init__()
        # Gated branch: global average pool -> 1x1 conv -> sigmoid gate.
        self.sse = nn.Sequential(
            nn.AdaptiveAvgPool2d(1),
            nn.Conv2d(channel, channel, kernel_size=1),
            nn.Sigmoid(),
        )
        self.conv1x1 = nn.Sequential(
            nn.Conv2d(channel, channel, kernel_size=1),
            nn.BatchNorm2d(channel),
        )
        self.conv3x3 = nn.Sequential(
            nn.Conv2d(channel, channel, kernel_size=3, padding=1),
            nn.BatchNorm2d(channel),
        )
        self.silu = nn.SiLU()

    def forward(self, x):
        """Fuse the three branches; output has the same shape as ``x``."""
        # The unpacking is kept so that non-4-D inputs are rejected here.
        batch, channels, _, _ = x.size()
        pointwise = self.conv1x1(x)
        local = self.conv3x3(x)
        gated = self.sse(x) * x
        fused = pointwise + local + gated
        return self.silu(fused)
def spatial_shift1(x):
    """Shift the four channel quarters of ``x`` by one step, in place.

    ``x`` is unpacked as ``(b, w, h, c)``.  The first channel quarter moves
    forward along dim 1, the second backward along dim 1, the third forward
    along dim 2 and the last backward along dim 2.  The border row/column
    that has no source keeps its original values.

    Returns:
        The same tensor object ``x`` (modified in place).
    """
    b, w, h, c = x.size()
    # Source and destination slices overlap in memory.  PyTorch does not
    # define the result of such a copy, so each source is cloned first to
    # guarantee that only original values are read.
    x[:, 1:, :, :c // 4] = x[:, :w - 1, :, :c // 4].clone()
    x[:, :w - 1, :, c // 4:c // 2] = x[:, 1:, :, c // 4:c // 2].clone()
    x[:, :, 1:, c // 2:c * 3 // 4] = x[:, :, :h - 1, c // 2:c * 3 // 4].clone()
    x[:, :, :h - 1, 3 * c // 4:] = x[:, :, 1:, 3 * c // 4:].clone()
    return x


def spatial_shift2(x):
    """Like :func:`spatial_shift1`, with the roles of dims 1 and 2 swapped.

    The first two channel quarters shift along dim 2 (forward, backward) and
    the last two along dim 1 (forward, backward).  Operates in place and
    returns the same tensor object.
    """
    b, w, h, c = x.size()
    # Sources are cloned for the same overlapping-memory reason as above.
    x[:, :, 1:, :c // 4] = x[:, :, :h - 1, :c // 4].clone()
    x[:, :, :h - 1, c // 4:c // 2] = x[:, :, 1:, c // 4:c // 2].clone()
    x[:, 1:, :, c // 2:c * 3 // 4] = x[:, :w - 1, :, c // 2:c * 3 // 4].clone()
    x[:, :w - 1, :, 3 * c // 4:] = x[:, 1:, :, 3 * c // 4:].clone()
    return x


class SplitAttention(nn.Module):
    """Fuse ``k`` stacked feature maps with per-channel softmax weights.

    Args:
        channel: feature width ``c`` of the stacked maps.
        k: number of stacked maps that will be passed to ``forward``.
    """

    def __init__(self, channel=512, k=3):
        super().__init__()
        self.channel = channel
        self.k = k
        self.mlp1 = nn.Linear(channel, channel, bias=False)
        self.gelu = nn.GELU()
        self.mlp2 = nn.Linear(channel, channel * k, bias=False)
        self.softmax = nn.Softmax(1)

    def forward(self, x_all):
        """Weight and sum the maps of ``x_all`` (``b, k, h, w, c``) into ``b, h, w, c``."""
        b, k, h, w, c = x_all.shape
        x_all = x_all.reshape(b, k, -1, c)  # bs,k,n,c
        a = torch.sum(torch.sum(x_all, 1), 1)  # bs,c
        hat_a = self.mlp2(self.gelu(self.mlp1(a)))  # bs,kc
        hat_a = hat_a.reshape(b, self.k, c)  # bs,k,c
        bar_a = self.softmax(hat_a)  # bs,k,c (softmax across the k maps)
        attention = bar_a.unsqueeze(-2)  # bs,k,1,c
        out = attention * x_all  # bs,k,n,c
        out = torch.sum(out, 1).reshape(b, h, w, c)
        return out


class S2Attention(nn.Module):
    """S2-MLPv2 style attention: expand channels 3x, spatially shift two of
    the three parts, fuse them with :class:`SplitAttention`, then project.

    Args:
        channels: number of input/output channels.
    """

    def __init__(self, channels=512):
        super().__init__()
        self.mlp1 = nn.Linear(channels, channels * 3)
        self.mlp2 = nn.Linear(channels, channels)
        # The split attention must use the same width as this module; the
        # previous default (512) broke every other value of ``channels``.
        self.split_attention = SplitAttention(channels)

    def forward(self, x):
        """Map ``x`` of shape ``(b, c, w, h)`` to a tensor of the same shape."""
        b, c, w, h = x.size()
        x = x.permute(0, 2, 3, 1)  # channels last for the linear layers
        x = self.mlp1(x)
        # The shift helpers write into these channel slices of x in place.
        x1 = spatial_shift1(x[:, :, :, :c])
        x2 = spatial_shift2(x[:, :, :, c:c * 2])
        x3 = x[:, :, :, c * 2:]
        x_all = torch.stack([x1, x2, x3], 1)
        a = self.split_attention(x_all)
        x = self.mlp2(a)
        x = x.permute(0, 3, 1, 2)
        return x
class SKAttention(nn.Module):
    """Selective-Kernel attention: run parallel convolutions with different
    kernel sizes and blend them with per-channel softmax weights.

    Args:
        channel: number of input/output channels.
        kernels: kernel size of each parallel branch.
        reduction: channel reduction ratio for the bottleneck.
        group: ``groups`` argument of every branch convolution.
        L: lower bound on the bottleneck width.
    """

    def __init__(self, channel=512, kernels=[1, 3, 5, 7], reduction=16, group=1, L=32):
        super().__init__()
        self.d = max(L, channel // reduction)

        def make_branch(k):
            # conv -> bn -> relu, padded so the spatial size is preserved.
            return nn.Sequential(OrderedDict([
                ('conv', nn.Conv2d(channel, channel, kernel_size=k, padding=k // 2, groups=group)),
                ('bn', nn.BatchNorm2d(channel)),
                ('relu', nn.ReLU()),
            ]))

        self.convs = nn.ModuleList([make_branch(k) for k in kernels])
        self.fc = nn.Linear(channel, self.d)
        # One expansion layer per branch.
        self.fcs = nn.ModuleList([nn.Linear(self.d, channel) for _ in kernels])
        self.softmax = nn.Softmax(dim=0)

    def forward(self, x):
        """Return the branch outputs blended by the learned selection weights."""
        bs, c, _, _ = x.size()

        # split: one feature map per kernel size
        branch_outs = [branch(x) for branch in self.convs]
        feats = torch.stack(branch_outs, 0)  # k,bs,channel,h,w

        # fuse: sum the branches and squeeze the spatial dims
        fused = sum(branch_outs)  # bs,c,h,w
        squeezed = fused.mean(-1).mean(-1)  # bs,c
        compact = self.fc(squeezed)  # bs,d

        # per-branch logits, normalised across branches (dim 0)
        logits = torch.stack(
            [expand(compact).view(bs, c, 1, 1) for expand in self.fcs], 0)  # k,bs,channel,1,1
        selection = self.softmax(logits)

        return (selection * feats).sum(0)
class ShuffleAttention(nn.Module):
    """Shuffle attention: split channels into ``G`` groups, apply channel
    attention to one half of each group and spatial attention to the other,
    then shuffle the channels.

    Args:
        channel: number of input/output channels.
        reduction: accepted but not used by this implementation.
        G: number of groups.
    """

    def __init__(self, channel=512, reduction=16, G=8):
        super().__init__()
        self.G = G
        self.channel = channel
        self.avg_pool = nn.AdaptiveAvgPool2d(1)

        half = channel // (2 * G)  # channels in each half of a group
        self.gn = nn.GroupNorm(half, half)
        # Scales start at zero and biases at one.
        self.cweight = Parameter(torch.zeros(1, half, 1, 1))
        self.cbias = Parameter(torch.ones(1, half, 1, 1))
        self.sweight = Parameter(torch.zeros(1, half, 1, 1))
        self.sbias = Parameter(torch.ones(1, half, 1, 1))
        self.sigmoid = nn.Sigmoid()

    def init_weights(self):
        """Re-initialise any Conv2d / BatchNorm2d / Linear sub-modules."""
        for module in self.modules():
            if isinstance(module, nn.Conv2d):
                init.kaiming_normal_(module.weight, mode='fan_out')
                if module.bias is not None:
                    init.constant_(module.bias, 0)
                continue
            if isinstance(module, nn.BatchNorm2d):
                init.constant_(module.weight, 1)
                init.constant_(module.bias, 0)
                continue
            if isinstance(module, nn.Linear):
                init.normal_(module.weight, std=0.001)
                if module.bias is not None:
                    init.constant_(module.bias, 0)

    @staticmethod
    def channel_shuffle(x, groups):
        """Interleave the channels of ``groups`` consecutive channel groups."""
        b, c, h, w = x.shape
        grouped = x.reshape(b, groups, -1, h, w)
        swapped = grouped.permute(0, 2, 1, 3, 4)
        return swapped.reshape(b, -1, h, w)

    def forward(self, x):
        """Return the attended and channel-shuffled feature map (same shape as ``x``)."""
        b, c, h, w = x.size()

        # Fold the groups into the batch dim, then split each group in two.
        grouped = x.view(b * self.G, -1, h, w)  # bs*G,c//G,h,w
        x_0, x_1 = grouped.chunk(2, dim=1)  # bs*G,c//(2*G),h,w

        # channel attention on the first half
        channel_gate = self.sigmoid(self.cweight * self.avg_pool(x_0) + self.cbias)
        x_channel = x_0 * channel_gate

        # spatial attention on the second half
        spatial_gate = self.sigmoid(self.sweight * self.gn(x_1) + self.sbias)
        x_spatial = x_1 * spatial_gate

        # Re-join the halves and unfold the groups from the batch dim.
        out = torch.cat([x_channel, x_spatial], dim=1)  # bs*G,c//G,h,w
        out = out.contiguous().view(b, -1, h, w)

        return self.channel_shuffle(out, 2)
class ZPool(nn.Module):
    """Reduce dim 1 to two channels: its maximum and its mean."""

    def forward(self, x):
        peak = torch.max(x, 1)[0].unsqueeze(1)
        average = torch.mean(x, 1).unsqueeze(1)
        return torch.cat((peak, average), dim=1)


class AttentionGate(nn.Module):
    """Gate the input with a sigmoid map computed from its ZPool summary."""

    def __init__(self):
        super().__init__()
        kernel_size = 7
        self.compress = ZPool()
        # Padding keeps the spatial size; no ReLU so the sigmoid sees signed values.
        self.conv = BasicConv(2, 1, kernel_size, stride=1, padding=(kernel_size - 1) // 2, relu=False)

    def forward(self, x):
        """Return ``x`` multiplied by its single-channel attention map."""
        logits = self.conv(self.compress(x))
        scale = logits.sigmoid_()  # in place, like the original torch.sigmoid_
        return x * scale


class TripletAttention(nn.Module):
    """Average of attention gates applied with dim 1 swapped with dim 2,
    swapped with dim 3, and (unless ``no_spatial``) left in place.

    Args:
        no_spatial: when true, skip the gate applied to the unpermuted input.
    """

    def __init__(self, no_spatial=False):
        super().__init__()
        self.cw = AttentionGate()
        self.hc = AttentionGate()
        self.no_spatial = no_spatial
        if not no_spatial:
            self.hw = AttentionGate()

    @staticmethod
    def _gate_with_swapped_dim(gate, x, dim):
        """Swap dim 1 with ``dim``, apply ``gate``, then swap back."""
        rotated = x.transpose(1, dim).contiguous()
        return gate(rotated).transpose(1, dim).contiguous()

    def forward(self, x):
        """Return the averaged branch outputs; same shape as ``x``."""
        branch_cw = self._gate_with_swapped_dim(self.cw, x, 2)
        branch_hc = self._gate_with_swapped_dim(self.hc, x, 3)
        if self.no_spatial:
            return 1 / 2 * (branch_cw + branch_hc)
        branch_hw = self.hw(x)
        return 1 / 3 * (branch_hw + branch_cw + branch_hc)
目前代码格式主要用于yolov3,yolov5,yolov7,yolov8. # Supports | name | need_channel | paper | | :----:| :----: | :----: | | BAM | True | https://arxiv.org/pdf/1807.06514.pdf | | CBAM | True | https://openaccess.thecvf.com/content_ECCV_2018/papers/Sanghyun_Woo_Convolutional_Block_Attention_ECCV_2018_paper.pdf | | SE | True | https://arxiv.org/abs/1709.01507 | | CoTAttention | True | https://arxiv.org/abs/2107.12292 | | MobileViTAttention | True | https://arxiv.org/abs/2110.02178 | | SimAM | False | http://proceedings.mlr.press/v139/yang21o/yang21o.pdf | | SK | True | https://arxiv.org/pdf/1903.06586.pdf | | ShuffleAttention | True | https://arxiv.org/pdf/2102.00240.pdf | | S2Attention | True | https://arxiv.org/abs/2108.01072 | | TripletAttention | False | https://arxiv.org/abs/2010.03045 | | ECA | True | https://arxiv.org/pdf/1910.03151.pdf | | ParNetAttention | True | https://arxiv.org/abs/2110.07641 | | CoordAttention | True | https://arxiv.org/abs/2103.02907 | | MHSA
Multi-Head-Self-Attention | True | https://wuch15.github.io/paper/EMNLP2019-NRMS.pdf | | SGE | False | https://arxiv.org/pdf/1905.09646.pdf | | A2Attention | True | https://arxiv.org/pdf/1810.11579.pdf | | GC
Global Context Attention | True | https://arxiv.org/abs/1904.11492 | | EffectiveSE
Effective Squeeze-Excitation | True | https://arxiv.org/abs/1911.06667 | | GE
Gather-Excite Attention | True | https://arxiv.org/abs/1810.12348 | | CrissCrossAttention | True | https://arxiv.org/abs/1811.11721 | | Polarized Self-Attention | True | https://arxiv.org/abs/2107.00782 | | Sequential Self-Attention | True | https://arxiv.org/abs/2107.00782 | | GAM | True | https://arxiv.org/pdf/2112.05561v1.pdf | | Biformer | True | https://arxiv.org/abs/2303.08810 | | EMA | True | https://arxiv.org/abs/2305.13563v2 | | CloAttention | True | https://arxiv.org/abs/2303.17803 | | LSKBlock | True | https://arxiv.org/pdf/2303.09030.pdf | | MLCA | True | https://www.sciencedirect.com/science/article/pii/S0952197623006267 | | LSKA | True | https://arxiv.org/abs/2309.01439 | | DAttention | True | https://openaccess.thecvf.com/content/CVPR2022/html/Xia_Vision_Transformer_With_Deformable_Attention_CVPR_2022_paper.html | | ELA | True | https://arxiv.org/abs/2403.01123 | | CAA | True | https://arxiv.org/pdf/2403.06258 | | CPCA | True | https://arxiv.org/abs/2306.05196 | # Install 安装命令:pip install timm einops efficientnet_pytorch -i https://pypi.tuna.tsinghua.edu.cn/simple # Course 1. [yolov5添加注意力哔哩哔哩视频教学链接](https://www.bilibili.com/video/BV1s84y1775U) [yolov5添加注意力-补充事项-哔哩哔哩视频教学链接](https://www.bilibili.com/video/BV1hG4y1M71X) 2. [yolov7添加注意力哔哩哔哩视频教学链接](https://www.bilibili.com/video/BV1pd4y1H7BK) 3. 
[yolov8添加注意力哔哩哔哩视频教学链接](https://www.bilibili.com/video/BV1ZQ4y1J7oC/) [yolov8添加注意力进阶版哔哩哔哩视频教学链接](https://www.bilibili.com/video/BV1ZQ4y1J7oC/) # Reference https://github.com/xmu-xiaoma666/External-Attention-pytorch https://github.com/rwightman/pytorch-image-models https://github.com/rayleizhu/BiFormer https://github.com/XiaLiPKU/EMANet https://github.com/qhfan/CloFormer/tree/main https://github.com/zcablii/LSKNet https://github.com/wandahangFY/MLCA https://github.com/StevenLauHKHK/Large-Separable-Kernel-Attention https://github.com/LeapLabTHU/DAT https://github.com/NUST-Machine-Intelligence-Laboratory/PKINet https://github.com/Cuthbert-Huang/CPCANet ================================================ FILE: cvpr2025-deim-project.md ================================================ # 2025-SOTA目标检测模型项目(2026发论文必备项目) 鉴于目前YOLO系列模型反映的拒稿率越来越高且YOLO模型确实非常泛滥,无论是不是计算机专业、是不是小白都基本可以快速上手YOLO模型,导致计算机专业和有期刊级别要求的小伙伴日益难受,简单来说就是YOLO在学术界的红利已经基本吃透,目前开始越来越多人转CVPR2024-RTDETR,而且目前研究生毕业一年比一年难,不像以前随便结合点深度学习就可以毕业,就像越来越多人反馈,导师已经明确禁止不能用YOLO,再加上这么多年来YOLO对学术的灌水已经让审稿人出现视觉疲劳,带上了”有色”眼镜看待YOLO,所以结合以上众多原因,因此我们需要一个有一定上手难度且是顶会的模型来支撑我们后续的大小论文的工作。 PS:20250614版本更新后,本项目的dfine和cvpr2025-deimv1已经支持Ultralytics同款的配置文件形式,大大降低上手难度![B站介绍链接](https://www.bilibili.com/video/BV1Q4MHzXEdd/) ### 1. 这个项目包含什么模型? 这个项目的源代码来自:[DEIM](https://github.com/ShihuaHuang95/DEIM) 其内部可以跑以下模型(以下模型支持目标检测,DFine、DEIM支持实例分割,不支持姿态检测、旋转目标检测): 1. CVPR2025-DEIM 2. ICLR2025-DFine 3. RTDETRV2 4. DEIMV2 选择这个课程,这些模型都可以改进,不限于DEIM,这些都是顶会的模型,不要说2025,就算是2026、2027都不落后!还有一个重点就是像CVPR2024-RTDETR,最小的模型也有50GFLOPs,但是现在的DEIM和DFine都有像YOLO一样的Nano大小版本的模型,变相降低了训练成本和设备要求!(建议最低12G显存的显卡起步) ### 2. 这个项目会以什么形式开展? 1. 这个项目跟以往区别比较大,我们其他改进项目都是直接提供好修改好的代码,用户不需要懂代码的情况下也可以开始做实验,甚至可以做完实验,但是这样也有一个不好的点,就是会大幅度降低上手门槛,这特别对计算机专业的同学来说是非常不利的,因此这个项目在代码工程方面,这个项目我们会有教程教大家怎么去调试程序、修改代码、添加模块。 2. 这个项目会**不定时(直播时间到时候会群里进行通知,没有硬性规定多久一次,不方便看的会有录播)**有**直播**,详细直播内容请看第三大点。 3. 
这个项目会持续更新创新点,如果创新点是来源于现有的模型,还会提供对应的论文及其中文翻译版本(假设像FasterNet中的FasterBlock,会提供好对应的py文件、原论文及其中文翻译版本),用户可以根据从本课程学习到的缝合模块(代指第一点)去定制或者创新自己的网络。 4. 附带答疑群,答疑群主要答疑的内容是实验、代码操作、代码报错等相关问题(经过YOLO、RTDETR大量的经验,我没法保证每一个问题都能回复到大家,只能保证遇到过的问题会给大家提供建议和方向,当然群内的一些高频问题,我也会收集起来挑出部分出视频或者直播给大家进行解答)。 5. 如果后续有剪枝、蒸馏,不需要额外付费,本项目会包含在内,所以性价比真的非常高,YOLO改进剪枝蒸馏三件套也要200多了。 ### 3. 直播内容 1. 解答群内一些高频疑问,比如很多人都会遇到的报错、或者注意点。 2. 教大家如何去做二次创新(PS:这个不是口头给大家说怎么二次创新,而是从代码的层面带大家去实践二次创新。可能这里会有同学问,那自研创新呢?你会自研模块的前提是必须要懂如何二次创新,首先这是一个过程,然后我有很多自研模块是突然有的想法或者看论文看到某些结构与之前看到的论文联合后有新的想法,所以也很难描述我为什么就想到这个结构,大多数情况下,只需要会有一定复杂度的二次创新就足够,当然自研模块有机会我也会去讲) 3. 给大家从浅到深解说一些我认为比较经典的模块,提高自己能创新新模块的能力和基础,因为很多模块都是相通的,本质没有变,只是模块上的组合体替换。(有不少人私聊我说,能不能出些你是如何结合一些现有的模块去创新的,虽然现在B站上也有不少讲创新点的,但是他们的感觉就是从头到尾读一篇代码,我看了几次之后觉得我把代码扔给GPT给我打上注释的感觉是一样的,看的时候感觉哦哦哦这样,看完后就不知所然) ### 3. 入手本项目需要注意些什么? 1. 因为本项目完全不是像之前YOLO项目这样傻瓜式操作,所以本项目有一定难度,具有以下特征的小伙伴不建议入手。(看到这里可能有人会问,为什么不考虑把DEIM、DFine、RTDETRV2都移植到Ultralytics?因为这个不确定性太大,DETR类型的模型对参数非常敏感,可能有一点参数不合适,效果就会大打折扣,但是对于这种较为复杂的模型移植过程中又很难保证一比一全过程移植) - 未入门、100%纯小白(如果你有心学,这个不是问题) - 不太想花太多时间去学,搞这个只是想为了水个无要求的论文就行 - 没有任何解决问题的能力(如果你有心学,这个不是问题) - 从来不看使用文档、说明之类的(强烈不建议入手) - 此项目上手需要时间,如果想无脑直接跑就不合适购入 最后补充!如果你具有以上特征,但又要求期刊不能太水或者不能做yolo的问题,尽早入手CVPR2024-RTDETR吧,去年没抓上,今年不能再等了,模型红利可不等人。 2. 入手前可以先去B站看一下[CVPR025-DEIM合集里面的教程](https://space.bilibili.com/286900343/lists/4909499),最起码先跑通过DEIM原始模型,能跟着视频训练和测试,然后也把合集里面的基础课程都先看一下,为后面打好基础。 3. 我认为这个不是什么不可达到的事,就看你想不想毕业了,有志者事竟成。 PS:20250614版本更新后,本项目的dfine和deim已经支持Ultralytics同款的配置文件形式,大大降低上手难度![B站介绍链接](https://www.bilibili.com/video/BV1Q4MHzXEdd/) ### 4. 价格 1. 本项目价格为288,没有时效限制。(与其150、200买个YOLO纯模型改进专栏,不如288买个2025-SOTA专栏,最起码不用怕花了钱,最后做的YOLO还投不出去,还毕不了业) 2. 虚拟项目一经售出不退不换,需要入手前考虑清楚,如果你是初次入手我的项目,怕我不靠谱,可以先考虑入手个YOLO和RTDETR看下。 ### 5. 项目使用问题 1. 购买本项目的使用者都会得到一个独一无二的用于解压7z的密码,到时候用于解压对应的压缩包,此密码自己妥善保管,请勿告诉他人。 2. 本项目的视频和直播回放统一都是加密视频,每个购买者都可以得到一个激活码,激活码在每个人专属的7z压缩文件内。 ### 6. 项目更新公告 - 20250330 1. 初版项目发布. - 20250413 1. 新增多个改进模块并新增模块简介,位置在engine/extre_module/module_images内。 2. 新增训练和测试阶段的进度条显示。 3. 优化tensorboard中的精度名称显示。 4. 
优化输出,把重要信息换颜色显示。 5. 新增plot_train_batch_freq参数,用于控制间隔多少epoch保存第一个batch中的数据增强后的图像,默认为12。 6. 新增保存当前参数信息,会自动保存到output_dir中的args.json文件内。 7. 优化output_dir保存逻辑,当判断output_dir路径存在的时候,会自动在后缀加1,避免覆盖原先代码。 - 20250419 1. 新增verbose_type参数,用于控制使用默认还是进度条输出,默认为官方默认输出形式。 2. 新增thop计算模型计算量方式,避免calflops对于部分算子出现不支持报错的操作。 3. 完善每个模块的py文件,增加输出计算量和参数量等数值,方便用户后续调试。 4. 给DataLoader中添加pin_memory参数为True,可以在训练时候如果是数据加载成为瓶颈,可以提高速度。 5. 修复用户反馈的已知问题。 6. 新增多个改进模块。 - 20250429 1. 修复engine/extre_module/custom_nn/attention/SEAM.py模块,应该是MutilSEAM。 2. 新增一些进阶课程的视频。 3. 新增多个改进模块。 4. 修复用户反馈的已知问题。 5. 修复续训时候会新增一个保存路径的问题。 6. 修复多卡训练Stage2的时候会出现部分进程找不到权重文件的问题。 - 20250514 1. 新增一些进阶课程的视频。 2. 新增多个改进模块。 3. 修复用户反馈的已知问题。 - 20250526 1. 新增一些进阶课程的视频。 2. 新增多个改进模块。 3. 新增cache_ram参数,详细可以看userguide。 4. 修复在torch2.7.0下出现的NotImplementedError问题。 - 20250609 1. 修复新增了cache_ram功能后训练COCO数据集精度不正常的问题。 2. 修复在训练COCO数据集中数据增强的绘制BUG。 3. 新增多个改进模块。 4. 新增一些进阶课程的视频。 5. 修复用户反馈的已知问题。 - 20250614 1. 新增Ultralytics的配置文件方式,大大降低改进难度。 2. 新增一些进阶课程的视频。 3. 新增多个改进模块。 - 20250617 1. 修复配置文件中层序号有误的问题。 - 20250619 1. 修复配置文件中层序号有误的问题。 2. 新增多个改进模块。 3. 新增一些进阶课程的视频。 - 20250625 1. 修复best_stg2保存异常的问题。 2. 新增YOLOV13中的HyperACE模块。 3. 新增多个关于进阶课程的视频。 - 20250705 1. 新增多个改进模块。 2. 新增多个关于进阶课程的视频。 3. 新增20250704基础疑问解答直播回放链接。 - 20250714 1. 新增多个改进模块。 2. 新增多个关于进阶课程的视频。 3. 新增小目标检测网络架构专题一群课题直播回放。 - 20250726 1. 新增在test-only的状态下输出每个类别的'mAP', 'mAP_50', 'mAP_75', 'mAP_s', 'mAP_m', 'mAP_l'。 2. 新增多个改进模块。 3. 修复用户反馈的已知问题。 4. 新增一个JSON格式数据集脚本。(输出类别数和类别id、输出每个类别的实例数量) - 20250817 1. 新增支持蒸馏学习,蒸馏学习支持断点续训使用方法跟正常训练一样。 2. 蒸馏学习支持特征蒸馏、逻辑蒸馏、特征+逻辑蒸馏 这三种方式。 3. 无论是Ultralytics配置文件方式、还是原始的代码方式都支持相互蒸馏。 4. 蒸馏学习支持控制epoch,例如只有前50epoch进行蒸馏学习,后50epoch关闭蒸馏学习。 5. 更多细节请看关于<知识蒸馏教学视频>的进阶课程。 6. 支持输出YOLO指标(Precision、Recall、F1-Score、mAP50、mAP75、mAP50-95),详细请看userguide。 7. 新增多个改进模块。 8. 新增小目标检测网络架构专题二链接。 - 20250823 1. 修复YOLO指标在一些图片没真实标签的时候报错的bug。 2. 开放逻辑蒸馏,在项目内有对应的课程。 3. 新增多个改进模块。 4. 新增<知识蒸馏教学视频>的进阶课程。 - 20250907 1. 新增多个改进模块。 2. 修复蒸馏学习中教师信息输出错误的问题。 - 20250921 1. 新增导出脚本(export.py),支持导出onnx、tensorrt模型。 2. 
重构大部分输出,增加输出对应的时间、文件、函数、行数,以便用户快速定位。 3. 新增20250910直播回放链接。 4. 修复一些已知BUG。 5. 完善onnx、tensorrt模型推理脚本。 6. 支持在train.py test-only状态下中使用onnx、tensorrt模型进行验证。 7. 新增<模型导出>相关教程视频。 8. 新增多个改进模块。 9. 支持DINOV3(ConvNext、ViT)作为主干进行微调。<教程在百度云创新课题的第五点> - 20251012 1. 移植DEIMV2到本项目,暂只支持原始的代码修改方式。 2. 更新UserGuide。 3. 新增。 4. 修复一些已知问题。 - 20251025 1. 新增DQ-DETR的模块。 2. 新增多个改进模块。 3. 新增的相关教程视频。 4. 修复一些已知问题。 - 20251102 1. 新增的相关教程视频。 2. 修复一些已知问题。 - 20251115 1. 新增以DensityMap为主导的创新课程[DFINE with Density-aware Query Selection]。 2. 修复一些已知问题。 - 20251207 1. 新增在test-only状态下,yolo-metrice支持保存混淆矩阵。 2. 新增DFine、DEIM实例分割的实现,使用相关请看进阶教程实例分割部分。 3. 更新dataset/coco_analyzer.py脚本,支持输出数据集中更多的内容,以便分析数据集的特点。 4. 新增tools/visualization/tp_fp_fn_analysis.py脚本,用于分析检测结果中的tp、fp、fn。 5. 新增多个改进模块。 6. 修复一些已知问题。 7. 新增。 8. 新增基于ByteTrack的目标跟踪,教程请看进阶教程内的<目标跟踪ByteTrack的使用教程>。 - 20251213 1. 参考CVPR2022-MaskDINO重构实例分割检测头代码。 2. 修复在ram_cache状态下实例分割数据集部分存在的BUG。 3. 重新录制实例分割部分的进阶视频。 - 20251224 1. 新增多个改进模块。 2. 修复实例分割部分已知的问题。 3. 新增以DensityMap为主导的实例分割检测头内容[DFINESeg with Density-aware Query Selection]。 4. 新增[DFINESeg with Density-aware Query Selection]的使用视频教程。 5. 更新实例分割实现讲解。 - 20251226 1. 修复一些已知问题。 2. 新增基于COCO-Tiny指标,并支持输出每类COCO-Tiny指标,详细请看UserGuide.md中的<项目内yml一些额外参数说明>。 - 20260109 1. 修复一些已知问题。 2. 新增动态路由网络模块。 3. 更新视频链接。 - 20260128 1. 修复一些已知问题。 2. 新增多个改进模块。 3. 新增动态路由网络教程视频。 4. 新增的MSBlock和GQL的教程视频。 - 20260224 1. 修复一些已知问题。 2. 新增多个改进模块。 3. compile_module的编译模块支持50系显卡。 4. 为了兼容50系用户,新版的环境统一修改成torch2.8.0,旧版本的用户不影响。 - 20260310 1. 新增diou, ciou, eiou, siou, shapeiou, piou, piou2。 2. 支持TIMM中的主干进行训练。 3. DINOV3版本支持Ultralytics版本训练。 4. 新增AAAI2026-SPJFB模块。 5. 新增TGRS2025-GLSS2D模块。 6. 新增TIP2025-CAFM模块。 7. 新增TIP2025-DWM_MSA模块。 8. 新增DynamicERF模块。 9. 新增如何使用其他IOU的操作视频。 10. 新增TIMM主干的操作视频。 11. yolo_metrice参数从默认为False改为True,代表训练过程中YOLO和COCO指标都会一并输出。 ### 7. 目前已有的模块 - engine/extre_module/custom_nn/attention 1. engine/extre_module/custom_nn/attention/SEAM.py 2. CVPR2021|engine/extre_module/custom_nn/attention/ca.py 3. ICASSP2023|engine/extre_module/custom_nn/attention/ema.py 4. 
ICML2021|engine/extre_module/custom_nn/attention/simam.py 5. ICCV2023|engine/extre_module/custom_nn/attention/lsk.py 6. WACV2024|engine/extre_module/custom_nn/attention/DeformableLKA.py 7. engine/extre_module/custom_nn/attention/mlca.py 8. BIBM2024|engine/extre_module/custom_nn/attention/FSA.py 9. AAAI2025|engine/extre_module/custom_nn/attention/CDFA.py 10. engine/extre_module/custom_nn/attention/GLSA.py 11. TGRS2025|engine/extre_module/custom_nn/attention/MCA.py 12. CVPR2025|engine/extre_module/custom_nn/attention/CASAB.py 13. NN2025|engine/extre_module/custom_nn/attention/KSFA.py 14. TPAMI2025|engine/extre_module/custom_nn/attention/GQL.py 15. TGRS2025|engine/extre_module/custom_nn/attention/ACA.py 16. TGRS2025|engine/extre_module/custom_nn/attention/DHPF.py 17. TGRS2025|engine/extre_module/custom_nn/attention/ACAB.py - engine/extre_module/custom_nn/block 1. engine/extre_module/custom_nn/block/RepHMS.py 2. 自研模块|engine/extre_module/custom_nn/block/rgcspelan.py 3. TPAMI2025|engine/extre_module/custom_nn/block/MANet.py - engine/extre_module/custom_nn/conv_module 1. CVPR2021|engine/extre_module/custom_nn/conv_module/dbb.py 2. IEEETIP2024|engine/extre_module/custom_nn/conv_module/deconv.py 3. ICCV2023|engine/extre_module/custom_nn/conv_module/dynamic_snake_conv.py 4. CVPR2023|engine/extre_module/custom_nn/conv_module/pconv.py 5. AAAI2025|engine/extre_module/custom_nn/conv_module/psconv.py 6. CVPR2025|engine/extre_module/custom_nn/conv_module/ShiftwiseConv.py 7. engine/extre_module/custom_nn/conv_module/wdbb.py 8. engine/extre_module/custom_nn/conv_module/deepdbb.py 9. ECCV2024|engine/extre_module/custom_nn/conv_module/wtconv2d.py 10. CVPR2023|engine/extre_module/custom_nn/conv_module/ScConv.py 11. engine/extre_module/custom_nn/conv_module/dcnv2.py 12. CVPR2024|engine/extre_module/custom_nn/conv_module/DilatedReparamConv.py 13. engine/extre_module/custom_nn/conv_module/gConv.py 14. CVPR2024|engine/extre_module/custom_nn/conv_module/IDWC.py 15. 
engine/extre_module/custom_nn/conv_module/DSA.py 16. CVPR2025|engine/extre_module/custom_nn/conv_module/FDConv.py 17. CVPR2023|engine/extre_module/custom_nn/conv_module/dcnv3.py 18. CVPR2024|engine/extre_module/custom_nn/conv_module/dcnv4.py 19. CVPR2024|engine/extre_module/custom_nn/conv_module/DynamicConv.py 20. CVPR2024|engine/extre_module/custom_nn/conv_module/FADC.py 21. CVPR2023|engine/extre_module/custom_nn/conv_module/SMPConv.py 22. MIA2025|engine/extre_module/custom_nn/conv_module/FourierConv.py 23. CVPR2024|engine/extre_module/custom_nn/conv_module/SFSConv.py 24. ICCV2025|engine/extre_module/custom_nn/conv_module/MBRConv.py 25. ICCV2025|engine/extre_module/custom_nn/conv_module/ConvAttn.py 26. ICCV2025|engine/extre_module/custom_nn/conv_module/Converse2D.py 27. CVPR2025|engine/extre_module/custom_nn/conv_module/gcconv.py 28. ACCV2024|engine/extre_module/custom_nn/conv_module/RMBC.py - engine/extre_module/custom_nn/upsample 1. CVPR2024|engine/extre_module/custom_nn/upsample/eucb.py 2. CVPR2024|engine/extre_module/custom_nn/upsample/eucb_sc.py 3. engine/extre_module/custom_nn/upsample/WaveletUnPool.py 4. ICCV2019|engine/extre_module/custom_nn/upsample/CARAFE.py 5. ICCV2023|engine/extre_module/custom_nn/upsample/DySample.py 6. ICCV2025|engine/extre_module/custom_nn/upsample/Converse2D_Up.py 7. CVPR2025|engine/extre_module/custom_nn/upsample/DSUB.py - engine/extre_module/custom_nn/downsample 1. IEEETIP2020|engine/extre_module/custom_nn/downsample/gcnet.py 2. 自研模块|engine/extre_module/custom_nn/downsample/lawds.py 3. engine/extre_module/custom_nn/downsample/WaveletPool.py 4. engine/extre_module/custom_nn/downsample/ADown.py 5. engine/extre_module/custom_nn/downsample/YOLOV7Down.py 6. engine/extre_module/custom_nn/downsample/SPDConv.py 7. engine/extre_module/custom_nn/downsample/HWD.py 8. engine/extre_module/custom_nn/downsample/DRFD.py 9. TGRS2025|engine/extre_module/custom_nn/conv_module/FSConv.py - engine/extre_module/custom_nn/stem 1. 
engine/extre_module/custom_nn/stem/SRFD.py 2. engine/extre_module/custom_nn/stem/LoG.py 3. ICCV2023|engine/extre_module/custom_nn/stem/RepStem.py - engine/extre_module/custom_nn/featurefusion 1. 自研模块|engine/extre_module/custom_nn/featurefusion/cgfm.py 2. BMVC2024|engine/extre_module/custom_nn/featurefusion/msga.py 3. CVPR2024|engine/extre_module/custom_nn/featurefusion/mfm.py 4. IEEETIP2023|engine/extre_module/custom_nn/featurefusion/CSFCN.py 5. BIBM2024|engine/extre_module/custom_nn/featurefusion/mpca.py 6. ACMMM2024|engine/extre_module/custom_nn/featurefusion/wfu.py 7. CVPR2025|engine/extre_module/custom_nn/featurefusion/GDSAFusion.py 8. engine/extre_module/custom_nn/featurefusion/PST.py 9. TGRS2025|engine/extre_module/custom_nn/featurefusion/MSAM.py 10. INFFUS2025|engine/extre_module/custom_nn/featurefusion/DPCF.py 11. CVRP2025|engine/extre_module/custom_nn/featurefusion/LCA.py 12. TGRS2025|engine/extre_module/custom_nn/featurefusion/HFFE.py 13. TGRS2025|engine/extre_module/custom_nn/featurefusion/MFPM.py 14. TGRS2025|engine/extre_module/custom_nn/featurefusion/ERM.py 15. TIP2025|engine/extre_module/custom_nn/featurefusion/CAFM.py - engine/extre_module/custom_nn/module 1. AAAI2025|engine/extre_module/custom_nn/module/APBottleneck.py 2. CVPR2025|engine/extre_module/custom_nn/module/efficientVIM.py 3. CVPR2023|engine/extre_module/custom_nn/module/fasterblock.py 4. CVPR2024|engine/extre_module/custom_nn/module/starblock.py 5. engine/extre_module/custom_nn/module/DWR.py 6. CVPR2024|engine/extre_module/custom_nn/module/UniRepLKBlock.py 7. CVPR2025|engine/extre_module/custom_nn/module/mambaout.py 8. AAAI2024|engine/extre_module/custom_nn/module/DynamicFilter.py 9. engine/extre_module/custom_nn/module/StripBlock.py 10. TGRS2024|engine/extre_module/custom_nn/module/elgca.py 11. CVPR2024|engine/extre_module/custom_nn/module/LEGM.py 12. ICCV2023|engine/extre_module/custom_nn/module/iRMB.py 13. TPAMI2025|engine/extre_module/custom_nn/module/MSBlock.py 14. 
ICLR2024|engine/extre_module/custom_nn/module/FATBlock.py 15. CVPR2024|engine/extre_module/custom_nn/module/MSCB.py 16. engine/extre_module/custom_nn/module/LEGBlock.py 17. CVPR2025|engine/extre_module/custom_nn/module/RCB.py 18. ECCV2024|engine/extre_module/custom_nn/module/JDPM.py 19. CVPR2025|engine/extre_module/custom_nn/module/vHeat.py 20. CVPR2025|engine/extre_module/custom_nn/module/EBlock.py 21. CVPR2025|engine/extre_module/custom_nn/module/DBlock.py 22. ECCV2024|engine/extre_module/custom_nn/module/FMB.py 23. CVPR2024|engine/extre_module/custom_nn/module/IDWB.py 24. ECCV2022|engine/extre_module/custom_nn/module/LFE.py 25. AAAI2025|engine/extre_module/custom_nn/module/FCM.py 26. CVPR2024|engine/extre_module/custom_nn/module/RepViTBlock.py 27. CVPR2024|engine/extre_module/custom_nn/module/PKIModule.py 28. CVPR2024|engine/extre_module/custom_nn/module/camixer.py 29. ICCV2025|engine/extre_module/custom_nn/module/ESC.py 30. CVPR2025|engine/extre_module/custom_nn/module/nnWNet.py 31. TGRS2025|engine/extre_module/custom_nn/module/ARF.py 32. AAAI2024|engine/extre_module/custom_nn/module/CFBlock.py 33. IJCV2024|engine/extre_module/custom_nn/module/FMA.py 34. engine/extre_module/custom_nn/module/LWGA.py 35. TGRS2025|engine/extre_module/custom_nn/module/CSSC.py 36. TGRS2025|engine/extre_module/custom_nn/module/CNCM.py 37. ICCV2025|engine/extre_module/custom_nn/module/HFRB.py 38. ICIP2025|engine/extre_module/custom_nn/module/EVA.py 39. CVPR2025|engine/extre_module/custom_nn/module/IEL.py 40. MICCAI2023|engine/extre_module/custom_nn/module/MFEBlock.py 41. AAAI2026|engine/extre_module/custom_nn/module/PartialNetBlock.py 42. TGRS2025|engine/extre_module/custom_nn/module/DRG.py 43. engine/extre_module/custom_nn/module/Wave2D.py 44. TGRS2025|engine/extre_module/custom_nn/module/GLGM.py 45. TGRS2025|engine/extre_module/custom_nn/module/MAC.py 46. AAAI2026|engine/extre_module/custom_nn/module/SPJFB.py - engine/extre_module/custom_nn/neck 1. 
自研模块|engine/extre_module/custom_nn/neck/FDPN.py - engine/extre_module/custom_nn/neck_module 1. TPAMI2025|engine/extre_module/custom_nn/neck_module/HyperCompute.py 2. engine/extre_module/custom_nn/neck_module/HyperACE.py 3. engine/extre_module/custom_nn/neck_module/GoldYOLO.py 4. AAAI2025|engine/extre_module/custom_nn/neck_module/HS_FPN.py - engine/extre_module/custom_nn/norm 1. ICML2024|engine/extre_module/custom_nn/transformer/repbn.py 2. CVPR2025|engine/extre_module/custom_nn/transformer/dyt.py 3. engine/extre_module/custom_nn/norm/derf.py - engine/extre_module/custom_nn/transformer 1. ICLR2025|engine/extre_module/custom_nn/transformer/PolaLinearAttention.py 2. CVPR2023|engine/extre_module/custom_nn/transformer/biformer.py 3. CVPR2023|engine/extre_module/custom_nn/transformer/CascadedGroupAttention.py 4. CVPR2022|engine/extre_module/custom_nn/transformer/DAttention.py 5. ICLR2022|engine/extre_module/custom_nn/transformer/DPBAttention.py 6. CVPR2024|engine/extre_module/custom_nn/transformer/AdaptiveSparseSA.py 7. engine/extre_module/custom_nn/transformer/GSA.py 8. engine/extre_module/custom_nn/transformer/RSA.py 9. ECCV2024|engine/extre_module/custom_nn/transformer/FSSA.py 10. AAAI2025|engine/extre_module/custom_nn/transformer/DilatedGCSA.py 11. AAAI2025|engine/extre_module/custom_nn/transformer/DilatedMWSA.py 12. CVPR2024|engine/extre_module/custom_nn/transformer/SHSA.py 13. IJCAI2024|engine/extre_module/custom_nn/transformer/CTA.py 14. IJCAI2024|engine/extre_module/custom_nn/transformer/SFA.py 15. engine/extre_module/custom_nn/transformer/MSLA.py 16. ACMMM2025|engine/extre_module/custom_nn/transformer/CPIA_SA.py 17. NN2025|engine/extre_module/custom_nn/transformer/TokenSelectAttention.py 18. CVPR2025|engine/extre_module/custom_nn/transformer/TAB.py 19. TPAMI2025|engine/extre_module/custom_nn/transformer/LRSA.py 20. ICCV2025|engine/extre_module/custom_nn/transformer/MALA.py 21. ICML2023|engine/extre_module/custom_nn/transformer/MUA.py 22. 
ACMMM2025|engine/extre_module/custom_nn/transformer/EGSA.py 23. ACMMM2025|engine/extre_module/custom_nn/transformer/SWSA.py 24. AAAI2026|engine/extre_module/custom_nn/transformer/DHOGSA.py 25. NeurIPS2025|engine/extre_module/custom_nn/transformer/CBSA.py 26. TGRS2025|engine/extre_module/custom_nn/transformer/DPWA.py 27. TIP2025|engine/extre_module/custom_nn/transformer/DWM_MSA.py - engine/extre_module/custom_nn/mlp 1. CVPR2024|engine/extre_module/custom_nn/mlp/ConvolutionalGLU.py 2. IJCAI2024|engine/extre_module/custom_nn/mlp/DFFN.py 3. ICLR2024|engine/extre_module/custom_nn/mlp/FMFFN.py 4. CVPR2024|engine/extre_module/custom_nn/mlp/FRFN.py 5. ECCV2024|engine/extre_module/custom_nn/mlp/EFFN.py 6. WACV2025|engine/extre_module/custom_nn/mlp/SEFN.py 7. ICLR2025|engine/extre_module/custom_nn/mlp/KAN.py 8. CVPR2025|engine/extre_module/custom_nn/mlp/EDFFN.py 9. ICVJ2024|engine/extre_module/custom_nn/mlp/DML.py 10. AAAI2026|engine/extre_module/custom_nn/mlp/DIFF.py - engine/extre_module/custom_nn/mamba 1. AAAI2025|engine/extre_module/custom_nn/mamba/SS2D.py 2. CVPR2025|engine/extre_module/custom_nn/mamba/ASSM.py 3. CVPR2025|engine/extre_module/custom_nn/mamba/SAVSS.py 4. CVPR2025|engine/extre_module/custom_nn/mamba/MobileMamba/mobilemamba.py 5. CVPR2025|engine/extre_module/custom_nn/mamba/MaIR.py 6. TGRS2025|engine/extre_module/custom_nn/mamba/GLVSS.py 7. ICCV2025|engine/extre_module/custom_nn/mamba/VSSD.py 8. ICCV2025|engine/extre_module/custom_nn/mamba/TinyViM.py 9. INFFUS2025|engine/extre_module/custom_nn/mamba/CSI.py 10. TIP2025|engine/extre_module/custom_nn/mamba/SFMB.py 11. TGRS2025|engine/extre_module/custom_nn/mamba/GLSS.py 12. TGRS2025|engine/extre_module/custom_nn/mamba/GLSS2D.py - engine/extre_module/custom_nn/moe 1. engine/extre_module/custom_nn/moe/moe_module.py - engine/extre_module/custom_nn/featurepreprocess 1. TGRS2025|engine/extre_module/custom_nn/featurepreprocess/FAENet.py - 积木模块,示例教程engine/extre_module/custom_nn/module/example.py 1. YOLOV5|C3 2. 
YOLOV8|C2f 3. YOLO11|C3k2 4. TPAMI2025|MANet 5. TPAMI2024|MetaFormer_Block 6. TPAMI2024+CVPR2025|MetaFormer_Mona 7. TPAMI2024+CVPR2025+WACV2025|MetaFormer_SEFN 8. TPAMI2024+CVPR2025+WACV2025|MetaFormer_Mona_SEFN - 创新课程代码<标识着是哪个课程中的代码,详细可以去看对应的课程视频> 1. 顶会中的Partial创新思想课程|engine/extre_module/innovate/CVPR2020_GhostConv.py 2. 顶会中的Partial创新思想课程|engine/extre_module/innovate/CVPR2023_PartialConv.py 3. CVPR2025-MobileMamba中的Long-Range WTB-Mamba二次创新|engine/extre_module/innovate/CVPR2025_MobileMamba.py 4. TGRS2025-HighFrequencyDirectionInjection创新思想课程|engine/extre_module/innovate/TGRS2025_HFDI.py ================================================ FILE: damo-yolo/Annotations/ReadMe.md ================================================ # 存放VOC标注格式的文件夹 ================================================ FILE: damo-yolo/JPEGImages/ReadMe.md ================================================ # 存放图像的文件夹 ================================================ FILE: damo-yolo/readme.md ================================================ # DAMO-YOLO的数据集处理文件 本目录下的脚本是针对DAMO-YOLO的数据集处理脚本,支持如下: 1. VOC标注格式转换为COCO标注格式,并生成train.json,val.json,test.json. # 使用方法 1. 把图片存放在JPEGImages中,图片后缀需要一致,比如都是jpg或者png等等,不支持混合的图片后缀格式,比如一些是jpg,一些是png。 2. 把VOC标注格式的XML文件存放在Annotations中。 3. 
import os
import glob
import json
import shutil
import numpy as np
import xml.etree.ElementTree as ET

# First annotation id written to the COCO "annotations" list.
START_BOUNDING_BOX_ID = 1


def find_classes(path):
    """Collect the distinct object class names found in the VOC XML files of *path*.

    Every directory entry is opened as UTF-8 text and parsed as XML. Entries
    that cannot be read or parsed are reported on stdout and skipped, so one
    broken file does not abort the scan.

    Args:
        path: Directory that holds the VOC annotation files.

    Returns:
        List of class names in first-seen order, without duplicates.
    """
    classes = []
    seen = set()  # O(1) membership test; `classes` keeps the discovery order
    for entry in os.listdir(path):
        xml_path = os.path.join(path, entry)
        try:
            # `with` closes the handle; the original leaked one per file.
            with open(xml_path, encoding='utf-8') as in_file:
                root = ET.parse(in_file).getroot()
            for obj in root.iter('object'):
                cls = obj.find('name').text
                if cls not in seen:
                    seen.add(cls)
                    classes.append(cls)
        except (OSError, ValueError, ET.ParseError, AttributeError) as e:
            # Best effort: unreadable, badly encoded or malformed files, and
            # objects without a <name> tag, are reported and skipped.
            print(xml_path, e)
    return classes


def get(root, name):
    """Return every direct child of *root* whose tag matches *name*."""
    return root.findall(name)


def get_and_check(root, name, length):
    """Find the children of *root* named *name* and validate how many there are.

    Args:
        root: Parent XML element.
        name: Tag (or ElementPath expression) to look for.
        length: Expected number of matches. A value <= 0 disables the count
            check; 1 makes the function return the single element itself.

    Returns:
        The single matching element when ``length == 1``, otherwise the list
        of matching elements.

    Raises:
        NotImplementedError: If nothing matches, or if the number of matches
            differs from a positive *length*.
    """
    found = root.findall(name)
    if len(found) == 0:
        raise NotImplementedError('Can not find %s in %s.'%(name, root.tag))
    if length > 0 and len(found) != length:
        raise NotImplementedError('The size of %s is supposed to be %d, but is %d.'%(name, length, len(found)))
    if length == 1:
        found = found[0]
    return found


def convert(xml_list, json_file, categories=None, image_postfix=None, only_predefined=None):
    """Convert VOC XML annotations to one COCO-style JSON file.

    Image ids are the positions of the files in *xml_list*. Annotation ids
    start at ``START_BOUNDING_BOX_ID``. Boxes are written as
    ``[xmin, ymin, width, height]``.

    Args:
        xml_list: Iterable of VOC XML file paths.
        json_file: Destination path of the JSON file.
        categories: Mapping ``class name -> category id``. ``None`` falls back
            to the module-level ``pre_define_categories`` set by the script.
        image_postfix: Image extension without the dot. ``None`` falls back to
            the module-level ``postfix``.
        only_predefined: When true, objects whose class is not in *categories*
            are dropped; otherwise a new id is created for them. ``None``
            falls back to the module-level ``only_care_pre_define_categories``.

    Raises:
        NotImplementedError: Propagated from ``get_and_check`` when a required
            tag is missing or duplicated.
    """
    # Keyword overrides make the function usable when imported; the defaults
    # keep the original script behaviour of reading the globals.
    predefined = pre_define_categories if categories is None else categories
    suffix = postfix if image_postfix is None else image_postfix
    strict = only_care_pre_define_categories if only_predefined is None else only_predefined

    json_dict = {"info":['none'], "license":['none'], "images": [], "annotations": [], "categories": []}
    category_ids = dict(predefined)  # copy: ids added on the fly must not leak to the caller
    bnd_id = START_BOUNDING_BOX_ID
    all_categories = {}  # class name -> number of objects seen in this split

    for index, xml_f in enumerate(xml_list):
        root = ET.parse(xml_f).getroot()

        filename = os.path.splitext(os.path.basename(xml_f))[0] + f".{suffix}"
        image_id = index
        size = get_and_check(root, 'size', 1)
        width = int(get_and_check(size, 'width', 1).text)
        height = int(get_and_check(size, 'height', 1).text)
        json_dict['images'].append({'file_name': filename, 'height': height, 'width': width, 'id': image_id})

        # Segmentation is not supported; only bounding boxes are exported.
        for obj in get(root, 'object'):
            category = get_and_check(obj, 'name', 1).text
            all_categories[category] = all_categories.get(category, 0) + 1
            if category not in category_ids:
                if strict:
                    continue
                new_id = len(category_ids) + 1
                print("[warning] category '{}' not in 'pre_define_categories'({}), create new id: {} automatically".format(category, predefined, new_id))
                category_ids[category] = new_id
            category_id = category_ids[category]

            bndbox = get_and_check(obj, 'bndbox', 1)
            # Coordinates may be written as floats ("12.0"), hence int(float(...)).
            xmin = int(float(get_and_check(bndbox, 'xmin', 1).text))
            ymin = int(float(get_and_check(bndbox, 'ymin', 1).text))
            xmax = int(float(get_and_check(bndbox, 'xmax', 1).text))
            ymax = int(float(get_and_check(bndbox, 'ymax', 1).text))
            # abs() tolerates annotations whose min/max corners are swapped.
            o_width = abs(xmax - xmin)
            o_height = abs(ymax - ymin)
            json_dict['annotations'].append({
                'area': o_width*o_height, 'iscrowd': 0, 'image_id': image_id,
                'bbox': [xmin, ymin, o_width, o_height],
                'category_id': category_id, 'id': bnd_id, 'ignore': 0,
                'segmentation': [],
            })
            bnd_id = bnd_id + 1

    for cate, cid in category_ids.items():
        json_dict['categories'].append({'supercategory': 'none', 'id': cid, 'name': cate})

    with open(json_file, 'w') as json_fp:
        json_fp.write(json.dumps(json_dict))

    print("------------create {} done--------------".format(json_file))
    print("find {} categories: {} -->>> your pre_define_categories {}: {}".format(len(all_categories), all_categories.keys(), len(predefined), predefined.keys()))
    print("category: id --> {}".format(category_ids))
    print(category_ids.keys())
    print(category_ids.values())


if __name__ == '__main__':
    # Image extension (without the dot) used to build the COCO file names.
    postfix = 'jpg'
    # Directory that holds the VOC XML annotations.
    xml_dir = './datasets/Annotations'
    # JSON file for the training split.
    save_json_train = './datasets/train.json'
    # JSON file for the validation split.
    save_json_val = './datasets/val.json'
    # JSON file for the test split.
    save_json_test = './datasets/test.json'
    # Class names, e.g. ['dog', 'person', 'cat']; ignored when get_data_classes is True.
    classes = []
    # Whether to scan all XML files first to discover the classes.
    get_data_classes = True
    # Whether to keep only the classes listed in `classes`.
    only_care_pre_define_categories = False

    if get_data_classes:
        classes = find_classes(xml_dir)
        only_care_pre_define_categories = False

    pre_define_categories = {}
    for i, cls in enumerate(classes):
        pre_define_categories[cls] = i + 1
    print(pre_define_categories)

    # Split ratios; whatever is left after train and val becomes the test split.
    train_ratio = 0.7
    val_ratio = 0.1
    print('xml_dir is {}'.format(xml_dir))
    xml_list = glob.glob(xml_dir + "/*.xml")
    xml_list = np.sort(xml_list)
    # Fixed seed so the split is reproducible between runs.
    np.random.seed(100)
    np.random.shuffle(xml_list)

    train_num = int(len(xml_list)*train_ratio)
    val_num = int(len(xml_list)*val_ratio)
    print('训练样本数目是 {}'.format(train_num))
    print('验证样本数目是 {}'.format(val_num))
    print('测试样本数目是 {}'.format(len(xml_list) - train_num - val_num))
    xml_list_val = xml_list[:val_num]
    xml_list_train = xml_list[val_num:train_num+val_num]
    xml_list_test = xml_list[train_num+val_num:]

    # Convert the training split.
    convert(xml_list_train, save_json_train)
    # Convert the validation split.
    convert(xml_list_val, save_json_val)
    # Convert the test split.
    convert(xml_list_test, save_json_test)
AUG_IMAGE_PATH = 'dataset/object_detection/images_aug' AUG_LABEL_PATH = 'dataset/object_detection/labels_aug' SHOW_SAVE_PATH = 'results' CLASSES = ['head', 'person'] ENHANCEMENT_LOOP = 1 ENHANCEMENT_STRATEGY = A.Compose([ A.Compose([ A.Affine(scale=[0.5, 1.5], translate_percent=[0.0, 0.3], rotate=[-360, 360], shear=[-45, 45], keep_ratio=True, p=0.5), # Augmentation to apply affine transformations to images. A.BBoxSafeRandomCrop(erosion_rate=0.2, p=0.1), # Crop a random part of the input without loss of bboxes. A.D4(p=0.1), # Applies one of the eight possible D4 dihedral group transformations to a square-shaped input, maintaining the square shape. These transformations correspond to the symmetries of a square, including rotations and reflections. A.ElasticTransform(p=0.1), # Elastic deformation of images as described in [Simard2003]_ (with modifications). A.Flip(p=0.1), # Flip the input either horizontally, vertically or both horizontally and vertically. A.GridDistortion(p=0.1), # Applies grid distortion augmentation to images, masks, and bounding boxes. This technique involves dividing the image into a grid of cells and randomly displacing the intersection points of the grid, resulting in localized distortions. A.Perspective(p=0.1), # Perform a random four point perspective transform of the input. ], p=1.0), A.Compose([ A.GaussNoise(p=0.1), # Apply Gaussian noise to the input image. A.ISONoise(p=0.1), # Apply camera sensor noise. A.ImageCompression(quality_lower=50, quality_upper=100, p=0.1), # Decreases image quality by Jpeg, WebP compression of an image. A.RandomBrightnessContrast(p=0.1), # Randomly change brightness and contrast of the input image. A.RandomFog(p=0.1), # Simulates fog for the image. A.RandomRain(p=0.1), # Adds rain effects to an image. A.RandomSnow(p=0.1), # Bleach out some pixel values imitating snow. 
def parallelise(function: Callable, data: List, chunksize=100, verbose=True, num_workers=os.cpu_count()) -> List:
    """Map *function* over *data* in a process pool, with a progress bar.

    Args:
        function: Picklable callable applied to every item.
        data: Items to process; results keep the same order.
        chunksize: Number of items handed to a worker at a time.
        verbose: Show the tqdm progress bar when true.
        num_workers: Number of worker processes; values below 1, or ``None``
            (which ``os.cpu_count()`` may return), are treated as 1.

    Returns:
        List with one result per input item.
    """
    # Pool needs at least 1 worker, and comparing None with 1 would raise.
    if not num_workers or num_workers < 1:
        num_workers = 1
    # The context manager terminates the workers if the mapping raises,
    # so a failing job no longer leaves processes behind.
    with Pool(processes=num_workers) as pool:
        results = list(
            tqdm.tqdm(pool.imap(function, data, chunksize), total=len(data), disable=not verbose)
        )
        pool.close()
        pool.join()
    return results


def draw_detections(box, name, img):
    """Draw one labelled box on *img* in place and return the image.

    Args:
        box: ``(xmin, ymin, xmax, ymax)`` in pixels; values are cast to int.
        name: Text drawn above the box.
        img: Image array with shape ``(height, width, channels)``.
    """
    height, width, _ = img.shape
    xmin, ymin, xmax, ymax = [int(v) for v in box]

    # Scale line width, font size and text offset with the shorter image side.
    short_side = min(height, width)
    line_thickness = max(1, int(short_side / 200))
    font_scale = short_side / 500
    font_thickness = max(1, int(short_side / 200))
    text_offset_y = int(short_side / 50)

    cv2.rectangle(img, (xmin, ymin), (xmax, ymax), (0, 0, 255), line_thickness)
    cv2.putText(img, str(name), (xmin, ymin - text_offset_y), cv2.FONT_HERSHEY_SIMPLEX,
                font_scale, (0, 255, 0), font_thickness, lineType=cv2.LINE_AA)
    return img


def _read_yolo_labels(labels_path):
    """Read a YOLO label file into a float64 array with one row per object.

    Blank lines are ignored. A file without objects gives an array of shape
    ``(0, 5)`` so that callers can slice columns without special cases.
    """
    with open(labels_path) as f:
        rows = [line.split() for line in f if line.strip()]
    if not rows:
        return np.zeros((0, 5), dtype=np.float64)
    return np.array(rows, dtype=np.float64)


def show_labels(images_base_path, labels_base_path):
    """Render the YOLO labels of every image into ``SHOW_SAVE_PATH``.

    ``SHOW_SAVE_PATH`` is deleted and recreated first. Images without a label
    file, or that OpenCV cannot read, are reported and skipped.

    Args:
        images_base_path: Directory with the images.
        labels_base_path: Directory with the ``<image stem>.txt`` label files.
    """
    if os.path.exists(SHOW_SAVE_PATH):
        shutil.rmtree(SHOW_SAVE_PATH)
    os.makedirs(SHOW_SAVE_PATH, exist_ok=True)

    for images_name in tqdm.tqdm(os.listdir(images_base_path)):
        file_heads, _ = os.path.splitext(images_name)
        images_path = os.path.join(images_base_path, images_name)
        labels_path = os.path.join(labels_base_path, f'{file_heads}.txt')
        if not os.path.exists(labels_path):
            print(f'{labels_path} label file not found...')
            continue

        images = cv2.imread(images_path)
        if images is None:
            # cv2.imread returns None for unreadable or non-image files.
            print(f'{images_path} could not be read as an image, skipped...')
            continue

        labels = _read_yolo_labels(labels_path)
        height, width, _ = images.shape
        for cls, x_center, y_center, w, h in labels:
            # Labels are normalised; scale them back to pixels.
            x_center *= width
            y_center *= height
            # True division: `//` on floats floored the half extents.
            half_w = w * width / 2
            half_h = h * height / 2
            draw_detections([x_center - half_w, y_center - half_h, x_center + half_w, y_center + half_h], CLASSES[int(cls)], images)
        cv2.imwrite(os.path.join(SHOW_SAVE_PATH, images_name), images)
        print(f'{SHOW_SAVE_PATH}/{images_name} save success...')


def data_aug_single(images_name):
    """Augment one image and its YOLO labels ``ENHANCEMENT_LOOP`` times.

    Results are written to ``AUG_IMAGE_PATH`` / ``AUG_LABEL_PATH`` as
    ``<stem>_<iii><ext>`` and ``<stem>_<iii>.txt``. A pass whose transform
    raises is reported and skipped.

    Args:
        images_name: File name of the image inside ``IMAGE_PATH``.
    """
    file_heads, postfix = os.path.splitext(images_name)
    images_path = os.path.join(IMAGE_PATH, images_name)
    labels_path = os.path.join(LABEL_PATH, f'{file_heads}.txt')
    if not os.path.exists(labels_path):
        print(f'{labels_path} label file not found...')
        return

    labels = _read_yolo_labels(labels_path)
    # Decode once and close the file; force 3 channels because the result is
    # later converted with COLOR_RGB2BGR.
    with Image.open(images_path) as pil_image:
        image = np.array(pil_image.convert('RGB'))
    # Clamp box values into [0, 1] before handing them to the pipeline.
    bboxes = np.clip(labels[:, 1:], 0, 1)
    class_ids = labels[:, 0]

    for i in range(ENHANCEMENT_LOOP):
        new_images_name = os.path.join(AUG_IMAGE_PATH, f'{file_heads}_{i:0>3}{postfix}')
        new_labels_name = os.path.join(AUG_LABEL_PATH, f'{file_heads}_{i:0>3}.txt')
        try:
            # Fresh copy per pass, as the original code built a new array each iteration.
            transformed = ENHANCEMENT_STRATEGY(image=image.copy(), bboxes=bboxes, class_labels=class_ids)
        except Exception as e:
            # Still best effort, but no longer silent (was a bare `except:`).
            print(f'{images_path} augmentation {i} failed: {e}')
            continue

        cv2.imwrite(new_images_name, cv2.cvtColor(transformed['image'], cv2.COLOR_RGB2BGR))
        with open(new_labels_name, 'w+') as f:
            for bbox, cls in zip(transformed['bboxes'], transformed['class_labels']):
                # Class ids are integers in YOLO label files ("0", not "0.0").
                f.write(f'{int(cls)} {bbox[0]} {bbox[1]} {bbox[2]} {bbox[3]}\n')
        print(f'{new_images_name} and {new_labels_name} save success...')


def data_aug():
    """Recreate the augmentation output folders and augment every image in ``IMAGE_PATH``."""
    for out_dir in (AUG_IMAGE_PATH, AUG_LABEL_PATH):
        if os.path.exists(out_dir):
            shutil.rmtree(out_dir)
    os.makedirs(AUG_IMAGE_PATH, exist_ok=True)
    os.makedirs(AUG_LABEL_PATH, exist_ok=True)

    for images_name in tqdm.tqdm(os.listdir(IMAGE_PATH)):
        data_aug_single(images_name)


if __name__ == '__main__':
    # data_aug()

    # show_labels(IMAGE_PATH, LABEL_PATH)
    show_labels(AUG_IMAGE_PATH, AUG_LABEL_PATH)
def generate_color_map(num_classes):
    """Build a colour lookup table with one row per class plus a black row.

    Row 0 is ``[0, 0, 0]``.  The following ``num_classes`` rows are fully
    saturated, full-value colours whose hues are spread evenly over
    ``[0, 180)`` and converted with ``cv2.COLOR_HSV2BGR``, so each row is in
    BGR channel order.

    Args:
        num_classes: Number of hues to generate.

    Returns:
        ``np.uint8`` array with ``num_classes + 1`` rows of 3 channels.
    """
    palette = [[0, 0, 0]]
    for class_idx in range(num_classes):
        hue = class_idx * 180 // num_classes
        # cvtColor works on images, so wrap the colour as a 1x1 pixel image.
        hsv_pixel = np.uint8([[(hue, 255, 255)]])
        bgr_pixel = cv2.cvtColor(hsv_pixel, cv2.COLOR_HSV2BGR)
        palette.append(bgr_pixel[0][0])
    return np.array(palette, dtype=np.uint8)
A.ElasticTransform(p=0.1), # Elastic deformation of images as described in [Simard2003]_ (with modifications). A.Flip(p=0.1), # Flip the input either horizontally, vertically or both horizontally and vertically. A.GridDistortion(p=0.1), # Applies grid distortion augmentation to images, masks, and bounding boxes. This technique involves dividing the image into a grid of cells and randomly displacing the intersection points of the grid, resulting in localized distortions. A.Perspective(p=0.1), # Perform a random four point perspective transform of the input. ], p=1.0), A.Compose([ A.GaussNoise(p=0.1), # Apply Gaussian noise to the input image. A.ISONoise(p=0.1), # Apply camera sensor noise. A.ImageCompression(quality_lower=50, quality_upper=100, p=0.1), # Decreases image quality by Jpeg, WebP compression of an image. A.RandomBrightnessContrast(p=0.1), # Randomly change brightness and contrast of the input image. A.RandomFog(p=0.1), # Simulates fog for the image. A.RandomRain(p=0.1), # Adds rain effects to an image. A.RandomSnow(p=0.1), # Bleach out some pixel values imitating snow. A.RandomShadow(p=0.1), # Simulates shadows for the image A.RandomSunFlare(p=0.1), # Simulates Sun Flare for the image A.ToGray(p=0.1), # Convert the input RGB image to grayscale ], p=1.0) # A.OneOf([ # A.GaussNoise(p=1.0), # Apply Gaussian noise to the input image. # A.ISONoise(p=1.0), # Apply camera sensor noise. # A.ImageCompression(quality_lower=50, quality_upper=100, p=1.0), # Decreases image quality by Jpeg, WebP compression of an image. # A.RandomBrightnessContrast(p=1.0), # Randomly change brightness and contrast of the input image. # A.RandomFog(p=1.0), # Simulates fog for the image. # A.RandomRain(p=1.0), # Adds rain effects to an image. # A.RandomSnow(p=1.0), # Bleach out some pixel values imitating snow. 
def data_aug_single(images_name):
    """Augment one image and its segmentation mask ``ENHANCEMENT_LOOP`` times.

    The image is read from ``IMAGE_PATH``.  The mask is looked up in
    ``LABEL_PATH`` under the same stem, first as ``.jpg`` (the original
    behaviour) and then as ``.png`` (the extension ``show_labels`` reads and
    this function writes).  Each successful pass of ``ENHANCEMENT_STRATEGY``
    writes ``<stem>_<iii><ext>`` to ``AUG_IMAGE_PATH`` and ``<stem>_<iii>.png``
    to ``AUG_LABEL_PATH``.

    Args:
        images_name: File name (not a path) of an image inside ``IMAGE_PATH``.

    Returns:
        None.  Progress and failures are reported with ``print``.
    """
    file_heads, postfix = os.path.splitext(images_name)
    images_path = os.path.join(IMAGE_PATH, images_name)

    # '.jpg' stays first so existing datasets resolve exactly as before;
    # '.png' is only tried when the '.jpg' mask is missing.
    candidates = [os.path.join(LABEL_PATH, f'{file_heads}{ext}') for ext in ('.jpg', '.png')]
    labels_path = next((path for path in candidates if os.path.exists(path)), None)
    if labels_path is None:
        print(f'{candidates[0]} label file not found...')
        return

    # Force 3-channel RGB so the RGB->BGR conversion before saving cannot fail
    # on grayscale / palette images; context managers close the file handles.
    with Image.open(images_path) as img:
        image = np.array(img.convert('RGB'))
    with Image.open(labels_path) as mask_img:
        masks = np.array(mask_img)

    for i in range(ENHANCEMENT_LOOP):
        new_images_name = os.path.join(AUG_IMAGE_PATH, f'{file_heads}_{i:0>3}{postfix}')
        new_labels_name = os.path.join(AUG_LABEL_PATH, f'{file_heads}_{i:0>3}.png')
        try:
            # Pass copies so every pass starts from the untouched originals.
            transformed = ENHANCEMENT_STRATEGY(image=image.copy(), masks=[masks.copy()])
        except Exception as e:
            # Best effort: a failed pass is skipped, but the reason is reported
            # and KeyboardInterrupt / SystemExit are no longer swallowed.
            print(f'{images_path} augmentation {i} failed: {e}')
            continue

        cv2.imwrite(new_images_name, cv2.cvtColor(transformed['image'], cv2.COLOR_RGB2BGR))
        Image.fromarray(np.array(transformed['masks'][0])).save(new_labels_name)
        print(f'{new_images_name} and {new_labels_name} save success...')
# Cascade R-CNN (R50-FPN, 1x) on VisDrone2019-DET.
# Overrides the COCO base config named in `_base_`: class count, dataset
# paths, evaluator annotation files, logging interval and initial checkpoint.
_base_ = './cascade-rcnn_r50_fpn_1x_coco.py'

# The `num_classes` of every head must match the number of dataset classes
# (10 names are listed in `metainfo` below); one entry per cascade stage.
model = dict(
    roi_head=dict(
        bbox_head=[
            dict(
                type='Shared2FCBBoxHead',
                num_classes=10
            ),
            dict(
                type='Shared2FCBBoxHead',
                num_classes=10
            ),
            dict(
                type='Shared2FCBBoxHead',
                num_classes=10
            ),
        ]
    )
)

# Dataset settings.
# NOTE: `data_root` is an absolute path on the author's machine; change it locally.
data_root = '/home/hjj/Desktop/dataset/dataset_visdrone/'
metainfo = {
    'classes': ('pedestrian', 'people', 'bicycle', 'car', 'van', 'truck', 'tricycle', 'awning-tricycle', 'bus', 'motor'),
    # 'palette': [
    #     (220, 20, 60),
    # ]
}
# `ann_file` and `data_prefix` below are given relative to `data_root`.
train_dataloader = dict(
    batch_size=8,
    num_workers=8,
    dataset=dict(
        data_root=data_root,
        metainfo=metainfo,
        ann_file='VisDrone2019-DET-train/annotations/train.json',
        data_prefix=dict(img='VisDrone2019-DET-train/images/')))
val_dataloader = dict(
    batch_size=8,
    num_workers=8,
    dataset=dict(
        data_root=data_root,
        metainfo=metainfo,
        ann_file='VisDrone2019-DET-val/annotations/val.json',
        data_prefix=dict(img='VisDrone2019-DET-val/images/')))
test_dataloader = dict(
    batch_size=8,
    num_workers=8,
    dataset=dict(
        data_root=data_root,
        metainfo=metainfo,
        ann_file='VisDrone2019-DET-test-dev/annotations/test.json',
        data_prefix=dict(img='VisDrone2019-DET-test-dev/images/')))

# Evaluator settings: the evaluators take the full annotation path,
# so `data_root` is prepended explicitly here.
val_evaluator = dict(ann_file=data_root + 'VisDrone2019-DET-val/annotations/val.json')
test_evaluator = dict(ann_file=data_root + 'VisDrone2019-DET-test-dev/annotations/test.json')

# Optional optimizer wrapper override, disabled by default.
# optim_wrapper = dict(type='AmpOptimWrapper')

# Logger hook interval.
default_hooks = dict(logger=dict(type='LoggerHook', interval=200))
# Checkpoint file used to initialise the model (path relative to the working directory).
load_from='cascade_rcnn_r50_fpn_1x_coco_20200316-3dc56deb.pth'

# Example commands (train in background and follow the log / test with visualisation / test with TTA):
# nohup python tools/train.py configs/cascade_rcnn/cascade-rcnn_r50_fpn_1x_visdrone.py > cascade-rcnn-visdrone.log 2>&1 & tail -f cascade-rcnn-visdrone.log
# python tools/test.py configs/cascade_rcnn/cascade-rcnn_r50_fpn_1x_visdrone.py work_dirs/cascade-rcnn_r50_fpn_1x_visdrone/epoch_12.pth --show --show-dir test_save
# python tools/test.py configs/cascade_rcnn/cascade-rcnn_r50_fpn_1x_visdrone.py work_dirs/cascade-rcnn_r50_fpn_1x_visdrone/epoch_12.pth --tta
# DINO (4-scale, R50, 12 epochs) on VisDrone2019-DET.
# Overrides the COCO base config named in `_base_`: class count, dataset
# paths, evaluator annotation files, logging interval and initial checkpoint.
_base_ = 'dino-4scale_r50_8xb2-12e_coco.py'

# `num_classes` must match the number of dataset classes
# (10 names are listed in `metainfo` below).
model = dict(
    bbox_head=dict(
        type='DINOHead',
        num_classes=10,
    )
)

# Dataset settings.
# NOTE: `data_root` is an absolute path on the author's machine; change it locally.
data_root = '/home/hjj/Desktop/dataset/dataset_visdrone/'
metainfo = {
    'classes': ('pedestrian', 'people', 'bicycle', 'car', 'van', 'truck', 'tricycle', 'awning-tricycle', 'bus', 'motor'),
    # 'palette': [
    #     (220, 20, 60),
    # ]
}
# `ann_file` and `data_prefix` below are given relative to `data_root`.
train_dataloader = dict(
    batch_size=4,
    num_workers=4,
    dataset=dict(
        data_root=data_root,
        metainfo=metainfo,
        ann_file='VisDrone2019-DET-train/annotations/train.json',
        data_prefix=dict(img='VisDrone2019-DET-train/images/')))
val_dataloader = dict(
    batch_size=4,
    num_workers=4,
    dataset=dict(
        data_root=data_root,
        metainfo=metainfo,
        ann_file='VisDrone2019-DET-val/annotations/val.json',
        data_prefix=dict(img='VisDrone2019-DET-val/images/')))
test_dataloader = dict(
    batch_size=4,
    num_workers=4,
    dataset=dict(
        data_root=data_root,
        metainfo=metainfo,
        ann_file='VisDrone2019-DET-test-dev/annotations/test.json',
        data_prefix=dict(img='VisDrone2019-DET-test-dev/images/')))

# Evaluator settings: the evaluators take the full annotation path,
# so `data_root` is prepended explicitly here.
val_evaluator = dict(ann_file=data_root + 'VisDrone2019-DET-val/annotations/val.json')
test_evaluator = dict(ann_file=data_root + 'VisDrone2019-DET-test-dev/annotations/test.json')

# Optional optimizer wrapper override, disabled by default.
# optim_wrapper = dict(type='AmpOptimWrapper')

# Logger hook interval.
default_hooks = dict(logger=dict(type='LoggerHook', interval=500))
# Checkpoint file used to initialise the model (path relative to the working directory).
load_from='dino-4scale_r50_8xb2-12e_coco_20221202_182705-55b2bba2.pth'

# Example commands (train in background and follow the log / test with visualisation / test with TTA).
# NOTE(review): the two test commands point at the `tood_r50_fpn_1x_visdrone` work dir,
# presumably a copy-paste leftover; substitute this config's own work dir before running.
# nohup python tools/train.py configs/dino/dino-4scale_r50_8xb2-12e_visdrone.py > dino-visdrone.log 2>&1 & tail -f dino-visdrone.log
# python tools/test.py configs/dino/dino-4scale_r50_8xb2-12e_visdrone.py work_dirs/tood_r50_fpn_1x_visdrone/epoch_12.pth --show --show-dir test_save
# python tools/test.py configs/dino/dino-4scale_r50_8xb2-12e_visdrone.py work_dirs/tood_r50_fpn_1x_visdrone/epoch_12.pth --tta
# GFL (R50-FPN, 1x) on VisDrone2019-DET.
# Overrides the COCO base config named in `_base_`: class count, dataset
# paths, evaluator annotation files, logging interval and initial checkpoint.
_base_ = 'gfl_r50_fpn_1x_coco.py'

# The head's `num_classes` must match the number of dataset classes
# (10 names are listed in `metainfo` below).
model = dict(
    bbox_head=dict(
        num_classes=10
    )
)

# Dataset settings.
# NOTE: `data_root` is an absolute path on the author's machine; change it locally.
data_root = '/home/hjj/Desktop/dataset/dataset_visdrone/'
metainfo = {
    'classes': ('pedestrian', 'people', 'bicycle', 'car', 'van', 'truck', 'tricycle', 'awning-tricycle', 'bus', 'motor'),
    # 'palette': [
    #     (220, 20, 60),
    # ]
}
# `ann_file` and `data_prefix` below are given relative to `data_root`.
train_dataloader = dict(
    batch_size=8,
    num_workers=8,
    dataset=dict(
        data_root=data_root,
        metainfo=metainfo,
        ann_file='VisDrone2019-DET-train/annotations/train.json',
        data_prefix=dict(img='VisDrone2019-DET-train/images/')))
val_dataloader = dict(
    batch_size=8,
    num_workers=8,
    dataset=dict(
        data_root=data_root,
        metainfo=metainfo,
        ann_file='VisDrone2019-DET-val/annotations/val.json',
        data_prefix=dict(img='VisDrone2019-DET-val/images/')))
test_dataloader = dict(
    batch_size=8,
    num_workers=8,
    dataset=dict(
        data_root=data_root,
        metainfo=metainfo,
        ann_file='VisDrone2019-DET-test-dev/annotations/test.json',
        data_prefix=dict(img='VisDrone2019-DET-test-dev/images/')))

# Evaluator settings: the evaluators take the full annotation path,
# so `data_root` is prepended explicitly here.
val_evaluator = dict(ann_file=data_root + 'VisDrone2019-DET-val/annotations/val.json')
test_evaluator = dict(ann_file=data_root + 'VisDrone2019-DET-test-dev/annotations/test.json')

# Optional optimizer wrapper override, disabled by default.
# optim_wrapper = dict(type='AmpOptimWrapper')

# Logger hook interval.
default_hooks = dict(logger=dict(type='LoggerHook', interval=200))
# Checkpoint file used to initialise the model (path relative to the working directory).
load_from='gfl_r50_fpn_1x_coco_20200629_121244-25944287.pth'

# Example commands (train in background and follow the log / test with visualisation / test with TTA / FLOPs):
# nohup python tools/train.py configs/gfl/gfl_r50_fpn_1x_visdrone.py > gfl-visdrone.log 2>&1 & tail -f gfl-visdrone.log
# python tools/test.py configs/gfl/gfl_r50_fpn_1x_visdrone.py work_dirs/gfl_r50_fpn_1x_visdrone/epoch_12.pth --show --show-dir test_save
# python tools/test.py configs/gfl/gfl_r50_fpn_1x_visdrone.py work_dirs/gfl_r50_fpn_1x_visdrone/epoch_12.pth --tta
# python tools/analysis_tools/get_flops.py configs/gfl/gfl_r50_fpn_1x_visdrone.py
# RTMDet-tiny (300 epochs) on VisDrone2019-DET.
# Overrides the COCO base config named in `_base_`: class count, dataset
# paths, evaluator annotation files, logging interval and initial checkpoint.
_base_ = 'rtmdet_tiny_8xb32-300e_coco.py'

# The head's `num_classes` must match the number of dataset classes
# (10 names are listed in `metainfo` below).
model = dict(
    bbox_head=dict(
        num_classes=10
    )
)

# Dataset settings.
# NOTE: `data_root` is an absolute path on the author's machine; change it locally.
data_root = '/home/hjj/Desktop/dataset/dataset_visdrone/'
metainfo = {
    'classes': ('pedestrian', 'people', 'bicycle', 'car', 'van', 'truck', 'tricycle', 'awning-tricycle', 'bus', 'motor'),
    # 'palette': [
    #     (220, 20, 60),
    # ]
}
# `ann_file` and `data_prefix` below are given relative to `data_root`.
train_dataloader = dict(
    batch_size=16,
    num_workers=8,
    dataset=dict(
        data_root=data_root,
        metainfo=metainfo,
        ann_file='VisDrone2019-DET-train/annotations/train.json',
        data_prefix=dict(img='VisDrone2019-DET-train/images/')))
val_dataloader = dict(
    batch_size=16,
    num_workers=8,
    dataset=dict(
        data_root=data_root,
        metainfo=metainfo,
        ann_file='VisDrone2019-DET-val/annotations/val.json',
        data_prefix=dict(img='VisDrone2019-DET-val/images/')))
test_dataloader = dict(
    batch_size=16,
    num_workers=8,
    dataset=dict(
        data_root=data_root,
        metainfo=metainfo,
        ann_file='VisDrone2019-DET-test-dev/annotations/test.json',
        data_prefix=dict(img='VisDrone2019-DET-test-dev/images/')))

# Evaluator settings: the evaluators take the full annotation path,
# so `data_root` is prepended explicitly here.
val_evaluator = dict(ann_file=data_root + 'VisDrone2019-DET-val/annotations/val.json')
test_evaluator = dict(ann_file=data_root + 'VisDrone2019-DET-test-dev/annotations/test.json')

# Optional optimizer wrapper override, disabled by default.
# optim_wrapper = dict(type='AmpOptimWrapper')

# Logger hook interval.
default_hooks = dict(logger=dict(type='LoggerHook', interval=200))
# Checkpoint file used to initialise the model (path relative to the working directory).
load_from='rtmdet_tiny_8xb32-300e_coco_20220902_112414-78e30dcc.pth'

# Example commands (train in background and follow the log / test with visualisation / test with TTA / FLOPs):
# nohup python tools/train.py configs/rtmdet/rtmdet_tiny_8xb32-300e_visdrone.py > rtmdet-tiny-visdrone.log 2>&1 & tail -f rtmdet-tiny-visdrone.log
# python tools/test.py configs/rtmdet/rtmdet_tiny_8xb32-300e_visdrone.py work_dirs/rtmdet_tiny_8xb32-300e_visdrone/epoch_300.pth --show --show-dir test_save
# python tools/test.py configs/rtmdet/rtmdet_tiny_8xb32-300e_visdrone.py work_dirs/rtmdet_tiny_8xb32-300e_visdrone/epoch_300.pth --tta
# python tools/analysis_tools/get_flops.py configs/rtmdet/rtmdet_tiny_8xb32-300e_visdrone.py
# YOLOX-tiny (300 epochs) on VisDrone2019-DET.
# Overrides the COCO base config named in `_base_`: class count, pipelines,
# datasets/dataloaders, evaluators, logging interval and initial checkpoint.
_base_ = './yolox_tiny_8xb8-300e_coco.py'

# The head's `num_classes` must match the number of dataset classes
# (10 names are listed in `metainfo` below).
model = dict(
    bbox_head=dict(
        num_classes=10
    )
)

# Dataset settings.
# NOTE: `data_root` is an absolute path on the author's machine; change it locally.
data_root = '/home/hjj/Desktop/dataset/dataset_visdrone/'
dataset_type = 'CocoDataset'
metainfo = {
    'classes': ('pedestrian', 'people', 'bicycle', 'car', 'van', 'truck', 'tricycle', 'awning-tricycle', 'bus', 'motor'),
    # 'palette': [
    #     (220, 20, 60),
    # ]
}

# Example to use different file client
# Method 1: simply set the data root and let the file I/O module
# automatically infer from prefix (not support LMDB and Memcache yet)

# data_root = 's3://openmmlab/datasets/detection/coco/'

# Method 2: Use `backend_args`, `file_client_args` in versions before 3.0.0rc6
# backend_args = dict(
#     backend='petrel',
#     path_mapping=dict({
#         './data/': 's3://openmmlab/datasets/detection/',
#         'data/': 's3://openmmlab/datasets/detection/'
#     }))
backend_args = None

img_scale = (640, 640)  # width, height

train_pipeline = [
    dict(type='Mosaic', img_scale=img_scale, pad_val=114.0),
    dict(
        type='RandomAffine',
        scaling_ratio_range=(0.1, 2),
        # img_scale is (width, height)
        border=(-img_scale[0] // 2, -img_scale[1] // 2)),
    dict(
        type='MixUp',
        img_scale=img_scale,
        ratio_range=(0.8, 1.6),
        pad_val=114.0),
    dict(type='YOLOXHSVRandomAug'),
    dict(type='RandomFlip', prob=0.5),
    # According to the official implementation, multi-scale
    # training is not considered here but in the
    # 'mmdet/models/detectors/yolox.py'.
    # Resize and Pad are for the last 15 epochs when Mosaic,
    # RandomAffine, and MixUp are closed by YOLOXModeSwitchHook.
    dict(type='Resize', scale=img_scale, keep_ratio=True),
    dict(
        type='Pad',
        pad_to_square=True,
        # If the image is three-channel, the pad value needs
        # to be set separately for each channel.
        pad_val=dict(img=(114.0, 114.0, 114.0))),
    dict(type='FilterAnnotations', min_gt_bbox_wh=(1, 1), keep_empty=False),
    dict(type='PackDetInputs')
]

train_dataset = dict(
    # use MultiImageMixDataset wrapper to support mosaic and mixup
    type='MultiImageMixDataset',
    dataset=dict(
        type=dataset_type,
        data_root=data_root,
        metainfo=metainfo,
        ann_file='VisDrone2019-DET-train/annotations/train.json',
        data_prefix=dict(img='VisDrone2019-DET-train/images/'),
        pipeline=[
            dict(type='LoadImageFromFile', backend_args=backend_args),
            dict(type='LoadAnnotations', with_bbox=True)
        ],
        filter_cfg=dict(filter_empty_gt=False, min_size=32),
        backend_args=backend_args),
    pipeline=train_pipeline)

test_pipeline = [
    dict(type='LoadImageFromFile', backend_args=backend_args),
    dict(type='Resize', scale=img_scale, keep_ratio=True),
    dict(
        type='Pad',
        pad_to_square=True,
        pad_val=dict(img=(114.0, 114.0, 114.0))),
    dict(type='LoadAnnotations', with_bbox=True),
    dict(
        type='PackDetInputs',
        meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape',
                   'scale_factor'))
]

train_dataloader = dict(
    batch_size=16,
    num_workers=8,
    persistent_workers=True,
    sampler=dict(type='DefaultSampler', shuffle=True),
    dataset=train_dataset)
val_dataloader = dict(
    batch_size=16,
    num_workers=8,
    persistent_workers=True,
    drop_last=False,
    sampler=dict(type='DefaultSampler', shuffle=False),
    dataset=dict(
        type=dataset_type,
        data_root=data_root,
        metainfo=metainfo,
        ann_file='VisDrone2019-DET-val/annotations/val.json',
        data_prefix=dict(img='VisDrone2019-DET-val/images/'),
        test_mode=True,
        pipeline=test_pipeline,
        backend_args=backend_args))
test_dataloader = dict(
    batch_size=16,
    num_workers=8,
    persistent_workers=True,
    drop_last=False,
    sampler=dict(type='DefaultSampler', shuffle=False),
    dataset=dict(
        type=dataset_type,
        data_root=data_root,
        metainfo=metainfo,
        ann_file='VisDrone2019-DET-test-dev/annotations/test.json',
        data_prefix=dict(img='VisDrone2019-DET-test-dev/images/'),
        test_mode=True,
        pipeline=test_pipeline,
        backend_args=backend_args))

# Evaluator settings: the evaluators take the full annotation path,
# so `data_root` is prepended explicitly here.
val_evaluator = dict(
    type='CocoMetric',
    ann_file=data_root + 'VisDrone2019-DET-val/annotations/val.json',
    metric='bbox',
    backend_args=backend_args)
test_evaluator = dict(
    type='CocoMetric',
    ann_file=data_root + 'VisDrone2019-DET-test-dev/annotations/test.json',
    metric='bbox',
    backend_args=backend_args)

# Logger hook interval.
default_hooks = dict(logger=dict(type='LoggerHook', interval=200))
# Checkpoint file used to initialise the model (path relative to the working
# directory). The key was previously misspelled `load_form`, which the
# framework does not read, so the COCO weights were never loaded.
load_from = 'yolox_tiny_8x8_300e_coco_20211124_171234-b4047906.pth'

# Example commands (train in background and follow the log / test with visualisation / test with TTA / FLOPs):
# nohup python tools/train.py configs/yolox/yolox_tiny_8xb8-300e_visdrone.py > yolox-tiny-visdrone.log 2>&1 & tail -f yolox-tiny-visdrone.log
# python tools/test.py configs/yolox/yolox_tiny_8xb8-300e_visdrone.py work_dirs/yolox_tiny_8xb8-300e_visdrone/epoch_300.pth --show --show-dir test_save
# python tools/test.py configs/yolox/yolox_tiny_8xb8-300e_visdrone.py work_dirs/yolox_tiny_8xb8-300e_visdrone/epoch_300.pth --tta
# python tools/analysis_tools/get_flops.py configs/yolox/yolox_tiny_8xb8-300e_visdrone.py
def scale_boxes(img1_shape, boxes, img0_shape, ratio_pad=None):
    """Rescale xyxy boxes from ``img1_shape`` coordinates back to ``img0_shape``.

    Args:
        img1_shape: (height, width) of the resized and padded image the boxes
            currently refer to.
        boxes: array or tensor whose last dimension starts with x1, y1, x2, y2.
            It is modified in place.
        img0_shape: (height, width) of the target image.
        ratio_pad: optional ``((gain, ...), (pad_x, pad_y))``. When given it is
            used instead of deriving gain and padding from the two shapes.

    Returns:
        The same ``boxes`` object after un-padding, un-scaling and clipping.
    """
    if ratio_pad is None:
        # Derive the resize gain and the symmetric padding from the two shapes.
        gain = min(img1_shape[0] / img0_shape[0], img1_shape[1] / img0_shape[1])  # gain = old / new
        pad = (img1_shape[1] - img0_shape[1] * gain) / 2, (img1_shape[0] - img0_shape[0] * gain) / 2  # (x, y)
    else:
        gain = ratio_pad[0][0]
        pad = ratio_pad[1]

    boxes[..., [0, 2]] -= pad[0]  # remove x padding
    boxes[..., [1, 3]] -= pad[1]  # remove y padding
    boxes[..., :4] /= gain
    clip_boxes(boxes, img0_shape)
    return boxes


def box_iou(box1, box2, eps=1e-7):
    """Calculate the pairwise intersection-over-union (IoU) of two box sets.

    Both sets are expected in (x1, y1, x2, y2) format.
    Based on https://github.com/pytorch/vision/blob/master/torchvision/ops/boxes.py

    Args:
        box1: (N, 4) boxes, as a torch.Tensor or anything ``torch.as_tensor``
            accepts (e.g. a NumPy array).
        box2: (M, 4) boxes, same accepted types as ``box1``.
        eps (float, optional): small value added to the union to avoid
            division by zero. Defaults to 1e-7.

    Returns:
        (torch.Tensor): an (N, M) float tensor with the IoU of every pair.
    """
    # Accept NumPy input too; as_tensor is a no-op for tensors.
    box1 = torch.as_tensor(box1)
    box2 = torch.as_tensor(box2)

    # .float() is needed to get accurate IoU values.
    # inter(N,M) = (rb(N,M,2) - lt(N,M,2)).clamp(0).prod(2)
    (a1, a2), (b1, b2) = box1.float().unsqueeze(1).chunk(2, 2), box2.float().unsqueeze(0).chunk(2, 2)
    inter = (torch.min(a2, b2) - torch.max(a1, b1)).clamp_(0).prod(2)

    # IoU = inter / (area1 + area2 - inter)
    return inter / ((a2 - a1).prod(2) + (b2 - b1).prod(2) - inter + eps)


def process_batch(detections, labels, iouv):
    """Return the correct-prediction matrix for one image.

    Args:
        detections: (N, 6) rows of x1, y1, x2, y2, conf, class.
        labels: (M, 5) rows of class, x1, y1, x2, y2.
        iouv: 1-D sequence of IoU thresholds.

    All three may be torch tensors or NumPy arrays.

    Returns:
        (torch.Tensor): (N, len(iouv)) bool tensor on ``iouv``'s device; entry
        [n, i] is True when detection n is matched to a label of the same
        class at threshold ``iouv[i]``.
    """
    detections = torch.as_tensor(detections)
    labels = torch.as_tensor(labels)
    iouv = torch.as_tensor(iouv)

    correct = np.zeros((detections.shape[0], iouv.shape[0])).astype(bool)
    iou = box_iou(labels[:, 1:], detections[:, :4])
    correct_class = labels[:, 0:1] == detections[:, 5]
    for i in range(len(iouv)):
        x = torch.where((iou >= iouv[i]) & correct_class)  # IoU above threshold and classes match
        if x[0].shape[0]:
            # Columns: [label index, detection index, iou]
            matches = torch.cat((torch.stack(x, 1), iou[x[0], x[1]][:, None]), 1).cpu().numpy()
            if x[0].shape[0] > 1:
                # Sort by IoU (best first), then keep one row per detection
                # and one row per label.
                matches = matches[matches[:, 2].argsort()[::-1]]
                matches = matches[np.unique(matches[:, 1], return_index=True)[1]]
                matches = matches[np.unique(matches[:, 0], return_index=True)[1]]
            correct[matches[:, 1].astype(int), i] = True
    return torch.tensor(correct, dtype=torch.bool, device=iouv.device)
def smooth(y, f=0.05):
    """Smooth ``y`` with a box filter whose width is about fraction ``f`` of its length.

    The signal is edge-padded with its first and last value so the result has
    the same length as ``y``.

    Args:
        y: 1-D array.
        f: filter width as a fraction of ``len(y)``.

    Returns:
        1-D array of ``len(y)`` smoothed values.
    """
    nf = round(len(y) * f * 2) // 2 + 1  # number of filter elements
    if nf % 2 == 0:
        # The padding below assumes an odd width; an even width would return
        # len(y) + 1 samples.
        nf += 1
    p = np.ones(nf // 2)  # ones padding
    yp = np.concatenate((p * y[0], y, p * y[-1]), 0)  # y padded
    return np.convolve(yp, np.ones(nf) / nf, mode='valid')  # y-smoothed


def ap_per_class(tp, conf, pred_cls, target_cls, plot=False, save_dir='.', names=(), eps=1e-16, prefix=''):
    """Compute per-class precision, recall, F1 and average precision.

    Source: https://github.com/rafaelpadilla/Object-Detection-Metrics.

    Args:
        tp: (n, k) array, one column per IoU threshold, marking each detection
            as a true positive or not.
        conf: (n,) detection confidences.
        pred_cls: (n,) predicted class of each detection.
        target_cls: (m,) class of every ground-truth object.
        plot: when True, the interpolated precision curve of the first
            threshold is collected per class (it is not returned or drawn by
            this function).
        save_dir, names, prefix: accepted for signature compatibility; not
            used by this function.
        eps: small value guarding divisions.

    Returns:
        Tuple ``(tp, fp, p, r, f1, ap, classes)``. ``ap`` has shape
        (num_classes, k). ``p``, ``r`` and ``f1`` are per-class values taken at
        the confidence that maximises the smoothed mean F1; ``tp`` and ``fp``
        are counts derived from them. ``classes`` holds the unique target
        classes as ints.
    """
    # Sort detections by decreasing confidence
    i = np.argsort(-conf)
    tp, conf, pred_cls = tp[i], conf[i], pred_cls[i]

    # Find unique target classes and how many targets each has
    unique_classes, nt = np.unique(target_cls, return_counts=True)
    nc = unique_classes.shape[0]  # number of classes

    # Create precision-recall curves and compute AP for each class
    px, py = np.linspace(0, 1, 1000), []  # confidence grid, curves for plotting
    ap, p, r = np.zeros((nc, tp.shape[1])), np.zeros((nc, 1000)), np.zeros((nc, 1000))
    for ci, c in enumerate(unique_classes):
        i = pred_cls == c
        n_l = nt[ci]  # number of labels
        n_p = i.sum()  # number of predictions
        if n_p == 0 or n_l == 0:
            continue

        # Accumulate FPs and TPs
        fpc = (1 - tp[i]).cumsum(0)
        tpc = tp[i].cumsum(0)

        # Recall
        recall = tpc / (n_l + eps)  # recall curve
        r[ci] = np.interp(-px, -conf[i], recall[:, 0], left=0)  # negative x, xp because xp decreases

        # Precision
        precision = tpc / (tpc + fpc)  # precision curve
        p[ci] = np.interp(-px, -conf[i], precision[:, 0], left=1)  # p at pr_score

        # AP from the recall-precision curve, one value per IoU threshold
        for j in range(tp.shape[1]):
            ap[ci, j], mpre, mrec = compute_ap(recall[:, j], precision[:, j])
            if plot and j == 0:
                py.append(np.interp(px, mrec, mpre))  # precision at the first threshold

    # Compute F1 (harmonic mean of precision and recall)
    f1 = 2 * p * r / (p + r + eps)

    i = smooth(f1.mean(0), 0.1).argmax()  # max F1 index
    p, r, f1 = p[:, i], r[:, i], f1[:, i]
    tp = (r * nt).round()  # true positives
    fp = (tp / (p + eps) - tp).round()  # false positives
    return tp, fp, p, r, f1, ap, unique_classes.astype(int)


def compute_ap(recall, precision, method='interp'):
    """Compute the average precision from recall and precision curves.

    Args:
        recall: recall curve (1-D sequence).
        precision: precision curve (1-D sequence, same length).
        method: ``'interp'`` integrates a 101-point interpolation of the
            precision envelope; any other value uses the area under the
            envelope at the points where recall changes.

    Returns:
        Tuple ``(ap, mpre, mrec)``: the average precision, the precision
        envelope and the recall curve, both with sentinel values added.
    """
    # Append sentinel values to beginning and end
    mrec = np.concatenate(([0.0], recall, [1.0]))
    mpre = np.concatenate(([1.0], precision, [0.0]))

    # Compute the precision envelope (monotonically non-increasing)
    mpre = np.flip(np.maximum.accumulate(np.flip(mpre)))

    # Integrate area under curve
    if method == 'interp':
        # np.trapz was renamed to np.trapezoid in NumPy 2.0; use whichever exists.
        integrate = getattr(np, 'trapezoid', None) or np.trapz
        x = np.linspace(0, 1, 101)  # 101-point interp (COCO)
        ap = integrate(np.interp(x, mrec, mpre), x)
    else:  # 'continuous'
        i = np.where(mrec[1:] != mrec[:-1])[0]  # points where x axis (recall) changes
        ap = np.sum((mrec[i + 1] - mrec[i]) * mpre[i + 1])  # area under curve

    return ap, mpre, mrec
def parse_opt():
    """Parse the command-line options of the evaluation script.

    Unknown arguments are ignored (``parse_known_args``).

    Returns:
        argparse.Namespace with ``label_coco``, ``pred_coco``, ``iou`` and
        ``conf``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--label_coco', type=str, default='/home/hjj/Desktop/dataset/dataset_visdrone/test_coco.json', help='label coco path')
    parser.add_argument('--pred_coco', type=str, default='runs/val/exp/predictions.json', help='pred coco path')
    # parser.add_argument('--pred_coco', type=str, default='/home/hjj/Desktop/github_code/mmdetection-visdrone/work_dirs/dino-4scale_r50_8xb2-12e_visdrone/test/prediction.pickle', help='pred coco path')
    # NOTE: --iou and --conf are parsed but the script below reads neither.
    parser.add_argument('--iou', type=float, default=0.7, help='iou threshold')
    parser.add_argument('--conf', type=float, default=0.001, help='conf threshold')
    opt = parser.parse_known_args()[0]
    return opt


if __name__ == '__main__':
    opt = parse_opt()

    iouv = torch.linspace(0.5, 0.95, 10)  # IoU thresholds for mAP@0.5:0.95
    niou = iouv.numel()
    stats = []

    label_coco_json_path, pred_coco_json_path = opt.label_coco, opt.pred_coco
    with open(label_coco_json_path) as f:
        label_coco = json.load(f)

    # Class names keyed by category id, so rows can be labelled by the class id
    # that ap_per_class reports rather than by list position.
    class_names = {int(cat['id']): cat['name'] for cat in label_coco['categories']}

    image_id_hw_dict = {}
    stem_to_id = {}  # file-name stem -> image id, used to resolve stem-keyed predictions
    for img in label_coco['images']:
        image_id_hw_dict[img['id']] = [img['height'], img['width']]
        if 'file_name' in img:
            stem_to_id[os.path.splitext(os.path.basename(img['file_name']))[0]] = img['id']

    def resolve_image_id(key):
        """Map a prediction's image key onto an image id of the label file."""
        if key in image_id_hw_dict:
            return key
        return stem_to_id.get(key, key)

    # Ground truth per image: rows of [class, x1, y1, x2, y2]
    label_id_dict = {}
    for data in tqdm.tqdm(label_coco['annotations'], desc='Process label...'):
        x_min, y_min, w, h = data['bbox'][:4]
        x_max, y_max = x_min + w, y_min + h
        label_id_dict.setdefault(data['image_id'], []).append(
            np.array([int(data['category_id']), x_min, y_min, x_max, y_max]))

    # Predictions per image: rows of [x1, y1, x2, y2, score, class]
    pred_id_dict = {}
    if pred_coco_json_path.endswith('json'):
        with open(pred_coco_json_path) as f:
            pred = json.load(f)
        for data in tqdm.tqdm(pred, desc='Process pred...'):
            x_min, y_min, w, h = data['bbox'][:4]
            x_max, y_max = x_min + w, y_min + h
            pred_id_dict.setdefault(resolve_image_id(data['image_id']), []).append(
                np.array([x_min, y_min, x_max, y_max, float(data['score']), int(data['category_id'])]))
    else:
        # SECURITY: pickle.load can execute arbitrary code embedded in the file;
        # only evaluate prediction files you produced yourself.
        with open(pred_coco_json_path, 'rb') as f:
            pred = pickle.load(f)
        for data in tqdm.tqdm(pred, desc='Process pred...'):
            stem = os.path.splitext(os.path.basename(data['img_path']))[0]
            entries = pred_id_dict.setdefault(resolve_image_id(stem), [])
            instances = data['pred_instances']
            for i in range(instances['labels'].size(0)):
                score = instances['scores'][i]
                category_id = instances['labels'][i]
                # Boxes are used as stored; no scale_factor correction is applied.
                x_min, y_min, x_max, y_max = instances['bboxes'][i].cpu().detach().numpy()
                entries.append(np.array([x_min, y_min, x_max, y_max, float(score), int(category_id)]))

    for image_id in tqdm.tqdm(list(image_id_hw_dict.keys()), desc="Cal mAP..."):
        # reshape keeps the arrays 2-D even when an image has no labels or no
        # predictions (previously a missing label entry raised KeyError).
        labels = np.array(label_id_dict.get(image_id, []), dtype=np.float64).reshape(-1, 5)
        preds = torch.from_numpy(np.array(pred_id_dict.get(image_id, []), dtype=np.float64).reshape(-1, 6))

        nl, npr = labels.shape[0], preds.shape[0]
        target_cls = torch.from_numpy(labels[:, 0])
        correct = torch.zeros(npr, niou, dtype=torch.bool)
        if npr == 0:
            if nl:
                # No detections: record the targets so they count as misses.
                stats.append((correct, *torch.zeros((2, 0), dtype=torch.float64), target_cls))
            continue

        if nl:
            correct = process_batch(preds, torch.from_numpy(labels), iouv)
        stats.append((correct, preds[:, 4], preds[:, 5], target_cls))

    if not stats:
        raise SystemExit('Nothing to evaluate: no image has labels or predictions.')

    stats = [torch.cat(x, 0).cpu().numpy() for x in zip(*stats)]
    tp, fp, p, r, f1, ap, ap_class = ap_per_class(*stats)
    print(f'precision:{p}')
    print(f'recall:{r}')
    print(f'mAP@0.5:{ap[:, 0]}')

    table = PrettyTable()
    table.title = "Metrice"
    table.field_names = ["Classes", 'Precision', 'Recall', 'mAP50', 'mAP50-95']
    table.add_row(['all', f'{np.mean(p):.3f}', f'{np.mean(r):.3f}', f'{np.mean(ap[:, 0]):.3f}', f'{np.mean(ap):.3f}'])
    # Metric rows follow ap_class (classes that actually have targets), so look
    # the name up by class id instead of by position in the category list.
    for row, cls_id in enumerate(ap_class):
        name = class_names.get(int(cls_id), str(int(cls_id)))
        table.add_row([name, f'{p[row]:.3f}', f'{r[row]:.3f}', f'{ap[row, 0]:.3f}', f'{ap[row, :].mean():.3f}'])
    print(table)
f1, ap, ap_class = ap_per_class(*stats) print(f'precision:{p}') print(f'recall:{r}') print(f'mAP@0.5:{ap[:, 0]}') table = PrettyTable() table.title = f"Metrice" table.field_names = ["Classes", 'Precision', 'Recall', 'mAP50', 'mAP50-95'] table.add_row(['all', f'{np.mean(p):.3f}', f'{np.mean(r):.3f}', f'{np.mean(ap[:, 0]):.3f}', f'{np.mean(ap):.3f}']) for cls_idx, classes in enumerate(classes): table.add_row([classes, f'{p[cls_idx]:.3f}', f'{r[cls_idx]:.3f}', f'{ap[cls_idx, 0]:.3f}', f'{ap[cls_idx, :].mean():.3f}']) print(table) ================================================ FILE: mmdet-course/readme.md ================================================ # mmdet使用教程 ### mmdet教程命令 1. conda create -n mmdet_py39 python=3.9 anaconda 2. https://mmdetection.readthedocs.io/en/latest/get_started.html 3. https://pytorch.org/get-started/previous-versions/ pip install torch==2.1.0 torchvision==0.16.0 torchaudio==2.1.0 --index-url https://download.pytorch.org/whl/cu121 4. https://mmdetection.readthedocs.io/zh-cn/latest/user_guides/train.html#id7 ### mmdet运行命令 1. 训练 python tools/train.py 2. 测试 python tools/test.py --out 3. 计算量、参数量计算脚本 python tools/analysis_tools/get_flops.py 4. 推理时间、fps、gpu memory计算脚本 python tools/analysis_tools/benchmark.py --checkpoint --task inference --fuse-conv-bn 5. 绘制曲线图脚本 python tools/analysis_tools/analyze_logs.py plot_curve --keys --legend --out 6. 结果分析脚本 python tools/analysis_tools/analyze_results.py ### mmdet视频教程链接(可按顺序观看) 1. [一库打尽目标检测对比实验!mmdetection环境、训练、测试手把手教程!](https://www.bilibili.com/video/BV1xA4m1c7H8/) 2. [一库打尽目标检测对比实验!mmdetection参数量、计算量、FPS、绘制logs手把手教程](https://www.bilibili.com/video/BV17C41137dW/) 3. 
[一库打尽目标检测对比实验!mmdetection指标转换YOLO指标!](https://www.bilibili.com/video/BV1AWtCesEc6/) ### mmdet实验数据(指标均为COCO指标) 以下实验数据环境: python:3.9.19 torch:2.1.0+cu121 torchvision:0.16.0 mmdet:3.3.0 mmcv:2.1.0 mmengine:0.10.3 硬件环境: Platform:Ubuntu CPU:i7-12700K RAM:32G GPU:RTX3090 #### VisDrone2019-testset | model | Input Shape | GFlops | Params | coco/bbox_mAP | coco/bbox_mAP_50 | coco/bbox_mAP_s | coco/bbox_mAP_m | coco/bbox_mAP_l | | :----: | :----: | :----: | :----: | :----: | :----: | :----: | :----: | :----: | | Faster-RCNN-R50-FPN-CIOU | (768, 1344) | 208G | 41.39M | 0.194 | 0.329 | 0.095 | 0.309 | 0.429 | | Cascade-RCNN-R50-FPN | (768, 1344) | 236G | 69.29M | 0.197 | 0.326 | 0.099 | 0.309 | 0.406 | | ATSS-R50-FPN-DyHead | (768, 1344) | 110G | 38.91M | 0.204 | 0.338 | 0.100 | 0.317 | 0.485 | | TOOD-R50 | (768, 1344) | 199G | 32.04M | 0.204 | 0.339 | 0.102 | 0.317 | 0.403 | | DINO | (750, 1333) | 274G | 47.56M | 0.253 | 0.445 | 0.150 | 0.371 | 0.503 | | DDQ | (768, 1333) | - | - | 0.268 | 0.463 | 0.159 | 0.390 | 0.526 | | YOLOX-Tiny | (640, 640) | 7.578G | 5.035M | 0.148 | 0.278 | 0.076 | 0.221 | 0.278 | | GFL | (768, 1344) | 206G | 32.279M | 0.193 | 0.321 | 0.094 | 0.300 | 0.409 | | RTMDet-Tiny | (640, 640) | 8.033G | 4.876M | 0.184 | 0.312 | 0.077 | 0.288 | 0.445 | | RetinaNet-R50-FPN | (768, 1344) | 210G | 36.517M | 0.164 | 0.276 | 0.060 | 0.274 | 0.427 | ================================================ FILE: mmdet-course/yolo2coco.py ================================================ import os import cv2 import json from tqdm import tqdm from sklearn.model_selection import train_test_split import argparse # python yolo2coco.py --root_dir VisDrone2019-DET-train --save_path train.json # python yolo2coco.py --root_dir VisDrone2019-DET-val --save_path val.json # python yolo2coco.py --root_dir VisDrone2019-DET-test-dev --save_path test.json parser = argparse.ArgumentParser() parser.add_argument('--root_dir', default='./dataset/valid',type=str, help="root path of images and 
def train_test_val_split_random(img_paths, ratio_train=0.8, ratio_test=0.1, ratio_val=0.1):
    """Randomly split ``img_paths`` into train, val and test lists.

    The split ratios can be changed through the three ratio arguments; they
    must sum to 1.

    Args:
        img_paths: sequence of image names.
        ratio_train: fraction assigned to the training set.
        ratio_test: fraction assigned to the test set.
        ratio_val: fraction assigned to the validation set.

    Returns:
        Tuple ``(train_img, val_img, test_img)``.

    Raises:
        AssertionError: if the ratios do not sum to 1.
    """
    # Compare with a tolerance: int(0.7 + 0.2 + 0.1) is 0 because the float sum
    # is 0.999..., and int() would also accept any sum below 2.
    total = ratio_train + ratio_test + ratio_val
    assert abs(total - 1.0) < 1e-6, f'split ratios must sum to 1, got {total}'

    train_img, middle_img = train_test_split(img_paths, test_size=1-ratio_train, random_state=233)
    # The second element returned by train_test_split receives `test_size` of
    # the remainder, so the test share (not the val share) must be passed here.
    ratio = ratio_test / (1 - ratio_train)
    val_img, test_img = train_test_split(middle_img, test_size=ratio, random_state=233)
    print("NUMS of train:val:test = {}:{}:{}".format(len(train_img), len(val_img), len(test_img)))
    return train_img, val_img, test_img


def train_test_val_split_by_files(img_paths, root_dir):
    """Read the train/val/test split from ``train.txt``, ``val.txt`` and ``test.txt``.

    Each file lives in ``root_dir`` and lists one image name per line.

    Args:
        img_paths: unused; kept for signature compatibility.
        root_dir: directory containing the three definition files.

    Returns:
        Tuple ``(train_img, val_img, test_img)`` of lists of names with
        surrounding whitespace removed and blank lines dropped.

    Raises:
        AssertionError: if one of the definition files does not exist.
    """
    phases = ['train', 'val', 'test']
    img_split = []
    for p in phases:
        define_path = os.path.join(root_dir, f'{p}.txt')
        print(f'Read {p} dataset definition from {define_path}')
        assert os.path.exists(define_path), f'missing split definition file: {define_path}'
        with open(define_path, 'r') as f:
            # Strip the trailing newline; otherwise a name never equals the
            # entries returned by os.listdir.
            names = [line.strip() for line in f if line.strip()]
            # names = [os.path.split(name)[1] for name in names]  # NOTE: uncomment to accept absolute paths.
        img_split.append(names)
    return img_split[0], img_split[1], img_split[2]
def yolo2coco(arg):
    """Convert a YOLO-format dataset under ``arg.root_dir`` to COCO JSON.

    ``arg.root_dir`` must contain ``images/``, ``labels/`` and ``classes.txt``.
    Output is written to ``<root_dir>/annotations/``: ``train.json``,
    ``val.json`` and ``test.json`` when ``arg.random_split`` or
    ``arg.split_by_file`` is set, otherwise a single file named
    ``arg.save_path``.

    Args:
        arg: namespace with ``root_dir``, ``save_path``, ``random_split`` and
            ``split_by_file``.

    Raises:
        AssertionError: if ``arg.root_dir`` does not exist.
    """
    root_path = arg.root_dir
    print("Loading data from ",root_path)
    assert os.path.exists(root_path)
    originLabelsDir = os.path.join(root_path, 'labels')
    originImagesDir = os.path.join(root_path, 'images')
    with open(os.path.join(root_path, 'classes.txt')) as f:
        classes = f.read().strip().split()
    # File names inside the images directory
    indexes = os.listdir(originImagesDir)

    def new_dataset():
        """Return an empty COCO dict whose category ids start at 0."""
        return {
            'categories': [{'id': i, 'name': cls, 'supercategory': 'mark'} for i, cls in enumerate(classes)],
            'annotations': [],
            'images': [],
        }

    split_mode = arg.random_split or arg.split_by_file
    if split_mode:
        train_dataset, val_dataset, test_dataset = new_dataset(), new_dataset(), new_dataset()
        if arg.random_split:
            print("spliting mode: random split")
            train_img, val_img, test_img = train_test_val_split_random(indexes,0.8,0.1,0.1)
        else:
            print("spliting mode: split by files")
            train_img, val_img, test_img = train_test_val_split_by_files(indexes, root_path)
        # Name -> target dataset for O(1) lookup. Filled test, val, train so
        # that train wins when a name is listed more than once. Names are
        # stripped because file-based splits may carry trailing newlines.
        split_of = {}
        for names, target in ((test_img, test_dataset), (val_img, val_dataset), (train_img, train_dataset)):
            for name in names:
                split_of[name.strip()] = target
    else:
        dataset = new_dataset()

    # Running annotation id
    ann_id_cnt = 0
    for k, index in enumerate(tqdm(indexes)):
        # Label file shares the image's base name, whatever the image extension is.
        txtFile = os.path.splitext(index)[0] + '.txt'
        # Read the image to get its width and height
        im = cv2.imread(os.path.join(originImagesDir, index))
        if im is None:
            print('Skip {}: not a readable image'.format(index))
            continue
        img_h, img_w = im.shape[:2]
        if split_mode:
            # Pick the dataset this image belongs to
            dataset = split_of.get(index)
            if dataset is None:
                print('Skip {}: not assigned to train, val or test'.format(index))
                continue
        # Add the image record
        dataset['images'].append({'file_name': index, 'id': k, 'width': img_w, 'height': img_h})
        label_path = os.path.join(originLabelsDir, txtFile)
        if not os.path.exists(label_path):
            # No label file: keep only the image record.
            continue
        with open(label_path, 'r') as fr:
            for line in fr:
                fields = line.split()
                if not fields:
                    continue  # blank line
                cls_id = int(fields[0])  # class ids start at 0
                x, y, w, h = (float(v) for v in fields[1:5])
                # Convert normalised centre x, y, w, h to pixel x1, y1, x2, y2
                x1 = (x - w / 2) * img_w
                y1 = (y - h / 2) * img_h
                x2 = (x + w / 2) * img_w
                y2 = (y + h / 2) * img_h
                box_w = max(0, x2 - x1)
                box_h = max(0, y2 - y1)
                dataset['annotations'].append({
                    'area': box_w * box_h,
                    'bbox': [x1, y1, box_w, box_h],
                    'category_id': cls_id,
                    'id': ann_id_cnt,
                    'image_id': k,
                    'iscrowd': 0,
                    # Mask: the rectangle's four corners, clockwise from top-left
                    'segmentation': [[x1, y1, x2, y1, x2, y2, x1, y2]]
                })
                ann_id_cnt += 1

    # Save the result
    folder = os.path.join(root_path, 'annotations')
    os.makedirs(folder, exist_ok=True)
    if split_mode:
        for phase, phase_dataset in (('train', train_dataset), ('val', val_dataset), ('test', test_dataset)):
            json_name = os.path.join(root_path, 'annotations/{}.json'.format(phase))
            with open(json_name, 'w') as f:
                json.dump(phase_dataset, f)
            print('Save annotation to {}'.format(json_name))
    else:
        json_name = os.path.join(root_path, 'annotations/{}'.format(arg.save_path))
        with open(json_name, 'w') as f:
            json.dump(dataset, f)
        print('Save annotation to {}'.format(json_name))
coco2017数据集标号混乱,不管它了。 cls_id = int(label[0]) width = max(0, x2 - x1) height = max(0, y2 - y1) dataset['annotations'].append({ 'area': width * height, 'bbox': [x1, y1, width, height], 'category_id': cls_id, 'id': ann_id_cnt, 'image_id': k, 'iscrowd': 0, # mask, 矩形是从左上角点按顺时针的四个顶点 'segmentation': [[x1, y1, x2, y1, x2, y2, x1, y2]] }) ann_id_cnt += 1 # 保存结果 folder = os.path.join(root_path, 'annotations') if not os.path.exists(folder): os.makedirs(folder) if arg.random_split or arg.split_by_file: for phase in ['train','val','test']: json_name = os.path.join(root_path, 'annotations/{}.json'.format(phase)) with open(json_name, 'w') as f: if phase == 'train': json.dump(train_dataset, f) elif phase == 'val': json.dump(val_dataset, f) elif phase == 'test': json.dump(test_dataset, f) print('Save annotation to {}'.format(json_name)) else: json_name = os.path.join(root_path, 'annotations/{}'.format(arg.save_path)) with open(json_name, 'w') as f: json.dump(dataset, f) print('Save annotation to {}'.format(json_name)) if __name__ == "__main__": yolo2coco(arg) ================================================ FILE: module-info/CVPR2023-SMPConv.md ================================================ # SMPConv模块总结 https://arxiv.org/pdf/2304.02330 ## 1. 背景 ### 连续卷积的兴起 连续卷积因其处理不规则采样数据和建模长期依赖关系的能力而备受关注[1]。随着大型卷积核在实验中展现出优异结果,连续卷积因能高效构建大型核而获得进一步发展[1]。 ### 现有方法的局限性 目前主流的连续卷积实现方法是使用多层感知机(MLP)作为神经场来生成核值[1][2]。然而,这种方法存在几个关键问题: - **计算开销大**:每次训练迭代都需要多次MLP的前向和反向传播来生成核并更新参数[1][2] - **超参数调优复杂**:需要调整激活函数、宽度、深度等大量架构变化[2][3] - **滤波器描述能力有限**:受到架构先验的严重影响[2][3] - **频谱偏差问题**:MLP训练中存在的频谱偏差影响性能[3] ### 大规模应用的挑战 由于计算复杂度高,基于MLP的方法难以应用于ImageNet等大规模问题[1][2]。 ## 2. 
模块原理 ### 核心设计思想 SMPConv提出使用**自移动点表示**和**插值方案**来实现连续函数,完全避免使用神经网络[3][6]。 ### 数学表示 SMPConv将连续核函数定义为: ``` SMP(x; φ) = (1/|N(x)|) Σ g(x, pi, ri)wi ``` 其中: - `φ = {{pi}, {wi}, {ri}}` 是可学习参数集合[7] - `pi ∈ Rd` 是自移动点的坐标[7] - `wi ∈ RNc` 是点的权重参数[7] - `ri ∈ R+` 是可学习的半径[7] ### 距离函数 使用L1距离定义邻域影响: ``` g(x, pi, ri) = 1 - ||x - pi||1/ri ``` 只有在一定距离范围内的点才会影响查询点[7]。 ### 关键特性 #### 自移动机制 - **坐标可学习**:点坐标`{pi}`在训练过程中更新,实现"移动"[7] - **自适应分布**:更多点可聚集在高频区域,少量点可表示低频成分[7] - **参数效率**:单个点可能足以近似单峰函数[3] #### 插值实现连续性 - 通过加权平均邻近点表示生成输出向量[7] - 在任意查询位置通过插值实现无限分辨率[3] ### 参数共享策略 在卷积层中,每个滤波器的所有通道共享位置参数,但拥有独立的权重参数[7][8]。这提供了合理的先验:卷积滤波器可以专注于输入域的特定区域[8]。 ## 3. 解决了什么问题 ### 3.1 计算效率问题 **问题**:MLP方法需要大量前向和反向传播计算[1][2] **解决方案**: - 仅使用点表示和插值,无需神经网络[3][4] - 训练速度比FlexConv快7倍以上[9] - 比Deformable Conv快2.5倍[9] ### 3.2 参数效率问题 **问题**:传统离散卷积参数数量随核大小平方增长[9] **解决方案**: - 参数数量为`(1 + d + C)Np`,与核分辨率无关[9] - 使用`Np ≪ N²`个点表示任意大小的核[9] - 固定参数预算下构建大型核[3][5] ### 3.3 频谱偏差问题 **问题**:MLP训练中的频谱偏差降低性能[3][4] **解决方案**: - 每个点表示覆盖输入域的局部区域[3] - 点独立更新,不影响整个输入域[3] - 邻近点的高度不同值可轻松表达高频成分[3] ### 3.4 架构复杂性问题 **问题**:MLP方法需要复杂的超参数搜索[2][3] **解决方案**: - 移除了新引入神经网络的超参数调优负担[4] - 可作为现有框架的即插即用替换[3] - 最小化架构先验[3] ### 3.5 大规模应用问题 **问题**:现有连续卷积方法无法处理ImageNet规模数据[2][5] **解决方案**: - 首次在ImageNet上成功应用连续卷积[5][13] - 在大规模设置中展示了相对于现有技术的改进[1] ### 3.6 表达能力限制问题 **问题**:现有方法的滤波器描述能力受限[2][3] **解决方案**: - 每个滤波器有独立参数,提供更多自由度[7][8] - 点可自由移动到最优位置[7] - 能够学习自适应的大型感受野[15] 通过这些创新,SMPConv成功地将连续卷积从概念验证阶段推进到实际大规模应用,为深度学习中的卷积操作提供了一个高效、实用的替代方案。 ================================================ FILE: module-info/CVPR2024-DCMPNet.md ================================================ # LEGM和MFM模块详细总结 https://arxiv.org/pdf/2403.01105 ## LEGM模块 (Local Feature-embedded Global Feature Extraction Module) ### 1. 背景 在图像去雾任务中,传统的卷积神经网络主要擅长提取局部特征,但在处理全局信息和长距离依赖关系方面存在局限性[7]。为了有效融合局部和全局特征信息,提高去雾网络的特征表示能力,作者设计了LEGM模块。 ### 2. 
模块原理 LEGM模块的核心组件是自注意力块(self-attention block)[7],其输入包括: - U-Net输出经过1×1卷积后的特征 - 经过3×3卷积的特征 - 深度估计网络(DE)后经过DRDB处理的特征 **工作机制**: - 将卷积层与自注意力块相结合,命名为LEGM[7] - 在深度信息辅助去雾中,只有第一个LEGM接收雾霾图像的深度信息[7] - 去雾网络编码器包含三个LEGM,其输出通过MSAAM进行整合以防止浅层特征丢失[7] ### 3. 解决的问题 - **局部-全局特征融合**:有效结合了卷积网络的局部特征提取能力和自注意力机制的全局建模能力 - **特征表示增强**:显著提升了网络的特征表示能力,消融实验显示相比基线模型PSNR提升了4.72dB[13] - **深度信息集成**:为深度信息的有效利用提供了合适的特征融合机制 --- ## MFM模块 (Modulation Fusion Module) ### 1. 背景 在去雾网络的解码过程中,需要有效融合来自不同层次和不同来源的特征信息。传统的特征融合方法(如简单相加或拼接)无法自适应地调整不同特征的重要性,可能导致关键信息被稀释或丢失[8]。 ### 2. 模块原理 MFM模块采用动态权重调制的特征融合策略[8]: **输入处理**: - 第一个MFM的输入是F̂¹ₗₑₘ和经过3×3卷积处理的特征F¹ᵣc - 将F̂¹ₗₑₘ和F¹ᵣc相加后,经过GAP(全局平均池化)、MLP和Softmax处理,得到权重矩阵A¹ᵣ,c[8] **特征调制**: 权重矩阵A¹ᵣ,c中的数值表示F̂¹ₗₑₘ和F¹ᵣc在去雾图像重建中的重要性程度。通过A¹ᵣ,c进行调制的具体过程为[8]: ``` F̃¹ᵣc = A¹ᵣ,c ⊙ F̂¹ₗₑₘ + A¹ᵣ,c ⊙ F¹ᵣc ``` **特征整合**: - 将F̃¹ᵣc和F̂¹ₗₑₘ进行拼接以增强它们之间的共享信息 - 拼接结果经过卷积层处理,产生第一个带FMI的LEGM输出[8] ### 3. 解决的问题 - **自适应特征融合**:通过动态调整融合权重,突出对去雾重建贡献更大的特征信息 - **特征表示增强**:提升网络的特征表示能力,消融实验显示在LEGM基础上进一步改善了模型性能[13] - **信息保持**:通过权重调制机制,确保重要的特征信息在融合过程中得到保留和强化 - **跨通道特征交互**:促进不同通道间的特征交互,提高整体的特征表达能力[14] --- ## 模块协同作用 LEGM和MFM模块在整个网络架构中形成了有效的协同作用: - **LEGM**负责局部-全局特征的有效提取和融合 - **MFM**负责不同特征间的自适应融合和调制 - 两个模块共同构成了去雾网络解码器中的核心组件,实现了高质量的特征重建和图像恢复[8] ================================================ FILE: module-info/CVPR2024-FADC.md ================================================ ### **FADC模块总结** https://arxiv.org/pdf/2403.05369 #### **1. 背景** 膨胀卷积(Dilated Convolution)通过插入间隔增加感受野,广泛应用于语义分割和目标检测任务。然而,传统膨胀卷积存在以下问题: - **高频信息丢失**:膨胀率增大导致卷积核的频率响应下降,限制了高频细节的捕获能力。[1][3][7] - **伪影问题**:当特征图的高频分量超过膨胀卷积的采样率时,会产生网格伪影(Gridding Artifacts)。[1][6][16] - **固定膨胀率的局限性**:传统方法使用全局固定的膨胀率,无法适应输入特征的局部变化,导致感受野与带宽的平衡不足。[1][4][7] 为了解决这些问题,作者提出了**频率自适应膨胀卷积(Frequency-Adaptive Dilated Convolution, FADC)**,从频谱分析角度优化膨胀卷积的性能。 --- #### **2. 模块原理** FADC包含三个核心模块,分别从膨胀率、卷积核权重和频率分量平衡三个方面进行改进: 1. 
**自适应膨胀率(Adaptive Dilation Rate, AdaDR)** - **动态调整膨胀率**:根据特征图的局部频率动态分配膨胀率。在高频区域(如边界),采用小膨胀率以捕获更多细节;在低频区域(如背景),采用大膨胀率以扩展感受野。[3][7][8] - **优化目标**:通过最大化感受野并最小化高频信息损失,平衡膨胀率与频率带宽。[7][8] 2. **自适应卷积核(Adaptive Kernel, AdaKern)** - **卷积核参数分解**:将卷积核权重分解为低频部分(平均值)和高频部分(残差)。[9] - **动态权重调整**:通过轻量级模块(全局池化+卷积层)动态调整高频和低频分量的比例,增强高频特征的捕获能力,提高有效带宽。[9][15] 3. **频率选择模块(Frequency Selection, FreqSelect)** - **频率分解**:将特征图分解为不同频段(如低频到高频),并通过二值掩码提取对应频率分量。[9][15] - **空间重加权**:根据输入特征的频率分布,动态调整不同频段的权重。通过抑制背景和对象中心的高频分量,鼓励网络学习更大的膨胀率,从而扩展感受野。[9][16] --- #### **3. 解决了什么问题** 1. **高频信息丢失** - AdaDR通过动态调整膨胀率,在高频区域保留更多细节,避免高频信息丢失。 - AdaKern增强高频分量的卷积响应能力,提高了特征图的高频信息捕获。[3][9] 2. **伪影问题(Gridding Artifacts)** - 通过动态调整膨胀率,FADC避免了特征频率超过采样率的情况,从而有效缓解伪影问题。[1][7][16] 3. **感受野与带宽的平衡不足** - AdaDR在局部动态分配膨胀率,优化了感受野与带宽的平衡。 - FreqSelect通过频率分量的空间重加权,进一步扩大了感受野,同时保留了关键的高频信息。[7][15][16] 4. **适配性与通用性不足** - FADC无需全局固定膨胀率,能够适应输入特征的局部变化,提高了网络的适配性。 - 模块设计轻量化,可无缝替换现有卷积层,适用于语义分割、目标检测等多种任务。[13][14] --- FADC通过频率视角优化膨胀卷积,提出的三大模块使其在捕获高频细节、扩展感受野以及解决伪影问题方面表现卓越,显著提升了语义分割和目标检测的性能。 ================================================ FILE: module-info/CVPR2024-PKINet.md ================================================ ### **PKI Module总结** https://openaccess.thecvf.com/content/CVPR2024/papers/Cai_Poly_Kernel_Inception_Network_for_Remote_Sensing_Detection_CVPR_2024_paper.pdf #### **1. 背景** 遥感目标检测任务中,目标尺度变化大(如小型车辆与大型建筑物)、背景复杂且上下文信息多样化。现有方法通过以下方式扩展感受野来解决问题: - **大核卷积**:用于捕获更多上下文信息,但容易引入背景噪声,影响小目标检测。 - **膨胀卷积**:扩大感受野,但可能导致特征表示过于稀疏,丢失细节信息。 这些方法未能有效处理目标尺度变化,同时保持局部纹理特征的完整性。[1][3] --- #### **2. 模块原理** PKI Module是一个**Inception风格**的模块,专为捕获多尺度纹理特征而设计,由以下部分组成: 1. **局部信息提取**: - 使用一个小核卷积(如3×3)提取局部纹理特征,确保捕获目标的细节信息。 - 数学表示: \[ L_{l-1,n} = \text{Conv}_{k_s \times k_s}(X_{l-1,n}) \] 其中,\( k_s \) 为小核大小(如3×3)。 2. **多尺度特征提取**: - 通过多个并行的**深度卷积核**(kernel size如5×5、7×7、9×9等)捕获不同尺度的上下文信息。 - 数学表示: \[ Z_{l-1,n}^{(m)} = \text{DWConv}_{k(m) \times k(m)}(L_{l-1,n}) \] 其中,\( k(m) = (m+1) \times 2 + 1 \),表示不同尺度的卷积核。 3. 
**特征融合**: - 将局部特征与多尺度特征通过1×1卷积进行通道融合,整合多尺度信息。 - 数学表示: \[ P_{l-1,n} = \text{Conv}_{1 \times 1}(L_{l-1,n} + \sum_{m=1}^{4} Z_{l-1,n}^{(m)}) \] - 这种融合机制确保了在不同尺度下捕获丰富的上下文信息,同时保持局部纹理特征的完整性。[6][7] --- #### **3. 解决了什么问题** PKI Module通过多尺度卷积核设计,解决了以下问题: 1. **目标尺度变化问题**: - 不同大小的卷积核能够捕获从小到大的目标特征,适应遥感图像中目标尺度跨度大的特性。 2. **背景噪声问题**: - 避免使用大核卷积,减少背景噪声对小目标检测的干扰。 3. **稀疏特征问题**: - 不使用膨胀卷积,避免特征表示稀疏导致的细节丢失,确保特征密度和完整性。 通过以上设计,PKI Module能够有效捕获**局部与多尺度上下文信息**,提升遥感图像目标检测的性能。[3][7][18] ================================================ FILE: module-info/CVPR2024-ParameterNet.md ================================================ # DynamicConv模块总结 https://arxiv.org/pdf/2306.14525v2 ## 1. 背景 ### 问题背景 在大规模视觉预训练中,研究者发现了"低FLOPs陷阱"现象:低FLOPs模型无法从大规模预训练数据中获益,而高FLOPs模型却能显著受益[1][2]。传统的解决方案是增加模型规模,但这会同时增加参数数量和计算复杂度(FLOPs),不适合移动设备等资源受限的场景[1]。 ### 设计需求 为了让低FLOPs模型也能从大规模预训练中受益,需要一种能够: - **大幅增加参数数量**以提升模型容量 - **几乎不增加FLOPs**以保持计算效率 - 适用于资源受限环境的技术方案[2][6] ## 2. 模块原理 ### 核心思想 DynamicConv通过**参数增强函数**实现"参数多、计算少"的目标: ``` W' = f(W) ``` 该函数需满足两个基本规则:1)计算成本低;2)大幅增加模型容量[6]。 ### 技术实现 **标准卷积**: ``` Y = X * W ``` 其中X ∈ R^(Cin×H×W)是输入特征,W ∈ R^(Cout×Cin×K×K)是权重张量[6]。 **动态卷积**: ``` Y = X * W' W' = Σ(i=1 to M) αi * Wi ``` 其中: - Wi是第i个卷积权重张量(共M个专家) - αi是对应的动态系数 - 系数根据不同输入样本动态生成[6][7] ### 动态系数生成机制 ``` α = softmax(MLP(Pool(X))) ``` 具体步骤: 1. 对输入X进行**全局平均池化**融合信息 2. 通过**两层MLP模块**处理 3. 使用**softmax激活**产生动态系数α ∈ R^M[7] ### 复杂度分析 **参数数量**: - 标准卷积:Cout · Cin · K · K - 动态卷积:C²in + CinM + M · Cout · Cin · K · K - **参数比例**:≈ 1/K² + M ≈ M(当M ≪ CoutK², Cin ≈ Cout时)[8] **FLOPs计算**: - 系数生成:C²in + CinM(可忽略) - 权重融合:M · Cout · Cin · K · K - 卷积计算:H' · W' · Cout · Cin · K · K - **FLOPs比例**:≈ 1(当M ≪ H'W'时)[8] ## 3. 解决的问题 ### 主要解决的核心问题 1. **低FLOPs陷阱**:使低FLOPs模型能够从大规模预训练中获益,打破了"低计算量模型无法利用大数据"的限制[2][10] 2. 
**参数-计算效率权衡**:实现了参数数量的大幅增加(约M倍)而计算量几乎不变,解决了传统方法中参数和FLOPs高度耦合的问题[8] ### 具体效果验证 **性能提升**: - ParameterNet-600M在ImageNet-1K上达到81.6%准确率,超过Swin Transformer的80.9% - FLOPs仅为0.6G,远低于Swin-T的4.5G[2] - ImageNet-22K预训练相比ImageNet-1K训练提升约2%[10] **与替代方案对比**: 相比重参数化卷积(RepConv),DynamicConv的优势在于: - RepConv虽然增加训练参数,但推理时参数和FLOPs不变,模型容量未真正增加 - DynamicConv在推理时保持增加的参数,真正提升了模型容量,能从大规模预训练中获益[13] ### 应用价值 DynamicConv模块为移动设备和边缘计算场景提供了新的解决方案,使得资源受限的环境也能享受大规模预训练带来的性能提升,在准确率-延迟权衡方面表现优异[11][12]。 ================================================ FILE: module-info/CVPR2024-RMT.md ================================================ # RMT Block模块详细分析 https://arxiv.org/pdf/2309.11523 ## 1. 背景 ### Vision Transformer的局限性 传统的Vision Transformer (ViT)存在两个核心问题: - **缺乏显式空间先验**:Self-Attention机制本身不具备空间位置感知能力[1] - **二次计算复杂度**:全局信息建模时Self-Attention的计算成本随token数量二次增长[1][2] ### 现有解决方案的不足 现有方法如Swin Transformer使用窗口操作、NAT改变感受野形状等,虽然能部分解决问题,但都会破坏空间先验信息的完整性[2][6]。 ### RetNet的启发 RetNet在NLP领域使用基于距离的时间衰减矩阵为一维单向文本数据提供显式时间先验,这为视觉领域的改进提供了灵感[2][3]。 ## 2. 模块原理 ### RMT Block整体架构 根据图3所示,RMT Block包含以下核心组件[7]: - **Layer Normalization (LN)** - **Manhattan Self-Attention (MaSA)** - **Depth-wise Convolution (DWConv 3×3)** - **Feed-Forward Network (FFN)** ### Manhattan Self-Attention (MaSA)核心原理 #### 空间衰减矩阵设计 MaSA的核心是基于曼哈顿距离的二维双向空间衰减矩阵: ``` D²d_nm = γ^(|xn-xm|+|yn-ym|) ``` 其中: - `(xn, yn)`表示第n个token的二维坐标 - `γ`是衰减参数,控制距离衰减的强度 - 距离越远的token,注意力权重衰减越大[5] #### MaSA计算公式 ``` MaSA(X) = (Softmax(QK^T) ⊙ D²d)V ``` 这里`⊙`表示逐元素相乘,空间衰减矩阵直接调制注意力权重[5]。 #### 注意力分解机制 为了降低计算复杂度,MaSA采用沿图像两个轴的分解形式: ``` AttnH = Softmax(QHK^T_H) ⊙ DH AttnW = Softmax(QWK^T_W) ⊙ DW MaSA(X) = AttnH(AttnWV)^T ``` 其中: - `DH_nm = γ^|yn-ym|`表示垂直方向距离 - `DW_nm = γ^|xn-xm|`表示水平方向距离[6][7] ### 局部上下文增强 (LCE) 为了进一步增强局部表达能力,RMT Block集成了局部上下文增强模块: ``` Xout = MaSA(X) + LCE(V) ``` LCE使用5×5深度卷积来增强局部特征[7]。 ### 多头注意力的衰减参数设计 不同注意力头使用不同的γ值来控制感受野,使模型能够感知多尺度信息。对于第i个头: ``` γi = 1 - 2^(-a - (b-a)i/N) ``` 其中a、b控制感受野范围,N是头的总数[19]。 ## 3. 
解决了什么问题 ### 问题1:显式空间先验缺失 **解决方案**:通过曼哈顿距离的空间衰减矩阵,为每个token提供了明确的空间位置感知能力。 - 近距离token获得更高注意力权重 - 远距离token注意力权重按距离衰减 - 提供了比传统位置编码更丰富的空间先验信息[3][5] ### 问题2:二次计算复杂度 **解决方案**:通过注意力分解将复杂度从O(N²)降低到O(N)。 - 分别计算水平和垂直方向的注意力 - 保持了与原始MaSA相同的感受野形状 - 不破坏空间衰减矩阵的完整性[6][7] ### 问题3:全局与局部信息平衡 **解决方案**:通过分阶段使用不同形式的MaSA实现最优平衡。 - 前三个阶段使用分解的MaSA处理大量token - 最后阶段使用完整MaSA进行精细建模 - LCE模块补充局部特征表达[7] ### 实验验证效果 消融实验证明了各组件的有效性: - **MaSA vs Vanilla Attention**:分类准确率提升0.8%,检测AP提升2.5%[15] - **分解形式的效率**:在保持性能的同时显著降低FLOPs[16] - **多任务优越性**:在图像分类、目标检测、实例分割和语义分割任务上都取得了最先进的结果[8][10][13][14] 通过这些创新设计,RMT Block成功地将RetNet的时间建模能力扩展到空间域,为视觉Transformer提供了一个既高效又具有强空间感知能力的核心模块。 ================================================ FILE: module-info/CVPR2024-RepVIT.md ================================================ ### RepViT Block模块总结 https://arxiv.org/pdf/2307.09283 #### 1. 背景 **原始问题**: - MobileNetV3采用的是传统的倒残差瓶颈结构,其中Token Mixer(空间信息融合)和Channel Mixer(通道交互)是耦合在一起的[6] - 具体来说,MobileNetV3 block包含1×1扩展卷积、3×3深度卷积(DW)和1×1投影层,这种设计使得空间和通道的处理混合在一起[6] - 轻量级ViT的成功很大程度上归因于其采用的MetaFormer架构,该架构将Token Mixer和Channel Mixer分离,这种设计被证明是有效的[6] **设计动机**: - 研究发现ViT的有效性主要来源于其通用的Token Mixer和Channel Mixer架构(即MetaFormer架构),而不是特定的Token Mixer[6] - 为了让轻量级CNN也能享受这种架构优势,需要在MobileNetV3中实现Token Mixer和Channel Mixer的分离[6] #### 2. 模块原理 **结构设计**: - **分离设计**:将原本耦合的Token Mixer和Channel Mixer进行分离 - Token Mixer:3×3深度卷积(DW),负责空间信息融合 - Channel Mixer:1×1卷积层,负责通道间的交互[6][7] - **层序调整**: - 将3×3 DW卷积前移,使其独立处理空间信息 - SE层(如果存在)也随之前移,放置在DW卷积之后,因为SE层依赖于空间信息交互[7] - **结构重参数化**: - 对DW层采用广泛使用的结构重参数化技术,在训练时使用多分支结构增强学习能力 - 在推理时可以将多分支合并为单一卷积,消除跳跃连接带来的计算和内存开销[7] **具体结构对比**: - **MobileNetV3 Block**:1×1扩展 → 3×3 DW → SE(可选)→ 1×1投影 - **RepViT Block**:3×3 DW → SE(可选)→ 1×1扩展 → 1×1投影[7] #### 3. 
解决了什么问题 **性能提升**: - **架构优化**:通过分离Token Mixer和Channel Mixer,使模型能够更好地处理空间和通道信息,提升了模型的表达能力[6][7] **效率优化**: - **延迟降低**:RepViT block将MobileNetV3-L的延迟从1.01ms降低到0.81ms[7] - **推理优化**:结构重参数化技术在推理时消除了跳跃连接的计算开销,这对移动设备特别有利[7] **训练增强**: - **学习能力**:结构重参数化技术在训练时提供多分支结构,增强了模型的学习能力,同时在推理时保持单分支的效率[7] **架构统一**: - **设计一致性**:使轻量级CNN的架构与成功的轻量级ViT保持一致,为后续的优化提供了良好的基础[6] **注意**:虽然RepViT block在延迟上有显著改善,但初期会带来临时的性能下降(从71.5%降至68.3%),这通过后续的扩展比例调整和网络宽度增加得到了补偿[7] ================================================ FILE: module-info/CVPR2024-Rewrite the Stars.md ================================================ # StarBlocks模块总结 https://arxiv.org/pdf/2403.19967 ## 1. 背景 ### 传统网络设计的局限性 在深度学习发展历程中,大多数网络都基于**线性投影(卷积和线性层)与非线性激活函数的组合**[1]。虽然自注意力机制在NLP和计算机视觉中表现出色,但其二次复杂度限制了效率[1]。 ### 逐元素乘法的兴起 近年来,通过**逐元素乘法融合不同子空间特征**的学习范式逐渐受到关注[1]。相关工作如FocalNet、HorNet、VAN等都采用了这种"星操作",但缺乏深入的理论分析[1][2]。 ### 现有解释的不足 现有研究对星操作的解释主要基于直觉和假设[2]: - FocalNet认为星操作起调制或门控机制作用 - HorNet认为优势在于利用高阶特征 - VAN和Monarch Mixer将其归因于卷积注意力 这些解释缺乏全面分析和强有力证据[2]。 ## 2. 模块原理 ### 核心设计结构 StarBlocks采用简洁的设计philosophy[12][13]: ``` 输入 → 深度卷积(DW-Conv) → 全连接层1(FC) → 全连接层2(FC) → ReLU6激活 → 星操作(*) → 全连接层3(FC) → 深度卷积(DW-Conv) → 批归一化(BN) → 输出 ``` ### 数学原理 星操作的数学表达为:**(W₁ᵀX + B₁) * (W₂ᵀX + B₂)**[5] 通过重写可得到: ``` w₁ᵀx * w₂ᵀx = Σᵢ₌₁^(d+1) Σⱼ₌₁^(d+1) wᵢ¹wⱼ²xᵢxⱼ ``` 这产生了**(d+2)(d+1)/2 ≈ (d/√2)²个不同的项**,每个项都是输入的非线性组合[6]。 ### 多层堆叠效应 通过l层堆叠,隐式特征维度达到**(d/√2)^(2l)**[7][8]: - 第1层:R^((d/√2)²¹) - 第2层:R^((d/√2)²²) - 第l层:R^((d/√2)²ˡ) 例如,10层深度、128宽度的网络可获得约**90^1024维**的隐式特征空间[8]。 ### 与核函数的关系 星操作类似于**多项式核函数**[5]: - 多项式核:k(x₁,x₂) = (γx₁·x₂ + c)^d - 都能将输入映射到高维非线性空间 - 决策边界可视化证实了这种相似性[10] ## 3. 
解决了什么问题 ### 3.1 高维特征表示问题 **传统解决方案的局限**: - 传统网络通过增加网络宽度(通道数)来获得高维特征[3] - 这种方式增加了计算开销和参数量 **StarBlocks的解决方案**: - 在**低维计算空间中获得高维隐式特征表示**[3] - 无需增加网络宽度即可实现维度扩展[6] ### 3.2 计算效率与性能的平衡 **问题**:高效网络设计中性能与计算复杂度的权衡 **解决效果**[14][15]: - StarNet-S4相比EdgeViT-XS准确率提升0.9%,速度快3倍 - 在相同延迟下,StarNet-S1比MobileOne-S0准确率高2.1% - 证明了星操作特别适合高效网络设计[3] ### 3.3 激活函数依赖问题 **传统认知**:激活函数是神经网络不可缺少的组件 **StarBlocks的突破**[10][11]: - 移除所有激活函数后,性能仅下降1.2%(从71.7%降至70.5%) - 而传统求和操作在相同条件下性能大幅下降33.8% - 为**无激活函数网络**开辟了新的研究方向 ### 3.4 网络设计复杂度问题 **传统高效网络的问题**:需要复杂的设计技巧和精细调参[3] **StarBlocks的优势**[13]: - 设计极其简洁,最小化人工干预 - 无需复杂的重参数化、注意力集成等技术 - 通过星操作的内在优势实现优异性能 ### 3.5 理论理解缺失问题 **现有问题**:对逐元素乘法有效性缺乏深入理论解释[2] **StarBlocks的贡献**: - 提供了**数学上严格的理论分析**[5][6][7] - 通过实验、理论和可视化方法验证了分析的正确性[9][10] - 为网络设计提供了**指导性框架**,避免盲目尝试[4] ## 总结 StarBlocks模块通过简洁的设计和深刻的理论洞察,解决了传统网络在高维特征表示、计算效率、激活函数依赖等方面的关键问题,为高效网络设计提供了新的paradigm和理论基础。 ================================================ FILE: module-info/CVPR2024-SFSConv.md ================================================ # SFS-Conv模块详细总结 https://openaccess.thecvf.com/content/CVPR2024/papers/Li_Unleashing_Channel_Potential_Space-Frequency_Selection_Convolution_for_SAR_Object_Detection_CVPR_2024_paper.pdf ## 1. 背景 ### 1.1 现有问题 传统深度卷积神经网络在SAR目标检测中存在以下关键问题[1][2]: - **特征冗余严重**:单个卷积层内提取的大量特征图表现出相似的模式,存在显著冗余[1][4] - **计算资源消耗巨大**:深度网络的成功严重依赖于密集的计算和存储资源,给资源受限环境的部署带来挑战[1] - **通用卷积不适配SAR特性**:现有的分组卷积、逐点卷积等并非专门为SAR目标检测任务设计[2] ### 1.2 SAR图像特殊性 SAR图像具有独特的成像特点[2]: - **高分辨率俯视视角**:大多数目标较小,常被斑点噪声遮挡 - **依赖周围环境信息**:仅凭外观难以识别目标,需要利用目标形状、方向等周围环境线索 - **频域信息重要**:SAR成像基于雷达系统与目标的相互作用,频域分析可分解回波信号的散射特性 ### 1.3 设计先验 基于SAR图像分析,提出两个重要设计先验[2]: - **目标自适应感受野**:SAR图像中目标尺度多样,固定感受野的检测器可能产生错误分类 - **频率特征关键作用**:SAR成像易受复杂背景干扰,仅凭空间信息难以区分目标特征和杂波噪声 ## 2. 
模块原理 ### 2.1 整体架构 SFS-Conv采用**分流-感知-选择**三步策略[2][6]: ``` 输入特征 → 分流(Shunt) → 感知(Perceive) → 选择(Select) → 输出特征 ↓ ↓ ↓ 空间/频率 SPU/FPU CSU融合 ``` ### 2.2 分流策略(Shunt) 将输入特征图X ∈ R^(C×H×W)按比例α分为两部分[6]: - **空间方面**:X^s ∈ R^((1-α)C×H×W),提供空间信息 - **频率方面**:X^f ∈ R^(αC×H×W),补充频率特性 通过两个1×1逐点卷积分别调整X^s和X^f,使其更适合后续的空间和频率维度特征提取[6]。 ### 2.3 感知策略(Perceive) #### 2.3.1 空间感知单元(SPU) **核心思想**:动态建模不同尺度的上下文信息[6] **实现方法**: - 将空间特征X^s均匀分为n个特征图组X^s_g - 每组对应不同尺寸的卷积核K_g,核尺寸递增:k_(g+1) = k_g + 2, k_1 = 3[6] - 构建层次化残差连接,扩大感受野: ``` Y^s_g = { X^s_g * K_g, g = 1 (X^s_g + Y^s_(g-1)) * K_g, 1 < g ≤ n } ``` - 感受野递增公式:RF_(g+1) = RF_g + (k_(g+1) - 1)[6] #### 2.3.2 频率感知单元(FPU) **核心思想**:利用分数阶Gabor变换提取多尺度多方向的频率特征[7] **分数阶Gabor变换(FrGT)**: - 标准FrGT定义[8]: ``` G^α_s(p,q) = ∫ s(x)ḡ(x-q)B(p,x,α)dx ``` 其中B(x₁,x₂,α)是变换核,α = Pπ/2是变换角度 - **卷积分数阶Gabor核(FrGK)**:用FrGT滤波器调制普通卷积核[8]: ``` K^v_(i,u) = K_(i,o) * G(u,v) ``` **实现过程**: - 将频率特征X^f分为V组X^f_v - 每组使用N = C/VU个卷积核生成对应频率特征 - 最终连接所有组:Y^f = [Y^f_0, Y^f_1, ..., Y^f_(V-1)][8] ### 2.4 选择策略(Select) #### 2.4.1 通道选择单元(CSU) **目标**:自适应融合空间和频率特征,选择最具区分性的信息[9] **实现步骤**: 1. **全局平均池化**:收集空间和频率的全局信息[9] ``` S^n = GAP(Y^n) = (1/(H×W)) ∑∑ Y^n_(i,j) ``` 2. **软注意力权重生成**[9]: ``` γ = e^(S^s)/(e^(S^s) + e^(S^f)) β = e^(S^f)/(e^(S^s) + e^(S^f)) ``` 3. **特征融合**[9]: ``` Y = γY^s + βY^f ``` ## 3. 
解决的问题 ### 3.1 特征冗余问题 **问题**:传统卷积产生大量相似的特征图,造成计算资源浪费[1][4] **解决方案**: - 通过分流策略将特征分为空间和频率两个互补方面,避免重复提取相似特征[2] - SPU的多尺度设计和FPU的多方向特征提取增加了特征多样性[6][7] - 实验显示相比普通卷积,SFS-Conv的特征图展现出更大的多样性和区分性[1] ### 3.2 SAR图像特性适配问题 **问题**:通用卷积设计未考虑SAR图像的独特特性[2] **解决方案**: - **空间适配**:SPU的动态感受野适应SAR图像中目标的多样尺度[6] - **频率适配**:FPU专门提取SAR成像机制产生的频域散射特性[7][8] - **噪声抑制**:分数阶Gabor变换有效抑制SAR图像中的斑点噪声[7] ### 3.3 计算效率问题 **问题**:现有方法通过增加注意力模块提升性能,但增加了模型复杂度[2] **解决方案**: - **参数高效**:CSU采用无参数融合方式,不增加额外参数[9] - **计算优化**:相比YOLOv8s仅使用18%参数和24%FLOPs[3] - **推理加速**:推理时间仅8.6ms,比YOLOv8s节省39%时间[12] ### 3.4 性能与效率平衡问题 **问题**:现有方法要么追求轻量化导致性能下降,要么提升性能但计算开销大[2] **解决方案**: - 在三个SAR数据集上都取得了最优性能:HRSID(96.2%)、SAR-AIRcraft-1.0(89.7%)、SSDD(99.6%)[3] - 同时保持极低的计算复杂度和推理时间[12] - 消融实验证明各组件的有效性和必要性[15][16] SFS-Conv模块通过创新的分流-感知-选择策略,在单个卷积层内实现了空间和频率特征的有效提取与融合,为SAR目标检测提供了高效、轻量化的解决方案。 ================================================ FILE: module-info/CVPR2024-TransNext.md ================================================ # TransNeXt核心模块详解 https://arxiv.org/pdf/2311.17132 ## 一、Aggregated Attention(聚合注意力) ### 1. 背景 #### 现有问题 - **深度退化效应**:许多高效ViT模型依赖堆叠层进行信息交换,但由于残差连接中的深度退化效应,无法形成充分的信息混合[1] - **与生物视觉的差异**:现有的局部注意力和空间下采样注意力与生物视觉系统工作原理存在显著差异[3] - **窗口分割artifacts**:基于窗口分割的方法会产生不自然的块状痕迹,即使经过深层堆叠也无法消除[3] - **计算复杂度**:全局自注意力的二次复杂度限制了在高分辨率图像上的应用[1] #### 生物视觉启发 人类视觉系统具有中央凹视觉(高敏锐度,覆盖1-2度视野)和周边视觉(大感受野但精度较低)的二分法特性。眼球通过快速运动(扫视)处理多个视野信息并进行整合[20]。 ### 2. 模块原理 #### 核心设计:像素聚焦注意力(Pixel-focused Attention) 采用**双路径设计**模拟生物视觉系统: **路径1:滑动窗口注意力**(模拟中央凹视觉) - 每个查询对其最近邻特征进行细粒度感知 - 使用固定的k×k窗口(实验中采用3×3)[5][6] **路径2:池化注意力**(模拟周边视觉) - 每个查询对空间下采样特征进行粗粒度全局感知 - 通过"激活和池化"操作获得全局信息[6] **数学表达**: ``` S(i,j)~ρ(i,j) = Q(i,j)K^T_ρ(i,j) # 滑动窗口路径 S(i,j)~σ(X) = Q(i,j)K^T_σ(X) # 池化路径 A(i,j) = softmax(Concat(S(i,j)~ρ(i,j), S(i,j)~σ(X))/√d + B(i,j)) ``` #### 增强机制 1. **查询嵌入(Query Embedding)**:添加可学习的查询令牌,增强注意力矩阵生成的多样性[7] 2. **位置注意力(Positional Attention)**:使用可学习令牌与查询交互,提供动态相对位置偏置[8] 3. **长度缩放余弦注意力**:提升多尺度输入的外推能力,λ = τ log N[9] ### 3. 解决的问题 1. **避免深度退化**:不依赖堆叠进行信息交换,单层即可实现有效的局部-全局建模[1] 2. 
**自然视觉感知**:消除窗口分割产生的不自然块状artifacts,实现更符合生物视觉的感知模式[3] 3. **像素级平移等变性**:模拟眼球连续运动,对图像任意位置的像素都能提供一致的中央凹视觉特性[3] 4. **线性复杂度**:当池化大小固定时,计算复杂度与输入序列长度呈线性关系[10] 5. **多尺度适应**:通过长度缩放余弦注意力和log-CPB位置偏置,提升大尺度图像的外推性能[9] --- ## 二、Convolutional GLU(卷积GLU) ### 1. 背景 #### ViT时代的通道注意力需求 - **SE机制的局限性**:在ViT时代,全局感受野不再稀缺,SE机制使用全局平均池化的方法显得过于粗粒度,所有令牌共享相同的门控信号[11] - **ViT缺乏通道注意力**:研究发现将SE机制引入通道混合器可以有效增强模型鲁棒性[11] - **位置信息需求**:ViT结构需要通过3×3深度卷积提供条件位置编码(CPE)[11] #### GLU的优势 门控线性单元(GLU)在自然语言处理任务中表现优于MLP,由两个线性投影组成,其中一个通过门控函数激活[11]。 ### 2. 模块原理 #### 设计理念 将**最小形式的3×3深度卷积**添加到GLU门控分支的激活函数之前,使其符合门控通道注意力的设计理念[11]。 #### 结构设计 ``` ConvGLU(X) = (XW1 + B1) ⊙ GELU(DWConv(XW2 + B2)) ``` 其中: - `XW1 + B1`:值分支(保持与MLP和GLU相同的深度) - `DWConv(XW2 + B2)`:门控分支(添加3×3深度卷积) - `⊙`:逐元素乘法 - `GELU`:激活函数 #### 关键特性 1. **基于最近邻特征的门控**:每个令牌拥有基于其最近邻细粒度特征的独特门控信号[12] 2. **反向传播友好**:值分支保持与MLP相同的深度[12] 3. **计算效率**:相比ConvFFN,在保持相同参数量的情况下,FLOPs更少[12] ### 3. 解决的问题 1. **细粒度通道注意力**:解决SE机制过于粗粒度的问题,每个令牌都有独特的门控信号[12] 2. **位置信息编码**:为没有位置编码设计的ViT模型提供必要的位置信息[11] 3. **增强鲁棒性**:通过基于局部特征的通道注意力机制,有效提升模型鲁棒性[11] 4. **计算效率优化**:实现注意力化的通道混合器,同时减少计算开销[12] 5. **满足ViT多样化需求**:简单而鲁棒的设计满足ViT的各种需求[12] #### 消融实验验证 在CIFAR-100上的实验表明,ConvGLU相比其他变体(Type-1、Type-2、Type-3)表现最佳,验证了将深度卷积放在门控分支激活函数前的设计合理性[27]。 --- ## 总结 Aggregated Attention和Convolutional GLU分别作为令牌混合器和通道混合器,共同构成了TransNeXt的核心。前者通过仿生视觉设计解决了深度退化和不自然视觉感知问题,后者通过改进的门控机制提升了通道建模能力和鲁棒性。两个模块的结合使TransNeXt在各种视觉任务上达到了最先进的性能[1][19]。 ================================================ FILE: module-info/CVPR2024-UniRepLKNet.md ================================================ # Dilated Reparam Block 模块总结 https://arxiv.org/pdf/2311.15599 ## 1. 背景 ### 传统大核设计的局限性 在UniRepLKNet之前,已有研究表明大核卷积应该与并行的小核卷积一起使用,因为小核有助于在训练过程中捕获小尺度模式[5]。传统做法是将大核和小核的输出通过各自的批归一化层后相加,训练后通过结构重参数化将小核等价合并到大核中以消除推理成本[5]。 ### 稀疏模式捕获的需求 作者观察到,除了小尺度模式外,增强大核捕获稀疏模式的能力(即特征图上的像素可能与一些远距离像素比其邻近像素更相关)可能产生更高质量的特征。这种需求恰好匹配膨胀卷积的机制——从滑动窗口的角度看,膨胀率为r的膨胀卷积扫描输入通道以捕获空间模式,其中每个关注像素与其邻居相距r-1个像素[5]。 ## 2. 
模块原理 ### 核心设计思想 Dilated Reparam Block使用多个并行的膨胀小核卷积层来增强非膨胀大核卷积层的性能[5]。该模块的超参数包括: - 大核尺寸K - 并行卷积层的核尺寸k - 膨胀率r[5] ### 等价转换机制 **关键创新**:将膨胀卷积等价转换为非膨胀的稀疏大核[6]。 **转换原理**:忽略输入像素等价于在卷积核中插入额外的零元素,因此膨胀率为r、核尺寸为k的膨胀卷积层可以等价转换为核尺寸为(k-1)r+1的非膨胀层[5][6]。 **实现方法**:通过步长为r、恒等核I∈R^(1×1)的转置卷积优雅地实现转换[6]: ``` W' = conv_transpose2d(W, I, stride = r) ``` ### 具体实例 以K=9的示例为例,使用四个并行层,参数设置为k=(5,5,3,3),r=(1,2,3,4),等价核尺寸分别为(5,9,7,9)[6]。 对于默认设置K=13,使用五个层,参数为k=(5,7,3,3,3),r=(1,2,3,4,5),等价核尺寸为(5,13,7,9,11)[6]。 ### 推理时合并 推理时,首先将每个批归一化层合并到前面的卷积层中,然后使用转换函数将每个膨胀率r>1的层转换,最后通过适当的零填充将所有结果核相加[6]。 ## 3. 解决了什么问题 ### 1. 性能提升问题 **实验验证**:与使用相同数量并行分支的非膨胀变体相比,Dilated Reparam Block显著提升了性能。在ImageNet准确率和ADE20K mIoU上分别达到81.63±0.02和46.37±0.10,优于其他变体[9]。 ### 2. 稀疏模式捕获问题 **核心优势**:大核从并行膨胀卷积层捕获稀疏模式的能力中获益,而不仅仅是额外小核或不同感受野的组合[9]。这使得模型能够建立像素与远距离像素之间的长程依赖关系。 ### 3. 推理效率问题 **零额外成本**:通过等价转换,Dilated Reparam Block在推理时可以完全转换为单个大核卷积,实现训练时性能提升和推理时零额外计算成本的完美平衡[5][6]。 ### 4. 架构设计问题 **设计原则**:该模块体现了"大核应该看得广而不需要很深"的设计哲学,将传统ConvNet中扩大感受野、增加空间模式抽象层次和提升表征能力三个效果进行解耦[2][3]。 Dilated Reparam Block是UniRepLKNet架构设计的核心创新,它不仅解决了大核卷积的性能优化问题,更重要的是为大核ConvNet的架构设计提供了新的思路和方法。 ================================================ FILE: module-info/CVPR2025-BHViT.md ================================================ # BHViT: 二值化混合视觉Transformer论文总结 https://arxiv.org/pdf/2503.02394 ## 核心思想与主要贡献 本文提出了BHViT(Binarized Hybrid Vision Transformer),这是一种专门为二值化设计的混合视觉Transformer架构。研究发现,直接将现有的二值化CNN技术应用到ViT模型上会导致显著的性能下降,如图1所示,ReActNet在CNN架构上能达到73.3%的准确率,但在ViT架构上仅有49.5%[1]。 主要贡献包括: - 探索了当前二值化ViT模型性能严重下降的原因[1][2] - 提出了三个新颖模块构建高性能的二值化友好混合ViT框架[2] - 提出了基于量化分解(QD)的注意力矩阵二值化方案[2] - 设计了正则化损失来解决权重振荡与Adam优化器不兼容的问题[2] ## 方法架构 ### 1. 混合架构设计 BHViT采用四阶段金字塔结构,在不同阶段使用不同的token mixer[5]: - **前两个阶段**:使用多尺度分组空洞卷积模块(MSGDC)处理大空间分辨率特征[5] - **后两个阶段**:使用多尺度多头注意力模块(MSMHA)进行token级特征融合[5] ### 2. 
关键技术模块 #### 多尺度分组空洞卷积(MSGDC) 使用三个不同空洞率的3×3分组卷积层,实现多尺度特征融合,显著减少模型参数和计算复杂度[6]。 #### 多尺度多头注意力(MSMHA) 基于窗口注意力机制的变体,通过7×7平均池化获得高尺度特征,同时将输入特征分割为7×7窗口版本,维持全局信息交互并降低计算成本[7]。 #### 量化分解(QD)注意力二值化 针对二值注意力矩阵无法准确表示不同token相似性差异的问题,提出了QD方法。使用全局缩放常数s=2^n-1,通过逻辑操作获得s个二值注意力矩阵[7][8]。 #### 二值化MLP增强 引入shift操作模块,包括水平、垂直和混合shift操作,减轻信息损失和梯度误差[9]。 ## 三个重要观察 ### 观察1:避免过多token有益于二值化ViT 通过理论分析证明,随着token数量k增加,注意力矩阵的信息熵会增加,概率分布逐渐接近均匀分布,削弱了注意力机制的有效性[6][23][24][25]。 ### 观察2:在每个二值化层添加残差连接有益 层级残差连接能有效缓解多个二值化层连续叠加导致的激活梯度消失问题[8][28][29]。 ### 观察3:Adam优化器放大了二值网络的权重振荡 在训练后期,Adam优化器会放大权重振荡,导致许多参数无法有效更新。为此提出L1正则化损失[10][30][31]。 ## 实验结果 ### 分类任务性能 在ImageNet-1K数据集上: - BHViT-Small†相比当前SOTA方法ReActNet提升20.6%[12] - 相比Swin transformer架构的BiViT方法提升11.5%[12] - 在CIFAR-10数据集上,BHViT-Small达到95.0%准确率[11] ### 分割任务性能 在道路分割任务中,BHViT在RS-LVF数据集上的mIoU达到85.1%,超越全精度ResNet-34的77.8%[13]。在ADE20K图像分割任务中也取得了SOTA性能[13]。 ## 消融研究 实验验证了各个提出模块的有效性[14]: - 移除正则化损失导致性能下降2.9% - 移除shift模块导致性能下降4.3% - 移除QD方法导致性能下降6.1% 权重分布分析显示,正则化损失能有效改变潜在权重分布,使其更接近±1,缓解权重振荡问题[15]。 ## 结论 BHViT成功解决了二值化ViT面临的关键挑战,通过混合架构设计、创新的注意力二值化方法和优化策略,在多个基准数据集上实现了SOTA性能,为在边缘设备上部署高效的视觉Transformer提供了有效解决方案[16]。 ================================================ FILE: module-info/CVPR2025-DarkIR.md ================================================ # DarkIR中EBlock和DBlock模块详细分析 https://arxiv.org/pdf/2412.13443 ## EBlock (编码器块) - 低光增强编码器 ### 1. 背景 在低光条件下,图像主要面临照明不足的问题。研究表明,低光条件与图像在频域中的幅度信息高度相关[4][5]。传统方法通常在空间域处理这些问题,但频域处理可以更有效地增强照明条件。 ### 2. 模块原理 EBlock基于Metaformer架构设计,包含两个核心组件[4][5]: **空间注意力模块 (SpAM)**: - 采用类似NAFBlock的结构,使用倒残差块和简化通道注意力(SCA) - 使用简单的门控机制替代激活函数 - 提取有意义的空间信息用于频域增强 **频域多层感知机 (Fre-MLP)**: - 应用快速傅里叶变换(FFT)将图像转换到频域 - **仅对幅度信息进行操作**,不触及相位信息 - 使用逆快速傅里叶变换(IFFT)转换回空间域 - 在幅度上操作的MLP比在空间域操作具有更好的效果 **下采样策略**: - 使用步长卷积进行下采样 - 每个层级后特征分辨率减半,允许在深层使用更多编码器块而不显著增加操作数 ### 3. 
解决的问题 - **低光照明恢复**:通过频域幅度增强直接改善图像亮度[4][5] - **多尺度处理**:照明和幅度在不同尺度上保持一致性,可以在低分辨率估计后进行放大[5] - **计算效率**:频域处理的全局特性使得低光增强任务更加高效[4] - **中间监督**:产生低分辨率图像估计\(\hat{x}_{\downarrow 8}\),用于架构引导损失的正则化[4] --- ## DBlock (解码器块) - 去模糊解码器 ### 1. 背景 图像去模糊通常需要大感受野来处理各种类型的模糊核。传统方法要么通过深度特征提取和下采样实现,要么使用大核卷积,但后者会导致更高的计算复杂度和内存需求[4][6]。 ### 2. 模块原理 DBlock专注于空间变换,同样遵循Metaformer结构[6]: **扩张空间注意力模块 (Di-SpAM)**: - 受大核注意力(LKA)启发,但使用三个不同层级的特征 - 采用三个扩张深度卷积,扩张因子分别为1、4、9 - 将三个分支的属性组合,然后应用简化通道注意力 - 相比LKA性能更好且参数更少[10] **门控前馈网络 (Gated-FFN)**: - 使用简单门控机制替代激活函数 - 类似NAFNet的设计理念 **处理假设**: - 解码器输入是\(\hat{x}_{\downarrow 8}\)的深度表示 - 假设照明已被编码器校正,解码器专注于上采样和锐化[5][6] ### 3. 解决的问题 - **模糊去除**:通过大感受野空间注意力有效处理各种模糊类型[6] - **细节恢复**:在照明增强的基础上恢复图像锐度和细节 - **计算优化**:相比大核卷积方法,扩张卷积提供更好的效率/性能平衡[10] - **多尺度特征融合**:通过不同扩张因子捕获不同尺度的模糊信息[6] --- ## 模块协同工作机制 ### 任务分工 - **EBlock**:在低分辨率下处理照明问题,利用频域的全局特性[4][5] - **DBlock**:在高分辨率下处理模糊问题,利用空间域的局部特性[6] ### 信息传递 - 编码器提供照明增强的特征给解码器 - 通过中间输出\(\hat{x}_{\downarrow 8}\)进行架构引导[4] - 解码器专注于上采样和锐化已增强的低分辨率重建[5] ### 效率优势 这种非对称设计允许使用更少的块,显著减少参数数量和计算成本,同时保持最先进的性能[4][8]。 ================================================ FILE: module-info/CVPR2025-EVSSM.md ================================================ # EVS和EDFFN模块详细分析 https://arxiv.org/pdf/2405.14343 ## EVS(高效视觉扫描)模块 ### 1. 背景 传统的状态空间模型(如Mamba)是为处理一维序列数据而设计的,直接应用到视觉任务时需要将图像数据展平为一维序列,这会破坏图像的空间结构,难以捕获来自各种相邻像素的局部信息[2]。 现有的视觉状态空间模型大多采用多方向扫描机制来利用状态空间模型,但这种策略显著增加了计算成本。例如,VMamba的计算成本比Mamba高4倍,因为它在纵向和横向方向上执行双向扫描[5]。 ### 2. 模块原理 EVS模块的核心创新是**几何变换+单方向扫描**的策略[5]: **几何变换策略**: ``` G = { Transpose(Fin) if i % 2 = 0 Flip(Fin) if i % 2 = 1 } ``` 其中i是网络中第i个EVSS模块的索引,Flip操作沿特征的水平和垂直轴进行翻转[6]。 **扫描过程**: 1. 首先对输入特征应用几何变换 2. 通过1×1卷积分割特征为X1和X2 3. 对X1应用深度卷积和选择性扫描S6 4. 对X2应用激活函数 5. 最终通过1×1卷积融合结果[7] **空间结构恢复**:图像特征在每4个EVSS模块后自动恢复到原始空间结构,如果总模块数不能被4整除,可以通过相应的逆变换来恢复原始空间结构[6]。 ### 3. 
解决的问题 - **空间信息丢失问题**:通过几何变换保持了图像的空间结构信息,避免了简单展平造成的信息损失[5] - **计算复杂度问题**:相比多方向扫描,EVS模块在保持相同参数量和FLOPs的情况下,运行时间从182.6ms降低到88.7ms[12] - **非局部信息探索**:通过不同的几何变换,每次扫描都能捕获来自不同方向的上下文信息,有效探索非局部信息[12] ## EDFFN(高效判别频域FFN)模块 ### 1. 背景 FFN部分通常是深度学习模型的核心组件,有助于潜在清晰图像的重建[7]。FFTformer开发了一种判别频域FFN(DFFN),能够自适应地确定应该保留哪些频率信息,但这在执行频域操作时增加了计算成本[7]。 ### 2. 模块原理 EDFFN的核心设计理念是**频域筛选后置**[7]: **与DFFN的区别**: - DFFN:在FFN网络的中间应用频域操作 - EDFFN:在FFN网络的最终阶段执行频域筛选[7] **模块结构**: 1. 输入特征经过归一化 2. 通过1×1卷积进行特征变换 3. 应用深度卷积和GELU激活 4. 在最终阶段进行频域筛选操作 5. 通过1×1卷积输出最终特征[4] ### 3. 解决的问题 - **计算效率问题**:通过将频域操作后置到FFN的最终阶段,相比在中间阶段进行频域操作的DFFN,显著降低了计算成本[7] - **特征变换效率**:有效且高效地变换来自EVSS模块的特征,为潜在清晰图像重建提供支持[7] - **频率信息选择**:保持了对有用频率信息的自适应选择能力,同时提高了计算效率[7] ## 模块协同效果 EVS和EDFFN模块的结合使得EVSSM能够: 1. **高效处理视觉数据**:EVS模块通过几何变换适配SSM到视觉任务 2. **有效特征变换**:EDFFN模块高效地处理和筛选频域特征 3. **整体性能提升**:两个模块协同工作,在GoPro数据集上相比基线方法PSNR提升0.14dB,同时保持相同的参数量和计算复杂度[12] ================================================ FILE: module-info/CVPR2025-EfficientViM.md ================================================ # EfficientViM模块详细分析 https://arxiv.org/pdf/2411.15241 ## 1. 背景 ### 现有技术挑战 - **传统CNN局限性**:卷积神经网络虽然在局部特征提取上表现良好,但在捕获全局依赖关系方面存在不足[1] - **Vision Transformer瓶颈**:自注意力机制具有二次计算复杂度O(L²D),在处理长序列时计算成本过高[1] - **状态空间模型机遇**:SSM提供了线性复杂度的全局交互能力,但现有SSD层存在计算瓶颈[2][3] ### SSD层的计算瓶颈 传统NC-SSD层的主要计算开销来自: - 输入序列的线性投影操作:O(LD²)[5] - 门控和输出投影:O(LD²)[5] - 总体复杂度被线性投影主导,限制了模型的可扩展性[5] ## 2. 模块原理 ### 2.1 隐藏状态混合器(HSM-SSD)核心思想 #### 计算重排策略 **关键洞察**:NC-SSD可以分解为两个步骤[5]: 1. 通过重要性权重a∈R^L对输入状态B^T_i x_i进行加权线性组合,获得共享全局隐藏状态h∈R^(N×D) 2. 
通过相应的C∈R^(L×N)投影隐藏状态生成各输入的输出 #### 数学推导 原始操作:`h = (a1^T_N ⊙ B)^T(x_in W_in) = ((a1^T_N ⊙ B)^T x_in)W_in = h_in W_in`[5] 通过先计算h_in,将线性投影的复杂度从O(LD²)降低到O(ND²)[5][6] #### HSM近似 将原始输出:`x_out = f(y) = Linear(y ⊙ σ(z))` 近似为:`x_out = C((h ⊙ σ(h_in W_z))W_out) = Cf(h)`[6] ### 2.2 关键技术组件 #### 单头设计优化 - **问题**:多头配置中的内存绑定操作成为瓶颈,占用约1/4的总运行时间[8] - **解决方案**:采用单头设计,消除张量操作开销(reshape、copy等)[8] - **能力补偿**:通过状态级重要性权重A∈R^(L×N)模拟多头的多样化关系捕获能力[8] #### 多阶段隐藏状态融合(MSF) - **机制**:融合来自网络多个阶段的隐藏状态预测logits[7] - **计算过程**: 1. 对每阶段隐藏状态h^(s)计算全局表示:`ĥ^(s) = (1/N)∑h^(s)_i`[7] 2. 归一化并投影生成对应logits z^(s)[7] 3. 加权融合:`z = ∑β̂^(s)z^(s)`,其中β̂^(s)为可学习权重[7] ### 2.3 算法流程 ``` 输入: x_in ∈ R^(L×D) 1. B̂, C, Δ ← Linear(x_in) // O(LND) 2. B̂, C ← DWConv(B̂, C) // O(LNK²D) 3. A, B ← Discretization(â, B̂, Δ) // O(LD) 4. h_in ← (A ⊙ B)^T x_in // O(LND) 5. h, z ← Linear(h_in) // O(ND²) 6. h ← Linear(h ⊙ σ(z)) // O(ND²) 7. x_out ← Ch // O(LND) ``` ## 3. 解决了什么问题 ### 3.1 计算效率问题 - **复杂度优化**:将SSD层复杂度从O(LD²)降低到O(ND² + LND),当N≪L时显著减少计算量[5][6] - **实际加速**:相比传统方法实现显著的吞吐量提升,EfficientViM-M2达到17,005 img/s[10] ### 3.2 内存效率问题 - **内存绑定操作优化**:通过单头设计消除多头配置中的内存访问瓶颈[8] - **实际内存使用**:尽管参数较多,但峰值内存使用量仅为某些轻量级模型的1/3[16] ### 3.3 速度-准确率权衡问题 - **SOTA性能**:在ImageNet-1K上建立新的速度-准确率权衡最优前沿[1][10] - **具体提升**:相比SHViT提升0.6%性能的同时实现7%的速度提升[3] - **相比传统模型**:相比MobileNetV3性能提升0.6%,速度提升80%[3] ### 3.4 可扩展性问题 - **高分辨率适应性**:在极高分辨率图像处理中展现出色的扩展能力[20][21] - **多任务适用性**:在目标检测、实例分割、语义分割等密集预测任务上均表现优异[14][15][16] ### 3.5 实际部署问题 - **硬件友好**:优先考虑实际运行性能而非理论FLOPs,更适合实际部署[3] - **跨设备性能**:在GPU、CPU和移动设备上均保持竞争力[23][24] 通过这些创新设计,EfficientViM成功解决了现有视觉模型在效率、可扩展性和实际部署方面的关键挑战,为资源受限环境下的视觉任务提供了高效解决方案。 ================================================ FILE: module-info/CVPR2025-FDConv.md ================================================ # FDConv模块详细总结 https://arxiv.org/pdf/2503.18783 ## 1. 
背景 ### 传统动态卷积的发展与局限 - **动态卷积(DY-Conv)** 通过使用多个并行权重结合注意力机制,实现了样本特定的权重自适应,相比标准卷积具有更好的适应性[1][6]。 - **主要问题**: - 传统动态卷积方法(如ODConv、CondConv等)的并行权重在频率响应上高度相似,缺乏多样性[1][2]。 - 参数成本大幅增加(通常增加n倍,n<10),但适应性提升有限[3]。 - 权重之间的余弦相似度高达0.88以上,表明存在严重的参数冗余[13]。 ## 2. 模块原理 ### FDConv的三个核心组件 #### 2.1 傅里叶不相交权重(FDW)[7][8] - **核心思想**:在傅里叶域而非空间域学习频谱系数 - **实现步骤**: 1. **傅里叶不相交分组**:将固定数量的参数按频率从低到高排序,均匀分成n个不相交的组 2. **傅里叶到空间变换**:使用逆离散傅里叶变换(iDFT)将每组参数转换到空间域 3. **重组**:将变换结果裁剪成k×k的块并重组成标准权重形状 #### 2.2 核空间调制(KSM)[8][9] - **目的**:实现滤波器级别的精细调制 - **结构**: - **局部通道分支**:使用轻量级1D卷积捕获局部通道信息,预测密集调制矩阵 - **全局通道分支**:使用全连接层获取全局通道信息,预测三个维度的调制值 - **输出**:生成k×k×Cin×Cout的密集调制矩阵α #### 2.3 频率带调制(FBM)[9][10] - **功能**:实现空间变化的频率调制 - **工作流程**: 1. **核频率分解**:将卷积权重分解为不同频率带(默认4个频带) 2. **傅里叶域卷积**:在频率域执行卷积操作 3. **空间变化调制**:为每个空间位置的每个频率带预测调制值 ## 3. 解决了什么问题 ### 3.1 频率多样性问题 - **问题**:传统动态卷积的并行权重频率响应高度相似[1][2] - **解决方案**:FDW通过不相交的傅里叶索引分组,确保每个权重具有独特的频率响应[3][7] - **效果**:权重之间的余弦相似度降为0,实现真正的频率多样性[13] ### 3.2 参数效率问题 - **问题**:传统方法参数成本增加n倍(如CondConv +90M,ODConv +65.1M)[11][12] - **解决方案**:FDConv保持固定参数预算,通过傅里叶域分组可生成大量(n>10)多样化权重[3] - **效果**:仅增加3.6M参数即可达到优异性能[11] ### 3.3 空间不变性问题 - **问题**:传统动态卷积在整个特征图上共享权重,无法适应空间变化的内容[9] - **解决方案**:FBM实现空间特定的频率调制,可根据局部内容动态调整频率响应[9][10] - **效果**:能够在不同空间位置选择性地强调或抑制特定频率带,更好地捕获图像中的复杂结构[15] ================================================ FILE: module-info/CVPR2025-GroupMamba.md ================================================ # GroupMamba Layer模块详细总结 https://arxiv.org/pdf/2407.13772 ## 1. 背景 ### 现有问题 传统的Mamba模型在计算机视觉任务中面临几个关键挑战: **稳定性问题**: - Mamba模型,特别是S6算法,在图像分类任务中存在不稳定性,尤其是扩展到大型模型时[2][4] - 例如SiMBA-L (MLP)模型会导致次优的分类结果,准确率仅为49%[4] **计算效率问题**: - 视觉状态空间(VSS)块包含大量的输入输出投影和深度卷积,其参数和计算复杂度与输入通道数成正比[2] - Mamba设计在处理大量通道时计算效率低下[4][6] **交互局限性**: - 现有模型在处理空间依赖关系和全局-局部信息建模方面存在不足[2] ## 2. 
模块原理 ### 整体架构 GroupMamba Layer采用模块化设计,主要包含三个核心组件[5][6]: ``` Xout = Xin + FFN(LN(XCAM)) 其中: XGM = GroupedMamba(Xin, Θ) XCAM = CAM(XGM, Affinity(Xin)) ``` ### 核心组件详解 #### 2.1 分组Mamba算子(Grouped Mamba Operator) **设计思路**: - 受组卷积启发,将输入通道分为四个组,每组大小为C/4[6] - 每个组独立应用VSSS块,在不同空间方向进行扫描[6] **四方向扫描策略**: - 从左到右(Left-to-Right) - 从右到左(Right-to-Left) - 从上到下(Top-to-Bottom) - 从下到上(Bottom-to-Top)[6][7] **数学表达**: ``` XGM = GroupedMamba(Xin, Θ) = Concat[ VSSS(XLR, ΘLR), VSSS(XRL, ΘRL), VSSS(XTB, ΘTB), VSSS(XBT, ΘBT) ] ``` 其中每个方向的输入张量形状为(B, H, W, C/4)[7] #### 2.2 视觉单选择扫描(VSSS)块 **功能**:作为令牌和通道混合器,基于Mamba算子构建[6] **结构**: ``` Z'out = Zin + Mamba(LN(Zin)) Zout = Z'out + FFN(LN(Z'out)) ``` 包含Mamba块和前馈网络,每个前面都有LayerNorm[6] #### 2.3 通道亲和力调制(CAM)算子 **设计目的**:解决分组操作导致的跨通道信息交换受限问题[7] **工作流程**: 1. **通道统计计算**: ``` ChannelStat(Xin) = AvgPool(Xin) ``` 2. **亲和力计算**: ``` Affinity(Xin) = σ(W2δ(W1ChannelStat(Xin))) ``` 3. **特征重新校准**: ``` XCAM = XGM · Affinity(Xin) ``` **与SE块的区别**: - CAM专门针对多组变换中的跨通道注意力设计 - 允许组间信息交换,克服分组Mamba算子的固有限制[7][8] ## 3. 解决的关键问题 ### 3.1 计算效率问题 **解决方案**: - 通过将通道分为四组,显著减少了参数数量和计算复杂度[6] - 相比VMamba-T,GroupMamba-T参数减少26%,吞吐量提升2.5倍[12] **效果**: - GroupMamba-T:2300万参数,相比传统方法参数效率提升显著[9] ### 3.2 稳定性问题 **解决方案**: - 引入基于蒸馏的训练目标,稳定大型模型训练[8] - 联合损失函数:`Ltotal = αLCE(Zs, y) + (1-α)LCE(Zs, yt)`[8] **效果**: - 大型模型训练更加稳定,损失收敛更平滑[20][21] - GroupMamba-B通过蒸馏损失准确率提升1.3%[20] ### 3.3 空间建模局限性 **解决方案**: - 四方向扫描策略提供全面的空间覆盖[6][7] - 有效建模局部和全局信息的空间依赖关系[2] **效果**: - 四个扫描方向相比单一方向能捕获更丰富的空间线索[20] - 在ImageNet-1K上达到state-of-the-art性能[9] ### 3.4 通道交互问题 **解决方案**: - CAM算子增强跨通道通信,补偿分组操作的局限性[7] - 通过通道重新校准提升网络表示能力[7] **效果**: - CAM模块使准确率从82.20%提升到82.50%[12] - 有效解决了分组操作带来的信息交换受限问题[7][8] ## 总结 GroupMamba Layer通过创新的分组设计、多方向扫描和通道调制机制,成功解决了传统Mamba模型在视觉任务中的效率、稳定性和交互性问题,为构建高效的视觉状态空间模型提供了新的解决方案[1][2]。 ================================================ FILE: module-info/CVPR2025-LSNet.md ================================================ # LSNet中的LS Block模块总结 https://arxiv.org/pdf/2503.23135 ## 1. 
背景 ### 传统轻量级网络的局限性 现有轻量级视觉网络主要依赖两种token混合方式: - **自注意力机制**:采用全局感知和全局聚合,但在信息量较少的区域(如背景)会产生冗余注意力,且感知和聚合使用相同的混合范围,扩展上下文时计算复杂度显著增加[1][2] - **卷积操作**:使用相对位置关系进行感知,通过固定核权重进行聚合,但关系建模仅依赖相对位置,对不同上下文缺乏适应性,表达能力受限[2][6][7] ### 人类视觉系统的启发 人类视觉系统具有动态异尺度视觉能力,遵循双步机制: - **周边视觉**:通过大视野感知捕获场景的广泛概览("看大") - **中央视觉**:通过小视野聚合实现对特定元素的详细理解("聚小") 这种机制源于视网膜中两种感光细胞的不同分布和功能:杆状细胞广泛分布于周边区域负责大视野感知,锥状细胞集中在中央凹负责精细聚焦[3]。 ## 2. 模块原理 ### LS卷积的核心设计 LS Block的核心是LS(Large-Small)卷积,包含两个关键步骤: #### 大核感知(Large-Kernel Perception, LKP) - 采用大核瓶颈块设计 - 首先使用1×1卷积将通道维度降至C/2以减少计算成本 - 然后使用KL×KL的大核深度卷积高效捕获大视野空间上下文信息 - 最后通过1×1卷积生成上下文自适应权重W∈R^(H×W×D)用于聚合步骤[7][8] 数学表达: ``` wi = Pls(xi, NKL(xi)) = PW(DWKL×KL(PW(NKL(xi)))) ``` #### 小核聚合(Small-Kernel Aggregation, SKA) - 采用分组动态卷积设计 - 将特征图通道分为G组,每组包含C/G个通道,同组内共享聚合权重以降低内存开销 - 将LKP生成的权重wi重塑为w*i∈R^(G×KS×KS) - 使用w*i对高度相关的KS×KS邻域进行自适应聚合[8] 数学表达: ``` yic = Als(w*ig, NKS(xic)) = w*ig ⊛ NKS(xic) ``` ### LS Block的完整结构 LS Block基于LS卷积构建,包含以下组件: - **LS卷积**:执行有效的token混合 - **跳跃连接**:促进模型优化 - **额外的深度卷积和SE层**:通过引入更多局部归纳偏置增强模型能力 - **前馈网络(FFN)**:用于通道混合[9] ## 3. 
解决的问题 ### 3.1 计算效率问题 **问题**:传统自注意力机制在扩展感知范围时计算复杂度急剧增加 **解决方案**: - 通过异尺度设计,大核感知使用高效的深度卷积,小核聚合限制在小区域 - 总计算复杂度为O(HWC/4(3C + 2K²L + (2G + 4)K²S)),相对输入分辨率呈线性关系[8] - 实验显示LS卷积相比其他方法在更低FLOPs下获得更高准确率[17] ### 3.2 表达能力限制问题 **问题**:传统卷积的聚合权重由固定核权重决定,缺乏对不同上下文的适应性 **解决方案**: - LKP通过大核感知建模丰富的空间关系 - SKA基于感知结果进行动态自适应聚合 - 消融实验显示相比简单的大小核组合,LS卷积提升1.5%准确率[17] ### 3.3 感知范围与聚合精度的平衡问题 **问题**:现有方法难以在有限计算预算下同时实现广泛感知和精确聚合 **解决方案**: - "看大聚小"策略:大范围感知捕获全局上下文,小范围聚合实现精确特征融合 - 可视化分析显示LS卷积同时具备中央区域聚焦和广泛周边视野能力[33] - 聚合权重可视化表明能够准确强化语义相关区域[35] ### 3.4 轻量级网络的性能瓶颈 **问题**:轻量级网络在有限计算资源下难以获得足够的表达能力 **解决方案**: - 通过生物启发的设计提高特征表达效率 - 在ImageNet-1K上,LSNet-T仅用0.31G FLOPs达到74.9%准确率,显著超越同等计算量的其他方法[11] - 在多个下游任务中均表现出色,证明了良好的迁移能力[12][14][15] LS Block通过巧妙结合大核感知和小核聚合,成功解决了轻量级网络在效率、表达能力和感知精度方面的关键挑战,为轻量级视觉网络设计提供了新的解决思路。 ================================================ FILE: module-info/CVPR2025-MambaIRV2.md ================================================ # Attentive State Space Group (ASSG) 模块总结 https://arxiv.org/pdf/2411.15269 ## 1. 背景 ### 问题背景 传统Mamba架构在图像修复任务中面临的核心挑战: - **因果建模限制**:Mamba的状态空间方程具有因果性质,每个像素只能依赖于扫描序列中的前序像素,无法全局利用相似像素[1][2] - **局部-全局建模需求**:图像修复任务既需要捕获局部细节特征,也需要全局上下文信息进行有效修复[9] - **计算效率要求**:需要在保证性能的同时控制计算复杂度,特别是对于高分辨率图像[9] ### 设计动机 基于对注意力机制与状态空间模型数学联系的深入分析,发现可以通过修改状态空间方程的输出矩阵C来实现类似注意力的非因果查询能力[6]。同时,考虑到图像修复任务的层次化特性,需要设计能够同时处理局部和全局信息的模块架构[9]。 ## 2. 模块原理 ### 整体架构设计 ASSG采用分层处理策略,包含多个Attentive State Space Block (ASSB),每个ASSB实现渐进式局部到全局建模[9]: ``` ASSG = {ASSB₁, ASSB₂, ..., ASSBₙ} ``` ### ASSB内部结构 每个ASSB采用统一的模板设计[9]: - **Norm + Token Mixer + Norm + FFN**的基本结构 - **双重Token Mixer**: - 局部部分:窗口多头自注意力(Window MHSA)处理局部交互 - 全局部分:注意力状态空间模块(ASSM)处理全局依赖 - **残差连接**:引入可学习缩放因子的残差连接[9] ### 核心组件协同 1. **Window MHSA**:负责窗口内的局部特征交互,利用自注意力机制捕获精细的局部结构信息[9] 2. **ASSM (Attentive State Space Module)**: - 包含ASE (Attentive State-space Equation)和SGN (Semantic Guided Neighboring) - 通过单次语义空间扫描实现全局建模[7][8][9] 3. **分层信息融合**:通过多个ASSG的堆叠,形成从浅层到深层的特征层次[9] ## 3. 
解决了什么问题 ### 3.1 局部-全局建模平衡 **问题**:传统方法要么局限于局部感受野(CNN),要么计算复杂度过高(全局注意力) **解决方案**: - 通过Window MHSA高效处理局部交互 - 通过ASSM实现计算友好的全局建模 - 渐进式设计确保信息从局部到全局的有效传递[9] ### 3.2 计算效率优化 **问题**:多方向扫描导致计算冗余,参数利用效率低 **解决方案**: - 单次扫描策略:相比传统4方向扫描减少43%参数和50%计算负担[19] - 参数预算重分配:将节省的参数用于增强局部建模能力(Window MHSA)[9] ### 3.3 特征表示能力增强 **问题**:Mamba的因果性限制了对图像全局信息的利用 **解决方案**: - ASE通过提示学习机制实现非因果查询,使模型能够"看到"未扫描的像素[7][8] - SGN通过语义重排缓解长距离衰减问题[9] - 局部-全局协同建模提升整体特征表示能力 ### 3.4 架构通用性 **问题**:需要一个能够适应多种图像修复任务的通用骨干网络 **解决方案**: - 模块化设计支持不同任务的灵活配置 - 在超分辨率、去噪、JPEG压缩伪影去除等多个任务上均取得优异性能[11][13][14][16][18][19] - 提供Small、Base、Large三种规模变体满足不同应用需求[10] ### 性能验证 实验结果表明ASSG设计的有效性: - **消融研究**:移除ASSM后性能显著下降,验证了全局建模的重要性[10] - **效率对比**:相比HAT等方法在保持性能的同时显著降低计算复杂度[16][17] - **泛化能力**:在多个数据集和任务上均表现出色,证明了架构的通用性[11][13][14][16][18][19] ASSG模块通过巧妙的局部-全局协同设计,成功解决了Mamba在图像修复任务中的关键限制,为状态空间模型在计算机视觉领域的应用提供了重要突破[9]。 ================================================ FILE: module-info/CVPR2025-MambaOut.md ================================================ # Gated CNN Block 模块总结 https://arxiv.org/pdf/2405.07992 ## 1. 背景 ### 历史发展背景 Gated CNN block最初由Dauphin等人在2017年提出,用于语言建模任务[18]。在本文中,作者发现**Mamba block实际上是基于Gated CNN block构建的**[9][10]。 ### 与Mamba的关系 通过对比分析发现,**Mamba block和Gated CNN block的主要区别仅在于是否包含SSM(状态空间模型)组件**[1][9]: - **Gated CNN block**: `TokenMixer(Z) = Conv(Z)`[10] - **Mamba block**: `TokenMixer(Z) = SSM(σ(Conv(Z)))`[10] 这一发现促使作者构建MambaOut模型来验证SSM在视觉任务中的必要性[9]。 ## 2. 模块原理 ### 整体架构 Gated CNN block采用了MetaFormer的元架构设计[9],其数学表达式为: ``` X' = Norm(X) [9] Y = (TokenMixer(X'W₁) ⊙ σ(X'W₂))W₃ + X [9] ``` ### 核心组件设计 **Token Mixer设计**[10]: - 使用**7×7深度卷积**作为token mixer,遵循ConvNeXt的设计 - 采用**部分通道卷积**策略,仅对部分通道进行深度卷积以提升实际运行速度 **门控机制**[10]: - 输入通过`fc1`线性层分为三个部分:`g`(门控)、`i`(信息)、`c`(卷积) - 门控部分`g`经过激活函数后与其他部分相乘,实现选择性信息传递 - 公式:`output = fc2(act(g) * cat(i, conv(c)))` ### 具体实现细节 根据Algorithm 1的PyTorch代码[10]: - **扩展比例**:默认为8/3 - **卷积核大小**:7×7 - **分组卷积**:使用深度可分离卷积 - **残差连接**:包含shortcut连接确保梯度流动 ## 3. 
解决了什么问题 ### 计算效率问题 **线性复杂度优势**[4][5]: - 相比于注意力机制的二次复杂度,卷积操作提供了更高的计算效率 - 特别适合处理不需要全局信息交互的任务 ### 特征选择问题 **门控机制的优势**[10]: - 通过门控单元实现**选择性特征传递** - 允许模型自适应地决定哪些信息应该被保留或抑制 - 提供了比普通卷积更强的表达能力 ### 架构简化问题 **奥卡姆剃刀原理**[14]: - 对于不需要复杂序列建模的视觉任务,**Gated CNN提供了更简洁有效的解决方案** - 实验证明,在ImageNet图像分类任务中,去除SSM的MambaOut模型反而表现更好 ### 实际应用问题 **工程实现优势**[10]: - 代码实现**简单优雅** - 相比复杂的SSM机制,更容易理解和调试 - 在不需要长序列建模的场景下,提供了更好的性能-复杂度权衡 ## 核心洞察 Gated CNN block的成功说明了一个重要原则:**架构设计应该与任务特征相匹配**[2]。对于图像分类这类不需要长序列和自回归特征的任务,简单的门控卷积架构就足够了,而不需要引入额外的SSM复杂性[3][14]。 这为未来的模型设计提供了重要启示:**并非所有任务都需要最新最复杂的架构,有时候更简单的解决方案反而更有效**。 ================================================ FILE: module-info/CVPR2025-MambaVision.md ================================================ # MambaVision Mixer模块总结 https://arxiv.org/pdf/2407.08083 ## 1. 背景 ### 原始Mamba在视觉任务中的局限性 传统Mamba架构虽然在自然语言处理任务中表现出色,但在计算机视觉应用中面临显著挑战[2][3]: - **顺序处理限制**:Mamba的自回归特性适合序列数据处理,但图像像素不具有严格的顺序依赖关系,空间关系更多是局部的,需要并行和集成的处理方式[2] - **全局上下文捕获不足**:自回归模型逐步处理数据,限制了在单次前向传播中捕获和利用全局上下文的能力[3] - **因果卷积的方向性限制**:原始Mamba使用因果卷积,限制了影响范围到单一方向,这对视觉任务来说是不必要且具有限制性的[8] ### 现有解决方案的不足 虽然Vision Mamba (Vim)等方法提出了双向SSM来解决全局上下文缺失问题,但这些方法引入了显著的延迟,因为需要在做出预测前处理整个序列,增加的复杂性还可能导致训练困难和过拟合风险[3]。 ## 2. 模块原理 ### 核心设计思想 MambaVision Mixer通过创建**对称双分支架构**来重新设计原始Mamba块,如图3所示[8][9]: ### 具体架构组成 #### 分支1:改进的SSM分支 ``` X1 = Scan(σ(Conv(Linear(C, C/2)(Xin)))) ``` - 将原始的**因果卷积替换为常规卷积**,消除单向限制[8] - 保留选择性扫描(Scan)操作进行序列建模[9] - 使用SiLU激活函数[9] #### 分支2:对称非SSM分支 ``` X2 = σ(Conv(Linear(C, C/2)(Xin))) ``` - **不包含SSM操作**的纯卷积分支[8] - 使用相同的卷积和SiLU激活配置[9] - 作为补偿路径处理可能因SSM顺序约束丢失的内容[8] #### 特征融合 ``` Xout = Linear(C/2, C)(Concat(X1, X2)) ``` - 将两个分支输出**连接(Concat)**而非相加[9] - 通过最终线性层投影回原始嵌入维度[9] - 每个分支输出维度为C/2,保持参数量与原始设计相似[9] ### 算法实现 论文提供了PyTorch风格的伪代码实现[7],展示了完整的前向传播过程,包括: - 输入投影和维度分割 - 双分支并行处理 - 选择性扫描操作 - 特征连接和输出投影 ## 3. 
解决了什么问题 ### 3.1 空间信息处理效率问题 **问题**:原始Mamba的因果卷积限制了空间信息的双向流动[8] **解决方案**:使用常规卷积替代因果卷积,允许特征在所有空间方向上自由传播,更适合处理图像的二维空间结构[8] ### 3.2 信息丢失补偿问题 **问题**:SSM的顺序约束可能导致重要空间信息的丢失[8] **解决方案**:引入对称的非SSM分支作为"安全网",确保即使SSM分支丢失某些信息,也能通过纯卷积路径得到补偿[8] ### 3.3 全局与局部特征平衡问题 **问题**:需要同时捕获序列依赖和空间上下文信息[9] **解决方案**:双分支设计使最终特征表示能够融合序列信息(来自SSM分支)和空间信息(来自卷积分支),充分利用两种处理方式的优势[9] ### 3.4 性能验证结果 通过系统性消融研究验证了设计有效性[14]: | 配置 | ImageNet Top-1 | COCO AP_box | COCO AP_mask | ADE20K mIoU | |------|----------------|-------------|--------------|-------------| | 原始Mamba (因果conv1, 无conv2) | 80.9% | 44.8 | 40.2 | 44.2% | | 常规conv1, 无conv2 | 80.9% | 45.0 | 40.8 | 44.7% | | conv1 + conv2, 无连接 | 81.3% | 45.3 | 41.0 | 45.7% | | **完整MambaVision Mixer** | **82.3%** | **46.4** | **41.8** | **46.0%** | 最终的连接操作带来了显著提升:ImageNet Top-1准确率+1.0%,COCO box AP +1.1,mask AP +0.8,ADE20K mIoU +0.9[14]。 这些结果验证了MambaVision Mixer通过双分支架构和特征连接,成功解决了原始Mamba在视觉任务中的核心局限性,实现了更丰富的特征表示、更好的泛化能力和改进的计算机视觉任务性能[9]。 ================================================ FILE: module-info/CVPR2025-MobileMamba.md ================================================ # MobileMamba模块详细分析 https://arxiv.org/pdf/2411.15941 ## 1. 背景 ### 现有方法的局限性 - **CNN模型局限**:基于CNN的轻量级模型(如MobileNets)主要使用局部感受野,难以捕获长距离依赖关系,在高分辨率下游任务中性能受限[1][4] - **Transformer复杂度问题**:Vision Transformers虽然具有全局感受野和长距离建模能力,但存在二次计算复杂度,在高分辨率场景下计算开销较高[1][3] - **现有Mamba模型不足**:尽管状态空间模型具有线性计算复杂度优势,但当前轻量级Mamba模型存在推理速度慢、性能不佳的问题[3] ### 设计动机 研究发现现有Mamba结构虽然FLOPs较低,但实际推理速度较慢,性能表现不理想[3]。因此需要设计一个既能保持Mamba线性复杂度优势,又能显著提升推理速度和性能的新框架。 ## 2. 
模块原理 ### 整体架构设计 MobileMamba采用**三阶段网络架构**替代传统四阶段设计[6]。三阶段网络在第一次下采样时将输入图像降至H/16×W/16×C1,最终输出H/64×W/64×C4,相比四阶段网络减少计算量并提升推理速度[6]。 ### 核心模块:多感受野特征交互(MRFFI) MRFFI模块是MobileMamba的核心创新,将输入特征沿通道维度分为三个部分进行并行处理[7]: #### 2.1 长距离小波变换增强Mamba (WTE-Mamba) **功能**:在全局建模基础上增强高频边缘细节提取能力[7] **实现原理**: - 对输入特征的第一部分 \[x_{IG} \in \mathbb{R}^{h×w×ξc}\] 通过双向扫描Mamba模块学习全局信息[7] - 同时对相同特征图进行Haar小波变换,获得不同频率尺度的特征表示 \[x_{Iw} \in \mathbb{R}^{h/2×w/2×4ξc}\][7] - 通过局部卷积信息提取和逆小波变换恢复原始特征图尺寸[7] **数学表达**: ``` x_{Im1} = SSM(σ(Conv(Linear(x_{IG}[:ξc])))) x_{Im2} = σ(Linear(x_{IG}[ξc:])) x_{Om} = Linear(x_{Im1} ⊗ x_{Im2}) ``` 小波变换部分: ``` x_{Iwt} = WT(x_{Iw}) = [f_{LL}, f_{LH}, f_{HL}, f_{HH}] x_{Ow} = IWT(Conv(x_{Iwt})) ``` 最终输出:\[x_{OG} = x_{Om} + x_{Ow}\][7] #### 2.2 高效多核深度卷积 (MK-DeConv) **功能**:提取具有不同感受野的局部信息,实现多感受野交互[8] **实现原理**: - 将剩余特征 \[x_{IL} \in \mathbb{R}^{h×w×μc}\] 分为n个部分[8] - 每部分使用不同核大小的局部卷积操作:\[x_{OLj} = Conv(x_{ILj}), k = (2j+1), j \in \{1,...,n\}\][8] - 将不同卷积操作结果连接形成输出特征:\[x_{OL} = Concat([x_{OL1},...,x_{OLn}], dim=-1)\][8] #### 2.3 消除冗余恒等映射 **功能**:减少高维空间中的特征冗余,降低计算复杂度,提升处理速度[8][9] **实现**:对剩余 \[(1-ξ-μ)c\] 个通道应用恒等映射,避免不必要的计算[9] **最终输出**: ``` x_O = Concat(x_{OG}, x_{OL}, x_I[(1-ξ-μ)c:]) ``` ### 训练与测试优化策略 - **知识蒸馏**:使用TResNet-L作为教师模型进行软蒸馏[10] - **扩展训练**:从300轮扩展到1000轮训练[10] - **归一化层融合**:测试时融合批归一化层提升推理速度[10] ## 3. 
解决的关键问题 ### 3.1 推理速度问题 **问题**:现有Mamba模型虽然FLOPs较低,但实际推理速度慢[3] **解决方案**: - 采用三阶段架构减少计算量[6] - 通过恒等映射消除冗余计算[9] - 归一化层融合提升推理效率[10] **效果**:相比LocalVim速度提升21倍,相比EfficientVMamba速度提升3.3倍[3] ### 3.2 感受野局限问题 **问题**:单一架构难以同时获得全局和多尺度局部感受野[1] **解决方案**: - WTE-Mamba提供全局感受野和高频细节提取[7] - MK-DeConv提供多尺度局部感受野[8] - 小波变换有效扩大感受野范围[7] **效果**:实现了全局ERF,同时通过多核局部卷积增强邻近信息提取[3] ### 3.3 性能与效率平衡问题 **问题**:现有方法难以在保持高性能的同时实现高效率[3] **解决方案**: - 精心设计的通道分配策略(ξ和μ比例)[9] - 渐进式架构优化[17] - 多种训练策略协同作用[10] **效果**:在ImageNet-1K上达到83.6% Top-1准确率,同时保持高推理速度[12][13] ### 3.4 高分辨率任务适应性问题 **问题**:轻量级模型在高分辨率下游任务中性能不佳[4] **解决方案**: - 线性计算复杂度保证高分辨率处理效率[3] - 多感受野设计增强细节捕获能力[7][8] - 针对不同任务的预训练策略[32] **效果**:在目标检测、实例分割、语义分割等高分辨率任务中均取得显著提升[14][15][16] 通过这些创新设计,MobileMamba成功解决了现有轻量级视觉模型在推理速度、感受野覆盖、性能效率平衡等方面的关键问题,为轻量级视觉模型设计提供了新的解决方案。 ================================================ FILE: module-info/CVPR2025-Mona.md ================================================ # Mona模块详细分析 https://arxiv.org/pdf/2408.08345 ## 1. 背景 ### 传统适配器的局限性 - **来源局限**:现有的计算机视觉适配器设计主要沿用NLP领域的线性适配器结构,使用线性滤波器(主要包括下投影、非线性激活、上投影和跳跃连接)[3][5] - **信号处理差异**:视觉信号与语言信号存在显著差异,具有独特的2D卷积操作特性,而传统线性适配器并非为视觉信号优化[3][5] - **认知维度单一**:大多数现有适配器使用单一线性层压缩上游特征,缺乏多尺度认知能力[3] ### 增量调优的困境 - **性能瓶颈**:现有的视觉增量调优方法无法在具有挑战性的任务(如目标检测和分割)上超越全量微调的上限[1][3] - **参数固定问题**:适配器调优中固定层参数无法微调以匹配新任务的数据分布,导致传递给适配器的特征分布存在偏差[5] ## 2. 
模块原理 ### 整体架构 Mona模块被插入到每个SwinTransformer块的MSA(多头自注意力)和MLP(多层感知器)之后,固定预训练层参数,只更新Mona中的参数[5]。 ### 核心组件 #### 2.1 输入优化机制 **缩放归一化层**: - 添加LayerNorm层和两个可学习权重s1、s2来调整输入分布[5] - 公式表示:`xnorm = s1 · |x0|LN + s2 · x0`[5] - **作用**:使适配器能够调整输入分布和来自固定层的输入比例[5] #### 2.2 多认知视觉滤波器 **多尺度卷积结构**: - 使用三个不同尺寸的深度可分离卷积(DWConv):3×3、5×5、7×7[6] - **设计灵感**:模拟人眼从不同尺度处理视觉信号并整合以获得更好理解的认知过程[5][6] - **参数效率**:采用深度可分离卷积而非标准卷积,最小化额外参数量[6] **特征聚合机制**: - 计算三个滤波器的平均结果[6] - 使用1×1卷积聚合特征[6] - 公式表示:`fdw = x + avg(∑³ᵢ₌₁ ωⁱdw ⊗̂ x)`[6] #### 2.3 跳跃连接 - 在两种卷积类型中都添加跳跃连接,增强适配能力[6] - 点卷积步骤:`fpw = x + ωpw ⊗ x`[6] #### 2.4 完整计算流程 整个Mona的计算过程可表示为: `x = x0 + Ulσ(fpw(fdw(Dl(xnorm))))`[6] 其中Dl和Ul分别表示第l个适配器的下投影和上投影,σ表示GeLU激活函数[6]。 ### 参数分析 每个Mona模块的参数包括: - LayerNorm和缩放因子:2m + 2 - 两个线性层:2mn + m + n - DWConv层:83n(来自3² + 5² + 7² = 83) - 点卷积:n² - **总参数量**:`(2n + 3)m + n² + 84n + 2`[7] ## 3. 解决的关键问题 ### 3.1 视觉信号处理不匹配问题 **问题**:传统线性适配器主要为语言信号设计,不适合处理具有2D空间特性的视觉信号[3][5] **解决方案**: - 引入视觉友好的卷积滤波器替代线性滤波器[5] - 实验证明卷积滤波器能更好地将视觉知识从预训练模型迁移到其他任务[3] ### 3.2 输入分布偏差问题 **问题**:固定层参数无法微调以匹配新任务数据分布,导致传递给适配器的特征分布存在偏差[5] **解决方案**: - 通过缩放归一化层调节输入特征分布[5] - LayerNorm帮助稳定前向输入分布和反向传播梯度[5] ### 3.3 单一认知维度限制 **问题**:现有适配器主要依赖单一线性层压缩上游特征,认知能力有限[3] **解决方案**: - 采用多尺度卷积滤波器从多个认知角度处理上游特征[6] - 模拟人类视觉系统的多尺度认知机制[5][6] ### 3.4 性能上限突破 **问题**:现有增量调优方法无法在视觉识别任务上超越全量微调[1][3] **解决方案**: - Mona成为首个在多个视觉任务上都超越全量微调的适配器方法[3] - 在COCO数据集上比全量微调提升1% mAP,证明了适配器调优范式可以替代全量微调[1][8] 通过这些创新设计,Mona模块成功地将适配器调优的性能推向了新的高度,为视觉任务的高效迁移学习提供了更优的解决方案[3][10]。 ================================================ FILE: module-info/CVPR2025-OverLoCK.md ================================================ # OverLoCK网络模块详解 https://arxiv.org/pdf/2502.20087 ## 1. BasicBlock模块 ### 背景 BasicBlock是OverLoCK网络中Base-Net和Overview-Net的基础构建块。由于这两个子网络主要负责编码低/中级特征和快速生成粗略的全局上下文,因此需要相对简单但有效的模块设计[6][7]。 ### 模块原理 BasicBlock采用以下流水线结构[7]: 1. **残差3×3深度卷积**:首先对输入特征进行局部感知 2. 
**核心处理块**: - Layer Normalization层:特征标准化 - Dilated RepConv层:扩张重参数化卷积,增强特征表达能力 - SE Layer:通道注意力机制,增强重要特征通道 - ConvFFN:卷积前馈网络,进一步处理特征 ### 解决的问题 - **特征编码效率**:通过简洁的设计快速编码低/中级特征 - **计算复杂度控制**:为Base-Net和Overview-Net提供轻量级但有效的特征提取能力 - **局部特征增强**:通过SE机制和扩张卷积增强局部特征表达 ## 2. DynamicBlock模块 ### 背景 DynamicBlock是Focus-Net的核心构建块,需要在自顶向下上下文指导下进行更精细的感知。由于Focus-Net承担"细看"的任务,需要更复杂和强大的模块来处理精细特征[7]。 ### 模块原理 DynamicBlock包含以下关键组件[7]: 1. **残差3×3深度卷积**:基础的局部特征提取 2. **门控动态空间聚合器(GDSA)**:核心的动态特征处理模块 3. **ConvFFN**:卷积前馈网络进行最终特征处理 **上下文流机制**[7][8]: - 上下文先验Pi和特征图Zi通过拼接融合 - 在块内部实现特征级和权重级的双重指导 - 更新后的上下文先验和特征图被分离输出 ### 解决的问题 - **动态特征处理**:通过GDSA实现基于上下文的动态特征聚合 - **自顶向下指导**:有效利用Overview-Net提供的上下文先验 - **精细感知能力**:在全局上下文指导下实现更准确的细粒度特征提取 ## 3. GDSA(门控动态空间聚合器)模块 ### 背景 GDSA是DynamicBlock的核心组件,旨在实现上下文指导的动态特征聚合。传统的静态卷积无法根据输入内容自适应调整,而GDSA通过引入动态机制和门控机制来解决这一问题[7]。 ### 模块原理 GDSA的处理流程如下[7]: 1. **上下文融合**: - 将上下文先验Pi和特征图Zi拼接 - 通过1×1卷积+SiLU激活处理融合特征 2. **动态卷积处理**: - 使用ContMix(上下文混合动态卷积)作为核心令牌混合器 - 利用上下文先验Pi计算动态卷积核权重 - 实现权重级的上下文指导 3. **门控机制**: - 计算动态门控信号来调制特征图 - 通过元素级乘法实现特征级指导 - 消除上下文噪声,增强有用信息 4. **并行分支融合**: - 门控信号与并行分支输出进行元素级乘法 - 实现自适应的特征选择和增强 ### 解决的问题 1. **长距离依赖建模**: - 通过ContMix使固定尺寸卷积核能够捕获全局信息 - 解决传统卷积感受野受限的问题[3][4] 2. **上下文噪声过滤**: - 门控机制有效过滤无关的上下文信息 - 增强有用的语义指导信号 3. **自适应特征聚合**: - 根据输入内容动态调整特征处理策略 - 实现内容感知的特征增强 4. **归纳偏置保持**: - 在获得全局建模能力的同时保持卷积的局部归纳偏置 - 平衡全局和局部特征表达能力 ## 模块协同工作机制 这三个模块在OverLoCK架构中协同工作,实现了"先总览后细看"的仿生视觉机制: - **BasicBlock**:在Base-Net和Overview-Net中快速编码基础特征和全局上下文 - **DynamicBlock + GDSA**:在Focus-Net中利用上下文指导进行精细化特征处理 - **整体协同**:通过上下文流机制实现自顶向下的语义指导,显著提升网络的特征表达能力[7][8] ================================================ FILE: module-info/CVPR2025-SCSegamba.md ================================================ # SAVSS模块详细总结 https://arxiv.org/pdf/2503.01113 ## 1. 
背景 ### 现有方法的局限性 当前裂缝分割方法面临的主要挑战包括[1][2][3]: **CNN方法的限制**: - CNN如ECSNet和SFIAN虽然具有强大的局部归纳特性,但受限的感受野约束了它们建模整个图像中广泛不规则依赖关系的能力[1] - 导致分割不连续和背景噪声抑制能力弱的问题[1] - 即使扩张卷积能扩大感受野,其固有的归纳偏置仍无法完全解决复杂裂缝模式中的重背景干扰问题[1] **Transformer方法的限制**: - 虽然Vision Transformer在捕获不规则像素依赖关系方面表现出色,但注意力计算的二次复杂度导致高内存使用和训练挑战[2] - 限制了在资源受限的边缘设备上的部署和实际应用[2][3] **现有Mamba方法的不足**: - 大多数Mamba方法通过线性层处理特征图,限制了对裂缝特征的选择性增强或抑制能力[3] - 常见的平行或单向对角扫描难以在处理不规则、多方向像素拓扑时保持语义连续性[3] - 在多场景裂缝图像中经常产生误检或漏检[3] ## 2. 模块原理 ### 整体架构 SAVSS(Structure-Aware Visual State Space)模块是SCSegamba的核心组件,包含两个关键设计[5][6]: ### 2.1 门控瓶颈卷积(GBC) **低秩近似原理**: GBC采用瓶颈卷积结构实现参数和计算量的显著降低[7]。假设卷积响应为: ``` z = Qs + c ``` 其中Q是大小为f×(p²×d)的矩阵。通过低秩近似,将其表示为: ``` z = LM^T s + c' ``` 计算复杂度从O(fp²d)降至O(f₀p²d) + O(ff₀)[7]。 **门控机制**: 输入特征x经过以下处理流程[7]: 1. 保留残差连接:`x_residual = x` 2. 生成门控特征:`g1(x) = ReLU(Norm1(f1(x)))` 3. 主分支处理:`x1 = ReLU(Norm2(BottConv2(g1(x))))` 4. 门控分支:`g2(x) = ReLU(Norm3(BottConv3(x)))` 5. 哈达玛积融合:`m(x) = x1 ⊙ g2(x)` 6. 最终输出:`Output = ReLU(Norm4(BottConv4(m(x)))) + x_residual` ### 2.2 结构感知扫描策略(SASS) **四路径设计**: SASS包含四条扫描路径[8]: - 两条平行蛇形路径 - 两条对角蛇形路径 **扫描方程**: 处理方程如下[8]: ``` P̄ = e^(ΔP) Q̄ = (ΔP)^(-1)(e^(ΔP) - I) · ΔQ z_k = P̄z_(k-1) + Q̄w_k u_k = Rz_k + Sw_k ``` 其中: - w ∈ R^(t×D)为输入 - P ∈ R^(G×D)控制隐藏空间状态 - P̄、Q̄分别是P、Q按步长Δ离散化后的参数 - z_k表示时间步k的特定隐藏状态 - u_k表示时间步k的输出 **像素注意力导向融合(PAF)**: 为有效结合初始序列x与经过SS2D处理的序列,集成PAF增强SAVSS捕获裂缝形状和纹理细节的能力[9]。 ## 3. 
解决的关键问题 ### 3.1 裂缝形态学信息捕获问题 **问题**:传统方法难以有效建模裂缝的形态学信息和纹理特征[1] **解决方案**: - GBC通过门控机制动态调整权重,增强模型在处理多样化裂缝模式和复杂背景时的适应性[7] - 瓶颈卷积设计在保持裂缝基本特征的同时动态细化主分支的细粒度特征表征[7] ### 3.2 语义连续性保持问题 **问题**:现有扫描策略在处理不规则、多方向裂缝拓扑时难以保持语义连续性[3][8] **解决方案**: - SASS的四路径设计能够有效提取规则裂缝区域的连续语义信息[8] - 同时在多个方向上保持纹理连续性,适用于具有复杂背景的多场景裂缝图像[8] - 实验证明SASS比其他扫描策略的F1和mIoU分别提升0.30%和0.33%[17] ### 3.3 计算效率与性能平衡问题 **问题**:现有方法难以在保持高分割质量的同时实现低计算资源消耗[3] **解决方案**: - 通过低秩近似显著降低计算复杂度,参数量仅2.80M[14] - 四层SAVSS设计在性能和计算需求间取得最佳平衡[21] - 消融实验显示完整SAVSS配置下F1和mIoU分别达到0.8390和0.8479[16] ### 3.4 复杂场景适应性问题 **问题**:在噪声重、低对比度等复杂干扰条件下分割效果不佳[3][15] **解决方案**: - SASS建立多方向邻接关系,使隐藏状态z_k能够捕获更复杂的拓扑和纹理细节[8] - 在塑料跑道复杂裂缝拓扑、金属材料噪声重背景、地下管道低对比度场景中均表现出色[15] - 有效抑制无关噪声,产生高质量分割图[15] 通过这些创新设计,SAVSS模块成功解决了裂缝分割中的关键技术挑战,为实际应用提供了高效可行的解决方案。 ================================================ FILE: module-info/CVPR2025-Transformers without Normalization.md ================================================ # DyT (Dynamic Tanh) 模块详细总结 https://arxiv.org/pdf/2503.10622 ## 1. 背景 ### 归一化层的普遍性与重要性 - **历史地位**: 自2015年Batch Normalization发明以来,归一化层已成为现代神经网络最基础的组件之一[1] - **广泛应用**: Layer Normalization (LN) 在Transformer架构中被广泛使用,几乎所有现代网络都包含归一化层[1][3] - **传统认知**: 归一化层被认为对深度网络的有效训练是**不可或缺的**,这一信念如此根深蒂固,以至于近年来的新架构往往会替换注意力或卷积层,但几乎总是保留归一化层[1] ### 研究动机 通过对训练好的网络进行分析,研究者发现了一个关键观察:**LN层的输入-输出映射呈现tanh函数般的S形曲线**[5]。这一发现启发了DyT方法的设计思路。 ## 2. 模块原理 ### 核心设计思想 DyT的设计基于对归一化层行为的深入理解: - **S形映射**: LN层产生类似tanh的S形输入-输出曲线[5] - **双重效果**: LN层既能缩放输入激活,又能压缩极值[1] - **非线性特性**: 对极值进行非线性压缩,对中心值进行近似线性变换[5][6] ### 数学定义 ``` DyT(x) = γ * tanh(αx) + β ``` 其中: - **α**: 可学习的标量参数,允许根据输入范围动态调整缩放[7] - **γ**: 可学习的逐通道向量参数,用于缩放变换[7] - **β**: 可学习的逐通道向量参数,用于偏移变换[7] - **tanh函数**: 提供有界的S形压缩特性[7] ### 实现特点 - **直接替换**: 可以直接替换现有架构中的归一化层,无需修改其他组件[2][7] - **无统计计算**: 与归一化层不同,DyT不需要计算激活统计量[1] - **逐元素操作**: 对输入张量的每个元素独立操作[7] ### 参数初始化 - **γ**: 初始化为全1向量[7] - **β**: 初始化为全0向量[7] - **α**: 默认初始化为0.5(LLM训练除外)[7] ## 3. 
解决了什么问题 ### 主要解决的核心问题 #### 3.1 挑战传统认知 - **打破依赖性**: 证明了Transformer可以在**没有归一化层**的情况下稳定训练并达到相同或更好的性能[1][21] - **理论突破**: 挑战了"归一化层对现代神经网络训练不可或缺"的传统观念[1] #### 3.2 计算效率问题 - **显著提升效率**: 在LLaMA 7B模型中,推理时间减少52.4%,训练时间减少42.2%[12] - **简化计算**: 避免了归一化层中复杂的统计量计算(均值、方差)[1] #### 3.3 架构简化问题 - **实现简单**: 提供了一个极其简单的替代方案,只需要一个tanh函数和几个可学习参数[7] - **易于集成**: 可以直接替换现有架构中的归一化层,无需调整训练超参数[2][7] #### 3.4 性能保持问题 通过大量实验验证,DyT在多个领域都能保持或超越原有性能: - **视觉任务**: 监督学习、自监督学习、扩散模型[8][9] - **语言模型**: LLaMA系列模型[10] - **语音处理**: wav2vec 2.0模型[10][11] - **生物序列**: DNA序列建模[11] #### 3.5 训练稳定性问题 - **稳定训练**: 通过tanh函数的有界特性和α参数的动态调整,确保训练过程的稳定性[12] - **极值处理**: 有效压缩极值激活,防止梯度爆炸或消失[5][6] ### 理论贡献 - **机制理解**: 为理解归一化层的工作机制提供了新的视角[21] - **设计指导**: 为效率导向的网络设计提供了新的选择[12] - **研究启发**: 开辟了无归一化神经网络训练的新研究方向[21] DyT模块的提出不仅提供了一个实用的技术解决方案,更重要的是从根本上重新审视了归一化层在深度学习中的作用,为未来的网络架构设计提供了新的思路和可能性。 ================================================ FILE: module-info/CVPR2025-vHeat.md ================================================ # vHeat模块总结 https://arxiv.org/pdf/2405.16555 ## 1. 背景 ### 现有视觉模型的局限性 - **CNN的限制**:卷积神经网络依赖局部感受野和固定卷积算子,在捕获长程和复杂依赖关系方面存在约束[1] - **ViT的计算瓶颈**:基于自注意力机制的Vision Transformer虽然具有全局特征依赖的优势,但面临O(N²)的计算复杂度问题,在高分辨率图像处理时计算开销巨大[5] - **效率与性能的权衡**:现有改进方法如窗口注意力、线性注意力等在提高效率的同时,往往以牺牲感受野或非线性能力为代价[5] ### 物理启发的动机 研究者从物理热传导领域汲取灵感,发现热传导中的空间局部性对热能传递的重要性与视觉语义在空间域内的传播具有相似性——相邻图像区域在特定尺度下往往包含相关信息或共享相似特征[1]。 ## 2. 模块原理 ### 物理热传导方程基础 vHeat基于二维空间中的经典物理热传导方程[6]: ``` ∂u/∂t = k(∂²u/∂x² + ∂²u/∂y²) ``` 其中: - u(x,y,t)表示时刻t在位置(x,y)的温度 - k > 0为热扩散系数,衡量材料中的热传递速率 ### 热传导算子(HCO)设计 #### 核心实现 将二维温度分布u(x,y,t)扩展到多通道图像特征U(x,y,c,t),HCO的离散实现为[8]: ``` U^t = IDCT2D(DCT2D(U^0) × e^(-k(ωx²+ωy²)t)) ``` #### 关键组件 1. **DCT2D/IDCT2D变换**:使用二维离散余弦变换替代傅里叶变换,基于Neumann边界条件假设,适应视觉数据的矩形约束特性[8] 2. **自适应热扩散系数**: - 通过频率值嵌入(FVEs)预测热扩散系数k[9] - FVEs类似于ViT中的绝对位置嵌入,但工作在频域[9] - 使k能够根据图像内容自适应调整,实现非均匀的视觉热传导[9] 3. 
**频域滤波机制**: - 系数矩阵e^(-k(ωx²+ωy²)t)在频域中充当自适应滤波器[10] - 不同频率值对应不同图像模式(高频对应边缘和纹理,低频对应平坦区域)[10] ### 网络架构集成 - **分层设计**:采用4阶段分层架构,分辨率从H/4×W/4逐渐降低到H/32×W/32[7] - **热传导层**:类似ViT块,但用HCO替代自注意力算子,保留前馈网络[9] - **深度卷积增强**:结合3×3深度卷积层进行特征提取[9] ## 3. 解决了什么问题 ### 计算复杂度问题 - **显著降低复杂度**:从自注意力的O(N²)降低到O(N^1.5),大幅提升计算效率[1][3] - **高分辨率优势**:当输入图像分辨率增加到768×768时,相比Swin-B实现3倍吞吐量提升、80%更少GPU内存占用、35%更少计算FLOPs[3] ### 全局感受野与效率的统一 - **全局信息感知**:通过频域操作,每个DCT元素都包含来自图像空间所有块的信息,实现全局感受野[3] - **高效并行化**:DCT和IDCT操作具有高并行性,提升训练和测试效率[3] ### 模型可解释性 - **物理基础**:基于可解释的物理热传导原理,相比基于token相似性的自注意力机制更具物理意义[10] - **直观理解**:温度U(x,y,c,t)对应视觉特征,热传导过程模拟信息传播,提供清晰的物理解释[10] ### 性能提升 在多个视觉任务上实现性能提升[11][12][13]: - **图像分类**:vHeat-B在ImageNet-1K上达到84.0%准确率,超越Swin-B 0.5% - **目标检测**:在COCO数据集上consistently优于基线模型 - **语义分割**:在ADE20K上实现更高的mIoU - **泛化能力**:在鲁棒性评估和低级视觉任务上表现优异[13][14] ### 自适应特征表示 通过预测的热扩散系数k实现自适应视觉热传导,能够根据图像内容动态调整信息传播模式,相比固定参数的方法更加灵活和有效[15]。 ================================================ FILE: module-info/ICLR2025-Pola.md ================================================ # PolaFormer中的Pola模块总结 https://arxiv.org/pdf/2501.15061 ## 1. 背景 ### 传统线性注意力的局限性 传统的Transformer自注意力机制具有O(N²)的二次复杂度,在处理长序列或高分辨率图像时计算开销巨大[1]。为解决这一问题,线性注意力方法通过核化特征映射将复杂度降低到O(Nd²)[2]。 ### 现有线性注意力的不足 现有线性注意力方法存在两个关键问题[2]: 1. **信息丢失严重**:使用ReLU、ELU+1等非负特征映射时,只保留正-正交互,完全丢弃负-负和正-负交互信息 2. **注意力过于均匀**:缺乏softmax的指数缩放特性,导致注意力权重分布均匀,熵值过高,无法有效区分重要和不重要的查询-键对 如图1所示,传统线性注意力生成的注意力图过于均匀,而PolaFormer能够产生更接近softmax的尖锐注意力分布[1]。 ## 2. 模块原理 ### 2.1 极性感知分解 Pola模块的核心是将查询向量q和键向量k按极性分解[7]: ``` q = q⁺ - q⁻ k = k⁺ - k⁻ ``` 其中: - q⁺ᵢ = max(qᵢ, 0),q⁻ᵢ = max(-qᵢ, 0) - k⁺ᵢ = max(kᵢ, 0),k⁻ᵢ = max(-kᵢ, 0) ### 2.2 完整交互建模 原始查询-键内积可以分解为四种交互类型[7]: ``` ⟨q, k⟩ = ⟨q⁺, k⁺⟩ + ⟨q⁻, k⁻⟩ - ⟨q⁺, k⁻⟩ - ⟨q⁻, k⁺⟩ └─────同号交互─────┘ └─────异号交互─────┘ ``` 传统线性注意力只保留第一项,Pola模块则显式处理所有四种交互。 ### 2.3 可学习极性混合 为避免直接减法操作导致的不稳定性,Pola模块采用可学习混合策略[7]: 1. **值向量分割**:将值向量v沿通道维度分为两半:v = [vₛ; vₒ] 2. **分流处理**: - 同号流:处理⟨q⁺, k⁺⟩ + ⟨q⁻, k⁻⟩交互,使用vₛ - 异号流:处理⟨q⁺, k⁻⟩ + ⟨q⁻, k⁺⟩交互,使用vₒ 3. 
**系数调节**:通过可学习矩阵Gₛ和Gₒ分别调节两个流的贡献 ### 2.4 降熵幂函数 基于理论分析,Pola模块采用可学习幂函数降低注意力熵值[9]: ``` p = 1 + α sigmoid(w₁, ..., wₐ) g(x; p) = (x₁^p₁, ..., xₐ^pₐ) ``` **理论保证**:定理1证明了具有正一阶和二阶导数的函数g可以降低正序列熵(PSE)[9][26]。 ## 3. 解决的问题 ### 3.1 信息完整性问题 **问题**:传统线性注意力丢失负值交互信息,导致表达能力不足[2] **解决方案**: - 通过极性分解显式建模所有四种查询-键交互类型[7] - 实验显示极性系数Gₛ和Gₒ学习到明显的负相关关系,证明了互补性[8] ### 3.2 注意力尖锐性问题 **问题**:线性注意力权重过于均匀,熵值高,无法聚焦重要信息[2] **解决方案**: - 理论证明并采用可学习幂函数有效降低注意力熵值[9] - 可视化结果显示PolaFormer的注意力熵值(H=2.30/2.45)显著低于传统线性注意力(H=3.72)[31] ### 3.3 计算效率问题 **问题**:在保持线性复杂度的同时提升性能 **解决方案**: - 总复杂度仍为O(Nd²),保持线性特性[10] - 实现1.15×-1.32×的推理加速[12] - 在ImageNet-1K上相比基线提升2.4%-3.7%性能[11][17] ### 3.4 低秩退化问题 **问题**:softmax矩阵固有的低秩特性可能导致退化解[8] **解决方案**: - 引入深度卷积(DWC)等技术增加矩阵秩[8][14] - 消融研究证明DWC比可变形卷积效果更好[14] 通过这些创新设计,Pola模块成功地在保持线性复杂度的前提下,显著提升了线性注意力的表达能力和性能表现。 ================================================ FILE: module-info/ICLR2025-ToST.md ================================================ # Token Statistics Self-Attention (TSSA) 模块总结 https://arxiv.org/pdf/2412.17810 ## 1. 背景 ### 传统注意力机制的挑战 传统Transformer的自注意力机制存在显著的计算瓶颈: - **二次复杂度问题**:需要计算所有token对之间的相似性,导致计算和内存复杂度随token数量呈二次增长 [1] - **成对相似性依赖**:核心操作是scaled dot product attention,通过"key"和"query"参数矩阵计算token对的缩放点积相似性 [1] - **计算负担沉重**:这种设计在处理长序列时带来巨大的计算开销,成为扩展性的主要障碍 [1][2] ### 现有解决方案的局限 已有的高效注意力方法主要包括: - 将token分块处理 [2] - 使用滑动窗口注意力 [2] - 寻找合适的低秩投影 [2] - 通过Nyström扩展近似计算 [2] 但这些方法本质上仍然依赖或近似成对相似性计算,没有从根本上突破传统注意力的设计范式 [2]。 ### 理论动机 研究发现,自注意力操作本质上是一种核回归形式,通过学习的相似性度量对"相似"的输入token进行加权平均 [2]。这启发了一个更抽象的思考:注意力操作可以被视为基于输入token统计量产生输出的更一般算子类别的特例 [2]。 ## 2. 
模块原理 ### 核心数学框架 #### MCR2变分形式 TSSA基于最大编码率降低(MCR2)目标函数的新变分形式。作者证明了定理1:对于凹函数f,存在上界: ``` F(M) ≤ Σf((Q^T MQ)_ii) ``` 这允许通过计算矩阵乘积对角线元素的标量函数来上界大矩阵的谱函数 [7][8]。 #### 变分目标函数 基于此理论,构建变分压缩目标: ``` R^var_c,f(Z,Π|{U_k}) = (1/2)Σ(n_k/n)Σf((1/n_k)(U_k^T Z Diag(π_k) Z^T U_k)_ii) ``` 其中U_k是正交矩阵,π_k是组成员分配向量 [8]。 #### TSSA操作公式 通过对变分目标进行梯度下降,得到TSSA的核心更新公式: ``` z_j^+ = z_j - (τ/n)Σ Π_jk U_k D(Z,π_k|U_k) U_k^T z_j ``` 其中: - **Π_jk**:token j属于组k的概率 - **U_k**:第k个注意力头的投影矩阵 - **D(Z,π_k|U_k)**:基于二阶矩统计量的对角矩阵 [9][10] ### 操作机制解释 #### 统计量计算 TSSA的核心是计算投影token特征的二阶矩统计量: ``` (U_k^T Z)⊙2 π_k/⟨π_k,1⟩ ``` 这估计了在分布π_k/⟨π_k,1⟩下U_k^T Z的二阶矩 [10]。 #### 数据依赖投影 TSSA执行近似的低秩数据依赖投影操作[I - (τ/n)U_k D_k U_k^T]: - **大功率方向**:具有大二阶矩的方向被保留(D_k中对应元素接近0) - **小功率方向**:具有小二阶矩的方向被抑制(D_k中对应元素较大)[10][11] #### 组成员分配 使用基于高斯混合模型的后验概率估计组成员: ``` Π_jk ∝ exp((1/2η)||U_k^T z_j||_2^2) ``` 其中η是可学习的温度参数 [12][13]。 ### 实现细节 #### 复杂度优势 - **时间复杂度**:O(pn),其中p是投影维度,n是token数量 - **空间复杂度**:O(p) - 相比传统注意力的O(pn²)时间和O(n²)空间复杂度有显著改进 [13] #### 实际优化 1. **正交性放松**:实践中不严格执行U矩阵的正交约束 2. **L2归一化**:对投影token进行L2归一化以稳定训练 3. **可学习参数**:将理论中的常数系数吸收到可学习参数中 [29][30] ## 3. 
解决了什么问题 ### 计算效率问题 **问题**:传统自注意力的O(n²)复杂度在长序列处理中造成计算瓶颈 **解决方案**:TSSA实现O(n)线性复杂度,显著提升计算效率。实验显示,对于10k个token,TOST比ViT快约10倍,内存使用减少约100倍 [1][35] ### 内存占用问题 **问题**:传统注意力需要存储n×n的注意力矩阵,内存需求随序列长度二次增长 **解决方案**:TSSA只需要存储O(p)的统计量信息,大幅降低内存占用 [13] ### 可扩展性问题 **问题**:传统Transformer在处理长序列时面临严重的扩展性挑战 **解决方案**:线性复杂度使TOST能够高效处理长序列任务。在Long-Range Arena基准测试中,TOST在Transformer类模型中表现最佳 [18] ### 理论理解问题 **问题**:传统注意力机制缺乏清晰的数学解释和可解释性 **解决方案**:TSSA基于MCR2理论提供了明确的数学推导,每层操作都有清晰的优化目标。可视化实验验证了模型确实在逐层优化设计目标 [16] ### 设计范式问题 **问题**:传统观念认为成对相似性计算对Transformer成功至关重要 **解决方案**:TSSA证明了不依赖成对相似性的注意力机制同样有效,挑战了传统设计范式。实验显示TOST在多个任务上达到了与传统Transformer相当的性能 [3][17] ### 语义理解问题 **问题**:传统注意力机制在语义聚类和分割方面需要复杂的训练策略 **解决方案**:TSSA通过统计量驱动的分组机制自动学习语义聚类,无需额外的监督信号。可视化显示TOST能够自动进行有意义的前景分割 [16][17] 总体而言,TSSA通过从统计学角度重新思考注意力机制,不仅解决了计算效率问题,还提供了更好的理论基础和可解释性,为Transformer架构的发展开辟了新的方向。 ================================================ FILE: module-info/TPAMI2025-HyperYOLO.md ================================================ # Mixed Aggregation Network (MANet) 模块总结 ## 1. 背景 传统YOLO系列方法的骨干网络主要依赖单一的基础模块进行特征提取,如YOLOv8中的C2f模块。这种单一结构限制了信息流的多样性和特征提取能力[7]。为了增强骨干网络的特征辨别能力,需要设计更加丰富和多样化的特征聚合机制来提升基础网络的特征提取能力。 ## 2. 模块原理 MANet通过协同融合三种典型的卷积变体来实现混合聚合[7]: ### 核心组件 - **1×1旁路卷积**:用于通道级特征重校准 - **深度可分离卷积(DSConv)**:用于高效的空间特征处理 - **C2f模块**:用于增强特征层次集成 ### 计算流程[8] ``` Xmid = Conv1(Xin) // 输入通道扩展到2c X1 = Conv2(Xmid) // 1×1卷积分支 X2 = DSConv(Conv3(Xmid)) // 深度可分离卷积分支 X3, X4 = Split(Xmid) // 分割用于C2f处理 // C2f模块的迭代处理 X5 = ConvNeck1(X4) + X4 X6 = ConvNeck2(X5) + X5 ... Xout = Convo(X1||X2||...||X4+n) // 特征融合和压缩 ``` ### 配置优化 通过消融实验确定最优的卷积核尺寸配置[k2, k3, k4, k5] = [3, 5, 5, 3],在性能和参数数量之间取得平衡[16]。 ## 3. 解决的问题 ### 信息流多样性不足 - **问题**:单一的C2f模块限制了梯度流的丰富性和多样性 - **解决**:通过三种不同的卷积结构产生更加多样化和丰富的梯度流,显著放大了基础特征在五个关键阶段中封装的语义深度[7] ### 特征提取能力受限 - **问题**:传统单一模块无法充分利用不同类型的特征表示 - **解决**:混合聚合机制整合了三种经典结构,实现更丰富的信息流动。实验显示,在相同颈部网络下,MANet比C2f模块在所有指标上都表现更优,APval提升1.5个百分点[16] --- # HyperC2Net 模块总结 ## 1. 
背景 传统YOLO模型的颈部设计存在显著局限性[2]: - **PANet局限**:主要局限于相邻层之间的特征融合,无法充分解决跨层级特征集成问题 - **Gold-YOLO不足**:虽然促进了层间信息交换,但仍无法实现特征图内的跨位置交互 - **高阶相关性缺失**:未能充分探索特征相互关系的潜力,特别是涉及高阶相关性的复杂非线性关系[2] ## 2. 模块原理 ### HGC-SCS框架实现 HyperC2Net是HGC-SCS框架的具体实例化,包含三个核心阶段[10]: #### 语义收集阶段 ``` Xmixed = B1||B2||B3||B4||B5 ``` 将来自骨干网络五个阶段的特征图{B1, B2, B3, B4, B5}进行通道级连接,合成跨层级视觉特征[9]。 #### 超图构建与计算 - **顶点构建**:将网格化的视觉特征解构为超图的顶点集合V - **超边构建**:使用距离阈值构建ε-球作为超边[9] ``` E = {ball(v, ε) | v ∈ V} ball(v, ε) = {u | ||xu - xv||d < ε, u ∈ V} ``` - **超图卷积**:采用空间域超图卷积进行高阶消息传递[10] ``` HyperConv(X, H) = X + D⁻¹ᵥHD⁻¹ₑH^T XΘ ``` #### 语义散射阶段 ``` N3, N4, N5 = ϕ(Xhyper, B3), ϕ(Xhyper, B4), ϕ(Xhyper, B5) ``` 将高阶结构信息分散到最终的三个检测尺度[10]。 ### 关键技术特点 - **五尺度融合**:操作跨越五个尺度,突破传统网格结构限制 - **跨层级跨位置**:允许不同层级和位置之间的复杂高阶交互[3] ## 3. 解决的问题 ### 跨层级特征融合限制 - **问题**:PANet仅能融合相邻层信息,这种邻接约束的融合模式限制了网络内信息集成的广度[11] - **解决**:HyperC2Net能够直接融合来自骨干网络的五层特征,实现更强大和多样化的信息流,缩小了不同深度特征之间的连接差距[11] ### 跨位置交互缺失 - **问题**:传统颈部设计不能实现特征图内的跨位置交互,Gold-YOLO虽然能跨层级但不支持跨位置[11] - **解决**:通过超图计算实现非网格约束的信息流动,支持跨层级和跨位置的高阶信息传播,突破了传统网格结构的限制[11] ### 高阶相关性建模不足 - **问题**:传统方法无法充分利用视觉数据中复杂的高阶相关性和非线性关系[3] - **解决**:通过超图计算捕获特征图中潜在的复杂高阶关联,生成的特征表示综合考虑了语义特征和高阶结构特征[11] ### 性能提升验证 消融实验显示高阶学习相比低阶学习APval提升0.4个百分点[16]。公平比较实验中,仅将YOLOv9的颈部替换为HyperC2Net: - Hyper-YOLOv1.1-T相比YOLOv9-T提升2.0 APval - Hyper-YOLOv1.1-S相比YOLOv9-S提升1.2 APval[15] 这验证了高阶学习方法在目标检测任务中的有效性。 ================================================ FILE: mutilmodel-project.md ================================================ # 2025-YOLO|RTDETR多模态目标检测项目 对于当今的视觉任务来说,最简单入手的便是YOLO系列,通过ultralytics库的帮助下,无论是否来自计算机科班的同学基本都可以快速构建自己的目标检测模型。但是与简单方便相伴而来的是现在的YOLO系列模型的整体拒稿率越来越高,甚至与很多期刊或导师看到YOLO四个字便直接Reject,即使组合出性能优异的检测模型也难以发表到心仪的期刊上去,因此单靠单模态的YOLO发有点要求的期刊已经开始显得有些吃力。很多人尝试转向RT-DETR模型,对于从YOLO迁移过去的人来说一样简单好用,但是RTDETR的训练成本要比YOLO系列模型略高,因此对于部分没有服务器/自费服务器的同学来说可能有点难接受。虽然单模态的YOLO确实显得吃力,但是多模态的YOLO就不是这样了,从去年开始多模态就开始慢慢火起来,但由于缺乏相对应的教程,让很多人望而止步,从去年到今年,也越来越多人问,有没有多模态相关的YOLO改进项目?别急,它终于要来了,而且还不止YOLO,RTDETR的多模态也有! ## 1. 这个项目包含什么内容? 1. 
这个项目主体思路是在尽可能地保证继承ultralytics库简单好用的基础上为YOLO与RT-DETR现阶段这两个最热门的目标检测器,提供出多模态的能力。<可以理解为YOLO|RTDETR的多模态进阶版> 2. 这个项目的核心是在原有可见光(RGB)图像的基础上结合红外或深度图谱(以及其他对齐后的图张量数据)实现多模态信息结合的能力。 3. 同时根据自身的工作经验,我们在项目中提供大量不同的多模态模型结构基础模型进行对应的实验选择。 4. 在项目中我提供了灵活自由的模型配置方式<本项目基于Ultralytics的YOLO以及 RTDETR 模型进行对应的修改>通过使用不同的模型 yaml配置方式实现调用不同的模型配置结构,同时拥有几百个改进点的改进项目结合多模态直接起飞~ 5. 当前阶段仅考虑支持目标检测,实例分割,旋转目标检测。不支持姿态检测。 6. 项目内容提供深度模态,DEM 模态的生成。不提供红外模态的生成。 7. 本项目不提供非对齐多模态图像的支持,不提供模态配准的内容,不提供数据集。 ## 2. 这个项目会以什么形式开展? 1. 本次项目核心目的在于为大家提供开箱即用的、完善的图图多模态目标检测项目;得益于架构设计,魔导其他Ultralytics项目内的改进点也可以迁移到多模态项目中(例如v8v10、v11v12、rtdetr改进项目)。 2. 项目内我将提供多种不同形式,融合思路的模型配置,大家可以在其中选择一个进行改进创建。同时未来也会在项目中提供一些模块方便大家组合实验。 3. 这个项目会以未来持续更新的态势进行扩展,包括支持更多多模态基础模型以及不同的实验功能,还有专属于多模态项目以及通用的改进模块。考虑到工作与时间上的问题这会是一个持续更新的过程,大家也不用着急。 4. 附带答疑群,群里主要是答疑实验,代码操作,代码报错等问题。考虑到个人空闲时间问题不一定每一个问题都能及时回答,也可以在群里询问其他大佬的帮助。一些反复出现的高频问题也会收集录制对应的答疑视频来给大家解答。我本人也会在群里给一些多模态写作投稿的思路与建议。 ## 3. 入手须知 1. 本项目毕竟是为YOLO以及RT-DETR系列做的扩展,因此建议在已经有了ultralytics库的使用经验后来使用本项目。同时为了达到最佳效果,强烈建议搭配魔导的相关改进项目来配合使用。 以下人群非常不建议入手此项目: - 未入门、1000%计算机小白(可以考虑先补充相关的基础知识)。 - 不想花时间学习,不想了解多模态结构,仅仅只想水论文。 - 不喜欢看说明或使用文档的。 - 没有跑过ultralytics 库经验的。 2. 此项目不涉及多模态数据中的配准相关问题。 3. 考虑到架构复杂性问题以及多模态结构的特殊性,所以不会考虑提供多模态的剪枝蒸馏在内。但是会考虑提供生成模态的办法作为数据集来源缺失的补充。(生成模态办法主要以深度方面,采用成熟深度学习代码包括一些顶会的工作进行相关模态生成。由于生成模态的作用因此可以在单一模态数据集上进行额外扩展,实现一集多用的办法同时避免配准的问题。) 4. 本项目仅包含图像相关的多模态,不包含图像+文字的多模态。 5. 本项目的环境建议在torch2.0以上版本跑。有一些专门的优化API调用。模型显存占用、体积会比单模态大一些,但是不用担心,速度不会降低很多,依然是快速的训练。 ## 4. 价格 1. 本项目价格为288,购买过其中之一的优惠50,优惠后价格为238。没有时效限制。 2. 虚拟项目一经售出不退不换,需要入手前考虑清楚,如果你是初次入手我的项目,怕我不靠谱,可以先考虑入手个YOLO和RTDETR看下。 3. 如果确定需要购买的话,请把以下的内容原封不动复制给汤圆,“确定2购买5多模态3项目” ## 5. 项目使用问题 1. 购买本项目的使用者都会得到一个独一无二的用于解压7z的密码,到时候用于解压对应的压缩包,此密码自己妥善保管,请勿告诉他人。 2. 本项目的视频和直播回放统一都是加密视频,每个购买者都可以得到一个激活码,激活码在每个人专属的7z压缩文件内。 ## 6. 
更新日志 2025年12月 - 多模态旋转框(OBB)支持:新增训练/验证/预测脚本与 OBB 模型 YAML - 数据集加载修复:支持 .npz .npy 等文件形式加载 - 离线模态生成器:新增 DepthGen 深度图生成器、DEM 特征生成器、EdgeGen 边缘模态生成器 - 可视化系统增强:完善色彩空间与模态消融支持,增强分辨率控制与素材导出。 - 模块新增:新增三十余个模块与其对应配置文件 2025年11月 - 多模态路由:添加动态通道路由与预测器路由兼容性改进,并严格化单模态语义 - 网络与配置扩展:新增 LSCD 轻量化检测头、SOEP 小目标增强颈部模块、门控融合模块,C3k2,C2PSA等变体模块并补充大量多模态 YAML 配置 - 评估指标增强:移植/完善 COCO 评估,并扩展 COCO 尺寸分级 IoU 指标 2025年10月 - 修复RTDETR多模态预测器bbox坐标归一化偏移问题 - 修复RTDETRMM验证器tensor操作,完善RTDETRMM验证器的指标计算 - 优化残差融合架构并统一版本标识系统 2025年9月 - 多模态分割支持:实现YOLOMM多模态分割完整功能 - 可视化系统重构:重构为组件化Pipeline架构 - 性能优化:添加GFLOPs性能指标和统一profile接口 - 修复YOLOMM任务自动检测与类型兼容性 - YOLOv5/v9/v10多模态配置 2025年8月 - 高级融合模块:实现SOTA融合算法(CTF多头交叉注意力、FFN FCM等) - FCM/FFN模块 - DEYOLO系列:DEA、DECA、DEPA、BiFocus、C2f_BiFocus - CAM跨模态注意力机制 - CTF多头交叉注意力 - ICAFusion变体 - RD架构模块 - 对比学习系统:实现基础对比学习与特征捕获架构 - 多模态增强:完成IR专属增强和深度增强系统 - Wiki系统:构建项目内置文档说明系统 - 路由系统优化:统一MultiModalRouter接管软填充与消融 - 预测可视化重构:统一绘图组件与多模态输出 - 强化FP32数值稳定性与调试系统 2025年7月 - 可视化系统:实现完整Grad-CAM热力图和特征图可视化 - COCO验证功能:实现COCOMetrics类和YOLO到COCO格式转换器 - 可视化API统一:为YOLOMM和RTDETRMM添加vis()方法 - 支持多层独立可视化和letterbox预处理 - 修复多模态验证器参数显示问题 ================================================ FILE: objectdetection-tricks/readme.md ================================================ # objectdetection-tricks 这个项目主要是提供一些关于目标检测的tricks. # Explanation - **tricks_1** 可视化并统计目标检测中的TP,FP,FN. 视频教学地址:[可视化-哔哩哔哩](https://www.bilibili.com/video/BV18M411c7jN/). [统计-哔哩哔哩](https://www.bilibili.com/video/BV1yM4y1d7Gp/). - **tricks_2** 深度学习小实验-卷积家族(fps,flops,param)对比实验. 目前支持:Conv,DWConv,Ghost-Conv,GSConv,DSConv,PConv,DCNV2,DCNV3. 视频教学地址:[3.8 哔哩哔哩](https://www.bilibili.com/video/BV15x4y1T7Ly/). [3.19 哔哩哔哩](https://www.bilibili.com/video/BV1UL411R7Qr/). - **tricks_3** yolov5中的FeatureMap可视化(热力图格式). 视频教学地址:[哔哩哔哩](https://www.bilibili.com/video/BV1LV4y1R7w6/). - **tricks_4** 用于yolov5和v7中的yolo格式转换coco格式的脚本.(如何在v5和v7中输出ap_small,ap_middle,ap_large coco指标) 视频教学地址:[哔哩哔哩](https://www.bilibili.com/video/BV14T411s7Ts/). - **tricks_5** Segment Anything演示代码. 
视频教学地址:[哔哩哔哩](https://www.bilibili.com/video/BV1hv4y1H7eg/). - **tricks_6** 固定随机种子以便在同一台主机上复现结果. 视频教学地址:[哔哩哔哩](https://www.bilibili.com/video/BV1bh4y1n7Yc/). - **tricks_7** 计算yolov5推理时间和FPS的脚本. 视频教学地址:[哔哩哔哩](https://www.bilibili.com/video/BV1Uu4y1C714/). - **tricks_8** 计算yolov7推理时间和FPS的脚本. 视频教学地址:[哔哩哔哩](https://www.bilibili.com/video/BV17p4y177Pe/). - **tricks_9** 深度学习小实验-YOLO-Block家族(fps,flops,param)对比实验. 目前支持:C3(Yolov5),ELAN(Yolov7),C2f(Yolov8),RepNCSPELAN(Yolov9). 视频教学地址:[哔哩哔哩](https://www.bilibili.com/video/BV17H4y1V7s9/). - **tricks_10** 输出YOLOV8、RTDETR各个层的计算量和参数量. 视频教学地址:[哔哩哔哩](https://www.bilibili.com/video/BV1tb421b7aB/). - **tricks_11** 以YOLOV8为例,保存多个模型的PR曲线的数据并进行读取绘制到一张图上. 视频教学地址:[哔哩哔哩](https://www.bilibili.com/video/BV1uC41177oE/). - **tricks_12** yolov5、v7、v8、v9、v10曲线对比图、推理时间vs精度对比图绘制手把手教程. 视频教学地址:[哔哩哔哩](https://www.bilibili.com/video/BV1yf421X7t5/). - **tricks_13** YOLOV8-输出每一层的特征图尺寸和通道数. 视频教学地址:[哔哩哔哩](https://www.bilibili.com/video/BV1Mz421B7xz/). - **tricks_14** YOLOV8、V10、V11、V12更详细的输出精度结果. 视频教学地址:[哔哩哔哩](https://www.bilibili.com/video/BV1dBQDY6Ec5/). - **tricks_15** 1. 统计YOLO格式数据集中每个类别的实例数和对应小中大目标的实例数。 2. 可视化YOLO格式数据集中的标签。 3. 去掉YOLO格式数据集中的部分类别并对类别重新排序。 视频教学地址:[哔哩哔哩](https://www.bilibili.com/video/BV1k2TizGEnH). - **tricks_16** 用于调试生成COCO指标的文件. 视频教学地址:[哔哩哔哩](https://www.bilibili.com/video/BV1SdNizEE4X/). 
================================================ FILE: objectdetection-tricks/tricks_1.py ================================================ import os, cv2, tqdm, shutil import numpy as np def xywh2xyxy(box): box[:, 0] = box[:, 0] - box[:, 2] / 2 box[:, 1] = box[:, 1] - box[:, 3] / 2 box[:, 2] = box[:, 0] + box[:, 2] box[:, 3] = box[:, 1] + box[:, 3] return box def iou(box1, box2): x11, y11, x12, y12 = np.split(box1, 4, axis=1) x21, y21, x22, y22 = np.split(box2, 4, axis=1) xa = np.maximum(x11, np.transpose(x21)) xb = np.minimum(x12, np.transpose(x22)) ya = np.maximum(y11, np.transpose(y21)) yb = np.minimum(y12, np.transpose(y22)) area_inter = np.maximum(0, (xb - xa + 1)) * np.maximum(0, (yb - ya + 1)) area_1 = (x12 - x11 + 1) * (y12 - y11 + 1) area_2 = (x22 - x21 + 1) * (y22 - y21 + 1) area_union = area_1 + np.transpose(area_2) - area_inter iou = area_inter / area_union return iou def draw_box(img, box, color): cv2.rectangle(img, (int(box[0]), int(box[1])), (int(box[2]), int(box[3])), color, thickness=2) return img if __name__ == '__main__': postfix = 'jpg' img_path = 'image' label_path = 'label' predict_path = 'predict' save_path = 'vis' classes = ['train', 'diningtable', 'person', 'bus', 'pottedplant', 'chair', 'cat', 'tvmonitor', 'motorbike', 'sofa', 'cow', 'bottle', 'aeroplane', 'dog', 'horse', 'car', 'boat', 'sheep', 'bicycle', 'bird'] detect_color, missing_color, error_color = (0, 255, 0), (0, 0, 255), (255, 0, 0) iou_threshold = 0.45 if os.path.exists(save_path): shutil.rmtree(save_path) os.makedirs(save_path, exist_ok=True) all_right_num, all_missing_num, all_error_num = 0, 0, 0 with open('result.txt', 'w') as f_w: for path in tqdm.tqdm(os.listdir(label_path)): image = cv2.imread(f'{img_path}/{path[:-4]}.{postfix}') if image is None: print(f'image:{img_path}/{path[:-4]}.{postfix} not found.', file=f_w) h, w = image.shape[:2] try: with open(f'{predict_path}/{path}') as f: pred = np.array(list(map(lambda x:np.array(x.strip().split(), dtype=np.float32), 
f.readlines()))) pred[:, 1:5] = xywh2xyxy(pred[:, 1:5]) pred[:, [1, 3]] *= w pred[:, [2, 4]] *= h pred = list(pred) except: pred = [] try: with open(f'{label_path}/{path}') as f: label = np.array(list(map(lambda x:np.array(x.strip().split(), dtype=np.float32), f.readlines()))) label[:, 1:] = xywh2xyxy(label[:, 1:]) label[:, [1, 3]] *= w label[:, [2, 4]] *= h except: print(f'label path:{label_path}/{path} (not found or no target).', file=f_w) right_num, missing_num, error_num = 0, 0, 0 label_id, pred_id = list(range(label.shape[0])), [] if len(pred) == 0 else list(range(len(pred))) for i in range(label.shape[0]): if len(pred) == 0: break ious = iou(label[i:i+1, 1:], np.array(pred)[:, 1:5])[0] ious_argsort = ious.argsort()[::-1] missing = True for j in ious_argsort: if ious[j] < iou_threshold: break if label[i, 0] == pred[j][0]: image = draw_box(image, pred[j][1:5], detect_color) pred.pop(j) missing = False right_num += 1 break if missing: image = draw_box(image, label[i][1:5], missing_color) missing_num += 1 if len(pred): for j in range(len(pred)): image = draw_box(image, pred[j][1:5], error_color) error_num += 1 all_right_num, all_missing_num, all_error_num = all_right_num + right_num, all_missing_num + missing_num, all_error_num + error_num cv2.imwrite(f'{save_path}/{path[:-4]}.{postfix}', image) print(f'name:{path[:-4]} right:{right_num} missing:{missing_num} error:{error_num}', file=f_w) print(f'all_result: right:{all_right_num} missing:{all_missing_num} error:{all_error_num}', file=f_w) ================================================ FILE: objectdetection-tricks/tricks_10.py ================================================ import torch, thop from thop import profile from ultralytics import YOLO, RTDETR from prettytable import PrettyTable if __name__ == '__main__': batch_size, height, width = 1, 640, 640 model = YOLO(r'ultralytics/cfg/models/yolov8/yolov8n.yaml').model # select your model.pt path # model = 
RTDETR(r'ultralytics/cfg/models/rt-detr/rtdetr-resnet50.yaml').model model.fuse() input = torch.randn(batch_size, 3, height, width) total_flops, total_params, layers = profile(model, [input], verbose=True, ret_layer_info=True) FLOPs, Params = thop.clever_format([total_flops * 2 / batch_size, total_params], "%.3f") table = PrettyTable() table.title = f'Model Flops:{FLOPs} Params:{Params}' table.field_names = ['Layer ID', "FLOPs", "Params"] for layer_id in layers['model'][2]: data = layers['model'][2][layer_id] FLOPs, Params = thop.clever_format([data[0] * 2 / batch_size, data[1]], "%.3f") table.add_row([layer_id, FLOPs, Params]) print(table) ================================================ FILE: objectdetection-tricks/tricks_11.py ================================================ import numpy as np import pandas as pd import matplotlib.pyplot as plt if __name__ == '__main__': file_list = ['a/face_Box.csv', 'b/face_Box.csv'] names = ['improve', 'baseline'] ap = ['0.673', '0.639'] plt.figure(figsize=(6, 6)) for i in range(len(file_list)): pr_data = pd.read_csv(file_list[i], header=None) recall, precision = np.array(pr_data[0]), np.array(pr_data[1]) plt.plot(recall, precision, label=f'{names[i]} ap:{ap[i]}') plt.xlabel('Recall') plt.ylabel('Precision') plt.title('Precision-Recall Curve') plt.legend() plt.tight_layout() plt.savefig('pr.png') ================================================ FILE: objectdetection-tricks/tricks_12.py ================================================ import pandas as pd import numpy as np import matplotlib.pylab as plt def deal_yolov7_result(data_path): with open(data_path) as f: data = np.array(list(map(lambda x:np.array(x.strip().split()), f.readlines()))) return data if __name__ == '__main__': epoch = 50 yolov5_result_csv = '/home/hjj/Desktop/github_code/yolov5/runs/train/yolov5n-crowdhuman/results.csv' yolov7_result_csv = '/home/hjj/Desktop/github_code/yolov7/runs/train/yolov7-tiny-crowdhuman/results.txt' yolov8_result_csv = 
'/home/hjj/Desktop/github_code/ultralytics/runs/train/yolov8n-crowdhuman/results.csv' yolov9_result_csv = '/home/hjj/Desktop/github_code/yolov9/runs/train/yolov9s-corwdhuman/results.csv' yolov10_result_csv = '/home/hjj/Desktop/github_code/yolov10/runs/train/yolov10n-crowdhuman/results.csv' yolov5_result_data = pd.read_csv(yolov5_result_csv) yolov7_result_data = deal_yolov7_result(yolov7_result_csv) yolov8_result_data = pd.read_csv(yolov8_result_csv) yolov9_result_data = pd.read_csv(yolov9_result_csv) yolov10_result_data = pd.read_csv(yolov10_result_csv) plt.figure(figsize=(10, 8)) # 调整图形大小 plt.plot(np.arange(epoch), yolov5_result_data[' metrics/mAP_0.5'], label='yolov5n', linewidth=2) plt.plot(np.arange(epoch), np.array(yolov7_result_data[:, 11], dtype=float), label='yolov7-tiny', linewidth=2) plt.plot(np.arange(epoch), yolov8_result_data[' metrics/mAP50(B)'], label='yolov8n', linewidth=2) plt.plot(np.arange(epoch), yolov9_result_data[' metrics/mAP_0.5'], label='yolov9s', linewidth=2) plt.plot(np.arange(epoch), yolov10_result_data[' metrics/mAP50(B)'], label='yolov10n', linewidth=2) plt.xlabel('Epoch', fontsize=14) # 调整x轴标签字体大小 plt.ylabel('mAP@0.5', fontsize=14) # 调整y轴标签字体大小 plt.legend(fontsize=20) # 调整图例字体大小 plt.xticks(fontsize=12) # 调整x轴刻度字体大小 plt.yticks(fontsize=12) # 调整y轴刻度字体大小 plt.title('YOLO CrowdHuman mAP50 Curve', fontsize=20) plt.tight_layout() plt.savefig('mAP50-curve.png') data_dict = { 'yolov5n':[0.672, 0.1+3.2+0.7, '+'], 'yolov7-tiny':[0.74, 4.0, '*'], 'yolov8n':[0.711, 4.5, 'x'], 'yolov9s':[0.772, 9.9, 'D'], 'yolov10n':[0.727, 5.3, '_'] } plt.figure(figsize=(10, 8)) # 调整图形大小 for model_name in data_dict: print(data_dict[model_name][1], data_dict[model_name][0]) plt.scatter(data_dict[model_name][1], data_dict[model_name][0], label=model_name, marker=data_dict[model_name][2], s=500) plt.xlabel('Inference Time(ms/img)', fontsize=14) # 调整x轴标签字体大小 plt.ylabel('mAP@0.5', fontsize=14) # 调整y轴标签字体大小 plt.legend(fontsize=20, loc=4) # 调整图例字体大小 
plt.xticks(fontsize=12) # 调整x轴刻度字体大小 plt.yticks(fontsize=12) # 调整y轴刻度字体大小 plt.title('inferencetimevsmAP50', fontsize=20) plt.tight_layout() plt.savefig('inferencetimevsmAP50.png') ================================================ FILE: objectdetection-tricks/tricks_13.py ================================================ if type(x) in {list, tuple}: if idx == (len(self.model) - 1): if type(x[1]) is dict: print(f'layer id:{idx:>2} {m.type:>50} output shape:{", ".join([str(x_.size()) for x_ in x[1]["one2one"]])}') else: print(f'layer id:{idx:>2} {m.type:>50} output shape:{", ".join([str(x_.size()) for x_ in x[1]])}') else: print(f'layer id:{idx:>2} {m.type:>50} output shape:{", ".join([str(x_.size()) for x_ in x if x_ is not None])}') elif type(x) is dict: print(f'layer id:{idx:>2} {m.type:>50} output shape:{", ".join([str(x_.size()) for x_ in x["one2one"]])}') else: if not hasattr(m, 'backbone'): print(f'layer id:{idx:>2} {m.type:>50} output shape:{x.size()}') ================================================ FILE: objectdetection-tricks/tricks_14.py ================================================ import warnings warnings.filterwarnings('ignore') import os import numpy as np from prettytable import PrettyTable from ultralytics import YOLO from ultralytics.utils.torch_utils import model_info # BILIBILI UP 魔傀面具 # 验证参数官方详解链接:https://docs.ultralytics.com/modes/val/#usage-examples:~:text=of%20each%20category-,Arguments%20for%20YOLO%20Model%20Validation,-When%20validating%20YOLO def get_weight_size(path): stats = os.stat(path) return f'{stats.st_size / 1024 / 1024:.1f}' if __name__ == '__main__': model_path = 'runs/train/exp/weights/best.pt' model = YOLO(model_path) # 选择训练好的权重路径 result = model.val(data='/root/dataset/dataset_visdrone/data.yaml', split='val', # split可以选择train、val、test 根据自己的数据集情况来选择. 
imgsz=640, batch=16, project='runs/val', name='exp', ) if model.task == 'detect': # 仅目标检测任务适用 length = result.box.p.size model_names = list(result.names.values()) preprocess_time_per_image = result.speed['preprocess'] inference_time_per_image = result.speed['inference'] postprocess_time_per_image = result.speed['postprocess'] all_time_per_image = preprocess_time_per_image + inference_time_per_image + postprocess_time_per_image n_l, n_p, n_g, flops = model_info(model.model) print('-'*20 + '论文上的数据以以下结果为准' + '-'*20) print('-'*20 + '论文上的数据以以下结果为准' + '-'*20) print('-'*20 + '论文上的数据以以下结果为准' + '-'*20) print('-'*20 + '论文上的数据以以下结果为准' + '-'*20) print('-'*20 + '论文上的数据以以下结果为准' + '-'*20) model_info_table = PrettyTable() model_info_table.title = "Model Info" model_info_table.field_names = ["GFLOPs", "Parameters", "前处理时间/一张图", "推理时间/一张图", "后处理时间/一张图", "FPS(前处理+模型推理+后处理)", "FPS(推理)", "Model File Size"] model_info_table.add_row([f'{flops:.1f}', f'{n_p:,}', f'{preprocess_time_per_image / 1000:.6f}s', f'{inference_time_per_image / 1000:.6f}s', f'{postprocess_time_per_image / 1000:.6f}s', f'{1000 / all_time_per_image:.2f}', f'{1000 / inference_time_per_image:.2f}', f'{get_weight_size(model_path)}MB']) print(model_info_table) model_metrice_table = PrettyTable() model_metrice_table.title = "Model Metrice" model_metrice_table.field_names = ["Class Name", "Precision", "Recall", "F1-Score", "mAP50", "mAP75", "mAP50-95"] for idx in range(length): model_metrice_table.add_row([ model_names[idx], f"{result.box.p[idx]:.4f}", f"{result.box.r[idx]:.4f}", f"{result.box.f1[idx]:.4f}", f"{result.box.ap50[idx]:.4f}", f"{result.box.all_ap[idx, 5]:.4f}", # 50 55 60 65 70 75 80 85 90 95 f"{result.box.ap[idx]:.4f}" ]) model_metrice_table.add_row([ "all(平均数据)", f"{result.results_dict['metrics/precision(B)']:.4f}", f"{result.results_dict['metrics/recall(B)']:.4f}", f"{np.mean(result.box.f1[:length]):.4f}", f"{result.results_dict['metrics/mAP50(B)']:.4f}", f"{np.mean(result.box.all_ap[:length, 5]):.4f}", # 
50 55 60 65 70 75 80 85 90 95 f"{result.results_dict['metrics/mAP50-95(B)']:.4f}" ]) print(model_metrice_table) with open(result.save_dir / 'paper_data.txt', 'w+') as f: f.write(str(model_info_table)) f.write('\n') f.write(str(model_metrice_table)) print('-'*20, f'结果已保存至{result.save_dir}/paper_data.txt...', '-'*20) print('-'*20, f'结果已保存至{result.save_dir}/paper_data.txt...', '-'*20) print('-'*20, f'结果已保存至{result.save_dir}/paper_data.txt...', '-'*20) print('-'*20, f'结果已保存至{result.save_dir}/paper_data.txt...', '-'*20) print('-'*20, f'结果已保存至{result.save_dir}/paper_data.txt...', '-'*20) ================================================ FILE: objectdetection-tricks/tricks_15.py ================================================ import os, glob, cv2, tqdm from prettytable import PrettyTable RED, GREEN, BLUE, YELLOW, ORANGE, RESET = "\033[91m", "\033[92m", "\033[94m", "\033[93m", "\033[38;5;208m", "\033[0m" image_postfix = ['jpg', 'png', 'bmp', 'tif'] images_folder_path = ['/home/dataset/dataset_visdrone/VisDrone2019-DET-train/images', '/home/dataset/dataset_visdrone/VisDrone2019-DET-val/images', '/home/dataset/dataset_visdrone/VisDrone2019-DET-test-dev/images'] labels_folder_path = ['/home/dataset/dataset_visdrone/VisDrone2019-DET-train/labels', '/home/dataset/dataset_visdrone/VisDrone2019-DET-val/labels', '/home/dataset/dataset_visdrone/VisDrone2019-DET-test-dev/labels'] classes = ['pedestrian', 'people', 'bicycle', 'car', 'van', 'truck', 'tricycle', 'awning-tricycle', 'bus', 'motor'] # classes = ['people', 'bicycle', 'van', 'truck', 'tricycle', 'awning-tricycle', 'bus', 'motor'] object_info = [32*32, 96*96] COLOR_LIST = [ (255, 0, 0), # 红色 (person) (0, 255, 0), # 绿色 (car) (0, 0, 255), # 蓝色 (bike) (255, 165, 0), # 橙色 (motorcycle) (255, 255, 0), # 黄色 (truck) (0, 255, 255), # 青色 (bus) (255, 0, 255), # 品红 (train) (255, 255, 255), # 白色 (airplane) (128, 0, 0), # 棕色 (dog) (0, 128, 0), # 深绿色 (cat) (0, 0, 128), # 深蓝色 (horse) (128, 128, 0), # 橄榄色 (sheep) (0, 128, 128), # 蓝绿色 (cow) 
(128, 0, 128), # 紫色 (elephant) (192, 192, 192), # 银色 (giraffe) (255, 99, 71), # 番茄色 (zebra) (0, 255, 127), # 春绿色 (monkey) (255, 105, 180), # 深粉色 (bird) (70, 130, 180), # 钢蓝色 (fish) ] def get_color_by_class(class_id): # 根据类别的索引返回固定颜色 return COLOR_LIST[class_id % len(COLOR_LIST)] # 确保索引不越界 def draw_detections(box, name, color, img): height, width, _ = img.shape xmin, ymin, xmax, ymax = list(map(int, list(box))) # 根据图像大小调整矩形框的线宽和文本的大小 line_thickness = max(1, int(min(height, width) / 400)) font_scale = min(height, width) / 1000 font_thickness = max(1, int(min(height, width) / 400)) # 根据图像大小调整文本的纵向位置 text_offset_y = int(min(height, width) / 100) cv2.rectangle(img, (xmin, ymin), (xmax, ymax), color, line_thickness) cv2.putText(img, str(name), (xmin, ymin - text_offset_y), cv2.FONT_HERSHEY_SIMPLEX, font_scale, (0, 255, 0), font_thickness, lineType=cv2.LINE_AA) return img def get_images_and_labels_path(images_folder_path, labels_folder_path): labels_path_list, labels_filename = [], {} for folder_path in labels_folder_path: glob_list = glob.glob(os.path.join(folder_path, '*.txt')) filename = {os.path.splitext(os.path.basename(i))[0]:i for i in glob_list} labels_path_list.extend(glob_list) labels_filename.update(filename) images_path_list, images_filename = [], {} for folder_path in images_folder_path: for p in image_postfix: glob_list = glob.glob(os.path.join(folder_path, f'*.{p}')) filename = {os.path.splitext(os.path.basename(i))[0]:i for i in glob_list} images_path_list.extend(glob_list) images_filename.update(filename) print(ORANGE + f'image_path_length:{len(images_filename)} label_path_length:{len(labels_filename)}') image_label_dict = {} for i in labels_filename: if i in images_filename: image_label_dict[labels_filename[i]] = images_filename[i] print(f'After matching. 
def show_dataset_info(image_label_dict, visual_box=False, save_path='visual_box'):
    """Print per-class small/medium/large object statistics for a YOLO dataset.

    Every label row is expected to be ``cls x_c y_c width height`` with
    coordinates normalised to the image size. Object area in pixels is
    compared against the module-level ``object_info`` thresholds: below
    ``object_info[0]`` counts as small, above ``object_info[1]`` as large,
    anything else as medium.

    Args:
        image_label_dict: Mapping ``label_path -> image_path``.
        visual_box: When true, draw every box on its image and save the result.
        save_path: Output directory for annotated images (created on demand).

    Images that cannot be decoded are reported and skipped instead of being
    counted with undefined (or stale) image dimensions.
    """
    def fmt(count, total):
        # A class with no instances (or an empty dataset) must not divide by zero.
        ratio = count / total if total else 0.0
        return f"{count} ({ratio:.1%})"

    if visual_box:
        os.makedirs(save_path, exist_ok=True)

    classes_dict = {cls: {'s': 0, 'm': 0, 'l': 0, 'num': 0} for cls in classes}
    small_thr, large_thr = object_info[0], object_info[1]

    for label_path, image_path in tqdm.tqdm(image_label_dict.items()):
        image = cv2.imread(image_path)
        if image is None:
            # cv2.imread signals failure by returning None rather than raising.
            print(RED + f'{image_path} read failure. skip.' + RESET)
            continue
        h, w = image.shape[:2]

        with open(label_path) as f:
            # Skip blank lines so trailing newlines do not break the unpacking below.
            rows = [line.split() for line in f if line.strip()]

        for cls_id, x_c, y_c, width, height in rows:
            cls_idx = int(float(cls_id))
            stats = classes_dict[classes[cls_idx]]
            stats['num'] += 1

            # Convert the normalised box size to pixels.
            width = float(width) * w
            height = float(height) * h
            obj_area = width * height
            if obj_area < small_thr:
                stats['s'] += 1
            elif obj_area > large_thr:
                stats['l'] += 1
            else:
                stats['m'] += 1

            if visual_box:
                x_c, y_c = float(x_c) * w, float(y_c) * h
                corners = [x_c - width / 2, y_c - height / 2, x_c + width / 2, y_c + height / 2]
                image = draw_detections(corners, classes[cls_idx], get_color_by_class(cls_idx), image)

        if visual_box:
            # Write once per image, after all of its boxes have been drawn.
            cv2.imwrite(os.path.join(save_path, os.path.basename(image_path)), image)

    # Totals over all classes.
    total_s = sum(v['s'] for v in classes_dict.values())
    total_m = sum(v['m'] for v in classes_dict.values())
    total_l = sum(v['l'] for v in classes_dict.values())
    total_num = sum(v['num'] for v in classes_dict.values())

    # Build the table: one row per class, then a summary row.
    table = PrettyTable()
    table.field_names = ["Category", "Small (s)", "Medium (m)", "Large (l)", "Total (num)"]
    for category, values in classes_dict.items():
        num = values['num']
        table.add_row([
            category,
            fmt(values['s'], num),
            fmt(values['m'], num),
            fmt(values['l'], num),
            num,
        ])
    table.add_row([
        "All",
        fmt(total_s, total_num),
        fmt(total_m, total_num),
        fmt(total_l, total_num),
        total_num,
    ])

    # Left-align the category column.
    table.align["Category"] = "l"
    print(table)
if __name__ == '__main__':
    # Render ground-truth and predicted COCO boxes side by side on blank
    # canvases, one PNG per image id, to help diagnose category-id offsets,
    # image_id mismatches and malformed boxes.

    # Start from an empty output directory so stale renders never mix with new ones.
    if os.path.exists(SAVE_PATH):
        shutil.rmtree(SAVE_PATH)
    os.makedirs(SAVE_PATH)

    with open(LABEL_COCO_PATH) as f:
        label = json.load(f)

    with open(PRED_COCO_PATH) as f:
        predictions = json.load(f)

    print(f'label json classes info:{label["categories"]}')

    # image_id -> image metadata plus the ground-truth boxes of that image.
    label_dict = {}
    for data in label['images']:
        label_dict[data['id']] = {
            'file_name': data['file_name'],
            'width': data['width'],
            'height': data['height'],
            'bbox_info': [],
        }

    for data in tqdm.tqdm(label['annotations'], desc='process annotations'):
        image_id = data['image_id']
        if image_id not in label_dict:
            # Report the mismatch instead of aborting with a KeyError.
            print(f'annotation image id:{image_id} not in label images')
            continue
        label_dict[image_id]['bbox_info'].append({'class_id': data['category_id'], 'bbox': data['bbox']})

    pred_classes_set = set()
    pred_dict = {}
    for data in tqdm.tqdm(predictions, desc='process predictions'):
        image_id = data['image_id']
        # Register the image even if all of its predictions fall below the threshold.
        pred_dict.setdefault(image_id, [])
        pred_classes_set.add(data['category_id'])
        if data['score'] < SCORE_THR:
            continue
        pred_dict[image_id].append({'class_id': data['category_id'], 'bbox': data['bbox'], 'score': data['score']})

    print(f'predictions json classes set:{sorted(pred_classes_set)}')

    # Uncomment to compare the raw image ids of both files:
    # print('-'*40 + 'label image_id' + '-'*40)
    # print(label_dict.keys())
    # print('-'*40 + 'pred image_id' + '-'*40)
    # print(pred_dict.keys())

    for image_id, info in tqdm.tqdm(label_dict.items(), desc='process draw func'):
        if image_id not in pred_dict:
            print(f'image id:{image_id} not in predictions.json')
            continue

        canvas_shape = (info['height'], info['width'], 3)
        label_img = np.full(canvas_shape, 255, dtype=np.uint8)
        pred_img = np.full(canvas_shape, 255, dtype=np.uint8)

        for bbox_info in info['bbox_info']:
            class_id = bbox_info['class_id']
            # COCO boxes are [x_min, y_min, width, height]: (x, y) is the
            # top-left corner, not the box centre.
            x, y, w, h = bbox_info['bbox']
            draw_detections([x, y, x + w, y + h], f'{class_id}', get_color_by_class(class_id), label_img)

        for bbox_info in pred_dict[image_id]:
            class_id = bbox_info['class_id']
            score = bbox_info['score']
            x, y, w, h = bbox_info['bbox']
            draw_detections([x, y, x + w, y + h], f'{class_id} {score:.2f}', get_color_by_class(class_id), pred_img)

        plt.figure(figsize=(12, 8))

        plt.subplot(1, 2, 1)
        plt.imshow(cv2.cvtColor(label_img, cv2.COLOR_BGR2RGB))
        plt.axis('off')
        plt.title('label')

        plt.subplot(1, 2, 2)
        plt.imshow(cv2.cvtColor(pred_img, cv2.COLOR_BGR2RGB))
        plt.axis('off')
        plt.title('predictions')

        plt.tight_layout()
        plt.savefig(f'{SAVE_PATH}/{image_id}.png')
        plt.close()
class GSConv(nn.Module):
    """GSConv block: a dense conv, a depthwise 5x5 conv on its output, then a channel regrouping.

    Reference: https://github.com/AlanLi1997/slim-neck-by-gsconv

    ``c2 // 2`` channels come from the first convolution and another
    ``c2 // 2`` from the depthwise convolution applied to that result.
    The ``s`` argument is accepted but not used by this implementation.
    """

    def __init__(self, c1, c2, k=1, s=1, g=1):
        super().__init__()
        half_out = c2 // 2
        self.cv1 = Conv2D(c1, half_out, k, g)
        # groups == channels makes the 5x5 convolution depthwise.
        self.cv2 = Conv2D(half_out, half_out, 5, half_out)

    def forward(self, x):
        dense = self.cv1(x)
        merged = torch.cat((dense, self.cv2(dense)), 1)

        batch, channels, height, width = merged.shape
        # Pair up consecutive channels, then split each pair apart so that all
        # even-indexed channels come first, followed by all odd-indexed ones.
        pairs = merged.reshape(batch * channels // 2, 2, height * width)
        regrouped = pairs.permute(1, 0, 2).reshape(2, -1, channels // 2, height, width)
        even_part, odd_part = regrouped[0], regrouped[1]
        return torch.cat((even_part, odd_part), 1)

    def __str__(self):
        return 'GSConv2D'
class Partial_conv3(nn.Module):
    """Partial convolution: convolve the first ``dim // n_div`` channels, pass the rest through.

    Args:
        dim: Number of input (and output) channels.
        kernel_size: Kernel size of the convolution applied to the leading channels.
        n_div: Divisor selecting how many leading channels are convolved.
        forward: ``'split_cat'`` (training and inference) or ``'slicing'``
            (inference only); any other value raises ``NotImplementedError``.
    """

    def __init__(self, dim, kernel_size, n_div=4, forward='split_cat'):
        super().__init__()
        conv_channels = dim // n_div
        self.dim_conv3 = conv_channels
        self.dim_untouched = dim - conv_channels
        self.partial_conv3 = nn.Conv2d(
            conv_channels,
            conv_channels,
            kernel_size,
            1,
            autopad(kernel_size),
            bias=False,
        )

        # Bind the requested strategy directly as this instance's forward.
        if forward == 'split_cat':
            self.forward = self.forward_split_cat
        elif forward == 'slicing':
            self.forward = self.forward_slicing
        else:
            raise NotImplementedError

    def forward_slicing(self, x):
        """Inference-only variant: overwrite the leading channels of a copy of ``x``."""
        # Work on a clone so the caller's tensor stays intact
        # (e.g. for a residual connection later).
        out = x.clone()
        head = out[:, :self.dim_conv3, :, :]
        out[:, :self.dim_conv3, :, :] = self.partial_conv3(head)
        return out

    def forward_split_cat(self, x):
        """Training/inference variant: split channels, convolve the head, concatenate."""
        head, tail = torch.split(x, [self.dim_conv3, self.dim_untouched], dim=1)
        return torch.cat((self.partial_conv3(head), tail), 1)
if __name__ == '__main__':
    # Inference-latency benchmark for the convolution variants defined above.
    warmup, test_times = 1000, 3000
    bs, h, w = 8, 256, 256
    inc, ouc, kernel_size = 128, 128, 3
    cuda, half = True, True

    module_list = [
        Conv2D(inc, ouc, kernel_size),
        DConv2D(inc, ouc, kernel_size),
        GhostConv2D(inc, ouc, kernel_size=1, ratio=2, dw_size=kernel_size),
        GSConv(inc, ouc, kernel_size),
        DSConv2D(inc, ouc, kernel_size),
        PConv(inc, ouc, kernel_size),
        DCNV2(inc, ouc, kernel_size),
        DCNV3(inc, ouc, kernel_size)
    ]

    # Fall back to CPU (and full precision) when CUDA is requested but missing,
    # instead of failing on the first ``.to(device)``.
    use_cuda = cuda and torch.cuda.is_available()
    device = torch.device("cuda:0") if use_cuda else torch.device("cpu")
    half = half and use_cuda

    inputs = torch.randn((bs, inc, h, w)).to(device)
    if half:
        inputs = inputs.half()

    table = PrettyTable()
    table.title = 'Conv Family Speed'
    table.field_names = ['Name', 'All_Time', 'Mean_Time', 'FPS', "FLOPs", "Params"]

    for module in module_list:
        # eval() freezes BatchNorm statistics; combined with no_grad() below
        # the timings reflect inference cost rather than a training-mode
        # forward pass that also builds an autograd graph.
        module = module.to(device).eval()
        if half:
            module = module.half()
        name = str(module)

        with torch.no_grad():
            for _ in tqdm.tqdm(range(warmup), desc=f'{name} Warmup....'):
                module(inputs)

            all_time = 0
            for _ in tqdm.tqdm(range(test_times), desc=f'{name} Calculate Speed....'):
                begin = time_synchronized()
                module(inputs)
                all_time += time_synchronized() - begin

        FLOPs, Params = thop.profile(module, inputs=(inputs, ), verbose=False)
        FLOPs, Params = thop.clever_format([FLOPs, Params], "%.3f")

        mean_time = all_time / test_times
        table.add_row([name, f'{all_time:.5f}', f'{mean_time:.5f}', f'{1 / mean_time}', f'{FLOPs}', f'{Params}'])

    print(table)
def yolo2coco(arg):
    """Convert the ``test`` split of a YOLO dataset into one COCO JSON file.

    Expected layout under ``arg.root_dir``: ``images/test``, ``labels/test``
    and ``classes.txt`` (one class name per line; the line index is the
    category id). The result is written to ``arg.save_path``.

    Images without a label file, and images OpenCV cannot decode, are skipped.
    The image id is the file stem, converted to ``int`` when it is purely
    decimal.
    """
    root_path = arg.root_dir
    print("Loading data from ",root_path)
    assert os.path.exists(root_path)
    originLabelsDir = os.path.join(root_path, 'labels/test')
    originImagesDir = os.path.join(root_path, 'images/test')
    with open(os.path.join(root_path, 'classes.txt')) as f:
        classes = [line.strip() for line in f]

    # Sorted so annotation ids are reproducible across runs and platforms.
    indexes = sorted(os.listdir(originImagesDir))

    dataset = {'categories': [], 'annotations': [], 'images': []}
    for i, cls in enumerate(classes, 0):
        dataset['categories'].append({'id': i, 'name': cls, 'supercategory': 'mark'})

    # Running annotation id.
    ann_id_cnt = 0
    for index in tqdm(indexes):
        # Derive the label name from the stem so any image extension works and
        # names that merely contain 'images' or '.jpg' are not mangled.
        stem = os.path.splitext(index)[0]
        label_file = os.path.join(originLabelsDir, stem + '.txt')
        if not os.path.exists(label_file):
            # No label file: skip the image before paying for decoding it.
            continue

        im = cv2.imread(os.path.join(originImagesDir, index))
        if im is None:
            print(f'{index} read failure. skip.')
            continue
        img_h, img_w = im.shape[:2]

        image_id = int(stem) if stem.isdecimal() else stem
        dataset['images'].append({'file_name': index,
                                  'id': image_id,
                                  'width': img_w,
                                  'height': img_h})

        with open(label_file, 'r') as fr:
            for line in fr:
                label = line.split()
                if not label:
                    continue  # tolerate blank lines
                # Class ids are zero-based, matching the line order of classes.txt.
                cls_id = int(float(label[0]))
                x, y, w, h = map(float, label[1:5])

                # Normalised centre/size -> absolute corner coordinates.
                x1 = (x - w / 2) * img_w
                y1 = (y - h / 2) * img_h
                x2 = (x + w / 2) * img_w
                y2 = (y + h / 2) * img_h
                box_w = max(0, x2 - x1)
                box_h = max(0, y2 - y1)

                dataset['annotations'].append({
                    'area': box_w * box_h,
                    'bbox': [x1, y1, box_w, box_h],
                    'category_id': cls_id,
                    'id': ann_id_cnt,
                    'image_id': image_id,
                    'iscrowd': 0,
                    # Rectangular mask: four corners, clockwise from top-left.
                    'segmentation': [[x1, y1, x2, y1, x2, y2, x1, y2]]
                })
                ann_id_cnt += 1

    # Save the result.
    with open(arg.save_path, 'w') as f:
        json.dump(dataset, f)
    print('Save annotation to {}'.format(arg.save_path))
def clear(self):
    """Undo the most recent selection step and restore the last confirmed image.

    When the three point lists have equal length, one entry is dropped from
    each; otherwise the longer lists are trimmed by one entry towards the
    shortest. With nothing selected the call is a no-op apart from restoring
    the image, so an extra undo can no longer pop from an empty list.

    Raises:
        ValueError: If the lists still differ in length after trimming.
    """
    stacks = (self.center_point, self.point_lefttop, self.point_rightbottom)
    lengths = [len(stack) for stack in stacks]

    if lengths[0] == lengths[1] == lengths[2]:
        # Never go below zero: an undo on an empty selection must not pop.
        target_len = max(lengths[0] - 1, 0)
    else:
        target_len = min(lengths)

    for stack in stacks:
        if len(stack) > target_len:
            stack.pop()

    if not (len(self.center_point) == len(self.point_lefttop) == len(self.point_rightbottom)):
        # Raising a bare string is itself a TypeError in Python 3.
        raise ValueError("center_point point_lefttop point_rightbottom not equal.")

    self.count = target_len
    self.cur_img = self.last_img.copy()
    print(f'point_lefttop:{self.point_lefttop}\npoint_rightbottom:{self.point_rightbottom}\ncenter_point:{self.center_point}\ncount:{self.count}')
def set_seeds(seed=0, deterministic=False):
    """Seed the Python, NumPy and PyTorch random number generators.

    See https://pytorch.org/docs/stable/notes/randomness.html

    Args:
        seed: Value passed to every generator.
        deterministic: When true, and the installed torch passes
            ``check_version(torch.__version__, '1.12.0')``, additionally
            enable deterministic algorithms and set the related environment
            variables.
    """
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed,
        torch.cuda.manual_seed_all,  # for Multi-GPU, exception safe
    )
    for apply_seed in seeders:
        apply_seed(seed)
    # torch.backends.cudnn.benchmark = True  # AutoBatch problem https://github.com/ultralytics/yolov5/issues/9287

    if not deterministic:
        return
    if not check_version(torch.__version__, '1.12.0'):  # https://github.com/ultralytics/yolov5/pull/8213
        return

    torch.use_deterministic_algorithms(True)
    torch.backends.cudnn.deterministic = True
    os.environ['CUBLAS_WORKSPACE_CONFIG'] = ':4096:8'
    os.environ['PYTHONHASHSEED'] = str(seed)
parser.add_argument('--imgs', nargs='+', type=int, default=[640, 640], help='[height, width] image sizes') parser.add_argument('--device', default='', help='cuda device, i.e. 0 or 0,1,2,3 or cpu') parser.add_argument('--warmup', default=200, type=int, help='warmup time') parser.add_argument('--testtime', default=1000, type=int, help='test time') parser.add_argument('--half', action='store_true', default=False, help='fp16 mode.') opt = parser.parse_args() device = select_device(opt.device, batch_size=opt.batch) # Model weights = opt.weights pretrained = weights.endswith('.pt') if pretrained: model = DetectMultiBackend(weights, device=device) print(f'Loaded {weights}') # report else: assert weights.endswith('.pt'), "compress need weights." example_inputs = torch.randn((opt.batch, 3, *opt.imgs)).to(device) if opt.half: model = model.half() example_inputs = example_inputs.half() print('begin warmup...') for i in tqdm(range(opt.warmup), desc='warmup....'): model(example_inputs) print('begin test latency...') time_arr = [] for i in tqdm(range(opt.testtime), desc='test latency....'): if device.type == 'cuda': torch.cuda.synchronize() start_time = time.time() model(example_inputs) if device.type == 'cuda': torch.cuda.synchronize() end_time = time.time() time_arr.append(end_time - start_time) std_time = np.std(time_arr) infer_time_per_image = np.sum(time_arr) / (opt.testtime * opt.batch) print(f'model weights:{opt.weights} size:{get_weight_size(opt.weights)}M (bs:{opt.batch})Latency:{infer_time_per_image:.5f}s +- {std_time:.5f}s fps:{1 / infer_time_per_image:.1f}') ================================================ FILE: objectdetection-tricks/tricks_8.py ================================================ import warnings warnings.filterwarnings('ignore') import argparse import logging import math import os import random import time import sys from copy import deepcopy from pathlib import Path from threading import Thread import numpy as np import torch.distributed as dist 
import torch.nn as nn import torch.nn.functional as F import torch.optim as optim import torch.optim.lr_scheduler as lr_scheduler import torch.utils.data import yaml from torch.cuda import amp from torch.nn.parallel import DistributedDataParallel as DDP from torch.utils.tensorboard import SummaryWriter from tqdm import tqdm from models.experimental import attempt_load from models.yolo import Model from utils.torch_utils import select_device def get_weight_size(path): stats = os.stat(path) return f'{stats.st_size / 1024 / 1024:.1f}' if __name__ == '__main__': parser = argparse.ArgumentParser() parser.add_argument('--weights', type=str, default='', help='trained weights path') parser.add_argument('--batch', type=int, default=1, help='total batch size for all GPUs') parser.add_argument('--imgs', nargs='+', type=int, default=[640, 640], help='[height, width] image sizes') parser.add_argument('--device', default='', help='cuda device, i.e. 0 or 0,1,2,3 or cpu') parser.add_argument('--warmup', default=200, type=int, help='warmup time') parser.add_argument('--testtime', default=1000, type=int, help='test time') parser.add_argument('--half', action='store_true', default=False, help='fp16 mode.') opt = parser.parse_args() device = select_device(opt.device, batch_size=opt.batch) # Model weights = opt.weights pretrained = weights.endswith('.pt') if pretrained: model = torch.load(weights, map_location=device) if model['ema']: model = model['ema'].float() else: model = model['model'].float() model.fuse() model.info(img_size=opt.imgs[0]) print(f'Loaded {weights}') # report else: assert weights.endswith('.pt'), "compress need weights." 
example_inputs = torch.randn((opt.batch, 3, *opt.imgs)).to(device) if opt.half: model = model.half() example_inputs = example_inputs.half() print('begin warmup...') for i in tqdm(range(opt.warmup), desc='warmup....'): model(example_inputs) print('begin test latency...') time_arr = [] for i in tqdm(range(opt.testtime), desc='test latency....'): if device.type == 'cuda': torch.cuda.synchronize() start_time = time.time() model(example_inputs) if device.type == 'cuda': torch.cuda.synchronize() end_time = time.time() time_arr.append(end_time - start_time) mean_time, std_time = np.mean(time_arr), np.std(time_arr) print(f'model weights:{opt.weights} size:{get_weight_size(opt.weights)}M Latency:{mean_time:.5f}s +- {std_time:.5f}s fps:{1 / mean_time:.1f}') ================================================ FILE: objectdetection-tricks/tricks_9.py ================================================ import torch, time, math, thop, tqdm, torchvision import torch.nn as nn import torch.nn.functional as F from prettytable import PrettyTable import numpy as np def time_synchronized(): # pytorch-accurate time if torch.cuda.is_available(): torch.cuda.synchronize() return time.time() def fuse_conv_and_bn(conv, bn): """Fuse Conv2d() and BatchNorm2d() layers https://tehnokv.com/posts/fusing-batchnorm-and-conv/.""" fusedconv = ( nn.Conv2d( conv.in_channels, conv.out_channels, kernel_size=conv.kernel_size, stride=conv.stride, padding=conv.padding, dilation=conv.dilation, groups=conv.groups, bias=True, ) .requires_grad_(False) .to(conv.weight.device) ) # Prepare filters w_conv = conv.weight.clone().view(conv.out_channels, -1) w_bn = torch.diag(bn.weight.div(torch.sqrt(bn.eps + bn.running_var))) fusedconv.weight.copy_(torch.mm(w_bn, w_conv).view(fusedconv.weight.shape)) # Prepare spatial bias b_conv = torch.zeros(conv.weight.shape[0], device=conv.weight.device) if conv.bias is None else conv.bias b_bn = bn.bias - bn.weight.mul(bn.running_mean).div(torch.sqrt(bn.running_var + bn.eps)) 
fusedconv.bias.copy_(torch.mm(w_bn, b_conv.reshape(-1, 1)).reshape(-1) + b_bn) return fusedconv def autopad(k, p=None, d=1): # kernel, padding, dilation """Pad to 'same' shape outputs.""" if d > 1: k = d * (k - 1) + 1 if isinstance(k, int) else [d * (x - 1) + 1 for x in k] # actual kernel-size if p is None: p = k // 2 if isinstance(k, int) else [x // 2 for x in k] # auto-pad return p class Conv(nn.Module): """Standard convolution with args(ch_in, ch_out, kernel, stride, padding, groups, dilation, activation).""" default_act = nn.SiLU() # default activation def __init__(self, c1, c2, k=1, s=1, p=None, g=1, d=1, act=True): """Initialize Conv layer with given arguments including activation.""" super().__init__() self.conv = nn.Conv2d(c1, c2, k, s, autopad(k, p, d), groups=g, dilation=d, bias=False) self.bn = nn.BatchNorm2d(c2) self.act = self.default_act if act is True else act if isinstance(act, nn.Module) else nn.Identity() def forward(self, x): """Apply convolution, batch normalization and activation to input tensor.""" return self.act(self.bn(self.conv(x))) def forward_fuse(self, x): """Perform transposed convolution of 2D data.""" return self.act(self.conv(x)) class Bottleneck(nn.Module): """Standard bottleneck.""" def __init__(self, c1, c2, shortcut=True, g=1, k=(3, 3), e=0.5): """Initializes a bottleneck module with given input/output channels, shortcut option, group, kernels, and expansion. 
""" super().__init__() c_ = int(c2 * e) # hidden channels self.cv1 = Conv(c1, c_, k[0], 1) self.cv2 = Conv(c_, c2, k[1], 1, g=g) self.add = shortcut and c1 == c2 def forward(self, x): """'forward()' applies the YOLO FPN to input data.""" return x + self.cv2(self.cv1(x)) if self.add else self.cv2(self.cv1(x)) ################################# YOLOV7-ELAN ################################# class ELAN(nn.Module): def __init__(self, inc, ouc, hidc, act=True): super(ELAN, self).__init__() self.conv1 = Conv(inc, hidc, k=1, act=act) self.conv2 = Conv(inc, hidc, k=1, act=act) self.conv3 = Conv(hidc, hidc, k=3, act=act) self.conv4 = Conv(hidc, hidc, k=3, act=act) self.conv5 = Conv(hidc * 4, ouc, k=1, act=act) def forward(self, x): x1, x2 = self.conv1(x), self.conv2(x) x3 = self.conv3(x2) x4 = self.conv4(x3) x_concat = torch.concat([x1, x2, x3, x4], dim=1) x_final = self.conv5(x_concat) return x_final def __str__(self): return 'ELAN' ################################# YOLOV8-C2f ################################# class C2f(nn.Module): """Faster Implementation of CSP Bottleneck with 2 convolutions.""" def __init__(self, c1, c2, n=1, shortcut=False, g=1, e=0.5): """Initialize CSP bottleneck layer with two convolutions with arguments ch_in, ch_out, number, shortcut, groups, expansion. 
""" super().__init__() self.c = int(c2 * e) # hidden channels self.cv1 = Conv(c1, 2 * self.c, 1, 1) self.cv2 = Conv((2 + n) * self.c, c2, 1) # optional act=FReLU(c2) self.m = nn.ModuleList(Bottleneck(self.c, self.c, shortcut, g, k=((3, 3), (3, 3)), e=1.0) for _ in range(n)) def forward(self, x): """Forward pass through C2f layer.""" y = list(self.cv1(x).chunk(2, 1)) y.extend(m(y[-1]) for m in self.m) return self.cv2(torch.cat(y, 1)) def forward_split(self, x): """Forward pass using split() instead of chunk().""" y = list(self.cv1(x).split((self.c, self.c), 1)) y.extend(m(y[-1]) for m in self.m) return self.cv2(torch.cat(y, 1)) def __str__(self): return 'C2f' ################################# YOLOV5-C3 ################################# class C3(nn.Module): """CSP Bottleneck with 3 convolutions.""" def __init__(self, c1, c2, n=1, shortcut=True, g=1, e=0.5): """Initialize the CSP Bottleneck with given channels, number, shortcut, groups, and expansion values.""" super().__init__() c_ = int(c2 * e) # hidden channels self.cv1 = Conv(c1, c_, 1, 1) self.cv2 = Conv(c1, c_, 1, 1) self.cv3 = Conv(2 * c_, c2, 1) # optional act=FReLU(c2) self.m = nn.Sequential(*(Bottleneck(c_, c_, shortcut, g, k=((1, 1), (3, 3)), e=1.0) for _ in range(n))) def forward(self, x): """Forward pass through the CSP bottleneck with 2 convolutions.""" return self.cv3(torch.cat((self.m(self.cv1(x)), self.cv2(x)), 1)) def __str__(self): return 'C3' ################################# YOLOV9-RepNCSPELAN4 ################################# class RepConvN(nn.Module): """RepConv is a basic rep-style block, including training and deploy status This code is based on https://github.com/DingXiaoH/RepVGG/blob/main/repvgg.py """ default_act = nn.SiLU() # default activation def __init__(self, c1, c2, k=3, s=1, p=1, g=1, d=1, act=True, bn=False, deploy=False): super().__init__() assert k == 3 and p == 1 self.g = g self.c1 = c1 self.c2 = c2 self.act = self.default_act if act is True else act if isinstance(act, 
nn.Module) else nn.Identity() self.bn = None self.conv1 = Conv(c1, c2, k, s, p=p, g=g, act=False) self.conv2 = Conv(c1, c2, 1, s, p=(p - k // 2), g=g, act=False) def forward_fuse(self, x): """Forward process""" return self.act(self.conv(x)) def forward(self, x): """Forward process""" id_out = 0 if self.bn is None else self.bn(x) return self.act(self.conv1(x) + self.conv2(x) + id_out) def get_equivalent_kernel_bias(self): kernel3x3, bias3x3 = self._fuse_bn_tensor(self.conv1) kernel1x1, bias1x1 = self._fuse_bn_tensor(self.conv2) kernelid, biasid = self._fuse_bn_tensor(self.bn) return kernel3x3 + self._pad_1x1_to_3x3_tensor(kernel1x1) + kernelid, bias3x3 + bias1x1 + biasid def _avg_to_3x3_tensor(self, avgp): channels = self.c1 groups = self.g kernel_size = avgp.kernel_size input_dim = channels // groups k = torch.zeros((channels, input_dim, kernel_size, kernel_size)) k[np.arange(channels), np.tile(np.arange(input_dim), groups), :, :] = 1.0 / kernel_size ** 2 return k def _pad_1x1_to_3x3_tensor(self, kernel1x1): if kernel1x1 is None: return 0 else: return torch.nn.functional.pad(kernel1x1, [1, 1, 1, 1]) def _fuse_bn_tensor(self, branch): if branch is None: return 0, 0 if isinstance(branch, Conv): kernel = branch.conv.weight running_mean = branch.bn.running_mean running_var = branch.bn.running_var gamma = branch.bn.weight beta = branch.bn.bias eps = branch.bn.eps elif isinstance(branch, nn.BatchNorm2d): if not hasattr(self, 'id_tensor'): input_dim = self.c1 // self.g kernel_value = np.zeros((self.c1, input_dim, 3, 3), dtype=np.float32) for i in range(self.c1): kernel_value[i, i % input_dim, 1, 1] = 1 self.id_tensor = torch.from_numpy(kernel_value).to(branch.weight.device) kernel = self.id_tensor running_mean = branch.running_mean running_var = branch.running_var gamma = branch.weight beta = branch.bias eps = branch.eps std = (running_var + eps).sqrt() t = (gamma / std).reshape(-1, 1, 1, 1) return kernel * t, beta - running_mean * gamma / std def fuse_convs(self): if 
hasattr(self, 'conv'): return kernel, bias = self.get_equivalent_kernel_bias() self.conv = nn.Conv2d(in_channels=self.conv1.conv.in_channels, out_channels=self.conv1.conv.out_channels, kernel_size=self.conv1.conv.kernel_size, stride=self.conv1.conv.stride, padding=self.conv1.conv.padding, dilation=self.conv1.conv.dilation, groups=self.conv1.conv.groups, bias=True).requires_grad_(False) self.conv.weight.data = kernel self.conv.bias.data = bias for para in self.parameters(): para.detach_() self.__delattr__('conv1') self.__delattr__('conv2') if hasattr(self, 'nm'): self.__delattr__('nm') if hasattr(self, 'bn'): self.__delattr__('bn') if hasattr(self, 'id_tensor'): self.__delattr__('id_tensor') class RepNBottleneck(nn.Module): # Standard bottleneck def __init__(self, c1, c2, shortcut=True, g=1, k=(3, 3), e=0.5, act=True): # ch_in, ch_out, shortcut, kernels, groups, expand super().__init__() c_ = int(c2 * e) # hidden channels self.cv1 = RepConvN(c1, c_, k[0], 1, act=act) self.cv2 = Conv(c_, c2, k[1], 1, g=g, act=act) self.add = shortcut and c1 == c2 def forward(self, x): return x + self.cv2(self.cv1(x)) if self.add else self.cv2(self.cv1(x)) class RepNCSP(nn.Module): # CSP Bottleneck with 3 convolutions def __init__(self, c1, c2, n=1, shortcut=True, g=1, e=0.5, act=True): # ch_in, ch_out, number, shortcut, groups, expansion super().__init__() c_ = int(c2 * e) # hidden channels self.cv1 = Conv(c1, c_, 1, 1, act=act) self.cv2 = Conv(c1, c_, 1, 1, act=act) self.cv3 = Conv(2 * c_, c2, 1, act=act) # optional act=FReLU(c2) self.m = nn.Sequential(*(RepNBottleneck(c_, c_, shortcut, g, e=1.0, act=act) for _ in range(n))) def forward(self, x): return self.cv3(torch.cat((self.m(self.cv1(x)), self.cv2(x)), 1)) class RepNCSPELAN4(nn.Module): # csp-elan def __init__(self, c1, c2, c3, c4, c5=1, act=True): # ch_in, ch_out, number, shortcut, groups, expansion super().__init__() self.c = c3//2 self.cv1 = Conv(c1, c3, 1, 1, act=act) self.cv2 = nn.Sequential(RepNCSP(c3//2, c4, c5, 
act=act), Conv(c4, c4, 3, 1, act=act)) self.cv3 = nn.Sequential(RepNCSP(c4, c4, c5, act=act), Conv(c4, c4, 3, 1, act=act)) self.cv4 = Conv(c3+(2*c4), c2, 1, 1, act=act) def forward(self, x): y = list(self.cv1(x).chunk(2, 1)) y.extend((m(y[-1])) for m in [self.cv2, self.cv3]) return self.cv4(torch.cat(y, 1)) def forward_split(self, x): y = list(self.cv1(x).split((self.c, self.c), 1)) y.extend(m(y[-1]) for m in [self.cv2, self.cv3]) return self.cv4(torch.cat(y, 1)) def __str__(self): return 'RepNCSPELAN' class RepNCSPELAN4_Att(nn.Module): # csp-elan def __init__(self, c1, c2, c3, c4, c5=1, act=True): # ch_in, ch_out, number, shortcut, groups, expansion super().__init__() self.c = c3//2 self.cv1 = Conv(c1, c3, 1, 1, act=act) self.cv2 = nn.Sequential(RepNCSP(c3//2, c4, c5, act=act), Conv(c4, c4, 3, 1, act=act)) self.cv3 = nn.Sequential(RepNCSP(c4, c4, c5, act=act), Conv(c4, c4, 3, 1, act=act)) self.cv4 = Conv(c3+(2*c4), c2, 1, 1, act=act) def forward(self, x): y = list(self.cv1(x).chunk(2, 1)) y.extend((m(y[-1])) for m in [self.cv2, self.cv3]) return self.cv4(torch.cat(y, 1)) def forward_split(self, x): y = list(self.cv1(x).split((self.c, self.c), 1)) y.extend(m(y[-1]) for m in [self.cv2, self.cv3]) return self.cv4(torch.cat(y, 1)) def __str__(self): return 'RepNCSPELAN_Att' if __name__ == '__main__': warmup, test_times = 1000, 2000 bs, h, w = 1, 128, 128 channel = 256 cuda, half = True, False module_list = [ C3(channel, channel), ELAN(channel, channel, channel // 2), C2f(channel, channel), RepNCSPELAN4(channel, channel, channel // 2, channel // 4, 1), ] device = torch.device("cuda:0") if cuda else torch.device("cpu") inputs = torch.randn((bs, channel, h, w)).to(device) if half: inputs = inputs.half() table = PrettyTable() table.title = 'Yolo Block Family Speed' table.field_names = ['Name', 'All_Time', 'Mean_Time', 'FPS', "FLOPs", "Params"] for module in module_list: for m in module.modules(): if isinstance(m, (Conv,)) and hasattr(m, "bn"): m.conv = 
fuse_conv_and_bn(m.conv, m.bn) # update conv delattr(m, "bn") # remove batchnorm m.forward = m.forward_fuse # update forward if isinstance(m, RepConvN): m.fuse_convs() m.forward = m.forward_fuse # update forward module = module.to(device) if half: module = module.half() for i in tqdm.tqdm(range(warmup), desc=f'{str(module)} Warmup....'): module(inputs) all_time = 0 for i in tqdm.tqdm(range(test_times), desc=f'{str(module)} Calculate Speed....'): begin = time_synchronized() module(inputs) all_time += time_synchronized() - begin FLOPs, Params = thop.profile(module, inputs=(inputs, ), verbose=False) FLOPs, Params = thop.clever_format([FLOPs, Params], "%.3f") # print(f'{str(module)} all_time:{all_time:.5f} mean_time:{all_time / test_times:.5f} fps:{1 / (all_time / test_times)} FLOPs:{FLOPs} Params:{Params}') table.add_row([str(module), f'{all_time:.5f}', f'{all_time / test_times:.5f}', f'{1 / (all_time / test_times)}', f'{FLOPs}', f'{Params}']) print(table) ================================================ FILE: readme.md ================================================ # Object Detection Script 这个项目主要是提供一些关于目标检测的代码和改进思路参考. ### [BiliBili视频指南](https://github.com/z1069614715/objectdetection_script/blob/master/bilibili-guide.md) # Project <需要入手请加企鹅1615905974/1069614715,如添加不上可bilibili私聊直发企鹅号码,最好好友请求也设置不需要验证就可以加上> 1. 基于Ultralytics的yolov8、yolov10改进项目.(69.9¥) [目前已有的改进方案和更新详细公告](https://github.com/z1069614715/objectdetection_script/blob/master/yolo-improve/yolov8v10-project.md) 项目简单介绍,详情请看项目详解. 1. 提供修改好的代码和每个改进点的配置文件,相当于积木都给大家准备好,大家只需要做实验和搭积木(修改yaml配置文件组合创新点)即可,装好环境即可使用. 2. 后续的改进方案都会基于这个项目更新进行发布,在群公告进行更新百度云链接. 3. 购买了本项目的都会赠送yolov5-PAGCP通道剪枝算法代码和相关实验参数命令. 4. 购买后进YOLOV8V10交流群(代码视频均在群公告),群里可交流代码和论文相关,目前1群2群已满,现在进的是3群,气氛活跃. 5. 项目因为(价格问题)不附带一对一私人答疑服务,群里附带答疑服务,平时我有时间都会回复群里部分问题. 6. 里面配备使用说明(部分改进点使用复杂度高、二次创新、原创的模块都会有对应的视频进行说明) 2. 
基于Ultralytics的yolo11、yolo12改进项目.(69.9¥) [目前已有的改进方案和更新详细公告](https://github.com/z1069614715/objectdetection_script/blob/master/yolo-improve/yolov11-project.md) 项目简单介绍,详情请看项目详解. 1. 提供修改好的代码和每个改进点的配置文件,相当于积木都给大家准备好,大家只需要做实验和搭积木(修改yaml配置文件组合创新点)即可,装好环境即可使用. 2. 后续的改进方案都会基于这个项目更新进行发布,在群公告进行更新百度云链接. 3. 购买了本项目的都会赠送yolov5-PAGCP通道剪枝算法代码和相关实验参数命令. 4. 购买后进YOLOV11交流群(代码视频均在群公告),群里可交流代码和论文相关,气氛活跃. 5. 项目因为(价格问题)不附带一对一私人答疑服务,群里附带答疑服务,平时我有时间都会回复群里部分问题. 6. 里面配备使用说明(部分改进点使用复杂度高、二次创新、原创的模块都会有对应的视频进行说明)。 7. 包含yolo12-目标检测、实例分割、关键点检测、旋转目标检测、分类配置文件,可以通过仅修改配置文件的方式改进yolo12。 3. 基于YOLOV5,YOLOV7的(剪枝+知识蒸馏)项目.(129.9¥)[项目详解](https://github.com/z1069614715/objectdetection_script/blob/master/yolo-improve/yolov5v7-light.md) 1. 模型轻量化,部署必备之一! 2. 项目里面配套几个剪枝和蒸馏的示例,并且都配有视频讲解,供大家理解如何进行剪枝和蒸馏. 3. 购买后进YOLOV5V7轻量化交流群(代码视频均在群公告),轻量化问题都可在群交流,因为剪枝蒸馏问题比较困难,所以剪枝蒸馏问题可以群里提问,我都会群里回复相关问题. 4. 基于Ultralytics的RT-DETR(CVPR2024)改进项目.(89.9¥) [目前已有的改进方案和更新详细公告](https://github.com/z1069614715/objectdetection_script/blob/master/yolo-improve/rtdetr-project.md) 项目简单介绍,详情请看项目详解. 1. 提供修改好的代码和每个改进点的配置文件,相当于积木都给大家准备好,大家只需要做实验和搭积木(修改yaml配置文件组合创新点)即可,装好环境即可使用. 2. 后续的改进方案都会基于这个项目更新进行发布,在群公告进行更新百度云链接. 3. 购买了RT-DETR项目的都会赠送yolov5-PAGCP通道剪枝算法代码和相关实验参数命令. 4. 购买后进RT-DETR交流群(代码视频均在群公告),群里可交流代码和论文相关. 5. 项目因为(价格问题)不附带一对一私人答疑服务,群里附带答疑服务,平时我有时间都会回复群里部分问题. 6. RT-DETR项目包含多种基准模型改进方案(RT-DETR-R18,RT-DETR-R50,RT-DETR-L,Yolov8-Detr,Yolov5-Detr),具体可点击[目前已有的改进方案和更新详细公告](https://github.com/z1069614715/objectdetection_script/blob/master/yolo-improve/rtdetr-project.md)看详细. 7. 里面配备使用说明(部分改进点使用复杂度高、二次创新、原创的模块都会有对应的视频进行说明) 5. 基于YOLOV8V10V11V12的剪枝蒸馏项目. 注意: 1. 本次项目就直接提供几个文件,到时候会提供教程,自行复制到项目一/二上即可跑,原理上其他版本应该也可以跑,但是开发的时候我是基于项目一/二的(ultralytics版本号:v8.1.9、v8.2.50、v8.3.1)上开发的,附近的版本的话应该也可以跑,但是没办法一一验证,所以需自行考虑! 2. 里面会提供一个官方纯净版的(ultralytics版本号:8.1.9、8.2.50、8.3.1、8.3.78)的ultralytics以及其对应的剪枝蒸馏代码,以便没有购买项目一/二的同学使用。 剪枝:[项目详解](https://github.com/z1069614715/objectdetection_script/blob/master/yolo-improve/yolov8-compress.md)(89.9¥) 1. 模型轻量化,部署,大论文堆工作量必备之一! 2. 
项目里面配套剪枝示例(示例中是基于项目一/二的改进代码进行剪枝,如没有入手项目一/二是不包含这部分代码的,但对你理解剪枝操作没影响),并且都配有视频讲解,供大家理解如何进行剪枝. 3. 购买后进YOLOV8V10V11V12剪枝交流群(代码视频均在群公告),因为剪枝操作有一定的难度,所以剪枝问题可以群里提问,我都会群里回复相关问题. 4. 支持yolov8中的目标检测、实例分割、姿态检测、旋转目标检测剪枝、yolov10目标检测剪枝、yolo11/12(目标检测、实例分割、姿态检测、旋转目标检测剪枝)。 蒸馏:[项目详解](https://github.com/z1069614715/objectdetection_script/blob/master/yolo-improve/yolov8-distill.md)(89.9¥) 1. 模型轻量化,部署,大论文堆工作量必备之一! 2. 项目里面配套蒸馏示例(部分示例中是基于项目一/二的改进代码进行蒸馏,如没有入手项目一/二是不包含这部分代码的,但对你理解蒸馏操作没影响),并且都配有视频讲解,供大家理解如何进行蒸馏. 3. 购买后进YOLOV8V10V11V12蒸馏交流群(代码视频均在群公告),因为蒸馏操作有一定的难度,所以蒸馏操作问题可以群里提问,我都会群里回复相关问题. 4. 支持yolov8中的目标检测、实例分割、姿态检测、旋转目标检测蒸馏、yolov10目标检测蒸馏、yolo11/12(目标检测、实例分割、姿态检测、旋转目标检测蒸馏)。 5. 实例分割、姿态检测、旋转目标检测暂不支持BCKD蒸馏方法. 6. 基于Ultralytics的RT-DETR(CVPR2024)的剪枝蒸馏项目. 注意:基于Ultralytics的RT-DETR的剪枝蒸馏项目是基于项目四上进行开发的,所以入手剪枝蒸馏项目也需要项目四才能使用。 剪枝:[项目详解](https://github.com/z1069614715/objectdetection_script/blob/master/yolo-improve/rtdetr-compress.md)(89.9¥) 1. 模型轻量化,部署,大论文堆工作量必备之一! 2. 项目里面配套剪枝示例(包含一些项目四中的改进模型的剪枝教程),并且都配有视频讲解,供大家理解如何进行蒸馏. 3. 购买后进RTDETR剪枝交流群(代码视频均在群公告),因为剪枝操作有一定的难度,所以剪枝操作问题可以群里提问,我都会群里回复相关问题. 4. 经过我目前的实验,rtdetr很难进行稀疏训练,因此本项目目前不包含稀疏训练的剪枝方法,如果一定要进行稀疏训练的剪枝慎入,目前项目包含6种不需要稀疏训练方法的剪枝. 蒸馏:[项目详解](https://github.com/z1069614715/objectdetection_script/blob/master/yolo-improve/rtdetr-distill.md)(69.9¥) 1. 模型轻量化,部署,大论文堆工作量必备之一! 2. 项目里面配套蒸馏示例,并且都配有视频讲解,供大家理解如何进行蒸馏. 3. 购买后进RTDETR蒸馏交流群(代码视频均在群公告),因为蒸馏操作有一定的难度,所以蒸馏操作问题可以群里提问,我都会群里回复相关问题. 4. 知识蒸馏整体修改难度大,代表少人使用,物以稀为贵,增加文章的创新度! 7. 基于CVPR2025-DEIM的改进项目.(288¥) 项目详细介绍请看[此处](https://github.com/z1069614715/objectdetection_script/blob/master/cvpr2025-deim-project.md) 1. 相比官方有更多分析的图表,基本论文常用到的都有.(YOLO指标、FPS、模型大小、COCO指标中的每类tsml等等指标、热力图、特征图、漏检误检可视化....) 2. 总所周知DETR系列模型检测头非常难改,需要代码功底和一定知识存储才能改,但本项目有DETR检测头的改进,并且还有视频讲解整体实现原理. 3. 此项目有一些模型创新课题的视频,由我整理一下比较新且有创新空间的模块和讲解视频,想学模块创新一定不可错过. 4. 相比官方的代码修复了很多存在的bug,做科研没有一个稳定的代码框架怎么行呢? 5. 目前包含学生-教师类型的知识蒸馏、模型导出(onnx、tensorrt)、ByteTrack目标跟踪等凑工作量的内容,大小论文一网打尽~ 6. 支持实例分割,给实例分割的同学们多了一个非常nice的选择~ 7. 支持DINOV3主干,即使数据量少,得益于DINOV3性能依然抗打~ 8. 更多请点击上述链接进行查看~ 8. 
基于YOLO|RTDETR多模态目标检测项目.(原价288¥,若已购买yolo8101112或rtdetr项目的则优惠50¥=238¥) 项目详细介绍请看[此处](https://github.com/z1069614715/objectdetection_script/blob/master/mutilmodel-project.md) 9. Ultralytics-YOLO改进项目.(99¥) 项目详细介绍请看[此处](https://github.com/z1069614715/objectdetection_script/blob/master/Ultralytics-YOLO-project.md) 1. 本项目集成了YOLOv8、v10、v11、v12乃至前沿的YOLO26等全系列基础模型。 无论是做横向对比实验,还是纵向的版本改进,无需到处找资源,一个项目就能满足你所有的实验需求! 2. 核心代码已实现高度模块化与解耦,专为新手优化。 你完全不需要死磕底层复杂代码,只需像搭积木一样简单修改YAML配置文件,就能轻松实现各种改进模块的自由组合。 3. 面对日益内卷的YOLO赛道,简单的“缝合”已难满足毕业要求。 本项目不仅提供现成的创新方案,更配套独家“二次创新”课程,授人以渔。我们将手把手教你掌握模块设计的底层逻辑,助你从“模仿者”进阶为“创造者”,设计出独属于你的创新模块。 4. 针对有代码基础但受困于Ultralytics复杂架构的同学, 本项目引入了来自DFine、DEIM项目中成熟的“万物皆可融”架构思想。你无需纠结模块注册等信息,只需遵循我所提供的标准接口规范,即可将自定义魔改模块无缝融入YAML配置,与各类CSP变种灵活结合。 5. 实验跑通了,却不知道如何写创新点? 本项目将定期拆解高分论文,传授写作心法,教你如何将实验成果转化为逻辑严密、亮点突出的高质量学术论文,解决写作难题! 6. 毕业设计缺少高大上的展示界面? 别担心,项目会内置基于PyQt或HTML的通用可视化界面,开箱即用,完美补齐毕业论文的最后一块拼图,助你从容应对答辩! 7. 购买即享专属技术交流群, 这里有业内公认的高效答疑服务,以及志同道合的伙伴互助交流。拒绝闭门造车,让我们带你避开深坑,高效通关! **注意:部分功能在项目初期可能尚未实现,将随着项目的持续开发逐步补齐完善。** 10. 基于YOLO和RT-DETR的论文全流程指导项目.(原价238¥,若已购买yolo8101112或rtdetr项目或deim项目的则优惠50¥=188¥)[项目详解](https://github.com/z1069614715/objectdetection_script/blob/master/yolo-improve/paper.md) 我们目前有非常多的代码项目,几乎是全网最全价格最优惠性格比最高的一家,但是难免有些同学在做完实验后还是完全不懂应该怎么去写or不想走太多弯路的情况,因此开展这个基于YOLO和RT-DETR的论文全流程指导项目,本项目致力于帮助那些在论文道路上极其困难的同学,基本上配合上述的一些改进项目和此论文全流程指导项目再加上自己的一点努力可以完全实现毕业无忧,项目简介如下: 1. 直播内容涉及到发论文的整个论文框架体系的方方面面,每次直播都会优先讲大家最想听的部分,根据课程目录投票决定。 2. 直播答疑每个人的问题,上课前会使用excel表格在线收集大家的问题,直播时集中讲解。 3. 直播的回放视频会实时上传到百度网盘,并且视频均为加密视频,一人一机一码,且课程目录的每部分对应检索直播回放视频链接方便大家后续查找,实时更新百度网盘链接内容和使用说明文档。 4. 购买后进论文指导交流群(视频均在群公告),群里可交流论文相关。 5. 项目不附带私人答疑服务,群里附带答疑服务,平时我有时间都会回复群里部分问题。 6. 不定时收集群友反馈,有问题可以在群内随时提出,逐步完善课程体系,让大家高效快速发出论文。 7. 项目有效期为一年,时间从付费进群那天开始算,例如我2024年5月2日进群,2025年5月2日到期,一年时间足以解决所有论文相关的问题。 8. 项目公开课试听B站链接1:[长达80分钟的<论文中对比实验+消融实验+论文工作量创新点评估+答疑>解答直播回放来啦~](https://www.bilibili.com/video/BV1u5rCYmE4k/) 9. 项目公开课试听B站链接2:[长达60分钟的<实验向论文过渡指导+论文写作顺序+创新性评估+公开答疑>直播回放来啦~](https://www.bilibili.com/video/BV1oJPueREfR/) 10. 
项目公开课试听B站链接3:[长达2小时的论文高效画图专题全面剖析:数据可视化+模型图绘制+实验数据分析图+答疑直播,全程高能!!!!](https://www.bilibili.com/video/BV1xEEEzZEUs) ## 导购指南 不知道怎么选?按你的目标直接对号入座: ### 1. 只求毕业,期刊无硬性要求 - 推荐项目:**1、2、9(推荐项目9,性价比最高)** - 适合人群:希望快速跑通实验、以“稳妥毕业”为第一目标。 - 标签:`上手快` `性价比高` `代码投入低` `训练速度快` ### 2. 有期刊要求,但不想深钻代码 - 推荐项目:**4** - 适合人群:希望做出有区分度的实验,但不希望在底层代码上投入过多时间。 - 标签:`上手快` `DETR发论文友好` `代码投入低` ### 3. 追逐热点,愿意学代码,追求创新,冲刺SCI - 推荐项目:**7、8** - 适合人群:愿意投入更多时间做方法创新、实验分析和前沿方向探索。 - 标签:`前沿热点` `创新空间大` `冲刺高区SCI` ### 4. 大论文需要凑工作量 + 有部署需求 - 推荐项目:**5、6** - 适合人群:希望同时覆盖“剪枝/蒸馏/部署”链路,补齐论文工作量与落地内容。 - 标签:`大论文工作量充足` `部署导向` `实用性强` - 注意:项目 6 基于项目 4 开发,需配合项目 4 使用。 ### 5. 实验做完后,论文完全不会写 - 推荐项目:**10** - 适合人群:实验已完成,但论文结构、创新表述、图表组织和写作流程缺少方法。 - 标签:`写作指导` `答疑导向` `适合论文收尾` ## 如果上述项目还不能满足您的需求,我们这里还有专业AI算法定制~ ![Advertising Board](https://github.com/z1069614715/objectdetection_script/blob/master/Customization.png) ## GPU服务器推荐 为了让大家在科研路上一路畅通、降低初期上手难度、并且降低大家租服务器的成本,这边联合多个平台提供一个稳定、快速、便宜的服务器租用平台给大家,经过多次沟通,在我的链接上注册or充值可以给到大家福利如下: ---------------------------------------- 智算云扉 ---------------------------------------- 1. 价格非常优惠,几乎全网最低。3090:0.99/h,4090d最低:1.18/h,4090-24GB:最低1.78/h,4090D-48G:2.52/h,4090-48GB:3.19/h 2. 使用我的专属优惠码进行充值可以额外获取百分之5的算力点。举个例子:我要充100,本来我只能得100算力点,使用我的优惠码后,可以得到105算力点!下单链接:https://waas.aigate.cc/user/charge?channel=BLBLMGMJ&coupon=DLJGKNBEE1 或者手动填优惠码:DLJGKNBEE1,点击验证即可。优惠码界面在充值入口里面 3. 智算云扉平台上,我已经提供好我自己改进项目的专属镜像、镜像里面会给大家配置好环境、并且相对应需要编译的模型都会给大家配置好、真正实现上传数据集和代码立刻开跑!跑实验也快人一步!直接在镜像社区/云扉工坊搜索yolo关键词就可以看到。 4. 智算云扉平台上,我为大家提供了一些常用的数据集,并且格式已经转换好,包含COCO2017,VOC2007+2012,CrowdHuman,Visdrone2019,BDD100K. 5. 支持无卡模式开机、支持绑定百度云账号,直接把网盘的内容秒传到云磁盘,省下数据集上传的时间! 6. 可以通过qq搜索以下群号:798692951,添加智算云扉平台交流群,里面有智算云扉官方的客服帮助大家答疑相关平台的问题! 7. B站视频教程:https://www.bilibili.com/video/BV11DXTYiENS/ 8. 20260114更新:数据集的位置有所变动,请看这期视频:https://www.bilibili.com/video/BV1TDrLBfEr7/ ---------------------------------------- DAModel ---------------------------------------- 1. 
在DAModel平台现有优惠折扣的基础上,额外加上(按需95折、包日97折、包月99折优惠),
- **mmdet-course** mmdet-course文件夹是提供mmdet教程相关资料,具体可看[readme.md](https://github.com/z1069614715/objectdetection_script/blob/master/mmdet-course/readme.md) - **data-offline-aug** data-offline-aug文件夹是关于图像任务的离线数据增强脚本,具体可看[readme.md](https://github.com/z1069614715/objectdetection_script/blob/master/data-offline-aug/readme.md) [![Forkers repo roster for @z1069614715/objectdetection_script](https://reporoster.com/forks/z1069614715/objectdetection_script)](https://github.com/z1069614715/objectdetection_script/network/members) [![Stargazers repo roster for @z1069614715/objectdetection_script](https://reporoster.com/stars/z1069614715/objectdetection_script)](https://github.com/z1069614715/objectdetection_script/stargazers) # Star History [![Star History Chart](https://api.star-history.com/svg?repos=z1069614715/objectdetection_script&type=Date)](https://star-history.com/#z1069614715/objectdetection_script&Date) ================================================ FILE: visdrone2019-benchmark/readme.md ================================================ # VisDrone2019 Testset Benchmark ### Visdrone2019 测试集(1610张图) COCO指标 (有需要使用对比实验数据的同学可以直接用) ### Jetson Orin Nano 4G TensorRT(8.6.2) FP16 BatchSize=1 ### RTX4090D TensorRT(10.11.0) FP16 BatchSize=1 ![Visdrone2019 Benchmark](https://github.com/z1069614715/objectdetection_script/blob/master/visdrone2019-benchmark/visdrone_ap_gflops_params_bubble.svg) | model | Input Shape | GFlops | Params | Ap | Ap50 | APs | APm | APl | FPS(Jetson Orin Nano 4G) | FPS(RTX4090D) | | :----: | :----: | :----: | :----: | :----: | :----: | :----: | :----: | :----: | :----: | :----: | | Faster-RCNN-R50-FPN-CIOU | (768, 1344) | 208G | 41.39M | 0.194 | 0.329 | 0.095 | 0.309 | 0.429 | - | - | | Cascade-RCNN-R50-FPN | (768, 1344) | 236G | 69.29M | 0.197 | 0.326 | 0.099 | 0.309 | 0.406 | - | - | | ATSS-R50-FPN-DyHead | (768, 1344) | 110G | 38.91M | 0.204 | 0.338 | 0.100 | 0.317 | 0.485 | - | - | | TOOD-R50 | (768, 1344) | 199G | 32.04M | 0.204 | 0.339 | 0.102 | 
0.317 | 0.403 | - | - | | DINO | (750, 1333) | 274G | 47.56M | 0.253 | 0.445 | 0.150 | 0.371 | 0.503 | - | - | | DDQ | (768, 1333) | - | - | 0.268 | 0.463 | 0.159 | 0.390 | 0.526 | - | - | | YOLOX-Tiny | (640, 640) | 7.578G | 5.035M | 0.148 | 0.278 | 0.076 | 0.221 | 0.278 | - | - | | GFL | (768, 1344) | 206G | 32.279M | 0.193 | 0.321 | 0.094 | 0.300 | 0.409 | - | - | | RTMDet-Tiny | (640, 640) | 8.033G | 4.876M | 0.184 | 0.312 | 0.077 | 0.288 | 0.445 | - | - | | RetinaNet-R50-FPN | (768, 1344) | 210G | 36.517M | 0.164 | 0.276 | 0.060 | 0.274 | 0.427 | - | - | | RTDETR-R18(Ultralytics版本实现) | (640, 640) | 57G | 19.885M | 0.208 | 0.363 | 0.113 | 0.305 | 0.413 | 28.3 | 889.75 | | D-Fine-N | (640, 640) | 7.1238G | 3.73M | 0.183 | 0.334 | 0.093 | 0.270 | 0.442 | 53.5 | 924.63 | | D-Fine-S | (640, 640) | 24.8595G | 10.18M | 0.227 | 0.394 | 0.128 | 0.331 | 0.468 | 29.9 | 696.18 | | D-Fine-M | (640, 640) | 56.3726G | 19.19M | 0.239 | 0.416 | 0.136 | 0.346 | 0.464 | 18.2 | 480.95 | | D-Fine-L | (640, 640) | 90.7205G | 30.67M | 0.244 | 0.421 | 0.137 | 0.353 | 0.522 | - | - | | D-Fine-L-4scale(P2345) | (640, 640) | 214.587G | 33.75M | 0.270 | 0.459 | 0.165 | 0.380 | 0.521 | - | - | | D-Fine-Dinov3(ConvNext-Tiny)-L | (640, 640) | 117.212G | 44.41M | 0.244 | 0.424 | 0.133 | 0.361 | 0.496 | 7.7 | 411.95 | | D-Fine-Dinov3(ConvNext-Tiny)-L-4scale(P2345) | (640, 640) | 152.504G | 41.18M | 0.284 | 0.480 | 0.178 | 0.398 | 0.526 | - | - | | DEIM-D-Fine-N | (640, 640) | 7.1238G | 3.73M | 0.177 | 0.322 | 0.090 | 0.262 | 0.376 | 53.5 | 924.63 | | DEIM-D-Fine-S | (640, 640) | 24.8595G | 10.18M | 0.219 | 0.384 | 0.122 | 0.321 | 0.397 | 29.9 | 696.18 | | DEIM-D-Fine-M | (640, 640) | 56.3726G | 19.19M | 0.242 | 0.417 | 0.139 | 0.344 | 0.485 | 18.2 | 480.95 | | DEIMV2-S | (640, 640) | 25.3903G | 9.67M | 0.204 | 0.363 | 0.109 | 0.299 | 0.451 | 16.5 | 569.92 | | RTDETR-R18(官方pytorch版本) | (640, 640) | 60G | 20M | 0.185 | 0.333 | 0.139 | 0.275 | 0.423 | - | - | | RTDETRV2-R18(官方pytorch版本) | (640, 
640) | 60G | 20M | 0.222 | 0.391 | 0.127 | 0.321 | 0.456 | - | - | | YOLOV5n | (640, 640) | 4.2G | 1.77M | 0.099 | 0.205 | 0.046 | 0.154 | 0.231 | - | - | | YOLOV5s | (640, 640) | 15.8G | 7.04M | 0.130 | 0.257 | 0.062 | 0.201 | 0.259 | - | - | | YOLOV5m | (640, 640) | 48.0G | 20.89M | 0.152 | 0.288 | 0.073 | 0.233 | 0.306 | - | - | | YOLO8n | (640, 640) | 8.1G | 3.0M | 0.144 | 0.259 | 0.059 | 0.225 | 0.339 | - | 2114.04 | | YOLO8n | (960, 960) | 18.5G | 3.0M | 0.192 | 0.333 | 0.099 | 0.288 | 0.377 | - | 1506.86 | | YOLO8s | (640, 640) | 28.5G | 11.13M | 0.173 | 0.307 | 0.078 | 0.269 | 0.372 | - | 1607.19 | | YOLO8s | (960, 960) | 64.5G | 11.13M | 0.224 | 0.386 | 0.123 | 0.333 | 0.441 | - | 1128.2 | | YOLO8m | (640, 640) | 78.7G | 25.85M | 0.190 | 0.332 | 0.090 | 0.294 | 0.417 | - | 924.37 | | YOLO10n | (640, 640) | 6.5G | 2.28M | 0.142 | 0.261 | 0.063 | 0.224 | 0.292 | - | 1694.1 | | YOLO10s | (640, 640) | 21.4G | 7.22M | 0.179 | 0.323 | 0.086 | 0.278 | 0.361 | - | 1336.88 | | YOLO10m | (640, 640) | 58.9G | 15.32M | 0.195 | 0.345 | 0.097 | 0.300 | 0.414 | - | 842.27 | | YOLO11n | (640, 640) | 6.3G | 2.59M | 0.142 | 0.258 | 0.058 | 0.225 | 0.316 | 94.2 | 1425.91 | | YOLO11s | (640, 640) | 21.3G | 9.42M | 0.176 | 0.313 | 0.080 | 0.272 | 0.364 | 56.4 | 1171.25 | | YOLO11m | (640, 640) | 67.7G | 20.04M | 0.203 | 0.350 | 0.098 | 0.312 | 0.413 | 28.9 | 752.8 | | YOLO12n | (640, 640) | 6.3G | 2.56M | 0.142 | 0.259 | 0.057 | 0.224 | 0.346 | - | 1133.07 | | YOLO12s | (640, 640) | 21.2G | 9.23M | 0.176 | 0.312 | 0.081 | 0.274 | 0.356 | - | 901.36 | | YOLO12m | (640, 640) | 67.2G | 20.11M | 0.192 | 0.336 | 0.094 | 0.298 | 0.386 | - | 648.88 | | [FBRT-YOLO-N](https://arxiv.org/abs/2504.20670) | (640, 640) | 6.7G | 0.8M | 0.148 | 0.265 | 0.062 | 0.234 | 0.323 | - | - | | [FBRT-YOLO-S](https://arxiv.org/abs/2504.20670) | (640, 640) | 22.9G | 2.9M | 0.183 | 0.323 | 0.085 | 0.283 | 0.425 | - | - | | [FBRT-YOLO-M](https://arxiv.org/abs/2504.20670) | (640, 640) | 58.7G | 7.36M | 
0.196 | 0.344 | 0.094 | 0.309 | 0.421 | - | - | | YOLO13n | (640, 640) | 6.2G | 2.45M | 0.133 | 0.244 | 0.055 | 0.210 | 0.317 | - | - | | YOLO13s | (640, 640) | 20.1G | 9.0M | 0.167 | 0.297 | 0.077 | 0.258 | 0.387 | - | - | | YOLO8m-worldv2 | (640, 640) | 88.1G | 28.36M | 0.186 | 0.326 | 0.085 | 0.288 | 0.419 | - | - | | YOLOE-11m | (640, 640) | 67.7G | 20.04M | 0.195 | 0.339 | 0.092 | 0.301 | 0.427 | - | - | | YOLO26n | (640, 640) | 5.2G | 2.38M | 0.135 | 0.249 | 0.063 | 0.203 | 0.291 | - | 1495.93 | | YOLO26n | (960, 960) | 11.7G | 2.38M | 0.185 | 0.322 | 0.100 | 0.271 | 0.377 | - | 1197 | | YOLO26s | (640, 640) | 20.5G | 9.47M | 0.160 | 0.294 | 0.082 | 0.240 | 0.362 | - | 1229.47 | | YOLO26m | (640, 640) | 67.9G | 20.36M | 0.186 | 0.332 | 0.096 | 0.281 | 0.361 | - | 866.74 | ================================================ FILE: yolo/data.yaml ================================================ # dataset path train: ./dataset/images/train val: ./dataset/images/val test: ./dataset/images/test # number of classes nc: # class names names: [] ================================================ FILE: yolo/dataset/VOCdevkit/Annotations/ReadMe.md ================================================ # 存放VOC标注格式的文件夹 ================================================ FILE: yolo/dataset/VOCdevkit/JPEGImages/ReadMe.md ================================================ # 存放图像的文件夹 ================================================ FILE: yolo/dataset/VOCdevkit/txt/ReadMe.md ================================================ # 存放YOLO标注格式的文件夹 ================================================ FILE: yolo/dataset/split_data.py ================================================ import os, shutil, random random.seed(0) import numpy as np from sklearn.model_selection import train_test_split val_size = 0.1 test_size = 0.2 postfix = 'jpg' imgpath = 'VOCdevkit/JPEGImages' txtpath = 'VOCdevkit/txt' os.makedirs('images/train', exist_ok=True) os.makedirs('images/val', exist_ok=True) os.makedirs('images/test', 
exist_ok=True) os.makedirs('labels/train', exist_ok=True) os.makedirs('labels/val', exist_ok=True) os.makedirs('labels/test', exist_ok=True) listdir = np.array([i for i in os.listdir(txtpath) if 'txt' in i]) random.shuffle(listdir) train, val, test = listdir[:int(len(listdir) * (1 - val_size - test_size))], listdir[int(len(listdir) * (1 - val_size - test_size)):int(len(listdir) * (1 - test_size))], listdir[int(len(listdir) * (1 - test_size)):] print(f'train set size:{len(train)} val set size:{len(val)} test set size:{len(test)}') for i in train: shutil.copy('{}/{}.{}'.format(imgpath, i[:-4], postfix), 'images/train/{}.{}'.format(i[:-4], postfix)) shutil.copy('{}/{}'.format(txtpath, i), 'labels/train/{}'.format(i)) for i in val: shutil.copy('{}/{}.{}'.format(imgpath, i[:-4], postfix), 'images/val/{}.{}'.format(i[:-4], postfix)) shutil.copy('{}/{}'.format(txtpath, i), 'labels/val/{}'.format(i)) for i in test: shutil.copy('{}/{}.{}'.format(imgpath, i[:-4], postfix), 'images/test/{}.{}'.format(i[:-4], postfix)) shutil.copy('{}/{}'.format(txtpath, i), 'labels/test/{}'.format(i)) ================================================ FILE: yolo/dataset/xml2txt.py ================================================ import xml.etree.ElementTree as ET import os, cv2 import numpy as np from os import listdir from os.path import join classes = [] def convert(size, box): dw = 1. / (size[0]) dh = 1. 
/ (size[1]) x = (box[0] + box[1]) / 2.0 - 1 y = (box[2] + box[3]) / 2.0 - 1 w = box[1] - box[0] h = box[3] - box[2] x = x * dw w = w * dw y = y * dh h = h * dh return (x, y, w, h) def convert_annotation(xmlpath, xmlname): with open(xmlpath, "r", encoding='utf-8') as in_file: txtname = xmlname[:-4] + '.txt' txtfile = os.path.join(txtpath, txtname) tree = ET.parse(in_file) root = tree.getroot() filename = root.find('filename') img = cv2.imdecode(np.fromfile('{}/{}.{}'.format(imgpath, xmlname[:-4], postfix), np.uint8), cv2.IMREAD_COLOR) h, w = img.shape[:2] res = [] for obj in root.iter('object'): cls = obj.find('name').text if cls not in classes: classes.append(cls) cls_id = classes.index(cls) xmlbox = obj.find('bndbox') b = (float(xmlbox.find('xmin').text), float(xmlbox.find('xmax').text), float(xmlbox.find('ymin').text), float(xmlbox.find('ymax').text)) bb = convert((w, h), b) res.append(str(cls_id) + " " + " ".join([str(a) for a in bb])) if len(res) != 0: with open(txtfile, 'w+') as f: f.write('\n'.join(res)) if __name__ == "__main__": postfix = 'jpg' imgpath = 'VOCdevkit/JPEGImages' xmlpath = 'VOCdevkit/Annotations' txtpath = 'VOCdevkit/txt' if not os.path.exists(txtpath): os.makedirs(txtpath, exist_ok=True) list = os.listdir(xmlpath) error_file_list = [] for i in range(0, len(list)): try: path = os.path.join(xmlpath, list[i]) if ('.xml' in path) or ('.XML' in path): convert_annotation(path, list[i]) print(f'file {list[i]} convert success.') else: print(f'file {list[i]} is not xml format.') except Exception as e: print(f'file {list[i]} convert error.') print(f'error message:\n{e}') error_file_list.append(list[i]) print(f'this file convert failure\n{error_file_list}') print(f'Dataset Classes:{classes}') ================================================ FILE: yolo/readme.md ================================================ # YOLOV5,YOLOV7,YOLOV8的数据集处理文件 本目录下的脚本是针对与yolov5,v7,v8的数据集处理脚本,支持如下: 1. VOC标注格式转换为YOLO标注格式。 2. 对数据集进行划分训练集,验证集,测试集。 # VOC标注格式数据集使用示例 1. 
把图片存放在dataset\VOCdevkit\JPEGImages中,图片后缀需要一致,比如都是jpg或者png等等,不支持混合的图片后缀格式,比如一些是jpg,一些是png。 2. 把VOC标注格式的XML文件存放在dataset\VOCdevkit\Annotations中。 3. 运行xml2txt.py,在这个文件中其会把Annotations中的XML格式标注文件转换到txt中的yolo格式标注文件。其中xml2txt.py中的postfix参数是JPEGImages的图片后缀,修改成图片的后缀即可,默认为jpg。比如我的图片都是png后缀的,需要把postfix修改为png即可。其中运行这个文件的时候,输出信息会输出你的数据集的类别,你需要把类别列表复制到data.yaml中的names中,并且修改nc为你的类别数,也就是names中类别个数。 4. 运行split_data.py,这个文件是划分训练、验证、测试集。其中支持修改val_size**验证集比例**和test_size**测试集比例**,可以在split_data.py中找到对应的参数进行修改,然后postfix参数也是你的图片数据集后缀格式,默认为jpg,如果你的图片后缀不是jpg结尾的话,需要修改一下这个参数。 # YOLO标注格式数据集使用示例 1. 把图片存放在dataset\VOCdevkit\JPEGImages中,图片后缀需要一致,比如都是jpg或者png等等,不支持混合的图片后缀格式,比如一些是jpg,一些是png。 2. 把YOLO标注格式的TXT文件存放在dataset\VOCdevkit\txt中。 3. 运行split_data.py,这个文件是划分训练、验证、测试集。其中支持修改val_size**验证集比例**和test_size**测试集比例**,可以在split_data.py中找到对应的参数进行修改,然后postfix参数也是你的图片数据集后缀格式,默认为jpg,如果你的图片后缀不是jpg结尾的话,需要修改一下这个参数。 4. 在data.yaml中的names设置你的类别,其为一个list,比如我的YOLO标注格式数据集中,0代表face,1代表body,那在data.yaml中就是names:['face', 'body'],然后nc:2,nc就是类别个数。 ================================================ FILE: yolo-gradcam/README.md ================================================ # yolo-gradcam yolo model with gradcam visual. 即插即用,不需要对源码进行任何修改! ## 哔哩哔哩视频教学地址 1. yolov5-[哔哩哔哩地址](https://www.bilibili.com/video/BV1F6421V77v/) 2. yolov7-[哔哩哔哩地址](https://www.bilibili.com/video/BV1F6421V77v/) 3. yolov8-[哔哩哔哩地址](https://www.bilibili.com/video/BV1T2N6eaEFD/) 4. yolov9-[哔哩哔哩地址](https://www.bilibili.com/video/BV14H4y157MP/) 5. yolov11-[哔哩哔哩地址](https://www.bilibili.com/video/BV1T2N6eaEFD/) ## 环境 pip install grad-cam==1.4.8 -i https://pypi.tuna.tsinghua.edu.cn/simple ## 注意事项 1. yolov5是在v7.0进行编写和测试的。 2. yolov7是在2023.10.1号的版本进行编写和测试的。 3. yolov8是在2024.1.31号的版本进行编写和测试的。 4. yolov9是在2024.3.7号的版本进行编写和测试的。 5. 
def letterbox(im, new_shape=(640, 640), color=(114, 114, 114), auto=True, scaleFill=False, scaleup=True, stride=32):
    """Resize *im* to fit ``new_shape`` and pad the remainder with *color*.

    Args:
        im: Image array whose first two dimensions are (height, width).
        new_shape: Target ``(height, width)``; an int means a square target.
        color: Border colour passed to ``cv2.copyMakeBorder``.
        auto: When true, the padding is reduced to its remainder modulo *stride*.
        scaleFill: When true (and *auto* is false), stretch to ``new_shape``
            without padding.
        scaleup: When false, the image is never enlarged.
        stride: Modulus used by the *auto* padding.

    Returns:
        ``(image, ratio, (top, bottom, left, right))`` where *ratio* is the
        (width, height) scale pair and the last tuple is the border added on
        each side in pixels.
    """
    src_h, src_w = im.shape[:2]
    if isinstance(new_shape, int):
        new_shape = (new_shape, new_shape)
    dst_h, dst_w = new_shape[0], new_shape[1]

    # Scale ratio (new / old), limited by the tighter of the two dimensions.
    scale = min(dst_h / src_h, dst_w / src_w)
    if not scaleup:
        scale = min(scale, 1.0)

    ratio = scale, scale
    new_unpad = int(round(src_w * scale)), int(round(src_h * scale))
    pad_w = dst_w - new_unpad[0]
    pad_h = dst_h - new_unpad[1]
    if auto:
        # Minimum rectangle: only pad up to the next multiple of stride.
        pad_w, pad_h = np.mod(pad_w, stride), np.mod(pad_h, stride)
    elif scaleFill:
        # Stretch: no padding, independent width/height ratios.
        pad_w, pad_h = 0.0, 0.0
        new_unpad = (dst_w, dst_h)
        ratio = dst_w / src_w, dst_h / src_h

    # Split the padding between the two sides.
    pad_w /= 2
    pad_h /= 2

    if (src_w, src_h) != new_unpad:
        im = cv2.resize(im, new_unpad, interpolation=cv2.INTER_LINEAR)

    # The +/-0.1 nudges send an odd pixel of padding to the bottom/right side.
    top = int(round(pad_h - 0.1))
    bottom = int(round(pad_h + 0.1))
    left = int(round(pad_w - 0.1))
    right = int(round(pad_w + 0.1))
    im = cv2.copyMakeBorder(im, top, bottom, left, right, cv2.BORDER_CONSTANT, value=color)
    return im, ratio, (top, bottom, left, right)
def post_process(self, result):
    """Split the raw model output into per-candidate tensors sorted by score.

    The layout that is read depends on ``self.model.end2end`` and
    ``self.model.task``. Only the first batch element is returned, with
    candidates ordered by descending confidence.

    Returns:
        end2end / detect: ``(scores, boxes)``.
        segment: ``(scores, boxes, masks)``.
        pose / obb: ``(scores, boxes, extra)`` where *extra* holds the channels
        after the class scores.
        classify: ``result[0]``.
        Any other task: ``None``.
    """
    model = self.model

    def ranked(channels_first, order):
        # (channels, candidates) -> (candidates, channels), rows reordered.
        return torch.transpose(channels_first, dim0=0, dim1=1)[order]

    if model.end2end:
        scores = result[:, :, 4:]
        boxes = result[:, :, :4]
        order = torch.sort(scores[:, :, 0], descending=True)[1][0]
        return scores[0][order], boxes[0][order]

    task = model.task

    if task == 'detect':
        scores = result[:, 4:]
        boxes = result[:, :4]
        order = torch.sort(scores.max(1)[0], descending=True)[1][0]
        return ranked(scores[0], order), ranked(boxes[0], order)

    if task == 'segment':
        scores = result[0][:, 4:4 + model.nc]
        boxes = result[0][:, :4]
        protos = result[1][2].squeeze()
        coeffs = result[1][1].squeeze().transpose(1, 0)
        channels, _, _ = protos.size()
        masks = coeffs @ protos.view(channels, -1)
        order = torch.sort(scores.max(1)[0], descending=True)[1][0]
        return ranked(scores[0], order), ranked(boxes[0], order), masks[order]

    if task in ('pose', 'obb'):
        scores = result[:, 4:4 + model.nc]
        boxes = result[:, :4]
        extra = result[:, 4 + model.nc:]
        order = torch.sort(scores.max(1)[0], descending=True)[1][0]
        return ranked(scores[0], order), ranked(boxes[0], order), ranked(extra[0], order)

    if task == 'classify':
        return result[0]

    return None
class yolo_detect_target(torch.nn.Module):
    """CAM target that sums selected outputs of the highest-scoring detections.

    Args:
        ouput_type: ``'class'``, ``'box'`` or ``'all'`` - which outputs are
            added to the scalar that gets back-propagated.
        conf: Candidates are consumed in order until one scores below this.
        ratio: Fraction of the candidates that is inspected at most.
        end2end: When true, the score is column 0 of a candidate's row instead
            of the row maximum.
    """

    def __init__(self, ouput_type, conf, ratio, end2end) -> None:
        super().__init__()
        self.ouput_type = ouput_type
        self.conf = conf
        self.ratio = ratio
        self.end2end = end2end

    def _selected(self, kind):
        """Return True when *kind* is requested, directly or through 'all'."""
        return self.ouput_type in (kind, 'all')

    def _terms(self, score, box):
        """Collect the scalars one candidate contributes to the target sum.

        The two checks are independent on purpose: with the former if/elif
        chain the box branch was unreachable for 'all', which therefore behaved
        exactly like 'class'.
        """
        terms = []
        if self._selected('class'):
            terms.append(score)
        if self._selected('box'):
            terms.extend(box[j] for j in range(4))
        return terms

    def forward(self, data):
        """Return the sum of the selected terms of the leading candidates.

        Args:
            data: ``(post_result, pre_post_boxes)``; rows are candidates, which
                are expected to arrive sorted by descending score.

        Returns:
            The summed terms; the plain int ``0`` when no candidate reaches
            ``conf`` (``sum`` of an empty list).
        """
        post_result, pre_post_boxes = data
        result = []
        for i in trange(int(post_result.size(0) * self.ratio)):
            score = post_result[i, 0] if self.end2end else post_result[i].max()
            if float(score) < self.conf:
                break
            result.extend(self._terms(score, pre_post_boxes[i]))
        return sum(result)
class yolo_heatmap:
    """Render CAM heatmaps for an Ultralytics YOLO checkpoint.

    Calling the instance with an image path (file or directory) and an output
    directory writes one heatmap image per input image.
    """

    def __init__(self, weight, device, method, layer, backward_type, conf_threshold, ratio, show_result, renormalize, task, img_size):
        """Load the model and build the CAM method and its target.

        Args:
            weight: Path of the checkpoint handed to ``YOLO``.
            device: Torch device string.
            method: Name of a CAM class imported by this module.
            layer: Indices into ``model.model`` selecting the target layers.
            backward_type: Output selector forwarded to the target object.
            conf_threshold: Confidence threshold for target and prediction.
            ratio: Fraction of candidates the target inspects.
            show_result: Draw the predictions on top of the heatmap.
            renormalize: Restrict the heatmap to the predicted boxes.
            task: One of detect, segment, pose, obb, classify.
            img_size: Side length used for letterboxing.

        Raises:
            Exception: If *task* is not supported.
        """
        device = torch.device(device)
        model_yolo = YOLO(weight)
        model_names = model_yolo.names
        print(f'model class info:{model_names}')
        model = copy.deepcopy(model_yolo.model)
        model.to(device)
        model.info()
        for p in model.parameters():
            p.requires_grad_(True)
        model.eval()

        model.task = task
        if not hasattr(model, 'end2end'):
            model.end2end = False

        if task == 'detect':
            target = yolo_detect_target(backward_type, conf_threshold, ratio, model.end2end)
        elif task == 'segment':
            target = yolo_segment_target(backward_type, conf_threshold, ratio, model.end2end)
        elif task == 'pose':
            target = yolo_pose_target(backward_type, conf_threshold, ratio, model.end2end)
        elif task == 'obb':
            target = yolo_obb_target(backward_type, conf_threshold, ratio, model.end2end)
        elif task == 'classify':
            target = yolo_classify_target(backward_type, conf_threshold, ratio, model.end2end)
        else:
            raise Exception(f"not support task({task}).")

        target_layers = [model.model[l] for l in layer]
        # NOTE(review): eval() turns the configured name into a class object.
        # That is only acceptable while `method` comes from trusted
        # configuration; never feed user-supplied text into it.
        method = eval(method)(model, target_layers)
        method.activations_and_grads = ActivationsAndGradients(model, target_layers, None)

        colors = np.random.uniform(0, 255, size=(len(model_names), 3)).astype(np.int32)
        # Every local above (device, model, target, method, ...) becomes an
        # instance attribute; the other methods rely on that.
        self.__dict__.update(locals())

    def post_process(self, result):
        """Run NMS on *result* and return the detections of the first image."""
        result = non_max_suppression(result, conf_thres=self.conf_threshold, iou_thres=0.65)[0]
        return result

    def draw_detections(self, box, color, name, img):
        """Draw one box and its caption on *img* and return the image."""
        xmin, ymin, xmax, ymax = list(map(int, list(box)))
        # Detection box.
        cv2.rectangle(img, (xmin, ymin), (xmax, ymax), tuple(int(x) for x in color), 2)
        # Caption (class name, confidence) just above the box.
        cv2.putText(img, str(name), (xmin, ymin - 5), cv2.FONT_HERSHEY_SIMPLEX, 0.8, tuple(int(x) for x in color), 2, lineType=cv2.LINE_AA)
        return img

    def renormalize_cam_in_bounding_boxes(self, boxes, image_float_np, grayscale_cam):
        """Normalize the CAM to be in the range [0, 1]
        inside every bounding boxes, and zero outside of the bounding boxes. """
        renormalized_cam = np.zeros(grayscale_cam.shape, dtype=np.float32)
        for x1, y1, x2, y2 in boxes:
            x1, y1 = max(x1, 0), max(y1, 0)
            x2, y2 = min(grayscale_cam.shape[1] - 1, x2), min(grayscale_cam.shape[0] - 1, y2)
            if x2 <= x1 or y2 <= y1:
                # Empty region after clamping: nothing to rescale.
                continue
            renormalized_cam[y1:y2, x1:x2] = scale_cam_image(grayscale_cam[y1:y2, x1:x2].copy())
        renormalized_cam = scale_cam_image(renormalized_cam)
        eigencam_image_renormalized = show_cam_on_image(image_float_np, renormalized_cam, use_rgb=True)
        return eigencam_image_renormalized

    def process(self, img_path, save_path):
        """Compute and save the heatmap of a single image.

        Unreadable images and images for which the CAM call fails are reported
        with a warning and skipped.
        """
        # Image loading. imdecode returns None for undecodable data instead of
        # raising, so both failure modes are funnelled into one check.
        try:
            img = cv2.imdecode(np.fromfile(img_path, np.uint8), cv2.IMREAD_COLOR)
        except Exception:
            img = None
        if img is None:
            print(f"Warning... {img_path} read failure.")
            return
        # Pass auto=False to letterbox to force a fixed square input.
        img, _, (top, bottom, left, right) = letterbox(img, new_shape=(self.img_size, self.img_size), auto=True)
        img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
        img = np.float32(img) / 255.0
        tensor = torch.from_numpy(np.transpose(img, axes=[2, 0, 1])).unsqueeze(0).to(self.device)
        print(f'tensor size:{tensor.size()}')

        try:
            grayscale_cam = self.method(tensor, [self.target])
        except AttributeError as e:
            # NOTE(review): presumably raised when the target yields no tensor
            # (no candidate above the threshold) - confirm against grad-cam.
            print(f"Warning... self.method(tensor, [self.target]) failure.")
            return

        grayscale_cam = grayscale_cam[0, :]
        cam_image = show_cam_on_image(img, grayscale_cam, use_rgb=True)

        pred = self.model_yolo.predict(tensor, conf=self.conf_threshold, iou=0.7)[0]
        if self.renormalize and self.task in ['detect', 'segment', 'pose']:
            cam_image = self.renormalize_cam_in_bounding_boxes(pred.boxes.xyxy.cpu().detach().numpy().astype(np.int32), img, grayscale_cam)
        if self.show_result:
            cam_image = pred.plot(img=cam_image,
                                  conf=True,        # show the confidence
                                  font_size=None,   # None: derived from the image size
                                  line_width=None,  # None: derived from the image size
                                  labels=False,     # whether labels are drawn
                                  )

        # Strip the letterbox padding again.
        cam_image = cam_image[top:cam_image.shape[0] - bottom, left:cam_image.shape[1] - right]
        cam_image = Image.fromarray(cam_image)
        cam_image.save(save_path)

    def __call__(self, img_path, save_path):
        """Process *img_path* (file or directory) into the directory *save_path*.

        An existing *save_path* directory is deleted first.
        """
        # remove dir if exist
        if os.path.exists(save_path):
            shutil.rmtree(save_path)
        # make dir if not exist
        os.makedirs(save_path, exist_ok=True)

        if os.path.isdir(img_path):
            for img_path_ in os.listdir(img_path):
                self.process(f'{img_path}/{img_path_}', f'{save_path}/{img_path_}')
        else:
            self.process(img_path, f'{save_path}/result.png')
model(r'/home/hjj/Desktop/dataset/dataset_coco/coco/images/val2017', 'result') ================================================ FILE: yolo-gradcam/yolov5_heatmap.py ================================================ import warnings warnings.filterwarnings('ignore') warnings.simplefilter('ignore') import torch, yaml, cv2, os, shutil import numpy as np np.random.seed(0) import matplotlib.pyplot as plt from tqdm import trange from PIL import Image from models.yolo import Model from utils.general import intersect_dicts from utils.augmentations import letterbox from utils.general import xywh2xyxy, non_max_suppression from models.experimental import attempt_load from pytorch_grad_cam import GradCAMPlusPlus, GradCAM, XGradCAM, EigenCAM, HiResCAM, LayerCAM, RandomCAM, EigenGradCAM from pytorch_grad_cam.utils.image import show_cam_on_image, scale_cam_image from pytorch_grad_cam.activations_and_gradients import ActivationsAndGradients class ActivationsAndGradients: """ Class for extracting activations and registering gradients from targetted intermediate layers """ def __init__(self, model, target_layers, reshape_transform): self.model = model self.gradients = [] self.activations = [] self.reshape_transform = reshape_transform self.handles = [] for target_layer in target_layers: self.handles.append( target_layer.register_forward_hook(self.save_activation)) # Because of https://github.com/pytorch/pytorch/issues/61519, # we don't use backward hook to record gradients. self.handles.append( target_layer.register_forward_hook(self.save_gradient)) def save_activation(self, module, input, output): activation = output if self.reshape_transform is not None: activation = self.reshape_transform(activation) self.activations.append(activation.cpu().detach()) def save_gradient(self, module, input, output): if not hasattr(output, "requires_grad") or not output.requires_grad: # You can only register hooks on tensor requires grad. 
class yolov5_heatmap:
    """Render CAM heatmaps for a YOLOv5 checkpoint.

    Calling the instance with an image path (file or directory) and an output
    directory writes one heatmap image per input image.
    """

    def __init__(self, weight, device, method, layer, backward_type, conf_threshold, ratio, show_box, renormalize):
        """Load the checkpoint and build the CAM method and its target.

        Args:
            weight: Path of the checkpoint.
            device: Torch device string.
            method: Name of a CAM class imported by this module.
            layer: Indices into ``model.model`` selecting the target layers.
            backward_type: Output selector forwarded to ``yolov5_target``.
            conf_threshold: Confidence threshold for target and NMS.
            ratio: Fraction of candidates the target inspects.
            show_box: Draw the detections on top of the heatmap.
            renormalize: Restrict the heatmap to the detected boxes.
        """
        device = torch.device(device)
        ckpt = torch.load(weight)
        model_names = ckpt['model'].names
        model = attempt_load(weight, device=device)
        for p in model.parameters():
            p.requires_grad_(True)
        model.eval()

        target = yolov5_target(backward_type, conf_threshold, ratio)
        target_layers = [model.model[l] for l in layer]
        # NOTE(review): eval() turns the configured name into a class object.
        # That is only acceptable while `method` comes from trusted
        # configuration; never feed user-supplied text into it.
        method = eval(method)(model, target_layers, use_cuda=device.type == 'cuda')
        method.activations_and_grads = ActivationsAndGradients(model, target_layers, None)

        # `np.int` was only an alias of the builtin int and no longer exists in
        # current NumPy releases; the builtin gives the same dtype.
        colors = np.random.uniform(0, 255, size=(len(model_names), 3)).astype(int)
        # Every local above becomes an instance attribute; the other methods
        # rely on that.
        self.__dict__.update(locals())

    def post_process(self, result):
        """Run NMS on *result* and return the detections of the first image."""
        result = non_max_suppression(result, conf_thres=self.conf_threshold, iou_thres=0.65)[0]
        return result

    def draw_detections(self, box, color, name, img):
        """Draw one box and its caption on *img* and return the image."""
        xmin, ymin, xmax, ymax = list(map(int, list(box)))
        cv2.rectangle(img, (xmin, ymin), (xmax, ymax), tuple(int(x) for x in color), 2)
        cv2.putText(img, str(name), (xmin, ymin - 5), cv2.FONT_HERSHEY_SIMPLEX, 0.8, tuple(int(x) for x in color), 2, lineType=cv2.LINE_AA)
        return img

    def renormalize_cam_in_bounding_boxes(self, boxes, image_float_np, grayscale_cam):
        """Normalize the CAM to be in the range [0, 1]
        inside every bounding boxes, and zero outside of the bounding boxes. """
        renormalized_cam = np.zeros(grayscale_cam.shape, dtype=np.float32)
        for x1, y1, x2, y2 in boxes:
            x1, y1 = max(x1, 0), max(y1, 0)
            x2, y2 = min(grayscale_cam.shape[1] - 1, x2), min(grayscale_cam.shape[0] - 1, y2)
            if x2 <= x1 or y2 <= y1:
                # Empty region after clamping: nothing to rescale.
                continue
            renormalized_cam[y1:y2, x1:x2] = scale_cam_image(grayscale_cam[y1:y2, x1:x2].copy())
        renormalized_cam = scale_cam_image(renormalized_cam)
        eigencam_image_renormalized = show_cam_on_image(image_float_np, renormalized_cam, use_rgb=True)
        return eigencam_image_renormalized

    def process(self, img_path, save_path):
        """Compute and save the heatmap of a single image.

        Unreadable images and images for which the CAM call fails are reported
        with a warning and skipped.
        """
        # cv2.imread returns None instead of raising when a file cannot be
        # read, e.g. a non-image file inside the input directory.
        img = cv2.imread(img_path)
        if img is None:
            print(f"Warning... {img_path} read failure.")
            return
        img = letterbox(img)[0]
        img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
        img = np.float32(img) / 255.0
        tensor = torch.from_numpy(np.transpose(img, axes=[2, 0, 1])).unsqueeze(0).to(self.device)

        try:
            grayscale_cam = self.method(tensor, [self.target])
        except AttributeError as e:
            # NOTE(review): presumably raised when the target yields no tensor
            # (no candidate above the threshold) - confirm against grad-cam.
            print(f"Warning... self.method(tensor, [self.target]) failure.")
            return

        grayscale_cam = grayscale_cam[0, :]
        cam_image = show_cam_on_image(img, grayscale_cam, use_rgb=True)

        with torch.no_grad():
            pred = self.model(tensor)[0]
            pred = self.post_process(pred)
        if self.renormalize:
            cam_image = self.renormalize_cam_in_bounding_boxes(pred[:, :4].cpu().detach().numpy().astype(np.int32), img, grayscale_cam)
        if self.show_box:
            for data in pred:
                data = data.cpu().detach().numpy()
                cam_image = self.draw_detections(data[:4], self.colors[int(data[5])], f'{self.model_names[int(data[5])]} {float(data[4]):.2f}', cam_image)

        cam_image = Image.fromarray(cam_image)
        cam_image.save(save_path)

    def __call__(self, img_path, save_path):
        """Process *img_path* (file or directory) into the directory *save_path*.

        An existing *save_path* directory is deleted first.
        """
        # remove dir if exist
        if os.path.exists(save_path):
            shutil.rmtree(save_path)
        # make dir if not exist
        os.makedirs(save_path, exist_ok=True)

        if os.path.isdir(img_path):
            for img_path_ in os.listdir(img_path):
                self.process(f'{img_path}/{img_path_}', f'{save_path}/{img_path_}')
        else:
            self.process(img_path, f'{save_path}/result.png')
def post_process(self, result):
    """Return score rows and xyxy boxes of the first image, sorted by score.

    ``result[0]`` supplies the boxes (first four values of the last axis) and
    ``result[1]`` an iterable of 5-D tensors that are flattened and joined
    along the candidate axis. Candidates are ordered by descending value of the
    first score column, which is then passed through a sigmoid.

    Returns:
        ``(scores, boxes)`` where *boxes* is a NumPy array produced by
        ``xywh2xyxy``.
    """
    decoded, heads = result[0], result[1]
    boxes = decoded[..., :4]

    flattened = []
    for head in heads:
        bs, n, w, h, ch = head.size()
        flattened.append(head.reshape((bs, n * w * h, ch)))
    scores = torch.cat(flattened, dim=1)[..., 4:]

    order = torch.sort(scores[..., 0], descending=True)[1][0]
    scores = scores[0][order]
    # Only the first score column is squashed; the other columns stay as-is.
    scores[:, 0] = torch.sigmoid(scores[:, 0])

    boxes_xyxy = xywh2xyxy(boxes[0][order])
    return scores, boxes_xyxy.cpu().detach().numpy()
class yolov7_heatmap:
    """Render CAM heatmaps for a YOLOv7 checkpoint.

    Calling the instance with an image path (file or directory) and an output
    directory writes one heatmap image per input image.
    """

    def __init__(self, weight, device, method, layer, backward_type, conf_threshold, ratio, show_box, renormalize):
        """Load the checkpoint and build the CAM method and its target.

        Args:
            weight: Path of the checkpoint.
            device: Torch device string.
            method: Name of a CAM class imported by this module.
            layer: Indices into ``model.model`` selecting the target layers.
            backward_type: Output selector forwarded to ``yolov7_target``.
            conf_threshold: Confidence threshold for target and NMS.
            ratio: Fraction of candidates the target inspects.
            show_box: Draw the detections on top of the heatmap.
            renormalize: Restrict the heatmap to the detected boxes.
        """
        device = torch.device(device)
        ckpt = torch.load(weight)
        model_names = ckpt['model'].names
        model = attempt_load(weight, device)
        for p in model.parameters():
            p.requires_grad_(True)
        model.eval()

        target = yolov7_target(backward_type, conf_threshold, ratio)
        target_layers = [model.model[l] for l in layer]
        # NOTE(review): eval() turns the configured name into a class object.
        # That is only acceptable while `method` comes from trusted
        # configuration; never feed user-supplied text into it.
        method = eval(method)(model, target_layers, use_cuda=device.type == 'cuda')
        method.activations_and_grads = ActivationsAndGradients(model, target_layers, None)

        # `np.int` was only an alias of the builtin int and no longer exists in
        # current NumPy releases; the builtin gives the same dtype.
        colors = np.random.uniform(0, 255, size=(len(model_names), 3)).astype(int)
        # Every local above becomes an instance attribute; the other methods
        # rely on that.
        self.__dict__.update(locals())

    def post_process(self, result):
        """Run NMS on *result* and return the detections of the first image."""
        result = non_max_suppression(result, conf_thres=self.conf_threshold, iou_thres=0.65)[0]
        return result

    def draw_detections(self, box, color, name, img):
        """Draw one box and its caption on *img* and return the image."""
        xmin, ymin, xmax, ymax = list(map(int, list(box)))
        cv2.rectangle(img, (xmin, ymin), (xmax, ymax), tuple(int(x) for x in color), 2)
        cv2.putText(img, str(name), (xmin, ymin - 5), cv2.FONT_HERSHEY_SIMPLEX, 0.8, tuple(int(x) for x in color), 2, lineType=cv2.LINE_AA)
        return img

    def renormalize_cam_in_bounding_boxes(self, boxes, image_float_np, grayscale_cam):
        """Normalize the CAM to be in the range [0, 1]
        inside every bounding boxes, and zero outside of the bounding boxes. """
        renormalized_cam = np.zeros(grayscale_cam.shape, dtype=np.float32)
        for x1, y1, x2, y2 in boxes:
            x1, y1 = max(x1, 0), max(y1, 0)
            x2, y2 = min(grayscale_cam.shape[1] - 1, x2), min(grayscale_cam.shape[0] - 1, y2)
            if x2 <= x1 or y2 <= y1:
                # Empty region after clamping: nothing to rescale.
                continue
            renormalized_cam[y1:y2, x1:x2] = scale_cam_image(grayscale_cam[y1:y2, x1:x2].copy())
        renormalized_cam = scale_cam_image(renormalized_cam)
        eigencam_image_renormalized = show_cam_on_image(image_float_np, renormalized_cam, use_rgb=True)
        return eigencam_image_renormalized

    def process(self, img_path, save_path):
        """Compute and save the heatmap of a single image.

        Unreadable images and images for which the CAM call fails are reported
        with a warning and skipped.
        """
        # cv2.imread returns None instead of raising when a file cannot be
        # read, e.g. a non-image file inside the input directory.
        img = cv2.imread(img_path)
        if img is None:
            print(f"Warning... {img_path} read failure.")
            return
        img = letterbox(img)[0]
        img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
        img = np.float32(img) / 255.0
        tensor = torch.from_numpy(np.transpose(img, axes=[2, 0, 1])).unsqueeze(0).to(self.device)

        try:
            grayscale_cam = self.method(tensor, [self.target])
        except AttributeError as e:
            # NOTE(review): presumably raised when the target yields no tensor
            # (no candidate above the threshold) - confirm against grad-cam.
            print(f"Warning... self.method(tensor, [self.target]) failure.")
            return

        grayscale_cam = grayscale_cam[0, :]
        cam_image = show_cam_on_image(img, grayscale_cam, use_rgb=True)

        with torch.no_grad():
            pred = self.model(tensor)
            pred = self.post_process(pred[0])
        if self.renormalize:
            cam_image = self.renormalize_cam_in_bounding_boxes(pred[:, :4].cpu().detach().numpy().astype(np.int32), img, grayscale_cam)
        if self.show_box:
            for data in pred:
                data = data.cpu().detach().numpy()
                cam_image = self.draw_detections(data[:4], self.colors[int(data[5])], f'{self.model_names[int(data[5])]} {float(data[4]):.2f}', cam_image)

        cam_image = Image.fromarray(cam_image)
        cam_image.save(save_path)

    def __call__(self, img_path, save_path):
        """Process *img_path* (file or directory) into the directory *save_path*.

        An existing *save_path* directory is deleted first.
        """
        # remove dir if exist
        if os.path.exists(save_path):
            shutil.rmtree(save_path)
        # make dir if not exist
        os.makedirs(save_path, exist_ok=True)

        if os.path.isdir(img_path):
            for img_path_ in os.listdir(img_path):
                self.process(f'{img_path}/{img_path_}', f'{save_path}/{img_path_}')
        else:
            self.process(img_path, f'{save_path}/result.png')
def letterbox(im, new_shape=(640, 640), color=(114, 114, 114), auto=True, scaleFill=False, scaleup=True, stride=32):
    """Resize ``im`` keeping its aspect ratio and pad it with ``color``.

    Args:
        im: image array; its first two dimensions are read as (height, width).
        new_shape: target (height, width), or a single int for a square.
        color: border value handed to ``cv2.copyMakeBorder``.
        auto: pad only up to the next multiple of ``stride`` instead of the
            full ``new_shape``.
        scaleFill: when ``auto`` is false, stretch to ``new_shape`` without
            padding.
        scaleup: allow enlarging the image; when false the scale is capped
            at 1.0.
        stride: modulus used for the ``auto`` padding.

    Returns:
        ``(image, (w_ratio, h_ratio), (top, bottom, left, right))`` where the
        last tuple holds the border widths that were added.
    """
    src_h, src_w = im.shape[:2]
    if isinstance(new_shape, int):
        new_shape = (new_shape, new_shape)
    dst_h, dst_w = new_shape[0], new_shape[1]

    # Scale ratio (new / old); optionally never upscale.
    scale = min(dst_h / src_h, dst_w / src_w)
    if not scaleup:
        scale = min(scale, 1.0)

    ratio = scale, scale  # width, height ratios
    new_unpad = int(round(src_w * scale)), int(round(src_h * scale))
    dw = dst_w - new_unpad[0]
    dh = dst_h - new_unpad[1]
    if auto:
        # Minimum rectangle: keep only the remainder modulo stride.
        dw, dh = np.mod(dw, stride), np.mod(dh, stride)
    elif scaleFill:
        # Stretch: no padding at all, independent ratios per axis.
        dw, dh = 0.0, 0.0
        new_unpad = (dst_w, dst_h)
        ratio = dst_w / src_w, dst_h / src_h

    # Split the padding between the two sides of each axis.
    dw /= 2
    dh /= 2

    if (src_w, src_h) != new_unpad:
        im = cv2.resize(im, new_unpad, interpolation=cv2.INTER_LINEAR)
    # The +/-0.1 makes an odd total padding split as floor / ceil.
    top = int(round(dh - 0.1))
    bottom = int(round(dh + 0.1))
    left = int(round(dw - 0.1))
    right = int(round(dw + 0.1))
    im = cv2.copyMakeBorder(im, top, bottom, left, right, cv2.BORDER_CONSTANT, value=color)
    return im, ratio, (top, bottom, left, right)
return # Gradients are computed in reverse order def _store_grad(grad): if self.reshape_transform is not None: grad = self.reshape_transform(grad) self.gradients = [grad.cpu().detach()] + self.gradients output.register_hook(_store_grad) def post_process(self, result): if self.model.end2end: logits_ = result[:, :, 4:] boxes_ = result[:, :, :4] sorted, indices = torch.sort(logits_[:, :, 0], descending=True) return logits_[0][indices[0]], boxes_[0][indices[0]] elif self.model.task == 'detect': logits_ = result[:, 4:] boxes_ = result[:, :4] sorted, indices = torch.sort(logits_.max(1)[0], descending=True) return torch.transpose(logits_[0], dim0=0, dim1=1)[indices[0]], torch.transpose(boxes_[0], dim0=0, dim1=1)[indices[0]] elif self.model.task == 'segment': logits_ = result[0][:, 4:4 + self.model.nc] boxes_ = result[0][:, :4] mask_p, mask_nm = result[1][2].squeeze(), result[1][1].squeeze().transpose(1, 0) c, h, w = mask_p.size() mask = (mask_nm @ mask_p.view(c, -1)) sorted, indices = torch.sort(logits_.max(1)[0], descending=True) return torch.transpose(logits_[0], dim0=0, dim1=1)[indices[0]], torch.transpose(boxes_[0], dim0=0, dim1=1)[indices[0]], mask[indices[0]] elif self.model.task == 'pose': logits_ = result[:, 4:4 + self.model.nc] boxes_ = result[:, :4] poses_ = result[:, 4 + self.model.nc:] sorted, indices = torch.sort(logits_.max(1)[0], descending=True) return torch.transpose(logits_[0], dim0=0, dim1=1)[indices[0]], torch.transpose(boxes_[0], dim0=0, dim1=1)[indices[0]], torch.transpose(poses_[0], dim0=0, dim1=1)[indices[0]] elif self.model.task == 'obb': logits_ = result[:, 4:4 + self.model.nc] boxes_ = result[:, :4] angles_ = result[:, 4 + self.model.nc:] sorted, indices = torch.sort(logits_.max(1)[0], descending=True) return torch.transpose(logits_[0], dim0=0, dim1=1)[indices[0]], torch.transpose(boxes_[0], dim0=0, dim1=1)[indices[0]], torch.transpose(angles_[0], dim0=0, dim1=1)[indices[0]] elif self.model.task == 'classify': return result[0] def __call__(self, 
class yolo_detect_target(torch.nn.Module):
    """Scalar CAM target for detection outputs.

    ``forward`` receives ``[post_result, pre_post_boxes]`` (scores and boxes,
    already sorted by descending confidence by the caller) and sums the
    quantities selected by ``ouput_type`` over the leading
    ``ratio`` fraction of rows, stopping at the first row whose confidence is
    below ``conf``.

    Args:
        ouput_type: ``'class'`` (scores only), ``'box'`` (the four box values
            only) or ``'all'`` (both).
        conf: confidence threshold at which iteration stops.
        ratio: fraction of the rows to consider.
        end2end: when true the confidence is read from column 0 of each row,
            otherwise the row maximum is used.
    """

    def __init__(self, ouput_type, conf, ratio, end2end) -> None:
        super().__init__()
        self.ouput_type = ouput_type
        self.conf = conf
        self.ratio = ratio
        self.end2end = end2end

    def forward(self, data):
        """Return the sum of the selected terms (the int ``0`` if none)."""
        post_result, pre_post_boxes = data
        # The original used if/elif, so 'all' matched the class branch and
        # never reached the box branch; evaluate both selections independently.
        use_class = self.ouput_type == 'class' or self.ouput_type == 'all'
        use_box = self.ouput_type == 'box' or self.ouput_type == 'all'
        result = []
        for i in trange(int(post_result.size(0) * self.ratio)):
            score = post_result[i, 0] if self.end2end else post_result[i].max()
            if float(score) < self.conf:
                break
            if use_class:
                result.append(score)
            if use_box:
                for j in range(4):
                    result.append(pre_post_boxes[i, j])
        return sum(result)
class yolo_heatmap:
    """Produce CAM heatmap overlays for images using an Ultralytics YOLO model.

    The constructor loads the weights, builds the task-specific target,
    attaches the selected pytorch_grad_cam method to the requested layers and
    stores every constructor local on the instance (``self.model``,
    ``self.model_yolo``, ``self.method``, ``self.target``, ...).

    Args:
        weight: weight file passed to ``YOLO``.
        device: device string accepted by ``torch.device``.
        method: name of the CAM class to instantiate (e.g. ``'GradCAM'``).
        layer: indices into ``model.model`` selecting the target layers.
        backward_type: forwarded to the target as its output type.
        conf_threshold: confidence threshold for the target and predictions.
        ratio: forwarded to the target.
        show_result: draw the prediction results on the overlay when true.
        renormalize: restrict the heatmap to the predicted boxes when true
            (only applied for the detect, segment and pose tasks).
        task: one of ``detect``, ``segment``, ``pose``, ``obb``, ``classify``.
        img_size: side length handed to ``letterbox``.

    Raises:
        Exception: if ``task`` is not one of the supported values.
    """

    def __init__(self, weight, device, method, layer, backward_type, conf_threshold, ratio, show_result, renormalize, task, img_size):
        device = torch.device(device)
        model_yolo = YOLO(weight)
        model_names = model_yolo.names
        print(f'model class info:{model_names}')
        # The CAM works on a copy; model_yolo itself is used later for predict().
        model = copy.deepcopy(model_yolo.model)
        model.to(device)
        model.info()
        # Gradients must flow through every parameter for gradient-based CAMs.
        for p in model.parameters():
            p.requires_grad_(True)
        model.eval()

        # ActivationsAndGradients reads these two attributes from the model.
        model.task = task
        if not hasattr(model, 'end2end'):
            model.end2end = False

        if task == 'detect':
            target = yolo_detect_target(backward_type, conf_threshold, ratio, model.end2end)
        elif task == 'segment':
            target = yolo_segment_target(backward_type, conf_threshold, ratio, model.end2end)
        elif task == 'pose':
            target = yolo_pose_target(backward_type, conf_threshold, ratio, model.end2end)
        elif task == 'obb':
            target = yolo_obb_target(backward_type, conf_threshold, ratio, model.end2end)
        elif task == 'classify':
            target = yolo_classify_target(backward_type, conf_threshold, ratio, model.end2end)
        else:
            raise Exception(f"not support task({task}).")

        target_layers = [model.model[l] for l in layer]
        # NOTE(review): eval() resolves the CAM class from a config string;
        # never feed it untrusted input.
        method = eval(method)(model, target_layers)
        # Swap in this file's ActivationsAndGradients so the model output is
        # post-processed into the structure the targets expect.
        method.activations_and_grads = ActivationsAndGradients(model, target_layers, None)

        colors = np.random.uniform(0, 255, size=(len(model_names), 3)).astype(np.int32)
        # Expose all locals above as attributes (self.model, self.task, ...).
        self.__dict__.update(locals())

    def post_process(self, result):
        """Run NMS on the raw prediction and return the first image's detections."""
        result = non_max_suppression(result, conf_thres=self.conf_threshold, iou_thres=0.65)[0]
        return result

    def draw_detections(self, box, color, name, img):
        """Draw one box and its label text on ``img`` and return ``img``."""
        xmin, ymin, xmax, ymax = list(map(int, list(box)))
        bgr = tuple(int(x) for x in color)
        # Draw the detection box.
        cv2.rectangle(img, (xmin, ymin), (xmax, ymax), bgr, 2)
        # Draw the class name / confidence text.
        cv2.putText(img, str(name), (xmin, ymin - 5), cv2.FONT_HERSHEY_SIMPLEX, 0.8, bgr, 2,
                    lineType=cv2.LINE_AA)
        return img

    def renormalize_cam_in_bounding_boxes(self, boxes, image_float_np, grayscale_cam):
        """Normalize the CAM to be in the range [0, 1] inside every bounding
        box, and zero outside of the bounding boxes.

        Args:
            boxes: iterable of integer ``(x1, y1, x2, y2)`` boxes.
            image_float_np: float image passed to ``show_cam_on_image``.
            grayscale_cam: 2-D CAM array.

        Returns:
            The overlay image returned by ``show_cam_on_image``.
        """
        renormalized_cam = np.zeros(grayscale_cam.shape, dtype=np.float32)
        for x1, y1, x2, y2 in boxes:
            x1, y1 = max(x1, 0), max(y1, 0)
            x2, y2 = min(grayscale_cam.shape[1] - 1, x2), min(grayscale_cam.shape[0] - 1, y2)
            # A box that collapses to zero width/height after clipping has no
            # pixels to rescale; skip it instead of normalising an empty slice.
            if x2 <= x1 or y2 <= y1:
                continue
            renormalized_cam[y1:y2, x1:x2] = scale_cam_image(grayscale_cam[y1:y2, x1:x2].copy())
        renormalized_cam = scale_cam_image(renormalized_cam)
        eigencam_image_renormalized = show_cam_on_image(image_float_np, renormalized_cam, use_rgb=True)
        return eigencam_image_renormalized

    def process(self, img_path, save_path):
        """Compute the heatmap for one image file and save it to ``save_path``.

        Returns early (writing nothing) when the image cannot be read or
        decoded, or when the CAM computation raises ``AttributeError``.
        """
        # img process
        try:
            img = cv2.imdecode(np.fromfile(img_path, np.uint8), cv2.IMREAD_COLOR)
        except Exception:
            # Reading the bytes failed (missing file, directory, ...).
            img = None
        if img is None:
            # imdecode reports undecodable data by returning None, not by raising.
            print(f"Warning... {img_path} read failure.")
            return
        # Set auto=False if the input must be padded to a fixed square size.
        img, _, (top, bottom, left, right) = letterbox(img, new_shape=(self.img_size, self.img_size), auto=True)
        img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
        img = np.float32(img) / 255.0
        tensor = torch.from_numpy(np.transpose(img, axes=[2, 0, 1])).unsqueeze(0).to(self.device)
        print(f'tensor size:{tensor.size()}')

        try:
            grayscale_cam = self.method(tensor, [self.target])
        except AttributeError:
            # NOTE(review): presumably raised when the target collected no
            # terms (sum([]) is the int 0, which has no backward) - confirm.
            print(f"Warning... self.method(tensor, [self.target]) failure.")
            return
        grayscale_cam = grayscale_cam[0, :]
        cam_image = show_cam_on_image(img, grayscale_cam, use_rgb=True)

        pred = self.model_yolo.predict(tensor, conf=self.conf_threshold, iou=0.7)[0]
        if self.renormalize and self.task in ['detect', 'segment', 'pose']:
            cam_image = self.renormalize_cam_in_bounding_boxes(
                pred.boxes.xyxy.cpu().detach().numpy().astype(np.int32), img, grayscale_cam)
        if self.show_result:
            cam_image = pred.plot(img=cam_image,
                                  conf=True,  # show the confidence
                                  font_size=None,  # None: derived from the image size
                                  line_width=None,  # None: derived from the image size
                                  labels=False,  # whether to show the labels
                                  )

        # Strip the letterbox padding again.
        cam_image = cam_image[top:cam_image.shape[0] - bottom, left:cam_image.shape[1] - right]
        cam_image = Image.fromarray(cam_image)
        cam_image.save(save_path)

    def __call__(self, img_path, save_path):
        """Process a single image or every entry of a directory.

        ``save_path`` is treated as an output directory: it is deleted if it
        already exists and then recreated.
        """
        # remove dir if exist
        if os.path.exists(save_path):
            shutil.rmtree(save_path)
        # make dir if not exist
        os.makedirs(save_path, exist_ok=True)

        if os.path.isdir(img_path):
            for img_path_ in os.listdir(img_path):
                self.process(f'{img_path}/{img_path_}', f'{save_path}/{img_path_}')
        else:
            self.process(img_path, f'{save_path}/result.png')
model(r'/home/hjj/Desktop/dataset/dataset_coco/coco/images/val2017', 'result') ================================================ FILE: yolo-gradcam/yolov9_heatmap.py ================================================ import warnings warnings.filterwarnings('ignore') warnings.simplefilter('ignore') import torch, yaml, cv2, os, shutil import numpy as np np.random.seed(0) import matplotlib.pyplot as plt from tqdm import trange from PIL import Image from models.yolo import Model from utils.augmentations import letterbox from utils.general import xywh2xyxy, non_max_suppression from models.experimental import attempt_load from pytorch_grad_cam import GradCAMPlusPlus, GradCAM, XGradCAM, EigenCAM, HiResCAM, LayerCAM, RandomCAM, EigenGradCAM from pytorch_grad_cam.utils.image import show_cam_on_image, scale_cam_image from pytorch_grad_cam.activations_and_gradients import ActivationsAndGradients class ActivationsAndGradients: """ Class for extracting activations and registering gradients from targetted intermediate layers """ def __init__(self, model, target_layers, reshape_transform): self.model = model self.gradients = [] self.activations = [] self.reshape_transform = reshape_transform self.handles = [] for target_layer in target_layers: self.handles.append( target_layer.register_forward_hook(self.save_activation)) # Because of https://github.com/pytorch/pytorch/issues/61519, # we don't use backward hook to record gradients. self.handles.append( target_layer.register_forward_hook(self.save_gradient)) def save_activation(self, module, input, output): activation = output if self.reshape_transform is not None: activation = self.reshape_transform(activation) self.activations.append(activation.cpu().detach()) def save_gradient(self, module, input, output): if not hasattr(output, "requires_grad") or not output.requires_grad: # You can only register hooks on tensor requires grad. 
class yolov9_target(torch.nn.Module):
    """Scalar CAM target for YOLOv9 detection outputs.

    ``forward`` receives ``[post_result, pre_post_boxes]`` (scores and boxes,
    already sorted by descending confidence by the caller) and sums the
    quantities selected by ``ouput_type`` over the leading ``ratio`` fraction
    of rows, stopping at the first row whose maximum score is below ``conf``.

    Args:
        ouput_type: ``'class'`` (scores only), ``'box'`` (the four box values
            only) or ``'all'`` (both).
        conf: confidence threshold at which iteration stops.
        ratio: fraction of the rows to consider.
    """

    def __init__(self, ouput_type, conf, ratio) -> None:
        super().__init__()
        self.ouput_type = ouput_type
        self.conf = conf
        self.ratio = ratio

    def forward(self, data):
        """Return the sum of the selected terms (the int ``0`` if none)."""
        post_result, pre_post_boxes = data
        # The original used if/elif, so 'all' matched the class branch and
        # never reached the box branch; evaluate both selections independently.
        use_class = self.ouput_type == 'class' or self.ouput_type == 'all'
        use_box = self.ouput_type == 'box' or self.ouput_type == 'all'
        result = []
        for i in trange(int(post_result.size(0) * self.ratio)):
            score = post_result[i].max()
            if float(score) < self.conf:
                break
            if use_class:
                result.append(score)
            if use_box:
                for j in range(4):
                    result.append(pre_post_boxes[i, j])
        return sum(result)
ActivationsAndGradients(model, target_layers, None) colors = np.random.uniform(0, 255, size=(len(model_names), 3)).astype(np.int) self.__dict__.update(locals()) def post_process(self, result): result = non_max_suppression(result, conf_thres=self.conf_threshold, iou_thres=0.65)[0] return result def draw_detections(self, box, color, name, img): xmin, ymin, xmax, ymax = list(map(int, list(box))) cv2.rectangle(img, (xmin, ymin), (xmax, ymax), tuple(int(x) for x in color), 2) cv2.putText(img, str(name), (xmin, ymin - 5), cv2.FONT_HERSHEY_SIMPLEX, 0.8, tuple(int(x) for x in color), 2, lineType=cv2.LINE_AA) return img def renormalize_cam_in_bounding_boxes(self, boxes, image_float_np, grayscale_cam): """Normalize the CAM to be in the range [0, 1] inside every bounding boxes, and zero outside of the bounding boxes. """ renormalized_cam = np.zeros(grayscale_cam.shape, dtype=np.float32) for x1, y1, x2, y2 in boxes: x1, y1 = max(x1, 0), max(y1, 0) x2, y2 = min(grayscale_cam.shape[1] - 1, x2), min(grayscale_cam.shape[0] - 1, y2) renormalized_cam[y1:y2, x1:x2] = scale_cam_image(grayscale_cam[y1:y2, x1:x2].copy()) renormalized_cam = scale_cam_image(renormalized_cam) eigencam_image_renormalized = show_cam_on_image(image_float_np, renormalized_cam, use_rgb=True) return eigencam_image_renormalized def process(self, img_path, save_path): # img process img = cv2.imread(img_path) img = letterbox(img)[0] img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB) img = np.float32(img) / 255.0 tensor = torch.from_numpy(np.transpose(img, axes=[2, 0, 1])).unsqueeze(0).to(self.device) try: grayscale_cam = self.method(tensor, [self.target]) except AttributeError as e: return grayscale_cam = grayscale_cam[0, :] cam_image = show_cam_on_image(img, grayscale_cam, use_rgb=True) with torch.no_grad(): pred = self.model(tensor) pred = self.post_process(pred[0]) if self.renormalize: cam_image = self.renormalize_cam_in_bounding_boxes(pred[:, :4].cpu().detach().numpy().astype(np.int32), img, grayscale_cam) if 
class CAM(nn.Module):
    """Context augmentation module with three parallel 3x3 ``Conv`` branches.

    Each branch output goes through its own 1x1 ``Conv``; the three results
    are then merged according to ``fusion``:

    * ``'weight'``   - element-wise sum of the three projected branches.
    * ``'adaptive'`` - a 3-channel softmax map, predicted from the
      concatenated projections, weights the *unprojected* branch outputs.
    * ``'concat'``   - channel concatenation of the projected branches
      (output has ``3 * inc`` channels).

    Args:
        inc: number of input channels.
        fusion: one of ``'weight'``, ``'adaptive'``, ``'concat'``.
    """

    def __init__(self, inc, fusion='weight'):
        super().__init__()

        assert fusion in ['weight', 'adaptive', 'concat']
        self.fusion = fusion

        # Branches differ only in the last Conv argument (1, 3, 5)
        # NOTE(review): presumably the dilation rate - confirm against Conv.
        self.conv1 = Conv(inc, inc, 3, 1, None, 1, 1)
        self.conv2 = Conv(inc, inc, 3, 1, None, 1, 3)
        self.conv3 = Conv(inc, inc, 3, 1, None, 1, 5)

        # Per-branch 1x1 projections.
        self.fusion_1 = Conv(inc, inc, 1)
        self.fusion_2 = Conv(inc, inc, 1)
        self.fusion_3 = Conv(inc, inc, 1)

        if self.fusion == 'adaptive':
            # Predicts one weight map per branch.
            self.fusion_4 = Conv(inc * 3, 3, 1)

    def forward(self, x):
        """Apply the three branches to ``x`` and fuse them."""
        x1 = self.conv1(x)
        x2 = self.conv2(x)
        x3 = self.conv3(x)

        # Every fusion mode uses each projection exactly once.
        p1 = self.fusion_1(x1)
        p2 = self.fusion_2(x2)
        p3 = self.fusion_3(x3)

        if self.fusion == 'weight':
            return p1 + p2 + p3

        stacked = torch.cat([p1, p2, p3], dim=1)
        if self.fusion != 'adaptive':
            return stacked

        weights = torch.softmax(self.fusion_4(stacked), dim=1)
        x1_weight = weights[:, 0:1]
        x2_weight = weights[:, 1:2]
        x3_weight = weights[:, 2:3]
        return x1 * x1_weight + x2 * x2_weight + x3 * x3_weight
def bbox_iou(box1, box2, xywh=True, GIoU=False, DIoU=False, CIoU=False, SIoU=False, EIoU=False, WIoU=False, Focal=False, alpha=1, gamma=0.5, scale=False, eps=1e-7):
    """Return the IoU of ``box1`` and ``box2`` or one of its variants.

    Args:
        box1, box2: tensors whose last dimension holds the four box values;
            they are split with ``chunk(4, -1)`` and combined element-wise.
        xywh: if true the values are ``(cx, cy, w, h)``, otherwise
            ``(x1, y1, x2, y2)``.
        GIoU, DIoU, CIoU, SIoU, EIoU, WIoU: variant switches. When several
            are set the first match in the order CIoU, EIoU, SIoU, WIoU,
            DIoU, GIoU is used.
        Focal: additionally return the focal weight
            ``(inter / union) ** gamma``.
        alpha: exponent of the alpha-IoU formulation (1 is the plain form).
        gamma: exponent of the focal weight.
        scale: build a ``WIoU_Scale`` from ``1 - inter / union`` (this
            updates its running mean); with ``WIoU`` its scaled loss is
            returned as well.
        eps: small constant guarding divisions.

    Returns:
        * the metric tensor when ``Focal`` is false (and not WIoU);
        * ``(metric, focal_weight)`` when ``Focal`` is true;
        * ``(iou, exp(rho2 / c2))`` for ``WIoU`` without ``scale``;
        * ``(scaled_loss, (1 - iou) * exp(rho2 / c2), iou)`` for ``WIoU``
          with ``scale``.

    Raises:
        RuntimeError: if ``WIoU`` and ``Focal`` are both requested.
    """
    # Get the coordinates of bounding boxes
    if xywh:  # transform from xywh to xyxy
        (x1, y1, w1, h1), (x2, y2, w2, h2) = box1.chunk(4, -1), box2.chunk(4, -1)
        w1_, h1_, w2_, h2_ = w1 / 2, h1 / 2, w2 / 2, h2 / 2
        b1_x1, b1_x2, b1_y1, b1_y2 = x1 - w1_, x1 + w1_, y1 - h1_, y1 + h1_
        b2_x1, b2_x2, b2_y1, b2_y2 = x2 - w2_, x2 + w2_, y2 - h2_, y2 + h2_
    else:  # x1, y1, x2, y2 = box1
        b1_x1, b1_y1, b1_x2, b1_y2 = box1.chunk(4, -1)
        b2_x1, b2_y1, b2_x2, b2_y2 = box2.chunk(4, -1)
        w1, h1 = b1_x2 - b1_x1, (b1_y2 - b1_y1).clamp(eps)
        w2, h2 = b2_x2 - b2_x1, (b2_y2 - b2_y1).clamp(eps)

    # Intersection area
    inter = (b1_x2.minimum(b2_x2) - b1_x1.maximum(b2_x1)).clamp(0) * \
            (b1_y2.minimum(b2_y2) - b1_y1.maximum(b2_y1)).clamp(0)

    # Union Area
    union = w1 * h1 + w2 * h2 - inter + eps
    if scale:
        # Side effect: constructing WIoU_Scale updates its running IoU mean.
        wiou_state = WIoU_Scale(1 - (inter / union))

    # IoU (alpha-IoU form; alpha == 1 gives the ordinary IoU)
    iou = torch.pow(inter / (union + eps), alpha)
    if CIoU or DIoU or GIoU or EIoU or SIoU or WIoU:
        cw = b1_x2.maximum(b2_x2) - b1_x1.minimum(b2_x1)  # convex (smallest enclosing box) width
        ch = b1_y2.maximum(b2_y2) - b1_y1.minimum(b2_y1)  # convex height
        if CIoU or DIoU or EIoU or SIoU or WIoU:  # Distance or Complete IoU https://arxiv.org/abs/1911.08287v1
            c2 = (cw ** 2 + ch ** 2) ** alpha + eps  # convex diagonal squared
            rho2 = (((b2_x1 + b2_x2 - b1_x1 - b1_x2) ** 2 + (b2_y1 + b2_y2 - b1_y1 - b1_y2) ** 2) / 4) ** alpha  # center dist ** 2
            if CIoU:  # https://github.com/Zzh-tju/DIoU-SSD-pytorch/blob/master/utils/box/box_utils.py#L47
                v = (4 / math.pi ** 2) * (torch.atan(w2 / h2) - torch.atan(w1 / h1)).pow(2)
                with torch.no_grad():
                    alpha_ciou = v / (v - iou + (1 + eps))
                if Focal:
                    return iou - (rho2 / c2 + torch.pow(v * alpha_ciou + eps, alpha)), torch.pow(inter / (union + eps), gamma)  # Focal_CIoU
                else:
                    return iou - (rho2 / c2 + torch.pow(v * alpha_ciou + eps, alpha))  # CIoU
            elif EIoU:
                rho_w2 = ((b2_x2 - b2_x1) - (b1_x2 - b1_x1)) ** 2
                rho_h2 = ((b2_y2 - b2_y1) - (b1_y2 - b1_y1)) ** 2
                cw2 = torch.pow(cw ** 2 + eps, alpha)
                ch2 = torch.pow(ch ** 2 + eps, alpha)
                if Focal:
                    return iou - (rho2 / c2 + rho_w2 / cw2 + rho_h2 / ch2), torch.pow(inter / (union + eps), gamma)  # Focal_EIou
                else:
                    return iou - (rho2 / c2 + rho_w2 / cw2 + rho_h2 / ch2)  # EIou
            elif SIoU:
                # SIoU Loss https://arxiv.org/pdf/2205.12740.pdf
                s_cw = (b2_x1 + b2_x2 - b1_x1 - b1_x2) * 0.5 + eps
                s_ch = (b2_y1 + b2_y2 - b1_y1 - b1_y2) * 0.5 + eps
                sigma = torch.pow(s_cw ** 2 + s_ch ** 2, 0.5)
                sin_alpha_1 = torch.abs(s_cw) / sigma
                sin_alpha_2 = torch.abs(s_ch) / sigma
                threshold = pow(2, 0.5) / 2
                sin_alpha = torch.where(sin_alpha_1 > threshold, sin_alpha_2, sin_alpha_1)
                angle_cost = torch.cos(torch.arcsin(sin_alpha) * 2 - math.pi / 2)
                rho_x = (s_cw / cw) ** 2
                rho_y = (s_ch / ch) ** 2
                # Kept under its own name: the original rebound ``gamma`` here,
                # which corrupted the focal exponent used a few lines below.
                siou_gamma = angle_cost - 2
                distance_cost = 2 - torch.exp(siou_gamma * rho_x) - torch.exp(siou_gamma * rho_y)
                omiga_w = torch.abs(w1 - w2) / torch.max(w1, w2)
                omiga_h = torch.abs(h1 - h2) / torch.max(h1, h2)
                shape_cost = torch.pow(1 - torch.exp(-1 * omiga_w), 4) + torch.pow(1 - torch.exp(-1 * omiga_h), 4)
                if Focal:
                    return iou - torch.pow(0.5 * (distance_cost + shape_cost) + eps, alpha), torch.pow(inter / (union + eps), gamma)  # Focal_SIou
                else:
                    return iou - torch.pow(0.5 * (distance_cost + shape_cost) + eps, alpha)  # SIou
            elif WIoU:
                if Focal:
                    raise RuntimeError("WIoU do not support Focal.")
                elif scale:
                    return WIoU_Scale._scaled_loss(wiou_state), (1 - iou) * torch.exp((rho2 / c2)), iou  # WIoU https://arxiv.org/abs/2301.10051
                else:
                    return iou, torch.exp((rho2 / c2))  # WIoU v1
            if Focal:
                return iou - rho2 / c2, torch.pow(inter / (union + eps), gamma)  # Focal_DIoU
            else:
                return iou - rho2 / c2  # DIoU
        c_area = cw * ch + eps  # convex area
        if Focal:
            return iou - torch.pow((c_area - union) / c_area + eps, alpha), torch.pow(inter / (union + eps), gamma)  # Focal_GIoU https://arxiv.org/pdf/1902.09630.pdf
        else:
            return iou - torch.pow((c_area - union) / c_area + eps, alpha)  # GIoU https://arxiv.org/pdf/1902.09630.pdf
    if Focal:
        return iou, torch.pow(inter / (union + eps), gamma)  # Focal_IoU
    else:
        return iou  # IoU
讲课相关安排 - 1.进群须知: - (1) 从入群时间起,群内会员有效期为一年(一年后如有需要则续费即可) - (2) 1月份建群起开始直播讲课,逐渐直播+直播回放(而不是加群则提前录制好了全部课程) - (3) 讲课方式:qq群课堂or腾讯会议直播(具体群通知)(后进群或没参与直播的可看录屏回放) - (4) 每次直播附带直播答疑服务,每次直播约1-2小时 - (5) 一周至少一次直播课,每次直播会按照以下流程告知讲课内容 - (6) 项目不附带私人答疑服务,群里附带答疑服务,平时我有时间都会回复群里部分问题 - 2. 答疑相关细节: - (1) 直播时答疑:课前excel收集群内近日答疑问题,直播时讲解答疑问题 - (2) 群内日常答疑:群里附带答疑服务,平时我有时间都会回复群里部分问题 - 3. 讲课流程: - (1) 课前 - - 课前 先 提前告知讲课时间 && 收集讲课内容(群投票) - (群投票内容为讲课目录,投票最多的地方则为本次课需要讲解的地方,若无则按顺序讲解) - - 课前 中 选定课程目录后告知讲课内容 - - 课前 后 Excel在线表格收集该内容的相关答疑问题,上课解决(答疑问题时本人必须在场) - (2) 课中(全程1小时左右/每次课) - - 课中 先 直播讲课(按照课前定好的目录) - - 课中 中 总结讲课内容 - - 课中 后 直播答疑(按照课前Excel的收集表,课中弹幕出现的问题)--答疑期间可连麦可互动 - (3) 课后 录制回放发群里,下次讲课时间再定(根据实际情况一周2次以上,上不封顶) - (4) 课后 每周群内某个时间段免费远程解决bug问题(可Todesk远程帮忙解决) - (5) 后续项目内容会逐渐完善,会员福利会逐渐更新补充,敬请期待 ### 3. 论文项目课程目录(每次直播回放视频会对应课程目录内容,提供索引供大家后期检索) 1. 搜论文的几种方式 1.1 谷歌学术 Web of Science IEEE Springer MDPI ScienceDirect 等等 1.2 一些技巧(查看不能看的论文等等) 2. 如何参考相关论文,关键字搜索--针对性找到自己想要的参考论文 3. 写论文的方法(每个部分的写作逻辑和模版)(①介绍 ②相关工作 ③方法 ④实验 ⑤结论) 3.1 介绍-------只需要确定好课题方向即可开写(实验部分先空着) 3.1.1 写作逻辑和思路讲解 3.1.2 怎么写,该写什么 3.2 相关工作---可能会涉及到数据集,基线模型,三个创新点方面的相关工作 3.2.1 写作逻辑和思路讲解 3.2.2 怎么写,该写什么 3.3 方法-------整体框架+三到四个创新点 3.3.0 写作逻辑和思路讲解 3.3.1 画图(从入门到接近顶会水平) 3.3.2 公式(如何写公式等等) 3.3.3 文字描述创新点(快有快的方法,慢有慢的方法) 3.4 实验 3.4.0 写作逻辑和思路讲解 3.4.1 表格(该做哪些实验,该放哪些指标,授人以鱼并且授人以渔) 3.5 摘要,结论 3.5.0 写作逻辑和思路讲解 3.5.1 总结性内容一次性搞清 3.6 参考文献 3.6.1 如何引用,引用格式 4. 投稿选择(会议 or 期刊) 4.1 EI论文 4.2 CCF论文 4.3 SCI论文---如何筛选自己适合投哪些期刊 4.4 中文核心 or 北大核心 or 学报 5. 论文规范 5.1 审美,格式规范 5.2 论文逻辑严谨 5.3 论文书写有说服力 5.4 投稿前先预审稿 6. 独特技巧经验,高效技巧(讲课过程中会随机穿插小技巧,不过多解释,懂的都懂) 7. 投稿前的一些准备工作,根据期刊等级帮忙查看是否达到发论文的要求(一对一范畴) 8. 硕士毕业大论文书写 9. 持续更新中........ ================================================ FILE: yolo-improve/readme.md ================================================ # YOLO-Improve 这个项目主要是提供一些关于yolo系列模型的改进思路,效果因数据集和参数而异,仅作参考。 # Explanation - **iou** 添加EIOU,SIOU,ALPHA-IOU, FocalEIOU, Wise-IOU到yolov5,yolov8的box_iou中. 1. yolov5 视频教学地址:[哔哩哔哩](https://www.bilibili.com/video/BV1KM411b7Sz/).
博客地址:[CSDN](https://blog.csdn.net/qq_37706472/article/details/128737484?spm=1001.2014.3001.5501). #### 2023-2-8 更新: 新增[Wise-IoU](https://arxiv.org/abs/2301.10051) 视频教学地址:[哔哩哔哩](https://www.bilibili.com/video/BV1tG4y1N7Gk/). reference:[github](https://github.com/Instinct323/wiou) 2. yolov8 视频教学地址:[哔哩哔哩](https://www.bilibili.com/video/BV1PY4y1o7Hm/). 博客地址:[CSDN](https://blog.csdn.net/qq_37706472/article/details/128743012?spm=1001.2014.3001.5502). #### 2023-2-7 更新: 新增[Wise-IoU](https://arxiv.org/abs/2301.10051) 视频教学地址:[哔哩哔哩](https://www.bilibili.com/video/BV1De4y1N7Mb/). reference:[github](https://github.com/Instinct323/wiou) - **yolov5-GFPN** 使用DAMO-YOLO中的GFPN替换YOLOV5中的Head. 视频教学地址:[哔哩哔哩](https://www.bilibili.com/video/BV1iR4y1a7bx/). - **yolov5-C2F** 使用yolov8中的C2F模块替换yolov5中的C3模块.(这个操作比较简单,因此就不提供代码,直接看视频操作一下即可) 视频教学地址:[哔哩哔哩](https://www.bilibili.com/video/BV1rx4y1g7xt/). - **yolov7-iou** 添加EIOU,SIOU,ALPHA-IOU, FocalEIOU, Wise-IOU到yolov7的box_iou中. 视频教学地址:[哔哩哔哩](https://www.bilibili.com/video/BV1zx4y177EF/). 博客地址:[CSDN](https://blog.csdn.net/qq_37706472/article/details/128780275?spm=1001.2014.3001.5502). #### 2023-2-11 更新: 新增[Wise-IoU](https://arxiv.org/abs/2301.10051) 视频教学地址:[哔哩哔哩](https://www.bilibili.com/video/BV1yv4y147kf/). reference:[github](https://github.com/Instinct323/wiou) - **yolov5-OTA** 添加Optimal Transport Assignment到yolov5的Loss中. 视频教学地址:[哔哩哔哩](https://www.bilibili.com/video/BV1xD4y1J76n/). - **yolov5-DCN** 添加Deformable convolution V2到yolov5中. 视频教学地址:[哔哩哔哩](https://www.bilibili.com/video/BV1rT411Q76q/). - **yolov8-DCN** 添加Deformable convolution V2到yolov8中. 视频教学地址:[哔哩哔哩](https://www.bilibili.com/video/BV1Fo4y1i7Mm/). - **yolov7-DCN** 添加Deformable convolution V2到yolov7中. 视频教学地址:[哔哩哔哩](https://www.bilibili.com/video/BV17R4y1q7vr/). - **yolov5-AUX** 添加辅助训练分支到yolov5中. 视频教学地址:[哔哩哔哩](https://www.bilibili.com/video/BV1Fo4y1v7bi/). 原理参考链接:[知乎](https://zhuanlan.zhihu.com/p/588947172) - **CAM** 添加context augmentation module到yolov5中. 
视频教学地址:[哔哩哔哩](https://www.bilibili.com/video/BV17b411d7ef/). paper:[链接](https://openreview.net/pdf?id=q2ZaVU6bEsT) - **yolov5-SAConv** 添加SAC到yolov5中. 视频教学地址:[哔哩哔哩](https://www.bilibili.com/video/BV1xD4y1u7NU/). paper:[链接](https://arxiv.org/pdf/2006.02334.pdf) reference: [链接](https://github.com/joe-siyuan-qiao/DetectoRS) - **yolov7-SAConv** 添加SAC到yolov7中. 视频教学地址:[哔哩哔哩](https://www.bilibili.com/video/BV1xD4y1u7NU/). paper:[链接](https://arxiv.org/pdf/2006.02334.pdf) reference: [链接](https://github.com/joe-siyuan-qiao/DetectoRS) - **yolov5-CoordConv** 添加CoordConv到yolov5中. 视频教学地址:[哔哩哔哩](https://www.bilibili.com/video/BV1ng4y1E7rS/). reference: [链接](https://blog.csdn.net/qq_35608277/article/details/125257225) - **yolov5-soft-nms** 添加soft-nms(IoU,GIoU,DIoU,CIoU,EIoU,SIoU)到yolov5中. 视频教学地址:[哔哩哔哩](https://www.bilibili.com/video/BV1cM41147Ry/). - **yolov7-CoordConv** 添加CoordConv到yolov7中. 视频教学地址:[哔哩哔哩](https://www.bilibili.com/video/BV1K54y1g7ye/). reference: [链接](https://blog.csdn.net/qq_35608277/article/details/125257225) - **yolov7-soft-nms** 添加soft-nms(IoU,GIoU,DIoU,CIoU,EIoU,SIoU)到yolov7中. 视频教学地址:[哔哩哔哩](https://www.bilibili.com/video/BV1ZY41167iC/). - **yolov5-DSConv** 添加DSConv到yolov5中. 视频教学地址:[哔哩哔哩](https://www.bilibili.com/video/BV1iT411a7Mi/). paper: [链接](https://arxiv.org/abs/1901.01928) reference: [链接](https://github.com/ActiveVisionLab/DSConv) - **yolov7-DSConv** 添加DSConv到yolov7中. 视频教学地址:[哔哩哔哩](https://www.bilibili.com/video/BV1724y1b7PD/). paper: [链接](https://arxiv.org/abs/1901.01928) reference: [链接](https://github.com/ActiveVisionLab/DSConv) - **yolov5-DCNV3** 添加DCNV3到yolov5中. 视频教学地址:[哔哩哔哩](https://www.bilibili.com/video/BV1LY411z7iE/). 补充事项-视频教学地址:[哔哩哔哩](https://www.bilibili.com/video/BV1Dv4y1j7ij/). paper: [链接](https://arxiv.org/abs/2211.05778) reference: [链接](https://github.com/OpenGVLab/InternImage) - **yolov5-NWD** 添加Normalized Gaussian Wasserstein Distance到yolov5中. 视频教学地址:[哔哩哔哩](https://www.bilibili.com/video/BV1zY4y197UP/). 
paper: [链接](https://arxiv.org/abs/2110.13389) reference: [链接](https://github.com/jwwangchn/NWD) - **yolov7-DCNV3** 添加DCNV3到yolov7中. 视频教学地址:[哔哩哔哩](https://www.bilibili.com/video/BV1mk4y1h7us/). paper: [链接](https://arxiv.org/abs/2211.05778) reference: [链接](https://github.com/OpenGVLab/InternImage) - **yolov5-DecoupledHead** 添加Efficient-DecoupledHead到yolov5中. 视频教学地址:[哔哩哔哩](https://www.bilibili.com/video/BV1mk4y1h7us/). paper: [yolov6链接](https://arxiv.org/pdf/2301.05586.pdf) reference: [链接](https://github.com/meituan/YOLOv6/blob/main/yolov6/models/effidehead.py) - **yolov5-FasterBlock** 添加FasterNet中的Faster-Block到yolov5中. 视频教学地址:[哔哩哔哩](https://www.bilibili.com/video/BV1Bs4y1H7Ph/). paper: [链接](https://arxiv.org/abs/2303.03667) reference: [链接](https://github.com/JierunChen/FasterNet) - **yolov7-NWD** 添加Normalized Gaussian Wasserstein Distance到yolov7中. 视频教学地址:[哔哩哔哩](https://www.bilibili.com/video/BV1kM411H7g1/). paper: [链接](https://arxiv.org/abs/2110.13389) reference: [链接](https://github.com/jwwangchn/NWD) - **yolov7-DecoupledHead** 添加具有隐式知识学习的Efficient-DecoupledHead到yolov7中. 视频教学地址:[哔哩哔哩](https://www.bilibili.com/video/BV1tg4y1x7ha/). paper: [yolov6链接](https://arxiv.org/pdf/2301.05586.pdf) [yolor链接](https://arxiv.org/abs/2105.04206) [yolor参考博客](https://blog.csdn.net/AaronYKing/article/details/123804988) reference: [链接](https://github.com/meituan/YOLOv6/blob/main/yolov6/models/effidehead.py) - **yolov5-backbone** 添加Timm支持的主干到yolov5中. 需要安装timm库. 命令: pip install -i https://pypi.tuna.tsinghua.edu.cn/simple timm 视频教学地址:[哔哩哔哩](https://www.bilibili.com/video/BV1Mx4y1A7jy/). reference: [链接](https://github.com/huggingface/pytorch-image-models#:~:text=I%20missed%20anything.-,Models,-All%20model%20architecture) - **yolov7-PConv** 添加FasterNet中的PConv到yolov7中. 视频教学地址:[哔哩哔哩](https://www.bilibili.com/video/BV1Z84y137oi/). paper: [链接](https://arxiv.org/abs/2303.03667) reference: [链接](https://github.com/JierunChen/FasterNet) - **yolov5-TSCODE** 添加Task-Specific Context Decoupling到yolov5中. 
需要安装einops库. 命令: pip install -i https://pypi.tuna.tsinghua.edu.cn/simple einops 视频教学地址:[哔哩哔哩](https://www.bilibili.com/video/BV1mk4y1h7us/). paper: [yolov6链接](https://arxiv.org/pdf/2303.01047v1.pdf) - **yolov5-backbone/fasternet** 添加FasterNet主干到yolov5中. 视频教学地址:[哔哩哔哩](https://www.bilibili.com/video/BV1ra4y1K77u/). reference: [链接](https://github.com/JierunChen/FasterNet) - **yolov5-backbone/ODConv** 添加Omni-Dimensional Dynamic Convolution主干(od_mobilenetv2,od_resnet)到yolov5中. 视频教学地址:[哔哩哔哩](https://www.bilibili.com/video/BV1Jk4y1v7EW/). reference: [链接](https://github.com/OSVAI/ODConv) - **yolov5-backbone/ODConvFuse** 融合Omni-Dimensional Dynamic Convolution主干(od_mobilenetv2,od_resnet)中的Conv和BN. 视频教学地址:[哔哩哔哩](https://www.bilibili.com/video/BV1Rs4y1N7fp/). - **yolov5-CARAFE** 添加轻量级上采样算子CARAFE到yolov5中. 视频教学地址:[哔哩哔哩](https://www.bilibili.com/video/BV1kj411c72a/). [yolov7修改视频-哔哩哔哩](https://www.bilibili.com/video/BV1yc411p7wL/). reference: [链接](https://github.com/XiaLiPKU/CARAFE) - **yolov5-EVC** 添加CFPNet中的EVC-Block到yolov5中. 视频教学地址:[哔哩哔哩](https://www.bilibili.com/video/BV1Pg4y1u7cM/). reference: [链接](https://github.com/QY1994-0919/CFPNet) - **yolov5-dyhead** 添加基于注意力机制的目标检测头(DYHEAD)到yolov5中. yolov7版本: [哔哩哔哩](https://www.bilibili.com/video/BV1Ph4y1s7i9/). 安装命令: pip install -U openmim mim install mmengine mim install "mmcv>=2.0.0" 视频教学地址:[哔哩哔哩](https://www.bilibili.com/video/BV1qs4y117Mx/). reference: [链接](https://github.com/open-mmlab/mmdetection/blob/main/mmdet/models/necks/dyhead.py) paper: [链接](https://arxiv.org/abs/2106.08322) - **yolov5-backbone/inceptionnext** 添加(2023年New)InceptionNeXt主干到yolov5中. 视频教学地址:[哔哩哔哩](https://www.bilibili.com/video/BV12v4y1H7E1/). reference: [链接](https://github.com/sail-sg/inceptionnext) paper: [链接](https://arxiv.org/pdf/2303.16900.pdf) - **yolov5-aLRPLoss** 添加aLRPLoss到yolov5中. 视频教学地址:[哔哩哔哩](https://www.bilibili.com/video/BV1YV4y1Z7rV/). 
reference: [链接](https://github.com/kemaloksuz/aLRPLoss) paper: [链接](https://arxiv.org/abs/2009.13592) - **yolov5-res2block** 结合Res2Net提出具有多尺度提取能力的C3模块. 视频教学地址:[哔哩哔哩](https://www.bilibili.com/video/BV13X4y167VB/). reference: [链接](https://github.com/Res2Net/Res2Net-PretrainedModels) paper: [链接](https://arxiv.org/pdf/1904.01169.pdf) - **yolov7-odconv** 添加Omni-Dimensional Dynamic Convolution到yolov7中. 视频教学地址:[哔哩哔哩](https://www.bilibili.com/video/BV1vh411j71Z/). reference: [链接](https://github.com/OSVAI/ODConv) - **yolov5-backbone/FocalNet** 添加(2022年)FocalNet(transformer)主干到yolov5中. 视频教学地址:[哔哩哔哩](https://www.bilibili.com/video/BV1ch411L7Dk/). reference: [链接](https://github.com/microsoft/FocalNet) paper: [链接](https://arxiv.org/abs/2203.11926) - **yolov5-backbone/EMO** 添加(2023年)EMO(transformer)主干到yolov5中. 视频教学地址:[哔哩哔哩](https://www.bilibili.com/video/BV1Dh4y1J7SV/). reference: [链接](https://github.com/zhangzjn/EMO) paper: [链接](https://arxiv.org/pdf/2301.01146.pdf) - **yolov5-backbone/EfficientFormerV2** 添加(2022年)EfficientFormerV2(transformer)主干到yolov5中. 视频教学地址:[哔哩哔哩](https://www.bilibili.com/video/BV1da4y1g7KT/). reference: [链接](https://github.com/snap-research/EfficientFormer) paper: [链接](https://arxiv.org/pdf/2212.08059.pdf) weight_download: [百度网盘链接](https://pan.baidu.com/s/1I0Ygc3-6fNf2LdIJe290kw?pwd=yvc8) - **yolov5-backbone/PoolFormer** 添加(2022年CVPR)PoolFormer(transformer)主干到yolov5中. 视频教学地址:[哔哩哔哩](https://www.bilibili.com/video/BV1eh411c7bz/). reference: [链接](https://github.com/sail-sg/poolformer) paper: [链接](https://arxiv.org/abs/2111.11418) - **yolov5-backbone/EfficientViT** 添加(2023年)EfficientViT(transformer)主干到yolov5中. 视频教学地址:[哔哩哔哩](https://www.bilibili.com/video/BV1xk4y1L7Gu/). reference: [链接](https://github.com/mit-han-lab/efficientvit) paper: [链接](https://arxiv.org/abs/2205.14756) weight_download: [百度网盘链接](https://pan.baidu.com/s/1dvwuQQBnRCr7aGReY8IEVw?pwd=74ad) - **yolov5-ContextAggregation** 添加ContextAggregation到yolov5中. 
视频教学地址:[哔哩哔哩](https://www.bilibili.com/video/BV1Yk4y1s7Kx/). reference: [链接](https://github.com/yeliudev/CATNet) paper: [链接](https://arxiv.org/abs/2111.11057) - **yolov5-backbone/VanillaNet** 添加(2023年)VanillaNet主干到yolov5中. 视频教学地址:[哔哩哔哩](https://www.bilibili.com/video/BV1os4y1v7Du/). reference: [链接](https://github.com/huawei-noah/VanillaNet) paper: [链接](https://arxiv.org/abs/2305.12972) weight_download: [百度网盘链接](https://pan.baidu.com/s/1EBAiOtDVMhvQqu2NWoFSIg?pwd=ofx9) - **yolov7-EVC** 添加CFPNet中的EVC-Block到yolov7中. 视频教学地址:[哔哩哔哩](https://www.bilibili.com/video/BV12u4y1f7np/). reference: [链接](https://github.com/QY1994-0919/CFPNet) - **yolov7-head** P2,P6检测层在YOLOV7中的添加. 视频教学地址:[哔哩哔哩](https://www.bilibili.com/video/BV1LX4y1a72m/). - **yolov7-slimneck** 使用VOVGSCSP轻量化yolov7的Neck. 视频教学地址:[哔哩哔哩](https://www.bilibili.com/video/BV14m4y147PC/). reference: [链接](https://github.com/AlanLi1997/slim-neck-by-gsconv) - **yolov5-SwinTransformer** 添加SwinTransformer-Tiny主干到yolov5中. 视频教学地址:[哔哩哔哩](https://www.bilibili.com/video/BV1WX4y1a7ea/). reference: [链接](https://github.com/microsoft/Swin-Transformer) weight_download: [SwinTransformer-Tiny百度云链接](https://pan.baidu.com/s/1vct0VYwwQQ8PYkBjwSSBZQ?pwd=swin) - **yolov5-NextViT** 添加(2022年)NextViT主干到yolov5中. 视频教学地址:[哔哩哔哩](https://www.bilibili.com/video/BV1im4y1i7Ht/). reference: [链接](https://github.com/bytedance/Next-ViT) weight_download: [百度云链接](https://pan.baidu.com/s/18IHKssf9kN8Ej7zIWBKfcw?pwd=houj) - **yolov5-ConvNextV2** 添加(2023年)ConvNextV2主干到yolov5中. 视频教学地址:[哔哩哔哩](https://www.bilibili.com/video/BV1es4y1e7b9/). reference: [链接](https://github.com/facebookresearch/ConvNeXt-V2) - **yolov5-RIFormer** 添加(2023年)RIFormer主干到yolov5中. 视频教学地址:[哔哩哔哩](https://www.bilibili.com/video/BV1bW4y1X7Lo/). 
reference: [mmpretrain链接](https://github.com/open-mmlab/mmpretrain/blob/main/mmpretrain/models/backbones/riformer.py) weight_download: [mmpretrain链接](https://github.com/open-mmlab/mmpretrain/tree/main/configs/riformer) - **yolov5-C3RFEM** Scale-Aware RFE与C3结合而成的C3RFEM添加到yolov5中. 视频教学地址:[哔哩哔哩](https://www.bilibili.com/video/BV1Gj411D7Pf/). reference: [链接](https://github.com/Krasjet-Yu/YOLO-FaceV2) - **yolov7-RFEM** Scale-Aware RFE添加到yolov7中. 视频教学地址:[哔哩哔哩](https://www.bilibili.com/video/BV1hW4y1D7gQ/). reference: [链接](https://github.com/Krasjet-Yu/YOLO-FaceV2) - **yolov5-DBB** 把重参数结构DiverseBranchBlock与C3融合成C3-DBB添加到yolov5中. 视频教学地址:[哔哩哔哩](https://www.bilibili.com/video/BV1sM4y177Cn/). reference: [链接](https://github.com/DingXiaoH/DiverseBranchBlock) - **yolov7-DBB** 把重参数结构DiverseBranchBlock添加到yolov7中. 视频教学地址:[哔哩哔哩](https://www.bilibili.com/video/BV14u411b7kL/). reference: [链接](https://github.com/DingXiaoH/DiverseBranchBlock) - **yolov5-backbone/CVPR2023-EfficientViT** 添加(2023CVPR)EfficientViT(transformer)主干到yolov5中. 视频教学地址:[哔哩哔哩](https://www.bilibili.com/video/BV1xk4y1L7Gu/). reference: [链接](https://github.com/microsoft/Cream/tree/main/EfficientViT) paper: [链接](https://arxiv.org/pdf/2305.07027.pdf) weight: [github链接](https://github.com/xinyuliu-jeffrey/EfficientViT_Model_Zoo/releases/tag/v1.0) - **yolov5-backbone/LSKNet** 添加(2023旋转目标检测SOTA)LSKNet主干到yolov5中. 视频教学地址:[哔哩哔哩](https://www.bilibili.com/video/BV1xk4y1L7Gu/). reference: [链接](https://github.com/zcablii/LSKNet) paper: [链接](https://arxiv.org/pdf/2303.09030.pdf) - **yolov5-MPDiou** 添加(2023最新IoU度量算法)MPDiou到yolov5中.(视频教学地址中为详细从头手把手教学,因此本项没有提供代码) 视频教学地址:[哔哩哔哩](https://www.bilibili.com/video/BV19P41147gJ/). paper: [链接](https://arxiv.org/pdf/2307.07662v1.pdf) - **yolov7-MPDiou** 添加(2023最新IoU度量算法)MPDiou到yolov7中. 视频教学地址:[哔哩哔哩](https://www.bilibili.com/video/BV1Qh4y1r7D3/). paper: [链接](https://arxiv.org/pdf/2307.07662v1.pdf) - **yolov5-SlideLoss** 添加Yolo-Face-V2中SlideLoss的到yolov5中. 
视频教学地址:[哔哩哔哩](https://www.bilibili.com/video/BV1W14y1i79U/). reference: [链接](https://github.com/Krasjet-Yu/YOLO-FaceV2/blob/master/utils/loss.py) paper: [链接](https://arxiv.org/abs/2208.02019) - **yolov5-backbone/CVPR2023-RepViT** 添加RepViT(transformer)主干到yolov5中. 视频教学地址:[哔哩哔哩](https://www.bilibili.com/video/BV1PH4y1S7mf/). reference: [链接](https://github.com/THU-MIG/RepViT) paper: [链接](https://arxiv.org/abs/2307.09283) - **yolov5-GOLDYOLO** 利用华为2023最新GOLD-YOLO中的Gatherand-Distribute进行改进YOLOV5中的特征融合模块. 视频教学地址:[哔哩哔哩](https://www.bilibili.com/video/BV1PH4y1S7mf/). reference: [链接](https://github.com/huawei-noah/Efficient-Computing/tree/master/Detection/Gold-YOLO) paper: [链接](https://arxiv.org/abs/2309.11331) - **yolov7-GOLDYOLO(文件在yolov5-GOLDYOLO的文件夹中)** 利用华为2023最新GOLD-YOLO中的Gatherand-Distribute进行改进YOLOV7中的特征融合模块. 视频教学地址:[哔哩哔哩](https://www.bilibili.com/video/BV14V411c7H1/). reference: [链接](https://github.com/huawei-noah/Efficient-Computing/tree/master/Detection/Gold-YOLO) paper: [链接](https://arxiv.org/abs/2309.11331) - **yolov5-DySnakeConv** 利用动态蛇形卷积改进YOLOV5. 视频教学地址:[哔哩哔哩](https://www.bilibili.com/video/BV1Qu411K7Hw/). reference: [链接](https://github.com/YaoleiQi/DSCNet) paper: [链接](https://arxiv.org/abs/2307.08388) - **yolov7-DySnakeConv** 利用动态蛇形卷积改进YOLOV7. 视频教学地址:[哔哩哔哩](https://www.bilibili.com/video/BV1Wj411x7fq/). reference: [链接](https://github.com/YaoleiQi/DSCNet) paper: [链接](https://arxiv.org/abs/2307.08388) - **yolov5-AIFI** 利用带有位置信息编码的AIFI自注意力机制改进YOLOV5. 视频教学地址:[哔哩哔哩](https://www.bilibili.com/video/BV1nu4y1h7eS/). reference: [链接](https://github.com/lyuwenyu/RT-DETR) paper: [链接](https://arxiv.org/pdf/2304.08069.pdf) - **yolov7-AIFI** 利用带有位置信息编码的AIFI自注意力机制改进YOLOV7. 视频教学地址:[哔哩哔哩](https://www.bilibili.com/video/BV1rj411a7s4/). reference: [链接](https://github.com/lyuwenyu/RT-DETR) paper: [链接](https://arxiv.org/pdf/2304.08069.pdf) - **yolov5-backbone/UniRepLKNet** 添加UniRepLKNet主干到yolov5中. 视频教学地址:[哔哩哔哩](https://www.bilibili.com/video/BV1PH4y1S7mf/). 
reference: [链接](https://github.com/AILab-CVC/UniRepLKNet) paper: [链接](https://arxiv.org/abs/2311.15599) weights-download: [百度云链接](https://pan.baidu.com/s/1Gk48Xa6cWKAVJgsF5cqk1g?pwd=b55v) - **yolov5-asf** 添加Attentional Scale Sequence Fusion到yolov5中. 视频教学地址:[哔哩哔哩](https://www.bilibili.com/video/BV1kN411V7VZ/). reference: [链接](https://github.com/mkang315/ASF-YOLO) paper: [链接](https://arxiv.org/abs/2312.06458) - **yolov5-ccfm** 添加cross-scale feature-fusion到yolov5中. 视频教学地址:[哔哩哔哩](https://www.bilibili.com/video/BV1Tb4y1P7yd/). reference: [链接](https://github.com/ultralytics/ultralytics) paper: [链接](https://arxiv.org/pdf/2304.08069.pdf) - **yolov7-asf** 添加Attentional Scale Sequence Fusion到yolov7中. 视频教学地址:[哔哩哔哩](https://www.bilibili.com/video/BV1PH4y1S7mf/). reference: [链接](https://github.com/mkang315/ASF-YOLO) paper: [链接](https://arxiv.org/abs/2312.06458) - **yolov5-RepNCSPELAN** 添加yolov9中的RepNCSPELAN到yolov5中. 视频教学地址:[哔哩哔哩](https://www.bilibili.com/video/BV17y421z73k/). reference: [链接](https://github.com/WongKinYiu/yolov9) paper: [链接](https://arxiv.org/abs/2402.13616) - **yolov7-RepNCSPELAN** 添加yolov9中的RepNCSPELAN到yolov7中. 视频教学地址:[哔哩哔哩](https://www.bilibili.com/video/BV1UA4m137hz/). reference: [链接](https://github.com/WongKinYiu/yolov9) paper: [链接](https://arxiv.org/abs/2402.13616) - **yolov9-backbone** 添加各种backbone到yolov9中. 视频教学地址:[哔哩哔哩](https://www.bilibili.com/video/BV1Ax4y1B7Ln/). - **yolov5-backbone/CVPR2024-StarNet** 添加CVPR2024-StarNet到yolov5、yolov7、yolov9中. 视频教学地址:[哔哩哔哩](https://www.bilibili.com/video/BV1Ax4y1B7Ln/). ================================================ FILE: yolo-improve/rtdetr-compress.md ================================================ # RTDETR剪枝项目介绍 ## 对于群里的剪枝相关问题,我基本都会回复,对于一些剪枝问题,我都会给出建议。 ### 首先剪枝是什么? 模型剪枝是深度学习中的一种技术,旨在通过减少神经网络中不必要的参数和连接,来优化模型的效率和性能。模型剪枝可以分为结构剪枝和参数剪枝两种类型。 ### 为什么需要剪枝? 剪枝可以很好地衡量模型轻量化程度与精度的关系,是替换轻量化结构完全没办法比的,比如我模型剪枝可以压缩百分之30的计算量,精度只下降了百分之1,但是你通过换模块来达到压缩百分之30的计算量,一般时间就会变长,因为大部分轻量化模块都是由时间换空间,而且精度还会下降得比较多,但是剪枝可以很好地避免这个问题. 
### 目前剪枝项目包含以下剪枝方法: 1. L1 2. Random 3. Slim(需要稀疏训练) 4. GroupSlim(需要稀疏训练) 5. GroupNorm 6. LAMP 7. GroupSL(需要稀疏训练) 8. GroupReg(需要稀疏训练) 9. GroupHessian 10. GroupTaylor # 对于RTDETR模型,稀疏训练比较难成功,就算能稀疏到模型,掉的精度都比较多,所以我不建议各位使用需要稀疏训练的方法去剪枝,本身RTDETR的训练速度就比较慢,稀疏训练会更加慢一点,所以如果你入手剪枝项目的目的之一是一定要使用需要稀疏训练的方法,那请慎入!!!!! ### 其中prune系列还有一些细节: 1. 支持设定加速比例,模型会进行自动压缩,压缩到指定比例或者达到最大压缩次数后会自动进入finetune。 ### 剪枝的一些顾虑 大家关心最多的一个问题就是,我的结构能不能剪之类的,剪枝对模型复杂度的要求比较高,目前剪枝都是基于Torch_Pruning库进行剪枝,prune系列的可以跳过一些不能剪枝的层(某些复杂的结构可能在构建动态图的时候失败,这些就只能换结构),这个项目会有比较多的示例和视频教程教大家如何去剪自己的结构,注意点在哪里等等。这个剪枝项目是没办法保证所有的结构都能剪,有一定的风险,是否入手请自行考虑! ### 哪些人群建议入手剪枝 1. 原始的算法精度很高,没办法再提升精度,只能走轻量化路线,这种建议配合一些轻量化模块+剪枝来增加你的工作量和创新度. 2. 需要部署到嵌入式或者手机端等低算力设备,这类本身模型就不能太复杂,而且以轻量化为主,剪枝是非常适合的. 3. 以后需从事深度学习方面的工作,模型轻量化(蒸馏、量化、剪枝)基本是必须要会的技能. ### RTDETR相关实验 GPU-Device:RTX4090D (以下实验中Model Size标为x的,是因为我当时记录的数据有点错误,因此直接略掉) #### Dataset:VisDrone2019 Model:RTDETR-R18 | model | Parameters | GFLOPs | Model Size | mAP50 | mAP50-95 | Inference Time(bs:8) | | :----: | :----: | :----: | :----: | :----: | :----: | :----: | | BaseLine | 19,884,600 | 57.0 | x | 0.377 | 0.219 | 0.00305s | | LAMP exp1 | 13,458,528(67.7%) | 36.6(64.2%) | x | 0.356(-0.021) | 0.205(-0.014) | 0.00247s(81%) | | LAMP exp2 | 12,279,364(61.7%) | 32.9(57.7%) | x | 0.347(-0.030) | 0.199(-0.020) | 0.00242s(79%) | | LAMP exp3 | 15,729,152(79.1%) | 43.6(76.5%) | x | 0.366(-0.011) | 0.211(-0.008) | 0.00277s(91%) | | LAMP exp4 | 14,321,866(72.0%) | 39.1(68.6%) | x | 0.363(-0.014) | 0.21(-0.009) | 0.00260s(85%) | #### Dataset:CrowdHuman Model:RTDETR-R18 | model | Parameters | GFLOPs | Model Size | mAP50 | mAP50-95 | Inference Time(bs:8) | | :----: | :----: | :----: | :----: | :----: | :----: | :----: | | BaseLine | 19,874,328 | 56.9 | x | 0.848 | 0.552 | 0.00306s | | LAMP exp1 | 14,311,594(72.0%) | 39.1(68.7%) | x | 0.837(-0.011) | 0.543(-0.009) | 0.00259s(85%) | #### Dataset:Seaship 20%Training Data Model:RTDETR-R18 | model | Parameters | GFLOPs | Model Size | mAP50 | mAP50-95 | Inference Time(bs:8) | | :----: |
:----: | :----: | :----: | :----: | :----: | :----: | | BaseLine | 19,879,464 | 57.0 | x | 0.951 | 0.73 | 0.00304s | | LAMP | 7,091,768(35.7%) | 32.1(56.3%) | x | 0.934(-0.017) | 0.73(+0.000) | 0.00239s(79%) | | L1 | 7,712,000(38.8%) | 33.1(58.1%) | x | 0.935(-0.016) | 0.739(+0.009) | 0.00239s(79%) | | GROUP_TAYLOR | 13,160,368(66.2%) | 31.9(55.9%) | x | 0.942(-0.009) | 0.734(+0.004) | 0.00212s(70%) | | GROUP_NORM | 9,752,072(49.0%) | 31.7(55.6%) | x | 0.951(0.000) | 0.74(+0.010) | 0.00228s(75%) | | GROUP_HESSIAN | 11,405,392(57.4%) | 31.5(55.3%) | x | 0.94(-0.011) | 0.746(+0.016) | 0.00225s(74%) | ================================================ FILE: yolo-improve/rtdetr-distill.md ================================================ # RTDETR蒸馏项目介绍 ### 首先蒸馏是什么? 模型蒸馏(Model Distillation)是一种用于在计算机视觉中提高模型性能和效率的技术。在模型蒸馏中,通常存在两个模型,即“教师模型”和“学生模型”。 ### 为什么需要蒸馏? 1. 在不增加模型计算量和参数量的情况下提升精度,也即是可以无损提高精度。 2. 论文中的保底手段,因为蒸馏的特殊性,其都不会增加参数量和计算量,可以在最后一个点上大幅度增加实验和工作量,因为本身蒸馏也需要做大量实验。 3. 如果在模型改进过程中进行了轻量化,但是精度降低得有点多,可以尝试使用知识蒸馏来弥补轻量化带来的精度丢失问题。 ### 目前蒸馏方法包含: 1. Logical 1. RTDETRLogicLoss(根据rtdetr的特点进行开发的逻辑蒸馏) 2. RTDETRMutilLogicLoss(根据rtdetr的特点进行开发的逻辑蒸馏) 2. Feature 1. [Mimic](https://openaccess.thecvf.com/content_cvpr_2017/papers/Li_Mimicking_Very_Efficient_CVPR_2017_paper.pdf) 2. [Masked Generative Distillation](https://link.zhihu.com/?target=https%3A//arxiv.org/pdf/2205.01529.pdf) (ECCV 2022) 3. [Channel-wise Distillation](https://arxiv.org/pdf/2011.13256.pdf) (ICCV 2021) 4. [ChSimLoss Distillation](https://openaccess.thecvf.com/content/ICCV2021/html/Liu_Exploring_Inter-Channel_Correlation_for_Diversity-Preserved_Knowledge_Distillation_ICCV_2021_paper.html) (ICCV2021) 5. [SPKDLoss Distillation](https://arxiv.org/pdf/1907.09682.pdf) (ICCV2019) ### 知识蒸馏的一些细节(具体项目会提供视频讲解) 1. Feature蒸馏可以自定义选择层进行蒸馏. 2. 蒸馏损失支持常数,线性,余弦进行动态调整. 3. 支持Logical和Feature一起使用. 4. 过程中会输出Logical和Feature的损失,让用户可以及时调整对应的损失系数. 5. 支持正常训练模型时候进行蒸馏和剪枝后finetune蒸馏. 6. 支持自蒸馏. 7. 可以利用知识蒸馏压缩模型.
# 实验示例结果.(以下示例实验相关命令,视频教程,实验数据都在项目里面) #### Dataset:Visdrone(训练集只用了2500张图,验证集和测试集用了全量的数据) 为了加速实验,老师选择了yolov8s-detr,学生选择了yolov8n-detr | model | GFLOPs | mAP50(test set) | mAP50-95(test set) | | :----: | :----: | :----: | :----: | | yolov8n-detr | 11.7 | 0.266 | 0.146 | | yolov8s-detr | 27.3 | 0.286 | 0.161 | | yolov8n-detr logloss exp1 | 11.7 | 0.272(+0.006) | 0.153(+0.007) | | yolov8n-detr logloss exp2 | 11.7 | 0.278(+0.012) | 0.157(+0.011) | | yolov8n-detr logloss exp3 | 11.7 | 0.271(+0.005) | 0.154(+0.008) | | yolov8n-detr logloss exp4 | 11.7 | 0.282(+0.016) | 0.160(+0.014) | | yolov8n-detr cwd exp1 | 11.7 | 0.255(-0.011) | 0.139(-0.007) | | yolov8n-detr cwd exp2 | 11.7 | 0.267(+0.001) | 0.148(+0.002) | | yolov8n-detr cwd exp3 | 11.7 | 0.268(+0.002) | 0.149(+0.003) | | yolov8n-detr cwd exp4 | 11.7 | 0.261(-0.005) | 0.146(0.000) | | yolov8n-detr cwd exp5 | 11.7 | 0.266(0.000) | 0.147(+0.001) | | yolov8n-detr cwd exp6 | 11.7 | 0.264(-0.002) | 0.146(0.000) | | yolov8n-detr cwd exp7 | 11.7 | 0.260(-0.006) | 0.144(-0.002) | | yolov8n-detr cwd exp8 | 11.7 | 0.268(+0.002) | 0.148(+0.002) | | yolov8n-detr cwd exp9 | 11.7 | 0.269(+0.003) | 0.149(+0.003) | | yolov8n-detr cwd exp10 | 11.7 | 0.267(+0.001) | 0.147(+0.001) | | yolov8n-detr cwd exp11 | 11.7 | 0.257(-0.009) | 0.141(-0.005) | | yolov8n-detr mgd exp1 | 11.7 | 0.271(+0.005) | 0.152(+0.006) | | yolov8n-detr mgd exp2 | 11.7 | 0.265(-0.001) | 0.148(+0.002) | | yolov8n-detr mgd exp3 | 11.7 | 0.269(+0.003) | 0.150(+0.004) | | yolov8n-detr mgd exp4 | 11.7 | 0.265(-0.001) | 0.147(+0.001) | | yolov8n-detr mgd exp5 | 11.7 | 0.264(-0.002) | 0.146(0.000) | | yolov8n-detr mgd exp6 | 11.7 | 0.270(+0.004) | 0.151(+0.005) | | yolov8n-detr mgd exp7 | 11.7 | 0.260(-0.006) | 0.145(-0.001) | | yolov8n-detr mgd exp8 | 11.7 | 0.271(+0.005) | 0.152(+0.006) | | yolov8n-detr shsim exp1 | 11.7 | 0.264(-0.002) | 0.147(+0.001) | | yolov8n-detr shsim exp2 | 11.7 | 0.266(0.000) | 0.148(+0.002) | | yolov8n-detr shsim exp3 | 11.7 | 0.260(-0.006) 
| 0.143(-0.003) | | yolov8n-detr spkd exp1 | 11.7 | 0.259(-0.007) | 0.143(-0.003) | | yolov8n-detr spkd exp2 | 11.7 | 0.256(-0.010) | 0.142(-0.004) | | yolov8n-detr spkd exp3 | 11.7 | 0.262(-0.004) | 0.145(-0.001) | | yolov8n-detr logloss-mgd exp1 | 11.7 | 0.277(+0.011) | 0.157(+0.011) | | yolov8n-detr logloss-cwd exp1 | 11.7 | 0.274(+0.008) | 0.151(+0.005) | | yolov8n-detr logloss-cwd exp2 | 11.7 | 0.272(+0.006) | 0.153(+0.007) | ================================================ FILE: yolo-improve/rtdetr-project.md ================================================ # [基于Ultralytics的RT-DETR改进详细介绍](https://github.com/z1069614715/objectdetection_script) # 目前自带的一些改进方案(目前拥有合计320+个改进点!持续更新!) # 为了感谢各位对RTDETR项目的支持,本项目的赠品是yolov5-PAGCP通道剪枝算法.[具体使用教程](https://www.bilibili.com/video/BV1yh4y1Z7vz/) # 自带的一些文件说明 1. train.py 训练模型的脚本 2. main_profile.py 输出模型和模型每一层的参数,计算量的脚本(rtdetr-l和rtdetr-x因为thop库的问题,没办法正常输出每一层的参数和计算量和时间) 3. val.py 使用训练好的模型计算指标的脚本 4. detect.py 推理的脚本 5. track.py 跟踪推理的脚本 6. heatmap.py 生成热力图的脚本 7. get_FPS.py 计算模型储存大小、模型推理时间、FPS的脚本 8. get_COCO_metrice.py 计算COCO指标的脚本 9. plot_result.py 绘制曲线对比图的脚本 10. get_model_erf.py 绘制模型的有效感受野.[视频链接](https://www.bilibili.com/video/BV1Gx4y1v7ZZ/) 11. export.py 导出模型脚本 12. test_env.py 验证一些需要编译的或者难安装的(mmcv)是否成功的代码.[百度云链接](https://pan.baidu.com/s/1sWwvN4UC3blBRVe1twrJAg?pwd=bru5) 13. get_all_yaml_param_and_flops.py 计算所有yaml的计算量并排序.[百度云链接](https://pan.baidu.com/s/1ZDzglU7EIzzfaUDhAhagBA?pwd=kg8k) # RT-DETR基准模型 1. ultralytics/cfg/models/rt-detr/rtdetr-r18.yaml(有预训练权重COCO+Objects365,来自RTDETR-Pytorch版本的移植) rtdetr-r18 summary: 421 layers, 20184464 parameters, 20184464 gradients, 58.6 GFLOPs 2. ultralytics/cfg/models/rt-detr/rtdetr-r34.yaml(有预训练权重COCO,来自RTDETR-Pytorch版本的移植) rtdetr-r34 summary: 525 layers, 31441668 parameters, 31441668 gradients, 90.6 GFLOPs 3. ultralytics/cfg/models/rt-detr/rtdetr-r50-m.yaml(有预训练权重COCO,来自RTDETR-Pytorch版本的移植) rtdetr-r50-m summary: 637 layers, 36647020 parameters, 36647020 gradients, 98.3 GFLOPs 4. 
ultralytics/cfg/models/rt-detr/rtdetr-r50.yaml(有预训练权重COCO+Objects365,来自RTDETR-Pytorch版本的移植) rtdetr-r50 summary: 629 layers, 42944620 parameters, 42944620 gradients, 134.8 GFLOPs 5. ultralytics/cfg/models/rt-detr/rtdetr-r101.yaml rtdetr-r101 summary: 867 layers, 76661740 parameters, 76661740 gradients, 257.7 GFLOPs 6. ultralytics/cfg/models/rt-detr/rtdetr-l.yaml(有预训练权重) rtdetr-l summary: 673 layers, 32970732 parameters, 32970732 gradients, 108.3 GFLOPs 7. ultralytics/cfg/models/rt-detr/rtdetr-x.yaml(有预训练权重) rtdetr-x summary: 867 layers, 67468108 parameters, 67468108 gradients, 232.7 GFLOPs # 专栏改进汇总 ### 二次创新系列 1. ultralytics/cfg/models/rt-detr/rtdetr-DCNV2-Dynamic.yaml 使用自研可变形卷积DCNV2-Dynamic改进resnet18-backbone中的BasicBlock.(详细介绍请看百度云视频-MPCA与DCNV2_Dynamic的说明) 2. ultralytics/cfg/models/rt-detr/rtdetr-iRMB-Cascaded.yaml 使用[EfficientViT CVPR2023](https://github.com/microsoft/Cream/tree/main/EfficientViT)中的CascadedGroupAttention对[EMO ICCV2023](https://github.com/zhangzjn/EMO)中的iRMB进行二次创新来改进resnet18-backbone中的BasicBlock.(详细介绍请看百度云视频-20231119更新说明) 3. ultralytics/cfg/models/rt-detr/rtdetr-PConv-Rep.yaml 使用[RepVGG CVPR2021](https://github.com/DingXiaoH/RepVGG)中的RepConv对[FasterNet CVPR2023](https://github.com/JierunChen/FasterNet)中的PConv进行二次创新后改进resnet18-backbone中的BasicBlock. 4. ultralytics/cfg/models/rt-detr/rtdetr-Faster-Rep.yaml 使用[RepVGG CVPR2021](https://github.com/DingXiaoH/RepVGG)中的RepConv对[FasterNet CVPR2023](https://github.com/JierunChen/FasterNet)中的Faster-Block进行二次创新后改进resnet18-backbone中的BasicBlock. 5. ultralytics/cfg/models/rt-detr/rtdetr-Faster-EMA.yaml 使用[EMA ICASSP2023](https://arxiv.org/abs/2305.13563v1)对[FasterNet CVPR2023](https://github.com/JierunChen/FasterNet)中的Faster-Block进行二次创新后改进resnet18-backbone中的BasicBlock. 6. 
ultralytics/cfg/models/rt-detr/rtdetr-Faster-Rep-EMA.yaml 使用[RepVGG CVPR2021](https://github.com/DingXiaoH/RepVGG)中的RepConv和[EMA ICASSP2023](https://arxiv.org/abs/2305.13563v1)对[FasterNet CVPR2023](https://github.com/JierunChen/FasterNet)中的Faster-Block进行二次创新后改进resnet18-backbone中的BasicBlock. 7. ultralytics/cfg/models/rt-detr/rtdetr-DWRC3-DRB.yaml 使用[UniRepLKNet](https://github.com/AILab-CVC/UniRepLKNet/tree/main)中的DilatedReparamBlock对[DWRSeg](https://arxiv.org/abs/2212.01173)中的Dilation-wise Residual(DWR)进行二次创新改进rtdetr. 8. ultralytics/cfg/models/rt-detr/rtdetr-ASF-P2.yaml 在ultralytics/cfg/models/rt-detr/rtdetr-ASF.yaml的基础上进行二次创新,引入P2检测层并对网络结构进行优化. 9. ultralytics/cfg/models/rt-detr/rtdetr-slimneck-ASF.yaml 使用[SlimNeck](https://github.com/AlanLi1997/slim-neck-by-gsconv)中的VoVGSCSP\VoVGSCSPC和GSConv和[ASF-YOLO](https://github.com/mkang315/ASF-YOLO)中的Attentional Scale Sequence Fusion改进rtdetr中的CCFM. 10. ultralytics/cfg/models/rt-detr/rtdetr-goldyolo-asf.yaml 利用华为2023最新GOLD-YOLO中的Gatherand-Distribute和[ASF-YOLO](https://github.com/mkang315/ASF-YOLO)中的Attentional Scale Sequence Fusion进行改进特征融合模块. 11. ultralytics/cfg/models/rt-detr/rtdetr-HSPAN.yaml 对[MFDS-DETR](https://github.com/JustlfC03/MFDS-DETR)中的HS-FPN进行二次创新后得到HSPAN改进RTDETR中的CCFM. 12. ultralytics/cfg/models/rt-detr/rtdetr-ASF-Dynamic.yaml 使用[ICCV2023 DySample](https://arxiv.org/abs/2308.15085)改进[ASF-YOLO](https://github.com/mkang315/ASF-YOLO)中的Attentional Scale Sequence Fusion的上采样模块得到Dynamic Sample Attentional Scale Sequence Fusion改进CCFM. 13. ultralytics/cfg/models/rt-detr/rtdetr-iRMB-DRB.yaml 使用[UniRepLKNet](https://github.com/AILab-CVC/UniRepLKNet/tree/main)中的DilatedReparamBlock对[EMO ICCV2023](https://github.com/zhangzjn/EMO)中的iRMB进行二次创新来改进resnet18-backbone中的BasicBlock. 14. ultralytics/cfg/models/rt-detr/rtdetr-iRMB-SWC.yaml 使用[shift-wise conv](https://arxiv.org/abs/2401.12736)对[EMO ICCV2023](https://github.com/zhangzjn/EMO)中的iRMB进行二次创新来改进resnet18-backbone中的BasicBlock. 15. 
ultralytics/cfg/models/rt-detr/rtdetr-DBBNCSPELAN.yaml 在rtdetr-RepNCSPELAN.yaml使用[Diverse Branch Block CVPR2021](https://arxiv.org/abs/2103.13425)进行二次创新.(详细介绍请看百度云视频-20240225更新说明) 16. ultralytics/cfg/models/rt-detr/rtdetr-OREPANCSPELAN.yaml 在rtdetr-RepNCSPELAN.yaml使用[Online Convolutional Re-parameterization (CVPR2022)](https://github.com/JUGGHM/OREPA_CVPR2022/tree/main)进行二次创新.(详细介绍请看百度云视频-20240225更新说明) 17. ultralytics/cfg/models/rt-detr/rtdetr-DRBNCSPELAN.yaml 在rtdetr-RepNCSPELAN.yaml使用[UniRepLKNet](https://github.com/AILab-CVC/UniRepLKNet/tree/main)中的DilatedReparamBlock进行二次创新.(详细介绍请看百度云视频-20240225更新说明) 18. ultralytics/cfg/models/rt-detr/rtdetr-Conv3XCNCSPELAN.yaml 在rtdetr-RepNCSPELAN.yaml使用[Swift Parameter-free Attention Network](https://github.com/hongyuanyu/SPAN/tree/main)中的Conv3XC进行二次创新.(详细介绍请看百度云视频-20240225更新说明) 19. ultralytics/cfg/models/rt-detr/rtdetr-ELA-HSFPN.yaml 使用[Efficient Local Attention](https://arxiv.org/abs/2403.01123)改进HSFPN. 20. ultralytics/cfg/models/rt-detr/rtdetr-CA-HSFPN.yaml 使用[Coordinate Attention CVPR2021](https://github.com/houqb/CoordAttention)改进HSFPN. 21. ultralytics/cfg/models/rt-detr/rtdetr-RepNCSPELAN-CAA.yaml 使用[CVPR2024 PKINet](https://github.com/PKINet/PKINet)中的CAA模块改进RepNCSPELAN. 22. ultralytics/cfg/models/rt-detr/rtdetr-CAA-HSFPN.yaml 使用[CVPR2024 PKINet](https://github.com/PKINet/PKINet)中的CAA模块改进HSFPN. 23. ultralytics/cfg/models/rt-detr/rtdetr-CAFMFusion.yaml 利用[HCANet](https://github.com/summitgao/HCANet)中的CAFM(一种能同时获取全局和局部信息的注意力机制)对content-guided attention fusion进行二次改进. 24. ultralytics/cfg/models/rt-detr/rtdetr-faster-CGLU.yaml 使用[TransNeXt CVPR2024](https://github.com/DaiShiResearch/TransNeXt)中的Convolutional GLU对CVPR2023中的FasterNet进行二次创新. 25. ultralytics/cfg/models/rt-detr/rtdetr-bifpn-GLSA.yaml 使用[GLSA](https://github.com/Barrett-python/DuAT)模块对bifpn进行二次创新. 26. ultralytics/cfg/models/rt-detr/rtdetr-BIMAFPN.yaml 利用BIFPN的思想对[MAF-YOLO](https://arxiv.org/pdf/2407.04381)的MAFPN进行二次改进得到BIMAFPN. 27.
ultralytics/cfg/models/rt-detr/rtdetr-C2f-AddutuveBlock-CGLU.yaml 使用[CAS-ViT](https://github.com/Tianfang-Zhang/CAS-ViT)中的AdditiveBlock和[TransNeXt CVPR2024](https://github.com/DaiShiResearch/TransNeXt)中的Convolutional GLU和CSP思想改进backbone. 28. ultralytics/cfg/models/rt-detr/rtdetr-C2f-MSMHSA-CGLU.yaml 使用[CMTFNet](https://github.com/DrWuHonglin/CMTFNet/tree/main)中的M2SA和[TransNeXt CVPR2024](https://github.com/DaiShiResearch/TransNeXt)中的Convolutional GLU改进c2f. 29. ultralytics/cfg/models/rt-detr/rtdetr-C2f-SHSA-CGLU.yaml 使用[SHViT CVPR2024](https://github.com/ysj9909/SHViT)中的SHSABlock与[TransNeXt CVPR2024](https://github.com/DaiShiResearch/TransNeXt)中的CGLU和CSP思想改进backbone. 30. ultralytics/cfg/models/rt-detr/rtdetr-C2f-SMAFB-CGLU.yaml 使用[SMAFormer BIBM2024](https://github.com/CXH-Research/SMAFormer)中的SMAFormerBlock与[TransNeXt CVPR2024](https://github.com/DaiShiResearch/TransNeXt)中的CGLU改进与CSP思想改进backbone. 31. ultralytics/cfg/models/rt-detr/rtdetr-MAN-Faster.yaml 使用[Hyper-YOLO](https://www.arxiv.org/pdf/2408.04804)中的 Mixed Aggregation Network和[FasterNet CVPR2023](https://github.com/JierunChen/FasterNet)中的Faster-Block进行二次创新改进rtdetr. 32. ultralytics/cfg/models/rt-detr/rtdetr-MAN-FasterCGLU.yaml 使用[Hyper-YOLO](https://www.arxiv.org/pdf/2408.04804)中的 Mixed Aggregation Network和[FasterNet CVPR2023](https://github.com/JierunChen/FasterNet)中的Faster-Block和[TransNeXt CVPR2024](https://github.com/DaiShiResearch/TransNeXt)中的Convolutional GLU进行二次创新改进rtdetr. 33. ultralytics/cfg/models/rt-detr/rtdetr-MAN-Star.yaml 使用[Hyper-YOLO](https://www.arxiv.org/pdf/2408.04804)中的 Mixed Aggregation Network和[StarNet CVPR2024](https://github.com/ma-xu/Rewrite-the-Stars/tree/main)中的StarBlock进行二次创新改进rtdetr. 34. ultralytics/cfg/models/rt-detr/rtdetr-MutilBackbone-MSGA.yaml 使用[MSA^2 Net](https://github.com/xmindflow/MSA-2Net)中的Multi-Scale Adaptive Spatial Attention Gate对自研系列MutilBackbone再次创新. 35. 
ultralytics/cfg/models/rt-detr/rtdetr-slimneck-WFU.yaml 使用[ACMMM2024 WFEN](https://github.com/PRIS-CV/WFEN)中的Wavelet Feature Upgrade对slimneck二次创新. 36. ultralytics/cfg/models/rt-detr/rtdetr-CDFA.yaml 使用[ACMMM2024 WFEN](https://github.com/PRIS-CV/WFEN)中的WaveletConv与[AAAI2025 ConDSeg](https://github.com/Mengqi-Lei/ConDSeg)的ContrastDrivenFeatureAggregation结合改进rtdetr. 37. ultralytics/cfg/models/rt-detr/rtdetr-C2f-StripCGLU.yaml 使用[Strip R-CNN](https://arxiv.org/pdf/2501.03775)中的StripBlock和[TransNeXt CVPR2024](https://github.com/DaiShiResearch/TransNeXt)中的Convolutional GLU与CSP结合改进backbone. 38. ultralytics/cfg/models/rt-detr/rtdetr-C2f-ELGCA-CGLU.yaml 使用[ELGC-Net](https://github.com/techmn/elgcnet)中的ELGCA和[TransNeXt CVPR2024](https://github.com/DaiShiResearch/TransNeXt)中的Convolutional GLU与CSP结合改进backbone. 39. ultralytics/cfg/models/rt-detr/rtdetr-C2f-Faster-KAN.yaml 使用[ICLR2025 Kolmogorov-Arnold Transformer](https://github.com/Adamdad/kat)中的KAN对(CVPR2023)fasternet中的FasterBlock进行二次创新. 40. ultralytics/cfg/models/11/yolo11-C3k2-DIMB-KAN.yaml 在ultralytics/cfg/models/rt-detr/rtdetr-C2f-DIMB.yaml的基础上把mlp模块换成[ICLR2025 Kolmogorov-Arnold Transformer](https://github.com/Adamdad/kat)中的KAN. 41. ultralytics/cfg/models/rt-detr/rtdetr-C2f-EfficientVIM-CGLU.yaml 使用[CVPR2025 EfficientViM](https://github.com/mlvlab/EfficientViM)中的EfficientViMBlock和[TransNeXt CVPR2024](https://github.com/DaiShiResearch/TransNeXt)中的Convolutional GLU与CSP结合改进backbone. 42. ultralytics/cfg/models/rt-detr/rtdetr-EUCB-SC.yaml 使用[CVPR2024 EMCAD](https://github.com/SLDGroup/EMCAD)中的EUCB和[CVPR2025 BHViT](https://github.com/IMRL/BHViT)中的ShiftChannelMix改进rtdetr-r18的上采样. 43. ultralytics/cfg/models/rt-detr/rtdetr-EMBSFPN-SC.yaml 在ultralytics/cfg/models/rt-detr/rtdetr-EMBSFPN.yaml方案上引入[CVPR2025 BHViT](https://github.com/IMRL/BHViT)中的ShiftChannelMix. 44.
ultralytics/cfg/models/rt-detr/rtdetr-Pola-CGLU.yaml 使用[ICLR2025 PolaFormer](https://github.com/ZacharyMeng/PolaFormer)中的PolaAttention与[TransNeXt CVPR2024](https://github.com/DaiShiResearch/TransNeXt)中的Convolutional GLU进行二次创新. 45. ultralytics/cfg/models/rt-detr/rtdetr-Pola-FMFFN.yaml 使用[ICLR2025 PolaFormer](https://github.com/ZacharyMeng/PolaFormer)中的PolaAttention与[ICLR2024-FTIC](https://github.com/qingshi9974/ICLR2024-FTIC)中的FMFFN进行二次创新. 46. ultralytics/cfg/models/rt-detr/rtdetr-MFMMAFPN.yaml 利用[CVPR2024 DCMPNet](https://github.com/zhoushen1/DCMPNet)中的MFM对[MAF-YOLO](https://arxiv.org/pdf/2407.04381)的MAFPN进行二次改进得到MFMMAFPN. 47. ultralytics/cfg/models/rt-detr/rtdetr-HyperCompute-MFM.yaml 利用[CVPR2024 DCMPNet](https://github.com/zhoushen1/DCMPNet)中的MFM对[Hyper-YOLO](https://www.arxiv.org/pdf/2408.04804)中的Hypergraph Computation in Semantic Space进行二次创新. 48. ultralytics/cfg/models/rt-detr/rtdetr-AIFI-ASSA-SEFN.yaml 使用[CVPR2024 Adapt or Perish: Adaptive Sparse Transformer with Attentive Feature Refinement for Image Restoration](https://openaccess.thecvf.com/content/CVPR2024/papers/Zhou_Adapt_or_Perish_Adaptive_Sparse_Transformer_with_Attentive_Feature_Refinement_CVPR_2024_paper.pdf)中的Adaptive Sparse Self-Attention与[WACV2025 SEM-Net](https://github.com/ChrisChen1023/SEM-Net)的Spatially-Enhanced Feedforward Network (SEFN)改进AIFI. 49. ultralytics/cfg/models/rt-detr/rtdetr-Pola-SEFN.yaml 使用[ICLR2025 PolaFormer](https://github.com/ZacharyMeng/PolaFormer)中的PolaAttention与[WACV2025 SEM-Net](https://github.com/ChrisChen1023/SEM-Net)的Spatially-Enhanced Feedforward Network (SEFN)改进AIFI. 50.
ultralytics/cfg/models/rt-detr/rtdetr-AIFI-ASSA-SEFN-Mona.yaml 使用[CVPR2024 Adapt or Perish: Adaptive Sparse Transformer with Attentive Feature Refinement for Image Restoration](https://openaccess.thecvf.com/content/CVPR2024/papers/Zhou_Adapt_or_Perish_Adaptive_Sparse_Transformer_with_Attentive_Feature_Refinement_CVPR_2024_paper.pdf)中的Adaptive Sparse Self-Attention与[WACV2025 SEM-Net](https://github.com/ChrisChen1023/SEM-Net)的Spatially-Enhanced Feedforward Network (SEFN)和[CVPR2025 Mona](https://github.com/Leiyi-Hu/mona)的Mona改进AIFI. 51. ultralytics/cfg/models/rt-detr/rtdetr-Pola-SEFN-Mona.yaml 使用[ICLR2025 PolaFormer)](https://github.com/ZacharyMeng/PolaFormer)中的PolaAttention与[WACV2025 SEM-Net](https://github.com/ChrisChen1023/SEM-Net)的Spatially-Enhanced Feedforward Network (SEFN)和[CVPR2025 Mona](https://github.com/Leiyi-Hu/mona)的Mona改进AIFI. 52. ultralytics/cfg/models/rt-detr/rtdetr-C2f-mambaout-LSConv.yaml 使用[CVPR2025 LSNet](https://github.com/THU-MIG/lsnet)的LSConv与[CVPR2025 MambaOut](https://github.com/yuweihao/MambaOut)中的MambaOutBlock二次创新后改进C2f. 53. ultralytics/cfg/models/rt-detr/rtdetr-AIFI-ASSA-SEFN-Mona-DyT.yaml 使用[CVPR2024 Adapt or Perish: Adaptive Sparse Transformer with Attentive Feature Refinement for Image Restoration](https://openaccess.thecvf.com/content/CVPR2024/papers/Zhou_Adapt_or_Perish_Adaptive_Sparse_Transformer_with_Attentive_Feature_Refinement_CVPR_2024_paper.pdf)中的Adaptive Sparse Self-Attention与[WACV2025 SEM-Net](https://github.com/ChrisChen1023/SEM-Net)的Spatially-Enhanced Feedforward Network (SEFN)和[CVPR2025 Mona](https://github.com/Leiyi-Hu/mona)的Mona改进和[CVPR2025 DyT](https://github.com/jiachenzhu/DyT)中的DynamicTan改进AIFI. 54. 
ultralytics/cfg/models/rt-detr/rtdetr-Pola-SEFN-Mona-DyT.yaml 使用[ICLR2025 PolaFormer)](https://github.com/ZacharyMeng/PolaFormer)中的PolaAttention与[WACV2025 SEM-Net](https://github.com/ChrisChen1023/SEM-Net)的Spatially-Enhanced Feedforward Network (SEFN)和[CVPR2025 Mona](https://github.com/Leiyi-Hu/mona)的Mona改进和[CVPR2025 DyT](https://github.com/jiachenzhu/DyT)中的DynamicTan改进AIFI. 55. ultralytics/cfg/models/rt-detr/rtdetr-Pola-SEFFN-Mona-DyT.yaml 使用[ICLR2025 PolaFormer)](https://github.com/ZacharyMeng/PolaFormer)中的PolaAttention与[TransMamba](https://github.com/sunshangquan/TransMamba)的SpectralEnhancedFFN和[CVPR2025 Mona](https://github.com/Leiyi-Hu/mona)的Mona改进和[CVPR2025 DyT](https://github.com/jiachenzhu/DyT)中的DynamicTan改进AIFI. 56. ultralytics/cfg/models/rt-detr/rtdetr-Pola-EDFFN-Mona-DyT.yaml 使用[ICLR2025 PolaFormer)](https://github.com/ZacharyMeng/PolaFormer)中的PolaAttention与[CVPR2025 EVSSM](https://github.com/kkkls/EVSSM)中的EDFFN和[CVPR2025 Mona](https://github.com/Leiyi-Hu/mona)的Mona改进和[CVPR2025 DyT](https://github.com/jiachenzhu/DyT)中的DynamicTan改进AIFI. 57. ultralytics/cfg/models/rt-detr/rtdetr-C2f-mambaout-FDConv.yaml 使用[CVPR2025 Frequency Dynamic Convolution for Dense Image Prediction](https://github.com/Linwei-Chen/FDConv)的FDConv和[CVPR2025 MambaOut](https://github.com/yuweihao/MambaOut)中的MambaOutBlock二次创新后改进BackBone. 58. ultralytics/cfg/models/rt-detr/rtdetr-C2f-PFDConv.yaml 使用[FasterNet CVPR2023](https://github.com/JierunChen/FasterNet)中的PConv与[CVPR2025 Frequency Dynamic Convolution for Dense Image Prediction](https://github.com/Linwei-Chen/FDConv)的FDConv二次创新后改进BackBone. 59. ultralytics/cfg/models/rt-detr/rtdetr-C2f-FasterFDConv.yaml 使用[FasterNet CVPR2023](https://github.com/JierunChen/FasterNet)中的FasterBlock与[CVPR2025 Frequency Dynamic Convolution for Dense Image Prediction](https://github.com/Linwei-Chen/FDConv)的FDConv二次创新后改进BackBone. 60. 
ultralytics/cfg/models/rt-detr/rtdetr-C2f-DSAN-EDFFN.yaml 使用[DSA: Deformable Spatial Attention](https://www.techrxiv.org/users/628671/articles/775010-deformable-spatial-attention-networks-enhancing-lightweight-convolutional-models-for-vision-tasks)中的Deformable Spatial Attention Block和[CVPR2025 EVSSM](https://github.com/kkkls/EVSSM)中的EDFFN进行二次创新后改进BackBone. 61. ultralytics/cfg/models/rt-detr/rtdetr-C2f-mambaout-DSA.yaml 使用[DSA: Deformable Spatial Attention](https://www.techrxiv.org/users/628671/articles/775010-deformable-spatial-attention-networks-enhancing-lightweight-convolutional-models-for-vision-tasks)中的Deformable Spatial Attention Block与[CVPR2025 MambaOut](https://github.com/yuweihao/MambaOut)中的MambaOutBlock二次创新后改进BackBone. 62. ultralytics/cfg/models/rt-detr/rtdetr-SOEP-RFPN.yaml 使用[ECCV2024 rethinking-fpn](https://github.com/AlanLi1997/rethinking-fpn)的SNI和GSConvE对原创改进SOEP再次创新. 63. ultralytics/cfg/models/rt-detr/rtdetr-SOEP-MFM.yaml 使用[CVPR2024 DCMPNet](https://github.com/zhoushen1/DCMPNet)中的MFM对原创改进SOEP再次创新. 64. ultralytics/cfg/models/rt-detr/rtdetr-SOEP-MFM-RFPN.yaml 使用[ECCV2024 rethinking-fpn](https://github.com/AlanLi1997/rethinking-fpn)的SNI和GSConvE和[CVPR2024 DCMPNet](https://github.com/zhoushen1/DCMPNet)中的MFM对原创改进SOEP再次创新. 65. ultralytics/cfg/models/rt-detr/rtdetr-C2f-mambaout-SFSC.yaml 使用[CVPR2024 SFSConv](https://github.com/like413/SFS-Conv)的SFSConv与[CVPR2025 MambaOut](https://github.com/yuweihao/MambaOut)中的MambaOutBlock二次创新后改进C2f. 66. ultralytics/cfg/models/rt-detr/rtdetr-C2f-PSFSConv.yaml 使用[FasterNet CVPR2023](https://github.com/JierunChen/FasterNet)中的PConv与[CVPR2024 SFSConv](https://github.com/like413/SFS-Conv)的SFSConv二次创新后改进C2f. 67. ultralytics/cfg/models/rt-detr/rtdetr-C2f-FasterSFSConv.yaml 使用[FasterNet CVPR2023](https://github.com/JierunChen/FasterNet)中的FasterBlock与[CVPR2024 SFSConv](https://github.com/like413/SFS-Conv)的SFSConv二次创新后改进C2f. 68. 
ultralytics/cfg/models/rt-detr/rtdetr-SOEP-PST.yaml 使用[Pyramid Sparse Transformer](https://arxiv.org/abs/2505.12772)中的Pyramid Sparse Transformer对原创改进SOEP进行创新. 69. ultralytics/cfg/models/rt-detr/rtdetr-C2f-DIMB-HyperACE.yaml 使用[yolo13](https://github.com/iMoonLab/yolov13)中的HyperACE与自研模块DynamicInceptionDWConv2d的结合. 70. ultralytics/cfg/models/rt-detr/rtdetr-AIFI-SHSA-EPGO.yaml 使用[ACM MM 2025 CPRAformer](https://github.com/zs1314/CPRAformer)中的EPGO和[SHViT CVPR2024](https://github.com/ysj9909/SHViT)中的SHSA改进AIFI. 71. ultralytics/cfg/models/rt-detr/rtdetr-C2f-SHSA-EPGO.yaml 使用[SHViT CVPR2024](https://github.com/ysj9909/SHViT)中的SHSABlock与[ACM MM 2025 CPRAformer](https://github.com/zs1314/CPRAformer)中的EPGO和CSP思想改进backbone. 72. ultralytics/cfg/models/rt-detr/rtdetr-C2f-SHSA-EPGO-CGLU.yaml 使用[SHViT CVPR2024](https://github.com/ysj9909/SHViT)中的SHSABlock与[TransNeXt CVPR2024](https://github.com/DaiShiResearch/TransNeXt)中的CGLU与[ACM MM 2025 CPRAformer](https://github.com/zs1314/CPRAformer)中的EPGO和CSP思想改进backbone. ### 自研系列 1. ultralytics/cfg/models/rt-detr/rtdetr-PACAPN.yaml 自研结构, Parallel Atrous Convolution Attention Pyramid Network, PAC-APN 1. 并行(上/下)采样分支可为网络提供多条特征提取途径,丰富特征表达的多样性、再结合gate机制对采样后的特征进行特征选择,强化更有意义的特征,抑制冗余或不相关的特征,提升特征表达的有效性。 2. PAC模块通过使用具有不同膨胀率的并行空洞卷积,能够有效地提取不同尺度的特征。这使得网络能够捕捉数据中局部和上下文信息,提高其表示复杂模式的能力。 2. ultralytics/cfg/models/rt-detr/rtdetr-FDPN.yaml 自研特征聚焦扩散金字塔网络(Focusing Diffusion Pyramid Network) 1. 通过定制的特征聚焦模块与特征扩散机制,能让每个尺度的特征都具有详细的上下文信息,更有利于后续目标的检测与分类。 2. 定制的特征聚焦模块可以接受三个尺度的输入,其内部包含一个Inception-Style的模块,其利用一组并行深度卷积来捕获丰富的跨多个尺度的信息。 3. 通过扩散机制使具有丰富的上下文信息的特征进行扩散到各个检测尺度. 3. ultralytics/cfg/models/rt-detr/rtdetr-FDPN-DASI.yaml 使用[HCFNet](https://github.com/zhengshuchen/HCFNet)中的Dimension-Aware Selective Integration Module对自研的Focusing Diffusion Pyramid Network再次创新. 4. ultralytics/cfg/models/rt-detr/rtdetr-RGCSPELAN.yaml 自研RepGhostCSPELAN. 1. 参考GhostNet中的思想(主流CNN计算的中间特征映射存在广泛的冗余),采用廉价的操作生成一部分冗余特征图,以此来降低计算量和参数量。 2. 
舍弃yolov5与yolov8中常用的BottleNeck,为了弥补舍弃残差块所带来的性能损失,在梯度流通分支上使用RepConv,以此来增强特征提取和梯度流通的能力,并且RepConv可以在推理的时候进行融合,一举两得。 3. 可以通过缩放因子控制RGCSPELAN的大小,使其可以兼顾小模型和大模型。 5. ultralytics/cfg/models/rt-detr/rtdetr-ContextGuideFPN.yaml Context Guide Fusion Module(CGFM)是一个创新的特征融合模块,旨在改进YOLOv8中的特征金字塔网络(FPN)。该模块的设计考虑了多尺度特征融合过程中上下文信息的引导和自适应调整。 1. 上下文信息的有效融合:通过SE注意力机制,模块能够在特征融合过程中捕捉并利用重要的上下文信息,从而增强特征表示的有效性,并有效引导模型学习检测目标的信息,从而提高模型的检测精度。 2. 特征增强:通过权重化的特征重组操作,模块能够增强重要特征,同时抑制不重要特征,提升特征图的判别能力。 3. 简单高效:模块结构相对简单,不会引入过多的计算开销,适合在实时目标检测任务中应用。 这期视频讲解在B站:https://www.bilibili.com/video/BV1Vx4y1n7hZ/ 6. ultralytics/cfg/models/rt-detr/rtdetr-C2f-SMPCGLU.yaml Self-moving Point Convolutional GLU模型改进C2f. SMP来源于[CVPR2023-SMPConv](https://github.com/sangnekim/SMPConv),Convolutional GLU来源于[TransNeXt CVPR2024](https://github.com/DaiShiResearch/TransNeXt). 1. 普通的卷积在面对数据中的多样性和复杂性时,可能无法捕捉到有效的特征,因此我们采用了SMPConv,其具备最新的自适应点移动机制,从而更好地捕捉局部特征,提高特征提取的灵活性和准确性。 2. 在SMPConv后添加CGLU,Convolutional GLU 结合了卷积和门控机制,能够选择性地通过信息通道,提高了特征提取的有效性和灵活性。 7. Re-CalibrationFPN 为了加强浅层和深层特征的相互交互能力,推出重校准特征金字塔网络(Re-CalibrationFPN). P2345:ultralytics/cfg/models/v8/yolov8-ReCalibrationFPN-P2345.yaml(带有小目标检测头的ReCalibrationFPN) P345:ultralytics/cfg/models/v8/yolov8-ReCalibrationFPN-P345.yaml P3456:ultralytics/cfg/models/v8/yolov8-ReCalibrationFPN-P3456.yaml(带有大目标检测头的ReCalibrationFPN) 1. 浅层语义较少,但细节丰富,有更明显的边界和减少失真。此外,深层蕴藏着丰富的物质语义信息。因此,直接融合低级具有高级特性的特性可能导致冗余和不一致。为了解决这个问题,我们提出了[SBA](https://github.com/Barrett-python/DuAT)模块,它有选择地聚合边界信息和语义信息来描绘更细粒度的物体轮廓和重新校准物体的位置。 2. 相比传统的FPN结构,[SBA](https://github.com/Barrett-python/DuAT)模块引入了高分辨率和低分辨率特征之间的双向融合机制,使得特征之间的信息传递更加充分,进一步提升了多尺度特征融合的效果。 3. [SBA](https://github.com/Barrett-python/DuAT)模块通过自适应的注意力机制,根据特征图的不同分辨率和内容,自适应地调整特征的权重,从而更好地捕捉目标的多尺度特征。 8. 
ultralytics/cfg/models/rt-detr/rtdetr-SOEP.yaml 小目标在正常的P3、P4、P5检测层上略显吃力,比较传统的做法是加上P2检测层来提升小目标的检测能力,但是同时也会带来一系列的问题,例如加上P2检测层后计算量过大、后处理更加耗时等问题,因此迫切需要开发新的针对小目标有效的特征金字塔,我们在原本的PAFPN基础上进行改进,提出SmallObjectEnhancePyramid,相对于传统的添加P2检测层,我们使用P2特征层经过SPDConv得到富含小目标信息的特征给到P3进行融合,然后使用CSP思想和基于[AAAI2024的OmniKernel](https://ojs.aaai.org/index.php/AAAI/article/view/27907)进行改进得到CSP-OmniKernel进行特征整合,OmniKernel模块由三个分支组成,即全局分支、大分支和局部分支,以有效地学习从全局到局部的特征表征,最终提高小目标的检测性能。 9. ultralytics/cfg/models/rt-detr/rtdetr-CGRFPN.yaml Context-Guided Spatial Feature Reconstruction Feature Pyramid Network. 1. 借鉴[ECCV2024-CGRSeg](https://github.com/nizhenliang/CGRSeg)中的Rectangular Self-Calibration Module,该模块经过精心设计,用于空间特征重建和金字塔上下文提取,它在水平和垂直方向上捕获全局上下文,并获得轴向全局上下文来显式地建模矩形关键区域. 2. PyramidContextExtraction Module使用金字塔上下文提取模块(PyramidContextExtraction),有效整合不同层级的特征信息,提升模型的上下文感知能力。 3. FuseBlockMulti 和 DynamicInterpolationFusion 这些模块用于多尺度特征的融合,通过动态插值和多特征融合,进一步提高了模型的多尺度特征表示能力和提升模型对复杂背景下目标的识别能力。 10. ultralytics/cfg/models/rt-detr/rtdetr-EMBSFPN.yaml 基于BIFPN、[MAF-YOLO](https://arxiv.org/pdf/2407.04381)、[CVPR2024 EMCAD](https://github.com/SLDGroup/EMCAD)提出全新的Efficient Multi-Branch&Scale FPN. Efficient Multi-Branch&Scale FPN拥有<轻量化>、<多尺度特征加权融合>、<多尺度高效卷积模块>、<高效上采样模块>、<全局异构核选择机制>。 1. 具有多尺度高效卷积模块和全局异构核选择机制,Trident网络的研究表明,具有较大感受野的网络更适合检测较大的物体,反之,较小尺度的目标则从较小的感受野中受益,因此我们在FPN阶段,对于不同尺度的特征层选择不同的多尺度卷积核以适应并逐步获得多尺度感知场信息。 2. 借鉴BIFPN中的多尺度特征加权融合,在把Concat换成Add来减少参数量和计算量的情况下,还能根据不同尺度特征的重要性进行自适应选择加权融合。 3. 高效上采样模块来源于CVPR2024-EMCAD中的EUCB,能够在保证一定效果的同时保持高效性。 11. ultralytics/cfg/models/rt-detr/rtdetr-CSP-PMSFA.yaml 自研模块:CSP-Partial Multi-Scale Feature Aggregation. 1. 部分多尺度特征提取:参考CVPR2020-GhostNet、CVPR2023-FasterNet的思想,采用高效的PartialConv,该模块能够从输入中提取多种尺度的特征信息,但它并不是在所有通道上进行这种操作,而是部分(Partial)地进行,从而提高了计算效率。 2. 增强的特征融合: 最后的 1x1 卷积层通过将不同尺度的特征融合在一起,同时使用残差连接将输入特征与处理后的特征相加,有效保留了原始信息并引入了新的多尺度信息,从而提高模型的表达能力。 12. ultralytics/cfg/models/rt-detr/rtdetr-MutilBackbone-DAF.yaml 自研MutilBackbone-DynamicAlignFusion. 1.
为了避免在浅层特征图上消耗过多计算资源,设计的MutilBackbone共享一个stem的信息,这个设计有利于避免计算量过大,推理时间过长的问题。 2. 为了避免不同Backbone信息融合出现不同来源特征之间的空间差异,我们为此设计了DynamicAlignFusion,其先通过融合来自两个不同模块学习到的特征,然后生成一个名为DynamicAlignWeight的权重去调整各自的特征,最后使用一个可学习的通道权重,其可以根据输入特征动态调整两条路径的权重,从而增强模型对不同特征的适应能力。 13. ultralytics/cfg/models/rt-detr/rtdetr-CSP-MutilScaleEdgeInformationEnhance.yaml 自研CSP-MutilScaleEdgeInformationEnhance. MutilScaleEdgeInformationEnhance模块结合了多尺度特征提取、边缘信息增强和卷积操作。它的主要目的是从不同尺度上提取特征,突出边缘信息,并将这些多尺度特征整合到一起,最后通过卷积层输出增强的特征。这个模块在特征提取和边缘增强的基础上有很好的表征能力. 1. 多尺度特征提取:通过 nn.AdaptiveAvgPool2d 进行多尺度的池化,提取不同大小的局部信息,有助于捕捉图像的多层次特征。 2. 边缘增强:EdgeEnhancer 模块专门用于提取边缘信息,使得网络对边缘的敏感度增强,这对许多视觉任务(如目标检测、语义分割等)有重要作用。 3. 特征融合:将不同尺度下提取的特征通过插值操作对齐到同一尺度,然后将它们拼接在一起,最后经过卷积层融合成统一的特征表示,能够提高模型对多尺度特征的感知。 14. ultralytics/cfg/models/rt-detr/rtdetr-CSP-FreqSpatial.yaml FreqSpatial 是一个融合时域和频域特征的卷积神经网络(CNN)模块。该模块通过在时域和频域中提取特征,旨在捕捉不同层次的空间和频率信息,以增强模型在处理图像数据时的鲁棒性和表示能力。模块的主要特点是将 Scharr 算子(用于边缘检测)与 时域卷积 和 频域卷积 结合,通过多种视角捕获图像的结构特征。 1. 时域特征提取:从原始图像中提取出基于空间结构的特征,主要捕捉图像的细节、边缘信息等。 2. 频域特征提取:从频率域中提取出频率相关的模式,捕捉到图像的低频和高频成分,能够帮助模型在全局和局部的尺度上提取信息。 3. 特征融合:将时域和频域的特征进行加权相加,得到最终的输出特征图。这种加权融合允许模型同时考虑空间结构信息和频率信息,从而增强模型在多种场景下的表现能力。 15. ultralytics/cfg/models/rt-detr/rtdetr-CSP-MutilScaleEdgeInformationSelect.yaml 基于自研CSP-MutilScaleEdgeInformationEnhance再次创新. 我们提出了一个 多尺度边缘信息选择模块(MutilScaleEdgeInformationSelect),其目的是从多尺度边缘信息中高效选择与目标任务高度相关的关键特征。为了实现这一目标,我们引入了一个具有通过聚焦更重要的区域能力的注意力机制[ICCV2023 DualDomainSelectionMechanism, DSM](https://github.com/c-yn/FocalNet)。该机制通过聚焦图像中更重要的区域(如复杂边缘和高频信号区域),在多尺度特征中自适应地筛选具有更高任务相关性的特征,从而显著提升了特征选择的精准度和整体模型性能。 16. GlobalEdgeInformationTransfer 众所周知,物体框的定位非常之依赖物体的边缘信息,但是对于常规的目标检测网络来说,没有任何组件能提高网络对物体边缘信息的关注度,我们需要开发一个能让边缘信息融合到各个尺度所提取的特征中的组件,因此我们提出一个名为GlobalEdgeInformationTransfer(GEIT)的模块,其可以帮助我们把浅层特征中提取到的边缘信息传递到整个backbone上,并与不同尺度的特征进行融合。 1. 由于原始图像中含有大量背景信息,因此从原始图像上直接提取边缘信息传递到整个backbone上会给网络的学习带来噪声,而且浅层的卷积层会帮助我们过滤不必要的背景信息,因此我们选择在网络的浅层开发一个名为MutilScaleEdgeInfoGenetator的模块,其会利用网络的浅层特征层去生成多个尺度的边缘信息特征图并投放到主干的各个尺度中进行融合。 2.
对于下采样方面的选择,我们需要较为谨慎,我们的目标是保留并增强边缘信息,同时进行下采样,选择MaxPool 会更合适。它能够保留局部区域的最强特征,更好地体现边缘信息。因为 AvgPool 更适用于需要平滑或均匀化特征的场景,但在保留细节和边缘信息方面的表现不如 MaxPool。 3. 对于融合部分,ConvEdgeFusion巧妙地结合边缘信息和普通卷积特征,提出了一种新的跨通道特征融合方式。首先,使用conv_channel_fusion进行边缘信息与普通卷积特征的跨通道融合,帮助模型更好地整合不同来源的特征。然后采用conv_3x3_feature_extract进一步提取融合后的特征,以增强模型对局部细节的捕捉能力。最后通过conv_1x1调整输出特征维度。 17. ultralytics/cfg/models/rt-detr/rtdetr-C2f-DIMB.yaml 自研模块DynamicInceptionDWConv2d.(更多解释请看项目内的使用教程.md) 18. ultralytics/cfg/models/rt-detr/rtdetr-HAFB-1.yaml 自研模块Hierarchical Attention Fusion Block.(更多解释请看项目内的使用教程.md) 19. ultralytics/cfg/models/rt-detr/rtdetr-HAFB-2.yaml HAFB的另外一种使用方式. 20. ultralytics/cfg/models/rt-detr/rtdetr-MutilBackbone-HAFB.yaml 在rtdetr-MutilBackbone-DAF.yaml上引入HAFB(Hierarchical Attention Fusion Block). ### BackBone系列 1. ultralytics/cfg/models/rt-detr/rt-detr-timm.yaml 使用[timm](https://github.com/huggingface/pytorch-image-models)库系列的主干替换rtdetr的backbone.(基本支持现有CNN模型) 2. ultralytics/cfg/models/rt-detr/rt-detr-fasternet.yaml 使用[FasterNet CVPR2023](https://github.com/JierunChen/FasterNet)替换rtdetr的backbone. 3. ultralytics/cfg/models/rt-detr/rt-detr-EfficientViT.yaml 使用[EfficientViT CVPR2023](https://github.com/microsoft/Cream/tree/main/EfficientViT)替换rtdetr的backbone. 4. ultralytics/cfg/models/rt-detr/rtdetr-convnextv2.yaml 使用[ConvNextV2 2023](https://github.com/facebookresearch/ConvNeXt-V2)替换rtdetr的backbone. 5. ultralytics/cfg/models/rt-detr/rtdetr-EfficientFormerv2.yaml 使用[EfficientFormerv2 2022](https://github.com/snap-research/EfficientFormer)替换rtdetr的backbone. 6. ultralytics/cfg/models/rt-detr/rtdetr-repvit.yaml 使用[RepViT ICCV2023](https://github.com/THU-MIG/RepViT)替换rtdetr的backbone. 7. ultralytics/cfg/models/rt-detr/rtdetr-CSwomTramsformer.yaml 使用[CSwinTramsformer CVPR2022](https://github.com/microsoft/CSWin-Transformer)替换rtdetr的backbone. 8. ultralytics/cfg/models/rt-detr/rtdetr-VanillaNet.yaml 使用[VanillaNet 2023](https://github.com/huawei-noah/VanillaNet)替换rtdetr的backbone. 9. 
ultralytics/cfg/models/rt-detr/rtdetr-SwinTransformer.yaml 使用[SwinTransformer ICCV2021](https://github.com/microsoft/Swin-Transformer)替换rtdetr的backbone. 10. ultralytics/cfg/models/rt-detr/rtdetr-lsknet.yaml 使用[LSKNet ICCV2023](https://github.com/zcablii/LSKNet)替换rtdetr的backbone. 11. ultralytics/cfg/models/rt-detr/rt-detr-unireplknet.yaml 使用[UniRepLKNet](https://github.com/AILab-CVC/UniRepLKNet/tree/main)替换rtdetr的backbone. 12. ultralytics/cfg/models/rt-detr/rtdetr-TransNeXt.yaml 使用[TransNeXt](https://github.com/DaiShiResearch/TransNeXt)改进rtdetr的backbone. 13. ultralytics/cfg/models/rt-detr/rtdetr-RepNCSPELAN.yaml 使用[YOLOV9](https://github.com/WongKinYiu/yolov9)中的RepNCSPELAN和ADown进行改进RTDETR-R18. 14. ultralytics/cfg/models/rt-detr/rtdetr-rmt.yaml 使用[CVPR2024 RMT](https://arxiv.org/abs/2309.11523)改进rtdetr的主干. 15. ultralytics/cfg/models/rt-detr/rtdetr-C2f-PKI.yaml 使用[CVPR2024 PKINet](https://github.com/PKINet/PKINet)中的PKIModule和CAA模块和C2f改进backbone. 16. ultralytics/cfg/models/rt-detr/rtdetr-C2f-PPA.yaml 使用[HCFNet](https://github.com/zhengshuchen/HCFNet)中的Parallelized Patch-Aware Attention Module改进C2f. 17. ultralytics/cfg/models/rt-detr/rtdetr-mobilenetv4.yaml 使用[MobileNetV4](https://github.com/jaiwei98/MobileNetV4-pytorch/tree/main)改进rtdetr-backbone. 18. ultralytics/cfg/models/rt-detr/rtdetr-starnet.yaml 使用[StarNet CVPR2024](https://github.com/ma-xu/Rewrite-the-Stars/tree/main)改进rtdetr-backbone. 19. ultralytics/cfg/models/rt-detr/rtdetr-C2f-vHeat.yaml 使用[vHeat](https://github.com/MzeroMiko/vHeat/tree/main)中的HeatBlock和C2f改进backbone. 20. ultralytics/cfg/models/rt-detr/rtdetr-C2f-FMB.yaml 使用[ECCV2024 SMFANet](https://github.com/Zheng-MJ/SMFANet/tree/main)的Feature Modulation block改进C2f. 21. ultralytics/cfg/models/rt-detr/rtdetr-C2f-gConv.yaml 使用[Rethinking Performance Gains in Image Dehazing Networks](https://arxiv.org/abs/2209.11448)的gConvblock改进C2f. 22.
ultralytics/cfg/models/rt-detr/rtdetr-C2f-AddutuveBlock.yaml 使用[CAS-ViT](https://github.com/Tianfang-Zhang/CAS-ViT)中的AdditiveBlock和CSP思想改进backbone. 23. ultralytics/cfg/models/rt-detr/rtdetr-C2f-MogaBlock.yaml 使用[MogaNet ICLR2024](https://github.com/Westlake-AI/MogaNet)中的MogaBlock与CSP思想结合改进backbone. 24. ultralytics/cfg/models/rt-detr/rtdetr-C2f-SHSA.yaml 使用[SHViT CVPR2024](https://github.com/ysj9909/SHViT)中的SHSABlock和CSP思想改进backbone. 25. ultralytics/cfg/models/rt-detr/rtdetr-C2f-SMAFB.yaml 使用[SMAFormer BIBM2024](https://github.com/CXH-Research/SMAFormer)中的SMAFormerBlock与CSP思想改进backbone. 26. ultralytics/cfg/models/rt-detr/rtdetr-C2f-FFCM.yaml 使用[Efficient Frequency-Domain Image Deraining with Contrastive Regularization ECCV2024](https://github.com/deng-ai-lab/FADformer)中的Fused_Fourier_Conv_Mixer与CSP思想结合改进rtdetr-backbone. 27. ultralytics/cfg/models/rt-detr/rtdetr-C2f-SFHF.yaml 使用[SFHformer ECCV2024](https://github.com/deng-ai-lab/SFHformer)中的block与CSP思想结合改进 rtdetr-backbone. 28. ultralytics/cfg/models/rt-detr/rtdetr-C2f-MSM.yaml 使用[Revitalizing Convolutional Network for Image Restoration TPAMI2024](https://zhuanlan.zhihu.com/p/720777160)中的MSM与CSP思想结合改进rtdetr-backbone. 29. ultralytics/cfg/models/rt-detr/rtdetr-C2f-HDRAB.yaml 使用[Pattern Recognition 2024|DRANet](https://github.com/WenCongWu/DRANet)中的HDRAB(hybrid dilated residual attention block)结合CSP思想改进backbone. 30. ultralytics/cfg/models/rt-detr/rtdetr-C2f-RAB.yaml 使用[Pattern Recognition 2024|DRANet](https://github.com/WenCongWu/DRANet)中的RAB( residual attention block)结合CSP思想改进backbone. 31. ultralytics/cfg/models/rt-detr/rtdetr-C2f-FCA.yaml 使用[FreqFormer](https://github.com/JPWang-CS/FreqFormer)的Frequency-aware Cascade Attention与CSP结合改进backbone. 32. ultralytics/cfg/models/rt-detr/rtdetr-C2f-CAMixer.yaml 使用[CAMixerSR CVPR2024](https://github.com/icandle/CAMixerSR)中的CAMixer与CSP结合改进backbone. 33. 
ultralytics/cfg/models/rt-detr/rtdetr-C2f-HFERB.yaml 使用[ICCV2023 CRAFT-SR](https://github.com/AVC2-UESTC/CRAFT-SR)中的high-frequency enhancement residual block与CSP结合改进backbone. 34. ultralytics/cfg/models/rt-detr/rtdetr-C2f-DTAB.yaml 使用[AAAI2025 TBSN](https://github.com/nagejacob/TBSN)中的DTAB与CSP结合改进backbone. 35. ultralytics/cfg/models/rt-detr/rtdetr-C2f-JDPM.yaml 使用[ECCV2024 FSEL](https://github.com/CSYSI/FSEL)中的joint domain perception module与CSP结合改进backbone. 36. ultralytics/cfg/models/rt-detr/rtdetr-C2f-ETB.yaml 使用[ECCV2024 FSEL](https://github.com/CSYSI/FSEL)中的entanglement transformer block与CSP结合改进backbone. 37. ultralytics/cfg/models/rt-detr/rtdetr-C2f-FDT.yaml 使用[ACMMM2024 WFEN](https://github.com/PRIS-CV/WFEN)中的Full-domain Transformer与CSP结合改进backbone. 38. ultralytics/cfg/models/rt-detr/rtdetr-C2f-AP.yaml 使用[AAAI2025 Pinwheel-shaped Convolution and Scale-based Dynamic Loss for Infrared Small Target Detection](https://github.com/JN-Yang/PConv-SDloss-Data)中的Asymmetric Padding bottleneck改进rtdetr. 39. ultralytics/cfg/models/rt-detr/rtdetr-C2f-ELGCA.yaml 使用[ELGC-Net](https://github.com/techmn/elgcnet)中的ELGCA与CSP结合改进backbone. 40. ultralytics/cfg/models/rt-detr/rtdetr-C2f-Strip.yaml 使用[Strip R-CNN](https://arxiv.org/pdf/2501.03775)中的StripBlock与CSP结合改进backbone. 41. ultralytics/cfg/models/rt-detr/rtdetr-C2f-KAT.yaml 使用[ICLR2025 Kolmogorov-Arnold Transformer](https://github.com/Adamdad/kat)中的KAT与CSP结合改进backbone. 42. ultralytics/cfg/models/rt-detr/rtdetr-C2f-GlobalFilter.yaml 使用[T-PAMI Global Filter Networks for Image Classification](https://github.com/raoyongming/GFNet)中的GlobalFilterBlock和[TransNeXt CVPR2024](https://github.com/DaiShiResearch/TransNeXt)中的Convolutional GLU和CSP改进rtdetr-backbone. 43. ultralytics/cfg/models/rt-detr/rtdetr-C2f-DynamicFilter.yaml 使用[AAAI2024 FFT-Based Dynamic Token Mixer for Vision](https://github.com/okojoalg/dfformer)中的DynamicFilter与CSP改进rtdetr-backbone. 44. 
ultralytics/cfg/models/rt-detr/rtdetr-RepHMS.yaml 使用[MHAF-YOLO](https://github.com/yang-0201/MHAF-YOLO)中的RepHMS改进rtdetr. 45. ultralytics/cfg/models/rt-detr/rtdetr-C2f-SAVSS.yaml 使用[CVPR2025 SCSegamba](https://github.com/Karl1109/SCSegamba)中的Structure-Aware Scanning Strategy与CSP结合改进backbone. 46. ultralytics/cfg/models/rt-detr/rtdetr-mambaout.yaml 使用[CVPR2025 MambaOut](https://github.com/yuweihao/MambaOut)中的MambaOut替换BackBone. 47. ultralytics/cfg/models/rt-detr/rtdetr-C2f-mambaout.yaml 使用[CVPR2025 MambaOut](https://github.com/yuweihao/MambaOut)中的MambaOut与CSP结合改进backbone. 48. ultralytics/cfg/models/rt-detr/rtdetr-C2f-EfficientVIM.yaml 使用[CVPR2025 EfficientViM](https://github.com/mlvlab/EfficientViM)中的EfficientViMBlock与CSP结合改进backbone. 49. ultralytics/cfg/models/rt-detr/rtdetr-C2f-IEL.yaml 使用[CVPR2025 HVI](https://github.com/Fediory/HVI-CIDNet)中的Intensity Enhancement Layer与CSP改进rtdetr中的BackBone. 50. ultralytics/cfg/models/rt-detr/rtdetr-overlock.yaml 使用[CVPR2025 OverLock](https://arxiv.org/pdf/2502.20087)中的overlock-backbone替换rtdetr-r18的backbone. 51. ultralytics/cfg/models/rt-detr/rtdetr-C2f-RCB.yaml 使用[CVPR2025 OverLock](https://arxiv.org/pdf/2502.20087)中的RepConvBlock与CSP改进rtdetr-r18的backbone. 52. ultralytics/cfg/models/rt-detr/rtdetr-C2f-LEGM.yaml 使用[CVPR2024 DCMPNet](https://github.com/zhoushen1/DCMPNet)中的LEGM与CSP改进rtdetr-r18的backbone. 53. ultralytics/cfg/models/rt-detr/rtdetr-C2f-FAT.yaml 使用[ICLR2024-FTIC](https://github.com/qingshi9974/ICLR2024-FTIC)中的FATBlock与CSP改进rtdetr-r18的backbone. 54. ultralytics/cfg/models/rt-detr/rtdetr-C2f-MobileMamba.yaml 使用[CVPR2025 MobileMamba](https://github.com/lewandofskee/MobileMamba)中的MobileMambaBlock与CSP思想改进backbone. 55. ultralytics/cfg/models/rt-detr/rtdetr-MobileMamba.yaml 使用[CVPR2025 MobileMamba](https://github.com/lewandofskee/MobileMamba)中的MobileMamba改进Backbone. 56. ultralytics/cfg/models/rt-detr/rtdetr-C2f-LFEM.yaml 使用[LEGNet](https://github.com/lwCVer/LEGNet)中的LFEModule与CSP思想改进backbone. 57.
ultralytics/cfg/models/rt-detr/rtdetr-C2f-SBSM.yaml 使用[WACV2025 SEM-Net](https://github.com/ChrisChen1023/SEM-Net)的Snake Bi-Directional Sequence Modelling (SBSM)与CSP思想改进backbone. 58. ultralytics/cfg/models/rt-detr/rtdetr-lsnet.yaml 使用[CVPR2025 LSNet](https://github.com/THU-MIG/lsnet)的LSNet替换backbone. 59. ultralytics/cfg/models/rt-detr/rtdetr-C2f-LSBlock.yaml 使用[CVPR2025 LSNet](https://github.com/THU-MIG/lsnet)的LSBlock改进C2f. 60. ultralytics/cfg/models/rt-detr/rtdetr-C2f-TransMamba.yaml 使用[TransMamba](https://github.com/sunshangquan/TransMamba)的TransMamba与CSP思想改进backbone. 61. ultralytics/cfg/models/rt-detr/rtdetr-C2f-EVS.yaml 使用[CVPR2025 EVSSM](https://github.com/kkkls/EVSSM)中的EVS与CSP思想改进backbone. 62. ultralytics/cfg/models/rt-detr/rtdetr-C2f-EBlock.yaml 使用[CVPR2025 DarkIR](https://github.com/cidautai/DarkIR)中的EBlock与CSP思想改进backbone. 63. ultralytics/cfg/models/rt-detr/rtdetr-C2f-DBlock.yaml 使用[CVPR2025 DarkIR](https://github.com/cidautai/DarkIR)中的DBlock与CSP思想改进backbone. 64. ultralytics/cfg/models/rt-detr/rtdetr-C2f-FDConv.yaml 使用[CVPR2025 Frequency Dynamic Convolution for Dense Image Prediction](https://github.com/Linwei-Chen/FDConv)的FDConv与CSP思想改进BackBone. 65. ultralytics/cfg/models/rt-detr/rtdetr-C2f-DSAN.yaml 使用[DSA: Deformable Spatial Attention](https://www.techrxiv.org/users/628671/articles/775010-deformable-spatial-attention-networks-enhancing-lightweight-convolutional-models-for-vision-tasks)中的Deformable Spatial Attention Block与CSP改进BackBone. 66. ultralytics/cfg/models/rt-detr/rtdetr-C2f-DSA.yaml 使用[DSA: Deformable Spatial Attention](https://www.techrxiv.org/users/628671/articles/775010-deformable-spatial-attention-networks-enhancing-lightweight-convolutional-models-for-vision-tasks)中的Deformable Spatial Attention与CSP改进BackBone. 67. ultralytics/cfg/models/rt-detr/rtdetr-C2f-RMB.yaml 使用[CVPR2025 MaIR](https://github.com/XLearning-SCU/2025-CVPR-MaIR)中的Residual Mamba Block与CSP思想改进BackBone. 68.
ultralytics/cfg/models/rt-detr/rtdetr-C2f-SFSConv.yaml 使用[CVPR2024 SFSConv](https://github.com/like413/SFS-Conv)的SFSConv改进C2f. 69. ultralytics/cfg/models/rt-detr/rtdetr-C2f-GroupMamba.yaml 使用[CVPR2025 GroupMamba](https://github.com/Amshaker/GroupMamba)中的GroupMambaLayer与CSP思想改进Backbone. 70. ultralytics/cfg/models/rt-detr/rtdetr-C2f-GroupMambaBlock.yaml 使用[CVPR2025 GroupMamba](https://github.com/Amshaker/GroupMamba)中的GroupMambaBlock与CSP思想改进Backbone. 71. ultralytics/cfg/models/rt-detr/rtdetr-C2f-MambaVision.yaml 使用[CVPR2025 MambaVision](https://github.com/NVlabs/MambaVision)中的MambaVision与CSP思想改进Backbone. 72. ultralytics/cfg/models/rt-detr/rtdetr-FCM.yaml 使用[AAAI2025 FBRT-YOLO](https://github.com/galaxy-oss/FCM)的模块改进rtdetr. 73. ultralytics/cfg/models/rt-detr/rtdetr-C2f-FourierConv.yaml 使用[MIA2025 Fourier Convolution Block with global receptive field for MRI reconstruction](https://www.sciencedirect.com/science/article/abs/pii/S1361841524002743)中的FourierConv改进C2f. 74. ultralytics/cfg/models/rt-detr/rtdetr-C2f-wConv.yaml 使用[weightedConvolution2.0](https://github.com/cammarasana123/weightedConvolution2.0)中的wConv2d改进C2f. 75. ultralytics/cfg/models/rt-detr/rtdetr-C2f-GLVSS.yaml 使用[TGRS2025 UMFormer](https://github.com/takeyoutime/UMFormer)中的GLVSS与CSP改进backbone. 76. ultralytics/cfg/models/rt-detr/rtdetr-C2f-ESC.yaml 使用[ICCV2025 ESC: Emulating Self-attention with Convolution for Efficient Image Super-Resolution](https://github.com/dslisleedh/ESC)中的ESC与CSP改进backbone. 77. ultralytics/cfg/models/rt-detr/rtdetr-C2f-MBRConv3.yaml 使用[ICCV2025 MobileIE](https://github.com/AVC2-UESTC/MobileIE)中的MBRConv3与CSP改进backbone. 78. ultralytics/cfg/models/rt-detr/rtdetr-C2f-MBRConv5.yaml 使用[ICCV2025 MobileIE](https://github.com/AVC2-UESTC/MobileIE)中的MBRConv5与CSP改进backbone. 79. ultralytics/cfg/models/rt-detr/rtdetr-C2f-MBRConv3.yaml 使用[ICCV2025 MobileIE](https://github.com/AVC2-UESTC/MobileIE)中的MBRConv3与CSP改进backbone. 80. 
ultralytics/cfg/models/rt-detr/rtdetr-C2f-VSSD.yaml 使用[ICCV2025 VSSD](https://github.com/YuHengsss/VSSD)中的VSSD与CSP改进backbone. 81. ultralytics/cfg/models/rt-detr/rtdetr-C2f-TVIM.yaml 使用[ICCV2025 TinyVIM](https://arxiv.org/abs/2411.17473)中的TinyVIMBlock与CSP改进backbone. 82. ultralytics/cfg/models/rt-detr/rtdetr-C2f-CSI.yaml 使用[INFFUS2025 SAMamba](https://arxiv.org/pdf/2505.23214)中的CSI与C2f改进Backbone. 83. ultralytics/cfg/models/rt-detr/rtdetr-C2f-ConvAttn.yaml 使用[ICCV2025 ESC: Emulating Self-attention with Convolution for Efficient Image Super-Resolution](https://github.com/dslisleedh/ESC)中的ConvAttn与CSP改进Backbone. 84. ultralytics/cfg/models/rt-detr/rtdetr-C2f-UniConvBlock.yaml 使用[ICCV2025 UniConvBlock](https://github.com/ai-paperwithcode/UniConvNet)中的UniConvBlock与CSP思想改进backbone. 85. ultralytics/cfg/models/rt-detr/rtdetr-C2f-LGLB.yaml 使用[ACM MM 2025 Mobile U-ViT](https://github.com/FengheTan9/Mobile-U-ViT)中的LGLBBlock与CSP思想改进backbone. 86. ultralytics/cfg/models/rt-detr/rtdetr-C2f-ConverseB.yaml 使用[ICCV2025 ConverseBNet](https://github.com/cszn/ConverseNet)中的ConverseBlock与CSP思想改进backbone. 87. ultralytics/cfg/models/rt-detr/rtdetr-C2f-Converse2D.yaml 使用[ICCV2025 ConverseBNet](https://github.com/cszn/ConverseNet)中的Converse2D与CSP思想改进backbone. 88. ultralytics/cfg/models/rt-detr/rtdetr-C2f-GCConv.yaml 使用[CVPR2025 Golden Cudgel Network](https://github.com/gyyang23/GCNet)中的GCConv与CSP改进backbone. 89. ultralytics/cfg/models/rt-detr/rtdetr-C2f-CFBlock.yaml 使用[AAAI2024 SCTNet](https://arxiv.org/pdf/2312.17071)中的CFBlock与CSP改进backbone. 90. ultralytics/cfg/models/rt-detr/rtdetr-C2f-FMABlock.yaml 使用[IJCV2024 SRConvNet](https://github.com/lifengcs/SRConvNet)中的FMABlock与CSP思想改进backbone. 91. ultralytics/cfg/models/rt-detr/rtdetr-C2f-LWGA.yaml 使用[LWGANet](https://github.com/lwCVer/LWGANet)中的LWGABlock与CSP思想改进backbone. 92. ultralytics/cfg/models/rt-detr/rtdetr-C2f-CSSC.yaml 使用[TGRS2025 ASCNet](https://ieeexplore.ieee.org/document/10855453)中的CSSC与CSP改进backbone. 93. 
ultralytics/cfg/models/rt-detr/rtdetr-C2f-CNCM.yaml 使用[TGRS2025 ASCNet](https://ieeexplore.ieee.org/document/10855453)中的CNCM与CSP改进backbone. 94. ultralytics/cfg/models/rt-detr/rtdetr-C2f-HFRB.yaml 使用[ICCV2025 HFRB](https://arxiv.org/pdf/2507.10689)中的HFRB与CSP改进backbone. 95. ultralytics/cfg/models/rt-detr/rtdetr-C2f-EVA.yaml 使用[ICIP2025 BEVANET](https://arxiv.org/pdf/2508.07300)中的EVA与CSP改进backbone. 96. ultralytics/cfg/models/rt-detr/rtdetr-C2f-RMBC.yaml 使用[PlainUSR](https://arxiv.org/pdf/2409.13435)中的RepMBConv与CSP改进backbone. 97. ultralytics/cfg/models/rt-detr/rtdetr-C2f-RMBC-LA.yaml 使用[PlainUSR](https://arxiv.org/pdf/2409.13435)中的RepMBConv、Local Importance-based Attention与CSP改进backbone. 98. ultralytics/cfg/models/rt-detr/rtdetr-C2f-SFMB.yaml 使用[TIP2025 SFMB](https://arxiv.org/pdf/2511.06593v1)中的SFMB与CSP改进backbone. 99. ultralytics/cfg/models/rt-detr/rtdetr-ESMoE.yaml 使用[YOLO-Master](https://github.com/isLinXu/YOLO-Master)中的ES-MoE模块改进RTDETR. 100. ultralytics/cfg/models/rt-detr/rtdetr-FAENet.yaml 使用[TGRS2025 MASFNet](https://ieeexplore.ieee.org/document/10955257)中的FAENet增强输入图像的特征. 101. ultralytics/cfg/models/rt-detr/rtdetr-C2f-MFEB.yaml 使用[MICCAI2023 SHISRCNet](https://arxiv.org/abs/2306.14119)中的MFEB与CSP改进Backbone. 102. ultralytics/cfg/models/rt-detr/rtdetr-C2f-PartialNetBlock.yaml 使用[AAAI2026 Partial Channel Network](https://arxiv.org/pdf/2502.01303)中的PartialNetBlock与CSP改进Backbone. 103. ultralytics/cfg/models/rt-detr/rtdetr-C2f-DGR.yaml 使用[TGRS2025 DRPCA-Net](https://arxiv.org/pdf/2507.09541)中的DRG与CSP改进Backbone. 104. ultralytics/cfg/models/rt-detr/rtdetr-C2f-GLGM.yaml 使用[TGRS2025 ISGLNet](https://ieeexplore.ieee.org/document/11232501)中的GLGM与CSP改进Backbone. 105. ultralytics/cfg/models/rt-detr/rtdetr-C2f-MAC.yaml 使用[TGRS2025 HDNet](https://ieeexplore.ieee.org/document/11232501)中的MAC与CSP改进Backbone. 106. ultralytics/cfg/models/rt-detr/rtdetr-C2f-SPJFB.yaml 使用[AAAI2026 SPJFNet](https://arxiv.org/pdf/2508.04041)中的SPJFBlock与CSP改进Backbone. 107. 
ultralytics/cfg/models/rt-detr/rtdetr-C2f-GLSS2D.yaml 使用[TGRS2025 GLVMamba](https://ieeexplore.ieee.org/document/11014226)中的GLSS2D与CSP改进Backbone. 108. ultralytics/cfg/models/rt-detr/rtdetr-C2f-DEGConv.yaml 使用[CVPR2026 MixerCSeg](https://arxiv.org/pdf/2603.01361)中的DEGConv与CSP改进Backbone. 109. ultralytics/cfg/models/rt-detr/rtdetr-C2f-TransMixer.yaml 使用[CVPR2026 MixerCSeg](https://arxiv.org/pdf/2603.01361)中的TransMixer与CSP改进Backbone. ### AIFI系列 1. ultralytics/cfg/models/rt-detr/rtdetr-AIFI-LPE.yaml 使用LearnedPositionalEncoding改进AIFI中的位置编码生成.(详细介绍请看百度云视频-20231119更新说明) 2. ultralytics/cfg/models/rt-detr/rtdetr-CascadedGroupAttention.yaml 使用[EfficientViT CVPR2023](https://github.com/microsoft/Cream/tree/main/EfficientViT)中的CascadedGroupAttention改进rtdetr中的AIFI.(详细请看百度云视频-rtdetr-CascadedGroupAttention说明) 3. ultralytics/cfg/models/rt-detr/rtdetr-AIFI-DAttention.yaml 使用[Vision Transformer with Deformable Attention CVPR2022](https://github.com/LeapLabTHU/DAT)中的DAttention改进AIFI. 4. ultralytics/cfg/models/rt-detr/rtdetr-AIFI-HiLo.yaml 使用[LITv2](https://github.com/ziplab/LITv2)中具有提取高低频信息的高效注意力对AIFI进行二次改进. 5. ultralytics/cfg/models/rt-detr/rtdetr-AIFI-EfficientAdditive.yaml 使用[ICCV2023 SwiftFormer](https://github.com/Amshaker/SwiftFormer/tree/main)中的EfficientAdditiveAttention改进AIFI. 6. ultralytics/cfg/models/rt-detr/rtdetr-AIFIRepBN.yaml 使用[ICML-2024 SLAB](https://github.com/xinghaochen/SLAB)中的RepBN改进AIFI. 7. ultralytics/cfg/models/rt-detr/rtdetr-AdditiveTokenMixer.yaml 使用[CAS-ViT](https://github.com/Tianfang-Zhang/CAS-ViT)中的AdditiveBlock改进AIFI. 8. ultralytics/cfg/models/rt-detr/rtdetr-AIFI-MSMHSA.yaml 使用[CMTFNet](https://github.com/DrWuHonglin/CMTFNet/tree/main)中的M2SA改进AIFI. 9. ultralytics/cfg/models/rt-detr/rtdetr-AIFI-DHSA.yaml 使用[Histoformer ECCV2024](https://github.com/sunshangquan/Histoformer)中的Dynamic-range Histogram Self-Attention改进AIFI. 10. ultralytics/cfg/models/rt-detr/rtdetr-AIFI-DPB.yaml 使用[CrossFormer](https://arxiv.org/pdf/2108.00154)中的DynamicPosBias-Attention改进AIFI. 
11. ultralytics/cfg/models/rt-detr/rtdetr-DTAB.yaml 使用[AAAI2025 TBSN](https://github.com/nagejacob/TBSN)中的DTAB替换AIFI. 12. ultralytics/cfg/models/rt-detr/rtdetr-ETB.yaml 使用[ECCV2024 FSEL](https://github.com/CSYSI/FSEL)中的entanglement transformer block替换AIFI. 13. ultralytics/cfg/models/rt-detr/rtdetr-FDT.yaml 使用[ACMMM2024 WFEN](https://github.com/PRIS-CV/WFEN)中的Full-domain Transformer替换AIFI. 14. ultralytics/cfg/models/rt-detr/rtdetr-Pola.yaml 使用[ICLR2025 PolaFormer](https://github.com/ZacharyMeng/PolaFormer)中的PolaAttention改进AIFI. 15. ultralytics/cfg/models/rt-detr/rtdetr-AIFI-TSSA.yaml 使用[Token Statistics Transformer](https://github.com/RobinWu218/ToST)中的Token Statistics Self-Attention改进AIFI. 16. ultralytics/cfg/models/rt-detr/rtdetr-AIFI-ASSA.yaml 使用[CVPR2024 Adapt or Perish: Adaptive Sparse Transformer with Attentive Feature Refinement for Image Restoration](https://openaccess.thecvf.com/content/CVPR2024/papers/Zhou_Adapt_or_Perish_Adaptive_Sparse_Transformer_with_Attentive_Feature_Refinement_CVPR_2024_paper.pdf)中的Adaptive Sparse Self-Attention改进AIFI. 17. ultralytics/cfg/models/rt-detr/rtdetr-ASSR.yaml 使用[CVPR2025 MambaIR](https://github.com/csguoh/MambaIR)中的Attentive State Space Group改进rtdetr. 18. ultralytics/cfg/models/rt-detr/rtdetr-AIFI-SEFN.yaml 使用[WACV2025 SEM-Net](https://github.com/ChrisChen1023/SEM-Net)的Spatially-Enhanced Feedforward Network (SEFN)改进AIFI. 19. ultralytics/cfg/models/rt-detr/rtdetr-AIFI-DyT.yaml 使用[CVPR2025 DyT](https://github.com/jiachenzhu/DyT)中的DynamicTan改进AIFI. 20. ultralytics/cfg/models/rt-detr/rtdetr-AIFI-SEFFN.yaml 使用[TransMamba](https://github.com/sunshangquan/TransMamba)的SpectralEnhancedFFN改进AIFI. 21. ultralytics/cfg/models/rt-detr/rtdetr-AIFI-EDFFN.yaml 使用[CVPR2025 EVSSM](https://github.com/kkkls/EVSSM)中的EDFFN改进AIFI. 22. ultralytics/cfg/models/rt-detr/rtdetr-AIFI-MSLA.yaml 使用[MSLA](https://arxiv.org/pdf/2505.18823)改进AIFI. 23.
ultralytics/cfg/models/rt-detr/rtdetr-AIFI-EPGO.yaml 使用[ACM MM 2025 CPRAformer](https://github.com/zs1314/CPRAformer)中的EPGO改进AIFI. 24. ultralytics/cfg/models/rt-detr/rtdetr-AIFI-SHSA.yaml 使用[SHViT CVPR2024](https://github.com/ysj9909/SHViT)中的SHSA改进AIFI. 25. ultralytics/cfg/models/rt-detr/rtdetr-AIFI-DML.yaml 使用[IJCV2024 SRConvNet](https://github.com/lifengcs/SRConvNet)中的DMI改进AIFI. 26. ultralytics/cfg/models/rt-detr/rtdetr-AIFI-LRSA.yaml 使用[TPAMI2025 LRFormer](https://mmcheng.net/wp-content/uploads/2025/06/25PAMI_LRFormer.pdf)中的LRSA改进AIFI. 27. ultralytics/cfg/models/rt-detr/rtdetr-AIFI-MALA.yaml 使用[ICCV2025 Rectifying Magnitude Neglect in Linear Attention](https://arxiv.org/pdf/2507.00698)中的MALA改进AIFI. 28. ultralytics/cfg/models/rt-detr/rtdetr-AIFI-EGSA.yaml 使用[ACMMM2025 FlickCD](https://dl.acm.org/doi/epdf/10.1145/3746027.3755657)中的EGSA改进AIFI. 29. ultralytics/cfg/models/rt-detr/rtdetr-AIFI-SWSA.yaml 使用[ACMMM2025 FlickCD](https://dl.acm.org/doi/epdf/10.1145/3746027.3755657)中的SWSA改进AIFI. 30. ultralytics/cfg/models/rt-detr/rtdetr-AIFI-DWMMSA.yaml 使用[TIP2025 DSMT](https://ieeexplore.ieee.org/document/10955125)中的DWMMSA改进AIFI. 31. ultralytics/cfg/models/rt-detr/rtdetr-AIFI-BinaryAttn.yaml 使用[CVPR2026 BinaryAttention](https://arxiv.org/abs/2602.00701)中的BinaryAttention改进AIFI. 32. ultralytics/cfg/models/rt-detr/rtdetr-AIFI-WCA.yaml 使用[CVPR2025 Wavelet and Prototype Augmented Query-based Transformer for Pixel-level Surface Defect Detection](https://openaccess.thecvf.com/content/CVPR2025/papers/Yan_Wavelet_and_Prototype_Augmented_Query-based_Transformer_for_Pixel-level_Surface_Defect_CVPR_2025_paper.pdf)中的WCA改进AIFI. ### Neck系列 1. ultralytics/cfg/models/rt-detr/rtdetr-ASF.yaml 使用[ASF-YOLO](https://github.com/mkang315/ASF-YOLO)中的Attentional Scale Sequence Fusion来改进rtdetr. 2. ultralytics/cfg/models/rt-detr/rtdetr-slimneck.yaml 使用[SlimNeck](https://github.com/AlanLi1997/slim-neck-by-gsconv)中的VoVGSCSP\VoVGSCSPC和GSConv改进rtdetr中的CCFM. 3. 
ultralytics/cfg/models/rt-detr/rtdetr-SDI.yaml 使用[U-NetV2](https://github.com/yaoppeng/U-Net_v2)中的 Semantics and Detail Infusion Module对CCFM中的feature fusion进行改进. 4. ultralytics/cfg/models/rt-detr/rtdetr-goldyolo.yaml 利用华为2023最新GOLD-YOLO中的Gather-and-Distribute进行改进特征融合模块. 5. ultralytics/cfg/models/rt-detr/rtdetr-HSFPN.yaml 使用[MFDS-DETR](https://github.com/JustlfC03/MFDS-DETR)中的HS-FPN改进RTDETR中的CCFM. 6. ultralytics/cfg/models/rt-detr/rtdetr-bifpn.yaml 添加BIFPN到rtdetr-r18中. 其中BIFPN中有三个可选参数: 1. Fusion 其中BIFPN中的Fusion模块支持五种: weight, adaptive, concat, bifpn(default), SDI 其中weight, adaptive, concat出自[paper链接-Figure 3](https://openreview.net/pdf?id=q2ZaVU6bEsT), SDI出自[U-NetV2](https://github.com/yaoppeng/U-Net_v2) 2. node_mode block模块选择,具体可看对应百度云视频-20240302更新公告. 3. head_channel BIFPN中的通道数,默认设置为256. 7. ultralytics/cfg/models/rt-detr/rtdetr-CSFCN.yaml 使用[Context and Spatial Feature Calibration for Real-Time Semantic Segmentation](https://github.com/kaigelee/CSFCN/tree/main)中的Context and Spatial Feature Calibration模块改进rtdetr-neck. 8. ultralytics/cfg/models/rt-detr/rtdetr-CGAFusion.yaml 使用[DEA-Net](https://github.com/cecret3350/DEA-Net)中的content-guided attention fusion改进rtdetr-neck. 9. ultralytics/cfg/models/rt-detr/rtdetr-SDFM.yaml 使用[PSFusion](https://github.com/Linfeng-Tang/PSFusion)中的superficial detail fusion module改进rtdetr-neck. 10. ultralytics/cfg/models/rt-detr/rtdetr-PSFM.yaml 使用[PSFusion](https://github.com/Linfeng-Tang/PSFusion)中的profound semantic fusion module改进rtdetr-neck. 11. ultralytics/cfg/models/rt-detr/rtdetr-GLSA.yaml 使用[GLSA](https://github.com/Barrett-python/DuAT)模块改进rtdetr的neck. 12. ultralytics/cfg/models/rt-detr/rtdetr-CTrans.yaml 使用[[AAAI2022] UCTransNet](https://github.com/McGregorWwww/UCTransNet/tree/main)中的ChannelTransformer改进rtdetr-neck. 13. ultralytics/cfg/models/rt-detr/rtdetr-p6-CTrans.yaml 使用[[AAAI2022] UCTransNet](https://github.com/McGregorWwww/UCTransNet/tree/main)中的ChannelTransformer改进rtdetr-neck.(带有p6版本) 14.
ultralytics/cfg/models/rt-detr/rtdetr-MAFPN.yaml 使用[MAF-YOLO](https://arxiv.org/pdf/2407.04381)的MAFPN改进Neck. 15. Cross-Layer Feature Pyramid Transformer. P345:ultralytics/cfg/models/rt-detr/rtdetr-CFPT.yaml P3456:ultralytics/cfg/models/rt-detr/rtdetr-CFPT-P3456.yaml 使用[CFPT](https://github.com/duzw9311/CFPT/tree/main)改进neck. 16. ultralytics/cfg/models/rt-detr/rtdetr-FreqFFPN.yaml 使用[FreqFusion TPAMI2024](https://github.com/Linwei-Chen/FreqFusion)中的FreqFusion改进Neck.(这个需要python3.10,不然最后保存模型会出错.) 17. ultralytics/cfg/models/rt-detr/rtdetr-msga.yaml 使用[MSA^2 Net](https://github.com/xmindflow/MSA-2Net)中的Multi-Scale Adaptive Spatial Attention Gate改进rtdetr-neck. 18. ultralytics/cfg/models/rt-detr/rtdetr-WFU.yaml 使用[ACMMM2024 WFEN](https://github.com/PRIS-CV/WFEN)中的Wavelet Feature Upgrade改进rtdetr-neck. 19. ultralytics/cfg/models/rt-detr/rtdetr-mpcafsa.yaml 使用[BIBM2024 Spatial-Frequency Dual Domain Attention Network For Medical Image Segmentation](https://github.com/nkicsl/SF-UNet)的Frequency-Spatial Attention和Multi-scale Progressive Channel Attention改进rtdetr-neck. 20. ultralytics/cfg/models/rt-detr/rtdetr-fsa.yaml 使用[BIBM2024 Spatial-Frequency Dual Domain Attention Network For Medical Image Segmentation](https://github.com/nkicsl/SF-UNet)的Frequency-Spatial Attention改进rtdetr. 21. ultralytics/cfg/models/rt-detr/rtdetr-CAB.yaml 使用[CVPR2025 HVI](https://github.com/Fediory/HVI-CIDNet)中的CAB改进rtdetr中的特征融合. 22. ultralytics/cfg/models/rt-detr/rtdetr-MFM.yaml 使用[CVPR2024 DCMPNet](https://github.com/zhoushen1/DCMPNet)中的MFM改进neck. 23. ultralytics/cfg/models/rt-detr/rtdetr-GDSAFusion.yaml 使用[CVPR2025 OverLock](https://arxiv.org/pdf/2502.20087)中的GDSAFusion改进Fusion. 24. ultralytics/cfg/models/rt-detr/rtdetr-PST.yaml 使用[Pyramid Sparse Transformer](https://arxiv.org/abs/2505.12772)中的Pyramid Sparse Transformer改进rtdetr-r18. 25. ultralytics/cfg/models/rt-detr/rtdetr-HS-FPN.yaml 使用[AAAI2025 HS-FPN](https://github.com/ShiZican/HS-FPN/tree/main)中的HFP和SDP改进rtdetr-neck. 26. 
ultralytics/cfg/models/rt-detr/rtdetr-HyperACE.yaml 使用[yolo13](https://github.com/iMoonLab/yolov13)中的HyperACE改进rtdetr-neck. 27. ultralytics/cfg/models/rt-detr/rtdetr-DPCF.yaml 使用[INFFUS2025 SAMamba](https://arxiv.org/pdf/2505.23214)中的DPCF改进rtdetr-neck. 28. ultralytics/cfg/models/rt-detr/rtdetr-RFPN.yaml 使用[ECCV2024 rethinking-fpn](https://github.com/AlanLi1997/rethinking-fpn)的SNI和GSConvE改进rtdetr-neck. 29. ultralytics/cfg/models/rt-detr/rtdetr-LCA.yaml 使用[CVPR2025 HVI](https://arxiv.org/pdf/2502.20272)中的LCA改进rtdetr-neck. 30. ultralytics/cfg/models/rt-detr/rtdetr-HFFE.yaml 使用[TGRS2025 HAFNet](https://ieeexplore.ieee.org/document/11154006)中的HFFE改进rtdetr-neck. 31. ultralytics/cfg/models/rt-detr/rtdetr-MFPM.yaml 使用[TGRS2025 ISGLNet](https://ieeexplore.ieee.org/document/11232501)中的MFPM改进特征融合. 32. ultralytics/cfg/models/rt-detr/rtdetr-ERM.yaml 使用[TGRS2025 ISGLNet](https://ieeexplore.ieee.org/document/11232501)中的ERM改进特征融合. 33. ultralytics/cfg/models/rt-detr/rtdetr-CAFM.yaml 使用[TIP2025 DSMT](https://ieeexplore.ieee.org/document/10955125)中的CAFM改进rtdetr-neck. ### Head系列 1. ultralytics/cfg/models/rt-detr/rtdetr-p2.yaml 添加小目标检测头P2到TransformerDecoderHead中. ### RepC3改进系列 1. ultralytics/cfg/models/rt-detr/rtdetr-DWRC3.yaml 使用[DWRSeg](https://arxiv.org/abs/2212.01173)中的Dilation-wise Residual(DWR)模块构建DWRC3改进rtdetr. 2. ultralytics/cfg/models/rt-detr/rtdetr-Conv3XCC3.yaml 使用[Swift Parameter-free Attention Network](https://github.com/hongyuanyu/SPAN/tree/main)中的Conv3XC改进RepC3. 3. ultralytics/cfg/models/rt-detr/rtdetr-DRBC3.yaml 使用[UniRepLKNet](https://github.com/AILab-CVC/UniRepLKNet/tree/main)中的DilatedReparamBlock改进RepC3. 4. ultralytics/cfg/models/rt-detr/rtdetr-DBBC3.yaml 使用[DiverseBranchBlock CVPR2021](https://github.com/DingXiaoH/DiverseBranchBlock)改进RepC3. 5. ultralytics/cfg/models/rt-detr/rtdetr-DGCST.yaml 使用[Lightweight Object Detection](https://arxiv.org/abs/2403.01736)中的Dynamic Group Convolution Shuffle Transformer改进rtdetr-r18. 6. 
ultralytics/cfg/models/rt-detr/rtdetr-DGCST2.yaml 使用[Lightweight Object Detection](https://arxiv.org/abs/2403.01736)中的Dynamic Group Convolution Shuffle Transformer与Dynamic Group Convolution Shuffle Module进行结合改进rtdetr-r18. 7. ultralytics/cfg/models/rt-detr/rtdetr-RetBlockC3.yaml 使用[CVPR2024 RMT](https://arxiv.org/abs/2309.11523)中的RetBlock改进RepC3. 8. ultralytics/cfg/models/rt-detr/rtdetr-KANC3.yaml 使用[Pytorch-Conv-KAN](https://github.com/IvanDrokin/torch-conv-kan)的KAN卷积算子改进RepC3. 目前支持: 1. FastKANConv2DLayer 2. KANConv2DLayer 3. KALNConv2DLayer 4. KACNConv2DLayer 5. KAGNConv2DLayer 9. ultralytics/cfg/models/rt-detr/rtdetr-gConvC3.yaml 使用[Rethinking Performance Gains in Image Dehazing Networks](https://arxiv.org/abs/2209.11448)的gConvblock改进RepC3. 10. ultralytics/cfg/models/rt-detr/rtdetr-LFEC3.yaml 使用[Efficient Long-Range Attention Network for Image Super-resolution ECCV2022](https://github.com/xindongzhang/ELAN)中的Local feature extraction改进RepC3. 11. ultralytics/cfg/models/rt-detr/rtdetr-IELC3.yaml 使用[CVPR2025 HVI](https://github.com/Fediory/HVI-CIDNet)中的Intensity Enhancement Layer改进rtdetr中的RepC3. 12. ultralytics/cfg/models/rt-detr/rtdetr-FDConvC3.yaml 使用[CVPR2025 Frequency Dynamic Convolution for Dense Image Prediction](https://github.com/Linwei-Chen/FDConv)的FDConv改进RepC3. 13. ultralytics/cfg/models/rt-detr/rtdetr-MBRConv3C3.yaml 使用[ICCV2025 MobileIE](https://github.com/AVC2-UESTC/MobileIE)中的MBRConv3改进RepC3. 14. ultralytics/cfg/models/rt-detr/rtdetr-MBRConv5C3.yaml 使用[ICCV2025 MobileIE](https://github.com/AVC2-UESTC/MobileIE)中的MBRConv5改进RepC3. 15. ultralytics/cfg/models/rt-detr/rtdetr-Converse2DC3.yaml 使用[ICCV2025 ConverseBNet](https://github.com/cszn/ConverseNet)中的Converse2D改进RepC3. ### ResNet主干中的BasicBlock/BottleNeck改进系列(以下改进BottleNeck基本都有,就不再重复标注) 1. ultralytics/cfg/models/rt-detr/rtdetr-Ortho.yaml 使用[OrthoNets](https://github.com/hady1011/OrthoNets/tree/main)中的正交通道注意力改进resnet18-backbone中的BasicBlock.(详细介绍请看百度云视频-20231119更新说明) 2. 
ultralytics/cfg/models/rt-detr/rtdetr-DCNV2.yaml 使用可变形卷积DCNV2改进resnet18-backbone中的BasicBlock. 3. ultralytics/cfg/models/rt-detr/rtdetr-DCNV3.yaml 使用可变形卷积[DCNV3 CVPR2023](https://github.com/OpenGVLab/InternImage)改进resnet18-backbone中的BasicBlock.(安装教程请看百度云视频-20231119更新说明) 4. ultralytics/cfg/models/rt-detr/rtdetr-iRMB.yaml 使用[EMO ICCV2023](https://github.com/zhangzjn/EMO)中的iRMB改进resnet18-backbone中的BasicBlock.(详细介绍请看百度云视频-20231119更新说明) 5. ultralytics/cfg/models/rt-detr/rtdetr-DySnake.yaml 添加[DySnakeConv](https://github.com/YaoleiQi/DSCNet)到resnet18-backbone中的BasicBlock中. 6. ultralytics/cfg/models/rt-detr/rtdetr-PConv.yaml 使用[FasterNet CVPR2023](https://github.com/JierunChen/FasterNet)中的PConv改进resnet18-backbone中的BasicBlock. 7. ultralytics/cfg/models/rt-detr/rtdetr-Faster.yaml 使用[FasterNet CVPR2023](https://github.com/JierunChen/FasterNet)中的Faster-Block改进resnet18-backbone中的BasicBlock. 8. ultralytics/cfg/models/rt-detr/rtdetr-AKConv.yaml 使用[AKConv 2023](https://github.com/CV-ZhangXin/AKConv)改进resnet18-backbone中的BasicBlock. 9. ultralytics/cfg/models/rt-detr/rtdetr-RFAConv.yaml 使用[RFAConv 2023](https://github.com/Liuchen1997/RFAConv)改进resnet18-backbone中的BasicBlock. 10. ultralytics/cfg/models/rt-detr/rtdetr-RFCAConv.yaml 使用[RFCAConv 2023](https://github.com/Liuchen1997/RFAConv)改进resnet18-backbone中的BasicBlock. 11. ultralytics/cfg/models/rt-detr/rtdetr-RFCBAMConv.yaml 使用[RFCBAMConv 2023](https://github.com/Liuchen1997/RFAConv)改进resnet18-backbone中的BasicBlock. 12. ultralytics/cfg/models/rt-detr/rtdetr-Conv3XC.yaml 使用[Swift Parameter-free Attention Network](https://github.com/hongyuanyu/SPAN/tree/main)中的Conv3XC改进resnet18-backbone中的BasicBlock. 13. ultralytics/cfg/models/rt-detr/rtdetr-DRB.yaml 使用[UniRepLKNet](https://github.com/AILab-CVC/UniRepLKNet/tree/main)中的DilatedReparamBlock改进resnet18-backbone中的BasicBlock. 14. ultralytics/cfg/models/rt-detr/rtdetr-DBB.yaml 使用[DiverseBranchBlock CVPR2021](https://github.com/DingXiaoH/DiverseBranchBlock)改进resnet18-backbone中的BasicBlock. 15. 
ultralytics/cfg/models/rt-detr/rtdetr-DualConv.yaml 使用[DualConv](https://github.com/ChipsGuardian/DualConv)改进resnet18-backbone中的BasicBlock. 16. ultralytics/cfg/models/rt-detr/rtdetr-AggregatedAtt.yaml 使用[TransNeXt](https://github.com/DaiShiResearch/TransNeXt)中的聚合感知注意力改进resnet18中的BasicBlock.(百度云视频-20240106更新说明) 17. ultralytics/cfg/models/rt-detr/rtdetr-DCNV4.yaml 使用[DCNV4](https://github.com/OpenGVLab/DCNv4)改进resnet18中的BasicBlock. 18. ultralytics/cfg/models/rt-detr/rtdetr-SWC.yaml 使用[shift-wise conv](https://arxiv.org/abs/2401.12736)改进resnet18中的BasicBlock. 19. ultralytics/cfg/models/rt-detr/rtdetr-VSS.yaml 使用最新的Mamba架构[Mamba-UNet中的VSS](https://github.com/ziyangwang007/Mamba-UNet)改进resnet18-backbone中的BasicBlock. 20. ultralytics/cfg/models/rt-detr/rtdetr-ContextGuided.yaml 使用[CGNet](https://github.com/wutianyiRosun/CGNet/tree/master)中的Light-weight Context Guided和Light-weight Context Guided DownSample改进rtdetr-r18. 21. ultralytics/cfg/models/rt-detr/rtdetr-fadc.yaml 使用[CVPR2024 Frequency-Adaptive Dilated Convolution](https://github.com/Linwei-Chen/FADC)改进resnet18-basicblock. 22. ultralytics/cfg/models/rt-detr/rtdetr-Star.yaml 使用[StarNet CVPR2024](https://github.com/ma-xu/Rewrite-the-Stars/tree/main)中的StarBlock改进resnet18-basicblock. 23. ultralytics/cfg/models/rt-detr/rtdetr-KAN.yaml 使用[Pytorch-Conv-KAN](https://github.com/IvanDrokin/torch-conv-kan)的KAN卷积算子改进resnet18-basicblock. 目前支持: 1. FastKANConv2DLayer 2. KANConv2DLayer 3. KALNConv2DLayer 4. KACNConv2DLayer 5. KAGNConv2DLayer 24. ultralytics/cfg/models/rt-detr/rtdetr-DEConv.yaml 使用[DEA-Net](https://github.com/cecret3350/DEA-Net)中的detail-enhanced convolution改进resnet18-basicblock. 关于DEConv在运行的时候重参数化后比重参数化前的计算量还要大的问题:是因为重参数化前thop库其计算不准的问题,看重参数化后的参数即可. 25. ultralytics/cfg/models/rt-detr/rtdetr-WTConv.yaml 使用[ECCV2024 Wavelet Convolutions for Large Receptive Fields](https://github.com/BGU-CS-VIL/WTConv)中的WTConv改进BasicBlock. 26. 
ultralytics/cfg/models/rt-detr/rtdetr-WDBB.yaml 使用[YOLO-MIF](https://github.com/wandahangFY/YOLO-MIF)中的WDBB改进BasicBlock. 27. ultralytics/cfg/models/rt-detr/rtdetr-DeepDBB.yaml 使用[YOLO-MIF](https://github.com/wandahangFY/YOLO-MIF)中的DeepDBB改进BasicBlock. 28. ultralytics/cfg/models/rt-detr/rtdetr-GCConvC3.yaml 使用[CVPR2025 Golden Cudgel Network](https://github.com/gyyang23/GCNet)中的GCConv改进RepC3. ### 上下采样算子系列 1. ultralytics/cfg/models/rt-detr/rtdetr-DySample.yaml 使用[ICCV2023 DySample](https://arxiv.org/abs/2308.15085)改进CCFM中的上采样. 2. ultralytics/cfg/models/rt-detr/rtdetr-CARAFE.yaml 使用[ICCV2019 CARAFE](https://arxiv.org/abs/1905.02188)改进CCFM中的上采样. 3. ultralytics/cfg/models/rt-detr/rtdetr-HWD.yaml 使用[Haar wavelet downsampling](https://www.sciencedirect.com/science/article/abs/pii/S0031320323005174)改进CCFM的下采样. 4. ultralytics/cfg/models/rt-detr/rtdetr-ContextGuidedDown.yaml 使用[CGNet](https://github.com/wutianyiRosun/CGNet/tree/master)中的Light-weight Context Guided DownSample改进rtdetr-r18. 5. ultralytics/cfg/models/rt-detr/rtdetr-SRFD.yaml 使用[A Robust Feature Downsampling Module for Remote Sensing Visual Tasks](https://ieeexplore.ieee.org/document/10142024)改进rtdetr的下采样. 6. ultralytics/cfg/models/rt-detr/rtdetr-WaveletPool.yaml 使用[Wavelet Pooling](https://openreview.net/forum?id=rkhlb8lCZ)改进RTDETR的上采样和下采样。 7. ultralytics/cfg/models/rt-detr/rtdetr-LDConv.yaml 使用[LDConv](https://github.com/CV-ZhangXin/LDConv/tree/main)改进下采样. 8. ultralytics/cfg/models/rt-detr/rtdetr-PSConv.yaml 使用[AAAI2025 Pinwheel-shaped Convolution and Scale-based Dynamic Loss for Infrared Small Target Detection](https://github.com/JN-Yang/PConv-SDloss-Data)中的Pinwheel-shaped Convolution改进rtdetr. 9. ultralytics/cfg/models/rt-detr/rtdetr-EUCB.yaml 使用[CVPR2024 EMCAD](https://github.com/SLDGroup/EMCAD)中的EUCB改进rtdetr-r18的上采样. 10. ultralytics/cfg/models/rt-detr/rtdetr-LoGStem.yaml 使用[LEGNet](https://github.com/lwCVer/LEGNet)中的LoGStem改进Stem. 11. 
ultralytics/cfg/models/rt-detr/rtdetr-FourierConv.yaml 使用[MIA2025 Fourier Convolution Block with global receptive field for MRI reconstruction](https://www.sciencedirect.com/science/article/abs/pii/S1361841524002743)中的FourierConv改进Conv. 12. ultralytics/cfg/models/rt-detr/rtdetr-wConv.yaml 使用[weightedConvolution2.0](https://github.com/cammarasana123/weightedConvolution2.0)中的wConv2d改进rtdetr. 13. ultralytics/cfg/models/rt-detr/rtdetr-Converse2D.yaml 使用[ICCV2025 ConverseBNet](https://github.com/cszn/ConverseNet)中的Converse2D改进neck中的上采样. 14. ultralytics/cfg/models/rt-detr/rtdetr-RepStem.yaml 使用[ICCV2023 FastVit](https://arxiv.org/pdf/2303.14189)中的RepStem改进rtdetr下采样. 15. ultralytics/cfg/models/rt-detr/rtdetr-GCConv.yaml 使用[CVPR2025 Golden Cudgel Network](https://github.com/gyyang23/GCNet)中的GCConv改进下采样. 16. ultralytics/cfg/models/rt-detr/rtdetr-FSConv.yaml 使用[TGRS2025 Think Locally and Act Globally](https://ieeexplore.ieee.org/document/11175146)中的FSConv改进下采样. ### RT-DETR-L改进系列 1. ultralytics/cfg/models/rt-detr/rtdetr-l-GhostHGNetV2.yaml 使用GhostConv改进HGNetV2.(详细介绍请看百度云视频-20231109更新说明) 2. ultralytics/cfg/models/rt-detr/rtdetr-l-RepHGNetV2.yaml 使用RepConv改进HGNetV2.(详细介绍请看百度云视频-20231109更新说明) 3. ultralytics/cfg/models/rt-detr/rtdetr-l-attention.yaml 添加注意力模块到HGBlock中.(手把手教程请看百度云视频-手把手添加注意力教程) ### RT-DETR-Mamba 集成Mamba-YOLO,并把head改为RTDETR-Head.(需要编译,请看百度云视频) ultralytics/cfg/models/rt-detr/rtdetr-mamba-T.yaml ultralytics/cfg/models/rt-detr/rtdetr-mamba-B.yaml ultralytics/cfg/models/rt-detr/rtdetr-mamba-L.yaml ### 注意力系列 1. EMA 2. SimAM 3. SpatialGroupEnhance 4. BiLevelRoutingAttention, BiLevelRoutingAttention_nchw 5. TripletAttention 6. CoordAtt 7. CBAM 8. BAMBlock 9. EfficientAttention(CloFormer中的注意力) 10. LSKBlock 11. SEAttention 12. CPCA 13. deformable_LKA 14. EffectiveSEModule 15. LSKA 16. SegNext_Attention 17. DAttention(Vision Transformer with Deformable Attention CVPR2022) 18. FocusedLinearAttention(ICCV2023) 19. MLCA 20. TransNeXt_AggregatedAttention 21. HiLo 22. 
LocalWindowAttention(EfficientViT中的CascadedGroupAttention注意力) 23. Efficient Local Attention 24. CAA(CVPR2024 PKINet中的注意力) 25. CAFM ### IoU系列 1. IoU,GIoU,DIoU,CIoU,EIoU,SIoU(百度云视频-20231125更新说明) 2. MPDIoU[论文链接](https://arxiv.org/pdf/2307.07662.pdf)(百度云视频-20231125更新说明) 3. Inner-IoU,Inner-GIoU,Inner-DIoU,Inner-CIoU,Inner-EIoU,Inner-SIoU[论文链接](https://arxiv.org/abs/2311.02877)(百度云视频-20231125更新说明) 4. Inner-MPDIoU(利用Inner-IoU与MPDIoU进行二次创新)(百度云视频-20231125更新说明) 5. Normalized Gaussian Wasserstein Distance.[论文链接](https://arxiv.org/abs/2110.13389)(百度云视频-20231125更新说明) 6. Shape-IoU,Inner-Shape-IoU[论文链接](https://arxiv.org/abs/2312.17663)(百度云视频-20240106更新说明) 7. SlideLoss,EMASlideLoss[创新思路](https://www.bilibili.com/video/BV1W14y1i79U/?vd_source=c8452371e7ca510979593165c8d7ac27).[Yolo-Face V2](https://github.com/Krasjet-Yu/YOLO-FaceV2/blob/master/utils/loss.py)(百度云视频-20240113更新说明) 8. Wise-IoU(v1,v2,v3)系列(IoU,WIoU,EIoU,GIoU,DIoU,CIoU,SIoU,MPDIoU,ShapeIoU)(百度云视频-20240113更新说明) 9. Inner-Wise-IoU(v1,v2,v3)系列(IoU,WIoU,EIoU,GIoU,DIoU,CIoU,SIoU,MPDIoU,ShapeIoU)(百度云视频-20240113更新说明) 10. Focaler-IoU,Focaler-GIoU,Focaler-DIoU,Focaler-CIoU,Focaler-EIoU,Focaler-SIoU,Focaler-Shape-IoU,Focaler-MPDIoU[论文链接](https://arxiv.org/abs/2401.10525)(百度云视频-20240128更新说明) 11. Focaler-Wise-IoU(v1,v2,v3)(IoU,WIoU,EIoU,GIoU,DIoU,CIoU,SIoU,MPDIoU,ShapeIoU)[论文链接](https://arxiv.org/abs/2401.10525)(百度云视频-20240128更新说明) 12. Powerful-IoU,Powerful-IoUV2,Inner-Powerful-IoU,Inner-Powerful-IoUV2,Focaler-Powerful-IoU,Focaler-Powerful-IoUV2,Wise-Powerful-IoU(v1,v2,v3),Wise-Powerful-IoUV2(v1,v2,v3)[论文链接](https://www.sciencedirect.com/science/article/abs/pii/S0893608023006640) 13. SlideVarifocalLoss,EMASlideVarifocalLoss[创新思路](https://www.bilibili.com/video/BV1W14y1i79U/?vd_source=c8452371e7ca510979593165c8d7ac27).[Yolo-Face V2](https://github.com/Krasjet-Yu/YOLO-FaceV2/blob/master/utils/loss.py)(百度云视频-20240302更新说明) 14. CVPR2025-DEIM-MAL.(百度云视频-20240315更新说明) 15.
Gaussian Combined Distance[论文链接](https://arxiv.org/pdf/2510.27649)(百度云视频-20251122更新说明) ### 以Yolov8为基准模型的改进方案 1. ultralytics/cfg/models/yolo-detr/yolov8-detr.yaml 使用RT-DETR中的TransformerDecoderHead改进yolov8. 2. ultralytics/cfg/models/yolo-detr/yolov8-detr-DWR.yaml 使用RT-DETR中的TransformerDecoderHead和[DWRSeg](https://arxiv.org/abs/2212.01173)中的Dilation-wise Residual(DWR)模块改进yolov8. 3. ultralytics/cfg/models/yolo-detr/yolov8-detr-fasternet.yaml 使用RT-DETR中的TransformerDecoderHead和[FasterNet CVPR2023](https://github.com/JierunChen/FasterNet)改进yolov8.(支持替换其他主干,请看百度云视频-替换主干示例教程) 4. ultralytics/cfg/models/yolo-detr/yolov8-detr-AIFI-LPE.yaml 使用RT-DETR中的TransformerDecoderHead和LearnedPositionalEncoding改进yolov8.(详细介绍请看百度云视频-20231119更新说明) 5. ultralytics/cfg/models/yolo-detr/yolov8-detr-C2f-DCNV2.yaml 使用RT-DETR中的TransformerDecoderHead和可变形卷积DCNV2改进yolov8. 6. ultralytics/cfg/models/yolo-detr/yolov8-detr-C2f-DCNV3.yaml 使用RT-DETR中的TransformerDecoderHead和可变形卷积[DCNV3 CVPR2023](https://github.com/OpenGVLab/InternImage)改进yolov8.(安装教程请看百度云视频-20231119更新说明) 7. ultralytics/cfg/models/yolo-detr/yolov8-detr-C2f-DCNV2-Dynamic.yaml 使用RT-DETR中的TransformerDecoderHead和自研可变形卷积DCNV2-Dynamic改进yolov8.(详细介绍请看百度云视频-MPCA与DCNV2_Dynamic的说明) 8. ultralytics/cfg/models/yolo-detr/yolov8-detr-C2f-Ortho.yaml 使用RT-DETR中的TransformerDecoderHead和[OrthoNets](https://github.com/hady1011/OrthoNets/tree/main)中的正交通道注意力改进yolov8.(详细介绍请看百度云视频-20231119更新说明) 9. ultralytics/cfg/models/yolo-detr/yolov8-detr-attention.yaml 添加注意力到基于RTDETR-Head中的yolov8中.(手把手教程请看百度云视频-手把手添加注意力教程) 10. ultralytics/cfg/models/yolo-detr/yolov8-detr-p2.yaml 添加小目标检测头P2到TransformerDecoderHead中. 11. ultralytics/cfg/models/yolo-detr/yolov8-detr-C2f-DySnake.yaml [DySnakeConv](https://github.com/YaoleiQi/DSCNet)与C2f融合. 12. ultralytics/cfg/models/yolo-detr/yolov8-detr-C2f-Faster.yaml 使用RT-DETR中的TransformerDecoderHead和[FasterNet CVPR2023](https://github.com/JierunChen/FasterNet)中的Faster-Block改进yolov8. 13. 
ultralytics/cfg/models/yolo-detr/yolov8-detr-C2f-Faster-Rep.yaml 使用RT-DETR中的TransformerDecoderHead和[FasterNet CVPR2023](https://github.com/JierunChen/FasterNet)中与[RepVGG CVPR2021](https://github.com/DingXiaoH/RepVGG)中的RepConv二次创新后的Faster-Block-Rep改进yolov8. 14. ultralytics/cfg/models/yolo-detr/yolov8-detr-C2f-Faster-EMA.yaml 使用RT-DETR中的TransformerDecoderHead和[FasterNet CVPR2023](https://github.com/JierunChen/FasterNet)中与[EMA ICASSP2023](https://arxiv.org/abs/2305.13563v1)二次创新后的Faster-Block-EMA改进yolov8. 15. ultralytics/cfg/models/yolo-detr/yolov8-detr-C2f-Faster-Rep-EMA.yaml 使用RT-DETR中的TransformerDecoderHead和[FasterNet CVPR2023](https://github.com/JierunChen/FasterNet)中与[RepVGG CVPR2021](https://github.com/DingXiaoH/RepVGG)中的RepConv、[EMA ICASSP2023](https://arxiv.org/abs/2305.13563v1)二次创新后的Faster-Block改进yolov8. 16. ultralytics/cfg/models/yolo-detr/yolov8-detr-C2f-AKConv.yaml 使用RT-DETR中的TransformerDecoderHead和[AKConv 2023](https://github.com/CV-ZhangXin/AKConv)改进yolov8. 17. ultralytics/cfg/models/yolo-detr/yolov8-detr-C2f-RFAConv.yaml 使用RT-DETR中的TransformerDecoderHead和[RFAConv 2023](https://github.com/Liuchen1997/RFAConv)改进yolov8. 18. ultralytics/cfg/models/yolo-detr/yolov8-detr-C2f-RFCAConv.yaml 使用RT-DETR中的TransformerDecoderHead和[RFCAConv 2023](https://github.com/Liuchen1997/RFAConv)改进yolov8. 19. ultralytics/cfg/models/yolo-detr/yolov8-detr-C2f-RFCBAMConv.yaml 使用RT-DETR中的TransformerDecoderHead和[RFCBAMConv 2023](https://github.com/Liuchen1997/RFAConv)改进yolov8. 20. ultralytics/cfg/models/yolo-detr/yolov8-detr-C2f-Conv3XC.yaml 使用RT-DETR中的TransformerDecoderHead和[Swift Parameter-free Attention Network](https://github.com/hongyuanyu/SPAN/tree/main)中的Conv3XC改进yolov8. 21. ultralytics/cfg/models/yolo-detr/yolov8-detr-C2f-SPAB.yaml 使用RT-DETR中的TransformerDecoderHead和[Swift Parameter-free Attention Network](https://github.com/hongyuanyu/SPAN/tree/main)中的SPAB改进yolov8. 22.
ultralytics/cfg/models/yolo-detr/yolov8-detr-C2f-DRB.yaml 使用RT-DETR中的TransformerDecoderHead和[UniRepLKNet](https://github.com/AILab-CVC/UniRepLKNet/tree/main)中的DilatedReparamBlock改进yolov8. 23. ultralytics/cfg/models/yolo-detr/yolov8-detr-C2f-UniRepLKNetBlock.yaml 使用RT-DETR中的TransformerDecoderHead和[UniRepLKNet](https://github.com/AILab-CVC/UniRepLKNet/tree/main)中的UniRepLKNetBlock改进yolov8. 24. ultralytics/cfg/models/yolo-detr/yolov8-detr-DWR-DRB.yaml 使用[UniRepLKNet](https://github.com/AILab-CVC/UniRepLKNet/tree/main)中的DilatedReparamBlock对[DWRSeg](https://arxiv.org/abs/2212.01173)中的Dilation-wise Residual(DWR)进行二次创新改进yolov8. 25. ultralytics/cfg/models/yolo-detr/yolov8-detr-C2f-DBB.yaml 使用RT-DETR中的TransformerDecoderHead和[DiverseBranchBlock CVPR2021](https://github.com/DingXiaoH/DiverseBranchBlock)改进yolov8. 26. ultralytics/cfg/models/yolo-detr/yolov8-detr-CSP-EDLAN.yaml 使用RT-DETR中的TransformerDecoderHead和[DualConv](https://github.com/ChipsGuardian/DualConv)打造CSP Efficient Dual Layer Aggregation Networks改进yolov8. 27. ultralytics/cfg/models/yolo-detr/yolov8-detr-ASF.yaml 使用RT-DETR中的TransformerDecoderHead和[ASF-YOLO](https://github.com/mkang315/ASF-YOLO)中的Attentional Scale Sequence Fusion改进yolov8. 28. ultralytics/cfg/models/yolo-detr/yolov8-detr-ASF-P2.yaml 在ultralytics/cfg/models/yolo-detr/yolov8-detr-ASF.yaml的基础上进行二次创新,引入P2检测层并对网络结构进行优化. 29. ultralytics/cfg/models/yolo-detr/yolov8-detr-slimneck.yaml 使用RT-DETR中的TransformerDecoderHead和[SlimNeck](https://github.com/AlanLi1997/slim-neck-by-gsconv)中VoVGSCSP\VoVGSCSPC和GSConv改进yolov8的neck. 30. ultralytics/cfg/models/yolo-detr/yolov8-detr-slimneck-asf.yaml 在ultralytics/cfg/models/yolo-detr/yolov8-detr-slimneck.yaml使用[ASF-YOLO](https://github.com/mkang315/ASF-YOLO)中的Attentional Scale Sequence Fusion进行二次创新. 31. ultralytics/cfg/models/yolo-detr/yolov8-detr-C2f-AggregatedAtt.yaml 使用RT-DETR中的TransformerDecoderHead和[TransNeXt](https://github.com/DaiShiResearch/TransNeXt)中的聚合感知注意力改进C2f.(百度云视频-20240106更新说明) 32. 
ultralytics/cfg/models/yolo-detr/yolov8-detr-SDI.yaml 使用RT-DETR中的TransformerDecoderHead和[U-NetV2](https://github.com/yaoppeng/U-Net_v2)中的 Semantics and Detail Infusion Module对yolov8中的feature fusion进行改进. 33. ultralytics/cfg/models/yolo-detr/yolov8-detr-goldyolo.yaml 利用RT-DETR中的TransformerDecoderHead和华为2023最新GOLD-YOLO中的Gather-and-Distribute进行改进特征融合模块. 34. ultralytics/cfg/models/yolo-detr/yolov8-detr-goldyolo-asf.yaml 利用RT-DETR中的TransformerDecoderHead和华为2023最新GOLD-YOLO中的Gather-and-Distribute和[ASF-YOLO](https://github.com/mkang315/ASF-YOLO)中的Attentional Scale Sequence Fusion进行改进特征融合模块. 35. ultralytics/cfg/models/yolo-detr/yolov8-detr-C2f-DCNV4.yaml 使用[DCNV4](https://github.com/OpenGVLab/DCNv4)改进C2f. 36. ultralytics/cfg/models/yolo-detr/yolov8-detr-HSFPN.yaml 利用RT-DETR中的TransformerDecoderHead和使用[MFDS-DETR](https://github.com/JustlfC03/MFDS-DETR)中的HS-FPN改进YOLOV8中的PAN. 37. ultralytics/cfg/models/yolo-detr/yolov8-detr-HSPAN.yaml 利用RT-DETR中的TransformerDecoderHead和对[MFDS-DETR](https://github.com/JustlfC03/MFDS-DETR)中的HS-FPN进行二次创新后得到HSPAN改进YOLOV8中的PAN. 38. ultralytics/cfg/models/yolo-detr/yolov8-detr-Dysample.yaml 使用[ICCV2023 DySample](https://arxiv.org/abs/2308.15085)改进yolov8-detr neck中的上采样. 39. ultralytics/cfg/models/yolo-detr/yolov8-detr-CARAFE.yaml 使用[ICCV2019 CARAFE](https://arxiv.org/abs/1905.02188)改进yolov8-detr neck中的上采样. 40. ultralytics/cfg/models/yolo-detr/yolov8-detr-HWD.yaml 使用[Haar wavelet downsampling](https://www.sciencedirect.com/science/article/abs/pii/S0031320323005174)改进yolov8-detr neck的下采样. 41. ultralytics/cfg/models/yolo-detr/yolov8-detr-ASF-Dynamic.yaml 使用[ICCV2023 DySample](https://arxiv.org/abs/2308.15085)改进[ASF-YOLO](https://github.com/mkang315/ASF-YOLO)中的Attentional Scale Sequence Fusion的上采样模块得到Dynamic Sample Attentional Scale Sequence Fusion改进yolov8-detr中的neck. 42. ultralytics/cfg/models/yolo-detr/yolov8-detr-C2f-SWC.yaml 使用[shift-wise conv](https://arxiv.org/abs/2401.12736)改进yolov8-detr中的C2f. 43.
ultralytics/cfg/models/yolo-detr/yolov8-detr-iRMB-DRB.yaml 使用[UniRepLKNet](https://github.com/AILab-CVC/UniRepLKNet/tree/main)中的DilatedReparamBlock对[EMO ICCV2023](https://github.com/zhangzjn/EMO)中的iRMB进行二次创新来改进yolov8-detr中的C2f. 44. ultralytics/cfg/models/yolo-detr/yolov8-detr-iRMB-SWC.yaml 使用[shift-wise conv](https://arxiv.org/abs/2401.12736)对[EMO ICCV2023](https://github.com/zhangzjn/EMO)中的iRMB进行二次创新来改进yolov8-detr中的C2f. 45. ultralytics/cfg/models/yolo-detr/yolov8-detr-C2f-VSS.yaml 使用最新的Mamba架构[Mamba-UNet中的VSS](https://github.com/ziyangwang007/Mamba-UNet)对C2f中的BottleNeck进行改进,使其能更有效地捕获图像中的复杂细节和更广泛的语义上下文. 46. ultralytics/cfg/models/yolo-detr/yolov8-detr-C2f-LVMB.yaml 使用最新的Mamba架构[Mamba-UNet中的VSS](https://github.com/ziyangwang007/Mamba-UNet)与Cross Stage Partial进行结合,使其能更有效地捕获图像中的复杂细节和更广泛的语义上下文. 47. ultralytics/cfg/models/yolo-detr/yolov8-detr-RepNCSPELAN.yaml 使用[YOLOV9](https://github.com/WongKinYiu/yolov9)中的RepNCSPELAN进行改进yolov8-detr. 48. ultralytics/cfg/models/yolo-detr/yolov8-detr-bifpn.yaml 添加BIFPN到yolov8中. 其中BIFPN中有三个可选参数: 1. Fusion 其中BIFPN中的Fusion模块支持五种: weight, adaptive, concat, bifpn(default), SDI 其中weight, adaptive, concat出自[paper链接-Figure 3](https://openreview.net/pdf?id=q2ZaVU6bEsT), SDI出自[U-NetV2](https://github.com/yaoppeng/U-Net_v2) 2. node_mode block模块选择,具体可看对应百度云视频-20240302更新公告. 3. head_channel BIFPN中的通道数,默认设置为256. 49. ultralytics/cfg/models/yolo-detr/yolov8-detr-C2f-ContextGuided.yaml 使用[CGNet](https://github.com/wutianyiRosun/CGNet/tree/master)中的Light-weight Context Guided和Light-weight Context Guided DownSample改进yolov8-detr. 50. ultralytics/cfg/models/yolo-detr/yolov8-detr-PACAPN.yaml 自研结构, Parallel Atrous Convolution Attention Pyramid Network, PAC-APN 51. ultralytics/cfg/models/yolo-detr/yolov8-detr-DGCST.yaml 使用[Lightweight Object Detection](https://arxiv.org/abs/2403.01736)中的Dynamic Group Convolution Shuffle Transformer改进yolov8-detr. 52. 
ultralytics/cfg/models/yolo-detr/yolov8-detr-C2f-RetBlock.yaml 使用[CVPR2024 RMT](https://arxiv.org/abs/2309.11523)中的RetBlock改进C2f. 53. ultralytics/cfg/models/yolo-detr/yolov8-detr-C2f-PKI.yaml 使用[CVPR2024 PKINet](https://github.com/PKINet/PKINet)中的PKIModule和CAA模块改进C2f. 54. ultralytics/cfg/models/yolo-detr/yolov8-detr-C2f-fadc.yaml 使用[CVPR2024 Frequency-Adaptive Dilated Convolution](https://github.com/Linwei-Chen/FADC)改进C2f. 55. ultralytics/cfg/models/yolo-detr/yolov8-detr-FDPN.yaml 自研特征聚焦扩散金字塔网络(Focusing Diffusion Pyramid Network) 1. 通过定制的特征聚焦模块与特征扩散机制,能让每个尺度的特征都具有详细的上下文信息,更有利于后续目标的检测与分类。 2. 定制的特征聚焦模块可以接受三个尺度的输入,其内部包含一个Inception-Style的模块,其利用一组并行深度卷积来捕获丰富的跨多个尺度的信息。 3. 通过扩散机制使具有丰富的上下文信息的特征进行扩散到各个检测尺度. 56. ultralytics/cfg/models/yolo-detr/yolov8-detr-FDPN-DASI.yaml 使用[HCFNet](https://github.com/zhengshuchen/HCFNet)中的Dimension-Aware Selective Integration Module对自研的Focusing Diffusion Pyramid Network再次创新. 57. ultralytics/cfg/models/yolo-detr/yolov8-detr-C2f-PPA.yaml 使用[HCFNet](https://github.com/zhengshuchen/HCFNet)中的Parallelized Patch-Aware Attention Module改进C2f. 58. ultralytics/cfg/models/yolo-detr/yolov8-detr-SRFD.yaml 使用[A Robust Feature Downsampling Module for Remote Sensing Visual Tasks](https://ieeexplore.ieee.org/document/10142024)改进yolov8的下采样. 59. ultralytics/cfg/models/yolo-detr/yolov8-detr-CSFCN.yaml 使用[Context and Spatial Feature Calibration for Real-Time Semantic Segmentation](https://github.com/kaigelee/CSFCN/tree/main)中的Context and Spatial Feature Calibration模块改进yolov8. 60. ultralytics/cfg/models/yolo-detr/yolov8-detr-CGAFusion.yaml 使用[DEA-Net](https://github.com/cecret3350/DEA-Net)中的content-guided attention fusion改进yolov8-neck. 61. ultralytics/cfg/models/yolo-detr/yolov8-detr-CAFMFusion.yaml 利用具有[HCANet](https://github.com/summitgao/HCANet)中的CAFM,其具有获取全局和局部信息的注意力机制进行二次改进content-guided attention fusion. 62. ultralytics/cfg/models/yolo-detr/yolov8-detr-RGCSPELAN.yaml 自研RepGhostCSPELAN. 1. 
参考GhostNet中的思想(主流CNN计算的中间特征映射存在广泛的冗余),采用廉价的操作生成一部分冗余特征图,以此来降低计算量和参数量。 2. 舍弃yolov5与yolov8中常用的BottleNeck,为了弥补舍弃残差块所带来的性能损失,在梯度流通分支上使用RepConv,以此来增强特征提取和梯度流通的能力,并且RepConv可以在推理的时候进行融合,一举两得。 3. 可以通过缩放因子控制RGCSPELAN的大小,使其可以兼顾小模型和大模型。 63. ultralytics/cfg/models/yolo-detr/yolov8-detr-C2f-Faster-CGLU.yaml 使用[TransNeXt CVPR2024](https://github.com/DaiShiResearch/TransNeXt)中的Convolutional GLU对CVPR2023中的FasterNet进行二次创新. 64. ultralytics/cfg/models/yolo-detr/yolov8-detr-SDFM.yaml 使用[PSFusion](https://github.com/Linfeng-Tang/PSFusion)中的superficial detail fusion module改进yolov8-neck. 65. ultralytics/cfg/models/yolo-detr/yolov8-detr-PSFM.yaml 使用[PSFusion](https://github.com/Linfeng-Tang/PSFusion)中的profound semantic fusion module改进yolov8-neck. 66. ultralytics/cfg/models/yolo-detr/yolov8-detr-C2f-Star.yaml 使用[StarNet CVPR2024](https://github.com/ma-xu/Rewrite-the-Stars/tree/main)中的StarBlock改进C2f. 67. ultralytics/cfg/models/yolo-detr/yolov8-detr-C2f-Star-CAA.yaml 使用[StarNet CVPR2024](https://github.com/ma-xu/Rewrite-the-Stars/tree/main)中的StarBlock和[CVPR2024 PKINet](https://github.com/PKINet/PKINet)中的CAA改进C2f. 68. ultralytics/cfg/models/yolo-detr/yolov8-detr-C2f-KAN.yaml 使用[Pytorch-Conv-KAN](https://github.com/IvanDrokin/torch-conv-kan)的KAN卷积算子改进C2f. 目前支持: 1. FastKANConv2DLayer 2. KANConv2DLayer 3. KALNConv2DLayer 4. KACNConv2DLayer 5. KAGNConv2DLayer 69. ultralytics/cfg/models/yolo-detr/yolov8-detr-ContextGuideFPN.yaml Context Guide Fusion Module(CGFM)是一个创新的特征融合模块,旨在改进YOLOv8中的特征金字塔网络(FPN)。该模块的设计考虑了多尺度特征融合过程中上下文信息的引导和自适应调整。 1. 上下文信息的有效融合:通过SE注意力机制,模块能够在特征融合过程中捕捉并利用重要的上下文信息,从而增强特征表示的有效性,并有效引导模型学习检测目标的信息,从而提高模型的检测精度。 2. 特征增强:通过权重化的特征重组操作,模块能够增强重要特征,同时抑制不重要特征,提升特征图的判别能力。 3. 简单高效:模块结构相对简单,不会引入过多的计算开销,适合在实时目标检测任务中应用。 这期视频讲解在B站:https://www.bilibili.com/video/BV1Vx4y1n7hZ/ 70. ultralytics/cfg/models/yolo-detr/yolov8-detr-C2f-DEConv.yaml 使用[DEA-Net](https://github.com/cecret3350/DEA-Net)中的detail-enhanced convolution改进C2f. 关于DEConv在运行的时候重参数化后比重参数化前的计算量还要大的问题:是因为重参数化前thop库其计算不准的问题,看重参数化后的参数即可. 71. 
ultralytics/cfg/models/yolo-detr/yolov8-detr-C2f-SMPCGLU.yaml Self-moving Point Convolutional GLU模型改进C2f. SMP来源于[CVPR2023-SMPConv](https://github.com/sangnekim/SMPConv),Convolutional GLU来源于[TransNeXt CVPR2024](https://github.com/DaiShiResearch/TransNeXt). 1. 普通的卷积在面对数据中的多样性和复杂性时,可能无法捕捉到有效的特征,因此我们采用了SMPConv,其具备最新的自适应点移动机制,从而更好地捕捉局部特征,提高特征提取的灵活性和准确性。 2. 在SMPConv后添加CGLU,Convolutional GLU 结合了卷积和门控机制,能够选择性地通过信息通道,提高了特征提取的有效性和灵活性。 ### 以Yolov5为基准模型的改进方案 1. ultralytics/cfg/models/yolo-detr/yolov5-detr.yaml 使用RT-DETR中的TransformerDecoderHead改进yolov5. 2. ultralytics/cfg/models/yolo-detr/yolov5-detr-DWR.yaml 使用RT-DETR中的TransformerDecoderHead和[DWRSeg](https://arxiv.org/abs/2212.01173)中的Dilation-wise Residual(DWR)模块改进yolov5. 3. ultralytics/cfg/models/yolo-detr/yolov5-detr-fasternet.yaml 使用RT-DETR中的TransformerDecoderHead和[FasterNet CVPR2023](https://github.com/JierunChen/FasterNet)改进yolov5.(支持替换其他主干,请看百度云视频-替换主干示例教程) 4. ultralytics/cfg/models/yolo-detr/yolov5-detr-AIFI-LPE.yaml 使用RT-DETR中的TransformerDecoderHead和LearnedPositionalEncoding改进yolov5.(详细介绍请看百度云视频-20231119更新说明) 5. ultralytics/cfg/models/yolo-detr/yolov5-detr-C3-DCNV2.yaml 使用RT-DETR中的TransformerDecoderHead和可变形卷积DCNV2改进yolov5. 6. ultralytics/cfg/models/yolo-detr/yolov5-detr-C3-DCNV3.yaml 使用RT-DETR中的TransformerDecoderHead和可变形卷积[DCNV3 CVPR2023](https://github.com/OpenGVLab/InternImage)改进yolov5.(安装教程请看百度云视频-20231119更新说明) 7. ultralytics/cfg/models/yolo-detr/yolov5-detr-C3-DCNV2-Dynamic.yaml 使用RT-DETR中的TransformerDecoderHead和自研可变形卷积DCNV2-Dynamic改进yolov5.(详细介绍请看百度云视频-MPCA与DCNV2_Dynamic的说明) 8. ultralytics/cfg/models/yolo-detr/yolov5-detr-C3-Ortho.yaml(详细介绍请看百度云视频-20231119更新说明) 使用RT-DETR中的TransformerDecoderHead和[OrthoNets](https://github.com/hady1011/OrthoNets/tree/main)中的正交通道注意力改进yolov5. 9. ultralytics/cfg/models/yolo-detr/yolov5-detr-attention.yaml 添加注意力到基于RTDETR-Head中的yolov5中.(手把手教程请看百度云视频-手把手添加注意力教程) 10. ultralytics/cfg/models/yolo-detr/yolov5-detr-p2.yaml 添加小目标检测头P2到TransformerDecoderHead中. 11. 
ultralytics/cfg/models/yolo-detr/yolov5-detr-C3-DySnake.yaml [DySnakeConv](https://github.com/YaoleiQi/DSCNet)与C3融合. 12. ultralytics/cfg/models/yolo-detr/yolov5-detr-C3-Faster.yaml 使用RT-DETR中的TransformerDecoderHead和[FasterNet CVPR2023](https://github.com/JierunChen/FasterNet)中的Faster-Block改进yolov5. 13. ultralytics/cfg/models/yolo-detr/yolov5-detr-C3-Faster-Rep.yaml 使用RT-DETR中的TransformerDecoderHead和[FasterNet CVPR2023](https://github.com/JierunChen/FasterNet)中与[RepVGG CVPR2021](https://github.com/DingXiaoH/RepVGG)中的RepConv二次创新后的Faster-Block-Rep改进yolov5. 14. ultralytics/cfg/models/yolo-detr/yolov5-detr-C3-Faster-EMA.yaml 使用RT-DETR中的TransformerDecoderHead和[FasterNet CVPR2023](https://github.com/JierunChen/FasterNet)中与[EMA ICASSP2023](https://arxiv.org/abs/2305.13563v1)二次创新后的Faster-Block-EMA改进yolov5. 15. ultralytics/cfg/models/yolo-detr/yolov5-detr-C3-Faster-Rep-EMA.yaml 使用RT-DETR中的TransformerDecoderHead和[FasterNet CVPR2023](https://github.com/JierunChen/FasterNet)中与[RepVGG CVPR2021](https://github.com/DingXiaoH/RepVGG)中的RepConv、[EMA ICASSP2023](https://arxiv.org/abs/2305.13563v1)二次创新后的Faster-Block改进yolov5. 16. ultralytics/cfg/models/yolo-detr/yolov5-detr-C3-AKConv.yaml 使用RT-DETR中的TransformerDecoderHead和[AKConv 2023](https://github.com/CV-ZhangXin/AKConv)改进yolov5. 17. ultralytics/cfg/models/yolo-detr/yolov5-detr-C3-RFAConv.yaml 使用RT-DETR中的TransformerDecoderHead和[RFAConv 2023](https://github.com/Liuchen1997/RFAConv)改进yolov5. 18. ultralytics/cfg/models/yolo-detr/yolov5-detr-C3-RFCAConv.yaml 使用RT-DETR中的TransformerDecoderHead和[RFCAConv 2023](https://github.com/Liuchen1997/RFAConv)改进yolov5. 19. ultralytics/cfg/models/yolo-detr/yolov5-detr-C3-RFCBAMConv.yaml 使用RT-DETR中的TransformerDecoderHead和[RFCBAMConv 2023](https://github.com/Liuchen1997/RFAConv)改进yolov5. 20. ultralytics/cfg/models/yolo-detr/yolov5-detr-C3-Conv3XC.yaml 使用RT-DETR中的TransformerDecoderHead和[Swift Parameter-free Attention Network](https://github.com/hongyuanyu/SPAN/tree/main)中的Conv3XC改进yolov5. 21. 
ultralytics/cfg/models/yolo-detr/yolov5-detr-C3-SPAB.yaml 使用RT-DETR中的TransformerDecoderHead和[Swift Parameter-free Attention Network](https://github.com/hongyuanyu/SPAN/tree/main)中的SPAB改进yolov5. 22. ultralytics/cfg/models/yolo-detr/yolov5-detr-C3-DRB.yaml 使用RT-DETR中的TransformerDecoderHead和[UniRepLKNet](https://github.com/AILab-CVC/UniRepLKNet/tree/main)中的DilatedReparamBlock改进yolov5. 23. ultralytics/cfg/models/yolo-detr/yolov5-detr-C3-UniRepLKNetBlock.yaml 使用RT-DETR中的TransformerDecoderHead和[UniRepLKNet](https://github.com/AILab-CVC/UniRepLKNet/tree/main)中的UniRepLKNetBlock改进yolov5. 24. ultralytics/cfg/models/yolo-detr/yolov5-detr-DWR-DRB.yaml 使用[UniRepLKNet](https://github.com/AILab-CVC/UniRepLKNet/tree/main)中的DilatedReparamBlock对[DWRSeg](https://arxiv.org/abs/2212.01173)中的Dilation-wise Residual(DWR)进行二次创新改进yolov5. 25. ultralytics/cfg/models/yolo-detr/yolov5-detr-C3-DBB.yaml 使用RT-DETR中的TransformerDecoderHead和[DiverseBranchBlock CVPR2021](https://github.com/DingXiaoH/DiverseBranchBlock)改进yolov5. 26. ultralytics/cfg/models/yolo-detr/yolov5-detr-CSP-EDLAN.yaml 使用RT-DETR中的TransformerDecoderHead和[DualConv](https://github.com/ChipsGuardian/DualConv)打造CSP Efficient Dual Layer Aggregation Networks改进yolov5. 27. ultralytics/cfg/models/yolo-detr/yolov5-detr-ASF.yaml 使用RT-DETR中的TransformerDecoderHead和[ASF-YOLO](https://github.com/mkang315/ASF-YOLO)中的Attentional Scale Sequence Fusion改进yolov5. 28. ultralytics/cfg/models/yolo-detr/yolov5-detr-ASF-P2.yaml 在ultralytics/cfg/models/yolo-detr/yolov5-detr-ASF.yaml的基础上进行二次创新,引入P2检测层并对网络结构进行优化. 29. ultralytics/cfg/models/yolo-detr/yolov5-detr-slimneck.yaml 使用RT-DETR中的TransformerDecoderHead和[SlimNeck](https://github.com/AlanLi1997/slim-neck-by-gsconv)中VoVGSCSP\VoVGSCSPC和GSConv改进yolov5的neck. 30. ultralytics/cfg/models/yolo-detr/yolov5-detr-slimneck-asf.yaml 在ultralytics/cfg/models/yolo-detr/yolov5-detr-slimneck.yaml使用[ASF-YOLO](https://github.com/mkang315/ASF-YOLO)中的Attentional Scale Sequence Fusion进行二次创新. 31. 
ultralytics/cfg/models/yolo-detr/yolov5-detr-C3-AggregatedAtt.yaml 使用RT-DETR中的TransformerDecoderHead和[TransNeXt](https://github.com/DaiShiResearch/TransNeXt)中的聚合感知注意力改进C3.(百度云视频-20240106更新说明) 32. ultralytics/cfg/models/yolo-detr/yolov5-detr-SDI.yaml 使用RT-DETR中的TransformerDecoderHead和[U-NetV2](https://github.com/yaoppeng/U-Net_v2)中的 Semantics and Detail Infusion Module对yolov5中的feature fusion进行改进. 33. ultralytics/cfg/models/yolo-detr/yolov5-detr-goldyolo.yaml 利用RT-DETR中的TransformerDecoderHead和华为2023最新GOLD-YOLO中的Gatherand-Distribute进行改进特征融合模块. 34. ultralytics/cfg/models/yolo-detr/yolov5-detr-goldyolo-asf.yaml 利用RT-DETR中的TransformerDecoderHead和华为2023最新GOLD-YOLO中的Gatherand-Distribute和[ASF-YOLO](https://github.com/mkang315/ASF-YOLO)中的Attentional Scale Sequence Fusion进行改进特征融合模块. 35. ultralytics/cfg/models/yolo-detr/yolov5-detr-C3-DCNV4.yaml 使用[DCNV4](https://github.com/OpenGVLab/DCNv4)改进C3. 36. ultralytics/cfg/models/yolo-detr/yolov5-detr-HSFPN.yaml 利用RT-DETR中的TransformerDecoderHead和使用[MFDS-DETR](https://github.com/JustlfC03/MFDS-DETR)中的HS-FPN改进YOLOV5中的PAN. 37. ultralytics/cfg/models/yolo-detr/yolov5-detr-HSPAN.yaml 利用RT-DETR中的TransformerDecoderHead和对[MFDS-DETR](https://github.com/JustlfC03/MFDS-DETR)中的HS-FPN进行二次创新后得到HSPAN改进YOLOV5中的PAN. 38. ultralytics/cfg/models/yolo-detr/yolov8-detr-Dysample.yaml 使用[ICCV2023 DySample](https://arxiv.org/abs/2308.15085)改进yolov8-detr neck中的上采样. 39. ultralytics/cfg/models/yolo-detr/yolov8-detr-CARAFE.yaml 使用[ICCV2019 CARAFE](https://arxiv.org/abs/1905.02188)改进yolov8-detr neck中的上采样. 40. ultralytics/cfg/models/yolo-detr/yolov8-detr-HWD.yaml 使用[Haar wavelet downsampling](https://www.sciencedirect.com/science/article/abs/pii/S0031320323005174)改进yolov8-detr neck的下采样. 41. ultralytics/cfg/models/yolo-detr/yolov5-detr-ASF-Dynamic.yaml 使用[ICCV2023 DySample](https://arxiv.org/abs/2308.15085)改进[ASF-YOLO](https://github.com/mkang315/ASF-YOLO)中的Attentional Scale Sequence Fusion的上采样模块得到Dynamic Sample Attentional Scale Sequence Fusion改进yolov5-detr中的neck. 
42. ultralytics/cfg/models/yolo-detr/yolov5-detr-SWC.yaml 使用[shift-wise conv](https://arxiv.org/abs/2401.12736)改进yolov5-detr中的C3. 43. ultralytics/cfg/models/yolo-detr/yolov5-detr-iRMB-DRB.yaml 使用[UniRepLKNet](https://github.com/AILab-CVC/UniRepLKNet/tree/main)中的DilatedReparamBlock对[EMO ICCV2023](https://github.com/zhangzjn/EMO)中的iRMB进行二次创新来改进yolov5-detr中的C2f. 44. ultralytics/cfg/models/yolo-detr/yolov5-detr-iRMB-SWC.yaml 使用[shift-wise conv](https://arxiv.org/abs/2401.12736)对[EMO ICCV2023](https://github.com/zhangzjn/EMO)中的iRMB进行二次创新来改进yolov5-detr中的C2f. 45. ultralytics/cfg/models/yolo-detr/yolov5-detr-C3-VSS.yaml 使用最新的Mamba架构[Mamba-UNet中的VSS](https://github.com/ziyangwang007/Mamba-UNet)对C3中的BottleNeck进行改进,使其能更有效地捕获图像中的复杂细节和更广泛的语义上下文. 46. ultralytics/cfg/models/yolo-detr/yolov5-detr-C3-LVMB.yaml 使用最新的Mamba架构[Mamba-UNet中的VSS](https://github.com/ziyangwang007/Mamba-UNet)与Cross Stage Partial进行结合,使其能更有效地捕获图像中的复杂细节和更广泛的语义上下文. 47. ultralytics/cfg/models/yolo-detr/yolov5-detr-RepNCSPELAN.yaml 使用[YOLOV9](https://github.com/WongKinYiu/yolov9)中的RepNCSPELAN进行改进yolov5-detr. 48. ultralytics/cfg/models/yolo-detr/yolov5-detr-bifpn.yaml 添加BIFPN到yolov8中. 其中BIFPN中有三个可选参数: 1. Fusion 其中BIFPN中的Fusion模块支持五种: weight, adaptive, concat, bifpn(default), SDI 其中weight, adaptive, concat出自[paper链接-Figure 3](https://openreview.net/pdf?id=q2ZaVU6bEsT), SDI出自[U-NetV2](https://github.com/yaoppeng/U-Net_v2) 2. node_mode block模块选择,具体可看对应百度云视频-20240302更新公告. 3. head_channel BIFPN中的通道数,默认设置为256. 49. ultralytics/cfg/models/yolo-detr/yolov5-detr-C2f-ContextGuided.yaml 使用[CGNet](https://github.com/wutianyiRosun/CGNet/tree/master)中的Light-weight Context Guided和Light-weight Context Guided DownSample改进yolov5-detr. 50. ultralytics/cfg/models/yolo-detr/yolov5-detr-PACAPN.yaml 自研结构, Parallel Atrous Convolution Attention Pyramid Network, PAC-APN 51. 
ultralytics/cfg/models/yolo-detr/yolov5-detr-DGCST.yaml 使用[Lightweight Object Detection](https://arxiv.org/abs/2403.01736)中的Dynamic Group Convolution Shuffle Transformer改进yolov5-detr. 52. ultralytics/cfg/models/yolo-detr/yolov5-detr-C3-RetBlock.yaml 使用[CVPR2024 RMT](https://arxiv.org/abs/2309.11523)中的RetBlock改进C3. 53. ultralytics/cfg/models/yolo-detr/yolov5-detr-C3-PKI.yaml 使用[CVPR2024 PKINet](https://github.com/PKINet/PKINet)中的PKIModule和CAA模块改进C3. 54. ultralytics/cfg/models/yolo-detr/yolov5-detr-C3-fadc.yaml 使用[CVPR2024 Frequency-Adaptive Dilated Convolution](https://github.com/Linwei-Chen/FADC)改进C3. 55. ultralytics/cfg/models/yolo-detr/yolov5-detr-FDPN.yaml 自研特征聚焦扩散金字塔网络(Focusing Diffusion Pyramid Network) 1. 通过定制的特征聚焦模块与特征扩散机制,能让每个尺度的特征都具有详细的上下文信息,更有利于后续目标的检测与分类。 2. 定制的特征聚焦模块可以接受三个尺度的输入,其内部包含一个Inception-Style的模块,其利用一组并行深度卷积来捕获丰富的跨多个尺度的信息。 3. 通过扩散机制使具有丰富的上下文信息的特征进行扩散到各个检测尺度. 56. ultralytics/cfg/models/yolo-detr/yolov5-detr-FDPN-DASI.yaml 使用[HCFNet](https://github.com/zhengshuchen/HCFNet)中的Dimension-Aware Selective Integration Module对自研的Focusing Diffusion Pyramid Network再次创新. 57. ultralytics/cfg/models/yolo-detr/yolov5-detr-C3-PPA.yaml 使用[HCFNet](https://github.com/zhengshuchen/HCFNet)中的Parallelized Patch-Aware Attention Module改进C3. 58. ultralytics/cfg/models/yolo-detr/yolov5-detr-SRFD.yaml 使用[A Robust Feature Downsampling Module for Remote Sensing Visual Tasks](https://ieeexplore.ieee.org/document/10142024)改进yolov5的下采样. 59. ultralytics/cfg/models/yolo-detr/yolov5-detr-CSFCN.yaml 使用[Context and Spatial Feature Calibration for Real-Time Semantic Segmentation](https://github.com/kaigelee/CSFCN/tree/main)中的Context and Spatial Feature Calibration模块改进yolov5. 60. ultralytics/cfg/models/yolo-detr/yolov5-detr-CGAFusion.yaml 使用[DEA-Net](https://github.com/cecret3350/DEA-Net)中的content-guided attention fusion改进yolov5-neck. 61. 
ultralytics/cfg/models/yolo-detr/yolov5-detr-CAFMFusion.yaml 利用具有[HCANet](https://github.com/summitgao/HCANet)中的CAFM,其具有获取全局和局部信息的注意力机制进行二次改进content-guided attention fusion. 62. ultralytics/cfg/models/yolo-detr/yolov5-detr-RGCSPELAN.yaml 自研RepGhostCSPELAN. 1. 参考GhostNet中的思想(主流CNN计算的中间特征映射存在广泛的冗余),采用廉价的操作生成一部分冗余特征图,以此来降低计算量和参数量。 2. 舍弃yolov5与yolov8中常用的BottleNeck,为了弥补舍弃残差块所带来的性能损失,在梯度流通分支上使用RepConv,以此来增强特征提取和梯度流通的能力,并且RepConv可以在推理的时候进行融合,一举两得。 3. 可以通过缩放因子控制RGCSPELAN的大小,使其可以兼顾小模型和大模型。 63. ultralytics/cfg/models/yolo-detr/yolov5-detr-C3-Faster-CGLU.yaml 使用[TransNeXt CVPR2024](https://github.com/DaiShiResearch/TransNeXt)中的Convolutional GLU对CVPR2023中的FasterNet进行二次创新. 64. ultralytics/cfg/models/yolo-detr/yolov5-detr-SDFM.yaml 使用[PSFusion](https://github.com/Linfeng-Tang/PSFusion)中的superficial detail fusion module改进yolov5-neck. 65. ultralytics/cfg/models/yolo-detr/yolov5-detr-PSFM.yaml 使用[PSFusion](https://github.com/Linfeng-Tang/PSFusion)中的profound semantic fusion module改进yolov5-neck. 66. ultralytics/cfg/models/yolo-detr/yolov5-detr-C3-Star.yaml 使用[StarNet CVPR2024](https://github.com/ma-xu/Rewrite-the-Stars/tree/main)中的StarBlock改进C3. 67. ultralytics/cfg/models/yolo-detr/yolov5-detr-C3-Star-CAA.yaml 使用[StarNet CVPR2024](https://github.com/ma-xu/Rewrite-the-Stars/tree/main)中的StarBlock和[CVPR2024 PKINet](https://github.com/PKINet/PKINet)中的CAA改进C3. 68. ultralytics/cfg/models/yolo-detr/yolov5-detr-C3-KAN.yaml 使用[Pytorch-Conv-KAN](https://github.com/IvanDrokin/torch-conv-kan)的KAN卷积算子改进C3. 目前支持: 1. FastKANConv2DLayer 2. KANConv2DLayer 3. KALNConv2DLayer 4. KACNConv2DLayer 5. KAGNConv2DLayer 69. ultralytics/cfg/models/yolo-detr/yolov5-detr-ContextGuideFPN.yaml Context Guide Fusion Module(CGFM)是一个创新的特征融合模块,旨在改进YOLOv8中的特征金字塔网络(FPN)。该模块的设计考虑了多尺度特征融合过程中上下文信息的引导和自适应调整。 1. 上下文信息的有效融合:通过SE注意力机制,模块能够在特征融合过程中捕捉并利用重要的上下文信息,从而增强特征表示的有效性,并有效引导模型学习检测目标的信息,从而提高模型的检测精度。 2. 特征增强:通过权重化的特征重组操作,模块能够增强重要特征,同时抑制不重要特征,提升特征图的判别能力。 3. 
简单高效:模块结构相对简单,不会引入过多的计算开销,适合在实时目标检测任务中应用。 这期视频讲解在B站:https://www.bilibili.com/video/BV1Vx4y1n7hZ/ 70. ultralytics/cfg/models/yolo-detr/yolov5-detr-C3-DEConv.yaml 使用[DEA-Net](https://github.com/cecret3350/DEA-Net)中的detail-enhanced convolution改进C3. 关于DEConv在运行的时候重参数化后比重参数化前的计算量还要大的问题:是因为重参数化前thop库其计算不准的问题,看重参数化后的参数即可. 71. ultralytics/cfg/models/yolo-detr/yolov5-detr-C3-SMPCGLU.yaml Self-moving Point Convolutional GLU模型改进C3. SMP来源于[CVPR2023-SMPConv](https://github.com/sangnekim/SMPConv),Convolutional GLU来源于[TransNeXt CVPR2024](https://github.com/DaiShiResearch/TransNeXt). 1. 普通的卷积在面对数据中的多样性和复杂性时,可能无法捕捉到有效的特征,因此我们采用了SMPConv,其具备最新的自适应点移动机制,从而更好地捕捉局部特征,提高特征提取的灵活性和准确性。 2. 在SMPConv后添加CGLU,Convolutional GLU 结合了卷积和门控机制,能够选择性地通过信息通道,提高了特征提取的有效性和灵活性。 # 更新公告 - **20231105-rtdetr-v1.0** 1. 初版项目发布. - **20231109-rtdetr-v1.1** 1. 修复断点训练不能正常使用的bug. 2. 优化get_FPS.py中的模型导入方法. 3. 增加以yolov5和yolov8为基准模型更换为RTDETR的Head,后续也会提供yolov5-detr,yolov8-detr相关的改进. 4. 新增百度云视频-20231109更新说明视频和替换主干说明视频. 5. 新增GhostHGNetV2,RepHGNetV2,详细请看使用教程中的RT-DETR改进方案. 6. 新增使用DWRSeg中的Dilation-wise Residual(DWR)模块,加强从网络高层的可扩展感受野中提取特征,详细请看使用教程中的RT-DETR改进方案. - **20231119-rtdetr-v1.2** 1. 增加DCNV2,DCNV3,DCNV2-Dynamic,并以RTDETR-R18,RTDETR-R50,YOLOV5-Detr,YOLOV8-Detr多个基准模型进行改进,详细请看使用教程中的RT-DETR改进方案. 2. 使用CVPR2022-OrthoNets中的正交通道注意力改进resnet18-backbone中的BasicBlock,resnet50-backbone中的BottleNeck,yolov8-C2f,yolov5-C3,详细请看使用教程中的RT-DETR改进方案. 3. 使用LearnedPositionalEncoding改进AIFI中的位置编码信息生成,详细请看使用教程中的RT-DETR改进方案. 4. 增加EMO模型中的iRMB模块,并使用(EfficientViT-CVPR2023)中的CascadedAttention对其二次创新得到iRMB_Cascaded,详细请看使用教程中的RT-DETR改进方案. 5. 百度云视频增加1119更新说明和手把手添加注意力机制视频教学. 6. 更新使用教程. - **20231126-rtdetr-v1.3** 1. 支持IoU,GIoU,DIoU,CIoU,EIoU,SIoU. 2. 支持MPDIoU,Inner-IoU,Inner-MPDIoU. 3. 支持Normalized Gaussian Wasserstein Distance. 4. 支持小目标检测层P2. 5. 支持DySnakeConv. 6. 新增Pconv,PConv-Rep(二次创新)优化rtdetr-r18与rtdetr-r50. 7. 新增Faster-Block,Faster-Block-Rep(二次创新),Faster-Block-EMA(二次创新),Faster-Block-Rep-EMA(二次创新)优化rtdetr-r18、rtdetr-r50、yolov5-detr、yolov8-detr. 8. 更新使用教程. 
9. 百度云视频增加1126更新说明. - **20231202-rtdetr-v1.4** 1. 支持AKConv(具有任意采样形状和任意数目参数的卷积核). 2. 支持RFAConv,RFCAConv,RFCBAMConv(感受野注意力卷积). 3. 支持UniRepLKNet(大核CNNRepLK正统续作). 4. 使用CVPR2022 DAttention改进AIFI. 5. 更新使用教程. 6. 百度云视频增加1202更新说明. 7. 解决训练过程中由于指标出现的nan问题导致best.pt没办法正常保存. - **20231210-rtdetr-v1.5** 1. 支持来自Swift Parameter-free Attention Network中的重参数化Conv3XC模块. 2. 支持UniRepLKNet中的DilatedReparamBlock. 3. 支持UniRepLKNet中的DilatedReparamBlock对DWRSeg中的Dilation-wise Residual(DWR)模块进行二次创新的DWR_DRB. 4. 使用ICCV2023 FLatten Transformer中的FocusedLinearAttention改进AIFI. 5. 更新使用教程. 6. 百度云视频增加1210更新说明. - **20231214-rtdetr-v1.6** 1. 支持DiverseBranchBlock. 2. 利用DualConv打造CSP Efficient Dual Layer Aggregation Networks(仅支持yolov5-detr和yolov8-detr). 3. 使用Swift Parameter-free Attention Network中的重参数化Conv3XC和DiverseBranchBlock改进RepC3. 4. 支持最新的ASF-YOLO中的Attentional Scale Sequence Fusion. 5. 更新使用教程. 6. 百度云视频增加1214更新说明. - **20231223-rtdetr-v1.7** 1. 增加rtdetr-r18-asf-p2.yaml,使用ASF-YOLO中的Attentional Scale Sequence Fusion与Small Object Detection Head进行二次创新. 2. 新增rtdetr-slimneck.yaml和rtdetr-slimneck-ASF.yaml. 3. 新增yolov8-detr-slimneck.yaml,yolov8-detr-slimneck-asf.yaml. 4. 新增yolov5-detr-slimneck.yaml,yolov5-detr-slimneck-asf.yaml. 5. 修正热力图计算中预处理. 6. 更新使用教程. 7. 百度云视频增加1223更新说明. - **20240106-rtdetr-v1.8** 1. 新增Shape-IoU,Inner-Shape-IoU. 2. 新增支持TransNeXt主干和TransNeXt中的聚合感知注意力机制. 3. 新增U-NetV2中的Semantics and Detail Infusion Module对RTDETR的CCFM进行创新. 4. ASF系列支持attention_add. 5. 更新使用教程. 6. 百度云视频增加20240106更新说明. - **20240113-rtdetr-v1.9** 1. 支持Wise-IoU(v1,v2,v3)系列(IoU,WIoU,EIoU,GIoU,DIoU,CIoU,SIoU,MPDIoU,ShapeIoU). 2. 支持Inner-Wise-IoU(v1,v2,v3)系列(IoU,WIoU,EIoU,GIoU,DIoU,CIoU,SIoU,MPDIoU,ShapeIoU). 3. 支持SlideLoss,EMASlideLoss(利用Exponential Moving Average优化mean iou,可当自研创新模块). 4. 使用华为2023最新GOLD-YOLO中的Gatherand-Distribute进行改进特征融合模块. 5. 使用ASF-YOLO中Attentional Scale Sequence Fusion与GOLD-YOLO中的Gatherand-Distribute进行二次创新结合. 6. 修正rtdetr-r34中检测头参数错误的问题,增加rtdetr-r34,rtdetr-r50-m的预训练权重. 7. 更新使用教程. 8. 百度云视频增加20240113更新说明. 
- **20240120-rtdetr-v1.10** 1. 新增DCNV4. 2. 使用[LITv2](https://github.com/ziplab/LITv2)中具有提取高低频信息的高效注意力对AIFI进行二次改进. 3. 使用[MFDS-DETR](https://github.com/JustlfC03/MFDS-DETR)中的HS-FPN改进RTDETR中的CCFM和YOLOV5-DETR、YOLOV8-DETR中的Neck. 4. 对[MFDS-DETR](https://github.com/JustlfC03/MFDS-DETR)中的HS-FPN进行二次创新后得到HSPAN改进RTDETR中的CCFM和YOLOV5-DETR、YOLOV8-DETR中的Neck. 5. 修复没有使用wiou时候断点续训的bug. 6. 修复plot_result.py画结果图中乱码的问题. 7. 更新使用教程. 8. 百度云视频增加20240120更新说明. - **20240128-rtdetr-v1.11** 1. 增加CARAFE轻量化上采样算子. 2. 增加DySample(ICCV2023)动态上采样算子. 3. 增加Haar wavelet downsampling下采样算子. 4. 增加Focaler-IoU,Focaler-GIoU,Focaler-DIoU,Focaler-CIoU,Focaler-EIoU,Focaler-SIoU,Focaler-Shape-IoU,Focaler-MPDIoU. 5. 增加Focaler-Wise-IoU(v1,v2,v3)(IoU,WIoU,EIoU,GIoU,DIoU,CIoU,SIoU,MPDIoU,ShapeIoU). 6. 使用DySample(ICCV2023)动态上采样算子对ASF-YOLO中的Attentional Scale Sequence Fusion进行二次创新. 7. 更新使用教程. 8. 百度云视频增加20240128更新说明. - **20240206-rtdetr-v1.12** 1. 新增Shift-ConvNets相关改进内容.(rtdetr-SWC.yaml,rtdetr-R50-SWC.yaml,yolov8-detr-C2f-SWC.yaml,yolov5-detr-C3-SWC.yaml) 2. 使用UniRepLKNet中的DilatedReparamBlock对EMO中的iRMB进行二次创新. 3. 使用Shift-ConvNets中的具有移位操作的卷积对EMO中的iRMB进行二次创新. 4. 更新使用教程. 5. 百度云视频增加20240206更新说明. - **20240219-rtdetr-v1.13** 1. 使用最新的Mamba架构(号称超越Transformer的新架构)改进rtdetr-r18,rtdetr-r50,yolov5-detr,yolov8-detr. 2. 新增Powerful-IoU,Powerful-IoUV2,Inner-Powerful-IoU,Inner-Powerful-IoUV2,Focaler-Powerful-IoU,Focaler-Powerful-IoUV2,Wise-Powerful-IoU(v1,v2,v3),Wise-Powerful-IoUV2(v1,v2,v3)系列. 3. 更新热力图脚本,使用方式可参考最新发的yolov5v7-gradcam的视频. 4. 更新COCO脚本,增加其他指标输出. 5. 更新使用教程. 6. 百度云视频增加20240219更新说明. - **20240225-rtdetr-v1.14** 1. 新增YOLOV9中的RepNCSPELAN模块. 2. 使用DBB,OREPA,DilatedReparamBlock,Conv3XC对YOLOV9中的RepNCSPELAN模块进行二次创新. 3. 更新使用教程. 4. 百度云视频增加20240225更新说明. - **20240302-rtdetr-v1.15** 1. 新增CGNet中的Light-weight Context Guided和Light-weight Context Guided DownSample模块. 2. Neck模块新增BIFPN,并对其进行创新,支持替换不同的block. 3. 为RTDETR定制SlideVarifocalLoss,EMASlideVarifocalLoss. 4. 更新使用教程. 5. 百度云视频增加20240302更新说明. - **20240307-rtdetr-v1.16** 1. 
新增自研Neck结构Parallel Atrous Convolution Attention Pyramid Network, PAC-APN.附带模块内结构图 2. 复现Lightweight Object Detection中的Dynamic Group Convolution Shuffle Transformer. 3. 更新使用教程. 4. 百度云视频增加20240307更新说明. - **20240321-rtdetr-v1.17** 1. 新增CVPR2024-RMT主干,并支持RetBlock改进RepC3. 2. 新增2024年新出的Efficient Local Attention,并用其对HSFPN进行二次创新. 3. 使用CVPR2021-CoordAttention对HSFPN进行二次创新. 4. 更新使用教程,增加多个常见疑问解答. 5. 百度云视频增加20240321更新说明. - **20240404-rtdetr-v1.18** 1. 新增CVPR2024 PKINet主干. 2. 新增CVPR2024 PKINet中的PKIModule和CAA模块,提出C2f-PKI. 3. 使用CVPR2024 PKINet中的Context Anchor Attention改进RepNCSPELAN、HSFPN. 4. 新增CVPR2024 Frequency-Adaptive Dilated Convolution. 5. 增加有效感受野可视化脚本. 6. 更新使用教程 7. 百度云视频增加20240404更新说明. - **20240412-rtdetr-v1.19** 1. 新增自研Focusing Diffusion Pyramid Network. 2. 新增HCFNet针对小目标分割的Parallelized Patch-Aware Attention Module改进C2f. 3. 新增HCFNet针对小目标分割的Dimension-Aware Selective Integration Module对自研Focusing Diffusion Pyramid Network再次进行创新. 4. 更新使用教程. 5. 百度云视频增加20240412更新说明. - **20240427-rtdetr-v1.20** 1. 新增mobilenetv4-backbone. 2. 新增A Robust Feature Downsampling Module for Remote Sensing Visual Tasks中的下采样. 3. 新增Context and Spatial Feature Calibration for Real-Time Semantic Segmentation中的Context and Spatial Feature Calibration. 4. 更新使用教程. 5. 百度云视频增加20240427更新说明. - **20240502-rtdetr-v1.21** 1. 新增支持content-guided attention fusion改进rtdetr-neck. 2. 新增支持使用CAFM对CGAFusion进行二次改进,得到CAFMFusion改进rtdetr-neck. 3. get_FPS.py脚本新增可以通过yaml测试推理速度. 4. 新增自研RGCSPELAN,其比C3、ELAN、C2f、RepNCSPELAN更低参数量和计算量更快推理速度. 5. 更新使用教程. 6. 百度云视频增加20240502更新说明. - **20240518-rtdetr-v1.22** 1. 新增CVPR2024-StarNet-Backbone以及其衍生的改进(C3-Star、C3-Star-CAA、C2f-Star、C2f-Star-CAA、BasicBlock_Star、BottleNeck_Star). 2. 使用CVPR2024-TransNext中的Convolutional GLU对CVPR2023-FasterBlock进行二次创新(C3_Faster_CGLU, C2f_Faster_CGLU, BasicBlock_Faster_Block_CGLU, BottleNeck_Faster_Block_CGLU). 3. 新增PSFusion中的superficial detail fusion module、profound semantic fusion module. 4. 更新使用教程. 5. 百度云视频增加20240518更新说明. - **20240525-rtdetr-v1.23** 1. KAN In! 
Mamba Out!,集成pytorch-kan-conv,支持多种KAN变种! 2. 同步DCNV4-CVPR2024最新代码. 3. 更新使用教程. 4. 百度云视频增加20240525更新说明. - **20240608-rtdetr-v1.24** 1. 新增自研ContextGuideFPN. 2. 新增detail-enhanced convolution改进RTDETR. 3. 新增自研SMPCGLU,里面的模块分别来自CVPR2023和CVPR2024. 4. 更新使用教程. 5. 百度云视频增加20240608更新说明. - **20240618-rtdetr-v1.25** 1. 新增支持物理传热启发的视觉表征模型vHeat中的vHeatBlock. 2. 新增自研重校准特征金字塔网络(Re-CalibrationFPN),推出多个版本(P2345,P345,P3456). 3. 新增WaveletPool改进上采样和下采样. 4. 更新使用教程. 5. 百度云视频增加20240618更新说明. - **20240622-rtdetr-v1.26** 1. 新增RtDetr-Mamba. 2. 新增GLSA改进rtdetr-neck. 3. 新增GLSA对BIFPN进行二次创新. 4. 更新使用教程. 5. 百度云视频增加20240622更新说明. - **20240703-rtdetr-v1.27** 1. 新增UCTransNet中的ChannelTransformer改进rtdetr-neck. 2. 新增自研SmallObjectEnhancePyramid. 3. 新增SwiftFormer的EfficientAdditiveAttention改进AIFI. 4. 更新使用教程. 5. 百度云视频增加20240703更新说明. - **20240715-rtdetr-v1.28** 1. 新增自研Context-Guided Spatial Feature Reconstruction Feature Pyramid Network. 2. 新增Wavelet Convolutions for Large Receptive Fields中的WTConv改进BasicBlock. 3. 新增UBRFC-Net中的Adaptive Fine-Grained Channel Attention. 4. 更新使用教程. 5. 百度云视频增加20240715更新说明. - **20240725-rtdetr-v1.29** 1. 新增ECCV2024-SMFANet中的Feature Modulation block. 2. 新增Rethinking Performance Gains in Image Dehazing Networks中的gConvblock. 3. 更新使用教程. 4. 百度云视频增加20240725更新说明. - **20240802-rtdetr-v1.30** 1. 新增LDConv. 2. 新增MAF-YOLO中的MAFPN,并利用BIFPN的思想对MAFPN进行二次创新得到BIMAFPN. 3. 更新使用教程. 4. 百度云视频增加20240802更新说明. - **20240815-rtdetr-v1.31** 1. 新增YOLO-MIF中的WDBB、DeepDBB的重参数化模块. 2. 新增SLAB中的RepBN改进AIFI. 3. 更新使用教程. 4. 百度云视频增加20240815更新说明. - **20240825-rtdetr-v1.32** 1. 新增CAS-ViT中的AdditiveBlock和CSP思想改进backbone. 2. 新增CAS-ViT中的AdditiveBlock改进AIFI. 3. 新增自研Efficient Multi-Branch&Scale FPN. 4. 更新使用教程. 5. 百度云视频增加20240825更新说明. - **20240902-rtdetr-v1.33** 1. 新增CMTFUnet和TransNext的二次创新模块. 2. 新增自研CSP-Partial Multi-Scale Feature Aggregation. 3. 更新使用教程. 4. 百度云视频增加20240902更新说明. - **20240912-rtdetr-v1.34** 1. 新增Cross-Layer Feature Pyramid Transformer for Small Object Detection in Aerial Images中的CFPT. 2. 新增ICLR2024中的MogaBlock. 3. 更新使用教程. 4. 
百度云视频增加20240912更新说明. - **20240926-rtdetr-v1.35** 1. 新增CVPR2024-SHViT中的SHSABlock和其的二次创新. 2. 新增BIBM2024-SMAFormer中的SMAFormerBlock和其的二次创新. 3. 新增TPAMI2024-FreqFusion中的FreqFusion改进Neck. 4. 新增自研MutilBackBone-DynamicAlignFusion. 5. 更新使用教程. 6. 百度云视频增加20240926更新说明. - **20241020-rtdetr-v1.36** 1. 新增Histoformer ECCV2024中的Dynamic-range Histogram Self-Attention改进AIFI. 2. 新增自研CSP-MutilScaleEdgeInformationEnhance. 3. 新增Efficient Frequency-Domain Image Deraining with Contrastive Regularization ECCV2024中的Fused_Fourier_Conv_Mixer与CSP思想结合改进rtdetr-backbone. 4. 更新使用教程. 5. 百度云视频增加20241020更新说明. - **20241106-rtdetr-v1.37** 1. 新增自研CSP-FreqSpatial. 2. 新增SFHformer ECCV2024中的block与CSP思想结合改进 rtdetr-backbone. 3. 新增Revitalizing Convolutional Network for Image Restoration TPAMI2024中的MSM与CSP思想结合改进rtdetr-backbone. 4. 更新使用教程. 5. 百度云视频增加20241106更新说明. - **20241118-rtdetr-v1.38** 1. 基于自研CSP-MutilScaleEdgeInformationEnhance再次创新得到CSP-MutilScaleEdgeInformationSelect. 2. 新增Pattern Recognition 2024|DRANet中的HDRAB和RAB模块与CSP思想结合改进rtdetr-backbone. 3. 新增ECCV2022-ELAN中的Local feature extraction改进RepC3. 4. 更新使用教程. 5. 百度云视频增加20241118更新说明. - **20241130-rtdetr-v1.39** 1. 新增自研GlobalEdgeInformationTransfer. 2. 新增FreqFormer的Frequency-aware Cascade Attention与CSP结合改进backbone. 3. 更新使用教程. 4. 百度云视频增加20241130更新说明. - **20241215-rtdetr-v1.40** 1. 新增CrossFormer中的DynamicPosBias-Attention改进AIFI. 2. 新增CAMixerSR中的CAMixer与CSP结合改进backbone. 3. 修改保存模型规则,原本为fp16变成fp32,详细请看本期更新视频. 4. 百度云视频增加20241215更新说明. - **20241216-rtdetr-v1.41** 1. 新增Hyper-YOLO中的Hypergraph Computation in Semantic Space和Mixed Aggregation Network改进rtdetr. 2. 修复已知bug. 3. 更新使用教程. 4. 百度云视频增加20241216更新说明. - **20241228-rtdetr-v1.42** 1. 新增基于Hyper-YOLO中的Mixed Aggregation Network三个二次改进系列. 2. 新增使用MSA^2 Net中的Multi-Scale Adaptive Spatial Attention Gate改进rtdetr-neck. 3. 新增使用MSA^2 Net中的Multi-Scale Adaptive Spatial Attention Gate改进自研系列的MutilBackbone. 4. 更新使用教程. 5. 百度云视频增加20241228更新说明. - **20250111-rtdetr-v1.43** 1. 新增CRAFT-SR中的high-frequency enhancement residual block与CSP结合改进backbone. 
2. 新增AAAI2025-TBSN中的DTAB改进backbone、AIFI. 3. 新增ECCV2024-FSEL中的多个模块改进rtdetr. 4. 新增ACMMM2024-WFEN中的多个模块改进rtdetr. 5. 更新使用教程. 6. 百度云视频增加20250111更新说明. - **20250119-rtdetr-v1.44** 1. 新增AAAI2025 Pinwheel-shaped Convolution and Scale-based Dynamic Loss for Infrared Small Target Detection中的Pinwheel-shaped Convolution类型改进. 2. 新增AAAI2025 ConDSeg中的ContrastDrivenFeatureAggregation与ACMMM2024 WFEN中的小波变换进行创新. 3. 更新使用教程. 4. 百度云视频增加20250119更新说明. - **20250204-rtdetr-v1.45** 1. 新增ELGC-Net的改进及其二次创新. 2. 新增ICLR2025 PolaFormer中的PolaAttention改进AIFI. 3. 新增遥感目标检测Strip R-CNN中的StripBlock及其二次创新. 4. 新增BIBM2024 Spatial-Frequency Dual Domain Attention Network For Medical Image Segmentation中的Frequency-Spatial Attention和Multi-scale Progressive Channel Attention. 5. 更新使用教程. 6. 百度云视频增加20250204更新说明. - **20250206-rtdetr-v1.46** 1. 新增ICLR2025 Kolmogorov-Arnold Transformer中的KAT及其配合FasterBlock的二次创新.<此模块需要编译> 2. 更新使用教程. 3. 百度云视频增加20250206更新说明. - **20250216-rtdetr-v1.47** 1. 新增自研模块DynamicInceptionDWConv2d. 2. 新增GlobalFilter和DynamicFilter. 3. 更新使用教程. 4. 百度云视频增加20250216更新说明. - **20250303-rtdetr-v1.48** 1. 新增自研模块Hierarchical Attention Fusion并提供多种使用方式. 2. 新增ICLR2025-Token Statistics Transformer中的TSSA改进AIFI. 3. 新增MHAF-YOLO中的RepHMS.<这个是YOLO群内的一个博士新作品> 4. 更新使用教程. 5. 百度云视频增加20250303更新说明. - **20250315-rtdetr-v1.49** 1. 新增CVPR2024-Adaptive Sparse Transformer的模块改进aifi. 2. 新增CVPR2025-MambaIR的模块. 3. 新增CVPR2025-SCSegamba中的模块. 4. 新增CVPR2025-MambaOut中的模块. 5. 新增CVPR2025-DEIM MAL损失函数. 6. 更新使用教程. 7. 百度云视频增加20250315更新说明. - **20250403-rtdetr-v1.50** 1. 新增CVPR2025-MambaOut与CVPR2024-UniRepLKNet二次创新后的模块. 2. 新增CVPR2025-EfficientViM和其与CVPR2024-TransNeXt的二次创新后的模块. 3. 新增CVPR2024-EMCAD中的EUCB. 4. 新增CVPR2025-BHViT中的ShiftChannelMix和CVPR2024-EMCAD中的EUCB二次创新模块. 5. 新增rtdetr-EMBSFPN.yaml方案上引入[CVPR2025 BHViT](https://github.com/IMRL/BHViT)中的ShiftChannelMix. 6. 新增CVPR2025-HVI中的Intensity Enhancement Layer. 7. 新增CVPR2025-OverLock中的模块. 8. 更新使用教程. 9. 百度云视频增加20250403更新说明. - **20250420-rtdetr-v1.51** 1. 
新增ICLR2024-FTIC中的多个模块、以及其与ICLR2025-PolaFormer的二次创新模块. 2. 新增CVPR2024-DCMPNet中的多个模块. 3. 新增ICLR2025-PolaFormer与CVPR2024-TransNext的二次创新模块. 4. 新增CVPR2025-OverLock中的GDSAFusion. 5. 新增统计配置文件的计算量和参数量并排序的脚本. 6. 更新使用教程. 7. 百度云视频增加20250420更新说明. - **20250508-rtdetr-v1.52** 1. 新增CVPR2025-MobileMamba的相关改进. 2. 新增LEGNet中的LFEModule和LoGStem改进. 3. 新增WACV2025-SEMNet中的Snake Bi-Directional Sequence Modelling (SBSM)和Spatially-Enhanced Feedforward Network (SEFN)的多个改进,并含有二次创新相关内容. 4. 新增CVPR2025-LSNet中的多个改进,并含有二次创新相关内容. 5. 新增CVPR2025-DynamicTan中的多个改进,并含有二次创新相关内容. 6. 更新使用教程. 7. 百度云视频增加20250508更新说明. - **20250523-rtdetr-v1.53** 1. 新增TransMamba中的多个改进. 2. 新增CVPR2025-EVSSM中的多个改进. 3. 新增CVPR2025-DarkIR中的多个改进. 4. 更新使用教程. 5. 百度云视频增加20250523更新说明. - **20250606-rtdetr-v1.54** 1. 新增CVPR2025-FDConv的改进及其多个二次创新模块. 2. 新增DSA: Deformable Spatial Attention的改进及其多个二次创新模块. 3. 新增CVPR2025-MaIR中的Residual Mamba Block. 4. 更新使用教程. 5. 百度云视频增加20250606更新说明. - **20250622-rtdetr-v1.55** 1. 新增ECCV2024-rethinkingfpn中的模块,并对原创改进SOEP再次创新。 2. 新增CVPR2024-SFSConv的改进及其多个二次创新模块. 3. 新增CVPR2025-GroupMamba中的模块. 4. 新增CVPR2025-MambaVision中的模块. 5. 新增AAAI2025-FBRTYOLO中的模块. 6. 更新使用教程. 7. 百度云视频增加20250622更新说明. 8. 修复在torch2.6.0以及以上的版本会出现模型读取失败的问题. - **20250711-rtdetr-v1.56** 1. 新增Pyramid Sparse Transformer改进rtdetr-neck. 2. 新增Pyramid Sparse Transformer对SOEP再创新. 3. 新增weightedConvolution2.0. 4. 新增MIA2025-FourierConv. 5. 新增AAAI2025的HS-FPN. 6. 更新使用教程. 7. 百度云视频增加20250711更新说明. - **20250727-rtdetr-v1.57** 1. 新增ICCV2025-ESC中的模块. 2. 新增ICCV2025-MobileIE中的模块. 3. 新增ICCV2025-VSSD中的模块. 4. 新增ICCV2025-TinyVIM中的模块. 5. 新增MSLA. 6. 新增INFFUS2025-SAMamba中的模块. 7. 新增TGRS2025-UMFormer中模块. 8. 更新使用教程. 9. 百度云视频增加20250727更新说明. - **20250815-rtdetr-v1.58** 1. 新增CPRAformer中的EPGO多个改进。 2. 新增ICCV2025-ESC中的ConvAttn改进。 3. 更新使用教程. 4. 百度云视频增加20250815更新说明. - **20250829-rtdetr-v1.59** 1. 新增ICCV2025-UniConvBlock中的模块. 2. 新增ICCV2025-ConverseBNet中的模块. 3. 新增ACM MM 2025-Mobile U-ViT中的模块. 4. 更新使用教程. 5. 百度云视频增加20250829更新说明. - **20250914-rtdetr-v1.60** 1. 新增CVPR2025-GCConv模块. 2. 
新增AAAI2024-CFBlock模块. 3. 新增ICCV2023-FastViT中的RepStem模块. 4. 更新使用教程. 5. 百度云视频增加20250914更新说明. - **20251008-rtdetr-v1.61** 1. 新增IJCV2024-SRConvNet中的模块. 2. 新增LWGANet中的模块. 3. 更新使用教程. 4. 百度云视频增加20251008更新说明. - **20251028-rtdetr-v1.62** 1. 新增TGRS2025-ASCNet中的模块. 2. 新增ICCV2025-HFRB模块. 3. 新增ICIP2025-BEVANET中的模块. 4. 新增TPAMI2025-LRFormer中的模块. 5. 新增ICCV2025-Rectifying Magnitude Neglect in Linear Attention的模块. 6. 更新使用教程. 7. 百度云视频增加20251028更新说明. - **20251122-rtdetr-v1.63** 1. 新增GRSL2025-Gaussian Combined Distance,详细请看LOSS改进系列.md. 2. 新增ACCV2024-PlainUSR中的模块. 3. 更新使用教程. 4. 百度云视频增加20251122更新说明. - **20251219-rtdetr-v1.64** 1. 新增CVPR2025-HVI中的LCA模块. 2. 新增TIP2025-SFMB模块. 3. 新增TGRS2025-HAFNet中的HFFE模块. 4. 更新使用教程. 5. 百度云视频增加20251219更新说明. - **20260114-rtdetr-v1.65** 1. 新增YOLO-Master中的MoE模块. 2. 新增ACMMM2025-FlickCD中的模块. 3. 更新使用教程. 4. 百度云视频增加20260114更新说明. - **20260203-rtdetr-v1.66** 1. 新增TGRS2025-Think Locally and Act Globally中的模块. 2. 新增TGRS2025-ISGLNet中的多个模块. 3. 新增TGRS2025-MASFNet中的模块. 4. 更新使用教程. 5. 百度云视频增加20260203更新说明. - **20260224-rtdetr-v1.67** 1. 新增MICCAI2023-SHISRCNet中的模块. 2. 新增AAAI2026-Partial Channel Network中的模块. 3. 新增TGRS2025-DRPCANet中的模块. 4. 新增TGRS2025-ISGLNet中的模块. 5. 新增TGRS2025-HDNet中的模块. 6. 更新使用教程. 7. 百度云视频增加20260223更新说明. - **20260307-rtdetr-v1.68** 1. 增加训练过程中的mAP75输出. 2. 优化detect.py中的特征图保存机制,使其可以单独保存每一个通道的特征图和总通道求和的特征图. - **20260321-rtdetr-v1.69** 1. 新增AAAI2026-SPJFBlock模块. 2. 新增TGRS2025-GLVMamba中的GLSS2D模块. 3. 新增TIP2025-DSMT中的CAFM模块. 4. 新增TGRS2025-USTNet中的DWMMSA模块. 5. 新增CVPR2026-MixerCSeg中的DEGConv模块. 6. 新增CVPR2026-BinaryAttention的模块. 7. 新增CVPR2026-TransMixer模块. 8. 新增CVPR2025-Wavelet and Prototype Augmented Query-based Transformer for Pixel-level Surface Defect Detection中的WCA模块. 9. 更新使用教程. 10. 百度云视频增加20260321更新说明. 11. 修复一些失效的链接. 
# If the COCO metrics never get generated, see this video for troubleshooting:
# https://www.bilibili.com/video/BV1SdNizEE4X/
# If you hit the "missing `info` key" error, install pycocotools==2.0.8.

def parse_opt():
    """Parse the command-line options of the COCO / TIDE evaluation script.

    Returns:
        argparse.Namespace: ``anno_json`` is the path of the ground-truth COCO
        json (default ``'data.json'``) and ``pred_json`` is the path of the
        model prediction json (default ``''``). Unrecognised arguments are
        ignored because ``parse_known_args`` is used.
    """
    parser = argparse.ArgumentParser()
    # COCO-format json file holding the dataset labels.
    parser.add_argument('--anno_json', type=str, default='data.json', help='label coco json path')
    # COCO-format json file holding the model predictions.
    parser.add_argument('--pred_json', type=str, default='', help='pred coco json path')
    return parser.parse_known_args()[0]


def run_evaluation(anno_json, pred_json):
    """Print the COCO bbox metrics and the TIDE error report for one prediction file.

    Args:
        anno_json: Path of the ground-truth COCO json.
        pred_json: Path of the prediction json.

    Raises:
        ValueError: If either path is empty. Checked before any file is
            opened so a forgotten ``--pred_json`` gives a clear message.
    """
    if not anno_json:
        raise ValueError('anno_json is empty, pass --anno_json <path to coco label json>')
    if not pred_json:
        raise ValueError('pred_json is empty, pass --pred_json <path to coco prediction json>')

    anno = COCO(anno_json)  # init annotations api
    pred = anno.loadRes(pred_json)  # init predictions api
    # Named coco_eval so the builtin eval() is not shadowed.
    coco_eval = COCOeval(anno, pred, 'bbox')
    coco_eval.evaluate()
    coco_eval.accumulate()
    coco_eval.summarize()

    tide = TIDE()
    tide.evaluate_range(datasets.COCO(anno_json), datasets.COCOResult(pred_json), mode=TIDE.BOX)
    tide.summarize()
    tide.plot(out_dir='tide_result')


if __name__ == '__main__':
    opt = parse_opt()
    run_evaluation(opt.anno_json, opt.pred_json)
def patch_pose_classes_for_gradcam():
    """Install Grad-CAM friendly ``kpts_decode`` methods on ``Pose`` and ``Pose26``.

    Both replacements, in their non-export path, decode on a ``clone()`` of the
    raw keypoint tensor, so the tensor handed in by the caller is never written
    to in place. The function only assigns the two methods and returns nothing.
    """
    # Replacement for Pose.kpts_decode
    def pose_kpts_decode_no_inplace(self, kpts: torch.Tensor) -> torch.Tensor:
        """Decode keypoints from predictions without modifying ``kpts`` itself."""
        ndim = self.kpt_shape[1]
        bs = kpts.shape[0]
        if self.export:
            y = kpts.view(bs, *self.kpt_shape, -1)
            a = (y[:, :, :2] * 2.0 + (self.anchors - 0.5)) * self.strides
            if ndim == 3:
                a = torch.cat((a, y[:, :, 2:3].sigmoid()), 2)
            return a.view(bs, self.nk, -1)
        else:
            # Work on a copy: the slice assignments below write into `y`, not `kpts`.
            y = kpts.clone()
            if ndim == 3:
                # Every ndim-th channel starting at 2 goes through a sigmoid.
                y[:, 2::ndim] = y[:, 2::ndim].sigmoid()
            y[:, 0::ndim] = (y[:, 0::ndim] * 2.0 + (self.anchors[0] - 0.5)) * self.strides
            y[:, 1::ndim] = (y[:, 1::ndim] * 2.0 + (self.anchors[1] - 0.5)) * self.strides
            return y

    # Replacement for Pose26.kpts_decode
    def pose26_kpts_decode_no_inplace(self, kpts: torch.Tensor) -> torch.Tensor:
        """Decode keypoints from predictions without modifying ``kpts`` itself."""
        ndim = self.kpt_shape[1]
        bs = kpts.shape[0]
        if self.export:
            y = kpts.view(bs, *self.kpt_shape, -1)
            # NCNN fix
            a = (y[:, :, :2] + self.anchors) * self.strides
            if ndim == 3:
                a = torch.cat((a, y[:, :, 2:3].sigmoid()), 2)
            return a.view(bs, self.nk, -1)
        else:
            # Work on a copy: the slice assignments below write into `y`, not `kpts`.
            y = kpts.clone()
            if ndim == 3:
                # Every ndim-th channel starting at 2 goes through a sigmoid.
                y[:, 2::ndim] = y[:, 2::ndim].sigmoid()
            # Unlike the Pose variant above, no `* 2.0` / `- 0.5` is applied here.
            y[:, 0::ndim] = (y[:, 0::ndim] + self.anchors[0]) * self.strides
            y[:, 1::ndim] = (y[:, 1::ndim] + self.anchors[1]) * self.strides
            return y

    # Apply the patches.
    Pose.kpts_decode = pose_kpts_decode_no_inplace
    Pose26.kpts_decode = pose26_kpts_decode_no_inplace
class ActivationsAndGradients:
    """Record activations and gradients of chosen layers and rank the raw model output.

    Calling the instance runs the model once, clears the previously recorded
    tensors, and returns the task-specific tensors sorted by descending score.
    """

    def __init__(self, model, target_layers, reshape_transform):
        self.model = model
        self.gradients = []
        self.activations = []
        self.reshape_transform = reshape_transform
        self.handles = []
        for layer in target_layers:
            activation_handle = layer.register_forward_hook(self.save_activation)
            self.handles.append(activation_handle)
            # Because of https://github.com/pytorch/pytorch/issues/61519 no module
            # backward hook is used; the gradient is captured by a tensor hook
            # that is itself installed from a forward hook.
            gradient_handle = layer.register_forward_hook(self.save_gradient)
            self.handles.append(gradient_handle)

    def save_activation(self, module, input, output):
        """Forward hook: store a detached CPU copy of the layer output."""
        if self.reshape_transform is None:
            activation = output
        else:
            activation = self.reshape_transform(output)
        self.activations.append(activation.cpu().detach())

    def save_gradient(self, module, input, output):
        """Forward hook: attach a tensor hook that stores the output's gradient."""
        hookable = hasattr(output, "requires_grad") and output.requires_grad
        if not hookable:
            # Hooks can only be registered on tensors that require grad.
            return

        def _store_grad(grad):
            if self.reshape_transform is not None:
                grad = self.reshape_transform(grad)
            # Gradients arrive in reverse layer order, so prepend each new one.
            self.gradients = [grad.cpu().detach(), *self.gradients]

        output.register_hook(_store_grad)

    @staticmethod
    def _descending_order(scores):
        """Return the first-batch indices that sort ``scores`` from high to low."""
        return torch.sort(scores, descending=True)[1][0]

    @staticmethod
    def _gather(tensor, order):
        """Transpose the first batch item of ``tensor`` and reorder its rows."""
        return torch.transpose(tensor[0], dim0=0, dim1=1)[order]

    def post_process(self, result):
        """Split the raw output into per-task tensors sorted by descending score.

        Returns ``None`` when the model is not end2end and its task is none of
        detect / segment / pose / obb / classify.
        """
        model = self.model
        if model.end2end:
            logits = result[:, :, 4:]
            boxes = result[:, :, :4]
            order = self._descending_order(logits[:, :, 0])
            return logits[0][order], boxes[0][order]

        task = model.task
        if task == 'detect':
            logits = result[:, 4:]
            boxes = result[:, :4]
            order = self._descending_order(logits.max(1)[0])
            return self._gather(logits, order), self._gather(boxes, order)

        if task == 'segment':
            logits = result[0][0][:, 4:4 + model.nc]
            boxes = result[0][0][:, :4]
            mask_p = result[0][1].squeeze()
            mask_nm = result[0][0][:, 4 + model.nc:].squeeze().transpose(1, 0)
            c, h, w = mask_p.size()
            mask = mask_nm @ mask_p.view(c, -1)
            order = self._descending_order(logits.max(1)[0])
            return self._gather(logits, order), self._gather(boxes, order), mask[order]

        if task in ('pose', 'obb'):
            # Both tasks share one layout: 4 box values, nc scores, then extras
            # (keypoints for pose, angle for obb).
            logits = result[:, 4:4 + model.nc]
            boxes = result[:, :4]
            extras = result[:, 4 + model.nc:]
            order = self._descending_order(logits.max(1)[0])
            return self._gather(logits, order), self._gather(boxes, order), self._gather(extras, order)

        if task == 'classify':
            return result[0]

    def __call__(self, x):
        self.gradients = []
        self.activations = []
        model_output = self.model(x)
        task = self.model.task
        if task == 'detect':
            scores, boxes = self.post_process(model_output[0])
            return [[scores, boxes]]
        if task == 'segment':
            # Segmentation needs the whole output, not only its first element.
            scores, boxes, masks = self.post_process(model_output)
            return [[scores, boxes, masks]]
        if task == 'pose':
            scores, boxes, poses = self.post_process(model_output[0])
            return [[scores, boxes, poses]]
        if task == 'obb':
            scores, boxes, angles = self.post_process(model_output[0])
            return [[scores, boxes, angles]]
        if task == 'classify':
            return [self.post_process(model_output)]

    def release(self):
        """Remove every hook registered by this instance."""
        for handle in self.handles:
            handle.remove()
class yolo_segment_target(yolo_detect_target):
    """Grad-CAM target for segmentation models.

    Sums, over the leading ranked predictions, the terms selected by
    ``ouput_type``: the top class score ("class"), the four box values ("box"),
    the mean of the mask row ("segment"), or all of them ("all").
    """

    def __init__(self, ouput_type, conf, ratio, end2end):
        super().__init__(ouput_type, conf, ratio, end2end)

    def forward(self, data):
        scores, boxes, masks = data
        total = scores.size(0)
        # Only the leading `ratio` share of the predictions is visited.
        limit = min(int(total * self.ratio), total)

        want_class = self.ouput_type in ("class", "all")
        want_box = self.ouput_type in ("box", "all")
        want_mask = self.ouput_type in ("segment", "all")

        target = None
        for idx in trange(limit):
            top_score = scores[idx].max()
            # Stop at the first prediction whose best score is under the threshold.
            if float(top_score) < self.conf:
                break
            if want_class:
                target = self._accumulate(target, top_score)
            if want_box:
                for coord in range(4):
                    target = self._accumulate(target, boxes[idx, coord])
            if want_mask:
                target = self._accumulate(target, masks[idx].mean())

        if target is None:
            # Nothing was accumulated: return a zero that is still attached to the graph.
            return self._zero_scalar_like(scores)
        return target
class yolo_heatmap:
    """Generate Grad-CAM style heatmaps for an ultralytics YOLO model.

    Construct it with the keys returned by ``get_params`` and call the instance
    with an image file (or a directory of images) and an output directory.
    """

    def __init__(self, weight, device, method, layer, backward_type, conf_threshold, ratio, show_result, renormalize, task, img_size, letterbox_auto):
        """Load the weights, build the CAM method and store every argument as an attribute.

        Raises:
            Exception: If ``task`` is not one of detect / segment / pose / obb / classify.
            ValueError: If ``method`` is not a supported CAM method name.
        """
        device = torch.device(device)
        model_yolo = YOLO(weight)
        model_names = model_yolo.names
        LOGGER.info(f'{ORANGE}model class info:{model_names}{RESET}')
        model = copy.deepcopy(model_yolo.model)
        model.to(device)
        model.info()
        for p in model.parameters():
            p.requires_grad_(True)
        model.eval()

        model.task = task
        # The copy used for CAM always runs with end2end switched off, whether
        # or not the loaded model defined (or enabled) the attribute.
        model.end2end = False

        target_classes = {
            'detect': yolo_detect_target,
            'segment': yolo_segment_target,
            'pose': yolo_pose_target,
            'obb': yolo_obb_target,
            'classify': yolo_classify_target,
        }
        if task not in target_classes:
            raise Exception(f"not support task({task}).")
        target = target_classes[task](backward_type, conf_threshold, ratio, model.end2end)

        target_layers = [model.model[l] for l in layer]

        cam_methods = {
            "GradCAMPlusPlus": GradCAMPlusPlus,
            "GradCAM": GradCAM,
            "XGradCAM": XGradCAM,
            "EigenCAM": EigenCAM,
            "HiResCAM": HiResCAM,
            "LayerCAM": LayerCAM,
            "RandomCAM": RandomCAM,
            "EigenGradCAM": EigenGradCAM,
            "KPCA_CAM": KPCA_CAM,
            "AblationCAM": AblationCAM,
        }
        if method not in cam_methods:
            raise ValueError(f"Unsupported CAM method '{method}'. Available methods: {', '.join(cam_methods)}")
        method = cam_methods[method](model, target_layers)
        # Swap in this file's recorder, which also ranks the raw YOLO output.
        method.activations_and_grads = ActivationsAndGradients(model, target_layers, None)

        colors = np.random.uniform(0, 255, size=(len(model_names), 3)).astype(np.int32)
        # Every local above (arguments included) becomes an instance attribute;
        # the other methods read e.g. self.method, self.target, self.model_yolo.
        self.__dict__.update(locals())

    def post_process(self, result):
        """Run NMS with this instance's confidence threshold and return the first image's result."""
        result = non_max_suppression(result, conf_thres=self.conf_threshold, iou_thres=0.65)[0]
        return result

    def draw_detections(self, box, color, name, img):
        """Draw one box and its label text on ``img`` and return ``img``."""
        xmin, ymin, xmax, ymax = list(map(int, list(box)))
        bgr = tuple(int(x) for x in color)
        cv2.rectangle(img, (xmin, ymin), (xmax, ymax), bgr, 2)  # detection box
        cv2.putText(img, str(name), (xmin, ymin - 5), cv2.FONT_HERSHEY_SIMPLEX, 0.8, bgr, 2, lineType=cv2.LINE_AA)  # label text
        return img

    def renormalize_cam_in_bounding_boxes(self, boxes, image_float_np, grayscale_cam):
        """Normalize the CAM to be in the range [0, 1]
        inside every bounding boxes, and zero outside of the bounding boxes.

        Boxes that are empty after clipping to the CAM bounds are skipped.
        """
        renormalized_cam = np.zeros(grayscale_cam.shape, dtype=np.float32)
        for x1, y1, x2, y2 in boxes:
            x1, y1 = max(x1, 0), max(y1, 0)
            x2, y2 = min(grayscale_cam.shape[1] - 1, x2), min(grayscale_cam.shape[0] - 1, y2)
            if x2 <= x1 or y2 <= y1:
                # A zero-area crop would make scale_cam_image reduce over an
                # empty array and raise; there is nothing to renormalize anyway.
                continue
            renormalized_cam[y1:y2, x1:x2] = scale_cam_image(grayscale_cam[y1:y2, x1:x2].copy())
        renormalized_cam = scale_cam_image(renormalized_cam)
        eigencam_image_renormalized = show_cam_on_image(image_float_np, renormalized_cam, use_rgb=True)
        return eigencam_image_renormalized

    def process(self, img_path, save_path):
        """Create and save the heatmap of one image.

        Returns:
            bool: ``True`` when the heatmap was written to ``save_path``;
            ``False`` when the image could not be read or decoded, or the CAM
            call raised ``AttributeError``.
        """
        # Read through np.fromfile + imdecode rather than cv2.imread.
        try:
            img = cv2.imdecode(np.fromfile(img_path, np.uint8), cv2.IMREAD_COLOR)
        except Exception:
            LOGGER.error(f"{RED}{img_path} read failure.{RESET}")
            return False
        if img is None:
            LOGGER.error(f"{RED}{img_path} decode failure (not an image or corrupted file).{RESET}")
            return False

        img, _, (top, bottom, left, right) = letterbox(img, new_shape=(self.img_size, self.img_size), auto=self.letterbox_auto)
        img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
        img = np.float32(img) / 255.0
        tensor = torch.from_numpy(np.transpose(img, axes=[2, 0, 1])).unsqueeze(0).to(self.device)
        LOGGER.info(f'{BOLD}{ORANGE}tensor size:{tensor.size()}{RESET}')

        try:
            grayscale_cam = self.method(tensor, [self.target])
        except AttributeError:
            LOGGER.warning(f"{CYAN}self.method(tensor, [self.target]) failure.{RESET}")
            return False

        grayscale_cam = grayscale_cam[0, :]
        cam_image = show_cam_on_image(img, grayscale_cam, use_rgb=True)

        pred = self.model_yolo.predict(tensor, conf=self.conf_threshold, iou=0.7, verbose=False)[0]
        if self.renormalize and self.task in ['detect', 'segment', 'pose']:
            cam_image = self.renormalize_cam_in_bounding_boxes(pred.boxes.xyxy.cpu().detach().numpy().astype(np.int32), img, grayscale_cam)
        if self.show_result:
            cam_image = pred.plot(img=cam_image,
                                  conf=True,  # show the confidence
                                  font_size=None,  # None: derived from the image size
                                  line_width=None,  # None: derived from the image size
                                  labels=False,  # do not draw the class labels
                                  )

        # Cut the letterbox padding off again.
        cam_image = cam_image[top:cam_image.shape[0] - bottom, left:cam_image.shape[1] - right]
        cam_image = Image.fromarray(cam_image)
        cam_image.save(save_path)
        return True

    def __call__(self, img_path, save_path):
        """Process one image file or every entry of a directory into ``save_path``.

        ``save_path`` is deleted first if it exists (directory or plain file)
        and then recreated as an empty directory. A single input image is
        saved as ``result.png``; directory entries keep their file names.
        """
        # Start from a clean output directory.
        if os.path.isdir(save_path):
            shutil.rmtree(save_path)
        elif os.path.exists(save_path):
            # shutil.rmtree cannot remove a regular file.
            os.remove(save_path)
        os.makedirs(save_path, exist_ok=True)

        if os.path.isdir(img_path):
            success, failed = 0, 0
            # Sorted so the processing order (and the log) is reproducible.
            for img_path_ in sorted(os.listdir(img_path)):
                ok = self.process(os.path.join(img_path, img_path_), os.path.join(save_path, img_path_))
                success += int(ok)
                failed += int(not ok)
            LOGGER.info(f"{BOLD}{ORANGE}processed images: success={success}, failed={failed}{RESET}")
        else:
            ok = self.process(img_path, os.path.join(save_path, 'result.png'))
            if not ok:
                LOGGER.error(f"{RED}failed to process input image: {img_path}{RESET}")
        LOGGER.info(f'{BOLD}{MAGENTA}进度条不满是正常现象,只要进度条不是0,都可以进行出图.{RESET}')
if __name__ == '__main__':
    yaml_path = 'ultralytics/cfg/models/26/yolo26n.yaml'
    # Build the YOLO network structure from the yaml model config.
    model = YOLO(yaml_path)
    # model.load('yolo26n.pt')  # load pretrained weights; generally not recommended

    # YOLO26 configs use the officially recommended MuSGD, every other model uses SGD.
    optimizer = 'MuSGD' if 'yolo26' in yaml_path else 'SGD'

    train_args = dict(
        data='/root/dataset/dataset_visdrone/data.yaml',  # dataset config file
        # Image caching: False = off, True = cache in RAM (memory hungry),
        # 'disk' = cache on disk (uses disk space).
        cache=False,
        imgsz=640,  # input image size in pixels
        epochs=300,  # total number of training epochs
        batch=16,  # batch size
        close_mosaic=0,  # disable Mosaic for the last N epochs; 0 keeps it on for the whole run
        workers=4,  # dataloader workers; try 0 on Windows if training stalls or errors oddly
        device='0',  # '0' = first GPU, 'cpu' = CPU, '0,1,2' = multiple GPUs
        optimizer=optimizer,
        patience=50,  # early stopping after 50 epochs without improvement; 0 disables it
        # resume=True,  # resume training; requires YOLO(...) to be built from last.pt
        amp=True,  # automatic mixed precision (default True); turn off if the loss becomes nan
        # fraction=0.2,  # 0.2 trains on only 20 percent of the data
        cos_lr=False,  # cosine learning-rate schedule (default False)
        save_period=-1,  # checkpoint every N epochs; -1 keeps only best and last
        project='train',  # project directory for the results
        name='exp',  # run name; exp2, exp3, ... are created if it already exists
    )
    model.train(**train_args)
class YOLOtoCOCO:
    """Convert a folder of YOLO txt labels and their images into one COCO json file."""

    # Accepted image suffixes, compared case-insensitively.
    IMAGE_EXTENSIONS = ('.jpg', '.jpeg', '.png', '.bmp')

    def __init__(self, yolo_dir, image_dir, class_names, output_json='coco_annotations.json'):
        """
        Set up the YOLO to COCO converter.

        Args:
            yolo_dir: Directory holding the YOLO label (.txt) files.
            image_dir: Directory holding the image files.
            class_names: List of class names; the index is the YOLO class id.
            output_json: Path of the COCO-format json file to write.
        """
        self.yolo_dir = Path(yolo_dir)
        self.image_dir = Path(image_dir)
        self.class_names = class_names
        self.output_json = output_json

        # Skeleton of the COCO document.
        self.coco_format = {
            "images": [],
            "annotations": [],
            "categories": []
        }

        self.annotation_id = 0

    def create_categories(self):
        """Rebuild the category list from ``class_names`` (safe to call repeatedly)."""
        self.coco_format["categories"] = [
            {"id": i, "name": class_name, "supercategory": "object"}
            for i, class_name in enumerate(self.class_names)
        ]

    def yolo_to_coco_bbox(self, yolo_bbox, img_width, img_height):
        """
        Convert a YOLO bbox into a COCO bbox.

        YOLO format: [x_center, y_center, width, height] (normalized)
        COCO format: [x_min, y_min, width, height] (pixels)
        """
        x_center, y_center, width, height = yolo_bbox

        # Scale to pixel values.
        x_center *= img_width
        y_center *= img_height
        width *= img_width
        height *= img_height

        # Top-left corner plus width and height.
        x_min = x_center - width / 2
        y_min = y_center - height / 2

        return [x_min, y_min, width, height]

    def bbox_to_segmentation(self, bbox):
        """
        Turn a bbox into a rectangular COCO segmentation polygon.

        Args:
            bbox: [x_min, y_min, width, height]

        Returns:
            segmentation: [[x1, y1, x2, y2, x3, y3, x4, y4]], the four corners
            clockwise starting from the top-left one.
        """
        x_min, y_min, width, height = bbox
        x_max, y_max = x_min + width, y_min + height
        # top-left, top-right, bottom-right, bottom-left
        return [[x_min, y_min, x_max, y_min, x_max, y_max, x_min, y_max]]

    @staticmethod
    def _parse_label_line(line):
        """Parse one label line into ``(class_id, [xc, yc, w, h])``.

        Returns ``None`` when the line has fewer than five fields or a field is
        not numeric. Only the first five fields are read.
        """
        parts = line.split()
        if len(parts) < 5:
            return None
        try:
            return int(parts[0]), [float(x) for x in parts[1:5]]
        except ValueError:
            return None

    def _list_images(self):
        """Return the image files of ``image_dir``, each exactly once, sorted.

        The suffix is compared case-insensitively, so no file is listed twice
        on case-insensitive filesystems and mixed-case suffixes are found too.
        A missing directory yields an empty list.
        """
        if not self.image_dir.is_dir():
            return []
        return sorted(
            path for path in self.image_dir.iterdir()
            if path.is_file() and path.suffix.lower() in self.IMAGE_EXTENSIONS
        )

    def process_image(self, image_path, label_path):
        """Add one image and the annotations of its label file to the COCO document.

        An unreadable image is reported and skipped entirely. An image without
        a label file is still recorded, with no annotations. Malformed label
        lines are reported and skipped.
        """
        # The file name without extension serves as image_id.
        image_id = image_path.stem

        # Only the size is needed; the context manager closes the file again.
        try:
            with Image.open(image_path) as img:
                img_width, img_height = img.size
        except Exception as e:
            print(f"无法读取图片 {image_path}: {e}")
            return

        self.coco_format["images"].append({
            "id": image_id,
            "file_name": image_path.name,
            "width": img_width,
            "height": img_height
        })

        if not label_path.exists():
            print(f"标签文件不存在: {label_path}")
            return

        with open(label_path, 'r', encoding='utf-8') as f:
            lines = f.readlines()

        for line in lines:
            line = line.strip()
            if not line:
                continue

            parsed = self._parse_label_line(line)
            if parsed is None:
                print(f"跳过格式错误的标注 {label_path}: {line}")
                continue
            class_id, bbox = parsed

            coco_bbox = self.yolo_to_coco_bbox(bbox, img_width, img_height)
            area = coco_bbox[2] * coco_bbox[3]
            # Rectangle polygon built from the box.
            segmentation = self.bbox_to_segmentation(coco_bbox)

            self.coco_format["annotations"].append({
                "id": self.annotation_id,
                "image_id": image_id,
                "category_id": class_id,
                "bbox": coco_bbox,
                "area": area,
                "iscrowd": 0,
                "segmentation": segmentation
            })
            self.annotation_id += 1

    def convert(self):
        """Run the whole conversion and write ``output_json``.

        Starts from an empty document each time, so calling it twice does not
        duplicate images, annotations or categories.
        """
        print("开始转换YOLO格式到COCO格式...")

        self.coco_format["images"] = []
        self.coco_format["annotations"] = []
        self.annotation_id = 0
        self.create_categories()

        image_files = self._list_images()
        print(f"找到 {len(image_files)} 张图片")

        for image_path in image_files:
            # The label file shares the image's stem.
            label_path = self.yolo_dir / f"{image_path.stem}.txt"
            self.process_image(image_path, label_path)

        with open(self.output_json, 'w', encoding='utf-8') as f:
            json.dump(self.coco_format, f, indent=2, ensure_ascii=False)

        print("转换完成!")
        print(f"图片数量: {len(self.coco_format['images'])}")
        print(f"标注数量: {len(self.coco_format['annotations'])}")
        print(f"类别数量: {len(self.coco_format['categories'])}")
        print(f"输出文件: {self.output_json}")
ultralytics/cfg/models/11/yolo11-goldyolo-asf.yaml 利用华为2023最新GOLD-YOLO中的Gather-and-Distribute与[ASF-YOLO](https://github.com/mkang315/ASF-YOLO)中的Attentional Scale Sequence Fusion进行二次创新改进yolo11的neck. 12. ultralytics/cfg/models/11/yolo11-dyhead-DCNV4.yaml 使用[DCNV4](https://github.com/OpenGVLab/DCNv4)对DyHead进行二次创新.(请关闭AMP进行训练,使用教程请看20240116版本更新说明) 13. ultralytics/cfg/models/11/yolo11-HSPAN.yaml 对[MFDS-DETR](https://github.com/JustlfC03/MFDS-DETR)中的HS-FPN进行二次创新后得到HSPAN改进yolo11的neck. 14. ultralytics/cfg/models/11/yolo11-GDFPN.yaml 使用[DAMO-YOLO](https://github.com/tinyvision/DAMO-YOLO)中的RepGFPN与[ICCV2023 DySample](https://arxiv.org/abs/2308.15085)进行二次创新改进Neck. 15. ultralytics/cfg/models/11/yolo11-HSPAN-DySample.yaml 对[MFDS-DETR](https://github.com/JustlfC03/MFDS-DETR)中的HS-FPN进行二次创新后得到HSPAN再进行创新,使用[ICCV2023 DySample](https://arxiv.org/abs/2308.15085)改进其上采样模块. 16. ultralytics/cfg/models/11/yolo11-ASF-DySample.yaml 使用[ASF-YOLO](https://github.com/mkang315/ASF-YOLO)中的Attentional Scale Sequence Fusion与[ICCV2023 DySample](https://arxiv.org/abs/2308.15085)组合得到Dynamic Sample Attentional Scale Sequence Fusion. 17. ultralytics/cfg/models/11/yolo11-C3k2-DCNV2-Dynamic.yaml 利用自研注意力机制MPCA强化DCNV2中的offset和mask. 18. ultralytics/cfg/models/11/yolo11-C3k2-iRMB-Cascaded.yaml 使用[EfficientViT CVPR2023](https://github.com/microsoft/Cream/tree/main/EfficientViT)中的CascadedGroupAttention对[EMO ICCV2023](https://github.com/zhangzjn/EMO)中的iRMB进行二次创新来改进C3k2. 19. ultralytics/cfg/models/11/yolo11-C3k2-iRMB-DRB.yaml 使用[UniRepLKNet](https://github.com/AILab-CVC/UniRepLKNet/tree/main)中的DilatedReparamBlock对[EMO ICCV2023](https://github.com/zhangzjn/EMO)中的iRMB进行二次创新来改进C3k2. 20. ultralytics/cfg/models/11/yolo11-C3k2-iRMB-SWC.yaml 使用[shift-wise conv](https://arxiv.org/abs/2401.12736)对[EMO ICCV2023](https://github.com/zhangzjn/EMO)中的iRMB进行二次创新来改进C3k2. 21. 
ultralytics/cfg/models/11/yolo11-DBBNCSPELAN.yaml 使用[Diverse Branch Block CVPR2021](https://arxiv.org/abs/2103.13425)对[YOLOV9](https://github.com/WongKinYiu/yolov9)中的RepNCSPELAN进行二次创新后改进yolo11. 22. ultralytics/cfg/models/11/yolo11-OREPANCSPELAN.yaml 使用[Online Convolutional Re-parameterization (CVPR2022)](https://github.com/JUGGHM/OREPA_CVPR2022/tree/main)对[YOLOV9](https://github.com/WongKinYiu/yolov9)中的RepNCSPELAN进行二次创新后改进yolo11. 23. ultralytics/cfg/models/11/yolo11-DRBNCSPELAN.yaml 使用[UniRepLKNet](https://github.com/AILab-CVC/UniRepLKNet/tree/main)中的DilatedReparamBlock对[YOLOV9](https://github.com/WongKinYiu/yolov9)中的RepNCSPELAN进行二次创新后改进yolo11. 24. ultralytics/cfg/models/11/yolo11-DynamicHGNetV2.yaml 使用[CVPR2024 parameternet](https://arxiv.org/pdf/2306.14525v2.pdf)中的DynamicConv对[CVPR2024 RTDETR](https://arxiv.org/abs/2304.08069)中的HGBlock进行二次创新. 25. ultralytics/cfg/models/11/yolo11-C3k2-RVB-EMA.yaml 使用[CVPR2024 RepViT](https://github.com/THU-MIG/RepViT/tree/main)中的RepViTBlock和EMA注意力机制改进C3k2. 26. ultralytics/cfg/models/11/yolo11-ELA-HSFPN.yaml 使用[Efficient Local Attention](https://arxiv.org/abs/2403.01123)改进HSFPN. 27. ultralytics/cfg/models/11/yolo11-CA-HSFPN.yaml 使用[Coordinate Attention CVPR2021](https://github.com/houqb/CoordAttention)改进HSFPN. 28. ultralytics/cfg/models/11/yolo11-CAA-HSFPN.yaml 使用[CVPR2024 PKINet](https://github.com/PKINet/PKINet)中的CAA模块改进HSFPN. 29. ultralytics/cfg/models/11/yolo11-CSMHSA.yaml 对Mutil-Head Self-Attention进行创新得到Cross-Scale Mutil-Head Self-Attention. 1. 由于高维通常包含更高级别的语义信息,而低维包含更多细节信息,因此高维信息作为query,而低维信息作为key和Value,将两者结合起来可以利用高维的特征帮助低维的特征进行精细过滤,可以实现更全面和丰富的特征表达。 2. 通过使用高维的上采样信息进行Query操作,可以更好地捕捉到目标的全局信息,从而有助于增强模型对目标的识别和定位能力。 30. ultralytics/cfg/models/11/yolo11-CAFMFusion.yaml 利用[HCANet](https://github.com/summitgao/HCANet)中具有获取全局和局部信息能力的注意力机制CAFM对content-guided attention fusion进行二次改进. 31. 
ultralytics/cfg/models/11/yolo11-C3k2-Faster-CGLU.yaml 使用[TransNeXt CVPR2024](https://github.com/DaiShiResearch/TransNeXt)中的Convolutional GLU对CVPR2023中的FasterNet进行二次创新. 32. ultralytics/cfg/models/11/yolo11-C3k2-Star-CAA.yaml 使用[StarNet CVPR2024](https://github.com/ma-xu/Rewrite-the-Stars/tree/main)中的StarBlock和[CVPR2024 PKINet](https://github.com/PKINet/PKINet)中的CAA改进C3k2. 33. ultralytics/cfg/models/11/yolo11-bifpn-GLSA.yaml 使用[GLSA](https://github.com/Barrett-python/DuAT)模块对bifpn进行二次创新. 34. ultralytics/cfg/models/11/yolo11-BIMAFPN.yaml 利用BIFPN的思想对[MAF-YOLO](https://arxiv.org/pdf/2407.04381)的MAFPN进行二次改进得到BIMAFPN. 35. ultralytics/cfg/models/11/yolo11-C3k2-AdditiveBlock-CGLU.yaml 使用[CAS-ViT](https://github.com/Tianfang-Zhang/CAS-ViT)中的AdditiveBlock和[TransNeXt CVPR2024](https://github.com/DaiShiResearch/TransNeXt)中的Convolutional GLU改进C3k2. 36. ultralytics/cfg/models/11/yolo11-C3k2-MSMHSA-CGLU.yaml 使用[CMTFNet](https://github.com/DrWuHonglin/CMTFNet/tree/main)中的M2SA和[TransNeXt CVPR2024](https://github.com/DaiShiResearch/TransNeXt)中的Convolutional GLU改进C3k2. 37. ultralytics/cfg/models/11/yolo11-C3k2-IdentityFormer-CGLU.yaml 使用[Metaformer TPAMI2024](https://github.com/sail-sg/metaformer)中的IdentityFormer和[TransNeXt CVPR2024](https://github.com/DaiShiResearch/TransNeXt)中的CGLU改进C3k2. 38. ultralytics/cfg/models/11/yolo11-C3k2-RandomMixing-CGLU.yaml 使用[Metaformer TPAMI2024](https://github.com/sail-sg/metaformer)中的RandomMixing和[TransNeXt CVPR2024](https://github.com/DaiShiResearch/TransNeXt)中的CGLU改进C3k2. 39. ultralytics/cfg/models/11/yolo11-C3k2-PoolingFormer-CGLU.yaml 使用[Metaformer TPAMI2024](https://github.com/sail-sg/metaformer)中的PoolingFormer和[TransNeXt CVPR2024](https://github.com/DaiShiResearch/TransNeXt)中的CGLU改进C3k2. 40. ultralytics/cfg/models/11/yolo11-C3k2-ConvFormer-CGLU.yaml 使用[Metaformer TPAMI2024](https://github.com/sail-sg/metaformer)中的ConvFormer和[TransNeXt CVPR2024](https://github.com/DaiShiResearch/TransNeXt)中的CGLU改进C3k2. 41. 
ultralytics/cfg/models/11/yolo11-C3k2-CaFormer-CGLU.yaml 使用[Metaformer TPAMI2024](https://github.com/sail-sg/metaformer)中的CaFormer和[TransNeXt CVPR2024](https://github.com/DaiShiResearch/TransNeXt)中的CGLU改进C3k2. 42. ultralytics/cfg/models/11/yolo11-MAN-Faster.yaml 使用[Hyper-YOLO](https://www.arxiv.org/pdf/2408.04804)中的 Mixed Aggregation Network和[FasterNet CVPR2023](https://github.com/JierunChen/FasterNet)中的Faster-Block进行二次创新改进yolo11. 43. ultralytics/cfg/models/11/yolo11-MAN-FasterCGLU.yaml 使用[Hyper-YOLO](https://www.arxiv.org/pdf/2408.04804)中的 Mixed Aggregation Network和[FasterNet CVPR2023](https://github.com/JierunChen/FasterNet)中的Faster-Block和[TransNeXt CVPR2024](https://github.com/DaiShiResearch/TransNeXt)中的Convolutional GLU进行二次创新改进yolo11. 44. ultralytics/cfg/models/11/yolo11-MAN-Star.yaml 使用[Hyper-YOLO](https://www.arxiv.org/pdf/2408.04804)中的 Mixed Aggregation Network和[StarNet CVPR2024](https://github.com/ma-xu/Rewrite-the-Stars/tree/main)中的StarBlock进行二次创新改进yolo11. 45. ultralytics/cfg/models/11/yolo11-MutilBackbone-MSGA.yaml 使用[MSA^2 Net](https://github.com/xmindflow/MSA-2Net)中的Multi-Scale Adaptive Spatial Attention Gate对自研系列MutilBackbone再次创新. 46. ultralytics/cfg/models/11/yolo11-slimneck-WFU.yaml 使用[ACMMM2024 WFEN](https://github.com/PRIS-CV/WFEN)中的Wavelet Feature Upgrade改进slimneck. 47. ultralytics/cfg/models/11/yolo11-MAN-FasterCGLU-WFU.yaml 使用[ACMMM2024 WFEN](https://github.com/PRIS-CV/WFEN)中的Wavelet Feature Upgrade和[Hyper-YOLO](https://www.arxiv.org/pdf/2408.04804)中的 Mixed Aggregation Network和[FasterNet CVPR2023](https://github.com/JierunChen/FasterNet)中的Faster-Block和[TransNeXt CVPR2024](https://github.com/DaiShiResearch/TransNeXt)中的Convolutional GLU进行二次创新改进yolo11. 48. ultralytics/cfg/models/11/yolo11-CDFA.yaml 使用[ACMMM2024 WFEN](https://github.com/PRIS-CV/WFEN)中的WaveletConv与[AAAI2025 ConDSeg](https://github.com/Mengqi-Lei/ConDSeg)的ContrastDrivenFeatureAggregation结合改进yolo11. 49. 
ultralytics/cfg/models/11/yolo11-C3k2-Faster-KAN.yaml 使用[ICLR2025 Kolmogorov-Arnold Transformer](https://github.com/Adamdad/kat)中的KAN对(CVPR2023)fasternet中的FasterBlock进行二次创新. 50. ultralytics/cfg/models/11/yolo11-C3k2-ELGCACGLU.yaml 使用[ELGC-Net](https://github.com/techmn/elgcnet)中的ELGCA和[TransNeXt CVPR2024](https://github.com/DaiShiResearch/TransNeXt)中的Convolutional GLU改进C3k2. 51. ultralytics/cfg/models/11/yolo11-C3k2-StripCGLU.yaml 使用[Strip R-CNN](https://arxiv.org/pdf/2501.03775)中的StripBlock和[TransNeXt CVPR2024](https://github.com/DaiShiResearch/TransNeXt)中的Convolutional GLU改进C3k2. 52. ultralytics/cfg/models/11/yolo11-C3k2-DIMB-KAN.yaml 在ultralytics/cfg/models/11/yolo11-C3k2-DIMB.yaml的基础上把mlp模块换成[ICLR2025 Kolmogorov-Arnold Transformer](https://github.com/Adamdad/kat)中的KAN. 53. ultralytics/cfg/models/11/yolo11-C2TSSA-DYT.yaml 使用[CVPR2025 DyT](https://github.com/jiachenzhu/DyT)中的DynamicTanh和[ICLR2025 Token Statistics Transformer](https://github.com/RobinWu218/ToST)中的Token Statistics Self-Attention改进C2PSA. 54. ultralytics/cfg/models/11/yolo11-C2Pola-DYT.yaml 使用[CVPR2025 DyT](https://github.com/jiachenzhu/DyT)中的DynamicTanh和[ICLR2025 PolaFormer](https://github.com/ZacharyMeng/PolaFormer)中的PolaAttention改进C2PSA. 55. ultralytics/cfg/models/12/yolo12-A2C2f-CGLU-DYT.yaml 使用[CVPR2025 DyT](https://github.com/jiachenzhu/DyT)中的DynamicTanh和[TransNeXt CVPR2024](https://github.com/DaiShiResearch/TransNeXt)中的Convolutional GLU改进A2C2f. 56. ultralytics/cfg/models/12/yolo12-A2C2f-DFFN-DYT.yaml 使用[CVPR2025 DyT](https://github.com/jiachenzhu/DyT)中的DynamicTanh和[FreqFormer](https://github.com/JPWang-CS/FreqFormer)中的DFFN改进A2C2f. 57. ultralytics/cfg/models/11/yolo11-C3k2-MambaOut-UniRepLK.yaml 使用[CVPR2025 MambaOut](https://github.com/yuweihao/MambaOut)中的MambaOutBlock和[CVPR2024 UniRepLKNet](https://github.com/AILab-CVC/UniRepLKNet/tree/main)中的DilatedReparamBlock二次创新后改进C3k2. 58. 
ultralytics/cfg/models/11/yolo11-C3k2-EfficientVIM-CGLU.yaml 使用[CVPR2025 EfficientViM](https://github.com/mlvlab/EfficientViM)中的EfficientViMBlock和[TransNeXt CVPR2024](https://github.com/DaiShiResearch/TransNeXt)中的Convolutional GLU改进C3k2. 59. Localization Quality Estimation - Lightweight Shared Convolutional Detection Head Localization Quality Estimation模块出自[GFocalV2](https://arxiv.org/abs/2011.12885). detect:ultralytics/cfg/models/11/yolo11-LSCD-LQE.yaml seg:ultralytics/cfg/models/11/yolo11-seg-LSCD-LQE.yaml pose:ultralytics/cfg/models/11/yolo11-pose-LSCD-LQE.yaml obb:ultralytics/cfg/models/11/yolo11-obb-LSCD-LQE.yaml 60. ultralytics/cfg/models/11/yolo11-EUCB-SC.yaml 使用[CVPR2024 EMCAD](https://github.com/SLDGroup/EMCAD)中的EUCB和[CVPR2025 BHViT](https://github.com/IMRL/BHViT)中的ShiftChannelMix改进yolo11的上采样. 61. ultralytics/cfg/models/11/yolo11-EMBSFPN-SC.yaml 在ultralytics/cfg/models/11/yolo11-EMBSFPN.yaml方案上引入[CVPR2025 BHViT](https://github.com/IMRL/BHViT)中的ShiftChannelMix. 62. ultralytics/cfg/models/12/yolo12-A2C2f-FMFFN-DYT.yaml 使用[ICLR2024-FTIC](https://github.com/qingshi9974/ICLR2024-FTIC)中的FMFFN和[CVPR2025 DyT](https://github.com/jiachenzhu/DyT)中的DynamicTan对A2C2f二次创新. 63. ultralytics/cfg/models/11/yolo11-MFMMAFPN.yaml 使用[CVPR2024 DCMPNet](https://github.com/zhoushen1/DCMPNet)中的MFM对[MAF-YOLO](https://arxiv.org/pdf/2407.04381)的MAFPN进行二次创新. 64. ultralytics/cfg/models/11/yolo11-MBSMFFPN.yaml 使用[CVPR2024 DCMPNet](https://github.com/zhoushen1/DCMPNet)中的MFM对yolo11-EMBSFPN.yaml再次创新 Multi-Branch&Scale Modulation-Fusion FPN. 65. ultralytics/cfg/models/11/yolo11-hyper-MFM.yaml 使用[CVPR2024 DCMPNet](https://github.com/zhoushen1/DCMPNet)中的MFM对[Hyper-YOLO](https://www.arxiv.org/pdf/2408.04804)中的Hypergraph Computation in Semantic Space进行二次创新. 66. 
ultralytics/cfg/models/11/yolo11-C2TSSA-DYT-Mona-SEFN.yaml 使用[CVPR2025 DyT](https://github.com/jiachenzhu/DyT)中的DynamicTan和[ICLR2025 Token Statistics Transformer](https://github.com/RobinWu218/ToST)中的Token Statistics Self-Attention和[CVPR2025 Mona](https://github.com/Leiyi-Hu/mona)的Mona和[WACV2025 SEM-Net](https://github.com/ChrisChen1023/SEM-Net)的Spatially-Enhanced Feedforward Network (SEFN)改进C2PSA. 67. ultralytics/cfg/models/11/yolo11-C2TSSA-DYT-Mona.yaml 使用[CVPR2025 DyT](https://github.com/jiachenzhu/DyT)中的DynamicTan和[ICLR2025 Token Statistics Transformer](https://github.com/RobinWu218/ToST)中的Token Statistics Self-Attention和[CVPR2025 Mona](https://github.com/Leiyi-Hu/mona)的Mona改进C2PSA. 68. ultralytics/cfg/models/12/yolo12-A2C2f-DFFN-DYT-Mona.yaml 使用[CVPR2025 DyT](https://github.com/jiachenzhu/DyT)中的DynamicTanh和[FreqFormer](https://github.com/JPWang-CS/FreqFormer)中的DFFN和[CVPR2025 Mona](https://github.com/Leiyi-Hu/mona)的Mona改进A2C2f. 69. ultralytics/cfg/models/11/yolo11-C3k2-MambaOut-LSConv.yaml 使用[CVPR2025 LSNet](https://github.com/THU-MIG/lsnet)的LSConv与[CVPR2025 MambaOut](https://github.com/yuweihao/MambaOut)中的MambaOutBlock二次创新后改进C3k2. 70. ultralytics/cfg/models/11/yolo11-C2TSSA-DYT-Mona-SEFFN.yaml 使用[CVPR2025 DyT](https://github.com/jiachenzhu/DyT)中的DynamicTan和[ICLR2025 Token Statistics Transformer](https://github.com/RobinWu218/ToST)中的Token Statistics Self-Attention和[CVPR2025 Mona](https://github.com/Leiyi-Hu/mona)的Mona和[TransMamba](https://github.com/sunshangquan/TransMamba)的SpectralEnhancedFFN改进C2PSA. 71. ultralytics/cfg/models/11/yolo11-C2TSSA-DYT-Mona-EDFFN.yaml 使用[CVPR2025 DyT](https://github.com/jiachenzhu/DyT)中的DynamicTan和[ICLR2025 Token Statistics Transformer](https://github.com/RobinWu218/ToST)中的Token Statistics Self-Attention和[CVPR2025 Mona](https://github.com/Leiyi-Hu/mona)的Mona和[CVPR2025 EVSSM](https://github.com/kkkls/EVSSM)中的EDFFN改进C2PSA. 72. 
ultralytics/cfg/models/11/yolo11-C3k2-MambaOut-FDConv.yaml 使用[CVPR2025 Frequency Dynamic Convolution for Dense Image Prediction](https://github.com/Linwei-Chen/FDConv)的FDConv与[CVPR2025 MambaOut](https://github.com/yuweihao/MambaOut)中的MambaOutBlock二次创新后改进C3k2. 73. ultralytics/cfg/models/11/yolo11-C3k2-PFDConv.yaml 使用[FasterNet CVPR2023](https://github.com/JierunChen/FasterNet)中的PConv与[CVPR2025 Frequency Dynamic Convolution for Dense Image Prediction](https://github.com/Linwei-Chen/FDConv)的FDConv二次创新后改进C3k2. 74. ultralytics/cfg/models/11/yolo11-C3k2-FasterFD.yaml 使用[FasterNet CVPR2023](https://github.com/JierunChen/FasterNet)中的FasterBlock与[CVPR2025 Frequency Dynamic Convolution for Dense Image Prediction](https://github.com/Linwei-Chen/FDConv)的FDConv二次创新后改进C3k2. 75. ultralytics/cfg/models/11/yolo11-C3k2-MambaOut-DSA.yaml 使用[DSA: Deformable Spatial Attention](https://www.techrxiv.org/users/628671/articles/775010-deformable-spatial-attention-networks-enhancing-lightweight-convolutional-models-for-vision-tasks)中的Deformable Spatial Attention Block与[CVPR2025 MambaOut](https://github.com/yuweihao/MambaOut)中的MambaOutBlock二次创新后改进C3k2. 76. ultralytics/cfg/models/11/yolo11-C3k2-DSAN-EDFFN.yaml 使用[DSA: Deformable Spatial Attention](https://www.techrxiv.org/users/628671/articles/775010-deformable-spatial-attention-networks-enhancing-lightweight-convolutional-models-for-vision-tasks)中的Deformable Spatial Attention Block和[CVPR2025 EVSSM](https://github.com/kkkls/EVSSM)中的EDFFN进行二次创新后改进C3k2. 77. ultralytics/cfg/models/11/yolo11-SOEP-RFPN.yaml 使用[ECCV2024 rethinking-fpn](https://github.com/AlanLi1997/rethinking-fpn)的SNI和GSConvE对原创改进SOEP再次创新. 78. ultralytics/cfg/models/11/yolo11-SOEP-MFM.yaml 使用[CVPR2024 DCMPNet](https://github.com/zhoushen1/DCMPNet)中的MFM对原创改进SOEP再次创新. 79. ultralytics/cfg/models/11/yolo11-SOEP-RFPN-MFM.yaml 使用[ECCV2024 rethinking-fpn](https://github.com/AlanLi1997/rethinking-fpn)的SNI和GSConvE和[CVPR2024 DCMPNet](https://github.com/zhoushen1/DCMPNet)中的MFM对原创改进SOEP再次创新. 
80. ultralytics/cfg/models/11/yolo11-C3k2-MambaOut-SFSC.yaml 使用[CVPR2024 SFSConv](https://github.com/like413/SFS-Conv)的SFSConv与[CVPR2025 MambaOut](https://github.com/yuweihao/MambaOut)中的MambaOutBlock二次创新后改进C3k2. 81. ultralytics/cfg/models/11/yolo11-C3k2-PSFSConv.yaml 使用[FasterNet CVPR2023](https://github.com/JierunChen/FasterNet)中的PConv与[CVPR2024 SFSConv](https://github.com/like413/SFS-Conv)的SFSConv二次创新后改进C3k2. 82. ultralytics/cfg/models/11/yolo11-C3k2-FasterSFSC.yaml 使用[FasterNet CVPR2023](https://github.com/JierunChen/FasterNet)中的FasterBlock与[CVPR2024 SFSConv](https://github.com/like413/SFS-Conv)的SFSConv二次创新后改进C3k2. 83. ultralytics/cfg/models/11/yolo11-SOEP-PST.yaml 使用[Pyramid Sparse Transformer](https://arxiv.org/abs/2505.12772)中的Pyramid Sparse Transformer对原创改进SOEP进行创新. 84. ultralytics/cfg/models/11/yolo11-C3k2-SHSA-EPGO.yaml 使用[ACM MM 2025 CPRAformer](https://github.com/zs1314/CPRAformer)中的EPGO改进[SHViT CVPR2024](https://github.com/ysj9909/SHViT)中的SHSABlock. 85. ultralytics/cfg/models/11/yolo11-C3k2-SHSA-EPGO-CGLU.yaml 使用[SHViT CVPR2024](https://github.com/ysj9909/SHViT)中的SHSABlock与[TransNeXt CVPR2024](https://github.com/DaiShiResearch/TransNeXt)中的CGLU与[ACM MM 2025 CPRAformer](https://github.com/zs1314/CPRAformer)中的EPGO联合创新. 86. ultralytics/cfg/models/11/yolo11-MAN-GCConv.yaml 使用[CVPR2025 Golden Cudgel Network](https://github.com/gyyang23/GCNet)中的GCConv改进[Hyper-YOLO TPAMI2025](https://www.arxiv.org/pdf/2408.04804)中的Mixed Aggregation Network. ### 自研系列 1. ultralytics/cfg/models/11/yolo11-LAWDS.yaml Light Adaptive-weight downsampling.自研模块,具体讲解请看百度云链接中的视频. 2. ultralytics/cfg/models/11/yolo11-C3k2-EMSC.yaml Efficient Multi-Scale Conv.自研模块,具体讲解请看百度云链接中的视频. 3. ultralytics/cfg/models/11/yolo11-C3k2-EMSCP.yaml Efficient Multi-Scale Conv Plus.自研模块,具体讲解请看百度云链接中的视频. 4. Lightweight Shared Convolutional Detection Head 自研轻量化检测头. 
detect:ultralytics/cfg/models/11/yolo11-LSCD.yaml seg:ultralytics/cfg/models/11/yolo11-seg-LSCD.yaml pose:ultralytics/cfg/models/11/yolo11-pose-LSCD.yaml obb:ultralytics/cfg/models/11/yolo11-obb-LSCD.yaml 1. GroupNorm在FCOS论文中已经证实可以提升检测头定位和分类的性能. 2. 通过使用共享卷积,可以大幅减少参数数量,这使得模型更轻便,特别是在资源受限的设备上. 3. 在使用共享卷积的同时,为了应对每个检测头所检测的目标尺度不一致的问题,使用Scale层对特征进行缩放. 综合以上,我们可以让检测头做到参数量更少、计算量更少的情况下,尽可能减少精度的损失. 5. Task Align Dynamic Detection Head 自研任务对齐动态检测头. detect:ultralytics/cfg/models/11/yolo11-TADDH.yaml seg:ultralytics/cfg/models/11/yolo11-seg-TADDH.yaml pose:ultralytics/cfg/models/11/yolo11-pose-TADDH.yaml obb:ultralytics/cfg/models/11/yolo11-obb-TADDH.yaml 1. GroupNorm在FCOS论文中已经证实可以提升检测头定位和分类的性能. 2. 通过使用共享卷积,可以大幅减少参数数量,这使得模型更轻便,特别是在资源受限的设备上.并且在使用共享卷积的同时,为了应对每个检测头所检测的目标尺度不一致的问题,使用Scale层对特征进行缩放. 3. 参照TOOD的思想,除了标签分配策略上的任务对齐,我们也在检测头上进行定制任务对齐的结构,现有的目标检测器头部通常使用独立的分类和定位分支,这会导致两个任务之间缺乏交互,TADDH通过特征提取器从多个卷积层中学习任务交互特征,得到联合特征,定位分支使用DCNV2和交互特征生成DCNV2的offset和mask,分类分支使用交互特征进行动态特征选择. 6. ultralytics/cfg/models/11/yolo11-FDPN.yaml 自研特征聚焦扩散金字塔网络(Focusing Diffusion Pyramid Network) 1. 通过定制的特征聚焦模块与特征扩散机制,能让每个尺度的特征都具有详细的上下文信息,更有利于后续目标的检测与分类。 2. 定制的特征聚焦模块可以接受三个尺度的输入,其内部包含一个Inception-Style的模块,其利用一组并行深度卷积来捕获丰富的跨多个尺度的信息。 3. 通过扩散机制使具有丰富的上下文信息的特征进行扩散到各个检测尺度. 7. ultralytics/cfg/models/11/yolo11-FDPN-DASI.yaml 使用[HCFNet](https://github.com/zhengshuchen/HCFNet)中的Dimension-Aware Selective Integration Module对自研的Focusing Diffusion Pyramid Network再次创新. 8. ultralytics/cfg/models/11/yolo11-RGCSPELAN.yaml 自研RepGhostCSPELAN. 1. 参考GhostNet中的思想(主流CNN计算的中间特征映射存在广泛的冗余),采用廉价的操作生成一部分冗余特征图,以此来降低计算量和参数量。 2. 舍弃yolov5与yolo11中常用的BottleNeck,为了弥补舍弃残差块所带来的性能损失,在梯度流通分支上使用RepConv,以此来增强特征提取和梯度流通的能力,并且RepConv可以在推理的时候进行融合,一举两得。 3. 可以通过缩放因子控制RGCSPELAN的大小,使其可以兼顾小模型和大模型。 9. Lightweight Shared Convolutional Separamter BN Detection Head 基于自研轻量化检测头上,参考NASFPN的设计思路把GN换成BN,并且BN层参数不共享. 
detect:ultralytics/cfg/models/11/yolo11-LSCSBD.yaml seg:ultralytics/cfg/models/11/yolo11-seg-LSCSBD.yaml pose:ultralytics/cfg/models/11/yolo11-pose-LSCSBD.yaml obb:ultralytics/cfg/models/11/yolo11-obb-LSCSBD.yaml 1. 由于不同层级之间特征的统计量仍存在差异,Normalization layer依然是必须的,由于直接在共享参数的检测头中引入BN会导致其滑动平均值产生误差,而引入 GN 又会增加推理时的开销,因此我们参考NASFPN的做法,让检测头共享卷积层,而BN则分别独立计算。 10. ultralytics/cfg/models/11/yolo11-EIEStem.yaml 1. 通过SobelConv分支,可以提取图像的边缘信息。由于Sobel滤波器可以检测图像中强度的突然变化,因此可以很好地捕捉图像的边缘特征。这些边缘特征在许多计算机视觉任务中都非常重要,例如图像分割和物体检测。 2. EIEStem模块还结合空间信息,除了边缘信息,EIEStem还通过池化分支提取空间信息,保留重要的空间信息。结合边缘信息和空间信息,可以帮助模型更好地理解图像内容。 3. 通过3D组卷积高效实现Sobel算子。 11. ultralytics/cfg/models/11/yolo11-C3k2-EIEM.yaml 提出了一种新的EIEStem模块,旨在作为图像识别任务中的高效前端模块。该模块结合了提取边缘信息的SobelConv分支和提取空间信息的卷积分支,能够学习到更加丰富的图像特征表示。 1. 边缘信息学习: 卷积神经网络 (CNN)通常擅长学习空间信息,但是对于提取图像中的边缘信息可能稍显不足。EIEStem 模块通过SobelConv分支,显式地提取图像的边缘特征。Sobel滤波器是一种经典的边缘检测滤波器,可以有效地捕捉图像中强度的突然变化,从而获得重要的边缘信息。 2. 空间信息保留: 除了边缘信息,图像中的空间信息也同样重要。EIEStem模块通过一个额外的卷积分支 (conv_branch) 来提取空间信息。与SobelCon 分支不同,conv_branch提取的是原始图像的特征,可以保留丰富的空间细节。 3. 特征融合: EIEStem模块将来自SobelConv分支和conv_branch提取的特征进行融合 (concatenate)。 这种融合操作使得学习到的特征表示既包含了丰富的边缘信息,又包含了空间信息,能够更加全面地刻画图像内容。 12. ultralytics/cfg/models/11/yolo11-ContextGuideFPN.yaml Context Guide Fusion Module(CGFM)是一个创新的特征融合模块,旨在改进YOLO11中的特征金字塔网络(FPN)。该模块的设计考虑了多尺度特征融合过程中上下文信息的引导和自适应调整。 1. 上下文信息的有效融合:通过SE注意力机制,模块能够在特征融合过程中捕捉并利用重要的上下文信息,从而增强特征表示的有效性,并有效引导模型学习检测目标的信息,从而提高模型的检测精度。 2. 特征增强:通过权重化的特征重组操作,模块能够增强重要特征,同时抑制不重要特征,提升特征图的判别能力。 3. 简单高效:模块结构相对简单,不会引入过多的计算开销,适合在实时目标检测任务中应用。 这期视频讲解在B站:https://www.bilibili.com/video/BV1Vx4y1n7hZ/ 13. ultralytics/cfg/models/11/yolo11-LSDECD.yaml 基于自研轻量化检测头上(LSCD),使用detail-enhanced convolution进一步改进,提高检测头的细节捕获能力,进一步改善检测精度. detect:ultralytics/cfg/models/11/yolo11-LSDECD.yaml segment:ultralytics/cfg/models/11/yolo11-seg-LSDECD.yaml pose:ultralytics/cfg/models/11/yolo11-pose-LSDECD.yaml obb:ultralytics/cfg/models/11/yolo11-obb-LSDECD.yaml 1. 
DEA-Net中设计了一个细节增强卷积(DEConv),具体来说DEConv将先验信息整合到普通卷积层,以增强表征和泛化能力。然后,通过使用重参数化技术,DEConv等效地转换为普通卷积,不需要额外的参数和计算成本。 14. ultralytics/cfg/models/11/yolo11-C3k2-SMPCGLU.yaml Self-moving Point Convolutional GLU模型改进C3k2. SMP来源于[CVPR2023-SMPConv](https://github.com/sangnekim/SMPConv),Convolutional GLU来源于[TransNeXt CVPR2024](https://github.com/DaiShiResearch/TransNeXt). 1. 普通的卷积在面对数据中的多样性和复杂性时,可能无法捕捉到有效的特征,因此我们采用了SMPConv,其具备最新的自适应点移动机制,从而更好地捕捉局部特征,提高特征提取的灵活性和准确性。 2. 在SMPConv后添加CGLU,Convolutional GLU 结合了卷积和门控机制,能够选择性地通过信息通道,提高了特征提取的有效性和灵活性。 15. Re-CalibrationFPN 为了加强浅层和深层特征的相互交互能力,推出重校准特征金字塔网络(Re-CalibrationFPN). P2345:ultralytics/cfg/models/11/yolo11-ReCalibrationFPN-P2345.yaml(带有小目标检测头的ReCalibrationFPN) P345:ultralytics/cfg/models/11/yolo11-ReCalibrationFPN-P345.yaml P3456:ultralytics/cfg/models/11/yolo11-ReCalibrationFPN-P3456.yaml(带有大目标检测头的ReCalibrationFPN) 1. 浅层语义较少,但细节丰富,有更明显的边界和减少失真。此外,深层蕴藏着丰富的物质语义信息。因此,直接融合低级具有高级特性的特性可能导致冗余和不一致。为了解决这个问题,我们提出了SBA模块,它有选择地聚合边界信息和语义信息来描绘更细粒度的物体轮廓和重新校准物体的位置。 2. 相比传统的FPN结构,SBA模块引入了高分辨率和低分辨率特征之间的双向融合机制,使得特征之间的信息传递更加充分,进一步提升了多尺度特征融合的效果。 3. SBA模块通过自适应的注意力机制,根据特征图的不同分辨率和内容,自适应地调整特征的权重,从而更好地捕捉目标的多尺度特征。 16. ultralytics/cfg/models/11/yolo11-CSP-PTB.yaml Cross Stage Partial - Partially Transformer Block 在计算机视觉任务中,Transformer结构因其强大的全局特征提取能力而受到广泛关注。然而,由于Transformer结构的计算复杂度较高,直接将其应用于所有通道会导致显著的计算开销。为了在保证高效特征提取的同时降低计算成本,我们设计了一种混合结构,将输入特征图分为两部分,分别由CNN和Transformer处理,结合了卷积神经网络(CNN)和Transformer机制的模块,旨在增强特征提取的能力。 我们提出了一种名为CSP_PTB(Cross Stage Partial - Partially Transformer Block)的模块,旨在结合CNN和Transformer的优势,通过对输入通道进行部分分配来优化计算效率和特征提取能力。 1. 融合局部和全局特征:多项研究表明,CNN的感受野大小较少,导致其只能提取局部特征,但Transformer的MHSA能够提取全局特征,能够同时利用两者的优势。 2. 保证高效特征提取的同时降低计算成本:为了能引入Transformer结构来提取全局特征又不想大幅度增加计算复杂度,因此提出Partially Transformer Block,只对部分通道使用TransformerBlock。 3. MHSA_CGLU包含Mutil-Head-Self-Attention和[ConvolutionalGLU(TransNext CVPR2024)](https://github.com/DaiShiResearch/TransNeXt),其中Mutil-Head-Self-Attention负责提取全局特征,ConvolutionalGLU用于增强非线性特征表达能力,ConvolutionalGLU相比于传统的FFN,具有更强的性能。 4. 
可以根据不同的模型大小和具体的运行情况调节用于Transformer的通道数。 17. ultralytics/cfg/models/11/yolo11-SOEP.yaml 小目标在正常的P3、P4、P5检测层上略显吃力,比较传统的做法是加上P2检测层来提升小目标的检测能力,但是同时也会带来一系列的问题,例如加上P2检测层后计算量过大、后处理更加耗时等问题,因此需要开发新的针对小目标有效的特征金字塔,我们基于原本的PAFPN上进行改进,提出SmallObjectEnhancePyramid,相对于传统的添加P2检测层,我们使用P2特征层经过SPDConv得到富含小目标信息的特征给到P3进行融合,然后使用CSP思想和基于[AAAI2024的OmniKernel](https://ojs.aaai.org/index.php/AAAI/article/view/27907)进行改进得到CSP-OmniKernel进行特征整合,OmniKernel模块由三个分支组成,即全局分支、大分支和局部分支,以有效地学习从全局到局部的特征表征,从而最终提高小目标的检测性能。(该模块需要在train.py中关闭amp、且在ultralytics/engine/validator.py 115行附近的self.args.half设置为False、跑其余改进记得修改回去!) 出现这个报错的:RuntimeError: cuFFT error: CUFFT_INTERNAL_ERROR,如果你是40系显卡,需要更新torch大于2.0,并且cuda大于12.0. 18. ultralytics/cfg/models/11/yolo11-CGRFPN.yaml Context-Guided Spatial Feature Reconstruction Feature Pyramid Network. 1. 借鉴[ECCV2024-CGRSeg](https://github.com/nizhenliang/CGRSeg)中的Rectangular Self-Calibration Module经过精心设计,用于空间特征重建和金字塔上下文提取,它在水平和垂直方向上捕获全局上下文,并获得轴向全局上下文来显式地建模矩形关键区域. 2. PyramidContextExtraction Module使用金字塔上下文提取模块(PyramidContextExtraction),有效整合不同层级的特征信息,提升模型的上下文感知能力。 3. FuseBlockMulti 和 DynamicInterpolationFusion 这些模块用于多尺度特征的融合,通过动态插值和多特征融合,进一步提高了模型的多尺度特征表示能力和提升模型对复杂背景下目标的识别能力。 19. ultralytics/cfg/models/11/yolo11-FeaturePyramidSharedConv.yaml 1. 多尺度特征提取 通过使用不同膨胀率的卷积层,模块能够提取不同尺度的特征。这对捕捉图像中不同大小和不同上下文的信息非常有利。 低膨胀率捕捉局部细节,高膨胀率捕捉全局上下文。 2. 参数共享 使用共享的卷积层 self.share_conv,大大减少了需要训练的参数数量。相比于每个膨胀率使用独立的卷积层,共享卷积层能够减少冗余,提升模型效率。 减少了模型的存储和计算开销,提升了计算效率。 3. 高效的通道变换 通过1x1卷积层 self.cv1 和 self.cv2,模块能够高效地调整通道数,并进行特征融合。1x1卷积层在减少参数量的同时还能保留重要的特征信息。 4. 更细粒度的特征提取 FeaturePyramidSharedConv 使用卷积操作进行特征提取,能够捕捉更加细粒度的特征。相比之下,SPPF 的池化操作可能会丢失一些细节信息。 卷积操作在特征提取时具有更高的灵活性和表达能力,可以更好地捕捉图像中的细节和复杂模式。 20. APT(Adaptive Power Transformation)-TAL. 为了使不同gt预测对的匹配质量和损失权重更具鉴别性,我们通过自定义的PowerTransformer显著增强高质量预测框的权重,抑制低质量预测框的影响,并使模型在学习的过程可以更关注质量高的预测框。 21. 
ultralytics/cfg/models/11/yolo11-EMBSFPN.yaml 基于BIFPN、[MAF-YOLO](https://arxiv.org/pdf/2407.04381)、[CVPR2024 EMCAD](https://github.com/SLDGroup/EMCAD)提出全新的Efficient Multi-Branch&Scale FPN. Efficient Multi-Branch&Scale FPN拥有<轻量化>、<多尺度特征加权融合>、<多尺度高效卷积模块>、<高效上采样模块>、<全局异构核选择机制>。 1. 具有多尺度高效卷积模块和全局异构核选择机制,Trident网络的研究表明,具有较大感受野的网络更适合检测较大的物体,反之,较小尺度的目标则从较小的感受野中受益,因此我们在FPN阶段,对于不同尺度的特征层选择不同的多尺度卷积核以适应并逐步获得多尺度感知场信息。 2. 借鉴BIFPN中的多尺度特征加权融合,能把Concat换成Add来减少参数量和计算量的情况下,还能通过不同尺度特征的重要性进行自适用选择加权融合。 3. 高效上采样模块来源于CVPR2024-EMCAD中的EUCB,能够在保证一定效果的同时保持高效性。 22. ultralytics/cfg/models/11/yolo11-CSP-PMSFA.yaml 自研模块:CSP-Partial Multi-Scale Feature Aggregation. 1. 部分多尺度特征提取:参考CVPR2020-GhostNet、CVPR2024-FasterNet的思想,采用高效的PartialConv,该模块能够从输入中提取多种尺度的特征信息,但它并不是在所有通道上进行这种操作,而是部分(Partial)地进行,从而提高了计算效率。 2. 增强的特征融合: 最后的 1x1 卷积层通过将不同尺度的特征融合在一起,同时使用残差连接将输入特征与处理后的特征相加,有效保留了原始信息并引入了新的多尺度信息,从而提高模型的表达能力。 23. ultralytics/cfg/models/11/yolo11-MutilBackbone-DAF.yaml 自研MutilBackbone-DynamicAlignFusion. 1. 为了避免在浅层特征图上消耗过多计算资源,设计的MutilBackbone共享一个stem的信息,这个设计有利于避免计算量过大,推理时间过大的问题。 2. 为了避免不同Backbone信息融合出现不同来源特征之间的空间差异,我们为此设计了DynamicAlignFusion,其先通过融合来自两个不同模块学习到的特征,然后生成一个名为DynamicAlignWeight去调整各自的特征,最后使用一个可学习的通道权重,其可以根据输入特征动态调整两条路径的权重,从而增强模型对不同特征的适应能力。 24. ultralytics/cfg/models/11/yolo11-C3k2-MutilScaleEdgeInformationEnhance.yaml 自研CSP-MutilScaleEdgeInformationEnhance. MutilScaleEdgeInformationEnhance模块结合了多尺度特征提取、边缘信息增强和卷积操作。它的主要目的是从不同尺度上提取特征,突出边缘信息,并将这些多尺度特征整合到一起,最后通过卷积层输出增强的特征。这个模块在特征提取和边缘增强的基础上有很好的表征能力. 1. 多尺度特征提取:通过 nn.AdaptiveAvgPool2d 进行多尺度的池化,提取不同大小的局部信息,有助于捕捉图像的多层次特征。 2. 边缘增强:EdgeEnhancer 模块专门用于提取边缘信息,使得网络对边缘的敏感度增强,这对许多视觉任务(如目标检测、语义分割等)有重要作用。 3. 特征融合:将不同尺度下提取的特征通过插值操作对齐到同一尺度,然后将它们拼接在一起,最后经过卷积层融合成统一的特征表示,能够提高模型对多尺度特征的感知。 25. ultralytics/cfg/models/11/yolo11-CSP-FreqSpatial.yaml FreqSpatial 是一个融合时域和频域特征的卷积神经网络(CNN)模块。该模块通过在时域和频域中提取特征,旨在捕捉不同层次的空间和频率信息,以增强模型在处理图像数据时的鲁棒性和表示能力。模块的主要特点是将 Scharr 算子(用于边缘检测)与 时域卷积 和 频域卷积 结合,通过多种视角捕获图像的结构特征。 1. 时域特征提取:从原始图像中提取出基于空间结构的特征,主要捕捉图像的细节、边缘信息等。 2. 
频域特征提取:从频率域中提取出频率相关的模式,捕捉到图像的低频和高频成分,能够帮助模型在全局和局部的尺度上提取信息。 3. 特征融合:将时域和频域的特征进行加权相加,得到最终的输出特征图。这种加权融合允许模型同时考虑空间结构信息和频率信息,从而增强模型在多种场景下的表现能力。 26. ultralytics/cfg/models/11/yolo11-C3k2-MutilScaleEdgeInformationSelect.yaml 基于自研CSP-MutilScaleEdgeInformationEnhance再次创新. 我们提出了一个 多尺度边缘信息选择模块(MutilScaleEdgeInformationSelect),其目的是从多尺度边缘信息中高效选择与目标任务高度相关的关键特征。为了实现这一目标,我们引入了一个具有通过聚焦更重要的区域能力的注意力机制[ICCV2023 DualDomainSelectionMechanism, DSM](https://github.com/c-yn/FocalNet)。该机制通过聚焦图像中更重要的区域(如复杂边缘和高频信号区域),在多尺度特征中自适应地筛选具有更高任务相关性的特征,从而显著提升了特征选择的精准度和整体模型性能。 27. GlobalEdgeInformationTransfer 实现版本1:ultralytics/cfg/models/11/yolo11-GlobalEdgeInformationTransfer1.yaml 实现版本2:ultralytics/cfg/models/11/yolo11-GlobalEdgeInformationTransfer2.yaml 实现版本3:ultralytics/cfg/models/11/yolo11-GlobalEdgeInformationTransfer3.yaml 众所周知,物体框的定位非常之依赖物体的边缘信息,但是对于常规的目标检测网络来说,没有任何组件能提高网络对物体边缘信息的关注度,我们需要开发一个能让边缘信息融合到各个尺度所提取的特征中,因此我们提出一个名为GlobalEdgeInformationTransfer(GEIT)的模块,其可以帮助我们把浅层特征中提取到的边缘信息传递到整个backbone上,并与不同尺度的特征进行融合。 1. 由于原始图像中含有大量背景信息,因此从原始图像上直接提取边缘信息传递到整个backbone上会给网络的学习带来噪声,而且浅层的卷积层会帮助我们过滤不必要的背景信息,因此我们选择在网络的浅层开发一个名为MutilScaleEdgeInfoGenetator的模块,其会利用网络的浅层特征层去生成多个尺度的边缘信息特征图并投放到主干的各个尺度中进行融合。 2. 对于下采样方面的选择,我们需要较为谨慎,我们的目标是保留并增强边缘信息,同时进行下采样,选择MaxPool 会更合适。它能够保留局部区域的最强特征,更好地体现边缘信息。因为 AvgPool 更适用于需要平滑或均匀化特征的场景,但在保留细节和边缘信息方面的表现不如 MaxPool。 3. 对于融合部分,ConvEdgeFusion巧妙地结合边缘信息和普通卷积特征,提出了一种新的跨通道特征融合方式。首先,使用conv_channel_fusion进行边缘信息与普通卷积特征的跨通道融合,帮助模型更好地整合不同来源的特征。然后采用conv_3x3_feature_extract进一步提取融合后的特征,以增强模型对局部细节的捕捉能力。最后通过conv_1x1调整输出特征维度。 28. ultralytics/cfg/models/11/yolo11-C3k2-DIMB.yaml 自研模块DynamicInceptionDWConv2d.(更详细点说明看项目的配置文件.md) 29. ultralytics/cfg/models/11/yolo11-HAFB-1.yaml 自研模块Hierarchical Attention Fusion Block, HAFB.(更详细点说明看项目的配置文件.md) 30. ultralytics/cfg/models/11/yolo11-HAFB-2.yaml 自研模块Hierarchical Attention Fusion Block, HAFB.(更详细点说明看项目的配置文件.md) 31. ultralytics/cfg/models/11/yolo11-MutilBackbone-HAFB.yaml 在yolo11-MutilBackbone-DAF.yaml的自研创新上引入HAFB. ### BackBone系列 1. 
ultralytics/cfg/models/11/yolo11-efficientViT.yaml (CVPR2023)efficientViT替换yolo11主干. 2. ultralytics/cfg/models/11/yolo11-fasternet.yaml (CVPR2023)fasternet替换yolo11主干. 3. ultralytics/cfg/models/11/yolo11-timm.yaml 使用timm支持的主干网络替换yolo11主干. 4. ultralytics/cfg/models/11/yolo11-convnextv2.yaml 使用convnextv2网络替换yolo11主干. 5. ultralytics/cfg/models/11/yolo11-EfficientFormerV2.yaml 使用EfficientFormerV2网络替换yolo11主干.(需要看[常见错误和解决方案的第五点](#a)) 6. ultralytics/cfg/models/11/yolo11-vanillanet.yaml vanillanet替换yolo11主干. 7. ultralytics/cfg/models/11/yolo11-LSKNet.yaml LSKNet(2023旋转目标检测SOTA的主干)替换yolo11主干. 8. ultralytics/cfg/models/11/yolo11-swintransformer.yaml SwinTransformer-Tiny替换yolo11主干. 9. ultralytics/cfg/models/11/yolo11-repvit.yaml [RepViT](https://github.com/THU-MIG/RepViT/tree/main)替换yolo11主干. 10. ultralytics/cfg/models/11/yolo11-CSwinTransformer.yaml 使用[CSWin-Transformer(CVPR2022)](https://github.com/microsoft/CSWin-Transformer/tree/main)替换yolo11主干.(需要看[常见错误和解决方案的第五点](#a)) 11. ultralytics/cfg/models/11/yolo11-HGNetV2.yaml 使用HGNetV2作为YOLO11的backbone. 12. ultralytics/cfg/models/11/yolo11-unireplknet.yaml 使用[UniRepLKNet](https://github.com/AILab-CVC/UniRepLKNet/tree/main)替换yolo11主干. 13. ultralytics/cfg/models/11/yolo11-TransNeXt.yaml 使用[TransNeXt](https://github.com/DaiShiResearch/TransNeXt)改进yolo11的backbone.(需要看[常见错误和解决方案的第五点](#a)) 14. ultralytics/cfg/models/rt-detr/yolo11-rmt.yaml 使用[CVPR2024 RMT](https://arxiv.org/abs/2309.11523)改进rtdetr的主干. 15. ultralytics/cfg/models/11/yolo11-pkinet.yaml 使用[CVPR2024 PKINet](https://github.com/PKINet/PKINet)改进backbone.(需要安装mmcv和mmengine) 16. ultralytics/cfg/models/11/yolo11-mobilenetv4.yaml 使用[MobileNetV4](https://github.com/jaiwei98/MobileNetV4-pytorch/tree/main)改进yolo11-backbone. 17. ultralytics/cfg/models/11/yolo11-starnet.yaml 使用[StarNet CVPR2024](https://github.com/ma-xu/Rewrite-the-Stars/tree/main)改进yolo11-backbone. 18. 
ultralytics/cfg/models/11/yolo11-inceptionnext.yaml 使用[InceptionNeXt CVPR2024](https://github.com/sail-sg/inceptionnext)替换backbone. 19. ultralytics/cfg/models/11/yolo11-mambaout.yaml 使用[CVPR2025 MambaOut](https://github.com/yuweihao/MambaOut)中的MambaOut替换BackBone. 20. ultralytics/cfg/models/11/yolo11-MobileMamba.yaml 使用[CVPR2025 MobileMamba](https://github.com/lewandofskee/MobileMamba)中的MobileMamba改进Backbone. 21. ultralytics/cfg/models/11/yolo11-overlock.yaml 使用[CVPR2025 OverLock](https://arxiv.org/pdf/2502.20087)中的overlock-backbone替换backbone. 22. ultralytics/cfg/models/11/yolo11-lsnet.yaml 使用[CVPR2025 LSNet](https://github.com/THU-MIG/lsnet)的LSNet替换yolo11-backbone. 23. ultralytics/cfg/models/11/yolo11-ESMoE.yaml 使用[YOLO-Master](https://github.com/isLinXu/YOLO-Master)中的ES-MoE模块改进Yolo11. 24. ultralytics/cfg/models/11/yolo11-FAENet.yaml 使用[TGRS2025 MASFNet](https://ieeexplore.ieee.org/document/10955257)中的FAENet增强输入图像的特征. ### SPPF系列 1. ultralytics/cfg/models/11/yolo11-FocalModulation.yaml 使用[Focal Modulation](https://github.com/microsoft/FocalNet)替换SPPF. 2. ultralytics/cfg/models/11/yolo11-SPPF-LSKA.yaml 使用[LSKA](https://github.com/StevenLauHKHK/Large-Separable-Kernel-Attention)注意力机制改进SPPF,增强多尺度特征提取能力. 3. ultralytics/cfg/models/11/yolo11-AIFI.yaml 使用[RT-DETR](https://arxiv.org/pdf/2304.08069.pdf)中的Attention-based Intrascale Feature Interaction(AIFI)改进yolo11. 4. ultralytics/cfg/models/11/yolo11-AIFIRepBN.yaml 使用[ICML-2024 SLAB](https://github.com/xinghaochen/SLAB)中的RepBN改进AIFI. ### Neck系列 1. ultralytics/cfg/models/11/yolo11-bifpn.yaml 添加BIFPN到yolo11中. 其中BIFPN中有三个可选参数: 1. Fusion 其中BIFPN中的Fusion模块支持五种: weight, adaptive, concat, bifpn(default), SDI 其中weight, adaptive, concat出自[paper链接-Figure 3](https://openreview.net/pdf?id=q2ZaVU6bEsT), SDI出自[U-NetV2](https://github.com/yaoppeng/U-Net_v2) 2. node_mode 支持大部分C3k2-XXX结构. 3. head_channel BIFPN中的通道数,默认设置为256. 2. ultralytics/cfg/models/11/yolo11-slimneck.yaml 使用VoVGSCSP\VoVGSCSPC和GSConv替换yolo11 neck中的C3k2和Conv. 3. 
Asymptotic Feature Pyramid Network[reference](https://github.com/gyyang23/AFPN/tree/master) a. ultralytics/cfg/models/11/yolo11-AFPN-P345.yaml b. ultralytics/cfg/models/11/yolo11-AFPN-P345-Custom.yaml c. ultralytics/cfg/models/11/yolo11-AFPN-P2345.yaml d. ultralytics/cfg/models/11/yolo11-AFPN-P2345-Custom.yaml 其中Custom中的block支持大部分C3k2-XXX结构. 4. ultralytics/cfg/models/11/yolo11-RCSOSA.yaml 使用[RCS-YOLO](https://github.com/mkang315/RCS-YOLO/tree/main)中的RCSOSA替换C3k2. 5. ultralytics/cfg/models/11/yolo11-goldyolo.yaml 利用华为2023最新GOLD-YOLO中的Gather-and-Distribute改进特征融合模块. 6. ultralytics/cfg/models/11/yolo11-GFPN.yaml 使用[DAMO-YOLO](https://github.com/tinyvision/DAMO-YOLO)中的RepGFPN改进Neck. 7. ultralytics/cfg/models/11/yolo11-EfficientRepBiPAN.yaml 使用[YOLOV6](https://github.com/meituan/YOLOv6/tree/main)中的EfficientRepBiPAN改进Neck. 8. ultralytics/cfg/models/11/yolo11-ASF.yaml 使用[ASF-YOLO](https://github.com/mkang315/ASF-YOLO)中的Attentional Scale Sequence Fusion改进yolo11. 9. ultralytics/cfg/models/11/yolo11-SDI.yaml 使用[U-NetV2](https://github.com/yaoppeng/U-Net_v2)中的 Semantics and Detail Infusion Module对yolo11中的feature fusion部分进行重设计. 10. ultralytics/cfg/models/11/yolo11-HSFPN.yaml 使用[MFDS-DETR](https://github.com/JustlfC03/MFDS-DETR)中的HS-FPN改进yolo11的neck. 11. ultralytics/cfg/models/11/yolo11-CSFCN.yaml 使用[Context and Spatial Feature Calibration for Real-Time Semantic Segmentation](https://github.com/kaigelee/CSFCN/tree/main)中的Context and Spatial Feature Calibration模块改进yolo11. 12. ultralytics/cfg/models/11/yolo11-CGAFusion.yaml 使用[DEA-Net](https://github.com/cecret3350/DEA-Net)中的content-guided attention fusion改进yolo11-neck. 13. ultralytics/cfg/models/11/yolo11-SDFM.yaml 使用[PSFusion](https://github.com/Linfeng-Tang/PSFusion)中的superficial detail fusion module改进yolo11-neck. 14. ultralytics/cfg/models/11/yolo11-PSFM.yaml 使用[PSFusion](https://github.com/Linfeng-Tang/PSFusion)中的profound semantic fusion module改进yolo11-neck. 15.
ultralytics/cfg/models/11/yolo11-GLSA.yaml 使用[GLSA](https://github.com/Barrett-python/DuAT)模块改进yolo11的neck. 16. ultralytics/cfg/models/11/yolo11-CTrans.yaml 使用[[AAAI2022] UCTransNet](https://github.com/McGregorWwww/UCTransNet/tree/main)中的ChannelTransformer改进yolo11-neck.(需要看[常见错误和解决方案的第五点](#a)) 17. ultralytics/cfg/models/11/yolo11-p6-CTrans.yaml 使用[[AAAI2022] UCTransNet](https://github.com/McGregorWwww/UCTransNet/tree/main)中的ChannelTransformer改进yolo11-neck.(带有p6版本)(需要看[常见错误和解决方案的第五点](#a)) 18. ultralytics/cfg/models/11/yolo11-MAFPN.yaml 使用[MAF-YOLO](https://arxiv.org/pdf/2407.04381)的MAFPN改进Neck. 19. ultralytics/cfg/models/11/yolo11-hyper.yaml 使用[Hyper-YOLO](https://www.arxiv.org/pdf/2408.04804)中的Hypergraph Computation in Semantic Space改进yolov11. 20. ultralytics/cfg/models/11/yolo11-msga.yaml 使用[MSA^2 Net](https://github.com/xmindflow/MSA-2Net)中的Multi-Scale Adaptive Spatial Attention Gate改进yolo11-neck. 21. ultralytics/cfg/models/11/yolo11-WFU.yaml 使用[ACMMM2024 WFEN](https://github.com/PRIS-CV/WFEN)中的Wavelet Feature Upgrade改进yolo11-neck. 22. ultralytics/cfg/models/11/yolo11-mpcafsa.yaml 使用[BIBM2024 Spatial-Frequency Dual Domain Attention Network For Medical Image Segmentation](https://github.com/nkicsl/SF-UNet)的Frequency-Spatial Attention和Multi-scale Progressive Channel Attention改进yolo11-neck. 23. ultralytics/cfg/models/11/yolo11-fsa.yaml 使用[BIBM2024 Spatial-Frequency Dual Domain Attention Network For Medical Image Segmentation](https://github.com/nkicsl/SF-UNet)的Frequency-Spatial Attention改进yolo11. 24. ultralytics/cfg/models/11/yolo11-GDSAFusion.yaml 使用[CVPR2025 OverLock](https://arxiv.org/pdf/2502.20087)中的GDSAFusion改进neck. 25. ultralytics/cfg/models/11/yolo11-MFM.yaml 使用[CVPR2024 DCMPNet](https://github.com/zhoushen1/DCMPNet)中的MFM改进neck. 26. ultralytics/cfg/models/11/yolo11-RFPN.yaml 使用[ECCV2024 rethinking-fpn](https://github.com/AlanLi1997/rethinking-fpn)的SNI和GSConvE改进YOLO11-neck. 27. 
ultralytics/cfg/models/11/yolo11-PST.yaml 使用[Pyramid Sparse Transformer](https://arxiv.org/abs/2505.12772)中的Pyramid Sparse Transformer改进yolo11-neck. 28. ultralytics/cfg/models/11/yolo11-HS-FPN.yaml 使用[AAAI2025 HS-FPN](https://github.com/ShiZican/HS-FPN/tree/main)中的HFP和SDP改进yolo11-neck. 29. ultralytics/cfg/models/11/yolo11-MSAM.yaml 使用[TGRS2025 UMFormer](https://github.com/takeyoutime/UMFormer)中的MSAM和yolo13的扩散机制改进yolo11-neck. 30. ultralytics/cfg/models/11/yolo11-DPCF.yaml 使用[INFFUS2025 SAMamba](https://arxiv.org/pdf/2505.23214)中的DPCF改进neck. 31. ultralytics/cfg/models/11/yolo11-LCA.yaml 使用[CVPR2025 HVI](https://arxiv.org/pdf/2502.20272)中的LCA改进yolo11-neck. 32. ultralytics/cfg/models/11/yolo11-HFFE.yaml 使用[TGRS2025 HAFNet](https://ieeexplore.ieee.org/document/11154006)中的HFFE改进yolo11-neck. 33. ultralytics/cfg/models/11/yolo11-MFPM.yaml 使用[TGRS2025 ISGLNet](https://ieeexplore.ieee.org/document/11232501)中的MFPM改进特征融合. 34. ultralytics/cfg/models/11/yolo11-ERM.yaml 使用[TGRS2025 ISGLNet](https://ieeexplore.ieee.org/document/11232501)中的ERM改进特征融合. 35. ultralytics/cfg/models/11/yolo11-CAFM.yaml 使用[TIP2025 DSMT](https://ieeexplore.ieee.org/document/10955125)中的CAFM改进yolo11-neck. ### Head系列 1. ultralytics/cfg/models/11/yolo11-dyhead.yaml 添加基于注意力机制的目标检测头到yolo11中. 2. ultralytics/cfg/models/11/yolo11-EfficientHead.yaml 对检测头进行重设计,支持2种轻量化检测头.详细请看ultralytics/nn/extra_modules/head.py中的Detect_Efficient class. 3. ultralytics/cfg/models/11/yolo11-aux.yaml 参考YOLOV7-Aux对YOLO11添加额外辅助训练头,在训练阶段参与训练,在最终推理阶段去掉. 其中辅助训练头的损失权重系数可在ultralytics/utils/loss.py中的class v8DetectionLoss中的__init__函数中的self.aux_loss_ratio设定,默认值参考yolov7为0.25. 4. ultralytics/cfg/models/11/yolo11-seg-EfficientHead.yaml(实例分割) 对检测头进行重设计,支持2种轻量化检测头.详细请看ultralytics/nn/extra_modules/head.py中的Detect_Efficient class. 5. ultralytics/cfg/models/11/yolo11-SEAMHead.yaml 使用[YOLO-Face V2](https://arxiv.org/pdf/2208.02019v2.pdf)中的遮挡感知注意力改进Head,使其有效地处理遮挡场景. 6. 
ultralytics/cfg/models/11/yolo11-MultiSEAMHead.yaml 使用[YOLO-Face V2](https://arxiv.org/pdf/2208.02019v2.pdf)中的遮挡感知注意力改进Head,使其有效地处理遮挡场景. 7. ultralytics/cfg/models/11/yolo11-PGI.yaml 使用[YOLOV9](https://github.com/WongKinYiu/yolov9)的programmable gradient information改进YOLO11.(PGI模块可在训练结束后去掉) 8. Lightweight Asymmetric Detection Head detect:ultralytics/cfg/models/11/yolo11-LADH.yaml segment:ultralytics/cfg/models/11/yolo11-seg-LADH.yaml pose:ultralytics/cfg/models/11/yolo11-pose-LADH.yaml obb:ultralytics/cfg/models/11/yolo11-obb-LADH.yaml 使用[Faster and Lightweight: An Improved YOLOv5 Object Detector for Remote Sensing Images](https://www.mdpi.com/2072-4292/15/20/4974)中的Lightweight Asymmetric Detection Head改进yolo11-head. 9. ultralytics/cfg/models/11/yolo11-atthead.yaml B站注意力教程例子.链接:https://www.bilibili.com/video/BV1mXkVYAEGM/ 10. Localization Quality Estimation Head 此模块出自[GFocalV2](https://arxiv.org/abs/2011.12885). detect:ultralytics/cfg/models/11/yolo11-LQEHead.yaml segment:ultralytics/cfg/models/11/yolo11-seg-LQE.yaml pose:ultralytics/cfg/models/11/yolo11-pose-LQE.yaml obb:ultralytics/cfg/models/11/yolo11-obb-LQE.yaml ### Label Assign系列 1. Adaptive Training Sample Selection匹配策略. 在ultralytics/utils/loss.py中的class v8DetectionLoss中自行选择对应的self.assigner即可. ### PostProcess系列 1. soft-nms(IoU,GIoU,DIoU,CIoU,EIoU,SIoU,ShapeIoU) soft-nms替换nms.(建议:仅在val.py时候使用,具体替换请看20240122版本更新说明) 2. ultralytics/cfg/models/11/yolo11-nmsfree.yaml 仿照yolov10的思想采用双重标签分配和一致匹配度量进行训练,后处理不需要NMS! ### 上下采样算子 1. ultralytics/cfg/models/11/yolo11-ContextGuidedDown.yaml 使用[CGNet](https://github.com/wutianyiRosun/CGNet/tree/master)中的Light-weight Context Guided DownSample进行下采样. 2. ultralytics/cfg/models/11/yolo11-SPDConv.yaml 使用[SPDConv](https://github.com/LabSAINT/SPD-Conv/tree/main)进行下采样. 3. ultralytics/cfg/models/11/yolo11-dysample.yaml 使用[ICCV2023 DySample](https://arxiv.org/abs/2308.15085)改进yolo11-neck中的上采样. 4.
ultralytics/cfg/models/11/yolo11-CARAFE.yaml 使用[ICCV2019 CARAFE](https://arxiv.org/abs/1905.02188)改进yolo11-neck中的上采样. 5. ultralytics/cfg/models/11/yolo11-HWD.yaml 使用[Haar wavelet downsampling](https://www.sciencedirect.com/science/article/abs/pii/S0031320323005174)改进yolo11的下采样.(请关闭AMP情况下使用) 6. ultralytics/cfg/models/11/yolo11-v7DS.yaml 使用[YOLOV7 CVPR2023](https://arxiv.org/abs/2207.02696)的下采样结构改进YOLO11中的下采样. 7. ultralytics/cfg/models/11/yolo11-ADown.yaml 使用[YOLOV9](https://github.com/WongKinYiu/yolov9)的下采样结构改进YOLO11中的下采样. 8. ultralytics/cfg/models/11/yolo11-SRFD.yaml 使用[A Robust Feature Downsampling Module for Remote Sensing Visual Tasks](https://ieeexplore.ieee.org/document/10142024)改进yolo11的下采样. 9. ultralytics/cfg/models/11/yolo11-WaveletPool.yaml 使用[Wavelet Pooling](https://openreview.net/forum?id=rkhlb8lCZ)改进YOLO11的上采样和下采样。 10. ultralytics/cfg/models/11/yolo11-LDConv.yaml 使用[LDConv](https://github.com/CV-ZhangXin/LDConv/tree/main)改进下采样. 11. ultralytics/cfg/models/11/yolo11-PSConv.yaml 使用[AAAI2025 Pinwheel-shaped Convolution and Scale-based Dynamic Loss for Infrared Small Target Detection](https://github.com/JN-Yang/PConv-SDloss-Data)中的Pinwheel-shaped Convolution改进yolo11. 12. ultralytics/cfg/models/11/yolo11-EUCB.yaml 使用[CVPR2024 EMCAD](https://github.com/SLDGroup/EMCAD)中的EUCB改进yolo11的上采样. 13. ultralytics/cfg/models/11/yolo11-LoGStem.yaml 使用[LEGNet](https://github.com/lwCVer/LEGNet)中的LoGStem改进Stem(第一第二层卷积). 14. ultralytics/cfg/models/11/yolo11-wConv.yaml 使用[weightedConvolution2.0](https://github.com/cammarasana123/weightedConvolution2.0)中的wConv2d改进yolo11. 15. ultralytics/cfg/models/11/yolo11-FourierConv.yaml 使用[MIA2025 Fourier Convolution Block with global receptive field for MRI reconstruction](https://www.sciencedirect.com/science/article/abs/pii/S1361841524002743)中的FourierConv改进Conv. 16. ultralytics/cfg/models/11/yolo11-Converse2D.yaml 使用[ICCV2025 ConverseBNet](https://github.com/cszn/ConverseNet)中的Converse2D改进neck中的上采样. 17. 
ultralytics/cfg/models/11/yolo11-GCConv.yaml 使用[CVPR2025 Golden Cudgel Network](https://github.com/gyyang23/GCNet)中的GCConv改进下采样. 18. ultralytics/cfg/models/11/yolo11-RepStem.yaml 使用[ICCV2023 FastVit](https://arxiv.org/pdf/2303.14189)中的RepStem改进yolo11下采样. 19. ultralytics/cfg/models/11/yolo11-FSConv.yaml 使用[TGRS2025 Think Locally and Act Globally](https://ieeexplore.ieee.org/document/11175146)中的FSConv改进下采样. ### YOLO11-C3k2系列 1. ultralytics/cfg/models/11/yolo11-C3k2-Faster.yaml 使用C3k2-Faster替换C3k2.(使用FasterNet中的FasterBlock替换C3k2中的Bottleneck) 2. ultralytics/cfg/models/11/yolo11-C3k2-ODConv.yaml 使用C3k2-ODConv替换C3k2.(使用ODConv替换C3k2中的Bottleneck中的Conv) 3. ultralytics/cfg/models/11/yolo11-C3k2-ODConv.yaml 使用C3k2-ODConv替换C3k2.(使用ODConv替换C3k2中的Bottleneck中的Conv) 4. ultralytics/cfg/models/11/yolo11-C3k2-Faster-EMA.yaml 使用C3k2-Faster-EMA替换C3k2.(C3k2-Faster-EMA推荐可以放在主干上,Neck和head部分可以选择C3k2-Faster) 5. ultralytics/cfg/models/11/yolo11-C3k2-DBB.yaml 使用C3k2-DBB替换C3k2.(使用DiverseBranchBlock替换C3k2中的Bottleneck中的Conv) 6. ultralytics/cfg/models/11/yolo11-C3k2-CloAtt.yaml 使用C3k2-CloAtt替换C3k2.(使用CloFormer中的具有全局和局部特征的注意力机制添加到C3k2中的Bottleneck中)(需要看[常见错误和解决方案的第五点](#a)) 7. ultralytics/cfg/models/11/yolo11-C3k2-SCConv.yaml SCConv(CVPR2020 http://mftp.mmcheng.net/Papers/20cvprSCNet.pdf)与C3k2融合. 8. ultralytics/cfg/models/11/yolo11-C3k2-SCcConv.yaml ScConv(CVPR2023 https://openaccess.thecvf.com/content/CVPR2023/papers/Li_SCConv_Spatial_and_Channel_Reconstruction_Convolution_for_Feature_Redundancy_CVPR_2023_paper.pdf)与C3k2融合. (取名为SCcConv的原因是在windows下命名是不区分大小写的) 9. ultralytics/cfg/models/11/yolo11-KernelWarehouse.yaml 使用[Towards Parameter-Efficient Dynamic Convolution](https://github.com/OSVAI/KernelWarehouse)添加到yolo11中. 使用此模块需要注意,在epoch0-20的时候精度会非常低,过了20epoch会正常. 10. ultralytics/cfg/models/11/yolo11-C3k2-DySnakeConv.yaml [DySnakeConv](https://github.com/YaoleiQi/DSCNet)与C3k2融合. 11. ultralytics/cfg/models/11/yolo11-C3k2-DCNV2.yaml 使用C3k2-DCNV2替换C3k2.(DCNV2为可变形卷积V2) 12. 
ultralytics/cfg/models/11/yolo11-C3k2-DCNV3.yaml 使用C3k2-DCNV3替换C3k2.([DCNV3](https://github.com/OpenGVLab/InternImage)为可变形卷积V3(CVPR2023,众多排行榜的SOTA)) 官方中包含了一些指定版本的DCNV3 whl包,下载后直接pip install xxx即可.具体和安装DCNV3可看百度云链接中的视频. 13. ultralytics/cfg/models/11/yolo11-C3k2-OREPA.yaml 使用C3k2-OREPA替换C3k2.[Online Convolutional Re-parameterization (CVPR2022)](https://github.com/JUGGHM/OREPA_CVPR2022/tree/main) 14. ultralytics/cfg/models/11/yolo11-C3k2-REPVGGOREPA.yaml 使用C3k2-REPVGGOREPA替换C3k2.[Online Convolutional Re-parameterization (CVPR2022)](https://github.com/JUGGHM/OREPA_CVPR2022/tree/main) 15. ultralytics/cfg/models/11/yolo11-C3k2-DCNV4.yaml 使用[DCNV4](https://github.com/OpenGVLab/DCNv4)改进C3k2.(请关闭AMP进行训练,使用教程请看20240116版本更新说明) 16. ultralytics/cfg/models/11/yolo11-C3k2-ContextGuided.yaml 使用[CGNet](https://github.com/wutianyiRosun/CGNet/tree/master)中的Light-weight Context Guided改进C3k2. 17. ultralytics/cfg/models/11/yolo11-C3k2-MSBlock.yaml 使用[YOLO-MS](https://github.com/FishAndWasabi/YOLO-MS/tree/main)中的MSBlock改进C3k2. 18. ultralytics/cfg/models/11/yolo11-C3k2-DLKA.yaml 使用[deformableLKA](https://github.com/xmindflow/deformableLKA)改进C3k2. 19. ultralytics/cfg/models/11/yolo11-C3k2-DAttention.yaml 使用[Vision Transformer with Deformable Attention(CVPR2022)](https://github.com/LeapLabTHU/DAT)改进C3k2.(需要看[常见错误和解决方案的第五点](#a)) 使用注意点请看百度云视频.(DAttention(Vision Transformer with Deformable Attention CVPR2022)使用注意说明.) 20. 使用[ParC-Net](https://github.com/hkzhang-git/ParC-Net/tree/main)中的ParC_Operator改进C3k2.(需要看[常见错误和解决方案的第五点](#a)) 使用注意点请看百度云视频.(20231031更新说明) 21. ultralytics/cfg/models/11/yolo11-C3k2-DWR.yaml 使用[DWRSeg](https://arxiv.org/abs/2212.01173)中的Dilation-wise Residual(DWR)模块,加强从网络高层的可扩展感受野中提取特征. 22. ultralytics/cfg/models/11/yolo11-C3k2-RFAConv.yaml 使用[RFAConv](https://github.com/Liuchen1997/RFAConv/tree/main)中的RFAConv改进yolo11. 23. ultralytics/cfg/models/11/yolo11-C3k2-RFCBAMConv.yaml 使用[RFAConv](https://github.com/Liuchen1997/RFAConv/tree/main)中的RFCBAMConv改进yolo11. 24. 
ultralytics/cfg/models/11/yolo11-C3k2-RFCAConv.yaml 使用[RFAConv](https://github.com/Liuchen1997/RFAConv/tree/main)中的RFCAConv改进yolo11. 25. ultralytics/cfg/models/11/yolo11-C3k2-FocusedLinearAttention.yaml 使用[FLatten Transformer(ICCV2023)](https://github.com/LeapLabTHU/FLatten-Transformer)中的FocusedLinearAttention改进C3k2.(需要看[常见错误和解决方案的第五点](#a)) 使用注意点请看百度云视频.(20231114版本更新说明.) 26. ultralytics/cfg/models/11/yolo11-C3k2-MLCA.yaml 使用[Mixed Local Channel Attention 2023](https://github.com/wandahangFY/MLCA/tree/master)改进C3k2.(用法请看百度云视频-20231129版本更新说明) 27. ultralytics/cfg/models/11/yolo11-C3k2-AKConv.yaml 使用[AKConv 2023](https://github.com/CV-ZhangXin/AKConv)改进C3k2.(用法请看百度云视频-20231129版本更新说明) 28. ultralytics/cfg/models/11/yolo11-C3k2-UniRepLKNetBlock.yaml 使用[UniRepLKNet](https://github.com/AILab-CVC/UniRepLKNet/tree/main)中的UniRepLKNetBlock改进C3k2. 29. ultralytics/cfg/models/11/yolo11-C3k2-DRB.yaml 使用[UniRepLKNet](https://github.com/AILab-CVC/UniRepLKNet/tree/main)中的DilatedReparamBlock改进C3k2. 30. ultralytics/cfg/models/11/yolo11-C3k2-AggregatedAtt.yaml 使用[TransNeXt](https://github.com/DaiShiResearch/TransNeXt)中的聚合感知注意力改进C3k2.(需要看[常见错误和解决方案的第五点](#a)) 31. ultralytics/cfg/models/11/yolo11-C3k2-SWC.yaml 使用[shift-wise conv](https://arxiv.org/abs/2401.12736)改进yolo11中的C3k2. 32. ultralytics/cfg/models/11/yolo11-C3k2-iRMB.yaml 使用[EMO ICCV2023](https://github.com/zhangzjn/EMO)中的iRMB改进C3k2. 33. ultralytics/cfg/models/11/yolo11-C3k2-VSS.yaml 使用最新的Mamba架构[Mamba-UNet中的VSS](https://github.com/ziyangwang007/Mamba-UNet)对C3k2中的BottleNeck进行改进,使其能更有效地捕获图像中的复杂细节和更广泛的语义上下文. 34. ultralytics/cfg/models/11/yolo11-C3k2-LVMB.yaml 使用最新的Mamba架构[Mamba-UNet中的VSS](https://github.com/ziyangwang007/Mamba-UNet)与Cross Stage Partial进行结合,使其能更有效地捕获图像中的复杂细节和更广泛的语义上下文. 35. ultralytics/cfg/models/11/yolo11-RepNCSPELAN.yaml 使用[YOLOV9](https://github.com/WongKinYiu/yolov9)中的RepNCSPELAN进行改进yolo11. 36. 
ultralytics/cfg/models/11/yolo11-C3k2-DynamicConv.yaml 使用[CVPR2024 parameternet](https://arxiv.org/pdf/2306.14525v2.pdf)中的DynamicConv改进C3k2. 37. ultralytics/cfg/models/11/yolo11-C3k2-GhostDynamicConv.yaml 使用[CVPR2024 parameternet](https://arxiv.org/pdf/2306.14525v2.pdf)中的GhostModule改进C3k2. 38. ultralytics/cfg/models/11/yolo11-C3k2-RVB.yaml 使用[CVPR2024 RepViT](https://github.com/THU-MIG/RepViT/tree/main)中的RepViTBlock改进C3k2. 39. ultralytics/cfg/models/11/yolo11-DGCST.yaml 使用[Lightweight Object Detection](https://arxiv.org/abs/2403.01736)中的Dynamic Group Convolution Shuffle Transformer改进yolo11. 40. ultralytics/cfg/models/11/yolo11-C3k2-RetBlock.yaml 使用[CVPR2024 RMT](https://arxiv.org/abs/2309.11523)中的RetBlock改进C3k2. 41. ultralytics/cfg/models/11/yolo11-C3k2-PKI.yaml 使用[CVPR2024 PKINet](https://github.com/PKINet/PKINet)中的PKIModule和CAA模块改进C3k2. 42. ultralytics/cfg/models/11/yolo11-RepNCSPELAN_CAA.yaml 使用[CVPR2024 PKINet](https://github.com/PKINet/PKINet)中的CAA模块改进RepNCSPELAN. 43. ultralytics/cfg/models/11/yolo11-C3k2-fadc.yaml 使用[CVPR2024 Frequency-Adaptive Dilated Convolution](https://github.com/Linwei-Chen/FADC)改进C3k2. 44. ultralytics/cfg/models/11/yolo11-C3k2-PPA.yaml 使用[HCFNet](https://github.com/zhengshuchen/HCFNet)中的Parallelized Patch-Aware Attention Module改进C3k2. 45. ultralytics/cfg/models/11/yolo11-C3k2-Star.yaml 使用[StarNet CVPR2024](https://github.com/ma-xu/Rewrite-the-Stars/tree/main)中的StarBlock改进C3k2. 46. ultralytics/cfg/models/11/yolo11-C3k2-KAN.yaml KAN In! Mamba Out! Kolmogorov-Arnold Networks. 目前支持: 1. FastKANConv2DLayer 2. KANConv2DLayer 3. KALNConv2DLayer 4. KACNConv2DLayer 5. KAGNConv2DLayer 47. ultralytics/cfg/models/11/yolo11-C3k2-DEConv.yaml 使用[DEA-Net](https://github.com/cecret3350/DEA-Net)中的detail-enhanced convolution改进C3k2. 48. ultralytics/cfg/models/11/yolo11-C3k2-Heat.yaml 使用[vHeat](https://github.com/MzeroMiko/vHeat/tree/main)中的HeatBlock改进C3k2. 49. 
ultralytics/cfg/models/11/yolo11-C3k2-WTConv.yaml 使用[ECCV2024 Wavelet Convolutions for Large Receptive Fields](https://github.com/BGU-CS-VIL/WTConv)中的WTConv改进C3k2-BottleNeck. 50. ultralytics/cfg/models/11/yolo11-C3k2-FMB.yaml 使用[ECCV2024 SMFANet](https://github.com/Zheng-MJ/SMFANet/tree/main)的Feature Modulation block改进C3k2. 51. ultralytics/cfg/models/11/yolo11-C3k2-gConv.yaml 使用[Rethinking Performance Gains in Image Dehazing Networks](https://arxiv.org/abs/2209.11448)的gConvblock改进C3k2. 52. ultralytics/cfg/models/11/yolo11-C3k2-WDBB.yaml 使用[YOLO-MIF](https://github.com/wandahangFY/YOLO-MIF)中的WDBB改进C3k2. 53. ultralytics/cfg/models/11/yolo11-C3k2-DeepDBB.yaml 使用[YOLO-MIF](https://github.com/wandahangFY/YOLO-MIF)中的DeepDBB改进C3k2. 54. ultralytics/cfg/models/11/yolo11-C3k2-AdditiveBlock.yaml 使用[CAS-ViT](https://github.com/Tianfang-Zhang/CAS-ViT)中的AdditiveBlock改进C3k2. 55. ultralytics/cfg/models/11/yolo11-C3k2-MogaBlock.yaml 使用[MogaNet ICLR2024](https://github.com/Westlake-AI/MogaNet)中的MogaBlock改进C3k2. 56. ultralytics/cfg/models/11/yolo11-C3k2-IdentityFormer.yaml 使用[Metaformer TPAMI2024](https://github.com/sail-sg/metaformer)中的IdentityFormer改进C3k2. 57. ultralytics/cfg/models/11/yolo11-C3k2-RandomMixing.yaml 使用[Metaformer TPAMI2024](https://github.com/sail-sg/metaformer)中的RandomMixingFormer改进C3k2.(需要看[常见错误和解决方案的第五点](#a)) 58. ultralytics/cfg/models/11/yolo11-C3k2-PoolingFormer.yaml 使用[Metaformer TPAMI2024](https://github.com/sail-sg/metaformer)中的PoolingFormer改进C3k2. 59. ultralytics/cfg/models/11/yolo11-C3k2-ConvFormer.yaml 使用[Metaformer TPAMI2024](https://github.com/sail-sg/metaformer)中的ConvFormer改进C3k2. 60. ultralytics/cfg/models/11/yolo11-C3k2-CaFormer.yaml 使用[Metaformer TPAMI2024](https://github.com/sail-sg/metaformer)中的CaFormer改进C3k2. 61. ultralytics/cfg/models/11/yolo11-C3k2-FFCM.yaml 使用[Efficient Frequency-Domain Image Deraining with Contrastive Regularization ECCV2024](https://github.com/deng-ai-lab/FADformer)中的Fused_Fourier_Conv_Mixer改进C3k2. 62.
ultralytics/cfg/models/11/yolo11-C3k2-SFHF.yaml 使用[SFHformer ECCV2024](https://github.com/deng-ai-lab/SFHformer)中的block改进C3k2. 63. ultralytics/cfg/models/11/yolo11-C3k2-MSM.yaml 使用[Revitalizing Convolutional Network for Image Restoration TPAMI2024](https://zhuanlan.zhihu.com/p/720777160)中的MSM改进C3k2. 64. ultralytics/cfg/models/11/yolo11-C3k2-HDRAB.yaml 使用[Pattern Recognition 2024|DRANet](https://github.com/WenCongWu/DRANet)中的HDRAB(hybrid dilated residual attention block)改进C3k2. 65. ultralytics/cfg/models/11/yolo11-C3k2-RAB.yaml 使用[Pattern Recognition 2024|DRANet](https://github.com/WenCongWu/DRANet)中的RAB(residual attention block)改进C3k2. 66. ultralytics/cfg/models/11/yolo11-C3k2-LFE.yaml 使用[Efficient Long-Range Attention Network for Image Super-resolution ECCV2022](https://github.com/xindongzhang/ELAN)中的Local feature extraction改进C3k2. 67. ultralytics/cfg/models/11/yolo11-C3k2-SFA.yaml 使用[FreqFormer](https://github.com/JPWang-CS/FreqFormer)的Frequency-aware Cascade Attention-SFA改进C3k2. 68. ultralytics/cfg/models/11/yolo11-C3k2-CTA.yaml 使用[FreqFormer](https://github.com/JPWang-CS/FreqFormer)的Frequency-aware Cascade Attention-CTA改进C3k2. 69. ultralytics/cfg/models/11/yolo11-C3k2-IDWC.yaml 使用[InceptionNeXt CVPR2024](https://github.com/sail-sg/inceptionnext)中的InceptionDWConv2d改进C3k2. 70. ultralytics/cfg/models/11/yolo11-C3k2-IDWD.yaml 使用[InceptionNeXt CVPR2024](https://github.com/sail-sg/inceptionnext)中的InceptionDWBlock改进C3k2. 71. ultralytics/cfg/models/11/yolo11-C3k2-PConv.yaml 使用[FasterNet CVPR2023](https://github.com/JierunChen/FasterNet)中的PConv改进C3k2. 72. ultralytics/cfg/models/11/yolo11-C3k2-EMA.yaml B站注意力教程例子.链接:https://www.bilibili.com/video/BV1mXkVYAEGM/ 73. ultralytics/cfg/models/11/yolo11-C3k2-CAMixer.yaml 使用[CAMixerSR CVPR2024](https://github.com/icandle/CAMixerSR)中的CAMixer改进C3k2. 74. ultralytics/cfg/models/11/yolo11-MAN.yaml 使用[Hyper-YOLO TPAMI2025](https://www.arxiv.org/pdf/2408.04804)中的Mixed Aggregation Network改进yolo11. 75.
ultralytics/cfg/models/11/yolo11-C3k2-HFERB.yaml 使用[ICCV2023 CRAFT-SR](https://github.com/AVC2-UESTC/CRAFT-SR)中的high-frequency enhancement residual block改进C3k2. 76. ultralytics/cfg/models/11/yolo11-C3k2-DTAB.yaml 使用[AAAI2025 TBSN](https://github.com/nagejacob/TBSN)中的DTAB改进C3k2. 77. ultralytics/cfg/models/11/yolo11-C3k2-JDPM.yaml 使用[ECCV2024 FSEL](https://github.com/CSYSI/FSEL)中的joint domain perception module改进C3k2. 78. ultralytics/cfg/models/11/yolo11-C3k2-ETB.yaml 使用[ECCV2024 FSEL](https://github.com/CSYSI/FSEL)中的entanglement transformer block改进C3k2. 79. ultralytics/cfg/models/11/yolo11-C3k2-FDT.yaml 使用[ACMMM2024 WFEN](https://github.com/PRIS-CV/WFEN)中的Full-domain Transformer改进C3k2. 80. ultralytics/cfg/models/11/yolo11-C3k2-AP.yaml 使用[AAAI2025 Pinwheel-shaped Convolution and Scale-based Dynamic Loss for Infrared Small Target Detection](https://github.com/JN-Yang/PConv-SDloss-Data)中的Asymmetric Padding bottleneck改进yolo11. 81. ultralytics/cfg/models/11/yolo11-C3k2-Kat.yaml 使用[ICLR2025 Kolmogorov-Arnold Transformer](https://github.com/Adamdad/kat)中的KAT改进C3k2. 82. ultralytics/cfg/models/11/yolo11-C3k2-ELGCA.yaml 使用[ELGC-Net](https://github.com/techmn/elgcnet)中的ELGCA改进C3k2. 83. ultralytics/cfg/models/11/yolo11-C3k2-Strip.yaml 使用[Strip R-CNN](https://arxiv.org/pdf/2501.03775)中的StripBlock改进C3k2. 84. ultralytics/cfg/models/11/yolo11-C3k2-GlobalFilter.yaml 使用[T-PAMI Global Filter Networks for Image Classification](https://github.com/raoyongming/GFNet)中的GlobalFilterBlock和[TransNeXt CVPR2024](https://github.com/DaiShiResearch/TransNeXt)中的Convolutional GLU改进C3k2. 85. ultralytics/cfg/models/11/yolo11-C3k2-DynamicFilter.yaml 使用[AAAI2024 FFT-Based Dynamic Token Mixer for Vision](https://github.com/okojoalg/dfformer)中的DynamicFilter改进C3k2. 86. ultralytics/cfg/models/11/yolo11-C3k2-TSSA.yaml 使用[Token Statistics Transformer](https://github.com/RobinWu218/ToST)中的Token Statistics Self-Attention和[Metaformer TPAMI2024](https://github.com/sail-sg/metaformer)的metaformer改进C3k2. 87. 
ultralytics/cfg/models/11/yolo11-RepHMS.yaml 使用[MHAF-YOLO](https://github.com/yang-0201/MHAF-YOLO)中的RepHMS改进yolo11. 88. ultralytics/cfg/models/11/yolo11-C3k2-SAVSS.yaml 使用[CVPR2025 SCSegamba](https://github.com/Karl1109/SCSegamba)中的Structure-Aware Scanning Strategy改进C3k2. 89. ultralytics/cfg/models/11/yolo11-C3k2-MobileMamba.yaml 使用[CVPR2025 MobileMamba](https://github.com/lewandofskee/MobileMamba)中的MobileMambaBlock改进C3k2. 90. ultralytics/cfg/models/11/yolo11-C3k2-MambaOut.yaml 使用[CVPR2025 MambaOut](https://github.com/yuweihao/MambaOut)中的MambaOutBlock改进C3k2. 91. ultralytics/cfg/models/11/yolo11-C3k2-EfficientVIM.yaml 使用[CVPR2025 EfficientViM](https://github.com/mlvlab/EfficientViM)中的EfficientViMBlock改进C3k2. 92. ultralytics/cfg/models/11/yolo11-C3k2-RCB.yaml 使用[CVPR2025 OverLock](https://arxiv.org/pdf/2502.20087)中的RepConvBlock改进C3k2. 93. ultralytics/cfg/models/11/yolo11-C3k2-LEGM.yaml 使用[CVPR2024 DCMPNet](https://github.com/zhoushen1/DCMPNet)中的LEGM改进C3k2. 94. ultralytics/cfg/models/11/yolo11-C3k2-FAT.yaml 使用[ICLR2024-FTIC](https://github.com/qingshi9974/ICLR2024-FTIC)中的FATBlock改进C3k2. 95. ultralytics/cfg/models/11/yolo11-C3k2-LFEM.yaml 使用[LEGNet](https://github.com/lwCVer/LEGNet)中的LFEModule改进C3k2. 96. ultralytics/cfg/models/11/yolo11-C3k2-SBSM.yaml 使用[WACV2025 SEM-Net](https://github.com/ChrisChen1023/SEM-Net)的Snake Bi-Directional Sequence Modelling (SBSM)改进C3k2. 97. ultralytics/cfg/models/11/yolo11-C3k2-LSBlock.yaml 使用[CVPR2025 LSNet](https://github.com/THU-MIG/lsnet)的LSBlock改进C3k2. 98. ultralytics/cfg/models/11/yolo11-C3k2-TransMamba.yaml 使用[TransMamba](https://github.com/sunshangquan/TransMamba)的TransMamba改进C3k2. 99. ultralytics/cfg/models/11/yolo11-C3k2-EVS.yaml 使用[CVPR2025 EVSSM](https://github.com/kkkls/EVSSM)中的EVS改进C3k2. 100. ultralytics/cfg/models/11/yolo11-C3k2-EBlock.yaml 使用[CVPR2025 DarkIR](https://github.com/cidautai/DarkIR)中的EBlock改进C3k2. 101. 
ultralytics/cfg/models/11/yolo11-C3k2-DBlock.yaml 使用[CVPR2025 DarkIR](https://github.com/cidautai/DarkIR)中的DBlock改进C3k2. 102. ultralytics/cfg/models/11/yolo11-C3k2-FDConv.yaml 使用[CVPR2025 Frequency Dynamic Convolution for Dense Image Prediction](https://github.com/Linwei-Chen/FDConv)的FDConv改进C3k2. 103. ultralytics/cfg/models/11/yolo11-C3k2-DSAN.yaml 使用[DSA: Deformable Spatial Attention](https://www.techrxiv.org/users/628671/articles/775010-deformable-spatial-attention-networks-enhancing-lightweight-convolutional-models-for-vision-tasks)中的Deformable Spatial Attention Block改进C3k2. 104. ultralytics/cfg/models/11/yolo11-C3k2-DSA.yaml 使用[DSA: Deformable Spatial Attention](https://www.techrxiv.org/users/628671/articles/775010-deformable-spatial-attention-networks-enhancing-lightweight-convolutional-models-for-vision-tasks)中的Deformable Spatial Attention改进C3k2. 105. ultralytics/cfg/models/11/yolo11-C3k2-RMB.yaml 使用[CVPR2025 MaIR](https://github.com/XLearning-SCU/2025-CVPR-MaIR)中的Residual Mamba Block改进C3k2. 106. ultralytics/cfg/models/11/yolo11-C3k2-SFSConv.yaml 使用[CVPR2024 SFSConv](https://github.com/like413/SFS-Conv)的SFSConv改进C3k2. 107. ultralytics/cfg/models/11/yolo11-C3k2-GroupMamba.yaml 使用[CVPR2025 GroupMamba](https://github.com/Amshaker/GroupMamba)中的GroupMambaLayer改进C3k2. 108. ultralytics/cfg/models/11/yolo11-C3k2-GroupMambaBlock.yaml 使用[CVPR2025 GroupMamba](https://github.com/Amshaker/GroupMamba)中的GroupMambaBlock改进C3k2. 109. ultralytics/cfg/models/11/yolo11-C3k2-MambaVision.yaml 使用[CVPR2025 MambaVision](https://github.com/NVlabs/MambaVision)中的MambaVision改进C3k2. 110. ultralytics/cfg/models/11/yolo11-FCM.yaml 使用[AAAI2025 FBRT-YOLO](https://github.com/galaxy-oss/FCM)的模块改进yolo11. 111. ultralytics/cfg/models/12/yolo12-FCM.yaml 使用[AAAI2025 FBRT-YOLO](https://github.com/galaxy-oss/FCM)的模块改进yolo12. 112. ultralytics/cfg/models/11/yolo11-C3k2-wConv.yaml 使用[weightedConvolution2.0](https://github.com/cammarasana123/weightedConvolution2.0)中的wConv2d改进C3k2. 113. 
ultralytics/cfg/models/11/yolo11-C3k2-FourierConv.yaml 使用[MIA2025 Fourier Convolution Block with global receptive field for MRI reconstruction](https://www.sciencedirect.com/science/article/abs/pii/S1361841524002743)中的FourierConv改进C3k2. 114. ultralytics/cfg/models/11/yolo11-C3k2-GLVSS.yaml 使用[TGRS2025 UMFormer](https://github.com/takeyoutime/UMFormer)中的GLVSS改进C3k2. 115. ultralytics/cfg/models/11/yolo11-C3k2-ESC.yaml 使用[ICCV2025 ESC: Emulating Self-attention with Convolution for Efficient Image Super-Resolution](https://github.com/dslisleedh/ESC)中的ESC改进C3k2. 116. ultralytics/cfg/models/11/yolo11-C3k2-MBRConv3.yaml 使用[ICCV2025 MobileIE](https://github.com/AVC2-UESTC/MobileIE)中的MBRConv3改进C3k2. 117. ultralytics/cfg/models/11/yolo11-C3k2-MBRConv5.yaml 使用[ICCV2025 MobileIE](https://github.com/AVC2-UESTC/MobileIE)中的MBRConv5改进C3k2. 118. ultralytics/cfg/models/11/yolo11-C3k2-VSSD.yaml 使用[ICCV2025 VSSD](https://github.com/YuHengsss/VSSD)中的VSSD改进C3k2. 119. ultralytics/cfg/models/11/yolo11-C3k2-TinyVIM.yaml 使用[ICCV2025 TinyVIM](https://arxiv.org/abs/2411.17473)中的TinyVIMBlock改进C3k2. 120. ultralytics/cfg/models/11/yolo11-C3k2-CSI.yaml 使用[INFFUS2025 SAMamba](https://arxiv.org/pdf/2505.23214)中的CSI改进C3k2. 121. ultralytics/cfg/models/11/yolo11-C3k2-ConvAttn.yaml 使用[ICCV2025 ESC: Emulating Self-attention with Convolution for Efficient Image Super-Resolution](https://github.com/dslisleedh/ESC)中的ConvAttn改进C3k2. 122. ultralytics/cfg/models/11/yolo11-C3k2-UniConv.yaml 使用[ICCV2025 UniConvBlock](https://github.com/ai-paperwithcode/UniConvNet)中的UniConvBlock改进C3k2. 123. ultralytics/cfg/models/11/yolo11-C3k2-LGLB.yaml 使用[ACM MM 2025 Mobile U-ViT](https://github.com/FengheTan9/Mobile-U-ViT)中的LGLBBlock改进C3k2. 124. ultralytics/cfg/models/11/yolo11-C3k2-ConverseB.yaml 使用[ICCV2025 ConverseBNet](https://github.com/cszn/ConverseNet)中的ConverseBlock改进C3k2. 125. ultralytics/cfg/models/11/yolo11-C3k2-Converse.yaml 使用[ICCV2025 ConverseBNet](https://github.com/cszn/ConverseNet)中的Converse2D改进C3k2. 126. 
ultralytics/cfg/models/11/yolo11-C3k2-GCConv.yaml 使用[CVPR2025 Golden Cudgel Network](https://github.com/gyyang23/GCNet)中的GCConv改进C3k2. 127. ultralytics/cfg/models/11/yolo11-C3k2-CFBlock.yaml 使用[AAAI2024 SCTNet](https://arxiv.org/pdf/2312.17071)中的CFBlock改进C3k2. 128. ultralytics/cfg/models/11/yolo11-C3k2-FMABlock.yaml 使用[IJCV2024 SRConvNet](https://github.com/lifengcs/SRConvNet)中的FMABlock改进C3k2. 129. ultralytics/cfg/models/11/yolo11-C3k2-LWGA.yaml 使用[LWGANet](https://github.com/lwCVer/LWGANet)中的LWGABlock改进C3k2. 130. ultralytics/cfg/models/11/yolo11-C3k2-CSSC.yaml 使用[TGRS2025 ASCNet](https://ieeexplore.ieee.org/document/10855453)中的CSSC改进C3k2. 131. ultralytics/cfg/models/11/yolo11-C3k2-CNCM.yaml 使用[TGRS2025 ASCNet](https://ieeexplore.ieee.org/document/10855453)中的CNCM改进C3k2. 132. ultralytics/cfg/models/11/yolo11-C3k2-HFRB.yaml 使用[ICCV2025 HFRB](https://arxiv.org/pdf/2507.10689)中的HFRB改进C3k2. 133. ultralytics/cfg/models/11/yolo11-C3k2-EVA.yaml 使用[ICIP2025 BEVANET](https://arxiv.org/pdf/2508.07300)中的EVA改进C3k2. 134. ultralytics/cfg/models/11/yolo11-C3k2-RMBC.yaml 使用[PlainUSR](https://arxiv.org/pdf/2409.13435)中的RepMBConv改进C3k2. 135. ultralytics/cfg/models/11/yolo11-C3k2-RMBC-LA.yaml 使用[PlainUSR](https://arxiv.org/pdf/2409.13435)中的RepMBConv和Local Importance-based Attention改进C3k2. 136. ultralytics/cfg/models/11/yolo11-C3k2-IEL.yaml 使用[CVPR2025 HVI](https://arxiv.org/pdf/2502.20272)中的IEL改进C3k2. 137. ultralytics/cfg/models/11/yolo11-C3k2-SFMB.yaml 使用[TIP2025 SFMB](https://arxiv.org/pdf/2511.06593v1)中的SFMB改进C3k2. 138. ultralytics/cfg/models/11/yolo11-C3k2-MFEB.yaml 使用[MICCAI2023 SHISRCNet](https://arxiv.org/abs/2306.14119)中的MFEB改进C3k2. 139. ultralytics/cfg/models/11/yolo11-C3k2-PartialNetBlock.yaml 使用[AAAI2026 Partial Channel Network](https://arxiv.org/pdf/2502.01303)中的PartialNetBlock改进C3k2. 140. ultralytics/cfg/models/11/yolo11-C3k2-DRG.yaml 使用[TGRS2025 DRPCA-Net](https://arxiv.org/pdf/2507.09541)中的DRG改进C3k2. 141.
ultralytics/cfg/models/11/yolo11-C3k2-GLGM.yaml 使用[TGRS2025 ISGLNet](https://ieeexplore.ieee.org/document/11232501)中的GLGM改进C3k2. 142. ultralytics/cfg/models/11/yolo11-C3k2-MAC.yaml 使用[TGRS2025 HDNet](https://ieeexplore.ieee.org/document/11232501)中的MAC改进C3k2. 143. ultralytics/cfg/models/11/yolo11-C3k2-SPJFB.yaml 使用[AAAI2026 SPJFNet](https://arxiv.org/pdf/2508.04041)中的SPJFBlock改进C3k2. 144. ultralytics/cfg/models/11/yolo11-C3k2-GLSS2D.yaml 使用[TGRS2025 GLVMamba](https://ieeexplore.ieee.org/document/11014226)中的GLSS2D改进C3k2. 145. ultralytics/cfg/models/11/yolo11-C3k2-DEGConv.yaml 使用[CVPR2026 MixerCSeg](https://arxiv.org/pdf/2603.01361)中的DEGConv改进C3k2. 146. ultralytics/cfg/models/11/yolo11-C3k2-TransMixer.yaml 使用[CVPR2026 TransMixer](https://arxiv.org/pdf/2603.01361)中的TransMixer改进C3k2. ### C2PSA系列 1. ultralytics/cfg/models/11/yolo11-C2BRA.yaml 使用[BIFormer CVPR2023](https://github.com/rayleizhu/BiFormer)中的Bi-Level Routing Attention改进C2PSA. 2. ultralytics/cfg/models/11/yolo11-C2CGA.yaml 使用[EfficientViT CVPR2023](https://github.com/microsoft/Cream/tree/main/EfficientViT)中的CascadedGroupAttention改进C2PSA. 3. ultralytics/cfg/models/11/yolo11-C2DA.yaml 使用[Vision Transformer with Deformable Attention(CVPR2022)](https://github.com/LeapLabTHU/DAT)中的DAttention改进C2PSA. 4. ultralytics/cfg/models/11/yolo11-C2DPB.yaml 使用[CrossFormer](https://arxiv.org/pdf/2108.00154)中的DynamicPosBias-Attention改进C2PSA. 5. ultralytics/cfg/models/11/yolo11-DTAB.yaml 使用[AAAI2025 TBSN](https://github.com/nagejacob/TBSN)中的DTAB替换C2PSA. 6. ultralytics/cfg/models/11/yolo11-ETB.yaml 使用[ECCV2024 FSEL](https://github.com/CSYSI/FSEL)中的entanglement transformer block替换C2PSA. 7. ultralytics/cfg/models/11/yolo11-FDT.yaml 使用[ACMMM2024 WFEN](https://github.com/PRIS-CV/WFEN)中的Full-domain Transformer替换C2PSA. 8. ultralytics/cfg/models/11/yolo11-C2Pola.yaml 使用[ICLR2025 PolaFormer](https://github.com/ZacharyMeng/PolaFormer)中的PolaAttention改进C2PSA. 9.
ultralytics/cfg/models/11/yolo11-C2TSSA.yaml 使用[Token Statistics Transformer](https://github.com/RobinWu218/ToST)中的Token Statistics Self-Attention改进C2PSA. 10. ultralytics/cfg/models/11/yolo11-C2ASSA.yaml 使用[CVPR2024 Adapt or Perish: Adaptive Sparse Transformer with Attentive Feature Refinement for Image Restoration](https://openaccess.thecvf.com/content/CVPR2024/papers/Zhou_Adapt_or_Perish_Adaptive_Sparse_Transformer_with_Attentive_Feature_Refinement_CVPR_2024_paper.pdf)中的Adaptive Sparse Self-Attention改进C2PSA. 11. ultralytics/cfg/models/11/yolo11-ASSR.yaml 使用[CVPR2025 MambaIR](https://github.com/csguoh/MambaIR)中的Attentive State Space Group改进yolo11. 12. ultralytics/cfg/models/11/yolo11-C2PSA-DYT.yaml 使用[CVPR2025 DyT](https://github.com/jiachenzhu/DyT)中的DynamicTanh改进C2PSA. 13. ultralytics/cfg/models/11/yolo11-C2PSA-FMFFN.yaml 使用[ICLR2024-FTIC](https://github.com/qingshi9974/ICLR2024-FTIC)中的FMFFN改进C2PSA. 14. ultralytics/cfg/models/11/yolo11-C2PSA-CGLU.yaml 使用[TransNeXt CVPR2024](https://github.com/DaiShiResearch/TransNeXt)中的Convolutional GLU改进C2PSA. 15. ultralytics/cfg/models/11/yolo11-C2PSA-SEFN.yaml 使用[WACV2025 SEM-Net](https://github.com/ChrisChen1023/SEM-Net)的Spatially-Enhanced Feedforward Network (SEFN)改进C2PSA. 16. ultralytics/cfg/models/11/yolo11-C2PSA-Mona.yaml 使用[CVPR2025 Mona](https://github.com/Leiyi-Hu/mona)的Mona改进C2PSA. 17. ultralytics/cfg/models/11/yolo11-C2PSA-SEFFN.yaml 使用[TransMamba](https://github.com/sunshangquan/TransMamba)的SpectralEnhancedFFN改进C2PSA. 18. ultralytics/cfg/models/11/yolo11-C2PSA-EDFFN.yaml 使用[CVPR2025 EVSSM](https://github.com/kkkls/EVSSM)中的EDFFN改进C2PSA. 19. ultralytics/cfg/models/11/yolo11-C2MSLA.yaml 使用[MSLA](https://arxiv.org/pdf/2505.18823)改进C2PSA. 20. ultralytics/cfg/models/11/yolo11-C2PSA-EPGO.yaml 使用[ACM MM 2025 CPRAformer](https://github.com/zs1314/CPRAformer)中的EPGO改进C2PSA中的self-attention. 21. ultralytics/cfg/models/11/yolo11-C2PSA-DML.yaml 使用[IJCV2024 SRConvNet](https://github.com/lifengcs/SRConvNet)中的DMI改进C2PSA. 22. 
ultralytics/cfg/models/11/yolo11-C2PSA-LRSA.yaml 使用[TPAMI2025 LRFormer](https://mmcheng.net/wp-content/uploads/2025/06/25PAMI_LRFormer.pdf)中的LRSA改进C2PSA. 23. ultralytics/cfg/models/11/yolo11-C2PSA-MALA.yaml 使用[ICCV2025 Rectifying Magnitude Neglect in Linear Attention](https://arxiv.org/pdf/2507.00698)中的MALA改进C2PSA. 24. ultralytics/cfg/models/11/yolo11-C2PSA-SWSA.yaml 使用[ACMMM2025 FlickCD](https://dl.acm.org/doi/epdf/10.1145/3746027.3755657)中的SWSA改进C2PSA. 25. ultralytics/cfg/models/11/yolo11-C2PSA-EGSA.yaml 使用[ACMMM2025 FlickCD](https://dl.acm.org/doi/epdf/10.1145/3746027.3755657)中的EGSA改进C2PSA. 26. ultralytics/cfg/models/11/yolo11-C2DWMMSA.yaml 使用[TGRS2025 USTNet](https://ieeexplore.ieee.org/document/11146454)中的DWMMSA改进C2PSA. 27. ultralytics/cfg/models/11/yolo11-C2BinaryAttn.yaml 使用[CVPR2026 BinaryAttention](https://arxiv.org/pdf/2303.08810)中的BinaryAttention改进C2PSA. 28. ultralytics/cfg/models/11/yolo11-C2WCA.yaml 使用[CVPR2025 Wavelet and Prototype Augmented Query-based Transformer for Pixel-level Surface Defect Detection](https://openaccess.thecvf.com/content/CVPR2025/papers/Yan_Wavelet_and_Prototype_Augmented_Query-based_Transformer_for_Pixel-level_Surface_Defect_CVPR_2025_paper.pdf)中的WCA改进C2PSA. ### A2C2f系列 1. ultralytics/cfg/models/12/yolo12-A2C2f-CGLU.yaml 使用[TransNeXt CVPR2024](https://github.com/DaiShiResearch/TransNeXt)中的Convolutional GLU改进A2C2f. 2. ultralytics/cfg/models/12/yolo12-A2C2f-KAN.yaml 使用[ICLR2025 Kolmogorov-Arnold Transformer](https://github.com/Adamdad/kat)中的KAN改进A2C2f. 3. ultralytics/cfg/models/12/yolo12-A2C2f-DFFN.yaml 使用[FreqFormer](https://github.com/JPWang-CS/FreqFormer)中的DFFN改进A2C2f. 4. ultralytics/cfg/models/12/yolo12-A2C2f-FRFN.yaml 使用[CVPR2024 Adapt or Perish: Adaptive Sparse Transformer with Attentive Feature Refinement for Image Restoration](https://openaccess.thecvf.com/content/CVPR2024/papers/Zhou_Adapt_or_Perish_Adaptive_Sparse_Transformer_with_Attentive_Feature_Refinement_CVPR_2024_paper.pdf)中的feature refinement feed-forward改进A2C2f. 
5. ultralytics/cfg/models/12/yolo12-A2C2f-DYT.yaml 使用[CVPR2025 DyT](https://github.com/jiachenzhu/DyT)中的DynamicTanh改进A2C2f. 6. ultralytics/cfg/models/12/yolo12-A2C2f-FMFFN.yaml 使用[ICLR2024-FTIC](https://github.com/qingshi9974/ICLR2024-FTIC)中的FMFFN改进A2C2f. 7. ultralytics/cfg/models/12/yolo12-A2C2f-SEFN.yaml 使用[WACV2025 SEM-Net](https://github.com/ChrisChen1023/SEM-Net)的Spatially-Enhanced Feedforward Network (SEFN)改进A2C2f. 8. ultralytics/cfg/models/12/yolo12-A2C2f-Mona.yaml 使用[CVPR2025 Mona](https://github.com/Leiyi-Hu/mona)的Mona改进A2C2f. 9. ultralytics/cfg/models/12/yolo12-A2C2f-SEFFN.yaml 使用[TransMamba](https://github.com/sunshangquan/TransMamba)的SpectralEnhancedFFN改进A2C2f. 10. ultralytics/cfg/models/12/yolo12-A2C2f-EDFFN.yaml 使用[CVPR2025 EVSSM](https://github.com/kkkls/EVSSM)中的EDFFN改进A2C2f. ### 组合系列 1. ultralytics/cfg/models/11/yolo11-fasternet-bifpn.yaml fasternet与bifpn的结合. 其中BIFPN中有三个可选参数: 1. Fusion 其中BIFPN中的Fusion模块支持五种: weight, adaptive, concat, bifpn(default), SDI 其中weight, adaptive, concat出自[paper链接-Figure 3](https://openreview.net/pdf?id=q2ZaVU6bEsT), SDI出自[U-NetV2](https://github.com/yaoppeng/U-Net_v2) 2. node_mode 其中目前(后续会更新喔)支持这些[结构](#b) 3. head_channel BIFPN中的通道数,默认设置为256. 2. ultralytics/cfg/models/11/yolo11-ELA-HSFPN-TADDH.yaml 使用[Efficient Local Attention](https://arxiv.org/abs/2403.01123)改进HSFPN,使用自研动态动态对齐检测头改进Head. 3. ultralytics/cfg/models/11/yolo11-FDPN-TADDH.yaml 自研结构的融合. 1. 自研特征聚焦扩散金字塔网络(Focusing Diffusion Pyramid Network) 2. 自研任务对齐动态检测头(Task Align Dynamic Detection Head) 4. ultralytics/cfg/models/11/yolo11-starnet-C3k2-Star-LSCD.yaml 轻量化模型组合. 1. CVPR2024-StarNet Backbone. 2. C3k2-Star. 3. Lightweight Shared Convolutional Detection Head. # Mamba-YOLO 1. 
[Mamba-YOLO](https://github.com/HZAI-ZJNU/Mamba-YOLO) 集成Mamba-YOLO.(需要编译请看百度云视频-20240619版本更新说明) ultralytics/cfg/models/mamba-yolo/Mamba-YOLO-T.yaml ultralytics/cfg/models/mamba-yolo/Mamba-YOLO-B.yaml ultralytics/cfg/models/mamba-yolo/Mamba-YOLO-L.yaml ultralytics/cfg/models/mamba-yolo/yolo-mamba-seg.yaml # Hyper-YOLO 1. Hyper-YOLO(TPAMI2025) 1. ultralytics/cfg/models/hyper-yolo/hyper-yolo.yaml 2. ultralytics/cfg/models/hyper-yolo/hyper-yolot.yaml 3. ultralytics/cfg/models/hyper-yolo/hyper-yolo-seg.yaml # 注意力系列 1. EMA 2. SimAM 3. SpatialGroupEnhance 4. BiLevelRoutingAttention, BiLevelRoutingAttention_nchw 5. TripletAttention 6. CoordAtt 7. CBAM 8. BAMBlock 9. EfficientAttention(CloFormer中的注意力) 10. LSKBlock 11. SEAttention 12. CPCA 13. deformable_LKA 14. EffectiveSEModule 15. LSKA 16. SegNext_Attention 17. DAttention(Vision Transformer with Deformable Attention CVPR2022) 18. FocusedLinearAttention(ICCV2023) 19. MLCA 20. TransNeXt_AggregatedAttention 21. LocalWindowAttention(EfficientViT中的CascadedGroupAttention注意力) 22. Efficient Local Attention[Efficient Local Attention](https://arxiv.org/abs/2403.01123) 23. CAA(CVPR2024 PKINet中的注意力) 24. CAFM 25. AFGCAttention[Neural Networks ECCV2024](https://www.sciencedirect.com/science/article/abs/pii/S0893608024002387) # Loss系列 1. SlideLoss,EMASlideLoss.(可动态调节正负样本的系数,让模型更加注重难分类,错误分类的样本上) 2. IoU,GIoU,DIoU,CIoU,EIoU,SIoU,MPDIoU,ShapeIoU. 3. Inner-IoU,Inner-GIoU,Inner-DIoU,Inner-CIoU,Inner-EIoU,Inner-SIoU,Inner-ShapeIoU. 4. Wise-IoU(v1,v2,v3)系列(IoU,WIoU,EIoU,GIoU,DIoU,CIoU,SIoU,MPDIoU,ShapeIoU). 5. Inner-Wise-IoU(v1,v2,v3)系列(IoU,WIoU,EIoU,GIoU,DIoU,CIoU,SIoU,MPDIoU,ShapeIoU). 6. FocalLoss,VarifocalLoss,QualityfocalLoss 7. Focaler-IoU系列(IoU,GIoU,DIoU,CIoU,EIoU,SIoU,WIoU,MPDIoU,ShapeIoU) 8. 
Powerful-IoU,Powerful-IoUV2,Inner-Powerful-IoU,Inner-Powerful-IoUV2,Focaler-Powerful-IoU,Focaler-Powerful-IoUV2,Wise-Powerful-IoU(v1,v2,v3),Wise-Powerful-IoUV2(v1,v2,v3)[论文链接](https://www.sciencedirect.com/science/article/abs/pii/S0893608023006640) 9. Normalized Gaussian Wasserstein Distance. 10. Gaussian Combined Distance. # 更新公告 - **20241013-yolov11-v1.1** 1. 初版发布。 - **20241018-yolov11-v1.2** 1. 移植完200+改进点。 2. 修复已知问题。 - **20241027-yolov11-v1.3** 1. 修复已知问题。 2. 新增自研CSP-MutilScaleEdgeInformationEnhance. 3. 新增Efficient Frequency-Domain Image Deraining with Contrastive Regularization中的Fused_Fourier_Conv_Mixer. 4. 更新使用教程. 5. 百度云视频增加20241027更新说明. - **20241103-yolov11-v1.4** 1. 新增自研Rep Shared Convolutional Detection Head. 2. 修复已知问题。 3. 增加实例分割、姿态检测、旋转目标检测怎么用里面的改进视频在使用说明. 4. 百度云视频增加20241103更新说明. - **20241112-yolov11-v1.5** 1. 新增自研CSP-FreqSpatial. 2. 新增SFHformer ECCV2024中的block改进C3k2. 3. 新增Revitalizing Convolutional Network for Image Restoration TPAMI2024中的MSM改进C3k2. 4. 更新使用教程. 5. 百度云视频增加20241112更新说明. 6. 修复一些已知问题. - **20241124-yolov11-v1.6** 1. 基于自研CSP-MutilScaleEdgeInformationEnhance再次创新得到CSP-MutilScaleEdgeInformationSelect. 2. 新增Pattern Recognition 2024|DRANet中的HDRAB和RAB模块改进C3k2. 3. 新增ECCV2022-ELAN中的Local feature extraction改进C3k2. 4. 使用Bi-Level Routing Attention改进C2PSA. 5. 使用CascadedGroupAttention改进C2PSA. 6. 使用DAttention改进C2PSA. 7. 更新使用教程. 8. 百度云视频增加20241124更新说明. 9. 修复一些已知问题. - **20241207-yolov11-v1.7** 1. 新增自研GlobalEdgeInformationTransfer. 2. 新增FreqFormer的Frequency-aware Cascade Attention改进C3k2. 3. 新增CVPR2024InceptionNeXt中的IDWC、IDWB的改进. 4. 新增CrossFormer中的DynamicPosBias-Attention改进C2PSA. 5. 更新使用教程. 6. 百度云视频增加20241207更新说明. - **20241221-yolov11-v1.8** 1. 新增CAMixerSR中的CAMixer改进C3k2. 2. 新增支持Hyper-YOLO,并可以利用项目自带的改进改进Hyper-YOLO. 3. 新增Hyper-YOLO中的Hypergraph Computation in Semantic Space和Mixed Aggregation Network的改进. 4. 新增Fasternet中的PConv改进C3k2. 5. 新增一些注意力例子配合B站视频进行学习. 6. 更新使用教程. 7. 百度云视频增加20241221更新说明. - **20241228-yolov11-v1.9** 1. 
新增基于Hyper-YOLO中的Mixed Aggregation Network三个二次改进系列. 2. 新增使用MSA^2 Net中的Multi-Scale Adaptive Spatial Attention Gate改进yolo11-neck. 3. 新增使用MSA^2 Net中的Multi-Scale Adaptive Spatial Attention Gate改进自研系列的MutilBackbone. 4. 更新使用教程. 5. 百度云视频增加20241228更新说明. - **20250112-yolo11-v1.10** 1. 新增CRAFT-SR中的high-frequency enhancement residual block. 2. 新增AAAI2025-TBSN中的DTAB. 3. 新增ECCV2024-FSEL中的多个模块. 4. 新增ACMMM2024-WFEN中的多个模块. 5. 更新使用教程. 6. 百度云视频增加20250112更新说明. - **20250119-yolo11-v1.11** 1. 新增AAAI2025 Pinwheel-shaped Convolution and Scale-based Dynamic Loss for Infrared Small Target Detection中的Pinwheel-shaped Convolution类型改进. 2. 新增AAAI2025 ConDSeg中的ContrastDrivenFeatureAggregation与ACMMM2024 WFEN中的小波变换进行创新. 3. 更新使用教程. 4. 百度云视频增加20250119更新说明. - **20250205-yolo11-v1.12** 1. 新增ELGC-Net的改进及其二次创新. 2. 新增ICLR2025 PolaFormer中的PolaAttention改进C2PSA. 3. 新增遥感目标检测Strip R-CNN中的StripBlock及其二次创新. 4. 新增BIBM2024 Spatial-Frequency Dual Domain Attention Network For Medical Image Segmentation中的Frequency-Spatial Attention和Multi-scale Progressive Channel Attention. 5. 新增ICLR2025 Kolmogorov-Arnold Transformer中的KAT及其配合FasterBlock的二次创新.<此模块需要编译> 6. 更新使用教程. 7. 百度云视频增加20250205更新说明. - **20250215-yolo11-v1.13** 1. 新增自研模块DynamicInceptionDWConv2d. 2. 新增GlobalFilter和DynamicFilter. 3. 更新使用教程. 4. 百度云视频增加20250215更新说明. - **20250222-yolo11-v1.14** 1. 新增yolo12配置文件.(包含目标检测、实例分割、姿态检测、旋转目标检测、分类) - **20250301-yolo11-v1.15** 1. 新增自研模块Hierarchical Attention Fusion并提供多种使用方式. 2. 新增ICLR2025-Token Statistics Transformer中的TSSA改进C3k2,C2PSA. 3. 新增MHAF-YOLO中的RepHMS.<这个是YOLO群内的一个博士新作品> 4. 新增对YOLO12的A2C2f结构中的MLP多个改进方案. 5. 调整了YOLO12中的注意力实现,会自动检测是否安装好Flash-Attention,没的话自动切换Torch实现. 6. 更新使用教程. 7. 百度云视频增加20250301更新说明. - **20250312-yolo11-v1.16** 1. 修复yolo11-ReCalibrationFPN-P2345.yaml的序号错误bug. 2. 新增CVPR2024-Adaptive Sparse Transformer相关改进yolo11,yolo12. 3. 新增CVPR2025-MambaIR的模块. 4. 新增CVPR2025-SCSegamba中的模块. 5. 新增CVPR2025-MobileMamba中的模块. 6. 新增CVPR2025-MambaOut中的模块. 7. 更新使用教程. 8. 百度云视频增加20250312更新说明. - **20250319-yolo11-v1.17** 1. 
新增CVPR2025-Dynamic-Tanh的多个改进并与其他模块的二次创新.
- **20250624-yolo11-v1.25** 1. 新增YOLOV13配置文件(包含detect、seg、pose、obb)。 2. 更新使用教程. - **20250706-yolo11-v1.26** 1. 新增Pyramid Sparse Transformer改进yolo11-neck. 2. 新增Pyramid Sparse Transformer对SOEP再创新. 3. 新增weightedConvolution2.0. 4. 新增MIA2025-FourierConv. 5. 新增AAAI2025的HS-FPN. 6. 新增TGRS2025-UMFormer多个模块改进. 7. 更新使用教程. 8. 百度云视频增加20250706更新说明. - **20250721-yolo11-v1.27** 1. 新增ICCV2025-ESC中的模块. 2. 新增ICCV2025-MobileIE中的模块. 3. 新增ICCV2025-VSSD中的模块. 4. 新增ICCV2025-TinyVIM中的模块. 5. 新增MSLA. 6. 新增INFFUS2025-SAMamba中的模块. 7. 更新使用教程. 8. 百度云视频增加20250721更新说明. - **20250813-yolo11-v1.28** 1. 新增CPRAformer中的EPGO多个改进。 2. 新增ICCV2025-ESC中的ConvAttn改进。 3. 更新使用教程. 4. 百度云视频增加20250813更新说明. - **20250827-yolo11-v1.29** 1. 新增ICCV2025-UniConvBlock中的模块. 2. 新增ICCV2025-ConverseBNet中的模块. 3. 新增ACM MM 2025-Mobile U-ViT中的模块. 4. 更新使用教程. 5. 百度云视频增加20250827更新说明. - **20250912-yolo11-v1.30** 1. 新增CVPR2025-GCConv模块. 2. 新增AAAI2024-CFBlock模块. 3. 新增ICCV2023-FastViT中的RepStem模块. 4. 更新使用教程. 5. 百度云视频增加20250912更新说明. - **20251008-yolo11-v1.31** 1. 新增IJCV2024-SRConvNet中的模块. 2. 新增LWGANet中的模块. 3. 更新使用教程. 4. 百度云视频增加20251008更新说明. - **20251028-yolo11-v1.32** 1. 新增TGRS2025-ASCNet中的模块. 2. 新增ICCV2025-HFRB模块. 3. 新增ICIP2025-BEVANET中的模块. 4. 新增TPAMI2025-LRFormer中的模块. 5. 新增ICCV2025-Rectifying Magnitude Neglect in Linear Attention的模块. 6. 更新使用教程. 7. 百度云视频增加20251028更新说明. - **20251122-yolo11-v1.33** 1. 新增GRSL2025-Gaussian Combined Distance,支持在目标框损失和标签分配策略上更改,详细请看LOSS改进系列.md 2. 新增ACCV2024-PlainUSR中的模块. 3. 更新使用教程. 4. 百度云视频增加20251122更新说明. - **20251219-yolo11-v1.34** 1. 新增CVPR2025-HVI中的LCA模块. 2. 新增TIP2025-SFMB模块. 3. 新增TGRS2025-HAFNet中的HFFE模块. 4. 更新使用教程. 5. 百度云视频增加20251219更新说明. - **20260114-yolo11-v1.35** 1. 新增YOLO-Master中的MoE模块. 2. 新增ACMMM2025-FlickCD中的模块. 3. 更新使用教程. 4. 百度云视频增加20260114更新说明. - **20260203-yolo11-v1.36** 1. 新增TGRS2025-Think Locally and Act Globally中的模块. 2. 新增TGRS2025-ISGLNet中的多个模块. 3. 新增TGRS2025-MASFNet中的模块. 4. 更新使用教程. 5. 百度云视频增加20260203更新说明. - **20260224-yolo11-v1.37** 1. 新增MICCAI2023-SHISRCNet中的模块. 2. 
class TransformerEncoderLayer(nn.Module):
    """Transformer encoder layer: multi-head self-attention followed by a two-layer MLP.

    Inputs are batch-first token sequences (the attention module is built with
    ``batch_first=True``). ``normalize_before`` selects pre-norm or post-norm
    residual wiring.

    Args:
        c1: Token embedding width (also the attention embed dim).
        cm: Hidden width of the feed-forward network.
        num_heads: Number of attention heads.
        dropout: Dropout probability used in attention and both residual branches.
        act: Activation module applied between the two linear layers.
        normalize_before: If True use pre-norm, otherwise post-norm.
    """

    def __init__(self, c1, cm=2048, num_heads=8, dropout=0.0, act=nn.GELU(), normalize_before=False):
        """Create the attention, feed-forward, normalisation and dropout submodules."""
        super().__init__()
        # Creation order and attribute names are kept stable: they define the
        # state_dict keys and the order in which parameters are initialised.
        self.ma = nn.MultiheadAttention(c1, num_heads, dropout=dropout, batch_first=True)

        # Feed-forward network
        self.fc1 = nn.Linear(c1, cm)
        self.fc2 = nn.Linear(cm, c1)

        self.norm1 = nn.LayerNorm(c1)
        self.norm2 = nn.LayerNorm(c1)

        self.dropout = nn.Dropout(dropout)
        self.dropout1 = nn.Dropout(dropout)
        self.dropout2 = nn.Dropout(dropout)

        self.act = act
        self.normalize_before = normalize_before

    @staticmethod
    def with_pos_embed(tensor, pos=None):
        """Return ``tensor + pos``, or ``tensor`` itself when no position embedding is given."""
        if pos is None:
            return tensor
        return tensor + pos

    def _attention_branch(self, query_key, value, src_mask, src_key_padding_mask):
        """Self-attention residual branch: attention output passed through ``dropout1``."""
        attended = self.ma(
            query_key,
            query_key,
            value=value,
            attn_mask=src_mask,
            key_padding_mask=src_key_padding_mask,
        )[0]
        return self.dropout1(attended)

    def _mlp_branch(self, tokens):
        """Feed-forward residual branch: fc1 -> act -> dropout -> fc2 -> dropout2."""
        hidden = self.dropout(self.act(self.fc1(tokens)))
        return self.dropout2(self.fc2(hidden))

    def forward_post(self, src, src_mask=None, src_key_padding_mask=None, pos=None):
        """Post-norm forward: each residual sum is followed by its LayerNorm."""
        query_key = self.with_pos_embed(src, pos)
        src = self.norm1(src + self._attention_branch(query_key, src, src_mask, src_key_padding_mask))
        return self.norm2(src + self._mlp_branch(src))

    def forward_pre(self, src, src_mask=None, src_key_padding_mask=None, pos=None):
        """Pre-norm forward: LayerNorm is applied to the input of each residual branch."""
        normed = self.norm1(src)
        query_key = self.with_pos_embed(normed, pos)
        src = src + self._attention_branch(query_key, normed, src_mask, src_key_padding_mask)
        return src + self._mlp_branch(self.norm2(src))

    def forward(self, src, src_mask=None, src_key_padding_mask=None, pos=None):
        """Run the layer using the variant selected by ``normalize_before``."""
        variant = self.forward_pre if self.normalize_before else self.forward_post
        return variant(src, src_mask, src_key_padding_mask, pos)


class AIFI(TransformerEncoderLayer):
    """Encoder layer applied to a 4-D feature map with a 2-D sin-cos position embedding."""

    def __init__(self, c1, cm=2048, num_heads=8, dropout=0, act=nn.GELU(), normalize_before=False):
        """Forward all arguments unchanged to ``TransformerEncoderLayer``."""
        super().__init__(c1, cm, num_heads, dropout, act, normalize_before)

    def forward(self, x):
        """Encode a ``[B, C, H, W]`` feature map and return a tensor of the same shape."""
        channels, height, width = x.shape[1:]
        pos = self.build_2d_sincos_position_embedding(width, height, channels)
        pos = pos.to(device=x.device, dtype=x.dtype)
        # [B, C, H, W] -> [B, H*W, C] token sequence
        tokens = x.flatten(2).permute(0, 2, 1)
        encoded = super().forward(tokens, pos=pos)
        # [B, H*W, C] -> [B, C, H, W]
        return encoded.permute(0, 2, 1).view([-1, channels, height, width]).contiguous()

    @staticmethod
    def build_2d_sincos_position_embedding(w, h, embed_dim=256, temperature=10000.0):
        """Build a ``[1, w*h, embed_dim]`` sine-cosine position embedding.

        The channel axis is the concatenation ``[sin(w), cos(w), sin(h), cos(h)]``,
        each part ``embed_dim // 4`` wide.

        NOTE(review): the grid is built with ``indexing='ij'`` over ``(w, h)``, so
        rows are enumerated w-major, whereas ``x.flatten(2)`` in ``forward``
        enumerates tokens h-major. Confirm this pairing is intended before
        relying on it for non-square feature maps.

        Raises:
            AssertionError: If ``embed_dim`` is not divisible by 4.
        """
        grid_w, grid_h = torch.meshgrid(
            torch.arange(int(w), dtype=torch.float32),
            torch.arange(int(h), dtype=torch.float32),
            indexing='ij',
        )
        assert embed_dim % 4 == 0, \
            'Embed dimension must be divisible by 4 for 2D sin-cos position embedding'
        pos_dim = embed_dim // 4
        exponents = torch.arange(pos_dim, dtype=torch.float32) / pos_dim
        omega = 1. / (temperature ** exponents)

        out_w = grid_w.flatten()[..., None] @ omega[None]
        out_h = grid_h.flatten()[..., None] @ omega[None]

        parts = [torch.sin(out_w), torch.cos(out_w), torch.sin(out_h), torch.cos(out_h)]
        return torch.cat(parts, 1)[None]
def run(
        weights=ROOT / 'yolov5s.pt',  # weights path
        imgsz=640,  # inference size (pixels)
        batch_size=1,  # batch size
        data=ROOT / 'data/coco128.yaml',  # dataset.yaml path
        device='',  # cuda device, i.e. 0 or 0,1,2,3 or cpu
        half=False,  # use FP16 half-precision inference
        test=False,  # test exports only
        pt_only=False,  # test PyTorch only
        hard_fail=False,  # throw error on benchmark failure
):
    """Export the model to each supported format and benchmark every export.

    For every row of ``export.export_formats()`` the weights are exported
    (or used as-is for the PyTorch row), validated with the detection or
    segmentation validator, and the file size, mAP50-95 and inference time
    are recorded.

    Args:
        weights: Path of the PyTorch weights to benchmark.
        imgsz: Inference image size in pixels.
        batch_size: Validation batch size.
        data: Dataset YAML path.
        device: Device string passed to ``select_device``.
        half: Use FP16 inference.
        test: Unused here; accepted so ``main`` can pass ``**vars(opt)``.
        pt_only: Stop after the first (PyTorch) format.
        hard_fail: ``True`` to re-raise non-assertion failures; a string is
            evaluated as the minimum acceptable mAP50-95.

    Returns:
        pandas.DataFrame with columns Format, Size (MB), mAP50-95, Inference time (ms).
    """
    y, t = [], time.time()
    device = select_device(device)
    model_type = type(attempt_load(weights, fuse=False))  # DetectionModel, SegmentationModel, etc.
    for i, (name, f, suffix, cpu, gpu) in export.export_formats().iterrows():  # index, (name, file, suffix, CPU, GPU)
        try:
            # Assertions mark formats that cannot be benchmarked here; they are
            # reported as failures but never escalated by --hard-fail.
            assert i not in (9, 10), 'inference not supported'  # Edge TPU and TF.js are unsupported
            assert i != 5 or platform.system() == 'Darwin', 'inference only supported on macOS>=10.13'  # CoreML
            if 'cpu' in device.type:
                assert cpu, 'inference not supported on CPU'
            if 'cuda' in device.type:
                assert gpu, 'inference not supported on GPU'

            # Export
            if f == '-':
                w = weights  # PyTorch format
            else:
                w = export.run(weights=weights, imgsz=[imgsz], include=[f], device=device, half=half)[-1]  # all others
            assert suffix in str(w), 'export failed'

            # Validate
            if model_type == SegmentationModel:
                result = val_seg(data, w, batch_size, imgsz, plots=False, device=device, task='speed', half=half)
                metric = result[0][7]  # (box(p, r, map50, map), mask(p, r, map50, map), *loss(box, obj, cls))
            else:  # DetectionModel:
                result = val_det(data, w, batch_size, imgsz, plots=False, device=device, task='speed', half=half)
                metric = result[0][3]  # (p, r, map50, map, *loss(box, obj, cls))
            speed = result[2][1]  # times (preprocess, inference, postprocess)
            y.append([name, round(file_size(w), 1), round(metric, 4), round(speed, 2)])  # MB, mAP, t_inference
        except Exception as e:
            if hard_fail:
                assert type(e) is AssertionError, f'Benchmark --hard-fail for {name}: {e}'
            LOGGER.warning(f'WARNING ⚠️ Benchmark failure for {name}: {e}')
            y.append([name, None, None, None])  # size, mAP, t_inference unavailable
        if pt_only and i == 0:
            break  # break after PyTorch

    # Print results
    LOGGER.info('\n')
    # NOTE(review): re-parses sys.argv only to echo the arguments; this will
    # misbehave if run() is called programmatically with unrelated argv.
    parse_opt()
    notebook_init()  # print system info
    # The original selected these columns with `if map`, i.e. the always-truthy
    # builtin, so the alternative branch was dead code and has been dropped.
    c = ['Format', 'Size (MB)', 'mAP50-95', 'Inference time (ms)']
    py = pd.DataFrame(y, columns=c)
    LOGGER.info(f'\nBenchmarks complete ({time.time() - t:.2f}s)')
    LOGGER.info(str(py))
    if hard_fail and isinstance(hard_fail, str):
        metrics = py['mAP50-95'].array  # values to compare to floor
        # SECURITY NOTE: eval() executes the --hard-fail string as Python; only
        # pass trusted values (kept as-is to preserve accepted inputs).
        floor = eval(hard_fail)  # minimum metric floor to pass, i.e. = 0.29 mAP for YOLOv5n
        assert all(x > floor for x in metrics if pd.notna(x)), f'HARD FAIL: mAP50-95 < floor {floor}'
    return py


def test(
        weights=ROOT / 'yolov5s.pt',  # weights path
        imgsz=640,  # inference size (pixels)
        batch_size=1,  # batch size
        data=ROOT / 'data/coco128.yaml',  # dataset.yaml path
        device='',  # cuda device, i.e. 0 or 0,1,2,3 or cpu
        half=False,  # use FP16 half-precision inference
        test=False,  # test exports only
        pt_only=False,  # test PyTorch only
        hard_fail=False,  # throw error on benchmark failure
):
    """Try exporting the model to every format and report which exports succeed.

    No validation is run; ``batch_size``, ``data``, ``test``, ``pt_only`` and
    ``hard_fail`` are accepted only so ``main`` can pass ``**vars(opt)``.

    Returns:
        pandas.DataFrame with columns Format and Export (bool).
    """
    y, t = [], time.time()
    device = select_device(device)
    # Only the first three fields are needed. Star-unpacking tolerates the
    # CPU/GPU capability columns that run() reads from the same table (the
    # previous fixed four-name unpack raised ValueError on five-column rows).
    for _, (name, f, suffix, *_) in export.export_formats().iterrows():  # (name, file, suffix, ...)
        try:
            w = weights if f == '-' else \
                export.run(weights=weights, imgsz=[imgsz], include=[f], device=device, half=half)[-1]  # weights
            assert suffix in str(w), 'export failed'
            y.append([name, True])
        except Exception as e:
            # Best-effort: record the failure and keep going, but say why.
            LOGGER.warning(f'WARNING ⚠️ Export failure for {name}: {e}')
            y.append([name, False])

    # Print results
    LOGGER.info('\n')
    # NOTE(review): re-parses sys.argv only to echo the arguments (see run()).
    parse_opt()
    notebook_init()  # print system info
    py = pd.DataFrame(y, columns=['Format', 'Export'])
    LOGGER.info(f'\nExports complete ({time.time() - t:.2f}s)')
    LOGGER.info(str(py))
    return py


def parse_opt():
    """Parse the benchmark command-line options, resolve the dataset YAML and print the arguments."""
    parser = argparse.ArgumentParser()
    parser.add_argument('--weights', type=str, default=ROOT / 'yolov5s.pt', help='weights path')
    parser.add_argument('--imgsz', '--img', '--img-size', type=int, default=640, help='inference size (pixels)')
    parser.add_argument('--batch-size', type=int, default=1, help='batch size')
    parser.add_argument('--data', type=str, default=ROOT / 'data/coco128.yaml', help='dataset.yaml path')
    parser.add_argument('--device', default='', help='cuda device, i.e. 0 or 0,1,2,3 or cpu')
    parser.add_argument('--half', action='store_true', help='use FP16 half-precision inference')
    parser.add_argument('--test', action='store_true', help='test exports only')
    parser.add_argument('--pt-only', action='store_true', help='test PyTorch only')
    # Bare `--hard-fail` yields True; `--hard-fail 0.29` yields the string '0.29'.
    parser.add_argument('--hard-fail', nargs='?', const=True, default=False, help='Exception on error or < min metric')
    opt = parser.parse_args()
    opt.data = check_yaml(opt.data)  # check YAML
    print_args(vars(opt))
    return opt


def main(opt):
    """Dispatch to the export-only test or the full benchmark, depending on ``opt.test``."""
    if opt.test:
        test(**vars(opt))
    else:
        run(**vars(opt))
def argoverse2yolo(set):
    """Convert one Argoverse-HD annotation JSON file into per-image YOLOv5 label files.

    Args:
        set: ``pathlib.Path`` of the annotation JSON; the dataset root is taken
            as ``set.parents[2]``.

    Each label line is ``cls x_center y_center width height``. Boxes are
    normalised by a fixed 1920x1200 image size.
    """
    labels = {}
    # Use a context manager so the annotation file handle is closed promptly.
    with open(set, "rb") as fp:
        a = json.load(fp)
    for annot in tqdm(a['annotations'], desc=f"Converting {set} to YOLOv5 format..."):
        img_id = annot['image_id']
        image = a['images'][img_id]
        img_name = image['name']
        img_label_name = f'{img_name[:-3]}txt'

        cls = annot['category_id']  # instance class id
        # The first two bbox values are shifted by half the box size below, so
        # they are presumably the top-left corner — confirm against the dataset.
        x_center, y_center, width, height = annot['bbox']
        x_center = (x_center + width / 2) / 1920.0  # offset and scale
        y_center = (y_center + height / 2) / 1200.0  # offset and scale
        width /= 1920.0  # scale
        height /= 1200.0  # scale

        img_dir = set.parents[2] / 'Argoverse-1.1' / 'labels' / a['seq_dirs'][image['sid']]
        img_dir.mkdir(parents=True, exist_ok=True)  # exist_ok makes a prior exists() check unnecessary

        k = str(img_dir / img_label_name)
        labels.setdefault(k, []).append(f"{cls} {x_center} {y_center} {width} {height}\n")

    # One label file per image, written once with all of its boxes.
    for k, lines in labels.items():
        with open(k, "w") as f:
            f.writelines(lines)


# Download
dir = Path(yaml['path'])  # dataset root dir
urls = ['https://argoverse-hd.s3.us-east-2.amazonaws.com/Argoverse-HD-Full.zip']
download(urls, dir=dir, delete=False)

# Convert
annotations_dir = 'Argoverse-HD/annotations/'
(dir / 'Argoverse-1.1' / 'tracking').rename(dir / 'Argoverse-1.1' / 'images')  # rename 'tracking' to 'images'
for d in "train.json", "val.json":
    argoverse2yolo(dir / annotations_dir / d)  # convert Argoverse annotations to YOLO labels
yolo-improve/yolov5-AUX/data/ImageNet.yaml ================================================ # YOLOv5 🚀 by Ultralytics, GPL-3.0 license # ImageNet-1k dataset https://www.image-net.org/index.php by Stanford University # Simplified class names from https://github.com/anishathalye/imagenet-simple-labels # Example usage: python classify/train.py --data imagenet # parent # ├── yolov5 # └── datasets # └── imagenet ← downloads here (144 GB) # Train/val/test sets as 1) dir: path/to/imgs, 2) file: path/to/imgs.txt, or 3) list: [path/to/imgs1, path/to/imgs2, ..] path: ../datasets/imagenet # dataset root dir train: train # train images (relative to 'path') 1281167 images val: val # val images (relative to 'path') 50000 images test: # test images (optional) # Classes names: 0: tench 1: goldfish 2: great white shark 3: tiger shark 4: hammerhead shark 5: electric ray 6: stingray 7: cock 8: hen 9: ostrich 10: brambling 11: goldfinch 12: house finch 13: junco 14: indigo bunting 15: American robin 16: bulbul 17: jay 18: magpie 19: chickadee 20: American dipper 21: kite 22: bald eagle 23: vulture 24: great grey owl 25: fire salamander 26: smooth newt 27: newt 28: spotted salamander 29: axolotl 30: American bullfrog 31: tree frog 32: tailed frog 33: loggerhead sea turtle 34: leatherback sea turtle 35: mud turtle 36: terrapin 37: box turtle 38: banded gecko 39: green iguana 40: Carolina anole 41: desert grassland whiptail lizard 42: agama 43: frilled-necked lizard 44: alligator lizard 45: Gila monster 46: European green lizard 47: chameleon 48: Komodo dragon 49: Nile crocodile 50: American alligator 51: triceratops 52: worm snake 53: ring-necked snake 54: eastern hog-nosed snake 55: smooth green snake 56: kingsnake 57: garter snake 58: water snake 59: vine snake 60: night snake 61: boa constrictor 62: African rock python 63: Indian cobra 64: green mamba 65: sea snake 66: Saharan horned viper 67: eastern diamondback rattlesnake 68: sidewinder 69: trilobite 70: harvestman 71: scorpion 
72: yellow garden spider 73: barn spider 74: European garden spider 75: southern black widow 76: tarantula 77: wolf spider 78: tick 79: centipede 80: black grouse 81: ptarmigan 82: ruffed grouse 83: prairie grouse 84: peacock 85: quail 86: partridge 87: grey parrot 88: macaw 89: sulphur-crested cockatoo 90: lorikeet 91: coucal 92: bee eater 93: hornbill 94: hummingbird 95: jacamar 96: toucan 97: duck 98: red-breasted merganser 99: goose 100: black swan 101: tusker 102: echidna 103: platypus 104: wallaby 105: koala 106: wombat 107: jellyfish 108: sea anemone 109: brain coral 110: flatworm 111: nematode 112: conch 113: snail 114: slug 115: sea slug 116: chiton 117: chambered nautilus 118: Dungeness crab 119: rock crab 120: fiddler crab 121: red king crab 122: American lobster 123: spiny lobster 124: crayfish 125: hermit crab 126: isopod 127: white stork 128: black stork 129: spoonbill 130: flamingo 131: little blue heron 132: great egret 133: bittern 134: crane (bird) 135: limpkin 136: common gallinule 137: American coot 138: bustard 139: ruddy turnstone 140: dunlin 141: common redshank 142: dowitcher 143: oystercatcher 144: pelican 145: king penguin 146: albatross 147: grey whale 148: killer whale 149: dugong 150: sea lion 151: Chihuahua 152: Japanese Chin 153: Maltese 154: Pekingese 155: Shih Tzu 156: King Charles Spaniel 157: Papillon 158: toy terrier 159: Rhodesian Ridgeback 160: Afghan Hound 161: Basset Hound 162: Beagle 163: Bloodhound 164: Bluetick Coonhound 165: Black and Tan Coonhound 166: Treeing Walker Coonhound 167: English foxhound 168: Redbone Coonhound 169: borzoi 170: Irish Wolfhound 171: Italian Greyhound 172: Whippet 173: Ibizan Hound 174: Norwegian Elkhound 175: Otterhound 176: Saluki 177: Scottish Deerhound 178: Weimaraner 179: Staffordshire Bull Terrier 180: American Staffordshire Terrier 181: Bedlington Terrier 182: Border Terrier 183: Kerry Blue Terrier 184: Irish Terrier 185: Norfolk Terrier 186: Norwich Terrier 187: Yorkshire Terrier 188: 
Wire Fox Terrier 189: Lakeland Terrier 190: Sealyham Terrier 191: Airedale Terrier 192: Cairn Terrier 193: Australian Terrier 194: Dandie Dinmont Terrier 195: Boston Terrier 196: Miniature Schnauzer 197: Giant Schnauzer 198: Standard Schnauzer 199: Scottish Terrier 200: Tibetan Terrier 201: Australian Silky Terrier 202: Soft-coated Wheaten Terrier 203: West Highland White Terrier 204: Lhasa Apso 205: Flat-Coated Retriever 206: Curly-coated Retriever 207: Golden Retriever 208: Labrador Retriever 209: Chesapeake Bay Retriever 210: German Shorthaired Pointer 211: Vizsla 212: English Setter 213: Irish Setter 214: Gordon Setter 215: Brittany 216: Clumber Spaniel 217: English Springer Spaniel 218: Welsh Springer Spaniel 219: Cocker Spaniels 220: Sussex Spaniel 221: Irish Water Spaniel 222: Kuvasz 223: Schipperke 224: Groenendael 225: Malinois 226: Briard 227: Australian Kelpie 228: Komondor 229: Old English Sheepdog 230: Shetland Sheepdog 231: collie 232: Border Collie 233: Bouvier des Flandres 234: Rottweiler 235: German Shepherd Dog 236: Dobermann 237: Miniature Pinscher 238: Greater Swiss Mountain Dog 239: Bernese Mountain Dog 240: Appenzeller Sennenhund 241: Entlebucher Sennenhund 242: Boxer 243: Bullmastiff 244: Tibetan Mastiff 245: French Bulldog 246: Great Dane 247: St. 
Bernard 248: husky 249: Alaskan Malamute 250: Siberian Husky 251: Dalmatian 252: Affenpinscher 253: Basenji 254: pug 255: Leonberger 256: Newfoundland 257: Pyrenean Mountain Dog 258: Samoyed 259: Pomeranian 260: Chow Chow 261: Keeshond 262: Griffon Bruxellois 263: Pembroke Welsh Corgi 264: Cardigan Welsh Corgi 265: Toy Poodle 266: Miniature Poodle 267: Standard Poodle 268: Mexican hairless dog 269: grey wolf 270: Alaskan tundra wolf 271: red wolf 272: coyote 273: dingo 274: dhole 275: African wild dog 276: hyena 277: red fox 278: kit fox 279: Arctic fox 280: grey fox 281: tabby cat 282: tiger cat 283: Persian cat 284: Siamese cat 285: Egyptian Mau 286: cougar 287: lynx 288: leopard 289: snow leopard 290: jaguar 291: lion 292: tiger 293: cheetah 294: brown bear 295: American black bear 296: polar bear 297: sloth bear 298: mongoose 299: meerkat 300: tiger beetle 301: ladybug 302: ground beetle 303: longhorn beetle 304: leaf beetle 305: dung beetle 306: rhinoceros beetle 307: weevil 308: fly 309: bee 310: ant 311: grasshopper 312: cricket 313: stick insect 314: cockroach 315: mantis 316: cicada 317: leafhopper 318: lacewing 319: dragonfly 320: damselfly 321: red admiral 322: ringlet 323: monarch butterfly 324: small white 325: sulphur butterfly 326: gossamer-winged butterfly 327: starfish 328: sea urchin 329: sea cucumber 330: cottontail rabbit 331: hare 332: Angora rabbit 333: hamster 334: porcupine 335: fox squirrel 336: marmot 337: beaver 338: guinea pig 339: common sorrel 340: zebra 341: pig 342: wild boar 343: warthog 344: hippopotamus 345: ox 346: water buffalo 347: bison 348: ram 349: bighorn sheep 350: Alpine ibex 351: hartebeest 352: impala 353: gazelle 354: dromedary 355: llama 356: weasel 357: mink 358: European polecat 359: black-footed ferret 360: otter 361: skunk 362: badger 363: armadillo 364: three-toed sloth 365: orangutan 366: gorilla 367: chimpanzee 368: gibbon 369: siamang 370: guenon 371: patas monkey 372: baboon 373: macaque 374: langur 375: 
black-and-white colobus 376: proboscis monkey 377: marmoset 378: white-headed capuchin 379: howler monkey 380: titi 381: Geoffroy's spider monkey 382: common squirrel monkey 383: ring-tailed lemur 384: indri 385: Asian elephant 386: African bush elephant 387: red panda 388: giant panda 389: snoek 390: eel 391: coho salmon 392: rock beauty 393: clownfish 394: sturgeon 395: garfish 396: lionfish 397: pufferfish 398: abacus 399: abaya 400: academic gown 401: accordion 402: acoustic guitar 403: aircraft carrier 404: airliner 405: airship 406: altar 407: ambulance 408: amphibious vehicle 409: analog clock 410: apiary 411: apron 412: waste container 413: assault rifle 414: backpack 415: bakery 416: balance beam 417: balloon 418: ballpoint pen 419: Band-Aid 420: banjo 421: baluster 422: barbell 423: barber chair 424: barbershop 425: barn 426: barometer 427: barrel 428: wheelbarrow 429: baseball 430: basketball 431: bassinet 432: bassoon 433: swimming cap 434: bath towel 435: bathtub 436: station wagon 437: lighthouse 438: beaker 439: military cap 440: beer bottle 441: beer glass 442: bell-cot 443: bib 444: tandem bicycle 445: bikini 446: ring binder 447: binoculars 448: birdhouse 449: boathouse 450: bobsleigh 451: bolo tie 452: poke bonnet 453: bookcase 454: bookstore 455: bottle cap 456: bow 457: bow tie 458: brass 459: bra 460: breakwater 461: breastplate 462: broom 463: bucket 464: buckle 465: bulletproof vest 466: high-speed train 467: butcher shop 468: taxicab 469: cauldron 470: candle 471: cannon 472: canoe 473: can opener 474: cardigan 475: car mirror 476: carousel 477: tool kit 478: carton 479: car wheel 480: automated teller machine 481: cassette 482: cassette player 483: castle 484: catamaran 485: CD player 486: cello 487: mobile phone 488: chain 489: chain-link fence 490: chain mail 491: chainsaw 492: chest 493: chiffonier 494: chime 495: china cabinet 496: Christmas stocking 497: church 498: movie theater 499: cleaver 500: cliff dwelling 501: cloak 502: clogs 
503: cocktail shaker 504: coffee mug 505: coffeemaker 506: coil 507: combination lock 508: computer keyboard 509: confectionery store 510: container ship 511: convertible 512: corkscrew 513: cornet 514: cowboy boot 515: cowboy hat 516: cradle 517: crane (machine) 518: crash helmet 519: crate 520: infant bed 521: Crock Pot 522: croquet ball 523: crutch 524: cuirass 525: dam 526: desk 527: desktop computer 528: rotary dial telephone 529: diaper 530: digital clock 531: digital watch 532: dining table 533: dishcloth 534: dishwasher 535: disc brake 536: dock 537: dog sled 538: dome 539: doormat 540: drilling rig 541: drum 542: drumstick 543: dumbbell 544: Dutch oven 545: electric fan 546: electric guitar 547: electric locomotive 548: entertainment center 549: envelope 550: espresso machine 551: face powder 552: feather boa 553: filing cabinet 554: fireboat 555: fire engine 556: fire screen sheet 557: flagpole 558: flute 559: folding chair 560: football helmet 561: forklift 562: fountain 563: fountain pen 564: four-poster bed 565: freight car 566: French horn 567: frying pan 568: fur coat 569: garbage truck 570: gas mask 571: gas pump 572: goblet 573: go-kart 574: golf ball 575: golf cart 576: gondola 577: gong 578: gown 579: grand piano 580: greenhouse 581: grille 582: grocery store 583: guillotine 584: barrette 585: hair spray 586: half-track 587: hammer 588: hamper 589: hair dryer 590: hand-held computer 591: handkerchief 592: hard disk drive 593: harmonica 594: harp 595: harvester 596: hatchet 597: holster 598: home theater 599: honeycomb 600: hook 601: hoop skirt 602: horizontal bar 603: horse-drawn vehicle 604: hourglass 605: iPod 606: clothes iron 607: jack-o'-lantern 608: jeans 609: jeep 610: T-shirt 611: jigsaw puzzle 612: pulled rickshaw 613: joystick 614: kimono 615: knee pad 616: knot 617: lab coat 618: ladle 619: lampshade 620: laptop computer 621: lawn mower 622: lens cap 623: paper knife 624: library 625: lifeboat 626: lighter 627: limousine 628: ocean 
liner 629: lipstick 630: slip-on shoe 631: lotion 632: speaker 633: loupe 634: sawmill 635: magnetic compass 636: mail bag 637: mailbox 638: tights 639: tank suit 640: manhole cover 641: maraca 642: marimba 643: mask 644: match 645: maypole 646: maze 647: measuring cup 648: medicine chest 649: megalith 650: microphone 651: microwave oven 652: military uniform 653: milk can 654: minibus 655: miniskirt 656: minivan 657: missile 658: mitten 659: mixing bowl 660: mobile home 661: Model T 662: modem 663: monastery 664: monitor 665: moped 666: mortar 667: square academic cap 668: mosque 669: mosquito net 670: scooter 671: mountain bike 672: tent 673: computer mouse 674: mousetrap 675: moving van 676: muzzle 677: nail 678: neck brace 679: necklace 680: nipple 681: notebook computer 682: obelisk 683: oboe 684: ocarina 685: odometer 686: oil filter 687: organ 688: oscilloscope 689: overskirt 690: bullock cart 691: oxygen mask 692: packet 693: paddle 694: paddle wheel 695: padlock 696: paintbrush 697: pajamas 698: palace 699: pan flute 700: paper towel 701: parachute 702: parallel bars 703: park bench 704: parking meter 705: passenger car 706: patio 707: payphone 708: pedestal 709: pencil case 710: pencil sharpener 711: perfume 712: Petri dish 713: photocopier 714: plectrum 715: Pickelhaube 716: picket fence 717: pickup truck 718: pier 719: piggy bank 720: pill bottle 721: pillow 722: ping-pong ball 723: pinwheel 724: pirate ship 725: pitcher 726: hand plane 727: planetarium 728: plastic bag 729: plate rack 730: plow 731: plunger 732: Polaroid camera 733: pole 734: police van 735: poncho 736: billiard table 737: soda bottle 738: pot 739: potter's wheel 740: power drill 741: prayer rug 742: printer 743: prison 744: projectile 745: projector 746: hockey puck 747: punching bag 748: purse 749: quill 750: quilt 751: race car 752: racket 753: radiator 754: radio 755: radio telescope 756: rain barrel 757: recreational vehicle 758: reel 759: reflex camera 760: refrigerator 761: 
remote control 762: restaurant 763: revolver 764: rifle 765: rocking chair 766: rotisserie 767: eraser 768: rugby ball 769: ruler 770: running shoe 771: safe 772: safety pin 773: salt shaker 774: sandal 775: sarong 776: saxophone 777: scabbard 778: weighing scale 779: school bus 780: schooner 781: scoreboard 782: CRT screen 783: screw 784: screwdriver 785: seat belt 786: sewing machine 787: shield 788: shoe store 789: shoji 790: shopping basket 791: shopping cart 792: shovel 793: shower cap 794: shower curtain 795: ski 796: ski mask 797: sleeping bag 798: slide rule 799: sliding door 800: slot machine 801: snorkel 802: snowmobile 803: snowplow 804: soap dispenser 805: soccer ball 806: sock 807: solar thermal collector 808: sombrero 809: soup bowl 810: space bar 811: space heater 812: space shuttle 813: spatula 814: motorboat 815: spider web 816: spindle 817: sports car 818: spotlight 819: stage 820: steam locomotive 821: through arch bridge 822: steel drum 823: stethoscope 824: scarf 825: stone wall 826: stopwatch 827: stove 828: strainer 829: tram 830: stretcher 831: couch 832: stupa 833: submarine 834: suit 835: sundial 836: sunglass 837: sunglasses 838: sunscreen 839: suspension bridge 840: mop 841: sweatshirt 842: swimsuit 843: swing 844: switch 845: syringe 846: table lamp 847: tank 848: tape player 849: teapot 850: teddy bear 851: television 852: tennis ball 853: thatched roof 854: front curtain 855: thimble 856: threshing machine 857: throne 858: tile roof 859: toaster 860: tobacco shop 861: toilet seat 862: torch 863: totem pole 864: tow truck 865: toy store 866: tractor 867: semi-trailer truck 868: tray 869: trench coat 870: tricycle 871: trimaran 872: tripod 873: triumphal arch 874: trolleybus 875: trombone 876: tub 877: turnstile 878: typewriter keyboard 879: umbrella 880: unicycle 881: upright piano 882: vacuum cleaner 883: vase 884: vault 885: velvet 886: vending machine 887: vestment 888: viaduct 889: violin 890: volleyball 891: waffle iron 892: wall 
clock 893: wallet 894: wardrobe 895: military aircraft 896: sink 897: washing machine 898: water bottle 899: water jug 900: water tower 901: whiskey jug 902: whistle 903: wig 904: window screen 905: window shade 906: Windsor tie 907: wine bottle 908: wing 909: wok 910: wooden spoon 911: wool 912: split-rail fence 913: shipwreck 914: yawl 915: yurt 916: website 917: comic book 918: crossword 919: traffic sign 920: traffic light 921: dust jacket 922: menu 923: plate 924: guacamole 925: consomme 926: hot pot 927: trifle 928: ice cream 929: ice pop 930: baguette 931: bagel 932: pretzel 933: cheeseburger 934: hot dog 935: mashed potato 936: cabbage 937: broccoli 938: cauliflower 939: zucchini 940: spaghetti squash 941: acorn squash 942: butternut squash 943: cucumber 944: artichoke 945: bell pepper 946: cardoon 947: mushroom 948: Granny Smith 949: strawberry 950: orange 951: lemon 952: fig 953: pineapple 954: banana 955: jackfruit 956: custard apple 957: pomegranate 958: hay 959: carbonara 960: chocolate syrup 961: dough 962: meatloaf 963: pizza 964: pot pie 965: burrito 966: red wine 967: espresso 968: cup 969: eggnog 970: alp 971: bubble 972: cliff 973: coral reef 974: geyser 975: lakeshore 976: promontory 977: shoal 978: seashore 979: valley 980: volcano 981: baseball player 982: bridegroom 983: scuba diver 984: rapeseed 985: daisy 986: yellow lady's slipper 987: corn 988: acorn 989: rose hip 990: horse chestnut seed 991: coral fungus 992: agaric 993: gyromitra 994: stinkhorn mushroom 995: earth star 996: hen-of-the-woods 997: bolete 998: ear 999: toilet paper # Download script/URL (optional) download: data/scripts/get_imagenet.sh ================================================ FILE: yolo-improve/yolov5-AUX/data/Objects365.yaml ================================================ # YOLOv5 🚀 by Ultralytics, GPL-3.0 license # Objects365 dataset https://www.objects365.org/ by Megvii # Example usage: python train.py --data Objects365.yaml # parent # ├── yolov5 # └── 
datasets # └── Objects365 ← downloads here (712 GB = 367G data + 345G zips) # Train/val/test sets as 1) dir: path/to/imgs, 2) file: path/to/imgs.txt, or 3) list: [path/to/imgs1, path/to/imgs2, ..] path: ../datasets/Objects365 # dataset root dir train: images/train # train images (relative to 'path') 1742289 images val: images/val # val images (relative to 'path') 80000 images test: # test images (optional) # Classes names: 0: Person 1: Sneakers 2: Chair 3: Other Shoes 4: Hat 5: Car 6: Lamp 7: Glasses 8: Bottle 9: Desk 10: Cup 11: Street Lights 12: Cabinet/shelf 13: Handbag/Satchel 14: Bracelet 15: Plate 16: Picture/Frame 17: Helmet 18: Book 19: Gloves 20: Storage box 21: Boat 22: Leather Shoes 23: Flower 24: Bench 25: Potted Plant 26: Bowl/Basin 27: Flag 28: Pillow 29: Boots 30: Vase 31: Microphone 32: Necklace 33: Ring 34: SUV 35: Wine Glass 36: Belt 37: Monitor/TV 38: Backpack 39: Umbrella 40: Traffic Light 41: Speaker 42: Watch 43: Tie 44: Trash bin Can 45: Slippers 46: Bicycle 47: Stool 48: Barrel/bucket 49: Van 50: Couch 51: Sandals 52: Basket 53: Drum 54: Pen/Pencil 55: Bus 56: Wild Bird 57: High Heels 58: Motorcycle 59: Guitar 60: Carpet 61: Cell Phone 62: Bread 63: Camera 64: Canned 65: Truck 66: Traffic cone 67: Cymbal 68: Lifesaver 69: Towel 70: Stuffed Toy 71: Candle 72: Sailboat 73: Laptop 74: Awning 75: Bed 76: Faucet 77: Tent 78: Horse 79: Mirror 80: Power outlet 81: Sink 82: Apple 83: Air Conditioner 84: Knife 85: Hockey Stick 86: Paddle 87: Pickup Truck 88: Fork 89: Traffic Sign 90: Balloon 91: Tripod 92: Dog 93: Spoon 94: Clock 95: Pot 96: Cow 97: Cake 98: Dinning Table 99: Sheep 100: Hanger 101: Blackboard/Whiteboard 102: Napkin 103: Other Fish 104: Orange/Tangerine 105: Toiletry 106: Keyboard 107: Tomato 108: Lantern 109: Machinery Vehicle 110: Fan 111: Green Vegetables 112: Banana 113: Baseball Glove 114: Airplane 115: Mouse 116: Train 117: Pumpkin 118: Soccer 119: Skiboard 120: Luggage 121: Nightstand 122: Tea pot 123: Telephone 124: Trolley 
125: Head Phone 126: Sports Car 127: Stop Sign 128: Dessert 129: Scooter 130: Stroller 131: Crane 132: Remote 133: Refrigerator 134: Oven 135: Lemon 136: Duck 137: Baseball Bat 138: Surveillance Camera 139: Cat 140: Jug 141: Broccoli 142: Piano 143: Pizza 144: Elephant 145: Skateboard 146: Surfboard 147: Gun 148: Skating and Skiing shoes 149: Gas stove 150: Donut 151: Bow Tie 152: Carrot 153: Toilet 154: Kite 155: Strawberry 156: Other Balls 157: Shovel 158: Pepper 159: Computer Box 160: Toilet Paper 161: Cleaning Products 162: Chopsticks 163: Microwave 164: Pigeon 165: Baseball 166: Cutting/chopping Board 167: Coffee Table 168: Side Table 169: Scissors 170: Marker 171: Pie 172: Ladder 173: Snowboard 174: Cookies 175: Radiator 176: Fire Hydrant 177: Basketball 178: Zebra 179: Grape 180: Giraffe 181: Potato 182: Sausage 183: Tricycle 184: Violin 185: Egg 186: Fire Extinguisher 187: Candy 188: Fire Truck 189: Billiards 190: Converter 191: Bathtub 192: Wheelchair 193: Golf Club 194: Briefcase 195: Cucumber 196: Cigar/Cigarette 197: Paint Brush 198: Pear 199: Heavy Truck 200: Hamburger 201: Extractor 202: Extension Cord 203: Tong 204: Tennis Racket 205: Folder 206: American Football 207: earphone 208: Mask 209: Kettle 210: Tennis 211: Ship 212: Swing 213: Coffee Machine 214: Slide 215: Carriage 216: Onion 217: Green beans 218: Projector 219: Frisbee 220: Washing Machine/Drying Machine 221: Chicken 222: Printer 223: Watermelon 224: Saxophone 225: Tissue 226: Toothbrush 227: Ice cream 228: Hot-air balloon 229: Cello 230: French Fries 231: Scale 232: Trophy 233: Cabbage 234: Hot dog 235: Blender 236: Peach 237: Rice 238: Wallet/Purse 239: Volleyball 240: Deer 241: Goose 242: Tape 243: Tablet 244: Cosmetics 245: Trumpet 246: Pineapple 247: Golf Ball 248: Ambulance 249: Parking meter 250: Mango 251: Key 252: Hurdle 253: Fishing Rod 254: Medal 255: Flute 256: Brush 257: Penguin 258: Megaphone 259: Corn 260: Lettuce 261: Garlic 262: Swan 263: Helicopter 264: Green Onion 265: 
Sandwich 266: Nuts 267: Speed Limit Sign 268: Induction Cooker 269: Broom 270: Trombone 271: Plum 272: Rickshaw 273: Goldfish 274: Kiwi fruit 275: Router/modem 276: Poker Card 277: Toaster 278: Shrimp 279: Sushi 280: Cheese 281: Notepaper 282: Cherry 283: Pliers 284: CD 285: Pasta 286: Hammer 287: Cue 288: Avocado 289: Hamimelon 290: Flask 291: Mushroom 292: Screwdriver 293: Soap 294: Recorder 295: Bear 296: Eggplant 297: Board Eraser 298: Coconut 299: Tape Measure/Ruler 300: Pig 301: Showerhead 302: Globe 303: Chips 304: Steak 305: Crosswalk Sign 306: Stapler 307: Camel 308: Formula 1 309: Pomegranate 310: Dishwasher 311: Crab 312: Hoverboard 313: Meat ball 314: Rice Cooker 315: Tuba 316: Calculator 317: Papaya 318: Antelope 319: Parrot 320: Seal 321: Butterfly 322: Dumbbell 323: Donkey 324: Lion 325: Urinal 326: Dolphin 327: Electric Drill 328: Hair Dryer 329: Egg tart 330: Jellyfish 331: Treadmill 332: Lighter 333: Grapefruit 334: Game board 335: Mop 336: Radish 337: Baozi 338: Target 339: French 340: Spring Rolls 341: Monkey 342: Rabbit 343: Pencil Case 344: Yak 345: Red Cabbage 346: Binoculars 347: Asparagus 348: Barbell 349: Scallop 350: Noddles 351: Comb 352: Dumpling 353: Oyster 354: Table Tennis paddle 355: Cosmetics Brush/Eyeliner Pencil 356: Chainsaw 357: Eraser 358: Lobster 359: Durian 360: Okra 361: Lipstick 362: Cosmetics Mirror 363: Curling 364: Table Tennis # Download script/URL (optional) --------------------------------------------------------------------------------------- download: | from tqdm import tqdm from utils.general import Path, check_requirements, download, np, xyxy2xywhn check_requirements(('pycocotools>=2.0',)) from pycocotools.coco import COCO # Make Directories dir = Path(yaml['path']) # dataset root dir for p in 'images', 'labels': (dir / p).mkdir(parents=True, exist_ok=True) for q in 'train', 'val': (dir / p / q).mkdir(parents=True, exist_ok=True) # Train, Val Splits for split, patches in [('train', 50 + 1), ('val', 43 + 1)]: 
print(f"Processing {split} in {patches} patches ...") images, labels = dir / 'images' / split, dir / 'labels' / split # Download url = f"https://dorc.ks3-cn-beijing.ksyun.com/data-set/2020Objects365%E6%95%B0%E6%8D%AE%E9%9B%86/{split}/" if split == 'train': download([f'{url}zhiyuan_objv2_{split}.tar.gz'], dir=dir, delete=False) # annotations json download([f'{url}patch{i}.tar.gz' for i in range(patches)], dir=images, curl=True, delete=False, threads=8) elif split == 'val': download([f'{url}zhiyuan_objv2_{split}.json'], dir=dir, delete=False) # annotations json download([f'{url}images/v1/patch{i}.tar.gz' for i in range(15 + 1)], dir=images, curl=True, delete=False, threads=8) download([f'{url}images/v2/patch{i}.tar.gz' for i in range(16, patches)], dir=images, curl=True, delete=False, threads=8) # Move for f in tqdm(images.rglob('*.jpg'), desc=f'Moving {split} images'): f.rename(images / f.name) # move to /images/{split} # Labels coco = COCO(dir / f'zhiyuan_objv2_{split}.json') names = [x["name"] for x in coco.loadCats(coco.getCatIds())] for cid, cat in enumerate(names): catIds = coco.getCatIds(catNms=[cat]) imgIds = coco.getImgIds(catIds=catIds) for im in tqdm(coco.loadImgs(imgIds), desc=f'Class {cid + 1}/{len(names)} {cat}'): width, height = im["width"], im["height"] path = Path(im["file_name"]) # image filename try: with open(labels / path.with_suffix('.txt').name, 'a') as file: annIds = coco.getAnnIds(imgIds=im["id"], catIds=catIds, iscrowd=None) for a in coco.loadAnns(annIds): x, y, w, h = a['bbox'] # bounding box in xywh (xy top-left corner) xyxy = np.array([x, y, x + w, y + h])[None] # pixels(1,4) x, y, w, h = xyxy2xywhn(xyxy, w=width, h=height, clip=True)[0] # normalized and clipped file.write(f"{cid} {x:.5f} {y:.5f} {w:.5f} {h:.5f}\n") except Exception as e: print(e) ================================================ FILE: yolo-improve/yolov5-AUX/data/SKU-110K.yaml ================================================ # YOLOv5 🚀 by Ultralytics, GPL-3.0 license # 
SKU-110K retail items dataset https://github.com/eg4000/SKU110K_CVPR19 by Trax Retail # Example usage: python train.py --data SKU-110K.yaml # parent # ├── yolov5 # └── datasets # └── SKU-110K ← downloads here (13.6 GB) # Train/val/test sets as 1) dir: path/to/imgs, 2) file: path/to/imgs.txt, or 3) list: [path/to/imgs1, path/to/imgs2, ..] path: ../datasets/SKU-110K # dataset root dir train: train.txt # train images (relative to 'path') 8219 images val: val.txt # val images (relative to 'path') 588 images test: test.txt # test images (optional) 2936 images # Classes names: 0: object # Download script/URL (optional) --------------------------------------------------------------------------------------- download: | import shutil from tqdm import tqdm from utils.general import np, pd, Path, download, xyxy2xywh # Download dir = Path(yaml['path']) # dataset root dir parent = Path(dir.parent) # download dir urls = ['http://trax-geometry.s3.amazonaws.com/cvpr_challenge/SKU110K_fixed.tar.gz'] download(urls, dir=parent, delete=False) # Rename directories if dir.exists(): shutil.rmtree(dir) (parent / 'SKU110K_fixed').rename(dir) # rename dir (dir / 'labels').mkdir(parents=True, exist_ok=True) # create labels dir # Convert labels names = 'image', 'x1', 'y1', 'x2', 'y2', 'class', 'image_width', 'image_height' # column names for d in 'annotations_train.csv', 'annotations_val.csv', 'annotations_test.csv': x = pd.read_csv(dir / 'annotations' / d, names=names).values # annotations images, unique_images = x[:, 0], np.unique(x[:, 0]) with open((dir / d).with_suffix('.txt').__str__().replace('annotations_', ''), 'w') as f: f.writelines(f'./images/{s}\n' for s in unique_images) for im in tqdm(unique_images, desc=f'Converting {dir / d}'): cls = 0 # single-class dataset with open((dir / 'labels' / im).with_suffix('.txt'), 'a') as f: for r in x[images == im]: w, h = r[6], r[7] # image width, height xywh = xyxy2xywh(np.array([[r[1] / w, r[2] / h, r[3] / w, r[4] / h]]))[0] # instance 
f.write(f"{cls} {xywh[0]:.5f} {xywh[1]:.5f} {xywh[2]:.5f} {xywh[3]:.5f}\n") # write label ================================================ FILE: yolo-improve/yolov5-AUX/data/VOC.yaml ================================================ # YOLOv5 🚀 by Ultralytics, GPL-3.0 license # PASCAL VOC dataset http://host.robots.ox.ac.uk/pascal/VOC by University of Oxford # Example usage: python train.py --data VOC.yaml # parent # ├── yolov5 # └── datasets # └── VOC ← downloads here (2.8 GB) # Train/val/test sets as 1) dir: path/to/imgs, 2) file: path/to/imgs.txt, or 3) list: [path/to/imgs1, path/to/imgs2, ..] path: ../datasets/VOC train: # train images (relative to 'path') 16551 images - images/train2012 - images/train2007 - images/val2012 - images/val2007 val: # val images (relative to 'path') 4952 images - images/test2007 test: # test images (optional) - images/test2007 # Classes names: 0: aeroplane 1: bicycle 2: bird 3: boat 4: bottle 5: bus 6: car 7: cat 8: chair 9: cow 10: diningtable 11: dog 12: horse 13: motorbike 14: person 15: pottedplant 16: sheep 17: sofa 18: train 19: tvmonitor # Download script/URL (optional) --------------------------------------------------------------------------------------- download: | import xml.etree.ElementTree as ET from tqdm import tqdm from utils.general import download, Path def convert_label(path, lb_path, year, image_id): def convert_box(size, box): dw, dh = 1. / size[0], 1. 
/ size[1] x, y, w, h = (box[0] + box[1]) / 2.0 - 1, (box[2] + box[3]) / 2.0 - 1, box[1] - box[0], box[3] - box[2] return x * dw, y * dh, w * dw, h * dh in_file = open(path / f'VOC{year}/Annotations/{image_id}.xml') out_file = open(lb_path, 'w') tree = ET.parse(in_file) root = tree.getroot() size = root.find('size') w = int(size.find('width').text) h = int(size.find('height').text) names = list(yaml['names'].values()) # names list for obj in root.iter('object'): cls = obj.find('name').text if cls in names and int(obj.find('difficult').text) != 1: xmlbox = obj.find('bndbox') bb = convert_box((w, h), [float(xmlbox.find(x).text) for x in ('xmin', 'xmax', 'ymin', 'ymax')]) cls_id = names.index(cls) # class id out_file.write(" ".join([str(a) for a in (cls_id, *bb)]) + '\n') # Download dir = Path(yaml['path']) # dataset root dir url = 'https://github.com/ultralytics/yolov5/releases/download/v1.0/' urls = [f'{url}VOCtrainval_06-Nov-2007.zip', # 446MB, 5012 images f'{url}VOCtest_06-Nov-2007.zip', # 438MB, 4953 images f'{url}VOCtrainval_11-May-2012.zip'] # 1.95GB, 17126 images download(urls, dir=dir / 'images', delete=False, curl=True, threads=3) # Convert path = dir / 'images/VOCdevkit' for year, image_set in ('2012', 'train'), ('2012', 'val'), ('2007', 'train'), ('2007', 'val'), ('2007', 'test'): imgs_path = dir / 'images' / f'{image_set}{year}' lbs_path = dir / 'labels' / f'{image_set}{year}' imgs_path.mkdir(exist_ok=True, parents=True) lbs_path.mkdir(exist_ok=True, parents=True) with open(path / f'VOC{year}/ImageSets/Main/{image_set}.txt') as f: image_ids = f.read().strip().split() for id in tqdm(image_ids, desc=f'{image_set}{year}'): f = path / f'VOC{year}/JPEGImages/{id}.jpg' # old img path lb_path = (lbs_path / f.name).with_suffix('.txt') # new label path f.rename(imgs_path / f.name) # move image convert_label(path, lb_path, year, id) # convert labels to YOLO format ================================================ FILE: yolo-improve/yolov5-AUX/data/VisDrone.yaml 
================================================ # YOLOv5 🚀 by Ultralytics, GPL-3.0 license # VisDrone2019-DET dataset https://github.com/VisDrone/VisDrone-Dataset by Tianjin University # Example usage: python train.py --data VisDrone.yaml # parent # ├── yolov5 # └── datasets # └── VisDrone ← downloads here (2.3 GB) # Train/val/test sets as 1) dir: path/to/imgs, 2) file: path/to/imgs.txt, or 3) list: [path/to/imgs1, path/to/imgs2, ..] path: ../datasets/VisDrone # dataset root dir train: VisDrone2019-DET-train/images # train images (relative to 'path') 6471 images val: VisDrone2019-DET-val/images # val images (relative to 'path') 548 images test: VisDrone2019-DET-test-dev/images # test images (optional) 1610 images # Classes names: 0: pedestrian 1: people 2: bicycle 3: car 4: van 5: truck 6: tricycle 7: awning-tricycle 8: bus 9: motor # Download script/URL (optional) --------------------------------------------------------------------------------------- download: | from utils.general import download, os, Path def visdrone2yolo(dir): from PIL import Image from tqdm import tqdm def convert_box(size, box): # Convert VisDrone box to YOLO xywh box dw = 1. / size[0] dh = 1. 
/ size[1] return (box[0] + box[2] / 2) * dw, (box[1] + box[3] / 2) * dh, box[2] * dw, box[3] * dh (dir / 'labels').mkdir(parents=True, exist_ok=True) # make labels directory pbar = tqdm((dir / 'annotations').glob('*.txt'), desc=f'Converting {dir}') for f in pbar: img_size = Image.open((dir / 'images' / f.name).with_suffix('.jpg')).size lines = [] with open(f, 'r') as file: # read annotation.txt for row in [x.split(',') for x in file.read().strip().splitlines()]: if row[4] == '0': # VisDrone 'ignored regions' class 0 continue cls = int(row[5]) - 1 box = convert_box(img_size, tuple(map(int, row[:4]))) lines.append(f"{cls} {' '.join(f'{x:.6f}' for x in box)}\n") with open(str(f).replace(os.sep + 'annotations' + os.sep, os.sep + 'labels' + os.sep), 'w') as fl: fl.writelines(lines) # write label.txt # Download dir = Path(yaml['path']) # dataset root dir urls = ['https://github.com/ultralytics/yolov5/releases/download/v1.0/VisDrone2019-DET-train.zip', 'https://github.com/ultralytics/yolov5/releases/download/v1.0/VisDrone2019-DET-val.zip', 'https://github.com/ultralytics/yolov5/releases/download/v1.0/VisDrone2019-DET-test-dev.zip', 'https://github.com/ultralytics/yolov5/releases/download/v1.0/VisDrone2019-DET-test-challenge.zip'] download(urls, dir=dir, curl=True, threads=4) # Convert for d in 'VisDrone2019-DET-train', 'VisDrone2019-DET-val', 'VisDrone2019-DET-test-dev': visdrone2yolo(dir / d) # convert VisDrone annotations to YOLO labels ================================================ FILE: yolo-improve/yolov5-AUX/data/coco.yaml ================================================ # YOLOv5 🚀 by Ultralytics, GPL-3.0 license # COCO 2017 dataset http://cocodataset.org by Microsoft # Example usage: python train.py --data coco.yaml # parent # ├── yolov5 # └── datasets # └── coco ← downloads here (20.1 GB) # Train/val/test sets as 1) dir: path/to/imgs, 2) file: path/to/imgs.txt, or 3) list: [path/to/imgs1, path/to/imgs2, ..] 
path: ../datasets/coco # dataset root dir train: train2017.txt # train images (relative to 'path') 118287 images val: val2017.txt # val images (relative to 'path') 5000 images test: test-dev2017.txt # 20288 of 40670 images, submit to https://competitions.codalab.org/competitions/20794 # Classes names: 0: person 1: bicycle 2: car 3: motorcycle 4: airplane 5: bus 6: train 7: truck 8: boat 9: traffic light 10: fire hydrant 11: stop sign 12: parking meter 13: bench 14: bird 15: cat 16: dog 17: horse 18: sheep 19: cow 20: elephant 21: bear 22: zebra 23: giraffe 24: backpack 25: umbrella 26: handbag 27: tie 28: suitcase 29: frisbee 30: skis 31: snowboard 32: sports ball 33: kite 34: baseball bat 35: baseball glove 36: skateboard 37: surfboard 38: tennis racket 39: bottle 40: wine glass 41: cup 42: fork 43: knife 44: spoon 45: bowl 46: banana 47: apple 48: sandwich 49: orange 50: broccoli 51: carrot 52: hot dog 53: pizza 54: donut 55: cake 56: chair 57: couch 58: potted plant 59: bed 60: dining table 61: toilet 62: tv 63: laptop 64: mouse 65: remote 66: keyboard 67: cell phone 68: microwave 69: oven 70: toaster 71: sink 72: refrigerator 73: book 74: clock 75: vase 76: scissors 77: teddy bear 78: hair drier 79: toothbrush # Download script/URL (optional) download: | from utils.general import download, Path # Download labels segments = False # segment or box labels dir = Path(yaml['path']) # dataset root dir url = 'https://github.com/ultralytics/yolov5/releases/download/v1.0/' urls = [url + ('coco2017labels-segments.zip' if segments else 'coco2017labels.zip')] # labels download(urls, dir=dir.parent) # Download data urls = ['http://images.cocodataset.org/zips/train2017.zip', # 19G, 118k images 'http://images.cocodataset.org/zips/val2017.zip', # 1G, 5k images 'http://images.cocodataset.org/zips/test2017.zip'] # 7G, 41k images (optional) download(urls, dir=dir / 'images', threads=3) ================================================ FILE: 
yolo-improve/yolov5-AUX/data/coco128-seg.yaml ================================================ # YOLOv5 🚀 by Ultralytics, GPL-3.0 license # COCO128-seg dataset https://www.kaggle.com/ultralytics/coco128 (first 128 images from COCO train2017) by Ultralytics # Example usage: python train.py --data coco128-seg.yaml # parent # ├── yolov5 # └── datasets # └── coco128-seg ← downloads here (7 MB) # Train/val/test sets as 1) dir: path/to/imgs, 2) file: path/to/imgs.txt, or 3) list: [path/to/imgs1, path/to/imgs2, ..] path: ../datasets/coco128-seg # dataset root dir train: images/train2017 # train images (relative to 'path') 128 images val: images/train2017 # val images (relative to 'path') 128 images test: # test images (optional) # Classes names: 0: person 1: bicycle 2: car 3: motorcycle 4: airplane 5: bus 6: train 7: truck 8: boat 9: traffic light 10: fire hydrant 11: stop sign 12: parking meter 13: bench 14: bird 15: cat 16: dog 17: horse 18: sheep 19: cow 20: elephant 21: bear 22: zebra 23: giraffe 24: backpack 25: umbrella 26: handbag 27: tie 28: suitcase 29: frisbee 30: skis 31: snowboard 32: sports ball 33: kite 34: baseball bat 35: baseball glove 36: skateboard 37: surfboard 38: tennis racket 39: bottle 40: wine glass 41: cup 42: fork 43: knife 44: spoon 45: bowl 46: banana 47: apple 48: sandwich 49: orange 50: broccoli 51: carrot 52: hot dog 53: pizza 54: donut 55: cake 56: chair 57: couch 58: potted plant 59: bed 60: dining table 61: toilet 62: tv 63: laptop 64: mouse 65: remote 66: keyboard 67: cell phone 68: microwave 69: oven 70: toaster 71: sink 72: refrigerator 73: book 74: clock 75: vase 76: scissors 77: teddy bear 78: hair drier 79: toothbrush # Download script/URL (optional) download: https://ultralytics.com/assets/coco128-seg.zip ================================================ FILE: yolo-improve/yolov5-AUX/data/coco128.yaml ================================================ # YOLOv5 🚀 by Ultralytics, GPL-3.0 license # COCO128 dataset 
https://www.kaggle.com/ultralytics/coco128 (first 128 images from COCO train2017) by Ultralytics # Example usage: python train.py --data coco128.yaml # parent # ├── yolov5 # └── datasets # └── coco128 ← downloads here (7 MB) # Train/val/test sets as 1) dir: path/to/imgs, 2) file: path/to/imgs.txt, or 3) list: [path/to/imgs1, path/to/imgs2, ..] path: ../datasets/coco128 # dataset root dir train: images/train2017 # train images (relative to 'path') 128 images val: images/train2017 # val images (relative to 'path') 128 images test: # test images (optional) # Classes names: 0: person 1: bicycle 2: car 3: motorcycle 4: airplane 5: bus 6: train 7: truck 8: boat 9: traffic light 10: fire hydrant 11: stop sign 12: parking meter 13: bench 14: bird 15: cat 16: dog 17: horse 18: sheep 19: cow 20: elephant 21: bear 22: zebra 23: giraffe 24: backpack 25: umbrella 26: handbag 27: tie 28: suitcase 29: frisbee 30: skis 31: snowboard 32: sports ball 33: kite 34: baseball bat 35: baseball glove 36: skateboard 37: surfboard 38: tennis racket 39: bottle 40: wine glass 41: cup 42: fork 43: knife 44: spoon 45: bowl 46: banana 47: apple 48: sandwich 49: orange 50: broccoli 51: carrot 52: hot dog 53: pizza 54: donut 55: cake 56: chair 57: couch 58: potted plant 59: bed 60: dining table 61: toilet 62: tv 63: laptop 64: mouse 65: remote 66: keyboard 67: cell phone 68: microwave 69: oven 70: toaster 71: sink 72: refrigerator 73: book 74: clock 75: vase 76: scissors 77: teddy bear 78: hair drier 79: toothbrush # Download script/URL (optional) download: https://ultralytics.com/assets/coco128.zip ================================================ FILE: yolo-improve/yolov5-AUX/data/hyps/hyp.Objects365.yaml ================================================ # YOLOv5 🚀 by Ultralytics, GPL-3.0 license # Hyperparameters for Objects365 training # python train.py --weights yolov5m.pt --data Objects365.yaml --evolve # See Hyperparameter Evolution tutorial for details 
https://github.com/ultralytics/yolov5#tutorials lr0: 0.00258 lrf: 0.17 momentum: 0.779 weight_decay: 0.00058 warmup_epochs: 1.33 warmup_momentum: 0.86 warmup_bias_lr: 0.0711 box: 0.0539 cls: 0.299 cls_pw: 0.825 obj: 0.632 obj_pw: 1.0 iou_t: 0.2 anchor_t: 3.44 anchors: 3.2 fl_gamma: 0.0 hsv_h: 0.0188 hsv_s: 0.704 hsv_v: 0.36 degrees: 0.0 translate: 0.0902 scale: 0.491 shear: 0.0 perspective: 0.0 flipud: 0.0 fliplr: 0.5 mosaic: 1.0 mixup: 0.0 copy_paste: 0.0 ================================================ FILE: yolo-improve/yolov5-AUX/data/hyps/hyp.VOC.yaml ================================================ # YOLOv5 🚀 by Ultralytics, GPL-3.0 license # Hyperparameters for VOC training # python train.py --batch 128 --weights yolov5m6.pt --data VOC.yaml --epochs 50 --img 512 --hyp hyp.scratch-med.yaml --evolve # See Hyperparameter Evolution tutorial for details https://github.com/ultralytics/yolov5#tutorials # YOLOv5 Hyperparameter Evolution Results # Best generation: 467 # Last generation: 996 # metrics/precision, metrics/recall, metrics/mAP_0.5, metrics/mAP_0.5:0.95, val/box_loss, val/obj_loss, val/cls_loss # 0.87729, 0.85125, 0.91286, 0.72664, 0.0076739, 0.0042529, 0.0013865 lr0: 0.00334 lrf: 0.15135 momentum: 0.74832 weight_decay: 0.00025 warmup_epochs: 3.3835 warmup_momentum: 0.59462 warmup_bias_lr: 0.18657 box: 0.02 cls: 0.21638 cls_pw: 0.5 obj: 0.51728 obj_pw: 0.67198 iou_t: 0.2 anchor_t: 3.3744 fl_gamma: 0.0 hsv_h: 0.01041 hsv_s: 0.54703 hsv_v: 0.27739 degrees: 0.0 translate: 0.04591 scale: 0.75544 shear: 0.0 perspective: 0.0 flipud: 0.0 fliplr: 0.5 mosaic: 0.85834 mixup: 0.04266 copy_paste: 0.0 anchors: 3.412 ================================================ FILE: yolo-improve/yolov5-AUX/data/hyps/hyp.no-augmentation.yaml ================================================ # YOLOv5 🚀 by Ultralytics, GPL-3.0 license # Hyperparameters when using Albumentations frameworks # python train.py --hyp hyp.no-augmentation.yaml # See 
https://github.com/ultralytics/yolov5/pull/3882 for YOLOv5 + Albumentations Usage examples lr0: 0.01 # initial learning rate (SGD=1E-2, Adam=1E-3) lrf: 0.1 # final OneCycleLR learning rate (lr0 * lrf) momentum: 0.937 # SGD momentum/Adam beta1 weight_decay: 0.0005 # optimizer weight decay 5e-4 warmup_epochs: 3.0 # warmup epochs (fractions ok) warmup_momentum: 0.8 # warmup initial momentum warmup_bias_lr: 0.1 # warmup initial bias lr box: 0.05 # box loss gain cls: 0.3 # cls loss gain cls_pw: 1.0 # cls BCELoss positive_weight obj: 0.7 # obj loss gain (scale with pixels) obj_pw: 1.0 # obj BCELoss positive_weight iou_t: 0.20 # IoU training threshold anchor_t: 4.0 # anchor-multiple threshold # anchors: 3 # anchors per output layer (0 to ignore) # these parameters are all zero since we want to use the Albumentations framework fl_gamma: 0.0 # focal loss gamma (efficientDet default gamma=1.5) hsv_h: 0 # image HSV-Hue augmentation (fraction) hsv_s: 00 # image HSV-Saturation augmentation (fraction) hsv_v: 0 # image HSV-Value augmentation (fraction) degrees: 0.0 # image rotation (+/- deg) translate: 0 # image translation (+/- fraction) scale: 0 # image scale (+/- gain) shear: 0 # image shear (+/- deg) perspective: 0.0 # image perspective (+/- fraction), range 0-0.001 flipud: 0.0 # image flip up-down (probability) fliplr: 0.0 # image flip left-right (probability) mosaic: 0.0 # image mosaic (probability) mixup: 0.0 # image mixup (probability) copy_paste: 0.0 # segment copy-paste (probability) ================================================ FILE: yolo-improve/yolov5-AUX/data/hyps/hyp.scratch-high.yaml ================================================ # YOLOv5 🚀 by Ultralytics, GPL-3.0 license # Hyperparameters for high-augmentation COCO training from scratch # python train.py --batch 32 --cfg yolov5m6.yaml --weights '' --data coco.yaml --img 1280 --epochs 300 # See tutorials for hyperparameter evolution https://github.com/ultralytics/yolov5#tutorials lr0: 0.01 # initial learning rate 
(SGD=1E-2, Adam=1E-3) lrf: 0.1 # final OneCycleLR learning rate (lr0 * lrf) momentum: 0.937 # SGD momentum/Adam beta1 weight_decay: 0.0005 # optimizer weight decay 5e-4 warmup_epochs: 3.0 # warmup epochs (fractions ok) warmup_momentum: 0.8 # warmup initial momentum warmup_bias_lr: 0.1 # warmup initial bias lr box: 0.05 # box loss gain cls: 0.3 # cls loss gain cls_pw: 1.0 # cls BCELoss positive_weight obj: 0.7 # obj loss gain (scale with pixels) obj_pw: 1.0 # obj BCELoss positive_weight iou_t: 0.20 # IoU training threshold anchor_t: 4.0 # anchor-multiple threshold # anchors: 3 # anchors per output layer (0 to ignore) fl_gamma: 0.0 # focal loss gamma (efficientDet default gamma=1.5) hsv_h: 0.015 # image HSV-Hue augmentation (fraction) hsv_s: 0.7 # image HSV-Saturation augmentation (fraction) hsv_v: 0.4 # image HSV-Value augmentation (fraction) degrees: 0.0 # image rotation (+/- deg) translate: 0.1 # image translation (+/- fraction) scale: 0.9 # image scale (+/- gain) shear: 0.0 # image shear (+/- deg) perspective: 0.0 # image perspective (+/- fraction), range 0-0.001 flipud: 0.0 # image flip up-down (probability) fliplr: 0.5 # image flip left-right (probability) mosaic: 1.0 # image mosaic (probability) mixup: 0.1 # image mixup (probability) copy_paste: 0.1 # segment copy-paste (probability) ================================================ FILE: yolo-improve/yolov5-AUX/data/hyps/hyp.scratch-low.yaml ================================================ # YOLOv5 🚀 by Ultralytics, GPL-3.0 license # Hyperparameters for low-augmentation COCO training from scratch # python train.py --batch 64 --cfg yolov5n6.yaml --weights '' --data coco.yaml --img 640 --epochs 300 --linear # See tutorials for hyperparameter evolution https://github.com/ultralytics/yolov5#tutorials lr0: 0.01 # initial learning rate (SGD=1E-2, Adam=1E-3) lrf: 0.01 # final OneCycleLR learning rate (lr0 * lrf) momentum: 0.937 # SGD momentum/Adam beta1 weight_decay: 0.0005 # optimizer weight decay 5e-4 
warmup_epochs: 3.0 # warmup epochs (fractions ok) warmup_momentum: 0.8 # warmup initial momentum warmup_bias_lr: 0.1 # warmup initial bias lr box: 0.05 # box loss gain cls: 0.5 # cls loss gain cls_pw: 1.0 # cls BCELoss positive_weight obj: 1.0 # obj loss gain (scale with pixels) obj_pw: 1.0 # obj BCELoss positive_weight iou_t: 0.20 # IoU training threshold anchor_t: 4.0 # anchor-multiple threshold # anchors: 3 # anchors per output layer (0 to ignore) fl_gamma: 0.0 # focal loss gamma (efficientDet default gamma=1.5) hsv_h: 0.015 # image HSV-Hue augmentation (fraction) hsv_s: 0.7 # image HSV-Saturation augmentation (fraction) hsv_v: 0.4 # image HSV-Value augmentation (fraction) degrees: 0.0 # image rotation (+/- deg) translate: 0.1 # image translation (+/- fraction) scale: 0.5 # image scale (+/- gain) shear: 0.0 # image shear (+/- deg) perspective: 0.0 # image perspective (+/- fraction), range 0-0.001 flipud: 0.0 # image flip up-down (probability) fliplr: 0.5 # image flip left-right (probability) mosaic: 1.0 # image mosaic (probability) mixup: 0.0 # image mixup (probability) copy_paste: 0.0 # segment copy-paste (probability) ================================================ FILE: yolo-improve/yolov5-AUX/data/hyps/hyp.scratch-med.yaml ================================================ # YOLOv5 🚀 by Ultralytics, GPL-3.0 license # Hyperparameters for medium-augmentation COCO training from scratch # python train.py --batch 32 --cfg yolov5m6.yaml --weights '' --data coco.yaml --img 1280 --epochs 300 # See tutorials for hyperparameter evolution https://github.com/ultralytics/yolov5#tutorials lr0: 0.01 # initial learning rate (SGD=1E-2, Adam=1E-3) lrf: 0.1 # final OneCycleLR learning rate (lr0 * lrf) momentum: 0.937 # SGD momentum/Adam beta1 weight_decay: 0.0005 # optimizer weight decay 5e-4 warmup_epochs: 3.0 # warmup epochs (fractions ok) warmup_momentum: 0.8 # warmup initial momentum warmup_bias_lr: 0.1 # warmup initial bias lr box: 0.05 # box loss gain cls: 0.3 # cls loss 
gain cls_pw: 1.0 # cls BCELoss positive_weight obj: 0.7 # obj loss gain (scale with pixels) obj_pw: 1.0 # obj BCELoss positive_weight iou_t: 0.20 # IoU training threshold anchor_t: 4.0 # anchor-multiple threshold # anchors: 3 # anchors per output layer (0 to ignore) fl_gamma: 0.0 # focal loss gamma (efficientDet default gamma=1.5) hsv_h: 0.015 # image HSV-Hue augmentation (fraction) hsv_s: 0.7 # image HSV-Saturation augmentation (fraction) hsv_v: 0.4 # image HSV-Value augmentation (fraction) degrees: 0.0 # image rotation (+/- deg) translate: 0.1 # image translation (+/- fraction) scale: 0.9 # image scale (+/- gain) shear: 0.0 # image shear (+/- deg) perspective: 0.0 # image perspective (+/- fraction), range 0-0.001 flipud: 0.0 # image flip up-down (probability) fliplr: 0.5 # image flip left-right (probability) mosaic: 1.0 # image mosaic (probability) mixup: 0.1 # image mixup (probability) copy_paste: 0.0 # segment copy-paste (probability) ================================================ FILE: yolo-improve/yolov5-AUX/data/scripts/download_weights.sh ================================================ #!/bin/bash # YOLOv5 🚀 by Ultralytics, GPL-3.0 license # Download latest models from https://github.com/ultralytics/yolov5/releases # Example usage: bash data/scripts/download_weights.sh # parent # └── yolov5 # ├── yolov5s.pt ← downloads here # ├── yolov5m.pt # └── ... 
python - <= cls >= 0, f'incorrect class index {cls}' # Write YOLO label if id not in shapes: shapes[id] = Image.open(file).size box = xyxy2xywhn(box[None].astype(np.float), w=shapes[id][0], h=shapes[id][1], clip=True) with open((labels / id).with_suffix('.txt'), 'a') as f: f.write(f"{cls} {' '.join(f'{x:.6f}' for x in box[0])}\n") # write label.txt except Exception as e: print(f'WARNING: skipping one label for {file}: {e}') # Download manually from https://challenge.xviewdataset.org dir = Path(yaml['path']) # dataset root dir # urls = ['https://d307kc0mrhucc3.cloudfront.net/train_labels.zip', # train labels # 'https://d307kc0mrhucc3.cloudfront.net/train_images.zip', # 15G, 847 train images # 'https://d307kc0mrhucc3.cloudfront.net/val_images.zip'] # 5G, 282 val images (no labels) # download(urls, dir=dir, delete=False) # Convert labels convert_labels(dir / 'xView_train.geojson') # Move images images = Path(dir / 'images') images.mkdir(parents=True, exist_ok=True) Path(dir / 'train_images').rename(dir / 'images' / 'train') Path(dir / 'val_images').rename(dir / 'images' / 'val') # Split autosplit(dir / 'images' / 'train') ================================================ FILE: yolo-improve/yolov5-AUX/detect.py ================================================ # YOLOv5 🚀 by Ultralytics, GPL-3.0 license """ Run YOLOv5 detection inference on images, videos, directories, globs, YouTube, webcam, streams, etc. 
Usage - sources: $ python detect.py --weights yolov5s.pt --source 0 # webcam img.jpg # image vid.mp4 # video screen # screenshot path/ # directory list.txt # list of images list.streams # list of streams 'path/*.jpg' # glob 'https://youtu.be/Zgi9g1ksQHc' # YouTube 'rtsp://example.com/media.mp4' # RTSP, RTMP, HTTP stream Usage - formats: $ python detect.py --weights yolov5s.pt # PyTorch yolov5s.torchscript # TorchScript yolov5s.onnx # ONNX Runtime or OpenCV DNN with --dnn yolov5s_openvino_model # OpenVINO yolov5s.engine # TensorRT yolov5s.mlmodel # CoreML (macOS-only) yolov5s_saved_model # TensorFlow SavedModel yolov5s.pb # TensorFlow GraphDef yolov5s.tflite # TensorFlow Lite yolov5s_edgetpu.tflite # TensorFlow Edge TPU yolov5s_paddle_model # PaddlePaddle """ import argparse import os import platform import sys from pathlib import Path import torch FILE = Path(__file__).resolve() ROOT = FILE.parents[0] # YOLOv5 root directory if str(ROOT) not in sys.path: sys.path.append(str(ROOT)) # add ROOT to PATH ROOT = Path(os.path.relpath(ROOT, Path.cwd())) # relative from models.common import DetectMultiBackend from utils.dataloaders import IMG_FORMATS, VID_FORMATS, LoadImages, LoadScreenshots, LoadStreams from utils.general import (LOGGER, Profile, check_file, check_img_size, check_imshow, check_requirements, colorstr, cv2, increment_path, non_max_suppression, print_args, scale_boxes, strip_optimizer, xyxy2xywh) from utils.plots import Annotator, colors, save_one_box from utils.torch_utils import select_device, smart_inference_mode @smart_inference_mode() def run( weights=ROOT / 'yolov5s.pt', # model path or triton URL source=ROOT / 'data/images', # file/dir/URL/glob/screen/0(webcam) data=ROOT / 'data/coco128.yaml', # dataset.yaml path imgsz=(640, 640), # inference size (height, width) conf_thres=0.25, # confidence threshold iou_thres=0.45, # NMS IOU threshold max_det=1000, # maximum detections per image device='', # cuda device, i.e. 
0 or 0,1,2,3 or cpu view_img=False, # show results save_txt=False, # save results to *.txt save_conf=False, # save confidences in --save-txt labels save_crop=False, # save cropped prediction boxes nosave=False, # do not save images/videos classes=None, # filter by class: --class 0, or --class 0 2 3 agnostic_nms=False, # class-agnostic NMS augment=False, # augmented inference visualize=False, # visualize features update=False, # update all models project=ROOT / 'runs/detect', # save results to project/name name='exp', # save results to project/name exist_ok=False, # existing project/name ok, do not increment line_thickness=3, # bounding box thickness (pixels) hide_labels=False, # hide labels hide_conf=False, # hide confidences half=False, # use FP16 half-precision inference dnn=False, # use OpenCV DNN for ONNX inference vid_stride=1, # video frame-rate stride ): source = str(source) save_img = not nosave and not source.endswith('.txt') # save inference images is_file = Path(source).suffix[1:] in (IMG_FORMATS + VID_FORMATS) is_url = source.lower().startswith(('rtsp://', 'rtmp://', 'http://', 'https://')) webcam = source.isnumeric() or source.endswith('.streams') or (is_url and not is_file) screenshot = source.lower().startswith('screen') if is_url and is_file: source = check_file(source) # download # Directories save_dir = increment_path(Path(project) / name, exist_ok=exist_ok) # increment run (save_dir / 'labels' if save_txt else save_dir).mkdir(parents=True, exist_ok=True) # make dir # Load model device = select_device(device) model = DetectMultiBackend(weights, device=device, dnn=dnn, data=data, fp16=half) stride, names, pt = model.stride, model.names, model.pt imgsz = check_img_size(imgsz, s=stride) # check image size # Dataloader bs = 1 # batch_size if webcam: view_img = check_imshow(warn=True) dataset = LoadStreams(source, img_size=imgsz, stride=stride, auto=pt, vid_stride=vid_stride) bs = len(dataset) elif screenshot: dataset = LoadScreenshots(source, 
img_size=imgsz, stride=stride, auto=pt) else: dataset = LoadImages(source, img_size=imgsz, stride=stride, auto=pt, vid_stride=vid_stride) vid_path, vid_writer = [None] * bs, [None] * bs # Run inference model.warmup(imgsz=(1 if pt or model.triton else bs, 3, *imgsz)) # warmup seen, windows, dt = 0, [], (Profile(), Profile(), Profile()) for path, im, im0s, vid_cap, s in dataset: with dt[0]: im = torch.from_numpy(im).to(model.device) im = im.half() if model.fp16 else im.float() # uint8 to fp16/32 im /= 255 # 0 - 255 to 0.0 - 1.0 if len(im.shape) == 3: im = im[None] # expand for batch dim # Inference with dt[1]: visualize = increment_path(save_dir / Path(path).stem, mkdir=True) if visualize else False pred = model(im, augment=augment, visualize=visualize) # NMS with dt[2]: pred = non_max_suppression(pred, conf_thres, iou_thres, classes, agnostic_nms, max_det=max_det) # Second-stage classifier (optional) # pred = utils.general.apply_classifier(pred, classifier_model, im, im0s) # Process predictions for i, det in enumerate(pred): # per image seen += 1 if webcam: # batch_size >= 1 p, im0, frame = path[i], im0s[i].copy(), dataset.count s += f'{i}: ' else: p, im0, frame = path, im0s.copy(), getattr(dataset, 'frame', 0) p = Path(p) # to Path save_path = str(save_dir / p.name) # im.jpg txt_path = str(save_dir / 'labels' / p.stem) + ('' if dataset.mode == 'image' else f'_{frame}') # im.txt s += '%gx%g ' % im.shape[2:] # print string gn = torch.tensor(im0.shape)[[1, 0, 1, 0]] # normalization gain whwh imc = im0.copy() if save_crop else im0 # for save_crop annotator = Annotator(im0, line_width=line_thickness, example=str(names)) if len(det): # Rescale boxes from img_size to im0 size det[:, :4] = scale_boxes(im.shape[2:], det[:, :4], im0.shape).round() # Print results for c in det[:, 5].unique(): n = (det[:, 5] == c).sum() # detections per class s += f"{n} {names[int(c)]}{'s' * (n > 1)}, " # add to string # Write results for *xyxy, conf, cls in reversed(det): if save_txt: # Write 
to file xywh = (xyxy2xywh(torch.tensor(xyxy).view(1, 4)) / gn).view(-1).tolist() # normalized xywh line = (cls, *xywh, conf) if save_conf else (cls, *xywh) # label format with open(f'{txt_path}.txt', 'a') as f: f.write(('%g ' * len(line)).rstrip() % line + '\n') if save_img or save_crop or view_img: # Add bbox to image c = int(cls) # integer class label = None if hide_labels else (names[c] if hide_conf else f'{names[c]} {conf:.2f}') annotator.box_label(xyxy, label, color=colors(c, True)) if save_crop: save_one_box(xyxy, imc, file=save_dir / 'crops' / names[c] / f'{p.stem}.jpg', BGR=True) # Stream results im0 = annotator.result() if view_img: if platform.system() == 'Linux' and p not in windows: windows.append(p) cv2.namedWindow(str(p), cv2.WINDOW_NORMAL | cv2.WINDOW_KEEPRATIO) # allow window resize (Linux) cv2.resizeWindow(str(p), im0.shape[1], im0.shape[0]) cv2.imshow(str(p), im0) cv2.waitKey(1) # 1 millisecond # Save results (image with detections) if save_img: if dataset.mode == 'image': cv2.imwrite(save_path, im0) else: # 'video' or 'stream' if vid_path[i] != save_path: # new video vid_path[i] = save_path if isinstance(vid_writer[i], cv2.VideoWriter): vid_writer[i].release() # release previous video writer if vid_cap: # video fps = vid_cap.get(cv2.CAP_PROP_FPS) w = int(vid_cap.get(cv2.CAP_PROP_FRAME_WIDTH)) h = int(vid_cap.get(cv2.CAP_PROP_FRAME_HEIGHT)) else: # stream fps, w, h = 30, im0.shape[1], im0.shape[0] save_path = str(Path(save_path).with_suffix('.mp4')) # force *.mp4 suffix on results videos vid_writer[i] = cv2.VideoWriter(save_path, cv2.VideoWriter_fourcc(*'mp4v'), fps, (w, h)) vid_writer[i].write(im0) # Print time (inference-only) LOGGER.info(f"{s}{'' if len(det) else '(no detections), '}{dt[1].dt * 1E3:.1f}ms") # Print results t = tuple(x.t / seen * 1E3 for x in dt) # speeds per image LOGGER.info(f'Speed: %.1fms pre-process, %.1fms inference, %.1fms NMS per image at shape {(1, 3, *imgsz)}' % t) if save_txt or save_img: s = 
f"\n{len(list(save_dir.glob('labels/*.txt')))} labels saved to {save_dir / 'labels'}" if save_txt else '' LOGGER.info(f"Results saved to {colorstr('bold', save_dir)}{s}") if update: strip_optimizer(weights[0]) # update model (to fix SourceChangeWarning) def parse_opt(): parser = argparse.ArgumentParser() parser.add_argument('--weights', nargs='+', type=str, default=ROOT / 'yolov5s.pt', help='model path or triton URL') parser.add_argument('--source', type=str, default=ROOT / 'data/images', help='file/dir/URL/glob/screen/0(webcam)') parser.add_argument('--data', type=str, default=ROOT / 'data/coco128.yaml', help='(optional) dataset.yaml path') parser.add_argument('--imgsz', '--img', '--img-size', nargs='+', type=int, default=[640], help='inference size h,w') parser.add_argument('--conf-thres', type=float, default=0.25, help='confidence threshold') parser.add_argument('--iou-thres', type=float, default=0.45, help='NMS IoU threshold') parser.add_argument('--max-det', type=int, default=1000, help='maximum detections per image') parser.add_argument('--device', default='', help='cuda device, i.e. 
0 or 0,1,2,3 or cpu') parser.add_argument('--view-img', action='store_true', help='show results') parser.add_argument('--save-txt', action='store_true', help='save results to *.txt') parser.add_argument('--save-conf', action='store_true', help='save confidences in --save-txt labels') parser.add_argument('--save-crop', action='store_true', help='save cropped prediction boxes') parser.add_argument('--nosave', action='store_true', help='do not save images/videos') parser.add_argument('--classes', nargs='+', type=int, help='filter by class: --classes 0, or --classes 0 2 3') parser.add_argument('--agnostic-nms', action='store_true', help='class-agnostic NMS') parser.add_argument('--augment', action='store_true', help='augmented inference') parser.add_argument('--visualize', action='store_true', help='visualize features') parser.add_argument('--update', action='store_true', help='update all models') parser.add_argument('--project', default=ROOT / 'runs/detect', help='save results to project/name') parser.add_argument('--name', default='exp', help='save results to project/name') parser.add_argument('--exist-ok', action='store_true', help='existing project/name ok, do not increment') parser.add_argument('--line-thickness', default=3, type=int, help='bounding box thickness (pixels)') parser.add_argument('--hide-labels', default=False, action='store_true', help='hide labels') parser.add_argument('--hide-conf', default=False, action='store_true', help='hide confidences') parser.add_argument('--half', action='store_true', help='use FP16 half-precision inference') parser.add_argument('--dnn', action='store_true', help='use OpenCV DNN for ONNX inference') parser.add_argument('--vid-stride', type=int, default=1, help='video frame-rate stride') opt = parser.parse_args() opt.imgsz *= 2 if len(opt.imgsz) == 1 else 1 # expand print_args(vars(opt)) return opt def main(opt): check_requirements(exclude=('tensorboard', 'thop')) run(**vars(opt)) if __name__ == '__main__': opt = 
parse_opt() main(opt) ================================================ FILE: yolo-improve/yolov5-AUX/export.py ================================================ # YOLOv5 🚀 by Ultralytics, GPL-3.0 license """ Export a YOLOv5 PyTorch model to other formats. TensorFlow exports authored by https://github.com/zldrobit Format | `export.py --include` | Model --- | --- | --- PyTorch | - | yolov5s.pt TorchScript | `torchscript` | yolov5s.torchscript ONNX | `onnx` | yolov5s.onnx OpenVINO | `openvino` | yolov5s_openvino_model/ TensorRT | `engine` | yolov5s.engine CoreML | `coreml` | yolov5s.mlmodel TensorFlow SavedModel | `saved_model` | yolov5s_saved_model/ TensorFlow GraphDef | `pb` | yolov5s.pb TensorFlow Lite | `tflite` | yolov5s.tflite TensorFlow Edge TPU | `edgetpu` | yolov5s_edgetpu.tflite TensorFlow.js | `tfjs` | yolov5s_web_model/ PaddlePaddle | `paddle` | yolov5s_paddle_model/ Requirements: $ pip install -r requirements.txt coremltools onnx onnx-simplifier onnxruntime openvino-dev tensorflow-cpu # CPU $ pip install -r requirements.txt coremltools onnx onnx-simplifier onnxruntime-gpu openvino-dev tensorflow # GPU Usage: $ python export.py --weights yolov5s.pt --include torchscript onnx openvino engine coreml tflite ... Inference: $ python detect.py --weights yolov5s.pt # PyTorch yolov5s.torchscript # TorchScript yolov5s.onnx # ONNX Runtime or OpenCV DNN with --dnn yolov5s_openvino_model # OpenVINO yolov5s.engine # TensorRT yolov5s.mlmodel # CoreML (macOS-only) yolov5s_saved_model # TensorFlow SavedModel yolov5s.pb # TensorFlow GraphDef yolov5s.tflite # TensorFlow Lite yolov5s_edgetpu.tflite # TensorFlow Edge TPU yolov5s_paddle_model # PaddlePaddle TensorFlow.js: $ cd .. 
&& git clone https://github.com/zldrobit/tfjs-yolov5-example.git && cd tfjs-yolov5-example $ npm install $ ln -s ../../yolov5/yolov5s_web_model public/yolov5s_web_model $ npm start """ import argparse import contextlib import json import os import platform import re import subprocess import sys import time import warnings from pathlib import Path import pandas as pd import torch from torch.utils.mobile_optimizer import optimize_for_mobile FILE = Path(__file__).resolve() ROOT = FILE.parents[0] # YOLOv5 root directory if str(ROOT) not in sys.path: sys.path.append(str(ROOT)) # add ROOT to PATH if platform.system() != 'Windows': ROOT = Path(os.path.relpath(ROOT, Path.cwd())) # relative from models.experimental import attempt_load from models.yolo import ClassificationModel, Detect, DetectionModel, SegmentationModel from utils.dataloaders import LoadImages from utils.general import (LOGGER, Profile, check_dataset, check_img_size, check_requirements, check_version, check_yaml, colorstr, file_size, get_default_args, print_args, url2file, yaml_save) from utils.torch_utils import select_device, smart_inference_mode MACOS = platform.system() == 'Darwin' # macOS environment def export_formats(): # YOLOv5 export formats x = [ ['PyTorch', '-', '.pt', True, True], ['TorchScript', 'torchscript', '.torchscript', True, True], ['ONNX', 'onnx', '.onnx', True, True], ['OpenVINO', 'openvino', '_openvino_model', True, False], ['TensorRT', 'engine', '.engine', False, True], ['CoreML', 'coreml', '.mlmodel', True, False], ['TensorFlow SavedModel', 'saved_model', '_saved_model', True, True], ['TensorFlow GraphDef', 'pb', '.pb', True, True], ['TensorFlow Lite', 'tflite', '.tflite', True, False], ['TensorFlow Edge TPU', 'edgetpu', '_edgetpu.tflite', False, False], ['TensorFlow.js', 'tfjs', '_web_model', False, False], ['PaddlePaddle', 'paddle', '_paddle_model', True, True],] return pd.DataFrame(x, columns=['Format', 'Argument', 'Suffix', 'CPU', 'GPU']) def try_export(inner_func): # YOLOv5 
@try_export
def export_torchscript(model, im, file, optimize, prefix=colorstr('TorchScript:')):
    """Trace ``model`` with ``im`` and save it as a ``.torchscript`` file next to ``file``.

    The input shape, maximum model stride and class names are stored as JSON in the
    ``config.txt`` extra file. With ``optimize`` set, the traced module is passed through
    ``optimize_for_mobile`` and saved for the lite interpreter instead.

    Returns:
        (path of the exported file, None)
    """
    LOGGER.info(f'\n{prefix} starting export with torch {torch.__version__}...')
    target = file.with_suffix('.torchscript')

    traced = torch.jit.trace(model, im, strict=False)
    config = {'shape': im.shape, 'stride': int(max(model.stride)), 'names': model.names}
    extra_files = {'config.txt': json.dumps(config)}  # torch._C.ExtraFilesMap()
    if not optimize:
        traced.save(str(target), _extra_files=extra_files)
        return target, None
    # https://pytorch.org/tutorials/recipes/mobile_interpreter.html
    optimize_for_mobile(traced)._save_for_lite_interpreter(str(target), _extra_files=extra_files)
    return target, None
@try_export
def export_openvino(file, metadata, half, prefix=colorstr('OpenVINO:')):
    """Convert the ONNX file that sits next to ``file`` into an OpenVINO model directory.

    Runs the ``mo`` command-line tool on ``<file>.onnx`` and then writes ``metadata``
    as a YAML file inside the output directory.

    Returns:
        (output directory as a string, None)
    """
    check_requirements('openvino-dev')  # requires openvino-dev: https://pypi.org/project/openvino-dev/
    import openvino.inference_engine as ie

    LOGGER.info(f'\n{prefix} starting export with openvino {ie.__version__}...')
    out_dir = str(file).replace('.pt', f'_openvino_model{os.sep}')
    onnx_path = str(file.with_suffix('.onnx'))
    precision = 'FP16' if half else 'FP32'

    command = ['mo', '--input_model', onnx_path, '--output_dir', out_dir, '--data_type', precision]
    subprocess.run(command, check=True, env=os.environ)  # export
    yaml_save(Path(out_dir) / file.with_suffix('.yaml').name, metadata)  # add metadata.yaml
    return out_dir, None
@try_export
def export_engine(model, im, file, half, dynamic, simplify, workspace=4, verbose=False, prefix=colorstr('TensorRT:')):
    """Export a YOLOv5 model to a serialized TensorRT engine (``.engine``) via ONNX.

    YOLOv5 TensorRT export https://developer.nvidia.com/tensorrt

    Arguments:
        model: PyTorch model to export
        im: example input tensor; must live on a non-CPU device
        file: weights path; the ``.onnx`` and ``.engine`` files are written next to it
        half: request an FP16 engine when the platform reports fast FP16 support
        dynamic: add an optimization profile with batch sizes from 1 up to ``im.shape[0]``
        simplify: forwarded to the ONNX export
        workspace: builder workspace size in GiB
        verbose: use VERBOSE severity for the TensorRT logger
        prefix: log prefix

    Returns:
        (path of the exported engine, None)

    Raises:
        AssertionError: if ``im`` is on CPU or the intermediate ONNX file is missing
        RuntimeError: if TensorRT cannot parse the ONNX file
    """
    assert im.device.type != 'cpu', 'export running on CPU but must be on GPU, i.e. `python export.py --device 0`'
    try:
        import tensorrt as trt
    except Exception:
        if platform.system() == 'Linux':
            check_requirements('nvidia-tensorrt', cmds='-U --index-url https://pypi.ngc.nvidia.com')
        import tensorrt as trt

    trt_major = int(trt.__version__.split('.')[0])
    is_trt10 = trt_major >= 10  # TensorRT 10 dropped max_workspace_size and build_engine

    if trt_major == 7:  # TensorRT 7 handling https://github.com/ultralytics/yolov5/issues/6012
        head = model.model[-1]
        grid = head.anchor_grid
        head.anchor_grid = [a[..., :1, :1, :] for a in grid]
        try:
            export_onnx(model, im, file, 12, dynamic, simplify)  # opset 12
        finally:
            head.anchor_grid = grid  # always undo the temporary patch
    else:  # TensorRT >= 8
        check_version(trt.__version__, '8.0.0', hard=True)  # require tensorrt>=8.0.0
        export_onnx(model, im, file, 12, dynamic, simplify)  # opset 12
    onnx = file.with_suffix('.onnx')

    LOGGER.info(f'\n{prefix} starting export with TensorRT {trt.__version__}...')
    assert onnx.exists(), f'failed to export ONNX file: {onnx}'
    f = file.with_suffix('.engine')  # TensorRT engine file
    logger = trt.Logger(trt.Logger.INFO)
    if verbose:
        logger.min_severity = trt.Logger.Severity.VERBOSE

    builder = trt.Builder(logger)
    config = builder.create_builder_config()
    workspace_bytes = workspace << 30  # GiB -> bytes
    if is_trt10:
        config.set_memory_pool_limit(trt.MemoryPoolType.WORKSPACE, workspace_bytes)
    else:  # TensorRT 7 and 8
        config.max_workspace_size = workspace_bytes

    flag = (1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH))
    network = builder.create_network(flag)
    parser = trt.OnnxParser(network, logger)
    if not parser.parse_from_file(str(onnx)):
        raise RuntimeError(f'failed to load ONNX file: {onnx}')

    inputs = [network.get_input(i) for i in range(network.num_inputs)]
    outputs = [network.get_output(i) for i in range(network.num_outputs)]
    for inp in inputs:
        LOGGER.info(f'{prefix} input "{inp.name}" with shape{inp.shape} {inp.dtype}')
    for out in outputs:
        LOGGER.info(f'{prefix} output "{out.name}" with shape{out.shape} {out.dtype}')

    if dynamic:
        if im.shape[0] <= 1:
            LOGGER.warning(f'{prefix} WARNING ⚠️ --dynamic model requires maximum --batch-size argument')
        profile = builder.create_optimization_profile()
        for inp in inputs:
            # (min, opt, max) shapes: batch 1, half of the requested batch, the requested batch
            profile.set_shape(inp.name, (1, *im.shape[1:]), (max(1, im.shape[0] // 2), *im.shape[1:]), im.shape)
        config.add_optimization_profile(profile)

    use_fp16 = builder.platform_has_fast_fp16 and half
    LOGGER.info(f'{prefix} building FP{16 if use_fp16 else 32} engine as {f}')
    if use_fp16:
        config.set_flag(trt.BuilderFlag.FP16)

    # TensorRT 10 returns the serialized engine directly; older versions return an engine to serialize
    build = builder.build_serialized_network if is_trt10 else builder.build_engine
    with build(network, config) as engine, open(f, 'wb') as t:
        t.write(engine if is_trt10 else engine.serialize())
    return f, None
@try_export
def export_pb(keras_model, file, prefix=colorstr('TensorFlow GraphDef:')):
    """Freeze ``keras_model`` and write it as a binary GraphDef ``*.pb`` file next to ``file``.

    YOLOv5 TensorFlow GraphDef *.pb export https://github.com/leimao/Frozen_Graph_TensorFlow

    Returns:
        (path of the exported ``.pb`` file, None)
    """
    import tensorflow as tf
    from tensorflow.python.framework.convert_to_constants import convert_variables_to_constants_v2

    LOGGER.info(f'\n{prefix} starting export with tensorflow {tf.__version__}...')
    pb_path = file.with_suffix('.pb')

    model_input = keras_model.inputs[0]
    input_spec = tf.TensorSpec(model_input.shape, model_input.dtype)
    concrete = tf.function(lambda x: keras_model(x)).get_concrete_function(input_spec)  # full model
    frozen_func = convert_variables_to_constants_v2(concrete)
    frozen_func.graph.as_graph_def()  # call kept as in the original; its result is not used
    tf.io.write_graph(graph_or_graph_def=frozen_func.graph,
                      logdir=str(pb_path.parent),
                      name=pb_path.name,
                      as_text=False)
    return pb_path, None
@try_export
def export_edgetpu(file, prefix=colorstr('Edge TPU:')):
    """Compile the ``-int8.tflite`` model next to ``file`` for the Edge TPU.

    YOLOv5 Edge TPU export https://coral.ai/docs/edgetpu/models-intro/
    If ``edgetpu_compiler`` is not available, the function tries to install it with apt
    (dropping ``sudo`` from the commands when sudo is not installed).

    Returns:
        (path of the ``-int8_edgetpu.tflite`` model as a string, None)

    Raises:
        AssertionError: when not running on Linux
    """
    version_cmd = 'edgetpu_compiler --version'
    help_url = 'https://coral.ai/docs/edgetpu/compiler/'
    assert platform.system() == 'Linux', f'export only supported on Linux. See {help_url}'

    compiler_missing = subprocess.run(f'{version_cmd} >/dev/null', shell=True).returncode != 0
    if compiler_missing:
        LOGGER.info(f'\n{prefix} export requires Edge TPU compiler. Attempting install from {help_url}')
        has_sudo = subprocess.run('sudo --version >/dev/null', shell=True).returncode == 0  # sudo installed on system
        install_steps = (
            'curl https://packages.cloud.google.com/apt/doc/apt-key.gpg | sudo apt-key add -',
            'echo "deb https://packages.cloud.google.com/apt coral-edgetpu-stable main" | sudo tee /etc/apt/sources.list.d/coral-edgetpu.list',
            'sudo apt-get update',
            'sudo apt-get install edgetpu-compiler',
        )
        for step in install_steps:
            if not has_sudo:
                step = step.replace('sudo ', '')
            subprocess.run(step, shell=True, check=True)

    version_run = subprocess.run(version_cmd, shell=True, capture_output=True, check=True)
    ver = version_run.stdout.decode().split()[-1]

    LOGGER.info(f'\n{prefix} starting export with Edge TPU compiler {ver}...')
    edgetpu_model = str(file).replace('.pt', '-int8_edgetpu.tflite')  # Edge TPU model
    tflite_model = str(file).replace('.pt', '-int8.tflite')  # TFLite model

    compile_cmd = ['edgetpu_compiler', '-s', '-d', '-k', '10', '--out_dir', str(file.parent), tflite_model]
    subprocess.run(compile_cmd, check=True)
    return edgetpu_model, None
def add_tflite_metadata(file, metadata, num_outputs):
    """Attach ``metadata`` to a ``*.tflite`` model as an associated ``meta.txt`` file.

    Add metadata to *.tflite models per https://www.tensorflow.org/lite/models/convert/metadata

    Best effort: if ``tflite_support`` cannot be imported the function silently does
    nothing. Nothing is done either when ``file`` is empty or None, i.e. when no TFLite
    model was produced.

    Arguments:
        file: path of the ``*.tflite`` model to update in place
        metadata: object whose ``str()`` is written into the associated file
        num_outputs: number of output tensor metadata entries to declare
    """
    import tempfile

    if not file:  # no exported model to annotate
        return

    with contextlib.suppress(ImportError):
        # check_requirements('tflite_support')
        from tflite_support import flatbuffers
        from tflite_support import metadata as _metadata
        from tflite_support import metadata_schema_py_generated as _metadata_fb

        # A private temporary directory replaces the hard-coded '/tmp/meta.txt': it works on
        # platforms without /tmp, cannot collide with another export, and is removed even
        # if populating the model fails.
        with tempfile.TemporaryDirectory() as tmp_dir:
            tmp_file = Path(tmp_dir) / 'meta.txt'
            tmp_file.write_text(str(metadata))

            model_meta = _metadata_fb.ModelMetadataT()
            label_file = _metadata_fb.AssociatedFileT()
            label_file.name = tmp_file.name
            model_meta.associatedFiles = [label_file]

            subgraph = _metadata_fb.SubGraphMetadataT()
            subgraph.inputTensorMetadata = [_metadata_fb.TensorMetadataT()]
            subgraph.outputTensorMetadata = [_metadata_fb.TensorMetadataT()] * num_outputs
            model_meta.subgraphMetadata = [subgraph]

            b = flatbuffers.Builder(0)
            b.Finish(model_meta.Pack(b), _metadata.MetadataPopulator.METADATA_FILE_IDENTIFIER)
            metadata_buf = b.Output()

            populator = _metadata.MetadataPopulator.with_model_file(file)
            populator.load_metadata_buffer(metadata_buf)
            populator.load_associated_files([str(tmp_file)])
            populator.populate()
use either --half or --dynamic but not both' model = attempt_load(weights, device=device, inplace=True, fuse=True) # load FP32 model # Checks imgsz *= 2 if len(imgsz) == 1 else 1 # expand if optimize: assert device.type == 'cpu', '--optimize not compatible with cuda devices, i.e. use --device cpu' # Input gs = int(max(model.stride)) # grid size (max stride) imgsz = [check_img_size(x, gs) for x in imgsz] # verify img_size are gs-multiples im = torch.zeros(batch_size, 3, *imgsz).to(device) # image size(1,3,320,192) BCHW iDetection # Update model model.eval() for k, m in model.named_modules(): if isinstance(m, Detect): m.inplace = inplace m.dynamic = dynamic m.export = True for _ in range(2): y = model(im) # dry runs if half and not coreml: im, model = im.half(), model.half() # to FP16 shape = tuple((y[0] if isinstance(y, tuple) else y).shape) # model output shape metadata = {'stride': int(max(model.stride)), 'names': model.names} # model metadata LOGGER.info(f"\n{colorstr('PyTorch:')} starting from {file} with output shape {shape} ({file_size(file):.1f} MB)") # Exports f = [''] * len(fmts) # exported filenames warnings.filterwarnings(action='ignore', category=torch.jit.TracerWarning) # suppress TracerWarning if jit: # TorchScript f[0], _ = export_torchscript(model, im, file, optimize) if engine: # TensorRT required before ONNX f[1], _ = export_engine(model, im, file, half, dynamic, simplify, workspace, verbose) if onnx or xml: # OpenVINO requires ONNX f[2], _ = export_onnx(model, im, file, opset, dynamic, simplify) if xml: # OpenVINO f[3], _ = export_openvino(file, metadata, half) if coreml: # CoreML f[4], _ = export_coreml(model, im, file, int8, half) if any((saved_model, pb, tflite, edgetpu, tfjs)): # TensorFlow formats assert not tflite or not tfjs, 'TFLite and TF.js models must be exported separately, please pass only one type.' assert not isinstance(model, ClassificationModel), 'ClassificationModel export to TF formats not yet supported.' 
f[5], s_model = export_saved_model(model.cpu(), im, file, dynamic, tf_nms=nms or agnostic_nms or tfjs, agnostic_nms=agnostic_nms or tfjs, topk_per_class=topk_per_class, topk_all=topk_all, iou_thres=iou_thres, conf_thres=conf_thres, keras=keras) if pb or tfjs: # pb prerequisite to tfjs f[6], _ = export_pb(s_model, file) if tflite or edgetpu: f[7], _ = export_tflite(s_model, im, file, int8 or edgetpu, data=data, nms=nms, agnostic_nms=agnostic_nms) if edgetpu: f[8], _ = export_edgetpu(file) add_tflite_metadata(f[8] or f[7], metadata, num_outputs=len(s_model.outputs)) if tfjs: f[9], _ = export_tfjs(file, int8) if paddle: # PaddlePaddle f[10], _ = export_paddle(model, im, file, metadata) # Finish f = [str(x) for x in f if x] # filter out '' and None if any(f): cls, det, seg = (isinstance(model, x) for x in (ClassificationModel, DetectionModel, SegmentationModel)) # type det &= not seg # segmentation models inherit from SegmentationModel(DetectionModel) dir = Path('segment' if seg else 'classify' if cls else '') h = '--half' if half else '' # --half FP16 inference arg s = '# WARNING ⚠️ ClassificationModel not yet supported for PyTorch Hub AutoShape inference' if cls else \ '# WARNING ⚠️ SegmentationModel not yet supported for PyTorch Hub AutoShape inference' if seg else '' LOGGER.info(f'\nExport complete ({time.time() - t:.1f}s)' f"\nResults saved to {colorstr('bold', file.parent.resolve())}" f"\nDetect: python {dir / ('detect.py' if det else 'predict.py')} --weights {f[-1]} {h}" f"\nValidate: python {dir / 'val.py'} --weights {f[-1]} {h}" f"\nPyTorch Hub: model = torch.hub.load('ultralytics/yolov5', 'custom', '{f[-1]}') {s}" f'\nVisualize: https://netron.app') return f # return list of exported files/dirs def parse_opt(known=False): parser = argparse.ArgumentParser() parser.add_argument('--data', type=str, default=ROOT / 'data/coco128.yaml', help='dataset.yaml path') parser.add_argument('--weights', nargs='+', type=str, default=ROOT / 'yolov5s.pt', help='model.pt 
path(s)') parser.add_argument('--imgsz', '--img', '--img-size', nargs='+', type=int, default=[640, 640], help='image (h, w)') parser.add_argument('--batch-size', type=int, default=1, help='batch size') parser.add_argument('--device', default='cpu', help='cuda device, i.e. 0 or 0,1,2,3 or cpu') parser.add_argument('--half', action='store_true', help='FP16 half-precision export') parser.add_argument('--inplace', action='store_true', help='set YOLOv5 Detect() inplace=True') parser.add_argument('--keras', action='store_true', help='TF: use Keras') parser.add_argument('--optimize', action='store_true', help='TorchScript: optimize for mobile') parser.add_argument('--int8', action='store_true', help='CoreML/TF INT8 quantization') parser.add_argument('--dynamic', action='store_true', help='ONNX/TF/TensorRT: dynamic axes') parser.add_argument('--simplify', action='store_true', help='ONNX: simplify model') parser.add_argument('--opset', type=int, default=17, help='ONNX: opset version') parser.add_argument('--verbose', action='store_true', help='TensorRT: verbose log') parser.add_argument('--workspace', type=int, default=4, help='TensorRT: workspace size (GB)') parser.add_argument('--nms', action='store_true', help='TF: add NMS to model') parser.add_argument('--agnostic-nms', action='store_true', help='TF: add agnostic NMS to model') parser.add_argument('--topk-per-class', type=int, default=100, help='TF.js NMS: topk per class to keep') parser.add_argument('--topk-all', type=int, default=100, help='TF.js NMS: topk for all classes to keep') parser.add_argument('--iou-thres', type=float, default=0.45, help='TF.js NMS: IoU threshold') parser.add_argument('--conf-thres', type=float, default=0.25, help='TF.js NMS: confidence threshold') parser.add_argument( '--include', nargs='+', default=['torchscript'], help='torchscript, onnx, openvino, engine, coreml, saved_model, pb, tflite, edgetpu, tfjs, paddle') opt = parser.parse_known_args()[0] if known else parser.parse_args() 
def _create(name, pretrained=True, channels=3, classes=80, autoshape=True, verbose=True, device=None):
    """Creates or loads a YOLOv5 model

    Arguments:
        name (str): model name 'yolov5s' or path 'path/to/best.pt'
        pretrained (bool): load pretrained weights into the model
        channels (int): number of input channels
        classes (int): number of model classes
        autoshape (bool): apply YOLOv5 .autoshape() wrapper to model
        verbose (bool): print all information to screen
        device (str, torch.device, None): device to use for model parameters

    Returns:
        YOLOv5 model

    Raises:
        Exception: any failure while loading or building the model is re-raised with a
            hint about stale hub caches appended to the original message
    """
    from pathlib import Path

    from models.common import AutoShape, DetectMultiBackend
    from models.experimental import attempt_load
    from models.yolo import ClassificationModel, DetectionModel, SegmentationModel
    from utils.downloads import attempt_download
    from utils.general import LOGGER, check_requirements, intersect_dicts, logging
    from utils.torch_utils import select_device

    previous_level = LOGGER.level
    if not verbose:
        LOGGER.setLevel(logging.WARNING)
    try:
        check_requirements(exclude=('opencv-python', 'tensorboard', 'thop'))
        name = Path(name)
        path = name.with_suffix('.pt') if name.suffix == '' and not name.is_dir() else name  # checkpoint path
        try:
            device = select_device(device)
            if pretrained and channels == 3 and classes == 80:
                try:
                    model = DetectMultiBackend(path, device=device, fuse=autoshape)  # detection model
                    if autoshape:
                        if model.pt and isinstance(model.model, ClassificationModel):
                            LOGGER.warning('WARNING ⚠️ YOLOv5 ClassificationModel is not yet AutoShape compatible. '
                                           'You must pass torch tensors in BCHW to this model, i.e. shape(1,3,224,224).')
                        elif model.pt and isinstance(model.model, SegmentationModel):
                            LOGGER.warning('WARNING ⚠️ YOLOv5 SegmentationModel is not yet AutoShape compatible. '
                                           'You will not be able to run inference with this model.')
                        else:
                            model = AutoShape(model)  # for file/URI/PIL/cv2/np inputs and NMS
                except Exception:
                    model = attempt_load(path, device=device, fuse=False)  # arbitrary model
            else:
                models_dir = Path(__file__).parent / 'models'
                cfg = next(models_dir.rglob(f'{path.stem}.yaml'), None)  # model.yaml path
                if cfg is None:
                    raise FileNotFoundError(f'no model config {path.stem}.yaml found under {models_dir}')
                model = DetectionModel(cfg, channels, classes)  # create model
                if pretrained:
                    # NOTE(review): torch.load unpickles the checkpoint; only load weights from trusted sources
                    ckpt = torch.load(attempt_download(path), map_location=device)  # load
                    csd = ckpt['model'].float().state_dict()  # checkpoint state_dict as FP32
                    csd = intersect_dicts(csd, model.state_dict(), exclude=['anchors'])  # intersect
                    model.load_state_dict(csd, strict=False)  # load
                    if len(ckpt['model'].names) == classes:
                        model.names = ckpt['model'].names  # set class names attribute
            return model.to(device)

        except Exception as e:
            help_url = 'https://github.com/ultralytics/yolov5/issues/36'
            s = f'{e}. Cache may be out of date, try `force_reload=True` or see {help_url} for help.'
            raise Exception(s) from e
    finally:
        # Restore the caller's log level on success and on failure alike, so a failed
        # quiet load does not leave the shared logger muted.
        if not verbose:
            LOGGER.setLevel(previous_level)
def yolov5l6(pretrained=True, channels=3, classes=80, autoshape=True, _verbose=True, device=None):
    """Return the YOLOv5-large-P6 model built by ``_create`` https://github.com/ultralytics/yolov5"""
    return _create('yolov5l6',
                   pretrained=pretrained,
                   channels=channels,
                   classes=classes,
                   autoshape=autoshape,
                   verbose=_verbose,
                   device=device)


def yolov5x6(pretrained=True, channels=3, classes=80, autoshape=True, _verbose=True, device=None):
    """Return the YOLOv5-xlarge-P6 model built by ``_create`` https://github.com/ultralytics/yolov5"""
    return _create('yolov5x6',
                   pretrained=pretrained,
                   channels=channels,
                   classes=classes,
                   autoshape=autoshape,
                   verbose=_verbose,
                   device=device)
def autopad(k, p=None, d=1):  # kernel, padding, dilation
    """Return the padding that gives 'same'-shaped outputs for kernel ``k`` and dilation ``d``.

    An explicit ``p`` is returned unchanged. ``k`` may be an int or a sequence of ints;
    for a sequence a list with one padding per entry is returned.
    """
    if d > 1:
        # effective kernel size once dilation is applied
        if isinstance(k, int):
            k = d * (k - 1) + 1
        else:
            k = [d * (x - 1) + 1 for x in k]
    if p is not None:
        return p
    if isinstance(k, int):
        return k // 2
    return [x // 2 for x in k]


class Conv(nn.Module):
    """Standard convolution: Conv2d (no bias) -> BatchNorm2d -> activation.

    Args are (ch_in, ch_out, kernel, stride, padding, groups, dilation, activation).
    ``act=True`` selects ``default_act``, an ``nn.Module`` is used as given, and anything
    else disables the activation.
    """
    default_act = nn.SiLU()  # default activation

    def __init__(self, c1, c2, k=1, s=1, p=None, g=1, d=1, act=True):
        super().__init__()
        padding = autopad(k, p, d)
        self.conv = nn.Conv2d(c1, c2, k, s, padding, groups=g, dilation=d, bias=False)
        self.bn = nn.BatchNorm2d(c2)
        if act is True:
            self.act = self.default_act
        elif isinstance(act, nn.Module):
            self.act = act
        else:
            self.act = nn.Identity()

    def forward(self, x):
        """Apply convolution, batch norm and activation."""
        y = self.conv(x)
        y = self.bn(y)
        return self.act(y)

    def forward_fuse(self, x):
        """Apply convolution and activation, skipping the batch norm layer."""
        y = self.conv(x)
        return self.act(y)


class DWConv(Conv):
    """Depth-wise convolution: a ``Conv`` whose group count is gcd(c1, c2)."""

    def __init__(self, c1, c2, k=1, s=1, d=1, act=True):  # ch_in, ch_out, kernel, stride, dilation, activation
        groups = math.gcd(c1, c2)
        super().__init__(c1, c2, k, s, g=groups, d=d, act=act)
class TransformerLayer(nn.Module):
    """Single transformer layer without LayerNorm (https://arxiv.org/abs/2010.11929).

    Bias-free linear projections feed query/key/value into multi-head
    self-attention; a two-layer bias-free linear block follows. Both stages
    are wrapped in residual connections.
    """

    def __init__(self, c, num_heads):
        """Build the layer for embedding size `c` with `num_heads` attention heads."""
        super().__init__()

        def square_linear():
            # Every projection in this layer is c -> c without bias.
            return nn.Linear(c, c, bias=False)

        # Creation order and attribute names are kept so parameter
        # initialisation and state_dict keys stay the same.
        self.q = square_linear()
        self.k = square_linear()
        self.v = square_linear()
        self.ma = nn.MultiheadAttention(embed_dim=c, num_heads=num_heads)
        self.fc1 = square_linear()
        self.fc2 = square_linear()

    def forward(self, x):
        """Apply attention and the linear block, each with a residual add."""
        queries, keys, values = self.q(x), self.k(x), self.v(x)
        attn_out = self.ma(queries, keys, values)[0]  # [0] = attention output
        x = attn_out + x
        hidden = self.fc1(x)
        return self.fc2(hidden) + x
class C3(nn.Module):
    """CSP bottleneck built from three Conv blocks.

    The input is projected twice by 1x1 convs; one projection runs through a
    stack of `n` Bottleneck blocks, the other is left untouched. Both are
    concatenated on the channel axis and fused by a final 1x1 conv.
    """

    def __init__(self, c1, c2, n=1, shortcut=True, g=1, e=0.5):
        """Args: ch_in, ch_out, number of bottlenecks, shortcut, groups, expansion."""
        super().__init__()
        hidden = int(c2 * e)  # hidden channels
        self.cv1 = Conv(c1, hidden, 1, 1)
        self.cv2 = Conv(c1, hidden, 1, 1)
        self.cv3 = Conv(2 * hidden, c2, 1)  # optional act=FReLU(c2)
        blocks = [Bottleneck(hidden, hidden, shortcut, g, e=1.0) for _ in range(n)]
        self.m = nn.Sequential(*blocks)

    def forward(self, x):
        """Fuse the bottleneck branch with the plain projection branch."""
        deep_branch = self.m(self.cv1(x))
        skip_branch = self.cv2(x)
        merged = torch.cat((deep_branch, skip_branch), 1)
        return self.cv3(merged)
class SPPF(nn.Module):
    """Spatial Pyramid Pooling - Fast (SPPF) layer for YOLOv5 by Glenn Jocher.

    One max-pool is applied three times in sequence; the conv output and the
    three pooled maps are concatenated on the channel axis and fused by a
    1x1 conv.
    """

    def __init__(self, c1, c2, k=5):  # equivalent to SPP(k=(5, 9, 13))
        """Args: ch_in, ch_out, pooling kernel size."""
        super().__init__()
        hidden = c1 // 2  # hidden channels
        self.cv1 = Conv(c1, hidden, 1, 1)
        self.cv2 = Conv(hidden * 4, c2, 1, 1)
        self.m = nn.MaxPool2d(kernel_size=k, stride=1, padding=k // 2)

    def forward(self, x):
        """Return the fused pyramid of the reduced input and three chained poolings."""
        pyramid = [self.cv1(x)]
        with warnings.catch_warnings():
            warnings.simplefilter('ignore')  # suppress torch 1.9.0 max_pool2d() warning
            for _ in range(3):
                # Each level pools the previous level.
                pyramid.append(self.m(pyramid[-1]))
            return self.cv2(torch.cat(pyramid, 1))
class Contract(nn.Module):
    """Contract width-height into channels, i.e. x(1,64,80,80) to x(1,256,40,40)."""

    def __init__(self, gain=2):
        """Store the spatial reduction factor `gain`."""
        super().__init__()
        self.gain = gain

    def forward(self, x):
        """Move each gain x gain spatial patch of `x` (b,c,h,w) into the channel axis."""
        batch, channels, height, width = x.shape
        g = self.gain
        rows, cols = height // g, width // g
        # Split both spatial axes into (coarse, within-patch) pairs.
        patches = x.view(batch, channels, rows, g, cols, g)  # x(1,64,40,2,40,2)
        # Bring the within-patch offsets in front of the channel axis.
        patches = patches.permute(0, 3, 5, 1, 2, 4).contiguous()  # x(1,2,2,64,40,40)
        return patches.view(batch, channels * g * g, rows, cols)  # x(1,256,40,40)
class DetectMultiBackend(nn.Module):
    """YOLOv5 MultiBackend class for python inference on various backends.

    The backend is chosen from the weights path (see `_model_type`). All
    backend handles created in `__init__` are local variables that are copied
    onto the instance in one go at the end of `__init__`, which is why
    `forward` reads attributes such as `self.session` or `self.interpreter`
    that are never assigned explicitly.
    """

    def __init__(self, weights='yolov5s.pt', device=torch.device('cpu'), dnn=False, data=None, fp16=False, fuse=True):
        """Load a model for the backend implied by `weights`.

        Usage:
          PyTorch:              weights = *.pt
          TorchScript:                    *.torchscript
          ONNX Runtime:                   *.onnx
          ONNX OpenCV DNN:                *.onnx --dnn
          OpenVINO:                       *_openvino_model
          CoreML:                         *.mlmodel
          TensorRT:                       *.engine
          TensorFlow SavedModel:          *_saved_model
          TensorFlow GraphDef:            *.pb
          TensorFlow Lite:                *.tflite
          TensorFlow Edge TPU:            *_edgetpu.tflite
          PaddlePaddle:                   *_paddle_model

        Args:
            weights: Weights path/URL, or a list of them (the first entry
                decides the backend; the full list is only passed on for
                PyTorch weights).
            device: torch.device used for inference.
            dnn: Use OpenCV DNN instead of ONNX Runtime for *.onnx weights.
            data: Optional dataset YAML providing 'names' when the model
                carries no class-name metadata.
            fp16: Request half precision; only honoured for PyTorch,
                TorchScript, ONNX and TensorRT backends.
            fuse: Forwarded to `attempt_load` for PyTorch weights.

        Raises:
            NotImplementedError: For TF.js weights or an unrecognised format.
        """
        from models.experimental import attempt_download, attempt_load  # scoped to avoid circular import

        super().__init__()
        # NOTE: every local defined below ends up as an instance attribute via
        # `self.__dict__.update(locals())`, so local names here are part of
        # the class's behaviour and must not be renamed casually.
        w = str(weights[0] if isinstance(weights, list) else weights)
        pt, jit, onnx, xml, engine, coreml, saved_model, pb, tflite, edgetpu, tfjs, paddle, triton = self._model_type(w)
        fp16 &= pt or jit or onnx or engine  # FP16
        nhwc = coreml or saved_model or pb or tflite or edgetpu  # BHWC formats (vs torch BCWH)
        stride = 32  # default stride
        cuda = torch.cuda.is_available() and device.type != 'cpu'  # use CUDA
        if not (pt or triton):
            w = attempt_download(w)  # download if not local

        if pt:  # PyTorch
            model = attempt_load(weights if isinstance(weights, list) else w, device=device, inplace=True, fuse=fuse)
            stride = max(int(model.stride.max()), 32)  # model stride
            names = model.module.names if hasattr(model, 'module') else model.names  # get class names
            model.half() if fp16 else model.float()
            self.model = model  # explicitly assign for to(), cpu(), cuda(), half()
        elif jit:  # TorchScript
            LOGGER.info(f'Loading {w} for TorchScript inference...')
            extra_files = {'config.txt': ''}  # model metadata
            model = torch.jit.load(w, _extra_files=extra_files, map_location=device)
            model.half() if fp16 else model.float()
            if extra_files['config.txt']:  # load metadata dict
                d = json.loads(extra_files['config.txt'],
                               object_hook=lambda d: {int(k) if k.isdigit() else k: v
                                                      for k, v in d.items()})
                stride, names = int(d['stride']), d['names']
        elif dnn:  # ONNX OpenCV DNN
            LOGGER.info(f'Loading {w} for ONNX OpenCV DNN inference...')
            check_requirements('opencv-python>=4.5.4')
            net = cv2.dnn.readNetFromONNX(w)
        elif onnx:  # ONNX Runtime
            LOGGER.info(f'Loading {w} for ONNX Runtime inference...')
            check_requirements(('onnx', 'onnxruntime-gpu' if cuda else 'onnxruntime'))
            import onnxruntime
            providers = ['CUDAExecutionProvider', 'CPUExecutionProvider'] if cuda else ['CPUExecutionProvider']
            session = onnxruntime.InferenceSession(w, providers=providers)
            output_names = [x.name for x in session.get_outputs()]
            meta = session.get_modelmeta().custom_metadata_map  # metadata
            if 'stride' in meta:
                # The metadata string comes from the model file, so parse it as
                # a literal instead of executing it (matches the TFLite branch).
                stride, names = int(meta['stride']), ast.literal_eval(meta['names'])
        elif xml:  # OpenVINO
            LOGGER.info(f'Loading {w} for OpenVINO inference...')
            check_requirements('openvino')  # requires openvino-dev: https://pypi.org/project/openvino-dev/
            from openvino.runtime import Core, Layout, get_batch
            ie = Core()
            if not Path(w).is_file():  # if not *.xml
                w = next(Path(w).glob('*.xml'))  # get *.xml file from *_openvino_model dir
            network = ie.read_model(model=w, weights=Path(w).with_suffix('.bin'))
            if network.get_parameters()[0].get_layout().empty:
                network.get_parameters()[0].set_layout(Layout('NCHW'))
            batch_dim = get_batch(network)
            if batch_dim.is_static:
                batch_size = batch_dim.get_length()
            executable_network = ie.compile_model(network, device_name='CPU')  # device_name="MYRIAD" for Intel NCS2
            # `_load_metadata` returns (None, None) when the YAML is missing;
            # keep the defaults in that case so the class-name fallback below
            # still applies instead of failing on `names[0]`.
            meta = self._load_metadata(Path(w).with_suffix('.yaml'))  # load metadata
            if meta[0] is not None:
                stride, names = meta
        elif engine:  # TensorRT
            LOGGER.info(f'Loading {w} for TensorRT inference...')
            import tensorrt as trt  # https://developer.nvidia.com/nvidia-tensorrt-download
            check_version(trt.__version__, '7.0.0', hard=True)  # require tensorrt>=7.0.0
            if device.type == 'cpu':
                device = torch.device('cuda:0')
            Binding = namedtuple('Binding', ('name', 'dtype', 'shape', 'data', 'ptr'))
            logger = trt.Logger(trt.Logger.INFO)
            with open(w, 'rb') as f, trt.Runtime(logger) as runtime:
                model = runtime.deserialize_cuda_engine(f.read())
            context = model.create_execution_context()
            bindings = OrderedDict()
            output_names = []
            fp16 = False  # default updated below
            dynamic = False
            for i in range(model.num_bindings):
                name = model.get_binding_name(i)
                dtype = trt.nptype(model.get_binding_dtype(i))
                if model.binding_is_input(i):
                    if -1 in tuple(model.get_binding_shape(i)):  # dynamic
                        dynamic = True
                        context.set_binding_shape(i, tuple(model.get_profile_shape(0, i)[2]))
                    if dtype == np.float16:
                        fp16 = True
                else:  # output
                    output_names.append(name)
                shape = tuple(context.get_binding_shape(i))
                im = torch.from_numpy(np.empty(shape, dtype=dtype)).to(device)
                bindings[name] = Binding(name, dtype, shape, im, int(im.data_ptr()))
            binding_addrs = OrderedDict((n, d.ptr) for n, d in bindings.items())
            batch_size = bindings['images'].shape[0]  # if dynamic, this is instead max batch size
        elif coreml:  # CoreML
            LOGGER.info(f'Loading {w} for CoreML inference...')
            import coremltools as ct
            model = ct.models.MLModel(w)
        elif saved_model:  # TF SavedModel
            LOGGER.info(f'Loading {w} for TensorFlow SavedModel inference...')
            import tensorflow as tf
            keras = False  # assume TF1 saved_model
            model = tf.keras.models.load_model(w) if keras else tf.saved_model.load(w)
        elif pb:  # GraphDef https://www.tensorflow.org/guide/migrate#a_graphpb_or_graphpbtxt
            LOGGER.info(f'Loading {w} for TensorFlow GraphDef inference...')
            import tensorflow as tf

            def wrap_frozen_graph(gd, inputs, outputs):
                x = tf.compat.v1.wrap_function(lambda: tf.compat.v1.import_graph_def(gd, name=''), [])  # wrapped
                ge = x.graph.as_graph_element
                return x.prune(tf.nest.map_structure(ge, inputs), tf.nest.map_structure(ge, outputs))

            def gd_outputs(gd):
                name_list, input_list = [], []
                for node in gd.node:  # tensorflow.core.framework.node_def_pb2.NodeDef
                    name_list.append(node.name)
                    input_list.extend(node.input)
                return sorted(f'{x}:0' for x in list(set(name_list) - set(input_list)) if not x.startswith('NoOp'))

            gd = tf.Graph().as_graph_def()  # TF GraphDef
            with open(w, 'rb') as f:
                gd.ParseFromString(f.read())
            frozen_func = wrap_frozen_graph(gd, inputs='x:0', outputs=gd_outputs(gd))
        elif tflite or edgetpu:  # https://www.tensorflow.org/lite/guide/python#install_tensorflow_lite_for_python
            try:  # https://coral.ai/docs/edgetpu/tflite-python/#update-existing-tf-lite-code-for-the-edge-tpu
                from tflite_runtime.interpreter import Interpreter, load_delegate
            except ImportError:
                import tensorflow as tf
                Interpreter, load_delegate = tf.lite.Interpreter, tf.lite.experimental.load_delegate,
            if edgetpu:  # TF Edge TPU https://coral.ai/software/#edgetpu-runtime
                LOGGER.info(f'Loading {w} for TensorFlow Lite Edge TPU inference...')
                delegate = {
                    'Linux': 'libedgetpu.so.1',
                    'Darwin': 'libedgetpu.1.dylib',
                    'Windows': 'edgetpu.dll'}[platform.system()]
                interpreter = Interpreter(model_path=w, experimental_delegates=[load_delegate(delegate)])
            else:  # TFLite
                LOGGER.info(f'Loading {w} for TensorFlow Lite inference...')
                interpreter = Interpreter(model_path=w)  # load TFLite model
            interpreter.allocate_tensors()  # allocate
            input_details = interpreter.get_input_details()  # inputs
            output_details = interpreter.get_output_details()  # outputs
            # load metadata
            with contextlib.suppress(zipfile.BadZipFile):
                with zipfile.ZipFile(w, 'r') as model:
                    meta_file = model.namelist()[0]
                    meta = ast.literal_eval(model.read(meta_file).decode('utf-8'))
                    stride, names = int(meta['stride']), meta['names']
        elif tfjs:  # TF.js
            raise NotImplementedError('ERROR: YOLOv5 TF.js inference is not supported')
        elif paddle:  # PaddlePaddle
            LOGGER.info(f'Loading {w} for PaddlePaddle inference...')
            check_requirements('paddlepaddle-gpu' if cuda else 'paddlepaddle')
            import paddle.inference as pdi
            if not Path(w).is_file():  # if not *.pdmodel
                w = next(Path(w).rglob('*.pdmodel'))  # get *.pdmodel file from *_paddle_model dir
            weights = Path(w).with_suffix('.pdiparams')
            config = pdi.Config(str(w), str(weights))
            if cuda:
                config.enable_use_gpu(memory_pool_init_size_mb=2048, device_id=0)
            predictor = pdi.create_predictor(config)
            input_handle = predictor.get_input_handle(predictor.get_input_names()[0])
            output_names = predictor.get_output_names()
        elif triton:  # NVIDIA Triton Inference Server
            LOGGER.info(f'Using {w} as Triton Inference Server...')
            check_requirements('tritonclient[all]')
            from utils.triton import TritonRemoteModel
            model = TritonRemoteModel(url=w)
            nhwc = model.runtime.startswith('tensorflow')
        else:
            raise NotImplementedError(f'ERROR: {w} is not a supported format')

        # class names
        if 'names' not in locals():
            names = yaml_load(data)['names'] if data else {i: f'class{i}' for i in range(999)}
        if names[0] == 'n01440764' and len(names) == 1000:  # ImageNet
            names = yaml_load(ROOT / 'data/ImageNet.yaml')['names']  # human-readable names

        self.__dict__.update(locals())  # assign all variables to self

    def forward(self, im, augment=False, visualize=False):
        """Run YOLOv5 MultiBackend inference.

        Args:
            im: Input tensor unpacked as (batch, channel, height, width).
            augment: Forwarded to the PyTorch model only.
            visualize: Forwarded to the PyTorch model only.

        Returns:
            A torch tensor, or a list of tensors when the backend yields
            several outputs. numpy outputs are converted with `from_numpy`.
        """
        b, ch, h, w = im.shape  # batch, channel, height, width
        if self.fp16 and im.dtype != torch.float16:
            im = im.half()  # to FP16
        if self.nhwc:
            im = im.permute(0, 2, 3, 1)  # torch BCHW to numpy BHWC shape(1,320,192,3)

        if self.pt:  # PyTorch
            y = self.model(im, augment=augment, visualize=visualize) if augment or visualize else self.model(im)
        elif self.jit:  # TorchScript
            y = self.model(im)
        elif self.dnn:  # ONNX OpenCV DNN
            im = im.cpu().numpy()  # torch to numpy
            self.net.setInput(im)
            y = self.net.forward()
        elif self.onnx:  # ONNX Runtime
            im = im.cpu().numpy()  # torch to numpy
            y = self.session.run(self.output_names, {self.session.get_inputs()[0].name: im})
        elif self.xml:  # OpenVINO
            im = im.cpu().numpy()  # FP32
            y = list(self.executable_network([im]).values())
        elif self.engine:  # TensorRT
            if self.dynamic and im.shape != self.bindings['images'].shape:
                i = self.model.get_binding_index('images')
                self.context.set_binding_shape(i, im.shape)  # reshape if dynamic
                self.bindings['images'] = self.bindings['images']._replace(shape=im.shape)
                for name in self.output_names:
                    i = self.model.get_binding_index(name)
                    self.bindings[name].data.resize_(tuple(self.context.get_binding_shape(i)))
            s = self.bindings['images'].shape
            assert im.shape == s, f"input size {im.shape} {'>' if self.dynamic else 'not equal to'} max model size {s}"
            self.binding_addrs['images'] = int(im.data_ptr())
            self.context.execute_v2(list(self.binding_addrs.values()))
            y = [self.bindings[x].data for x in sorted(self.output_names)]
        elif self.coreml:  # CoreML
            im = im.cpu().numpy()
            im = Image.fromarray((im[0] * 255).astype('uint8'))
            # im = im.resize((192, 320), Image.ANTIALIAS)
            y = self.model.predict({'image': im})  # coordinates are xywh normalized
            if 'confidence' in y:
                box = xywh2xyxy(y['coordinates'] * [[w, h, w, h]])  # xyxy pixels
                # `np.float` was only an alias of the builtin `float` and was
                # removed in NumPy 1.24; the builtin gives the same dtype.
                conf, cls = y['confidence'].max(1), y['confidence'].argmax(1).astype(float)
                y = np.concatenate((box, conf.reshape(-1, 1), cls.reshape(-1, 1)), 1)
            else:
                y = list(reversed(y.values()))  # reversed for segmentation models (pred, proto)
        elif self.paddle:  # PaddlePaddle
            im = im.cpu().numpy().astype(np.float32)
            self.input_handle.copy_from_cpu(im)
            self.predictor.run()
            y = [self.predictor.get_output_handle(x).copy_to_cpu() for x in self.output_names]
        elif self.triton:  # NVIDIA Triton Inference Server
            y = self.model(im)
        else:  # TensorFlow (SavedModel, GraphDef, Lite, Edge TPU)
            im = im.cpu().numpy()
            if self.saved_model:  # SavedModel
                y = self.model(im, training=False) if self.keras else self.model(im)
            elif self.pb:  # GraphDef
                y = self.frozen_func(x=self.tf.constant(im))
            else:  # Lite or Edge TPU
                inp = self.input_details[0]  # named `inp` to avoid shadowing the builtin `input`
                int8 = inp['dtype'] == np.uint8  # is TFLite quantized uint8 model
                if int8:
                    scale, zero_point = inp['quantization']
                    im = (im / scale + zero_point).astype(np.uint8)  # de-scale
                self.interpreter.set_tensor(inp['index'], im)
                self.interpreter.invoke()
                y = []
                for output in self.output_details:
                    x = self.interpreter.get_tensor(output['index'])
                    if int8:
                        scale, zero_point = output['quantization']
                        x = (x.astype(np.float32) - zero_point) * scale  # re-scale
                    y.append(x)
            y = [x if isinstance(x, np.ndarray) else x.numpy() for x in y]
            y[0][..., :4] *= [w, h, w, h]  # xywh normalized to pixels

        if isinstance(y, (list, tuple)):
            return self.from_numpy(y[0]) if len(y) == 1 else [self.from_numpy(x) for x in y]
        else:
            return self.from_numpy(y)

    def from_numpy(self, x):
        """Convert a numpy array to a tensor on `self.device`; return anything else unchanged."""
        return torch.from_numpy(x).to(self.device) if isinstance(x, np.ndarray) else x

    def warmup(self, imgsz=(1, 3, 640, 640)):
        """Warm the model up by running inference once (twice for TorchScript).

        Only runs for the pt/jit/onnx/engine/saved_model/pb/triton backends,
        and only when the device is not the CPU or the backend is Triton.
        """
        warmup_types = self.pt, self.jit, self.onnx, self.engine, self.saved_model, self.pb, self.triton
        if any(warmup_types) and (self.device.type != 'cpu' or self.triton):
            im = torch.empty(*imgsz, dtype=torch.half if self.fp16 else torch.float, device=self.device)  # input
            for _ in range(2 if self.jit else 1):
                self.forward(im)  # warmup

    @staticmethod
    def _model_type(p='path/to/model.pt'):
        """Return model type flags from a model path, i.e. path='path/to/model.onnx' -> type=onnx.

        The result is one boolean per export suffix, in the order
        [pt, jit, onnx, xml, engine, coreml, saved_model, pb, tflite, edgetpu,
        tfjs, paddle], followed by a `triton` flag that is set when no suffix
        matches and `p` parses as an http/grpc URL with a network location.
        """
        from export import export_formats
        from utils.downloads import is_url
        sf = list(export_formats().Suffix)  # export suffixes
        if not is_url(p, check=False):
            check_suffix(p, sf)  # checks
        url = urlparse(p)  # if url may be Triton inference server
        types = [s in Path(p).name for s in sf]
        types[8] &= not types[9]  # tflite &= not edgetpu
        triton = not any(types) and all([any(s in url.scheme for s in ['http', 'grpc']), url.netloc])
        return types + [triton]

    @staticmethod
    def _load_metadata(f=Path('path/to/meta.yaml')):
        """Load (stride, names) from a metadata YAML; return (None, None) if the file does not exist."""
        if f.exists():
            d = yaml_load(f)
            return d['stride'], d['names']  # assign stride, names
        return None, None
def _apply(self, fn):
    # Apply to(), cpu(), cuda(), half() to model tensors that are not parameters or registered buffers.
    # The final layer (Detect()) keeps stride/grid/anchor_grid as plain attributes, so `fn` has to be
    # applied to them by hand after the regular nn.Module handling.
    self = super()._apply(fn)
    if not self.pt:
        return self

    # A DetectMultiBackend wrapper adds one more `.model` level around the network.
    network = self.model.model if self.dmb else self.model
    detect = network.model[-1]  # Detect()
    detect.stride = fn(detect.stride)
    detect.grid = [fn(g) for g in detect.grid]
    if isinstance(detect.anchor_grid, list):
        detect.anchor_grid = [fn(a) for a in detect.anchor_grid]
    return self
# list of images dt = (Profile(), Profile(), Profile()) with dt[0]: if isinstance(size, int): # expand size = (size, size) p = next(self.model.parameters()) if self.pt else torch.empty(1, device=self.model.device) # param autocast = self.amp and (p.device.type != 'cpu') # Automatic Mixed Precision (AMP) inference if isinstance(ims, torch.Tensor): # torch with amp.autocast(autocast): return self.model(ims.to(p.device).type_as(p), augment=augment) # inference # Pre-process n, ims = (len(ims), list(ims)) if isinstance(ims, (list, tuple)) else (1, [ims]) # number, list of images shape0, shape1, files = [], [], [] # image and inference shapes, filenames for i, im in enumerate(ims): f = f'image{i}' # filename if isinstance(im, (str, Path)): # filename or uri im, f = Image.open(requests.get(im, stream=True).raw if str(im).startswith('http') else im), im im = np.asarray(exif_transpose(im)) elif isinstance(im, Image.Image): # PIL Image im, f = np.asarray(exif_transpose(im)), getattr(im, 'filename', f) or f files.append(Path(f).with_suffix('.jpg').name) if im.shape[0] < 5: # image in CHW im = im.transpose((1, 2, 0)) # reverse dataloader .transpose(2, 0, 1) im = im[..., :3] if im.ndim == 3 else cv2.cvtColor(im, cv2.COLOR_GRAY2BGR) # enforce 3ch input s = im.shape[:2] # HWC shape0.append(s) # image shape g = max(size) / max(s) # gain shape1.append([int(y * g) for y in s]) ims[i] = im if im.data.contiguous else np.ascontiguousarray(im) # update shape1 = [make_divisible(x, self.stride) for x in np.array(shape1).max(0)] # inf shape x = [letterbox(im, shape1, auto=False)[0] for im in ims] # pad x = np.ascontiguousarray(np.array(x).transpose((0, 3, 1, 2))) # stack and BHWC to BCHW x = torch.from_numpy(x).to(p.device).type_as(p) / 255 # uint8 to fp16/32 with amp.autocast(autocast): # Inference with dt[1]: y = self.model(x, augment=augment) # forward # Post-process with dt[2]: y = non_max_suppression(y if self.dmb else y[0], self.conf, self.iou, self.classes, self.agnostic, 
class Detections:
    """YOLOv5 detections class for inference results.

    Holds the input images, the per-image prediction tensors and derived box
    formats (xyxy / xywh, in pixels and normalized), plus helpers to print,
    show, save, crop, render and export the results.
    """

    def __init__(self, ims, pred, files, times=(0, 0, 0), names=None, shape=None):
        """Store predictions and derive the alternative box formats.

        Args:
            ims: List of images as numpy arrays.
            pred: List of tensors, pred[0] = (xyxy, conf, cls).
            files: Image filenames, one per image.
            times: Three profiling entries (pre-process, inference, NMS).
                Each entry is either an object exposing the elapsed seconds
                as `.t` or a plain number of seconds.
            names: Class names, indexed by class id.
            shape: Inference BCHW shape; stored as a tuple (empty if None).
        """
        super().__init__()
        d = pred[0].device  # device
        gn = [torch.tensor([*(im.shape[i] for i in [1, 0, 1, 0]), 1, 1], device=d) for im in ims]  # normalizations
        self.ims = ims  # list of images as numpy arrays
        self.pred = pred  # list of tensors pred[0] = (xyxy, conf, cls)
        self.names = names  # class names
        self.files = files  # image filenames
        self.times = times  # profiling times
        self.xyxy = pred  # xyxy pixels
        self.xywh = [xyxy2xywh(x) for x in pred]  # xywh pixels
        self.xyxyn = [x / g for x, g in zip(self.xyxy, gn)]  # xyxy normalized
        self.xywhn = [x / g for x, g in zip(self.xywh, gn)]  # xywh normalized
        self.n = len(self.pred)  # number of images (batch size)
        # The default `times=(0, 0, 0)` holds plain numbers without a `.t`
        # attribute, so fall back to the value itself in that case.
        self.t = tuple(getattr(x, 't', x) / self.n * 1E3 for x in times)  # timestamps (ms)
        # `shape` defaults to None, which `tuple()` cannot consume.
        self.s = tuple(shape) if shape is not None else ()  # inference BCHW shape

    def _run(self, pprint=False, show=False, save=False, crop=False, render=False, labels=True, save_dir=Path('')):
        """Shared worker behind print/show/save/crop/render.

        Returns the summary string when `pprint` is set, the list of crop
        dicts when `crop` is set, otherwise None.
        """
        s, crops = '', []
        for i, (im, pred) in enumerate(zip(self.ims, self.pred)):
            s += f'\nimage {i + 1}/{len(self.pred)}: {im.shape[0]}x{im.shape[1]} '  # string
            if pred.shape[0]:
                for c in pred[:, -1].unique():
                    n = (pred[:, -1] == c).sum()  # detections per class
                    s += f"{n} {self.names[int(c)]}{'s' * (n > 1)}, "  # add to string
                s = s.rstrip(', ')
                if show or save or render or crop:
                    annotator = Annotator(im, example=str(self.names))
                    for *box, conf, cls in reversed(pred):  # xyxy, confidence, class
                        label = f'{self.names[int(cls)]} {conf:.2f}'
                        if crop:
                            file = save_dir / 'crops' / self.names[int(cls)] / self.files[i] if save else None
                            crops.append({
                                'box': box,
                                'conf': conf,
                                'cls': cls,
                                'label': label,
                                'im': save_one_box(box, im, file=file, save=save)})
                        else:  # all others
                            annotator.box_label(box, label if labels else '', color=colors(cls))
                    im = annotator.im
            else:
                s += '(no detections)'

            im = Image.fromarray(im.astype(np.uint8)) if isinstance(im, np.ndarray) else im  # from np
            if show:
                display(im) if is_notebook() else im.show(self.files[i])
            if save:
                f = self.files[i]
                im.save(save_dir / f)  # save
                if i == self.n - 1:
                    LOGGER.info(f"Saved {self.n} image{'s' * (self.n > 1)} to {colorstr('bold', save_dir)}")
            if render:
                self.ims[i] = np.asarray(im)
        if pprint:
            s = s.lstrip('\n')
            return f'{s}\nSpeed: %.1fms pre-process, %.1fms inference, %.1fms NMS per image at shape {self.s}' % self.t
        if crop:
            if save:
                LOGGER.info(f'Saved results to {save_dir}\n')
            return crops

    @TryExcept('Showing images is not supported in this environment')
    def show(self, labels=True):
        """Display the annotated images."""
        self._run(show=True, labels=labels)  # show results

    def save(self, labels=True, save_dir='runs/detect/exp', exist_ok=False):
        """Save the annotated images under an incremented `save_dir`."""
        save_dir = increment_path(save_dir, exist_ok, mkdir=True)  # increment save_dir
        self._run(save=True, labels=labels, save_dir=save_dir)  # save results

    def crop(self, save=True, save_dir='runs/detect/exp', exist_ok=False):
        """Return one crop dict per detection, optionally saving the crops to disk."""
        save_dir = increment_path(save_dir, exist_ok, mkdir=True) if save else None
        return self._run(crop=True, save=save, save_dir=save_dir)  # crop results

    def render(self, labels=True):
        """Draw the detections onto `self.ims` in place and return the list."""
        self._run(render=True, labels=labels)  # render results
        return self.ims

    def pandas(self):
        """Return a copy whose box attributes are pandas DataFrames, i.e. print(results.pandas().xyxy[0])."""
        new = copy(self)  # return copy
        ca = 'xmin', 'ymin', 'xmax', 'ymax', 'confidence', 'class', 'name'  # xyxy columns
        cb = 'xcenter', 'ycenter', 'width', 'height', 'confidence', 'class', 'name'  # xywh columns
        for k, c in zip(['xyxy', 'xyxyn', 'xywh', 'xywhn'], [ca, ca, cb, cb]):
            a = [[x[:5] + [int(x[5]), self.names[int(x[5])]] for x in x.tolist()] for x in getattr(self, k)]  # update
            setattr(new, k, [pd.DataFrame(x, columns=c) for x in a])
        return new

    def tolist(self):
        """Return a list of single-image Detections objects, i.e. 'for result in results.tolist():'."""
        r = range(self.n)  # iterable
        return [Detections([self.ims[i]], [self.pred[i]], [self.files[i]], self.times, self.names, self.s) for i in r]

    def print(self):
        """Log the results summary."""
        LOGGER.info(self.__str__())

    def __len__(self):  # override len(results)
        return self.n

    def __str__(self):  # override print(results)
        return self._run(pprint=True)  # print results

    def __repr__(self):
        return f'YOLOv5 {self.__class__} instance\n' + self.__str__()
class Sum(nn.Module):
    """Weighted sum of 2 or more layers https://arxiv.org/abs/1911.09070."""

    def __init__(self, n, weight=False):  # n: number of inputs
        """Prepare a sum over `n` inputs; with `weight` the extra inputs get learnable gains."""
        super().__init__()
        self.weight = weight  # apply weights boolean
        self.iter = range(n - 1)  # indices of the inputs added onto the first one
        if weight:
            self.w = nn.Parameter(-torch.arange(1.0, n) / 2, requires_grad=True)  # layer weights

    def forward(self, x):
        """Add x[1:] onto x[0], scaling each by 2 * sigmoid(w[i]) when weighting is enabled."""
        total = x[0]  # the first input is never weighted
        if not self.weight:
            for idx in self.iter:
                total = total + x[idx + 1]
            return total

        gains = torch.sigmoid(self.w) * 2
        for idx in self.iter:
            total = total + x[idx + 1] * gains[idx]
        return total
# solve for equal weight indices, ax = b self.m = nn.ModuleList([ nn.Conv2d(c1, int(c_), k, s, k // 2, groups=math.gcd(c1, int(c_)), bias=False) for k, c_ in zip(k, c_)]) self.bn = nn.BatchNorm2d(c2) self.act = nn.SiLU() def forward(self, x): return self.act(self.bn(torch.cat([m(x) for m in self.m], 1))) class Ensemble(nn.ModuleList): # Ensemble of models def __init__(self): super().__init__() def forward(self, x, augment=False, profile=False, visualize=False): y = [module(x, augment, profile, visualize)[0] for module in self] # y = torch.stack(y).max(0)[0] # max ensemble # y = torch.stack(y).mean(0) # mean ensemble y = torch.cat(y, 1) # nms ensemble return y, None # inference, train output def attempt_load(weights, device=None, inplace=True, fuse=True): # Loads an ensemble of models weights=[a,b,c] or a single model weights=[a] or weights=a from models.yolo import Detect, Model model = Ensemble() for w in weights if isinstance(weights, list) else [weights]: ckpt = torch.load(attempt_download(w), map_location='cpu') # load ckpt = (ckpt.get('ema') or ckpt['model']).to(device).float() # FP32 model # Model compatibility updates if not hasattr(ckpt, 'stride'): ckpt.stride = torch.tensor([32.]) if hasattr(ckpt, 'names') and isinstance(ckpt.names, (list, tuple)): ckpt.names = dict(enumerate(ckpt.names)) # convert to dict model.append(ckpt.fuse().eval() if fuse and hasattr(ckpt, 'fuse') else ckpt.eval()) # model in eval mode # Module compatibility updates for m in model.modules(): t = type(m) if t in (nn.Hardswish, nn.LeakyReLU, nn.ReLU, nn.ReLU6, nn.SiLU, Detect, Model): m.inplace = inplace # torch 1.7.0 compatibility if t is Detect and not isinstance(m.anchor_grid, list): delattr(m, 'anchor_grid') setattr(m, 'anchor_grid', [torch.zeros(1)] * m.nl) elif t is nn.Upsample and not hasattr(m, 'recompute_scale_factor'): m.recompute_scale_factor = None # torch 1.11.0 compatibility # Return model if len(model) == 1: return model[-1] # Return detection ensemble print(f'Ensemble 
created with {weights}\n') for k in 'names', 'nc', 'yaml': setattr(model, k, getattr(model[0], k)) model.stride = model[torch.argmax(torch.tensor([m.stride.max() for m in model])).int()].stride # max stride assert all(model[0].nc == m.nc for m in model), f'Models have different class counts: {[m.nc for m in model]}' return model ================================================ FILE: yolo-improve/yolov5-AUX/models/hub/anchors.yaml ================================================ # YOLOv5 🚀 by Ultralytics, GPL-3.0 license # Default anchors for COCO data # P5 ------------------------------------------------------------------------------------------------------------------- # P5-640: anchors_p5_640: - [10,13, 16,30, 33,23] # P3/8 - [30,61, 62,45, 59,119] # P4/16 - [116,90, 156,198, 373,326] # P5/32 # P6 ------------------------------------------------------------------------------------------------------------------- # P6-640: thr=0.25: 0.9964 BPR, 5.54 anchors past thr, n=12, img_size=640, metric_all=0.281/0.716-mean/best, past_thr=0.469-mean: 9,11, 21,19, 17,41, 43,32, 39,70, 86,64, 65,131, 134,130, 120,265, 282,180, 247,354, 512,387 anchors_p6_640: - [9,11, 21,19, 17,41] # P3/8 - [43,32, 39,70, 86,64] # P4/16 - [65,131, 134,130, 120,265] # P5/32 - [282,180, 247,354, 512,387] # P6/64 # P6-1280: thr=0.25: 0.9950 BPR, 5.55 anchors past thr, n=12, img_size=1280, metric_all=0.281/0.714-mean/best, past_thr=0.468-mean: 19,27, 44,40, 38,94, 96,68, 86,152, 180,137, 140,301, 303,264, 238,542, 436,615, 739,380, 925,792 anchors_p6_1280: - [19,27, 44,40, 38,94] # P3/8 - [96,68, 86,152, 180,137] # P4/16 - [140,301, 303,264, 238,542] # P5/32 - [436,615, 739,380, 925,792] # P6/64 # P6-1920: thr=0.25: 0.9950 BPR, 5.55 anchors past thr, n=12, img_size=1920, metric_all=0.281/0.714-mean/best, past_thr=0.468-mean: 28,41, 67,59, 57,141, 144,103, 129,227, 270,205, 209,452, 455,396, 358,812, 653,922, 1109,570, 1387,1187 anchors_p6_1920: - [28,41, 67,59, 57,141] # P3/8 - [144,103, 129,227, 
270,205] # P4/16 - [209,452, 455,396, 358,812] # P5/32 - [653,922, 1109,570, 1387,1187] # P6/64 # P7 ------------------------------------------------------------------------------------------------------------------- # P7-640: thr=0.25: 0.9962 BPR, 6.76 anchors past thr, n=15, img_size=640, metric_all=0.275/0.733-mean/best, past_thr=0.466-mean: 11,11, 13,30, 29,20, 30,46, 61,38, 39,92, 78,80, 146,66, 79,163, 149,150, 321,143, 157,303, 257,402, 359,290, 524,372 anchors_p7_640: - [11,11, 13,30, 29,20] # P3/8 - [30,46, 61,38, 39,92] # P4/16 - [78,80, 146,66, 79,163] # P5/32 - [149,150, 321,143, 157,303] # P6/64 - [257,402, 359,290, 524,372] # P7/128 # P7-1280: thr=0.25: 0.9968 BPR, 6.71 anchors past thr, n=15, img_size=1280, metric_all=0.273/0.732-mean/best, past_thr=0.463-mean: 19,22, 54,36, 32,77, 70,83, 138,71, 75,173, 165,159, 148,334, 375,151, 334,317, 251,626, 499,474, 750,326, 534,814, 1079,818 anchors_p7_1280: - [19,22, 54,36, 32,77] # P3/8 - [70,83, 138,71, 75,173] # P4/16 - [165,159, 148,334, 375,151] # P5/32 - [334,317, 251,626, 499,474] # P6/64 - [750,326, 534,814, 1079,818] # P7/128 # P7-1920: thr=0.25: 0.9968 BPR, 6.71 anchors past thr, n=15, img_size=1920, metric_all=0.273/0.732-mean/best, past_thr=0.463-mean: 29,34, 81,55, 47,115, 105,124, 207,107, 113,259, 247,238, 222,500, 563,227, 501,476, 376,939, 749,711, 1126,489, 801,1222, 1618,1227 anchors_p7_1920: - [29,34, 81,55, 47,115] # P3/8 - [105,124, 207,107, 113,259] # P4/16 - [247,238, 222,500, 563,227] # P5/32 - [501,476, 376,939, 749,711] # P6/64 - [1126,489, 801,1222, 1618,1227] # P7/128 ================================================ FILE: yolo-improve/yolov5-AUX/models/hub/yolov3-spp.yaml ================================================ # YOLOv5 🚀 by Ultralytics, GPL-3.0 license # Parameters nc: 80 # number of classes depth_multiple: 1.0 # model depth multiple width_multiple: 1.0 # layer channel multiple anchors: - [10,13, 16,30, 33,23] # P3/8 - [30,61, 62,45, 59,119] # P4/16 - [116,90, 156,198, 
373,326] # P5/32 # darknet53 backbone backbone: # [from, number, module, args] [[-1, 1, Conv, [32, 3, 1]], # 0 [-1, 1, Conv, [64, 3, 2]], # 1-P1/2 [-1, 1, Bottleneck, [64]], [-1, 1, Conv, [128, 3, 2]], # 3-P2/4 [-1, 2, Bottleneck, [128]], [-1, 1, Conv, [256, 3, 2]], # 5-P3/8 [-1, 8, Bottleneck, [256]], [-1, 1, Conv, [512, 3, 2]], # 7-P4/16 [-1, 8, Bottleneck, [512]], [-1, 1, Conv, [1024, 3, 2]], # 9-P5/32 [-1, 4, Bottleneck, [1024]], # 10 ] # YOLOv3-SPP head head: [[-1, 1, Bottleneck, [1024, False]], [-1, 1, SPP, [512, [5, 9, 13]]], [-1, 1, Conv, [1024, 3, 1]], [-1, 1, Conv, [512, 1, 1]], [-1, 1, Conv, [1024, 3, 1]], # 15 (P5/32-large) [-2, 1, Conv, [256, 1, 1]], [-1, 1, nn.Upsample, [None, 2, 'nearest']], [[-1, 8], 1, Concat, [1]], # cat backbone P4 [-1, 1, Bottleneck, [512, False]], [-1, 1, Bottleneck, [512, False]], [-1, 1, Conv, [256, 1, 1]], [-1, 1, Conv, [512, 3, 1]], # 22 (P4/16-medium) [-2, 1, Conv, [128, 1, 1]], [-1, 1, nn.Upsample, [None, 2, 'nearest']], [[-1, 6], 1, Concat, [1]], # cat backbone P3 [-1, 1, Bottleneck, [256, False]], [-1, 2, Bottleneck, [256, False]], # 27 (P3/8-small) [[27, 22, 15], 1, Detect, [nc, anchors]], # Detect(P3, P4, P5) ] ================================================ FILE: yolo-improve/yolov5-AUX/models/hub/yolov3-tiny.yaml ================================================ # YOLOv5 🚀 by Ultralytics, GPL-3.0 license # Parameters nc: 80 # number of classes depth_multiple: 1.0 # model depth multiple width_multiple: 1.0 # layer channel multiple anchors: - [10,14, 23,27, 37,58] # P4/16 - [81,82, 135,169, 344,319] # P5/32 # YOLOv3-tiny backbone backbone: # [from, number, module, args] [[-1, 1, Conv, [16, 3, 1]], # 0 [-1, 1, nn.MaxPool2d, [2, 2, 0]], # 1-P1/2 [-1, 1, Conv, [32, 3, 1]], [-1, 1, nn.MaxPool2d, [2, 2, 0]], # 3-P2/4 [-1, 1, Conv, [64, 3, 1]], [-1, 1, nn.MaxPool2d, [2, 2, 0]], # 5-P3/8 [-1, 1, Conv, [128, 3, 1]], [-1, 1, nn.MaxPool2d, [2, 2, 0]], # 7-P4/16 [-1, 1, Conv, [256, 3, 1]], [-1, 1, nn.MaxPool2d, [2, 2, 0]], # 
9-P5/32 [-1, 1, Conv, [512, 3, 1]], [-1, 1, nn.ZeroPad2d, [[0, 1, 0, 1]]], # 11 [-1, 1, nn.MaxPool2d, [2, 1, 0]], # 12 ] # YOLOv3-tiny head head: [[-1, 1, Conv, [1024, 3, 1]], [-1, 1, Conv, [256, 1, 1]], [-1, 1, Conv, [512, 3, 1]], # 15 (P5/32-large) [-2, 1, Conv, [128, 1, 1]], [-1, 1, nn.Upsample, [None, 2, 'nearest']], [[-1, 8], 1, Concat, [1]], # cat backbone P4 [-1, 1, Conv, [256, 3, 1]], # 19 (P4/16-medium) [[19, 15], 1, Detect, [nc, anchors]], # Detect(P4, P5) ] ================================================ FILE: yolo-improve/yolov5-AUX/models/hub/yolov3.yaml ================================================ # YOLOv5 🚀 by Ultralytics, GPL-3.0 license # Parameters nc: 80 # number of classes depth_multiple: 1.0 # model depth multiple width_multiple: 1.0 # layer channel multiple anchors: - [10,13, 16,30, 33,23] # P3/8 - [30,61, 62,45, 59,119] # P4/16 - [116,90, 156,198, 373,326] # P5/32 # darknet53 backbone backbone: # [from, number, module, args] [[-1, 1, Conv, [32, 3, 1]], # 0 [-1, 1, Conv, [64, 3, 2]], # 1-P1/2 [-1, 1, Bottleneck, [64]], [-1, 1, Conv, [128, 3, 2]], # 3-P2/4 [-1, 2, Bottleneck, [128]], [-1, 1, Conv, [256, 3, 2]], # 5-P3/8 [-1, 8, Bottleneck, [256]], [-1, 1, Conv, [512, 3, 2]], # 7-P4/16 [-1, 8, Bottleneck, [512]], [-1, 1, Conv, [1024, 3, 2]], # 9-P5/32 [-1, 4, Bottleneck, [1024]], # 10 ] # YOLOv3 head head: [[-1, 1, Bottleneck, [1024, False]], [-1, 1, Conv, [512, 1, 1]], [-1, 1, Conv, [1024, 3, 1]], [-1, 1, Conv, [512, 1, 1]], [-1, 1, Conv, [1024, 3, 1]], # 15 (P5/32-large) [-2, 1, Conv, [256, 1, 1]], [-1, 1, nn.Upsample, [None, 2, 'nearest']], [[-1, 8], 1, Concat, [1]], # cat backbone P4 [-1, 1, Bottleneck, [512, False]], [-1, 1, Bottleneck, [512, False]], [-1, 1, Conv, [256, 1, 1]], [-1, 1, Conv, [512, 3, 1]], # 22 (P4/16-medium) [-2, 1, Conv, [128, 1, 1]], [-1, 1, nn.Upsample, [None, 2, 'nearest']], [[-1, 6], 1, Concat, [1]], # cat backbone P3 [-1, 1, Bottleneck, [256, False]], [-1, 2, Bottleneck, [256, False]], # 27 (P3/8-small) [[27, 
22, 15], 1, Detect, [nc, anchors]], # Detect(P3, P4, P5) ] ================================================ FILE: yolo-improve/yolov5-AUX/models/hub/yolov5-bifpn.yaml ================================================ # YOLOv5 🚀 by Ultralytics, GPL-3.0 license # Parameters nc: 80 # number of classes depth_multiple: 1.0 # model depth multiple width_multiple: 1.0 # layer channel multiple anchors: - [10,13, 16,30, 33,23] # P3/8 - [30,61, 62,45, 59,119] # P4/16 - [116,90, 156,198, 373,326] # P5/32 # YOLOv5 v6.0 backbone backbone: # [from, number, module, args] [[-1, 1, Conv, [64, 6, 2, 2]], # 0-P1/2 [-1, 1, Conv, [128, 3, 2]], # 1-P2/4 [-1, 3, C3, [128]], [-1, 1, Conv, [256, 3, 2]], # 3-P3/8 [-1, 6, C3, [256]], [-1, 1, Conv, [512, 3, 2]], # 5-P4/16 [-1, 9, C3, [512]], [-1, 1, Conv, [1024, 3, 2]], # 7-P5/32 [-1, 3, C3, [1024]], [-1, 1, SPPF, [1024, 5]], # 9 ] # YOLOv5 v6.0 BiFPN head head: [[-1, 1, Conv, [512, 1, 1]], [-1, 1, nn.Upsample, [None, 2, 'nearest']], [[-1, 6], 1, Concat, [1]], # cat backbone P4 [-1, 3, C3, [512, False]], # 13 [-1, 1, Conv, [256, 1, 1]], [-1, 1, nn.Upsample, [None, 2, 'nearest']], [[-1, 4], 1, Concat, [1]], # cat backbone P3 [-1, 3, C3, [256, False]], # 17 (P3/8-small) [-1, 1, Conv, [256, 3, 2]], [[-1, 14, 6], 1, Concat, [1]], # cat P4 <--- BiFPN change [-1, 3, C3, [512, False]], # 20 (P4/16-medium) [-1, 1, Conv, [512, 3, 2]], [[-1, 10], 1, Concat, [1]], # cat head P5 [-1, 3, C3, [1024, False]], # 23 (P5/32-large) [[17, 20, 23], 1, Detect, [nc, anchors]], # Detect(P3, P4, P5) ] ================================================ FILE: yolo-improve/yolov5-AUX/models/hub/yolov5-fpn.yaml ================================================ # YOLOv5 🚀 by Ultralytics, GPL-3.0 license # Parameters nc: 80 # number of classes depth_multiple: 1.0 # model depth multiple width_multiple: 1.0 # layer channel multiple anchors: - [10,13, 16,30, 33,23] # P3/8 - [30,61, 62,45, 59,119] # P4/16 - [116,90, 156,198, 373,326] # P5/32 # YOLOv5 v6.0 backbone backbone: # 
[from, number, module, args] [[-1, 1, Conv, [64, 6, 2, 2]], # 0-P1/2 [-1, 1, Conv, [128, 3, 2]], # 1-P2/4 [-1, 3, C3, [128]], [-1, 1, Conv, [256, 3, 2]], # 3-P3/8 [-1, 6, C3, [256]], [-1, 1, Conv, [512, 3, 2]], # 5-P4/16 [-1, 9, C3, [512]], [-1, 1, Conv, [1024, 3, 2]], # 7-P5/32 [-1, 3, C3, [1024]], [-1, 1, SPPF, [1024, 5]], # 9 ] # YOLOv5 v6.0 FPN head head: [[-1, 3, C3, [1024, False]], # 10 (P5/32-large) [-1, 1, nn.Upsample, [None, 2, 'nearest']], [[-1, 6], 1, Concat, [1]], # cat backbone P4 [-1, 1, Conv, [512, 1, 1]], [-1, 3, C3, [512, False]], # 14 (P4/16-medium) [-1, 1, nn.Upsample, [None, 2, 'nearest']], [[-1, 4], 1, Concat, [1]], # cat backbone P3 [-1, 1, Conv, [256, 1, 1]], [-1, 3, C3, [256, False]], # 18 (P3/8-small) [[18, 14, 10], 1, Detect, [nc, anchors]], # Detect(P3, P4, P5) ] ================================================ FILE: yolo-improve/yolov5-AUX/models/hub/yolov5-p2.yaml ================================================ # YOLOv5 🚀 by Ultralytics, GPL-3.0 license # Parameters nc: 80 # number of classes depth_multiple: 1.0 # model depth multiple width_multiple: 1.0 # layer channel multiple anchors: 3 # AutoAnchor evolves 3 anchors per P output layer # YOLOv5 v6.0 backbone backbone: # [from, number, module, args] [[-1, 1, Conv, [64, 6, 2, 2]], # 0-P1/2 [-1, 1, Conv, [128, 3, 2]], # 1-P2/4 [-1, 3, C3, [128]], [-1, 1, Conv, [256, 3, 2]], # 3-P3/8 [-1, 6, C3, [256]], [-1, 1, Conv, [512, 3, 2]], # 5-P4/16 [-1, 9, C3, [512]], [-1, 1, Conv, [1024, 3, 2]], # 7-P5/32 [-1, 3, C3, [1024]], [-1, 1, SPPF, [1024, 5]], # 9 ] # YOLOv5 v6.0 head with (P2, P3, P4, P5) outputs head: [[-1, 1, Conv, [512, 1, 1]], [-1, 1, nn.Upsample, [None, 2, 'nearest']], [[-1, 6], 1, Concat, [1]], # cat backbone P4 [-1, 3, C3, [512, False]], # 13 [-1, 1, Conv, [256, 1, 1]], [-1, 1, nn.Upsample, [None, 2, 'nearest']], [[-1, 4], 1, Concat, [1]], # cat backbone P3 [-1, 3, C3, [256, False]], # 17 (P3/8-small) [-1, 1, Conv, [128, 1, 1]], [-1, 1, nn.Upsample, [None, 2, 'nearest']], [[-1, 
2], 1, Concat, [1]], # cat backbone P2 [-1, 1, C3, [128, False]], # 21 (P2/4-xsmall) [-1, 1, Conv, [128, 3, 2]], [[-1, 18], 1, Concat, [1]], # cat head P3 [-1, 3, C3, [256, False]], # 24 (P3/8-small) [-1, 1, Conv, [256, 3, 2]], [[-1, 14], 1, Concat, [1]], # cat head P4 [-1, 3, C3, [512, False]], # 27 (P4/16-medium) [-1, 1, Conv, [512, 3, 2]], [[-1, 10], 1, Concat, [1]], # cat head P5 [-1, 3, C3, [1024, False]], # 30 (P5/32-large) [[21, 24, 27, 30], 1, Detect, [nc, anchors]], # Detect(P2, P3, P4, P5) ] ================================================ FILE: yolo-improve/yolov5-AUX/models/hub/yolov5-p34.yaml ================================================ # YOLOv5 🚀 by Ultralytics, GPL-3.0 license # Parameters nc: 80 # number of classes depth_multiple: 0.33 # model depth multiple width_multiple: 0.50 # layer channel multiple anchors: 3 # AutoAnchor evolves 3 anchors per P output layer # YOLOv5 v6.0 backbone backbone: # [from, number, module, args] [ [ -1, 1, Conv, [ 64, 6, 2, 2 ] ], # 0-P1/2 [ -1, 1, Conv, [ 128, 3, 2 ] ], # 1-P2/4 [ -1, 3, C3, [ 128 ] ], [ -1, 1, Conv, [ 256, 3, 2 ] ], # 3-P3/8 [ -1, 6, C3, [ 256 ] ], [ -1, 1, Conv, [ 512, 3, 2 ] ], # 5-P4/16 [ -1, 9, C3, [ 512 ] ], [ -1, 1, Conv, [ 1024, 3, 2 ] ], # 7-P5/32 [ -1, 3, C3, [ 1024 ] ], [ -1, 1, SPPF, [ 1024, 5 ] ], # 9 ] # YOLOv5 v6.0 head with (P3, P4) outputs head: [ [ -1, 1, Conv, [ 512, 1, 1 ] ], [ -1, 1, nn.Upsample, [ None, 2, 'nearest' ] ], [ [ -1, 6 ], 1, Concat, [ 1 ] ], # cat backbone P4 [ -1, 3, C3, [ 512, False ] ], # 13 [ -1, 1, Conv, [ 256, 1, 1 ] ], [ -1, 1, nn.Upsample, [ None, 2, 'nearest' ] ], [ [ -1, 4 ], 1, Concat, [ 1 ] ], # cat backbone P3 [ -1, 3, C3, [ 256, False ] ], # 17 (P3/8-small) [ -1, 1, Conv, [ 256, 3, 2 ] ], [ [ -1, 14 ], 1, Concat, [ 1 ] ], # cat head P4 [ -1, 3, C3, [ 512, False ] ], # 20 (P4/16-medium) [ [ 17, 20 ], 1, Detect, [ nc, anchors ] ], # Detect(P3, P4) ] ================================================ FILE: yolo-improve/yolov5-AUX/models/hub/yolov5-p6.yaml 
================================================ # YOLOv5 🚀 by Ultralytics, GPL-3.0 license # Parameters nc: 80 # number of classes depth_multiple: 1.0 # model depth multiple width_multiple: 1.0 # layer channel multiple anchors: 3 # AutoAnchor evolves 3 anchors per P output layer # YOLOv5 v6.0 backbone backbone: # [from, number, module, args] [[-1, 1, Conv, [64, 6, 2, 2]], # 0-P1/2 [-1, 1, Conv, [128, 3, 2]], # 1-P2/4 [-1, 3, C3, [128]], [-1, 1, Conv, [256, 3, 2]], # 3-P3/8 [-1, 6, C3, [256]], [-1, 1, Conv, [512, 3, 2]], # 5-P4/16 [-1, 9, C3, [512]], [-1, 1, Conv, [768, 3, 2]], # 7-P5/32 [-1, 3, C3, [768]], [-1, 1, Conv, [1024, 3, 2]], # 9-P6/64 [-1, 3, C3, [1024]], [-1, 1, SPPF, [1024, 5]], # 11 ] # YOLOv5 v6.0 head with (P3, P4, P5, P6) outputs head: [[-1, 1, Conv, [768, 1, 1]], [-1, 1, nn.Upsample, [None, 2, 'nearest']], [[-1, 8], 1, Concat, [1]], # cat backbone P5 [-1, 3, C3, [768, False]], # 15 [-1, 1, Conv, [512, 1, 1]], [-1, 1, nn.Upsample, [None, 2, 'nearest']], [[-1, 6], 1, Concat, [1]], # cat backbone P4 [-1, 3, C3, [512, False]], # 19 [-1, 1, Conv, [256, 1, 1]], [-1, 1, nn.Upsample, [None, 2, 'nearest']], [[-1, 4], 1, Concat, [1]], # cat backbone P3 [-1, 3, C3, [256, False]], # 23 (P3/8-small) [-1, 1, Conv, [256, 3, 2]], [[-1, 20], 1, Concat, [1]], # cat head P4 [-1, 3, C3, [512, False]], # 26 (P4/16-medium) [-1, 1, Conv, [512, 3, 2]], [[-1, 16], 1, Concat, [1]], # cat head P5 [-1, 3, C3, [768, False]], # 29 (P5/32-large) [-1, 1, Conv, [768, 3, 2]], [[-1, 12], 1, Concat, [1]], # cat head P6 [-1, 3, C3, [1024, False]], # 32 (P6/64-xlarge) [[23, 26, 29, 32], 1, Detect, [nc, anchors]], # Detect(P3, P4, P5, P6) ] ================================================ FILE: yolo-improve/yolov5-AUX/models/hub/yolov5-p7.yaml ================================================ # YOLOv5 🚀 by Ultralytics, GPL-3.0 license # Parameters nc: 80 # number of classes depth_multiple: 1.0 # model depth multiple width_multiple: 1.0 # layer channel multiple anchors: 3 # AutoAnchor 
evolves 3 anchors per P output layer # YOLOv5 v6.0 backbone backbone: # [from, number, module, args] [[-1, 1, Conv, [64, 6, 2, 2]], # 0-P1/2 [-1, 1, Conv, [128, 3, 2]], # 1-P2/4 [-1, 3, C3, [128]], [-1, 1, Conv, [256, 3, 2]], # 3-P3/8 [-1, 6, C3, [256]], [-1, 1, Conv, [512, 3, 2]], # 5-P4/16 [-1, 9, C3, [512]], [-1, 1, Conv, [768, 3, 2]], # 7-P5/32 [-1, 3, C3, [768]], [-1, 1, Conv, [1024, 3, 2]], # 9-P6/64 [-1, 3, C3, [1024]], [-1, 1, Conv, [1280, 3, 2]], # 11-P7/128 [-1, 3, C3, [1280]], [-1, 1, SPPF, [1280, 5]], # 13 ] # YOLOv5 v6.0 head with (P3, P4, P5, P6, P7) outputs head: [[-1, 1, Conv, [1024, 1, 1]], [-1, 1, nn.Upsample, [None, 2, 'nearest']], [[-1, 10], 1, Concat, [1]], # cat backbone P6 [-1, 3, C3, [1024, False]], # 17 [-1, 1, Conv, [768, 1, 1]], [-1, 1, nn.Upsample, [None, 2, 'nearest']], [[-1, 8], 1, Concat, [1]], # cat backbone P5 [-1, 3, C3, [768, False]], # 21 [-1, 1, Conv, [512, 1, 1]], [-1, 1, nn.Upsample, [None, 2, 'nearest']], [[-1, 6], 1, Concat, [1]], # cat backbone P4 [-1, 3, C3, [512, False]], # 25 [-1, 1, Conv, [256, 1, 1]], [-1, 1, nn.Upsample, [None, 2, 'nearest']], [[-1, 4], 1, Concat, [1]], # cat backbone P3 [-1, 3, C3, [256, False]], # 29 (P3/8-small) [-1, 1, Conv, [256, 3, 2]], [[-1, 26], 1, Concat, [1]], # cat head P4 [-1, 3, C3, [512, False]], # 32 (P4/16-medium) [-1, 1, Conv, [512, 3, 2]], [[-1, 22], 1, Concat, [1]], # cat head P5 [-1, 3, C3, [768, False]], # 35 (P5/32-large) [-1, 1, Conv, [768, 3, 2]], [[-1, 18], 1, Concat, [1]], # cat head P6 [-1, 3, C3, [1024, False]], # 38 (P6/64-xlarge) [-1, 1, Conv, [1024, 3, 2]], [[-1, 14], 1, Concat, [1]], # cat head P7 [-1, 3, C3, [1280, False]], # 41 (P7/128-xxlarge) [[29, 32, 35, 38, 41], 1, Detect, [nc, anchors]], # Detect(P3, P4, P5, P6, P7) ] ================================================ FILE: yolo-improve/yolov5-AUX/models/hub/yolov5-panet.yaml ================================================ # YOLOv5 🚀 by Ultralytics, GPL-3.0 license # Parameters nc: 80 # number of classes 
depth_multiple: 1.0 # model depth multiple width_multiple: 1.0 # layer channel multiple anchors: - [10,13, 16,30, 33,23] # P3/8 - [30,61, 62,45, 59,119] # P4/16 - [116,90, 156,198, 373,326] # P5/32 # YOLOv5 v6.0 backbone backbone: # [from, number, module, args] [[-1, 1, Conv, [64, 6, 2, 2]], # 0-P1/2 [-1, 1, Conv, [128, 3, 2]], # 1-P2/4 [-1, 3, C3, [128]], [-1, 1, Conv, [256, 3, 2]], # 3-P3/8 [-1, 6, C3, [256]], [-1, 1, Conv, [512, 3, 2]], # 5-P4/16 [-1, 9, C3, [512]], [-1, 1, Conv, [1024, 3, 2]], # 7-P5/32 [-1, 3, C3, [1024]], [-1, 1, SPPF, [1024, 5]], # 9 ] # YOLOv5 v6.0 PANet head head: [[-1, 1, Conv, [512, 1, 1]], [-1, 1, nn.Upsample, [None, 2, 'nearest']], [[-1, 6], 1, Concat, [1]], # cat backbone P4 [-1, 3, C3, [512, False]], # 13 [-1, 1, Conv, [256, 1, 1]], [-1, 1, nn.Upsample, [None, 2, 'nearest']], [[-1, 4], 1, Concat, [1]], # cat backbone P3 [-1, 3, C3, [256, False]], # 17 (P3/8-small) [-1, 1, Conv, [256, 3, 2]], [[-1, 14], 1, Concat, [1]], # cat head P4 [-1, 3, C3, [512, False]], # 20 (P4/16-medium) [-1, 1, Conv, [512, 3, 2]], [[-1, 10], 1, Concat, [1]], # cat head P5 [-1, 3, C3, [1024, False]], # 23 (P5/32-large) [[17, 20, 23], 1, Detect, [nc, anchors]], # Detect(P3, P4, P5) ] ================================================ FILE: yolo-improve/yolov5-AUX/models/hub/yolov5l6.yaml ================================================ # YOLOv5 🚀 by Ultralytics, GPL-3.0 license # Parameters nc: 80 # number of classes depth_multiple: 1.0 # model depth multiple width_multiple: 1.0 # layer channel multiple anchors: - [19,27, 44,40, 38,94] # P3/8 - [96,68, 86,152, 180,137] # P4/16 - [140,301, 303,264, 238,542] # P5/32 - [436,615, 739,380, 925,792] # P6/64 # YOLOv5 v6.0 backbone backbone: # [from, number, module, args] [[-1, 1, Conv, [64, 6, 2, 2]], # 0-P1/2 [-1, 1, Conv, [128, 3, 2]], # 1-P2/4 [-1, 3, C3, [128]], [-1, 1, Conv, [256, 3, 2]], # 3-P3/8 [-1, 6, C3, [256]], [-1, 1, Conv, [512, 3, 2]], # 5-P4/16 [-1, 9, C3, [512]], [-1, 1, Conv, [768, 3, 2]], # 7-P5/32 
[-1, 3, C3, [768]], [-1, 1, Conv, [1024, 3, 2]], # 9-P6/64 [-1, 3, C3, [1024]], [-1, 1, SPPF, [1024, 5]], # 11 ] # YOLOv5 v6.0 head head: [[-1, 1, Conv, [768, 1, 1]], [-1, 1, nn.Upsample, [None, 2, 'nearest']], [[-1, 8], 1, Concat, [1]], # cat backbone P5 [-1, 3, C3, [768, False]], # 15 [-1, 1, Conv, [512, 1, 1]], [-1, 1, nn.Upsample, [None, 2, 'nearest']], [[-1, 6], 1, Concat, [1]], # cat backbone P4 [-1, 3, C3, [512, False]], # 19 [-1, 1, Conv, [256, 1, 1]], [-1, 1, nn.Upsample, [None, 2, 'nearest']], [[-1, 4], 1, Concat, [1]], # cat backbone P3 [-1, 3, C3, [256, False]], # 23 (P3/8-small) [-1, 1, Conv, [256, 3, 2]], [[-1, 20], 1, Concat, [1]], # cat head P4 [-1, 3, C3, [512, False]], # 26 (P4/16-medium) [-1, 1, Conv, [512, 3, 2]], [[-1, 16], 1, Concat, [1]], # cat head P5 [-1, 3, C3, [768, False]], # 29 (P5/32-large) [-1, 1, Conv, [768, 3, 2]], [[-1, 12], 1, Concat, [1]], # cat head P6 [-1, 3, C3, [1024, False]], # 32 (P6/64-xlarge) [[23, 26, 29, 32], 1, Detect, [nc, anchors]], # Detect(P3, P4, P5, P6) ] ================================================ FILE: yolo-improve/yolov5-AUX/models/hub/yolov5m6.yaml ================================================ # YOLOv5 🚀 by Ultralytics, GPL-3.0 license # Parameters nc: 80 # number of classes depth_multiple: 0.67 # model depth multiple width_multiple: 0.75 # layer channel multiple anchors: - [19,27, 44,40, 38,94] # P3/8 - [96,68, 86,152, 180,137] # P4/16 - [140,301, 303,264, 238,542] # P5/32 - [436,615, 739,380, 925,792] # P6/64 # YOLOv5 v6.0 backbone backbone: # [from, number, module, args] [[-1, 1, Conv, [64, 6, 2, 2]], # 0-P1/2 [-1, 1, Conv, [128, 3, 2]], # 1-P2/4 [-1, 3, C3, [128]], [-1, 1, Conv, [256, 3, 2]], # 3-P3/8 [-1, 6, C3, [256]], [-1, 1, Conv, [512, 3, 2]], # 5-P4/16 [-1, 9, C3, [512]], [-1, 1, Conv, [768, 3, 2]], # 7-P5/32 [-1, 3, C3, [768]], [-1, 1, Conv, [1024, 3, 2]], # 9-P6/64 [-1, 3, C3, [1024]], [-1, 1, SPPF, [1024, 5]], # 11 ] # YOLOv5 v6.0 head head: [[-1, 1, Conv, [768, 1, 1]], [-1, 1, 
nn.Upsample, [None, 2, 'nearest']], [[-1, 8], 1, Concat, [1]], # cat backbone P5 [-1, 3, C3, [768, False]], # 15 [-1, 1, Conv, [512, 1, 1]], [-1, 1, nn.Upsample, [None, 2, 'nearest']], [[-1, 6], 1, Concat, [1]], # cat backbone P4 [-1, 3, C3, [512, False]], # 19 [-1, 1, Conv, [256, 1, 1]], [-1, 1, nn.Upsample, [None, 2, 'nearest']], [[-1, 4], 1, Concat, [1]], # cat backbone P3 [-1, 3, C3, [256, False]], # 23 (P3/8-small) [-1, 1, Conv, [256, 3, 2]], [[-1, 20], 1, Concat, [1]], # cat head P4 [-1, 3, C3, [512, False]], # 26 (P4/16-medium) [-1, 1, Conv, [512, 3, 2]], [[-1, 16], 1, Concat, [1]], # cat head P5 [-1, 3, C3, [768, False]], # 29 (P5/32-large) [-1, 1, Conv, [768, 3, 2]], [[-1, 12], 1, Concat, [1]], # cat head P6 [-1, 3, C3, [1024, False]], # 32 (P6/64-xlarge) [[23, 26, 29, 32], 1, Detect, [nc, anchors]], # Detect(P3, P4, P5, P6) ] ================================================ FILE: yolo-improve/yolov5-AUX/models/hub/yolov5n6.yaml ================================================ # YOLOv5 🚀 by Ultralytics, GPL-3.0 license # Parameters nc: 80 # number of classes depth_multiple: 0.33 # model depth multiple width_multiple: 0.25 # layer channel multiple anchors: - [19,27, 44,40, 38,94] # P3/8 - [96,68, 86,152, 180,137] # P4/16 - [140,301, 303,264, 238,542] # P5/32 - [436,615, 739,380, 925,792] # P6/64 # YOLOv5 v6.0 backbone backbone: # [from, number, module, args] [[-1, 1, Conv, [64, 6, 2, 2]], # 0-P1/2 [-1, 1, Conv, [128, 3, 2]], # 1-P2/4 [-1, 3, C3, [128]], [-1, 1, Conv, [256, 3, 2]], # 3-P3/8 [-1, 6, C3, [256]], [-1, 1, Conv, [512, 3, 2]], # 5-P4/16 [-1, 9, C3, [512]], [-1, 1, Conv, [768, 3, 2]], # 7-P5/32 [-1, 3, C3, [768]], [-1, 1, Conv, [1024, 3, 2]], # 9-P6/64 [-1, 3, C3, [1024]], [-1, 1, SPPF, [1024, 5]], # 11 ] # YOLOv5 v6.0 head head: [[-1, 1, Conv, [768, 1, 1]], [-1, 1, nn.Upsample, [None, 2, 'nearest']], [[-1, 8], 1, Concat, [1]], # cat backbone P5 [-1, 3, C3, [768, False]], # 15 [-1, 1, Conv, [512, 1, 1]], [-1, 1, nn.Upsample, [None, 2, 'nearest']], 
[[-1, 6], 1, Concat, [1]], # cat backbone P4 [-1, 3, C3, [512, False]], # 19 [-1, 1, Conv, [256, 1, 1]], [-1, 1, nn.Upsample, [None, 2, 'nearest']], [[-1, 4], 1, Concat, [1]], # cat backbone P3 [-1, 3, C3, [256, False]], # 23 (P3/8-small) [-1, 1, Conv, [256, 3, 2]], [[-1, 20], 1, Concat, [1]], # cat head P4 [-1, 3, C3, [512, False]], # 26 (P4/16-medium) [-1, 1, Conv, [512, 3, 2]], [[-1, 16], 1, Concat, [1]], # cat head P5 [-1, 3, C3, [768, False]], # 29 (P5/32-large) [-1, 1, Conv, [768, 3, 2]], [[-1, 12], 1, Concat, [1]], # cat head P6 [-1, 3, C3, [1024, False]], # 32 (P6/64-xlarge) [[23, 26, 29, 32], 1, Detect, [nc, anchors]], # Detect(P3, P4, P5, P6) ] ================================================ FILE: yolo-improve/yolov5-AUX/models/hub/yolov5s-LeakyReLU.yaml ================================================ # YOLOv5 🚀 by Ultralytics, GPL-3.0 license # Parameters nc: 80 # number of classes activation: nn.LeakyReLU(0.1) # <----- Conv() activation used throughout entire YOLOv5 model depth_multiple: 0.33 # model depth multiple width_multiple: 0.50 # layer channel multiple anchors: - [10,13, 16,30, 33,23] # P3/8 - [30,61, 62,45, 59,119] # P4/16 - [116,90, 156,198, 373,326] # P5/32 # YOLOv5 v6.0 backbone backbone: # [from, number, module, args] [[-1, 1, Conv, [64, 6, 2, 2]], # 0-P1/2 [-1, 1, Conv, [128, 3, 2]], # 1-P2/4 [-1, 3, C3, [128]], [-1, 1, Conv, [256, 3, 2]], # 3-P3/8 [-1, 6, C3, [256]], [-1, 1, Conv, [512, 3, 2]], # 5-P4/16 [-1, 9, C3, [512]], [-1, 1, Conv, [1024, 3, 2]], # 7-P5/32 [-1, 3, C3, [1024]], [-1, 1, SPPF, [1024, 5]], # 9 ] # YOLOv5 v6.0 head head: [[-1, 1, Conv, [512, 1, 1]], [-1, 1, nn.Upsample, [None, 2, 'nearest']], [[-1, 6], 1, Concat, [1]], # cat backbone P4 [-1, 3, C3, [512, False]], # 13 [-1, 1, Conv, [256, 1, 1]], [-1, 1, nn.Upsample, [None, 2, 'nearest']], [[-1, 4], 1, Concat, [1]], # cat backbone P3 [-1, 3, C3, [256, False]], # 17 (P3/8-small) [-1, 1, Conv, [256, 3, 2]], [[-1, 14], 1, Concat, [1]], # cat head P4 [-1, 3, C3, [512, 
False]], # 20 (P4/16-medium) [-1, 1, Conv, [512, 3, 2]], [[-1, 10], 1, Concat, [1]], # cat head P5 [-1, 3, C3, [1024, False]], # 23 (P5/32-large) [[17, 20, 23], 1, Detect, [nc, anchors]], # Detect(P3, P4, P5) ] ================================================ FILE: yolo-improve/yolov5-AUX/models/hub/yolov5s-ghost.yaml ================================================ # YOLOv5 🚀 by Ultralytics, GPL-3.0 license # Parameters nc: 80 # number of classes depth_multiple: 0.33 # model depth multiple width_multiple: 0.50 # layer channel multiple anchors: - [10,13, 16,30, 33,23] # P3/8 - [30,61, 62,45, 59,119] # P4/16 - [116,90, 156,198, 373,326] # P5/32 # YOLOv5 v6.0 backbone backbone: # [from, number, module, args] [[-1, 1, Conv, [64, 6, 2, 2]], # 0-P1/2 [-1, 1, GhostConv, [128, 3, 2]], # 1-P2/4 [-1, 3, C3Ghost, [128]], [-1, 1, GhostConv, [256, 3, 2]], # 3-P3/8 [-1, 6, C3Ghost, [256]], [-1, 1, GhostConv, [512, 3, 2]], # 5-P4/16 [-1, 9, C3Ghost, [512]], [-1, 1, GhostConv, [1024, 3, 2]], # 7-P5/32 [-1, 3, C3Ghost, [1024]], [-1, 1, SPPF, [1024, 5]], # 9 ] # YOLOv5 v6.0 head head: [[-1, 1, GhostConv, [512, 1, 1]], [-1, 1, nn.Upsample, [None, 2, 'nearest']], [[-1, 6], 1, Concat, [1]], # cat backbone P4 [-1, 3, C3Ghost, [512, False]], # 13 [-1, 1, GhostConv, [256, 1, 1]], [-1, 1, nn.Upsample, [None, 2, 'nearest']], [[-1, 4], 1, Concat, [1]], # cat backbone P3 [-1, 3, C3Ghost, [256, False]], # 17 (P3/8-small) [-1, 1, GhostConv, [256, 3, 2]], [[-1, 14], 1, Concat, [1]], # cat head P4 [-1, 3, C3Ghost, [512, False]], # 20 (P4/16-medium) [-1, 1, GhostConv, [512, 3, 2]], [[-1, 10], 1, Concat, [1]], # cat head P5 [-1, 3, C3Ghost, [1024, False]], # 23 (P5/32-large) [[17, 20, 23], 1, Detect, [nc, anchors]], # Detect(P3, P4, P5) ] ================================================ FILE: yolo-improve/yolov5-AUX/models/hub/yolov5s-transformer.yaml ================================================ # YOLOv5 🚀 by Ultralytics, GPL-3.0 license # Parameters nc: 80 # number of classes depth_multiple: 
0.33 # model depth multiple width_multiple: 0.50 # layer channel multiple anchors: - [10,13, 16,30, 33,23] # P3/8 - [30,61, 62,45, 59,119] # P4/16 - [116,90, 156,198, 373,326] # P5/32 # YOLOv5 v6.0 backbone backbone: # [from, number, module, args] [[-1, 1, Conv, [64, 6, 2, 2]], # 0-P1/2 [-1, 1, Conv, [128, 3, 2]], # 1-P2/4 [-1, 3, C3, [128]], [-1, 1, Conv, [256, 3, 2]], # 3-P3/8 [-1, 6, C3, [256]], [-1, 1, Conv, [512, 3, 2]], # 5-P4/16 [-1, 9, C3, [512]], [-1, 1, Conv, [1024, 3, 2]], # 7-P5/32 [-1, 3, C3TR, [1024]], # 8 <--- C3TR() Transformer module [-1, 1, SPPF, [1024, 5]], # 9 ] # YOLOv5 v6.0 head head: [[-1, 1, Conv, [512, 1, 1]], [-1, 1, nn.Upsample, [None, 2, 'nearest']], [[-1, 6], 1, Concat, [1]], # cat backbone P4 [-1, 3, C3, [512, False]], # 13 [-1, 1, Conv, [256, 1, 1]], [-1, 1, nn.Upsample, [None, 2, 'nearest']], [[-1, 4], 1, Concat, [1]], # cat backbone P3 [-1, 3, C3, [256, False]], # 17 (P3/8-small) [-1, 1, Conv, [256, 3, 2]], [[-1, 14], 1, Concat, [1]], # cat head P4 [-1, 3, C3, [512, False]], # 20 (P4/16-medium) [-1, 1, Conv, [512, 3, 2]], [[-1, 10], 1, Concat, [1]], # cat head P5 [-1, 3, C3, [1024, False]], # 23 (P5/32-large) [[17, 20, 23], 1, Detect, [nc, anchors]], # Detect(P3, P4, P5) ] ================================================ FILE: yolo-improve/yolov5-AUX/models/hub/yolov5s6.yaml ================================================ # YOLOv5 🚀 by Ultralytics, GPL-3.0 license # Parameters nc: 80 # number of classes depth_multiple: 0.33 # model depth multiple width_multiple: 0.50 # layer channel multiple anchors: - [19,27, 44,40, 38,94] # P3/8 - [96,68, 86,152, 180,137] # P4/16 - [140,301, 303,264, 238,542] # P5/32 - [436,615, 739,380, 925,792] # P6/64 # YOLOv5 v6.0 backbone backbone: # [from, number, module, args] [[-1, 1, Conv, [64, 6, 2, 2]], # 0-P1/2 [-1, 1, Conv, [128, 3, 2]], # 1-P2/4 [-1, 3, C3, [128]], [-1, 1, Conv, [256, 3, 2]], # 3-P3/8 [-1, 6, C3, [256]], [-1, 1, Conv, [512, 3, 2]], # 5-P4/16 [-1, 9, C3, [512]], [-1, 1, Conv, [768, 
3, 2]], # 7-P5/32 [-1, 3, C3, [768]], [-1, 1, Conv, [1024, 3, 2]], # 9-P6/64 [-1, 3, C3, [1024]], [-1, 1, SPPF, [1024, 5]], # 11 ] # YOLOv5 v6.0 head head: [[-1, 1, Conv, [768, 1, 1]], [-1, 1, nn.Upsample, [None, 2, 'nearest']], [[-1, 8], 1, Concat, [1]], # cat backbone P5 [-1, 3, C3, [768, False]], # 15 [-1, 1, Conv, [512, 1, 1]], [-1, 1, nn.Upsample, [None, 2, 'nearest']], [[-1, 6], 1, Concat, [1]], # cat backbone P4 [-1, 3, C3, [512, False]], # 19 [-1, 1, Conv, [256, 1, 1]], [-1, 1, nn.Upsample, [None, 2, 'nearest']], [[-1, 4], 1, Concat, [1]], # cat backbone P3 [-1, 3, C3, [256, False]], # 23 (P3/8-small) [-1, 1, Conv, [256, 3, 2]], [[-1, 20], 1, Concat, [1]], # cat head P4 [-1, 3, C3, [512, False]], # 26 (P4/16-medium) [-1, 1, Conv, [512, 3, 2]], [[-1, 16], 1, Concat, [1]], # cat head P5 [-1, 3, C3, [768, False]], # 29 (P5/32-large) [-1, 1, Conv, [768, 3, 2]], [[-1, 12], 1, Concat, [1]], # cat head P6 [-1, 3, C3, [1024, False]], # 32 (P6/64-xlarge) [[23, 26, 29, 32], 1, Detect, [nc, anchors]], # Detect(P3, P4, P5, P6) ] ================================================ FILE: yolo-improve/yolov5-AUX/models/hub/yolov5x6.yaml ================================================ # YOLOv5 🚀 by Ultralytics, GPL-3.0 license # Parameters nc: 80 # number of classes depth_multiple: 1.33 # model depth multiple width_multiple: 1.25 # layer channel multiple anchors: - [19,27, 44,40, 38,94] # P3/8 - [96,68, 86,152, 180,137] # P4/16 - [140,301, 303,264, 238,542] # P5/32 - [436,615, 739,380, 925,792] # P6/64 # YOLOv5 v6.0 backbone backbone: # [from, number, module, args] [[-1, 1, Conv, [64, 6, 2, 2]], # 0-P1/2 [-1, 1, Conv, [128, 3, 2]], # 1-P2/4 [-1, 3, C3, [128]], [-1, 1, Conv, [256, 3, 2]], # 3-P3/8 [-1, 6, C3, [256]], [-1, 1, Conv, [512, 3, 2]], # 5-P4/16 [-1, 9, C3, [512]], [-1, 1, Conv, [768, 3, 2]], # 7-P5/32 [-1, 3, C3, [768]], [-1, 1, Conv, [1024, 3, 2]], # 9-P6/64 [-1, 3, C3, [1024]], [-1, 1, SPPF, [1024, 5]], # 11 ] # YOLOv5 v6.0 head head: [[-1, 1, Conv, [768, 1, 1]], 
[-1, 1, nn.Upsample, [None, 2, 'nearest']], [[-1, 8], 1, Concat, [1]], # cat backbone P5 [-1, 3, C3, [768, False]], # 15 [-1, 1, Conv, [512, 1, 1]], [-1, 1, nn.Upsample, [None, 2, 'nearest']], [[-1, 6], 1, Concat, [1]], # cat backbone P4 [-1, 3, C3, [512, False]], # 19 [-1, 1, Conv, [256, 1, 1]], [-1, 1, nn.Upsample, [None, 2, 'nearest']], [[-1, 4], 1, Concat, [1]], # cat backbone P3 [-1, 3, C3, [256, False]], # 23 (P3/8-small) [-1, 1, Conv, [256, 3, 2]], [[-1, 20], 1, Concat, [1]], # cat head P4 [-1, 3, C3, [512, False]], # 26 (P4/16-medium) [-1, 1, Conv, [512, 3, 2]], [[-1, 16], 1, Concat, [1]], # cat head P5 [-1, 3, C3, [768, False]], # 29 (P5/32-large) [-1, 1, Conv, [768, 3, 2]], [[-1, 12], 1, Concat, [1]], # cat head P6 [-1, 3, C3, [1024, False]], # 32 (P6/64-xlarge) [[23, 26, 29, 32], 1, Detect, [nc, anchors]], # Detect(P3, P4, P5, P6) ] ================================================ FILE: yolo-improve/yolov5-AUX/models/segment/yolov5l-seg.yaml ================================================ # YOLOv5 🚀 by Ultralytics, GPL-3.0 license # Parameters nc: 80 # number of classes depth_multiple: 1.0 # model depth multiple width_multiple: 1.0 # layer channel multiple anchors: - [10,13, 16,30, 33,23] # P3/8 - [30,61, 62,45, 59,119] # P4/16 - [116,90, 156,198, 373,326] # P5/32 # YOLOv5 v6.0 backbone backbone: # [from, number, module, args] [[-1, 1, Conv, [64, 6, 2, 2]], # 0-P1/2 [-1, 1, Conv, [128, 3, 2]], # 1-P2/4 [-1, 3, C3, [128]], [-1, 1, Conv, [256, 3, 2]], # 3-P3/8 [-1, 6, C3, [256]], [-1, 1, Conv, [512, 3, 2]], # 5-P4/16 [-1, 9, C3, [512]], [-1, 1, Conv, [1024, 3, 2]], # 7-P5/32 [-1, 3, C3, [1024]], [-1, 1, SPPF, [1024, 5]], # 9 ] # YOLOv5 v6.0 head head: [[-1, 1, Conv, [512, 1, 1]], [-1, 1, nn.Upsample, [None, 2, 'nearest']], [[-1, 6], 1, Concat, [1]], # cat backbone P4 [-1, 3, C3, [512, False]], # 13 [-1, 1, Conv, [256, 1, 1]], [-1, 1, nn.Upsample, [None, 2, 'nearest']], [[-1, 4], 1, Concat, [1]], # cat backbone P3 [-1, 3, C3, [256, False]], # 17 
(P3/8-small) [-1, 1, Conv, [256, 3, 2]], [[-1, 14], 1, Concat, [1]], # cat head P4 [-1, 3, C3, [512, False]], # 20 (P4/16-medium) [-1, 1, Conv, [512, 3, 2]], [[-1, 10], 1, Concat, [1]], # cat head P5 [-1, 3, C3, [1024, False]], # 23 (P5/32-large) [[17, 20, 23], 1, Segment, [nc, anchors, 32, 256]], # Detect(P3, P4, P5) ] ================================================ FILE: yolo-improve/yolov5-AUX/models/segment/yolov5m-seg.yaml ================================================ # YOLOv5 🚀 by Ultralytics, GPL-3.0 license # Parameters nc: 80 # number of classes depth_multiple: 0.67 # model depth multiple width_multiple: 0.75 # layer channel multiple anchors: - [10,13, 16,30, 33,23] # P3/8 - [30,61, 62,45, 59,119] # P4/16 - [116,90, 156,198, 373,326] # P5/32 # YOLOv5 v6.0 backbone backbone: # [from, number, module, args] [[-1, 1, Conv, [64, 6, 2, 2]], # 0-P1/2 [-1, 1, Conv, [128, 3, 2]], # 1-P2/4 [-1, 3, C3, [128]], [-1, 1, Conv, [256, 3, 2]], # 3-P3/8 [-1, 6, C3, [256]], [-1, 1, Conv, [512, 3, 2]], # 5-P4/16 [-1, 9, C3, [512]], [-1, 1, Conv, [1024, 3, 2]], # 7-P5/32 [-1, 3, C3, [1024]], [-1, 1, SPPF, [1024, 5]], # 9 ] # YOLOv5 v6.0 head head: [[-1, 1, Conv, [512, 1, 1]], [-1, 1, nn.Upsample, [None, 2, 'nearest']], [[-1, 6], 1, Concat, [1]], # cat backbone P4 [-1, 3, C3, [512, False]], # 13 [-1, 1, Conv, [256, 1, 1]], [-1, 1, nn.Upsample, [None, 2, 'nearest']], [[-1, 4], 1, Concat, [1]], # cat backbone P3 [-1, 3, C3, [256, False]], # 17 (P3/8-small) [-1, 1, Conv, [256, 3, 2]], [[-1, 14], 1, Concat, [1]], # cat head P4 [-1, 3, C3, [512, False]], # 20 (P4/16-medium) [-1, 1, Conv, [512, 3, 2]], [[-1, 10], 1, Concat, [1]], # cat head P5 [-1, 3, C3, [1024, False]], # 23 (P5/32-large) [[17, 20, 23], 1, Segment, [nc, anchors, 32, 256]], # Detect(P3, P4, P5) ] ================================================ FILE: yolo-improve/yolov5-AUX/models/segment/yolov5n-seg.yaml ================================================ # YOLOv5 🚀 by Ultralytics, GPL-3.0 license # Parameters nc: 
80 # number of classes depth_multiple: 0.33 # model depth multiple width_multiple: 0.25 # layer channel multiple anchors: - [10,13, 16,30, 33,23] # P3/8 - [30,61, 62,45, 59,119] # P4/16 - [116,90, 156,198, 373,326] # P5/32 # YOLOv5 v6.0 backbone backbone: # [from, number, module, args] [[-1, 1, Conv, [64, 6, 2, 2]], # 0-P1/2 [-1, 1, Conv, [128, 3, 2]], # 1-P2/4 [-1, 3, C3, [128]], [-1, 1, Conv, [256, 3, 2]], # 3-P3/8 [-1, 6, C3, [256]], [-1, 1, Conv, [512, 3, 2]], # 5-P4/16 [-1, 9, C3, [512]], [-1, 1, Conv, [1024, 3, 2]], # 7-P5/32 [-1, 3, C3, [1024]], [-1, 1, SPPF, [1024, 5]], # 9 ] # YOLOv5 v6.0 head head: [[-1, 1, Conv, [512, 1, 1]], [-1, 1, nn.Upsample, [None, 2, 'nearest']], [[-1, 6], 1, Concat, [1]], # cat backbone P4 [-1, 3, C3, [512, False]], # 13 [-1, 1, Conv, [256, 1, 1]], [-1, 1, nn.Upsample, [None, 2, 'nearest']], [[-1, 4], 1, Concat, [1]], # cat backbone P3 [-1, 3, C3, [256, False]], # 17 (P3/8-small) [-1, 1, Conv, [256, 3, 2]], [[-1, 14], 1, Concat, [1]], # cat head P4 [-1, 3, C3, [512, False]], # 20 (P4/16-medium) [-1, 1, Conv, [512, 3, 2]], [[-1, 10], 1, Concat, [1]], # cat head P5 [-1, 3, C3, [1024, False]], # 23 (P5/32-large) [[17, 20, 23], 1, Segment, [nc, anchors, 32, 256]], # Detect(P3, P4, P5) ] ================================================ FILE: yolo-improve/yolov5-AUX/models/segment/yolov5s-seg.yaml ================================================ # YOLOv5 🚀 by Ultralytics, GPL-3.0 license # Parameters nc: 80 # number of classes depth_multiple: 0.33 # model depth multiple width_multiple: 0.5 # layer channel multiple anchors: - [10,13, 16,30, 33,23] # P3/8 - [30,61, 62,45, 59,119] # P4/16 - [116,90, 156,198, 373,326] # P5/32 # YOLOv5 v6.0 backbone backbone: # [from, number, module, args] [[-1, 1, Conv, [64, 6, 2, 2]], # 0-P1/2 [-1, 1, Conv, [128, 3, 2]], # 1-P2/4 [-1, 3, C3, [128]], [-1, 1, Conv, [256, 3, 2]], # 3-P3/8 [-1, 6, C3, [256]], [-1, 1, Conv, [512, 3, 2]], # 5-P4/16 [-1, 9, C3, [512]], [-1, 1, Conv, [1024, 3, 2]], # 7-P5/32 [-1, 
3, C3, [1024]], [-1, 1, SPPF, [1024, 5]], # 9 ] # YOLOv5 v6.0 head head: [[-1, 1, Conv, [512, 1, 1]], [-1, 1, nn.Upsample, [None, 2, 'nearest']], [[-1, 6], 1, Concat, [1]], # cat backbone P4 [-1, 3, C3, [512, False]], # 13 [-1, 1, Conv, [256, 1, 1]], [-1, 1, nn.Upsample, [None, 2, 'nearest']], [[-1, 4], 1, Concat, [1]], # cat backbone P3 [-1, 3, C3, [256, False]], # 17 (P3/8-small) [-1, 1, Conv, [256, 3, 2]], [[-1, 14], 1, Concat, [1]], # cat head P4 [-1, 3, C3, [512, False]], # 20 (P4/16-medium) [-1, 1, Conv, [512, 3, 2]], [[-1, 10], 1, Concat, [1]], # cat head P5 [-1, 3, C3, [1024, False]], # 23 (P5/32-large) [[17, 20, 23], 1, Segment, [nc, anchors, 32, 256]], # Detect(P3, P4, P5) ] ================================================ FILE: yolo-improve/yolov5-AUX/models/segment/yolov5x-seg.yaml ================================================ # YOLOv5 🚀 by Ultralytics, GPL-3.0 license # Parameters nc: 80 # number of classes depth_multiple: 1.33 # model depth multiple width_multiple: 1.25 # layer channel multiple anchors: - [10,13, 16,30, 33,23] # P3/8 - [30,61, 62,45, 59,119] # P4/16 - [116,90, 156,198, 373,326] # P5/32 # YOLOv5 v6.0 backbone backbone: # [from, number, module, args] [[-1, 1, Conv, [64, 6, 2, 2]], # 0-P1/2 [-1, 1, Conv, [128, 3, 2]], # 1-P2/4 [-1, 3, C3, [128]], [-1, 1, Conv, [256, 3, 2]], # 3-P3/8 [-1, 6, C3, [256]], [-1, 1, Conv, [512, 3, 2]], # 5-P4/16 [-1, 9, C3, [512]], [-1, 1, Conv, [1024, 3, 2]], # 7-P5/32 [-1, 3, C3, [1024]], [-1, 1, SPPF, [1024, 5]], # 9 ] # YOLOv5 v6.0 head head: [[-1, 1, Conv, [512, 1, 1]], [-1, 1, nn.Upsample, [None, 2, 'nearest']], [[-1, 6], 1, Concat, [1]], # cat backbone P4 [-1, 3, C3, [512, False]], # 13 [-1, 1, Conv, [256, 1, 1]], [-1, 1, nn.Upsample, [None, 2, 'nearest']], [[-1, 4], 1, Concat, [1]], # cat backbone P3 [-1, 3, C3, [256, False]], # 17 (P3/8-small) [-1, 1, Conv, [256, 3, 2]], [[-1, 14], 1, Concat, [1]], # cat head P4 [-1, 3, C3, [512, False]], # 20 (P4/16-medium) [-1, 1, Conv, [512, 3, 2]], [[-1, 10], 
1, Concat, [1]], # cat head P5 [-1, 3, C3, [1024, False]], # 23 (P5/32-large) [[17, 20, 23], 1, Segment, [nc, anchors, 32, 256]], # Detect(P3, P4, P5) ] ================================================ FILE: yolo-improve/yolov5-AUX/models/tf.py ================================================ # YOLOv5 🚀 by Ultralytics, GPL-3.0 license """ TensorFlow, Keras and TFLite versions of YOLOv5 Authored by https://github.com/zldrobit in PR https://github.com/ultralytics/yolov5/pull/1127 Usage: $ python models/tf.py --weights yolov5s.pt Export: $ python export.py --weights yolov5s.pt --include saved_model pb tflite tfjs """ import argparse import sys from copy import deepcopy from pathlib import Path FILE = Path(__file__).resolve() ROOT = FILE.parents[1] # YOLOv5 root directory if str(ROOT) not in sys.path: sys.path.append(str(ROOT)) # add ROOT to PATH # ROOT = ROOT.relative_to(Path.cwd()) # relative import numpy as np import tensorflow as tf import torch import torch.nn as nn from tensorflow import keras from models.common import (C3, SPP, SPPF, Bottleneck, BottleneckCSP, C3x, Concat, Conv, CrossConv, DWConv, DWConvTranspose2d, Focus, autopad) from models.experimental import MixConv2d, attempt_load from models.yolo import Detect, Segment from utils.activations import SiLU from utils.general import LOGGER, make_divisible, print_args class TFBN(keras.layers.Layer): # TensorFlow BatchNormalization wrapper def __init__(self, w=None): super().__init__() self.bn = keras.layers.BatchNormalization( beta_initializer=keras.initializers.Constant(w.bias.numpy()), gamma_initializer=keras.initializers.Constant(w.weight.numpy()), moving_mean_initializer=keras.initializers.Constant(w.running_mean.numpy()), moving_variance_initializer=keras.initializers.Constant(w.running_var.numpy()), epsilon=w.eps) def call(self, inputs): return self.bn(inputs) class TFPad(keras.layers.Layer): # Pad inputs in spatial dimensions 1 and 2 def __init__(self, pad): super().__init__() if isinstance(pad, int): 
self.pad = tf.constant([[0, 0], [pad, pad], [pad, pad], [0, 0]]) else: # tuple/list self.pad = tf.constant([[0, 0], [pad[0], pad[0]], [pad[1], pad[1]], [0, 0]]) def call(self, inputs): return tf.pad(inputs, self.pad, mode='constant', constant_values=0) class TFConv(keras.layers.Layer): # Standard convolution def __init__(self, c1, c2, k=1, s=1, p=None, g=1, act=True, w=None): # ch_in, ch_out, weights, kernel, stride, padding, groups super().__init__() assert g == 1, "TF v2.2 Conv2D does not support 'groups' argument" # TensorFlow convolution padding is inconsistent with PyTorch (e.g. k=3 s=2 'SAME' padding) # see https://stackoverflow.com/questions/52975843/comparing-conv2d-with-padding-between-tensorflow-and-pytorch conv = keras.layers.Conv2D( filters=c2, kernel_size=k, strides=s, padding='SAME' if s == 1 else 'VALID', use_bias=not hasattr(w, 'bn'), kernel_initializer=keras.initializers.Constant(w.conv.weight.permute(2, 3, 1, 0).numpy()), bias_initializer='zeros' if hasattr(w, 'bn') else keras.initializers.Constant(w.conv.bias.numpy())) self.conv = conv if s == 1 else keras.Sequential([TFPad(autopad(k, p)), conv]) self.bn = TFBN(w.bn) if hasattr(w, 'bn') else tf.identity self.act = activations(w.act) if act else tf.identity def call(self, inputs): return self.act(self.bn(self.conv(inputs))) class TFDWConv(keras.layers.Layer): # Depthwise convolution def __init__(self, c1, c2, k=1, s=1, p=None, act=True, w=None): # ch_in, ch_out, weights, kernel, stride, padding, groups super().__init__() assert c2 % c1 == 0, f'TFDWConv() output={c2} must be a multiple of input={c1} channels' conv = keras.layers.DepthwiseConv2D( kernel_size=k, depth_multiplier=c2 // c1, strides=s, padding='SAME' if s == 1 else 'VALID', use_bias=not hasattr(w, 'bn'), depthwise_initializer=keras.initializers.Constant(w.conv.weight.permute(2, 3, 1, 0).numpy()), bias_initializer='zeros' if hasattr(w, 'bn') else keras.initializers.Constant(w.conv.bias.numpy())) self.conv = conv if s == 1 else 
keras.Sequential([TFPad(autopad(k, p)), conv]) self.bn = TFBN(w.bn) if hasattr(w, 'bn') else tf.identity self.act = activations(w.act) if act else tf.identity def call(self, inputs): return self.act(self.bn(self.conv(inputs))) class TFDWConvTranspose2d(keras.layers.Layer): # Depthwise ConvTranspose2d def __init__(self, c1, c2, k=1, s=1, p1=0, p2=0, w=None): # ch_in, ch_out, weights, kernel, stride, padding, groups super().__init__() assert c1 == c2, f'TFDWConv() output={c2} must be equal to input={c1} channels' assert k == 4 and p1 == 1, 'TFDWConv() only valid for k=4 and p1=1' weight, bias = w.weight.permute(2, 3, 1, 0).numpy(), w.bias.numpy() self.c1 = c1 self.conv = [ keras.layers.Conv2DTranspose(filters=1, kernel_size=k, strides=s, padding='VALID', output_padding=p2, use_bias=True, kernel_initializer=keras.initializers.Constant(weight[..., i:i + 1]), bias_initializer=keras.initializers.Constant(bias[i])) for i in range(c1)] def call(self, inputs): return tf.concat([m(x) for m, x in zip(self.conv, tf.split(inputs, self.c1, 3))], 3)[:, 1:-1, 1:-1] class TFFocus(keras.layers.Layer): # Focus wh information into c-space def __init__(self, c1, c2, k=1, s=1, p=None, g=1, act=True, w=None): # ch_in, ch_out, kernel, stride, padding, groups super().__init__() self.conv = TFConv(c1 * 4, c2, k, s, p, g, act, w.conv) def call(self, inputs): # x(b,w,h,c) -> y(b,w/2,h/2,4c) # inputs = inputs / 255 # normalize 0-255 to 0-1 inputs = [inputs[:, ::2, ::2, :], inputs[:, 1::2, ::2, :], inputs[:, ::2, 1::2, :], inputs[:, 1::2, 1::2, :]] return self.conv(tf.concat(inputs, 3)) class TFBottleneck(keras.layers.Layer): # Standard bottleneck def __init__(self, c1, c2, shortcut=True, g=1, e=0.5, w=None): # ch_in, ch_out, shortcut, groups, expansion super().__init__() c_ = int(c2 * e) # hidden channels self.cv1 = TFConv(c1, c_, 1, 1, w=w.cv1) self.cv2 = TFConv(c_, c2, 3, 1, g=g, w=w.cv2) self.add = shortcut and c1 == c2 def call(self, inputs): return inputs + self.cv2(self.cv1(inputs)) if 
self.add else self.cv2(self.cv1(inputs)) class TFCrossConv(keras.layers.Layer): # Cross Convolution def __init__(self, c1, c2, k=3, s=1, g=1, e=1.0, shortcut=False, w=None): super().__init__() c_ = int(c2 * e) # hidden channels self.cv1 = TFConv(c1, c_, (1, k), (1, s), w=w.cv1) self.cv2 = TFConv(c_, c2, (k, 1), (s, 1), g=g, w=w.cv2) self.add = shortcut and c1 == c2 def call(self, inputs): return inputs + self.cv2(self.cv1(inputs)) if self.add else self.cv2(self.cv1(inputs)) class TFConv2d(keras.layers.Layer): # Substitution for PyTorch nn.Conv2D def __init__(self, c1, c2, k, s=1, g=1, bias=True, w=None): super().__init__() assert g == 1, "TF v2.2 Conv2D does not support 'groups' argument" self.conv = keras.layers.Conv2D(filters=c2, kernel_size=k, strides=s, padding='VALID', use_bias=bias, kernel_initializer=keras.initializers.Constant( w.weight.permute(2, 3, 1, 0).numpy()), bias_initializer=keras.initializers.Constant(w.bias.numpy()) if bias else None) def call(self, inputs): return self.conv(inputs) class TFBottleneckCSP(keras.layers.Layer): # CSP Bottleneck https://github.com/WongKinYiu/CrossStagePartialNetworks def __init__(self, c1, c2, n=1, shortcut=True, g=1, e=0.5, w=None): # ch_in, ch_out, number, shortcut, groups, expansion super().__init__() c_ = int(c2 * e) # hidden channels self.cv1 = TFConv(c1, c_, 1, 1, w=w.cv1) self.cv2 = TFConv2d(c1, c_, 1, 1, bias=False, w=w.cv2) self.cv3 = TFConv2d(c_, c_, 1, 1, bias=False, w=w.cv3) self.cv4 = TFConv(2 * c_, c2, 1, 1, w=w.cv4) self.bn = TFBN(w.bn) self.act = lambda x: keras.activations.swish(x) self.m = keras.Sequential([TFBottleneck(c_, c_, shortcut, g, e=1.0, w=w.m[j]) for j in range(n)]) def call(self, inputs): y1 = self.cv3(self.m(self.cv1(inputs))) y2 = self.cv2(inputs) return self.cv4(self.act(self.bn(tf.concat((y1, y2), axis=3)))) class TFC3(keras.layers.Layer): # CSP Bottleneck with 3 convolutions def __init__(self, c1, c2, n=1, shortcut=True, g=1, e=0.5, w=None): # ch_in, ch_out, number, shortcut, groups, 
expansion super().__init__() c_ = int(c2 * e) # hidden channels self.cv1 = TFConv(c1, c_, 1, 1, w=w.cv1) self.cv2 = TFConv(c1, c_, 1, 1, w=w.cv2) self.cv3 = TFConv(2 * c_, c2, 1, 1, w=w.cv3) self.m = keras.Sequential([TFBottleneck(c_, c_, shortcut, g, e=1.0, w=w.m[j]) for j in range(n)]) def call(self, inputs): return self.cv3(tf.concat((self.m(self.cv1(inputs)), self.cv2(inputs)), axis=3)) class TFC3x(keras.layers.Layer): # 3 module with cross-convolutions def __init__(self, c1, c2, n=1, shortcut=True, g=1, e=0.5, w=None): # ch_in, ch_out, number, shortcut, groups, expansion super().__init__() c_ = int(c2 * e) # hidden channels self.cv1 = TFConv(c1, c_, 1, 1, w=w.cv1) self.cv2 = TFConv(c1, c_, 1, 1, w=w.cv2) self.cv3 = TFConv(2 * c_, c2, 1, 1, w=w.cv3) self.m = keras.Sequential([ TFCrossConv(c_, c_, k=3, s=1, g=g, e=1.0, shortcut=shortcut, w=w.m[j]) for j in range(n)]) def call(self, inputs): return self.cv3(tf.concat((self.m(self.cv1(inputs)), self.cv2(inputs)), axis=3)) class TFSPP(keras.layers.Layer): # Spatial pyramid pooling layer used in YOLOv3-SPP def __init__(self, c1, c2, k=(5, 9, 13), w=None): super().__init__() c_ = c1 // 2 # hidden channels self.cv1 = TFConv(c1, c_, 1, 1, w=w.cv1) self.cv2 = TFConv(c_ * (len(k) + 1), c2, 1, 1, w=w.cv2) self.m = [keras.layers.MaxPool2D(pool_size=x, strides=1, padding='SAME') for x in k] def call(self, inputs): x = self.cv1(inputs) return self.cv2(tf.concat([x] + [m(x) for m in self.m], 3)) class TFSPPF(keras.layers.Layer): # Spatial pyramid pooling-Fast layer def __init__(self, c1, c2, k=5, w=None): super().__init__() c_ = c1 // 2 # hidden channels self.cv1 = TFConv(c1, c_, 1, 1, w=w.cv1) self.cv2 = TFConv(c_ * 4, c2, 1, 1, w=w.cv2) self.m = keras.layers.MaxPool2D(pool_size=k, strides=1, padding='SAME') def call(self, inputs): x = self.cv1(inputs) y1 = self.m(x) y2 = self.m(y1) return self.cv2(tf.concat([x, y1, y2, self.m(y2)], 3)) class TFDetect(keras.layers.Layer): # TF YOLOv5 Detect layer def __init__(self, nc=80, 
anchors=(), ch=(), imgsz=(640, 640), w=None): # detection layer super().__init__() self.stride = tf.convert_to_tensor(w.stride.numpy(), dtype=tf.float32) self.nc = nc # number of classes self.no = nc + 5 # number of outputs per anchor self.nl = len(anchors) # number of detection layers self.na = len(anchors[0]) // 2 # number of anchors self.grid = [tf.zeros(1)] * self.nl # init grid self.anchors = tf.convert_to_tensor(w.anchors.numpy(), dtype=tf.float32) self.anchor_grid = tf.reshape(self.anchors * tf.reshape(self.stride, [self.nl, 1, 1]), [self.nl, 1, -1, 1, 2]) self.m = [TFConv2d(x, self.no * self.na, 1, w=w.m[i]) for i, x in enumerate(ch)] self.training = False # set to False after building model self.imgsz = imgsz for i in range(self.nl): ny, nx = self.imgsz[0] // self.stride[i], self.imgsz[1] // self.stride[i] self.grid[i] = self._make_grid(nx, ny) def call(self, inputs): z = [] # inference output x = [] for i in range(self.nl): x.append(self.m[i](inputs[i])) # x(bs,20,20,255) to x(bs,3,20,20,85) ny, nx = self.imgsz[0] // self.stride[i], self.imgsz[1] // self.stride[i] x[i] = tf.reshape(x[i], [-1, ny * nx, self.na, self.no]) if not self.training: # inference y = x[i] grid = tf.transpose(self.grid[i], [0, 2, 1, 3]) - 0.5 anchor_grid = tf.transpose(self.anchor_grid[i], [0, 2, 1, 3]) * 4 xy = (tf.sigmoid(y[..., 0:2]) * 2 + grid) * self.stride[i] # xy wh = tf.sigmoid(y[..., 2:4]) ** 2 * anchor_grid # Normalize xywh to 0-1 to reduce calibration error xy /= tf.constant([[self.imgsz[1], self.imgsz[0]]], dtype=tf.float32) wh /= tf.constant([[self.imgsz[1], self.imgsz[0]]], dtype=tf.float32) y = tf.concat([xy, wh, tf.sigmoid(y[..., 4:5 + self.nc]), y[..., 5 + self.nc:]], -1) z.append(tf.reshape(y, [-1, self.na * ny * nx, self.no])) return tf.transpose(x, [0, 2, 1, 3]) if self.training else (tf.concat(z, 1),) @staticmethod def _make_grid(nx=20, ny=20): # yv, xv = torch.meshgrid([torch.arange(ny), torch.arange(nx)]) # return torch.stack((xv, yv), 2).view((1, 1, ny, nx, 
2)).float() xv, yv = tf.meshgrid(tf.range(nx), tf.range(ny)) return tf.cast(tf.reshape(tf.stack([xv, yv], 2), [1, 1, ny * nx, 2]), dtype=tf.float32) class TFSegment(TFDetect): # YOLOv5 Segment head for segmentation models def __init__(self, nc=80, anchors=(), nm=32, npr=256, ch=(), imgsz=(640, 640), w=None): super().__init__(nc, anchors, ch, imgsz, w) self.nm = nm # number of masks self.npr = npr # number of protos self.no = 5 + nc + self.nm # number of outputs per anchor self.m = [TFConv2d(x, self.no * self.na, 1, w=w.m[i]) for i, x in enumerate(ch)] # output conv self.proto = TFProto(ch[0], self.npr, self.nm, w=w.proto) # protos self.detect = TFDetect.call def call(self, x): p = self.proto(x[0]) # p = TFUpsample(None, scale_factor=4, mode='nearest')(self.proto(x[0])) # (optional) full-size protos p = tf.transpose(p, [0, 3, 1, 2]) # from shape(1,160,160,32) to shape(1,32,160,160) x = self.detect(self, x) return (x, p) if self.training else (x[0], p) class TFProto(keras.layers.Layer): def __init__(self, c1, c_=256, c2=32, w=None): super().__init__() self.cv1 = TFConv(c1, c_, k=3, w=w.cv1) self.upsample = TFUpsample(None, scale_factor=2, mode='nearest') self.cv2 = TFConv(c_, c_, k=3, w=w.cv2) self.cv3 = TFConv(c_, c2, w=w.cv3) def call(self, inputs): return self.cv3(self.cv2(self.upsample(self.cv1(inputs)))) class TFUpsample(keras.layers.Layer): # TF version of torch.nn.Upsample() def __init__(self, size, scale_factor, mode, w=None): # warning: all arguments needed including 'w' super().__init__() assert scale_factor % 2 == 0, 'scale_factor must be multiple of 2' self.upsample = lambda x: tf.image.resize(x, (x.shape[1] * scale_factor, x.shape[2] * scale_factor), mode) # self.upsample = keras.layers.UpSampling2D(size=scale_factor, interpolation=mode) # with default arguments: align_corners=False, half_pixel_centers=False # self.upsample = lambda x: tf.raw_ops.ResizeNearestNeighbor(images=x, # size=(x.shape[1] * 2, x.shape[2] * 2)) def call(self, inputs): return 
self.upsample(inputs) class TFConcat(keras.layers.Layer): # TF version of torch.concat() def __init__(self, dimension=1, w=None): super().__init__() assert dimension == 1, 'convert only NCHW to NHWC concat' self.d = 3 def call(self, inputs): return tf.concat(inputs, self.d) def parse_model(d, ch, model, imgsz): # model_dict, input_channels(3) LOGGER.info(f"\n{'':>3}{'from':>18}{'n':>3}{'params':>10} {'module':<40}{'arguments':<30}") anchors, nc, gd, gw = d['anchors'], d['nc'], d['depth_multiple'], d['width_multiple'] na = (len(anchors[0]) // 2) if isinstance(anchors, list) else anchors # number of anchors no = na * (nc + 5) # number of outputs = anchors * (classes + 5) layers, save, c2 = [], [], ch[-1] # layers, savelist, ch out for i, (f, n, m, args) in enumerate(d['backbone'] + d['head']): # from, number, module, args m_str = m m = eval(m) if isinstance(m, str) else m # eval strings for j, a in enumerate(args): try: args[j] = eval(a) if isinstance(a, str) else a # eval strings except NameError: pass n = max(round(n * gd), 1) if n > 1 else n # depth gain if m in [ nn.Conv2d, Conv, DWConv, DWConvTranspose2d, Bottleneck, SPP, SPPF, MixConv2d, Focus, CrossConv, BottleneckCSP, C3, C3x]: c1, c2 = ch[f], args[0] c2 = make_divisible(c2 * gw, 8) if c2 != no else c2 args = [c1, c2, *args[1:]] if m in [BottleneckCSP, C3, C3x]: args.insert(2, n) n = 1 elif m is nn.BatchNorm2d: args = [ch[f]] elif m is Concat: c2 = sum(ch[-1 if x == -1 else x + 1] for x in f) elif m in [Detect, Segment]: args.append([ch[x + 1] for x in f]) if isinstance(args[1], int): # number of anchors args[1] = [list(range(args[1] * 2))] * len(f) if m is Segment: args[3] = make_divisible(args[3] * gw, 8) args.append(imgsz) else: c2 = ch[f] tf_m = eval('TF' + m_str.replace('nn.', '')) m_ = keras.Sequential([tf_m(*args, w=model.model[i][j]) for j in range(n)]) if n > 1 \ else tf_m(*args, w=model.model[i]) # module torch_m_ = nn.Sequential(*(m(*args) for _ in range(n))) if n > 1 else m(*args) # module t = 
str(m)[8:-2].replace('__main__.', '') # module type np = sum(x.numel() for x in torch_m_.parameters()) # number params m_.i, m_.f, m_.type, m_.np = i, f, t, np # attach index, 'from' index, type, number params LOGGER.info(f'{i:>3}{str(f):>18}{str(n):>3}{np:>10} {t:<40}{str(args):<30}') # print save.extend(x % i for x in ([f] if isinstance(f, int) else f) if x != -1) # append to savelist layers.append(m_) ch.append(c2) return keras.Sequential(layers), sorted(save) class TFModel: # TF YOLOv5 model def __init__(self, cfg='yolov5s.yaml', ch=3, nc=None, model=None, imgsz=(640, 640)): # model, channels, classes super().__init__() if isinstance(cfg, dict): self.yaml = cfg # model dict else: # is *.yaml import yaml # for torch hub self.yaml_file = Path(cfg).name with open(cfg) as f: self.yaml = yaml.load(f, Loader=yaml.FullLoader) # model dict # Define model if nc and nc != self.yaml['nc']: LOGGER.info(f"Overriding {cfg} nc={self.yaml['nc']} with nc={nc}") self.yaml['nc'] = nc # override yaml value self.model, self.savelist = parse_model(deepcopy(self.yaml), ch=[ch], model=model, imgsz=imgsz) def predict(self, inputs, tf_nms=False, agnostic_nms=False, topk_per_class=100, topk_all=100, iou_thres=0.45, conf_thres=0.25): y = [] # outputs x = inputs for m in self.model.layers: if m.f != -1: # if not from previous layer x = y[m.f] if isinstance(m.f, int) else [x if j == -1 else y[j] for j in m.f] # from earlier layers x = m(x) # run y.append(x if m.i in self.savelist else None) # save output # Add TensorFlow NMS if tf_nms: boxes = self._xywh2xyxy(x[0][..., :4]) probs = x[0][:, :, 4:5] classes = x[0][:, :, 5:] scores = probs * classes if agnostic_nms: nms = AgnosticNMS()((boxes, classes, scores), topk_all, iou_thres, conf_thres) else: boxes = tf.expand_dims(boxes, 2) nms = tf.image.combined_non_max_suppression(boxes, scores, topk_per_class, topk_all, iou_thres, conf_thres, clip_boxes=False) return (nms,) return x # output [1,6300,85] = [xywh, conf, class0, class1, ...] 
# x = x[0] # [x(1,6300,85), ...] to x(6300,85) # xywh = x[..., :4] # x(6300,4) boxes # conf = x[..., 4:5] # x(6300,1) confidences # cls = tf.reshape(tf.cast(tf.argmax(x[..., 5:], axis=1), tf.float32), (-1, 1)) # x(6300,1) classes # return tf.concat([conf, cls, xywh], 1) @staticmethod def _xywh2xyxy(xywh): # Convert nx4 boxes from [x, y, w, h] to [x1, y1, x2, y2] where xy1=top-left, xy2=bottom-right x, y, w, h = tf.split(xywh, num_or_size_splits=4, axis=-1) return tf.concat([x - w / 2, y - h / 2, x + w / 2, y + h / 2], axis=-1) class AgnosticNMS(keras.layers.Layer): # TF Agnostic NMS def call(self, input, topk_all, iou_thres, conf_thres): # wrap map_fn to avoid TypeSpec related error https://stackoverflow.com/a/65809989/3036450 return tf.map_fn(lambda x: self._nms(x, topk_all, iou_thres, conf_thres), input, fn_output_signature=(tf.float32, tf.float32, tf.float32, tf.int32), name='agnostic_nms') @staticmethod def _nms(x, topk_all=100, iou_thres=0.45, conf_thres=0.25): # agnostic NMS boxes, classes, scores = x class_inds = tf.cast(tf.argmax(classes, axis=-1), tf.float32) scores_inp = tf.reduce_max(scores, -1) selected_inds = tf.image.non_max_suppression(boxes, scores_inp, max_output_size=topk_all, iou_threshold=iou_thres, score_threshold=conf_thres) selected_boxes = tf.gather(boxes, selected_inds) padded_boxes = tf.pad(selected_boxes, paddings=[[0, topk_all - tf.shape(selected_boxes)[0]], [0, 0]], mode='CONSTANT', constant_values=0.0) selected_scores = tf.gather(scores_inp, selected_inds) padded_scores = tf.pad(selected_scores, paddings=[[0, topk_all - tf.shape(selected_boxes)[0]]], mode='CONSTANT', constant_values=-1.0) selected_classes = tf.gather(class_inds, selected_inds) padded_classes = tf.pad(selected_classes, paddings=[[0, topk_all - tf.shape(selected_boxes)[0]]], mode='CONSTANT', constant_values=-1.0) valid_detections = tf.shape(selected_inds)[0] return padded_boxes, padded_scores, padded_classes, valid_detections def activations(act=nn.SiLU): # Returns TF 
activation from input PyTorch activation if isinstance(act, nn.LeakyReLU): return lambda x: keras.activations.relu(x, alpha=0.1) elif isinstance(act, nn.Hardswish): return lambda x: x * tf.nn.relu6(x + 3) * 0.166666667 elif isinstance(act, (nn.SiLU, SiLU)): return lambda x: keras.activations.swish(x) else: raise Exception(f'no matching TensorFlow activation found for PyTorch activation {act}') def representative_dataset_gen(dataset, ncalib=100): # Representative dataset generator for use with converter.representative_dataset, returns a generator of np arrays for n, (path, img, im0s, vid_cap, string) in enumerate(dataset): im = np.transpose(img, [1, 2, 0]) im = np.expand_dims(im, axis=0).astype(np.float32) im /= 255 yield [im] if n >= ncalib: break def run( weights=ROOT / 'yolov5s.pt', # weights path imgsz=(640, 640), # inference size h,w batch_size=1, # batch size dynamic=False, # dynamic batch size ): # PyTorch model im = torch.zeros((batch_size, 3, *imgsz)) # BCHW image model = attempt_load(weights, device=torch.device('cpu'), inplace=True, fuse=False) _ = model(im) # inference model.info() # TensorFlow model im = tf.zeros((batch_size, *imgsz, 3)) # BHWC image tf_model = TFModel(cfg=model.yaml, model=model, nc=model.nc, imgsz=imgsz) _ = tf_model.predict(im) # inference # Keras model im = keras.Input(shape=(*imgsz, 3), batch_size=None if dynamic else batch_size) keras_model = keras.Model(inputs=im, outputs=tf_model.predict(im)) keras_model.summary() LOGGER.info('PyTorch, TensorFlow and Keras models successfully verified.\nUse export.py for TF model export.') def parse_opt(): parser = argparse.ArgumentParser() parser.add_argument('--weights', type=str, default=ROOT / 'yolov5s.pt', help='weights path') parser.add_argument('--imgsz', '--img', '--img-size', nargs='+', type=int, default=[640], help='inference size h,w') parser.add_argument('--batch-size', type=int, default=1, help='batch size') parser.add_argument('--dynamic', action='store_true', help='dynamic batch 
class Detect(nn.Module):
    """YOLOv5 detection head with optional auxiliary (training-only) heads.

    The first ``nl`` input feature maps are processed by the lead heads ``m``.
    Any remaining input feature maps are processed by the auxiliary heads
    ``m2``. Auxiliary outputs are only ever returned in training mode, so they
    are not computed at inference, and a configuration that supplies no
    auxiliary inputs (``len(ch) == nl``) works as a plain detection head.
    """
    stride = None  # strides computed during build
    dynamic = False  # force grid reconstruction
    export = False  # export mode

    def __init__(self, nc=80, anchors=(), ch=(), inplace=True):  # detection layer
        """Build the lead and auxiliary output convolutions.

        Args:
            nc: number of classes.
            anchors: one flat ``[w0, h0, w1, h1, ...]`` sequence per detection layer.
            ch: input channels per incoming feature map; the first ``nl`` entries
                belong to the lead heads, the rest to the auxiliary heads.
            inplace: stored on the module as ``self.inplace``.
        """
        super().__init__()
        self.nc = nc  # number of classes
        self.no = nc + 5  # number of outputs per anchor
        self.nl = len(anchors)  # number of detection layers
        self.na = len(anchors[0]) // 2  # number of anchors
        self.grid = [torch.empty(0) for _ in range(self.nl)]  # init grid
        self.anchor_grid = [torch.empty(0) for _ in range(self.nl)]  # init anchor grid
        self.register_buffer('anchors', torch.tensor(anchors).float().view(self.nl, -1, 2))  # shape(nl,na,2)
        self.m = nn.ModuleList(nn.Conv2d(x, self.no * self.na, 1) for x in ch[:self.nl])  # lead output conv
        self.m2 = nn.ModuleList(nn.Conv2d(x, self.no * self.na, 1) for x in ch[self.nl:])  # aux output conv
        self.inplace = inplace  # use inplace ops (e.g. slice assignment)

    def forward(self, x):
        """Run the head on the list of feature maps ``x`` (modified in place).

        Returns:
            Training: the list ``x``; entries ``[:nl]`` are lead outputs reshaped to
            ``(bs, na, ny, nx, no)``, followed by the auxiliary outputs in the same
            layout when auxiliary heads exist.
            Inference: ``(cat(z, 1),)`` in export mode, otherwise
            ``(cat(z, 1), x[:nl])``.
        """
        z = []  # inference output
        # Aux outputs are only returned while training, so skip them otherwise.
        # This also lets configs without aux inputs run instead of failing on self.m2[i].
        run_aux = self.training and len(self.m2) > 0
        for i in range(self.nl):
            x[i] = self.m[i](x[i])  # conv
            bs, _, ny, nx = x[i].shape  # x(bs,255,20,20) to x(bs,3,20,20,85)
            x[i] = x[i].view(bs, self.na, self.no, ny, nx).permute(0, 1, 3, 4, 2).contiguous()

            if run_aux:
                j = i + self.nl
                x[j] = self.m2[i](x[j])  # conv
                # Separate names: the lead ny/nx above must stay intact for the grid below.
                aux_bs, _, aux_ny, aux_nx = x[j].shape
                x[j] = x[j].view(aux_bs, self.na, self.no, aux_ny, aux_nx).permute(0, 1, 3, 4, 2).contiguous()

            if not self.training:  # inference
                if self.dynamic or self.grid[i].shape[2:4] != x[i].shape[2:4]:
                    self.grid[i], self.anchor_grid[i] = self._make_grid(nx, ny, i)

                if isinstance(self, Segment):  # (boxes + masks)
                    xy, wh, conf, mask = x[i].split((2, 2, self.nc + 1, self.no - self.nc - 5), 4)
                    xy = (xy.sigmoid() * 2 + self.grid[i]) * self.stride[i]  # xy
                    wh = (wh.sigmoid() * 2) ** 2 * self.anchor_grid[i]  # wh
                    y = torch.cat((xy, wh, conf.sigmoid(), mask), 4)
                else:  # Detect (boxes only)
                    xy, wh, conf = x[i].sigmoid().split((2, 2, self.nc + 1), 4)
                    xy = (xy * 2 + self.grid[i]) * self.stride[i]  # xy
                    wh = (wh * 2) ** 2 * self.anchor_grid[i]  # wh
                    y = torch.cat((xy, wh, conf), 4)
                z.append(y.view(bs, self.na * nx * ny, self.no))

        return x if self.training else (torch.cat(z, 1),) if self.export else (torch.cat(z, 1), x[:self.nl])

    def _make_grid(self, nx=20, ny=20, i=0, torch_1_10=check_version(torch.__version__, '1.10.0')):
        """Return ``(grid, anchor_grid)`` for layer ``i``, each expanded to ``(1, na, ny, nx, 2)``."""
        d = self.anchors[i].device
        t = self.anchors[i].dtype
        shape = 1, self.na, ny, nx, 2  # grid shape
        y, x = torch.arange(ny, device=d, dtype=t), torch.arange(nx, device=d, dtype=t)
        yv, xv = torch.meshgrid(y, x, indexing='ij') if torch_1_10 else torch.meshgrid(y, x)  # torch>=0.7 compatibility
        grid = torch.stack((xv, yv), 2).expand(shape) - 0.5  # add grid offset, i.e. y = 2.0 * x - 0.5
        anchor_grid = (self.anchors[i] * self.stride[i]).view((1, self.na, 1, 1, 2)).expand(shape)
        return grid, anchor_grid
class DetectionModel(BaseModel):
    """YOLOv5 detection model built from a model dict or a ``*.yaml`` file."""

    def __init__(self, cfg='yolov5s.yaml', ch=3, nc=None, anchors=None):  # model, input channels, number of classes
        """Parse the config, build the layers, then derive strides and prime the head biases.

        Args:
            cfg: model dict, or path to a model ``*.yaml`` file.
            ch: input channels; used only when the config has no ``ch`` entry.
            nc: if given and different from the config value, overrides ``nc``.
            anchors: if truthy, overrides the config ``anchors`` (after ``round``).
        """
        super().__init__()
        if isinstance(cfg, dict):
            self.yaml = cfg  # model dict
        else:  # is *.yaml
            import yaml  # for torch hub
            self.yaml_file = Path(cfg).name
            with open(cfg, encoding='ascii', errors='ignore') as f:
                self.yaml = yaml.safe_load(f)  # model dict

        # Define model
        ch = self.yaml['ch'] = self.yaml.get('ch', ch)  # input channels
        if nc and nc != self.yaml['nc']:
            LOGGER.info(f"Overriding model.yaml nc={self.yaml['nc']} with nc={nc}")
            self.yaml['nc'] = nc  # override yaml value
        if anchors:
            LOGGER.info(f'Overriding model.yaml anchors with anchors={anchors}')
            self.yaml['anchors'] = round(anchors)  # override yaml value
        self.model, self.save = parse_model(deepcopy(self.yaml), ch=[ch])  # model, savelist
        self.names = [str(i) for i in range(self.yaml['nc'])]  # default names
        self.inplace = self.yaml.get('inplace', True)

        # Build strides, anchors
        m = self.model[-1]  # Detect()
        if isinstance(m, (Detect, Segment)):
            s = 256  # 2x min stride
            m.inplace = self.inplace

            def forward(x):
                return self.forward(x)[0] if isinstance(m, Segment) else self.forward(x)

            # The training-mode head output may carry auxiliary maps after the lead ones.
            # Keep exactly one stride per detection layer instead of a hard-coded 3.
            m.stride = torch.tensor([s / x.shape[-2] for x in forward(torch.zeros(1, ch, s, s))][:m.nl])  # forward
            check_anchor_order(m)
            m.anchors /= m.stride.view(-1, 1, 1)
            self.stride = m.stride
            self._initialize_biases()  # only run once

        # Init weights, biases
        initialize_weights(self)
        self.info()
        LOGGER.info('')

    def forward(self, x, augment=False, profile=False, visualize=False):
        """Run augmented inference when ``augment`` is set, otherwise a single forward pass."""
        if augment:
            return self._forward_augment(x)  # augmented inference, None
        return self._forward_once(x, profile, visualize)  # single-scale inference, train

    def _forward_augment(self, x):
        """Run inference at three scales (one left-right flipped) and merge the de-scaled predictions."""
        img_size = x.shape[-2:]  # height, width
        s = [1, 0.83, 0.67]  # scales
        f = [None, 3, None]  # flips (2-ud, 3-lr)
        y = []  # outputs
        for si, fi in zip(s, f):
            xi = scale_img(x.flip(fi) if fi else x, si, gs=int(self.stride.max()))
            yi = self._forward_once(xi)[0]  # forward
            # cv2.imwrite(f'img_{si}.jpg', 255 * xi[0].cpu().numpy().transpose((1, 2, 0))[:, :, ::-1])  # save
            yi = self._descale_pred(yi, fi, si, img_size)
            y.append(yi)
        y = self._clip_augmented(y)  # clip augmented tails
        return torch.cat(y, 1), None  # augmented inference, train

    def _descale_pred(self, p, flips, scale, img_size):
        """De-scale (and de-flip) predictions following augmented inference (inverse operation)."""
        if self.inplace:
            p[..., :4] /= scale  # de-scale
            if flips == 2:
                p[..., 1] = img_size[0] - p[..., 1]  # de-flip ud
            elif flips == 3:
                p[..., 0] = img_size[1] - p[..., 0]  # de-flip lr
        else:
            x, y, wh = p[..., 0:1] / scale, p[..., 1:2] / scale, p[..., 2:4] / scale  # de-scale
            if flips == 2:
                y = img_size[0] - y  # de-flip ud
            elif flips == 3:
                x = img_size[1] - x  # de-flip lr
            p = torch.cat((x, y, wh, p[..., 4:]), -1)
        return p

    def _clip_augmented(self, y):
        """Clip YOLOv5 augmented inference tails from the first and last scale outputs."""
        nl = self.model[-1].nl  # number of detection layers (P3-P5)
        g = sum(4 ** x for x in range(nl))  # grid points
        e = 1  # exclude layer count
        i = (y[0].shape[1] // g) * sum(4 ** x for x in range(e))  # indices
        y[0] = y[0][:, :-i]  # large
        i = (y[-1].shape[1] // g) * sum(4 ** (nl - 1 - x) for x in range(e))  # indices
        y[-1] = y[-1][:, i:]  # small
        return y

    def _initialize_biases(self, cf=None):  # initialize biases into Detect(), cf is class frequency
        """Prime objectness/class biases of the lead heads and, when present, the auxiliary heads.

        The auxiliary convolutions (``m.m2``) have the same output layout as the
        lead ones and are zipped against the same per-layer strides, so they get
        the same prior instead of being left at the default conv initialization.
        """
        # https://arxiv.org/abs/1708.02002 section 3.3
        # cf = torch.bincount(torch.tensor(np.concatenate(dataset.labels, 0)[:, 0]).long(), minlength=nc) + 1.
        m = self.model[-1]  # Detect() module
        for heads in (m.m, getattr(m, 'm2', ())):  # lead heads, then aux heads (may be empty)
            for mi, s in zip(heads, m.stride):  # from
                b = mi.bias.view(m.na, -1)  # conv.bias(255) to (3,85)
                b.data[:, 4] += math.log(8 / (640 / s) ** 2)  # obj (8 objects per 640 image)
                b.data[:, 5:5 + m.nc] += math.log(0.6 / (m.nc - 0.99999)) if cf is None else torch.log(cf / cf.sum())  # cls
                mi.bias = torch.nn.Parameter(b.view(-1), requires_grad=True)
def parse_model(d, ch):  # model_dict, input_channels(3)
    """Parse a YOLOv5 model.yaml dictionary into a layer stack.

    Args:
        d: model dict; reads 'anchors', 'nc', 'depth_multiple', 'width_multiple',
            the optional 'activation', and the 'backbone' + 'head' layer lists.
        ch: list of channel counts; its last entry is taken as the input channels.

    Returns:
        ``(nn.Sequential(*layers), sorted(save))`` where ``save`` collects the
        indices of earlier layers that later layers read from.

    NOTE(review): module names and string arguments from ``d`` are passed to
    ``eval``; only parse model dicts from trusted sources.
    """
    # Parse a YOLOv5 model.yaml dictionary
    LOGGER.info(f"\n{'':>3}{'from':>18}{'n':>3}{'params':>10} {'module':<40}{'arguments':<30}")
    anchors, nc, gd, gw, act = d['anchors'], d['nc'], d['depth_multiple'], d['width_multiple'], d.get('activation')
    if act:
        Conv.default_act = eval(act)  # redefine default activation, i.e. Conv.default_act = nn.SiLU()
        LOGGER.info(f"{colorstr('activation:')} {act}")  # print
    na = (len(anchors[0]) // 2) if isinstance(anchors, list) else anchors  # number of anchors
    no = na * (nc + 5)  # number of outputs = anchors * (classes + 5)

    layers, save, c2 = [], [], ch[-1]  # layers, savelist, ch out
    for i, (f, n, m, args) in enumerate(d['backbone'] + d['head']):  # from, number, module, args
        m = eval(m) if isinstance(m, str) else m  # eval strings
        for j, a in enumerate(args):
            # strings that do not name anything in scope (NameError) are kept as plain strings
            with contextlib.suppress(NameError):
                args[j] = eval(a) if isinstance(a, str) else a  # eval strings

        n = n_ = max(round(n * gd), 1) if n > 1 else n  # depth gain
        if m in {
                Conv, GhostConv, Bottleneck, GhostBottleneck, SPP, SPPF, DWConv, MixConv2d, Focus, CrossConv,
                BottleneckCSP, C3, C3TR, C3SPP, C3Ghost, nn.ConvTranspose2d, DWConvTranspose2d, C3x}:
            c1, c2 = ch[f], args[0]
            if c2 != no:  # if not output
                c2 = make_divisible(c2 * gw, 8)

            args = [c1, c2, *args[1:]]
            if m in {BottleneckCSP, C3, C3TR, C3Ghost, C3x}:
                args.insert(2, n)  # number of repeats
                n = 1  # repeats are handled inside the module, so build it once
        elif m is nn.BatchNorm2d:
            args = [ch[f]]
        elif m is Concat:
            c2 = sum(ch[x] for x in f)
        # TODO: channel, gw, gd
        elif m in {Detect, Segment}:
            args.append([ch[x] for x in f])  # channels of every input feature map, in 'from' order
            if isinstance(args[1], int):  # number of anchors
                # NOTE(review): this creates one anchor group per entry of f, so extra
                # (e.g. auxiliary) inputs also count as detection layers here - confirm intended
                args[1] = [list(range(args[1] * 2))] * len(f)
            if m is Segment:
                args[3] = make_divisible(args[3] * gw, 8)
        elif m is Contract:
            c2 = ch[f] * args[0] ** 2
        elif m is Expand:
            c2 = ch[f] // args[0] ** 2
        else:
            c2 = ch[f]

        m_ = nn.Sequential(*(m(*args) for _ in range(n))) if n > 1 else m(*args)  # module
        t = str(m)[8:-2].replace('__main__.', '')  # module type
        np = sum(x.numel() for x in m_.parameters())  # number params
        m_.i, m_.f, m_.type, m_.np = i, f, t, np  # attach index, 'from' index, type, number params
        LOGGER.info(f'{i:>3}{str(f):>18}{n_:>3}{np:10.0f} {t:<40}{str(args):<30}')  # print
        save.extend(x % i for x in ([f] if isinstance(f, int) else f) if x != -1)  # append to savelist
        layers.append(m_)
        if i == 0:
            ch = []  # drop the input-channel entry so ch[k] is the output channels of layer k
        ch.append(c2)
    return nn.Sequential(*layers), sorted(save)
module, args] [[-1, 1, Conv, [64, 6, 2, 2]], # 0-P1/2 [-1, 1, Conv, [128, 3, 2]], # 1-P2/4 [-1, 3, C3, [128]], [-1, 1, Conv, [256, 3, 2]], # 3-P3/8 [-1, 6, C3, [256]], [-1, 1, Conv, [512, 3, 2]], # 5-P4/16 [-1, 9, C3, [512]], [-1, 1, Conv, [1024, 3, 2]], # 7-P5/32 [-1, 3, C3, [1024]], [-1, 1, SPPF, [1024, 5]], # 9 ] # YOLOv5 v6.0 head head: [[-1, 1, Conv, [512, 1, 1]], [-1, 1, nn.Upsample, [None, 2, 'nearest']], [[-1, 6], 1, Concat, [1]], # cat backbone P4 [-1, 3, C3, [512, False]], # 13 [-1, 1, Conv, [256, 1, 1]], [-1, 1, nn.Upsample, [None, 2, 'nearest']], [[-1, 4], 1, Concat, [1]], # cat backbone P3 [-1, 3, C3, [256, False]], # 17 (P3/8-small) [-1, 1, Conv, [256, 3, 2]], [[-1, 14], 1, Concat, [1]], # cat head P4 [-1, 3, C3, [512, False]], # 20 (P4/16-medium) [-1, 1, Conv, [512, 3, 2]], [[-1, 10], 1, Concat, [1]], # cat head P5 [-1, 3, C3, [1024, False]], # 23 (P5/32-large) [17, 1, Conv, [256, 3, 1]], # 24 [13, 1, Conv, [512, 3, 1]], # 25 [9, 1, Conv, [1024, 3, 1]], # 26 [[17, 20, 23, 24, 25, 26], 1, Detect, [nc, anchors]], # Detect(P3, P4, P5) ] ================================================ FILE: yolo-improve/yolov5-AUX/models/yolov5l.yaml ================================================ # YOLOv5 🚀 by Ultralytics, GPL-3.0 license # Parameters nc: 80 # number of classes depth_multiple: 1.0 # model depth multiple width_multiple: 1.0 # layer channel multiple anchors: - [10,13, 16,30, 33,23] # P3/8 - [30,61, 62,45, 59,119] # P4/16 - [116,90, 156,198, 373,326] # P5/32 # YOLOv5 v6.0 backbone backbone: # [from, number, module, args] [[-1, 1, Conv, [64, 6, 2, 2]], # 0-P1/2 [-1, 1, Conv, [128, 3, 2]], # 1-P2/4 [-1, 3, C3, [128]], [-1, 1, Conv, [256, 3, 2]], # 3-P3/8 [-1, 6, C3, [256]], [-1, 1, Conv, [512, 3, 2]], # 5-P4/16 [-1, 9, C3, [512]], [-1, 1, Conv, [1024, 3, 2]], # 7-P5/32 [-1, 3, C3, [1024]], [-1, 1, SPPF, [1024, 5]], # 9 ] # YOLOv5 v6.0 head head: [[-1, 1, Conv, [512, 1, 1]], [-1, 1, nn.Upsample, [None, 2, 'nearest']], [[-1, 6], 1, Concat, [1]], # cat 
backbone P4 [-1, 3, C3, [512, False]], # 13 [-1, 1, Conv, [256, 1, 1]], [-1, 1, nn.Upsample, [None, 2, 'nearest']], [[-1, 4], 1, Concat, [1]], # cat backbone P3 [-1, 3, C3, [256, False]], # 17 (P3/8-small) [-1, 1, Conv, [256, 3, 2]], [[-1, 14], 1, Concat, [1]], # cat head P4 [-1, 3, C3, [512, False]], # 20 (P4/16-medium) [-1, 1, Conv, [512, 3, 2]], [[-1, 10], 1, Concat, [1]], # cat head P5 [-1, 3, C3, [1024, False]], # 23 (P5/32-large) [[17, 20, 23], 1, Detect, [nc, anchors]], # Detect(P3, P4, P5) ] ================================================ FILE: yolo-improve/yolov5-AUX/models/yolov5m.yaml ================================================ # YOLOv5 🚀 by Ultralytics, GPL-3.0 license # Parameters nc: 80 # number of classes depth_multiple: 0.67 # model depth multiple width_multiple: 0.75 # layer channel multiple anchors: - [10,13, 16,30, 33,23] # P3/8 - [30,61, 62,45, 59,119] # P4/16 - [116,90, 156,198, 373,326] # P5/32 # YOLOv5 v6.0 backbone backbone: # [from, number, module, args] [[-1, 1, Conv, [64, 6, 2, 2]], # 0-P1/2 [-1, 1, Conv, [128, 3, 2]], # 1-P2/4 [-1, 3, C3, [128]], [-1, 1, Conv, [256, 3, 2]], # 3-P3/8 [-1, 6, C3, [256]], [-1, 1, Conv, [512, 3, 2]], # 5-P4/16 [-1, 9, C3, [512]], [-1, 1, Conv, [1024, 3, 2]], # 7-P5/32 [-1, 3, C3, [1024]], [-1, 1, SPPF, [1024, 5]], # 9 ] # YOLOv5 v6.0 head head: [[-1, 1, Conv, [512, 1, 1]], [-1, 1, nn.Upsample, [None, 2, 'nearest']], [[-1, 6], 1, Concat, [1]], # cat backbone P4 [-1, 3, C3, [512, False]], # 13 [-1, 1, Conv, [256, 1, 1]], [-1, 1, nn.Upsample, [None, 2, 'nearest']], [[-1, 4], 1, Concat, [1]], # cat backbone P3 [-1, 3, C3, [256, False]], # 17 (P3/8-small) [-1, 1, Conv, [256, 3, 2]], [[-1, 14], 1, Concat, [1]], # cat head P4 [-1, 3, C3, [512, False]], # 20 (P4/16-medium) [-1, 1, Conv, [512, 3, 2]], [[-1, 10], 1, Concat, [1]], # cat head P5 [-1, 3, C3, [1024, False]], # 23 (P5/32-large) [[17, 20, 23], 1, Detect, [nc, anchors]], # Detect(P3, P4, P5) ] ================================================ FILE: 
yolo-improve/yolov5-AUX/models/yolov5n.yaml ================================================ # YOLOv5 🚀 by Ultralytics, GPL-3.0 license # Parameters nc: 80 # number of classes depth_multiple: 0.33 # model depth multiple width_multiple: 0.25 # layer channel multiple anchors: - [10,13, 16,30, 33,23] # P3/8 - [30,61, 62,45, 59,119] # P4/16 - [116,90, 156,198, 373,326] # P5/32 # YOLOv5 v6.0 backbone backbone: # [from, number, module, args] [[-1, 1, Conv, [64, 6, 2, 2]], # 0-P1/2 [-1, 1, Conv, [128, 3, 2]], # 1-P2/4 [-1, 3, C3, [128]], [-1, 1, Conv, [256, 3, 2]], # 3-P3/8 [-1, 6, C3, [256]], [-1, 1, Conv, [512, 3, 2]], # 5-P4/16 [-1, 9, C3, [512]], [-1, 1, Conv, [1024, 3, 2]], # 7-P5/32 [-1, 3, C3, [1024]], [-1, 1, SPPF, [1024, 5]], # 9 ] # YOLOv5 v6.0 head head: [[-1, 1, Conv, [512, 1, 1]], [-1, 1, nn.Upsample, [None, 2, 'nearest']], [[-1, 6], 1, Concat, [1]], # cat backbone P4 [-1, 3, C3, [512, False]], # 13 [-1, 1, Conv, [256, 1, 1]], [-1, 1, nn.Upsample, [None, 2, 'nearest']], [[-1, 4], 1, Concat, [1]], # cat backbone P3 [-1, 3, C3, [256, False]], # 17 (P3/8-small) [-1, 1, Conv, [256, 3, 2]], [[-1, 14], 1, Concat, [1]], # cat head P4 [-1, 3, C3, [512, False]], # 20 (P4/16-medium) [-1, 1, Conv, [512, 3, 2]], [[-1, 10], 1, Concat, [1]], # cat head P5 [-1, 3, C3, [1024, False]], # 23 (P5/32-large) [[17, 20, 23], 1, Detect, [nc, anchors]], # Detect(P3, P4, P5) ] ================================================ FILE: yolo-improve/yolov5-AUX/models/yolov5s.yaml ================================================ # YOLOv5 🚀 by Ultralytics, GPL-3.0 license # Parameters nc: 80 # number of classes depth_multiple: 0.33 # model depth multiple width_multiple: 0.50 # layer channel multiple anchors: - [10,13, 16,30, 33,23] # P3/8 - [30,61, 62,45, 59,119] # P4/16 - [116,90, 156,198, 373,326] # P5/32 # YOLOv5 v6.0 backbone backbone: # [from, number, module, args] [[-1, 1, Conv, [64, 6, 2, 2]], # 0-P1/2 [-1, 1, Conv, [128, 3, 2]], # 1-P2/4 [-1, 3, C3, [128]], [-1, 1, Conv, [256, 3, 2]], 
# 3-P3/8 [-1, 6, C3, [256]], [-1, 1, Conv, [512, 3, 2]], # 5-P4/16 [-1, 9, C3, [512]], [-1, 1, Conv, [1024, 3, 2]], # 7-P5/32 [-1, 3, C3, [1024]], [-1, 1, SPPF, [1024, 5]], # 9 ] # YOLOv5 v6.0 head head: [[-1, 1, Conv, [512, 1, 1]], [-1, 1, nn.Upsample, [None, 2, 'nearest']], [[-1, 6], 1, Concat, [1]], # cat backbone P4 [-1, 3, C3, [512, False]], # 13 [-1, 1, Conv, [256, 1, 1]], [-1, 1, nn.Upsample, [None, 2, 'nearest']], [[-1, 4], 1, Concat, [1]], # cat backbone P3 [-1, 3, C3, [256, False]], # 17 (P3/8-small) [-1, 1, Conv, [256, 3, 2]], [[-1, 14], 1, Concat, [1]], # cat head P4 [-1, 3, C3, [512, False]], # 20 (P4/16-medium) [-1, 1, Conv, [512, 3, 2]], [[-1, 10], 1, Concat, [1]], # cat head P5 [-1, 3, C3, [1024, False]], # 23 (P5/32-large) [[17, 20, 23], 1, Detect, [nc, anchors]], # Detect(P3, P4, P5) ] ================================================ FILE: yolo-improve/yolov5-AUX/models/yolov5x.yaml ================================================ # YOLOv5 🚀 by Ultralytics, GPL-3.0 license # Parameters nc: 80 # number of classes depth_multiple: 1.33 # model depth multiple width_multiple: 1.25 # layer channel multiple anchors: - [10,13, 16,30, 33,23] # P3/8 - [30,61, 62,45, 59,119] # P4/16 - [116,90, 156,198, 373,326] # P5/32 # YOLOv5 v6.0 backbone backbone: # [from, number, module, args] [[-1, 1, Conv, [64, 6, 2, 2]], # 0-P1/2 [-1, 1, Conv, [128, 3, 2]], # 1-P2/4 [-1, 3, C3, [128]], [-1, 1, Conv, [256, 3, 2]], # 3-P3/8 [-1, 6, C3, [256]], [-1, 1, Conv, [512, 3, 2]], # 5-P4/16 [-1, 9, C3, [512]], [-1, 1, Conv, [1024, 3, 2]], # 7-P5/32 [-1, 3, C3, [1024]], [-1, 1, SPPF, [1024, 5]], # 9 ] # YOLOv5 v6.0 head head: [[-1, 1, Conv, [512, 1, 1]], [-1, 1, nn.Upsample, [None, 2, 'nearest']], [[-1, 6], 1, Concat, [1]], # cat backbone P4 [-1, 3, C3, [512, False]], # 13 [-1, 1, Conv, [256, 1, 1]], [-1, 1, nn.Upsample, [None, 2, 'nearest']], [[-1, 4], 1, Concat, [1]], # cat backbone P3 [-1, 3, C3, [256, False]], # 17 (P3/8-small) [-1, 1, Conv, [256, 3, 2]], [[-1, 14], 1, 
Concat, [1]], # cat head P4 [-1, 3, C3, [512, False]], # 20 (P4/16-medium) [-1, 1, Conv, [512, 3, 2]], [[-1, 10], 1, Concat, [1]], # cat head P5 [-1, 3, C3, [1024, False]], # 23 (P5/32-large) [[17, 20, 23], 1, Detect, [nc, anchors]], # Detect(P3, P4, P5) ] ================================================ FILE: yolo-improve/yolov5-AUX/train.py ================================================ # YOLOv5 🚀 by Ultralytics, GPL-3.0 license """ Train a YOLOv5 model on a custom dataset. Models and datasets download automatically from the latest YOLOv5 release. Usage - Single-GPU training: $ python train.py --data coco128.yaml --weights yolov5s.pt --img 640 # from pretrained (recommended) $ python train.py --data coco128.yaml --weights '' --cfg yolov5s.yaml --img 640 # from scratch Usage - Multi-GPU DDP training: $ python -m torch.distributed.run --nproc_per_node 4 --master_port 1 train.py --data coco128.yaml --weights yolov5s.pt --img 640 --device 0,1,2,3 Models: https://github.com/ultralytics/yolov5/tree/master/models Datasets: https://github.com/ultralytics/yolov5/tree/master/data Tutorial: https://github.com/ultralytics/yolov5/wiki/Train-Custom-Data """ import argparse import math import os import random import subprocess import sys import time from copy import deepcopy from datetime import datetime from pathlib import Path import numpy as np import torch import torch.distributed as dist import torch.nn as nn import yaml from torch.optim import lr_scheduler from tqdm import tqdm FILE = Path(__file__).resolve() ROOT = FILE.parents[0] # YOLOv5 root directory if str(ROOT) not in sys.path: sys.path.append(str(ROOT)) # add ROOT to PATH ROOT = Path(os.path.relpath(ROOT, Path.cwd())) # relative import val as validate # for end-of-epoch mAP from models.experimental import attempt_load from models.yolo import Model from utils.autoanchor import check_anchors from utils.autobatch import check_train_batch_size from utils.callbacks import Callbacks from utils.dataloaders import 
create_dataloader from utils.downloads import attempt_download, is_url from utils.general import (LOGGER, TQDM_BAR_FORMAT, check_amp, check_dataset, check_file, check_git_info, check_git_status, check_img_size, check_requirements, check_suffix, check_yaml, colorstr, get_latest_run, increment_path, init_seeds, intersect_dicts, labels_to_class_weights, labels_to_image_weights, methods, one_cycle, print_args, print_mutation, strip_optimizer, yaml_save) from utils.loggers import Loggers from utils.loggers.comet.comet_utils import check_comet_resume from utils.loss import ComputeLossAuxOTA, ComputeLoss from utils.metrics import fitness from utils.plots import plot_evolve from utils.torch_utils import (EarlyStopping, ModelEMA, de_parallel, select_device, smart_DDP, smart_optimizer, smart_resume, torch_distributed_zero_first) LOCAL_RANK = int(os.getenv('LOCAL_RANK', -1)) # https://pytorch.org/docs/stable/elastic/run.html RANK = int(os.getenv('RANK', -1)) WORLD_SIZE = int(os.getenv('WORLD_SIZE', 1)) GIT_INFO = check_git_info() def train(hyp, opt, device, callbacks): # hyp is path/to/hyp.yaml or hyp dictionary save_dir, epochs, batch_size, weights, single_cls, evolve, data, cfg, resume, noval, nosave, workers, freeze = \ Path(opt.save_dir), opt.epochs, opt.batch_size, opt.weights, opt.single_cls, opt.evolve, opt.data, opt.cfg, \ opt.resume, opt.noval, opt.nosave, opt.workers, opt.freeze callbacks.run('on_pretrain_routine_start') # Directories w = save_dir / 'weights' # weights dir (w.parent if evolve else w).mkdir(parents=True, exist_ok=True) # make dir last, best = w / 'last.pt', w / 'best.pt' # Hyperparameters if isinstance(hyp, str): with open(hyp, errors='ignore') as f: hyp = yaml.safe_load(f) # load hyps dict LOGGER.info(colorstr('hyperparameters: ') + ', '.join(f'{k}={v}' for k, v in hyp.items())) opt.hyp = hyp.copy() # for saving hyps to checkpoints # Save run settings if not evolve: yaml_save(save_dir / 'hyp.yaml', hyp) yaml_save(save_dir / 'opt.yaml', vars(opt)) # 
Loggers data_dict = None if RANK in {-1, 0}: loggers = Loggers(save_dir, weights, opt, hyp, LOGGER) # loggers instance # Register actions for k in methods(loggers): callbacks.register_action(k, callback=getattr(loggers, k)) # Process custom dataset artifact link data_dict = loggers.remote_dataset if resume: # If resuming runs from remote artifact weights, epochs, hyp, batch_size = opt.weights, opt.epochs, opt.hyp, opt.batch_size # Config plots = not evolve and not opt.noplots # create plots cuda = device.type != 'cpu' init_seeds(opt.seed + 1 + RANK, deterministic=True) with torch_distributed_zero_first(LOCAL_RANK): data_dict = data_dict or check_dataset(data) # check if None train_path, val_path = data_dict['train'], data_dict['val'] nc = 1 if single_cls else int(data_dict['nc']) # number of classes names = {0: 'item'} if single_cls and len(data_dict['names']) != 1 else data_dict['names'] # class names is_coco = isinstance(val_path, str) and val_path.endswith('coco/val2017.txt') # COCO dataset # Model check_suffix(weights, '.pt') # check weights pretrained = weights.endswith('.pt') if pretrained: with torch_distributed_zero_first(LOCAL_RANK): weights = attempt_download(weights) # download if not found locally ckpt = torch.load(weights, map_location='cpu') # load checkpoint to CPU to avoid CUDA memory leak model = Model(cfg or ckpt['model'].yaml, ch=3, nc=nc, anchors=hyp.get('anchors')).to(device) # create exclude = ['anchor'] if (cfg or hyp.get('anchors')) and not resume else [] # exclude keys csd = ckpt['model'].float().state_dict() # checkpoint state_dict as FP32 csd = intersect_dicts(csd, model.state_dict(), exclude=exclude) # intersect model.load_state_dict(csd, strict=False) # load LOGGER.info(f'Transferred {len(csd)}/{len(model.state_dict())} items from {weights}') # report else: model = Model(cfg, ch=3, nc=nc, anchors=hyp.get('anchors')).to(device) # create amp = check_amp(model) # check AMP # Freeze freeze = [f'model.{x}.' 
for x in (freeze if len(freeze) > 1 else range(freeze[0]))] # layers to freeze for k, v in model.named_parameters(): v.requires_grad = True # train all layers # v.register_hook(lambda x: torch.nan_to_num(x)) # NaN to 0 (commented for erratic training results) if any(x in k for x in freeze): LOGGER.info(f'freezing {k}') v.requires_grad = False # Image size gs = max(int(model.stride.max()), 32) # grid size (max stride) imgsz = check_img_size(opt.imgsz, gs, floor=gs * 2) # verify imgsz is gs-multiple # Batch size if RANK == -1 and batch_size == -1: # single-GPU only, estimate best batch size batch_size = check_train_batch_size(model, imgsz, amp) loggers.on_params_update({'batch_size': batch_size}) # Optimizer nbs = 64 # nominal batch size accumulate = max(round(nbs / batch_size), 1) # accumulate loss before optimizing hyp['weight_decay'] *= batch_size * accumulate / nbs # scale weight_decay optimizer = smart_optimizer(model, opt.optimizer, hyp['lr0'], hyp['momentum'], hyp['weight_decay']) # Scheduler if opt.cos_lr: lf = one_cycle(1, hyp['lrf'], epochs) # cosine 1->hyp['lrf'] else: lf = lambda x: (1 - x / epochs) * (1.0 - hyp['lrf']) + hyp['lrf'] # linear scheduler = lr_scheduler.LambdaLR(optimizer, lr_lambda=lf) # plot_lr_scheduler(optimizer, scheduler, epochs) # EMA ema = ModelEMA(model) if RANK in {-1, 0} else None # Resume best_fitness, start_epoch = 0.0, 0 if pretrained: if resume: best_fitness, start_epoch, epochs = smart_resume(ckpt, optimizer, ema, weights, epochs, resume) del ckpt, csd # DP mode if cuda and RANK == -1 and torch.cuda.device_count() > 1: LOGGER.warning('WARNING ⚠️ DP not recommended, use torch.distributed.run for best DDP Multi-GPU results.\n' 'See Multi-GPU Tutorial at https://github.com/ultralytics/yolov5/issues/475 to get started.') model = torch.nn.DataParallel(model) # SyncBatchNorm if opt.sync_bn and cuda and RANK != -1: model = torch.nn.SyncBatchNorm.convert_sync_batchnorm(model).to(device) LOGGER.info('Using SyncBatchNorm()') # 
Trainloader train_loader, dataset = create_dataloader(train_path, imgsz, batch_size // WORLD_SIZE, gs, single_cls, hyp=hyp, augment=True, cache=None if opt.cache == 'val' else opt.cache, rect=opt.rect, rank=LOCAL_RANK, workers=workers, image_weights=opt.image_weights, quad=opt.quad, prefix=colorstr('train: '), shuffle=True, seed=opt.seed) labels = np.concatenate(dataset.labels, 0) mlc = int(labels[:, 0].max()) # max label class assert mlc < nc, f'Label class {mlc} exceeds nc={nc} in {data}. Possible class labels are 0-{nc - 1}' # Process 0 if RANK in {-1, 0}: val_loader = create_dataloader(val_path, imgsz, batch_size // WORLD_SIZE * 2, gs, single_cls, hyp=hyp, cache=None if noval else opt.cache, rect=True, rank=-1, workers=workers * 2, pad=0.5, prefix=colorstr('val: '))[0] if not resume: if not opt.noautoanchor: check_anchors(dataset, model=model, thr=hyp['anchor_t'], imgsz=imgsz) # run AutoAnchor model.half().float() # pre-reduce anchor precision callbacks.run('on_pretrain_routine_end', labels, names) # DDP mode if cuda and RANK != -1: model = smart_DDP(model) # Model attributes nl = de_parallel(model).model[-1].nl # number of detection layers (to scale hyps) hyp['box'] *= 3 / nl # scale to layers hyp['cls'] *= nc / 80 * 3 / nl # scale to classes and layers hyp['obj'] *= (imgsz / 640) ** 2 * 3 / nl # scale to image size and layers hyp['label_smoothing'] = opt.label_smoothing model.nc = nc # attach number of classes to model model.hyp = hyp # attach hyperparameters to model model.class_weights = labels_to_class_weights(dataset.labels, nc).to(device) * nc # attach class weights model.names = names # Start training t0 = time.time() nb = len(train_loader) # number of batches nw = max(round(hyp['warmup_epochs'] * nb), 100) # number of warmup iterations, max(3 epochs, 100 iterations) # nw = min(nw, (epochs - start_epoch) / 2 * nb) # limit warmup to < 1/2 of training last_opt_step = -1 maps = np.zeros(nc) # mAP per class results = (0, 0, 0, 0, 0, 0, 0) # P, R, mAP@.5, 
mAP@.5-.95, val_loss(box, obj, cls) scheduler.last_epoch = start_epoch - 1 # do not move scaler = torch.cuda.amp.GradScaler(enabled=amp) stopper, stop = EarlyStopping(patience=opt.patience), False compute_loss_ota = ComputeLossAuxOTA(model) # init loss class compute_loss = ComputeLoss(model) callbacks.run('on_train_start') LOGGER.info(f'Image sizes {imgsz} train, {imgsz} val\n' f'Using {train_loader.num_workers * WORLD_SIZE} dataloader workers\n' f"Logging results to {colorstr('bold', save_dir)}\n" f'Starting training for {epochs} epochs...') for epoch in range(start_epoch, epochs): # epoch ------------------------------------------------------------------ callbacks.run('on_train_epoch_start') model.train() # Update image weights (optional, single-GPU only) if opt.image_weights: cw = model.class_weights.cpu().numpy() * (1 - maps) ** 2 / nc # class weights iw = labels_to_image_weights(dataset.labels, nc=nc, class_weights=cw) # image weights dataset.indices = random.choices(range(dataset.n), weights=iw, k=dataset.n) # rand weighted idx # Update mosaic border (optional) # b = int(random.uniform(0.25 * imgsz, 0.75 * imgsz + gs) // gs * gs) # dataset.mosaic_border = [b - imgsz, -b] # height, width borders mloss = torch.zeros(3, device=device) # mean losses if RANK != -1: train_loader.sampler.set_epoch(epoch) pbar = enumerate(train_loader) LOGGER.info(('\n' + '%11s' * 7) % ('Epoch', 'GPU_mem', 'box_loss', 'obj_loss', 'cls_loss', 'Instances', 'Size')) if RANK in {-1, 0}: pbar = tqdm(pbar, total=nb, bar_format=TQDM_BAR_FORMAT) # progress bar optimizer.zero_grad() for i, (imgs, targets, paths, _) in pbar: # batch ------------------------------------------------------------- callbacks.run('on_train_batch_start') ni = i + nb * epoch # number integrated batches (since train start) imgs = imgs.to(device, non_blocking=True).float() / 255 # uint8 to float32, 0-255 to 0.0-1.0 # Warmup if ni <= nw: xi = [0, nw] # x interp # compute_loss.gr = np.interp(ni, xi, [0.0, 1.0]) # iou loss 
ratio (obj_loss = 1.0 or iou) accumulate = max(1, np.interp(ni, xi, [1, nbs / batch_size]).round()) for j, x in enumerate(optimizer.param_groups): # bias lr falls from 0.1 to lr0, all other lrs rise from 0.0 to lr0 x['lr'] = np.interp(ni, xi, [hyp['warmup_bias_lr'] if j == 0 else 0.0, x['initial_lr'] * lf(epoch)]) if 'momentum' in x: x['momentum'] = np.interp(ni, xi, [hyp['warmup_momentum'], hyp['momentum']]) # Multi-scale if opt.multi_scale: sz = random.randrange(imgsz * 0.5, imgsz * 1.5 + gs) // gs * gs # size sf = sz / max(imgs.shape[2:]) # scale factor if sf != 1: ns = [math.ceil(x * sf / gs) * gs for x in imgs.shape[2:]] # new shape (stretched to gs-multiple) imgs = nn.functional.interpolate(imgs, size=ns, mode='bilinear', align_corners=False) # Forward with torch.cuda.amp.autocast(amp): pred = model(imgs) # forward loss, loss_items = compute_loss_ota(pred, targets.to(device), imgs) # loss scaled by batch_size if RANK != -1: loss *= WORLD_SIZE # gradient averaged between devices in DDP mode if opt.quad: loss *= 4. 
# Backward scaler.scale(loss).backward() # Optimize - https://pytorch.org/docs/master/notes/amp_examples.html if ni - last_opt_step >= accumulate: scaler.unscale_(optimizer) # unscale gradients torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=10.0) # clip gradients scaler.step(optimizer) # optimizer.step scaler.update() optimizer.zero_grad() if ema: ema.update(model) last_opt_step = ni # Log if RANK in {-1, 0}: mloss = (mloss * i + loss_items) / (i + 1) # update mean losses mem = f'{torch.cuda.memory_reserved() / 1E9 if torch.cuda.is_available() else 0:.3g}G' # (GB) pbar.set_description(('%11s' * 2 + '%11.4g' * 5) % (f'{epoch}/{epochs - 1}', mem, *mloss, targets.shape[0], imgs.shape[-1])) callbacks.run('on_train_batch_end', model, ni, imgs, targets, paths, list(mloss)) if callbacks.stop_training: return # end batch ------------------------------------------------------------------------------------------------ # Scheduler lr = [x['lr'] for x in optimizer.param_groups] # for loggers scheduler.step() if RANK in {-1, 0}: # mAP callbacks.run('on_train_epoch_end', epoch=epoch) ema.update_attr(model, include=['yaml', 'nc', 'hyp', 'names', 'stride', 'class_weights']) final_epoch = (epoch + 1 == epochs) or stopper.possible_stop if not noval or final_epoch: # Calculate mAP results, maps, _ = validate.run(data_dict, batch_size=batch_size // WORLD_SIZE * 2, imgsz=imgsz, half=amp, model=ema.ema, single_cls=single_cls, dataloader=val_loader, save_dir=save_dir, plots=False, callbacks=callbacks, compute_loss=compute_loss) # Update best mAP fi = fitness(np.array(results).reshape(1, -1)) # weighted combination of [P, R, mAP@.5, mAP@.5-.95] stop = stopper(epoch=epoch, fitness=fi) # early stop check if fi > best_fitness: best_fitness = fi log_vals = list(mloss) + list(results) + lr callbacks.run('on_fit_epoch_end', log_vals, epoch, best_fitness, fi) # Save model if (not nosave) or (final_epoch and not evolve): # if save ckpt = { 'epoch': epoch, 'best_fitness': 
def parse_opt(known: bool = False) -> argparse.Namespace:
    # Build the command-line parser for the training script and parse the process arguments.
    #   known: when True, unrecognised arguments are ignored (parse_known_args); when False they are an error
    #          (parse_args).
    #   returns: the parsed argparse.Namespace.
    parser = argparse.ArgumentParser()
    parser.add_argument('--weights', type=str, default=ROOT / 'yolov5n.pt', help='initial weights path')
    parser.add_argument('--cfg', type=str, default='models/yolov5_aux.yaml', help='model.yaml path')
    # NOTE(review): the default below is an absolute, machine-specific path; if ROOT is a pathlib.Path the join
    # resolves to that absolute path and ROOT is ignored - confirm this is intended before sharing the script.
    parser.add_argument('--data', type=str, default=ROOT / '/home/hjj/Desktop/dataset/data.yaml', help='dataset.yaml path')
    parser.add_argument('--hyp', type=str, default=ROOT / 'data/hyps/hyp.scratch-low.yaml', help='hyperparameters path')
    parser.add_argument('--epochs', type=int, default=100, help='total training epochs')
    parser.add_argument('--batch-size', type=int, default=64, help='total batch size for all GPUs, -1 for autobatch')
    parser.add_argument('--imgsz', '--img', '--img-size', type=int, default=640, help='train, val image size (pixels)')
    parser.add_argument('--rect', action='store_true', help='rectangular training')
    # --resume alone yields True; --resume PATH yields the given string
    parser.add_argument('--resume', nargs='?', const=True, default=False, help='resume most recent training')
    parser.add_argument('--nosave', action='store_true', help='only save final checkpoint')
    parser.add_argument('--noval', action='store_true', help='only validate final epoch')
    parser.add_argument('--noautoanchor', action='store_true', help='disable AutoAnchor')
    parser.add_argument('--noplots', action='store_true', help='save no plot files')
    # --evolve alone yields 300 generations; omitted yields None
    parser.add_argument('--evolve', type=int, nargs='?', const=300, help='evolve hyperparameters for x generations')
    parser.add_argument('--bucket', type=str, default='', help='gsutil bucket')
    # NOTE(review): the default is the boolean True (not a string), so the option is set even when --cache is not
    # passed; --cache alone yields 'ram'. Confirm how the dataloader interprets True.
    parser.add_argument('--cache', type=str, nargs='?', const='ram', default=True, help='image --cache ram/disk')
    parser.add_argument('--image-weights', action='store_true', help='use weighted image selection for training')
    parser.add_argument('--device', default='0', help='cuda device, i.e. 0 or 0,1,2,3 or cpu')
    # '%%' is an escaped percent sign for argparse help formatting
    parser.add_argument('--multi-scale', action='store_true', help='vary img-size +/- 50%%')
    parser.add_argument('--single-cls', action='store_true', help='train multi-class data as single-class')
    parser.add_argument('--optimizer', type=str, choices=['SGD', 'Adam', 'AdamW'], default='SGD', help='optimizer')
    parser.add_argument('--sync-bn', action='store_true', help='use SyncBatchNorm, only available in DDP mode')
    parser.add_argument('--workers', type=int, default=4, help='max dataloader workers (per RANK in DDP mode)')
    parser.add_argument('--project', default=ROOT / 'runs/train', help='save to project/name')
    parser.add_argument('--name', default='exp', help='save to project/name')
    parser.add_argument('--exist-ok', action='store_true', help='existing project/name ok, do not increment')
    parser.add_argument('--quad', action='store_true', help='quad dataloader')
    parser.add_argument('--cos-lr', action='store_true', help='cosine LR scheduler')
    parser.add_argument('--label-smoothing', type=float, default=0.0, help='Label smoothing epsilon')
    parser.add_argument('--patience', type=int, default=100, help='EarlyStopping patience (epochs without improvement)')
    parser.add_argument('--freeze', nargs='+', type=int, default=[0], help='Freeze layers: backbone=10, first3=0 1 2')
    parser.add_argument('--save-period', type=int, default=-1, help='Save checkpoint every x epochs (disabled if < 1)')
    parser.add_argument('--seed', type=int, default=0, help='Global training seed')
    parser.add_argument('--local_rank', type=int, default=-1, help='Automatic DDP Multi-GPU argument, do not modify')

    # Logger arguments
    parser.add_argument('--entity', default=None, help='Entity')
    parser.add_argument('--upload_dataset', nargs='?', const=True, default=False, help='Upload data, "val" option')
    parser.add_argument('--bbox_interval', type=int, default=-1, help='Set bounding-box image logging interval')
    parser.add_argument('--artifact_alias', type=str, default='latest', help='Version of dataset artifact to use')

    # parse_known_args returns (namespace, leftover_args); only the namespace is kept
    return parser.parse_known_args()[0] if known else parser.parse_args()
torch.cuda.device_count() > LOCAL_RANK, 'insufficient CUDA devices for DDP command' torch.cuda.set_device(LOCAL_RANK) device = torch.device('cuda', LOCAL_RANK) dist.init_process_group(backend='nccl' if dist.is_nccl_available() else 'gloo') # Train if not opt.evolve: train(opt.hyp, opt, device, callbacks) # Evolve hyperparameters (optional) else: # Hyperparameter evolution metadata (mutation scale 0-1, lower_limit, upper_limit) meta = { 'lr0': (1, 1e-5, 1e-1), # initial learning rate (SGD=1E-2, Adam=1E-3) 'lrf': (1, 0.01, 1.0), # final OneCycleLR learning rate (lr0 * lrf) 'momentum': (0.3, 0.6, 0.98), # SGD momentum/Adam beta1 'weight_decay': (1, 0.0, 0.001), # optimizer weight decay 'warmup_epochs': (1, 0.0, 5.0), # warmup epochs (fractions ok) 'warmup_momentum': (1, 0.0, 0.95), # warmup initial momentum 'warmup_bias_lr': (1, 0.0, 0.2), # warmup initial bias lr 'box': (1, 0.02, 0.2), # box loss gain 'cls': (1, 0.2, 4.0), # cls loss gain 'cls_pw': (1, 0.5, 2.0), # cls BCELoss positive_weight 'obj': (1, 0.2, 4.0), # obj loss gain (scale with pixels) 'obj_pw': (1, 0.5, 2.0), # obj BCELoss positive_weight 'iou_t': (0, 0.1, 0.7), # IoU training threshold 'anchor_t': (1, 2.0, 8.0), # anchor-multiple threshold 'anchors': (2, 2.0, 10.0), # anchors per output grid (0 to ignore) 'fl_gamma': (0, 0.0, 2.0), # focal loss gamma (efficientDet default gamma=1.5) 'hsv_h': (1, 0.0, 0.1), # image HSV-Hue augmentation (fraction) 'hsv_s': (1, 0.0, 0.9), # image HSV-Saturation augmentation (fraction) 'hsv_v': (1, 0.0, 0.9), # image HSV-Value augmentation (fraction) 'degrees': (1, 0.0, 45.0), # image rotation (+/- deg) 'translate': (1, 0.0, 0.9), # image translation (+/- fraction) 'scale': (1, 0.0, 0.9), # image scale (+/- gain) 'shear': (1, 0.0, 10.0), # image shear (+/- deg) 'perspective': (0, 0.0, 0.001), # image perspective (+/- fraction), range 0-0.001 'flipud': (1, 0.0, 1.0), # image flip up-down (probability) 'fliplr': (0, 0.0, 1.0), # image flip left-right (probability) 'mosaic': 
(1, 0.0, 1.0), # image mixup (probability) 'mixup': (1, 0.0, 1.0), # image mixup (probability) 'copy_paste': (1, 0.0, 1.0)} # segment copy-paste (probability) with open(opt.hyp, errors='ignore') as f: hyp = yaml.safe_load(f) # load hyps dict if 'anchors' not in hyp: # anchors commented in hyp.yaml hyp['anchors'] = 3 if opt.noautoanchor: del hyp['anchors'], meta['anchors'] opt.noval, opt.nosave, save_dir = True, True, Path(opt.save_dir) # only val/save final epoch # ei = [isinstance(x, (int, float)) for x in hyp.values()] # evolvable indices evolve_yaml, evolve_csv = save_dir / 'hyp_evolve.yaml', save_dir / 'evolve.csv' if opt.bucket: # download evolve.csv if exists subprocess.run([ 'gsutil', 'cp', f'gs://{opt.bucket}/evolve.csv', str(evolve_csv),]) for _ in range(opt.evolve): # generations to evolve if evolve_csv.exists(): # if evolve.csv exists: select best hyps and mutate # Select parent(s) parent = 'single' # parent selection method: 'single' or 'weighted' x = np.loadtxt(evolve_csv, ndmin=2, delimiter=',', skiprows=1) n = min(5, len(x)) # number of previous results to consider x = x[np.argsort(-fitness(x))][:n] # top n mutations w = fitness(x) - fitness(x).min() + 1E-6 # weights (sum > 0) if parent == 'single' or len(x) == 1: # x = x[random.randint(0, n - 1)] # random selection x = x[random.choices(range(n), weights=w)[0]] # weighted selection elif parent == 'weighted': x = (x * w.reshape(n, 1)).sum(0) / w.sum() # weighted combination # Mutate mp, s = 0.8, 0.2 # mutation probability, sigma npr = np.random npr.seed(int(time.time())) g = np.array([meta[k][0] for k in hyp.keys()]) # gains 0-1 ng = len(meta) v = np.ones(ng) while all(v == 1): # mutate until a change occurs (prevent duplicates) v = (g * (npr.random(ng) < mp) * npr.randn(ng) * npr.random() * s + 1).clip(0.3, 3.0) for i, k in enumerate(hyp.keys()): # plt.hist(v.ravel(), 300) hyp[k] = float(x[i + 7] * v[i]) # mutate # Constrain to limits for k, v in meta.items(): hyp[k] = max(hyp[k], v[1]) # lower limit 
def threaded(func):
    # Multi-threads a target function and returns thread. Usage: @threaded decorator
    #   func: callable to run in the background.
    #   returns: a wrapper that, when called, starts a daemon threading.Thread running func(*args, **kwargs)
    #            and returns that already-started Thread (not func's return value).
    import functools  # local import so this block does not depend on the module import list

    @functools.wraps(func)  # keep func's __name__/__doc__ on the wrapper for logging and introspection
    def wrapper(*args, **kwargs):
        thread = threading.Thread(target=func, args=args, kwargs=kwargs, daemon=True)
        thread.start()
        return thread

    return wrapper
class Hardswish(nn.Module):
    # Hard-swish activation: x * clamp(x + 3, 0, 6) / 6
    @staticmethod
    def forward(x):
        # Built on F.hardtanh rather than F.hardsigmoid; the original note marks this form as the one
        # suitable for TorchScript, CoreML and ONNX.
        gate = F.hardtanh(x + 3, min_val=0.0, max_val=6.0)
        gated = x * gate
        return gated / 6.0
class FReLU(nn.Module):
    # FReLU activation https://arxiv.org/abs/2007.11824
    # Returns the element-wise maximum of the input and a batch-normalised depthwise convolution of the input.
    def __init__(self, c1, k=3):  # ch_in, kernel
        # c1: number of input channels (the depthwise conv keeps the same channel count)
        # k: depthwise kernel size; must be odd so that the spatial size is preserved
        super().__init__()
        # padding = k // 2 keeps H and W unchanged for odd k, which forward() needs in order to compare the
        # conv output with x; for the default k=3 this equals the previously hard-coded padding of 1
        self.conv = nn.Conv2d(c1, c1, k, 1, k // 2, groups=c1, bias=False)
        self.bn = nn.BatchNorm2d(c1)

    def forward(self, x):
        # x and the conv branch have identical shapes, so torch.max is a plain element-wise maximum
        return torch.max(x, self.bn(self.conv(x)))
""" def __init__(self, c1, k=1, s=1, r=16): # ch_in, kernel, stride, r super().__init__() c2 = max(r, c1 // r) self.p1 = nn.Parameter(torch.randn(1, c1, 1, 1)) self.p2 = nn.Parameter(torch.randn(1, c1, 1, 1)) self.fc1 = nn.Conv2d(c1, c2, k, s, bias=True) self.fc2 = nn.Conv2d(c2, c1, k, s, bias=True) # self.bn1 = nn.BatchNorm2d(c2) # self.bn2 = nn.BatchNorm2d(c1) def forward(self, x): y = x.mean(dim=2, keepdims=True).mean(dim=3, keepdims=True) # batch-size 1 bug/instabilities https://github.com/ultralytics/yolov5/issues/2891 # beta = torch.sigmoid(self.bn2(self.fc2(self.bn1(self.fc1(y))))) # bug/unstable beta = torch.sigmoid(self.fc2(self.fc1(y))) # bug patch BN layers removed dpx = (self.p1 - self.p2) * x return dpx * torch.sigmoid(beta * dpx) + self.p2 * x ================================================ FILE: yolo-improve/yolov5-AUX/utils/augmentations.py ================================================ # YOLOv5 🚀 by Ultralytics, GPL-3.0 license """ Image augmentation functions """ import math import random import cv2 import numpy as np import torch import torchvision.transforms as T import torchvision.transforms.functional as TF from utils.general import LOGGER, check_version, colorstr, resample_segments, segment2box, xywhn2xyxy from utils.metrics import bbox_ioa IMAGENET_MEAN = 0.485, 0.456, 0.406 # RGB mean IMAGENET_STD = 0.229, 0.224, 0.225 # RGB standard deviation class Albumentations: # YOLOv5 Albumentations class (optional, only used if package is installed) def __init__(self, size=640): self.transform = None prefix = colorstr('albumentations: ') try: import albumentations as A check_version(A.__version__, '1.0.3', hard=True) # version requirement T = [ A.RandomResizedCrop(height=size, width=size, scale=(0.8, 1.0), ratio=(0.9, 1.11), p=0.0), A.Blur(p=0.01), A.MedianBlur(p=0.01), A.ToGray(p=0.01), A.CLAHE(p=0.01), A.RandomBrightnessContrast(p=0.0), A.RandomGamma(p=0.0), A.ImageCompression(quality_lower=75, p=0.0)] # transforms self.transform = A.Compose(T, 
def denormalize(x, mean=IMAGENET_MEAN, std=IMAGENET_STD):
    # Denormalize RGB images x per ImageNet stats in BCHW format, i.e. = x * std + mean
    #   x: batch indexed as x[:, channel]; it is MODIFIED IN PLACE and the same object is returned,
    #      so pass a copy if the normalized values are still needed.
    #   mean, std: per-channel statistics, indexed by channel.
    # Channels are paired with the statistics and processing stops at whichever runs out first, so inputs
    # with fewer than 3 channels/statistics no longer raise IndexError (the loop used to be hard-coded to 3);
    # results for 3-channel input with 3-element statistics are unchanged.
    for i, m, s in zip(range(x.shape[1]), mean, std):
        x[:, i] = x[:, i] * s + m
    return x
def replicate(im, labels):
    # Copy the smaller half of the labelled boxes to random positions inside the image.
    #   im: HWC image array, edited in place
    #   labels: (n, 5) array of [cls, x1, y1, x2, y2] in pixels
    #   returns: (im, labels) where labels has one extra row per pasted box
    img_h, img_w = im.shape[:2]
    boxes = labels[:, 1:].astype(int)
    left, top, right, bottom = boxes.T
    mean_side = ((right - left) + (bottom - top)) / 2  # average side length (pixels)
    n_copies = round(mean_side.size * 0.5)
    smallest_first = mean_side.argsort()[:n_copies]
    for idx in smallest_first:
        src_x1, src_y1, src_x2, src_y2 = boxes[idx]
        box_h = src_y2 - src_y1
        box_w = src_x2 - src_x1
        # the y offset is drawn before the x offset; keep this order so seeded runs stay reproducible
        dst_y1 = int(random.uniform(0, img_h - box_h))
        dst_x1 = int(random.uniform(0, img_w - box_w))
        dst_y2 = dst_y1 + box_h
        dst_x2 = dst_x1 + box_w
        im[dst_y1:dst_y2, dst_x1:dst_x2] = im[src_y1:src_y2, src_x1:src_x2]
        pasted = [[labels[idx, 0], dst_x1, dst_y1, dst_x2, dst_y2]]
        labels = np.append(labels, pasted, axis=0)
    return im, labels
random_perspective(im, targets=(), segments=(), degrees=10, translate=.1, scale=.1, shear=10, perspective=0.0, border=(0, 0)): # torchvision.transforms.RandomAffine(degrees=(-10, 10), translate=(0.1, 0.1), scale=(0.9, 1.1), shear=(-10, 10)) # targets = [cls, xyxy] height = im.shape[0] + border[0] * 2 # shape(h,w,c) width = im.shape[1] + border[1] * 2 # Center C = np.eye(3) C[0, 2] = -im.shape[1] / 2 # x translation (pixels) C[1, 2] = -im.shape[0] / 2 # y translation (pixels) # Perspective P = np.eye(3) P[2, 0] = random.uniform(-perspective, perspective) # x perspective (about y) P[2, 1] = random.uniform(-perspective, perspective) # y perspective (about x) # Rotation and Scale R = np.eye(3) a = random.uniform(-degrees, degrees) # a += random.choice([-180, -90, 0, 90]) # add 90deg rotations to small rotations s = random.uniform(1 - scale, 1 + scale) # s = 2 ** random.uniform(-scale, scale) R[:2] = cv2.getRotationMatrix2D(angle=a, center=(0, 0), scale=s) # Shear S = np.eye(3) S[0, 1] = math.tan(random.uniform(-shear, shear) * math.pi / 180) # x shear (deg) S[1, 0] = math.tan(random.uniform(-shear, shear) * math.pi / 180) # y shear (deg) # Translation T = np.eye(3) T[0, 2] = random.uniform(0.5 - translate, 0.5 + translate) * width # x translation (pixels) T[1, 2] = random.uniform(0.5 - translate, 0.5 + translate) * height # y translation (pixels) # Combined rotation matrix M = T @ S @ R @ P @ C # order of operations (right to left) is IMPORTANT if (border[0] != 0) or (border[1] != 0) or (M != np.eye(3)).any(): # image changed if perspective: im = cv2.warpPerspective(im, M, dsize=(width, height), borderValue=(114, 114, 114)) else: # affine im = cv2.warpAffine(im, M[:2], dsize=(width, height), borderValue=(114, 114, 114)) # Visualize # import matplotlib.pyplot as plt # ax = plt.subplots(1, 2, figsize=(12, 6))[1].ravel() # ax[0].imshow(im[:, :, ::-1]) # base # ax[1].imshow(im2[:, :, ::-1]) # warped # Transform label coordinates n = len(targets) if n: use_segments = 
any(x.any() for x in segments) and len(segments) == n new = np.zeros((n, 4)) if use_segments: # warp segments segments = resample_segments(segments) # upsample for i, segment in enumerate(segments): xy = np.ones((len(segment), 3)) xy[:, :2] = segment xy = xy @ M.T # transform xy = xy[:, :2] / xy[:, 2:3] if perspective else xy[:, :2] # perspective rescale or affine # clip new[i] = segment2box(xy, width, height) else: # warp boxes xy = np.ones((n * 4, 3)) xy[:, :2] = targets[:, [1, 2, 3, 4, 1, 4, 3, 2]].reshape(n * 4, 2) # x1y1, x2y2, x1y2, x2y1 xy = xy @ M.T # transform xy = (xy[:, :2] / xy[:, 2:3] if perspective else xy[:, :2]).reshape(n, 8) # perspective rescale or affine # create new boxes x = xy[:, [0, 2, 4, 6]] y = xy[:, [1, 3, 5, 7]] new = np.concatenate((x.min(1), y.min(1), x.max(1), y.max(1))).reshape(4, n).T # clip new[:, [0, 2]] = new[:, [0, 2]].clip(0, width) new[:, [1, 3]] = new[:, [1, 3]].clip(0, height) # filter candidates i = box_candidates(box1=targets[:, 1:5].T * s, box2=new.T, area_thr=0.01 if use_segments else 0.10) targets = targets[i] targets[:, 1:5] = new[i] return im, targets def copy_paste(im, labels, segments, p=0.5): # Implement Copy-Paste augmentation https://arxiv.org/abs/2012.07177, labels as nx5 np.array(cls, xyxy) n = len(segments) if p and n: h, w, c = im.shape # height, width, channels im_new = np.zeros(im.shape, np.uint8) for j in random.sample(range(n), k=round(p * n)): l, s = labels[j], segments[j] box = w - l[3], l[2], w - l[1], l[4] ioa = bbox_ioa(box, labels[:, 1:5]) # intersection over area if (ioa < 0.30).all(): # allow 30% obscuration of existing labels labels = np.concatenate((labels, [[l[0], *box]]), 0) segments.append(np.concatenate((w - s[:, 0:1], s[:, 1:2]), 1)) cv2.drawContours(im_new, [segments[j].astype(np.int32)], -1, (1, 1, 1), cv2.FILLED) result = cv2.flip(im, 1) # augment segments (flip left-right) i = cv2.flip(im_new, 1).astype(bool) im[i] = result[i] # cv2.imwrite('debug.jpg', im) # debug return im, labels, 
def box_candidates(box1, box2, wh_thr=2, ar_thr=100, area_thr=0.1, eps=1e-16):  # box1(4,n), box2(4,n)
    # Select boxes that are still usable after augmentation.
    #   box1: boxes before augment, box2: boxes after augment, both laid out as rows x1, y1, x2, y2
    #   wh_thr: minimum width/height (pixels), ar_thr: maximum aspect ratio, area_thr: minimum area ratio after/before
    #   returns: boolean mask with one entry per box
    width_before = box1[2] - box1[0]
    height_before = box1[3] - box1[1]
    width_after = box2[2] - box2[0]
    height_after = box2[3] - box2[1]

    # aspect ratio is taken as the larger of w/h and h/w; eps guards against division by zero
    aspect = np.maximum(width_after / (height_after + eps), height_after / (width_after + eps))

    wide_enough = width_after > wh_thr
    tall_enough = height_after > wh_thr
    area_kept = width_after * height_after / (width_before * height_before + eps) > area_thr
    shape_ok = aspect < ar_thr
    return wide_enough & tall_enough & area_kept & shape_ok
class LetterBox:
    # YOLOv5 LetterBox class for image preprocessing, i.e. T.Compose([LetterBox(size), ToTensor()])
    # Resizes an image to fit inside the target size while keeping its aspect ratio, then centres it on a
    # canvas filled with the value 114.
    def __init__(self, size=(640, 640), auto=False, stride=32):
        # size: int (square) or (height, width) of the target
        # auto: when True the canvas is the resized image rounded up to a multiple of stride instead of size
        # stride: rounding step, used with auto
        super().__init__()
        self.h, self.w = (size, size) if isinstance(size, int) else size
        self.auto = auto  # pass max size integer, automatically solve for short side using stride
        self.stride = stride  # used with auto

    def _canvas_shape(self, h, w):
        # Return the (height, width) of the output canvas for a resized image of h x w pixels.
        if self.auto:
            hs, ws = (math.ceil(x / self.stride) * self.stride for x in (h, w))
            return hs, ws
        return self.h, self.w

    def __call__(self, im):  # im = np.array HWC
        imh, imw = im.shape[:2]
        r = min(self.h / imh, self.w / imw)  # ratio of new/old
        h, w = round(imh * r), round(imw * r)  # resized image
        # Previously written as `gen if self.auto else self.h, self.w`, which parses as
        # `(gen if self.auto else self.h), self.w`: with auto=True hs became a generator and the next line
        # raised TypeError. The canvas is also built from (hs, ws) so the auto shape is actually used;
        # with auto=False this is identical to (self.h, self.w).
        hs, ws = self._canvas_shape(h, w)
        top, left = round((hs - h) / 2 - 0.1), round((ws - w) / 2 - 0.1)
        im_out = np.full((hs, ws, 3), 114, dtype=im.dtype)
        im_out[top:top + h, left:left + w] = cv2.resize(im, (w, h), interpolation=cv2.INTER_LINEAR)
        return im_out
@TryExcept(f'{PREFIX}ERROR')
def check_anchors(dataset, model, thr=4.0, imgsz=640):
    # Check anchor fit to data, recompute if necessary
    #
    # Arguments:
    #     dataset: object exposing `.shapes` and `.labels` (label columns 3:5 are used as width/height)
    #     model:   model whose last layer (model.model[-1], or model.module.model[-1] when wrapped)
    #              exposes `.anchors` and `.stride`
    #     thr:     ratio threshold; a label counts as matched when its worst-side ratio is > 1 / thr
    #     imgsz:   size the longest image side is scaled to
    # Side effects: may overwrite m.anchors in place; always logs the outcome via LOGGER.
    m = model.module.model[-1] if hasattr(model, 'module') else model.model[-1]  # Detect()
    # Scale every image shape so that its longest side equals imgsz
    shapes = imgsz * dataset.shapes / dataset.shapes.max(1, keepdims=True)
    scale = np.random.uniform(0.9, 1.1, size=(shapes.shape[0], 1))  # augment scale
    # Label width/height multiplied by the (jittered) per-image shape, all images concatenated
    wh = torch.tensor(np.concatenate([l[:, 3:5] * s for s, l in zip(shapes * scale, dataset.labels)])).float()  # wh

    def metric(k):  # compute metric
        # k: anchors as rows of (w, h); returns (best possible recall, mean anchors above threshold)
        r = wh[:, None] / k[None]
        x = torch.min(r, 1 / r).min(2)[0]  # ratio metric
        best = x.max(1)[0]  # best_x
        aat = (x > 1 / thr).float().sum(1).mean()  # anchors above threshold
        bpr = (best > 1 / thr).float().mean()  # best possible recall
        return bpr, aat

    stride = m.stride.to(m.anchors.device).view(-1, 1, 1)  # model strides
    anchors = m.anchors.clone() * stride  # current anchors
    bpr, aat = metric(anchors.cpu().view(-1, 2))
    s = f'\n{PREFIX}{aat:.2f} anchors/target, {bpr:.3f} Best Possible Recall (BPR). '
    if bpr > 0.98:  # threshold to recompute
        LOGGER.info(f'{s}Current anchors are a good fit to dataset ✅')
    else:
        LOGGER.info(f'{s}Anchors are a poor fit to dataset ⚠️, attempting to improve...')
        na = m.anchors.numel() // 2  # number of anchors
        anchors = kmean_anchors(dataset, n=na, img_size=imgsz, thr=thr, gen=1000, verbose=False)
        new_bpr = metric(anchors)[0]
        if new_bpr > bpr:  # replace anchors
            anchors = torch.tensor(anchors, device=m.anchors.device).type_as(m.anchors)
            m.anchors[:] = anchors.clone().view_as(m.anchors)
            check_anchor_order(m)  # must be in pixel-space (not grid-space)
            m.anchors /= stride  # divide by stride only after the order check above
            s = f'{PREFIX}Done ✅ (optional: update model *.yaml to use these anchors in the future)'
        else:
            s = f'{PREFIX}Done ⚠️ (original anchors better than new anchors, proceeding with original anchors)'
        LOGGER.info(s)
thr).float()).mean() # fitness def print_results(k, verbose=True): k = k[np.argsort(k.prod(1))] # sort small to large x, best = metric(k, wh0) bpr, aat = (best > thr).float().mean(), (x > thr).float().mean() * n # best possible recall, anch > thr s = f'{PREFIX}thr={thr:.2f}: {bpr:.4f} best possible recall, {aat:.2f} anchors past thr\n' \ f'{PREFIX}n={n}, img_size={img_size}, metric_all={x.mean():.3f}/{best.mean():.3f}-mean/best, ' \ f'past_thr={x[x > thr].mean():.3f}-mean: ' for x in k: s += '%i,%i, ' % (round(x[0]), round(x[1])) if verbose: LOGGER.info(s[:-2]) return k if isinstance(dataset, str): # *.yaml file with open(dataset, errors='ignore') as f: data_dict = yaml.safe_load(f) # model dict from utils.dataloaders import LoadImagesAndLabels dataset = LoadImagesAndLabels(data_dict['train'], augment=True, rect=True) # Get label wh shapes = img_size * dataset.shapes / dataset.shapes.max(1, keepdims=True) wh0 = np.concatenate([l[:, 3:5] * s for s, l in zip(shapes, dataset.labels)]) # wh # Filter i = (wh0 < 3.0).any(1).sum() if i: LOGGER.info(f'{PREFIX}WARNING ⚠️ Extremely small objects found: {i} of {len(wh0)} labels are <3 pixels in size') wh = wh0[(wh0 >= 2.0).any(1)].astype(np.float32) # filter > 2 pixels # wh = wh * (npr.rand(wh.shape[0], 1) * 0.9 + 0.1) # multiply by random scale 0-1 # Kmeans init try: LOGGER.info(f'{PREFIX}Running kmeans for {n} anchors on {len(wh)} points...') assert n <= len(wh) # apply overdetermined constraint s = wh.std(0) # sigmas for whitening k = kmeans(wh / s, n, iter=30)[0] * s # points assert n == len(k) # kmeans may return fewer points than requested if wh is insufficient or too similar except Exception: LOGGER.warning(f'{PREFIX}WARNING ⚠️ switching strategies from kmeans to random init') k = np.sort(npr.rand(n * 2)).reshape(n, 2) * img_size # random init wh, wh0 = (torch.tensor(x, dtype=torch.float32) for x in (wh, wh0)) k = print_results(k, verbose=False) # Plot # k, d = [None] * 20, [None] * 20 # for i in tqdm(range(1, 21)): # 
def autobatch(model, imgsz=640, fraction=0.8, batch_size=16):
    # Automatically estimate best YOLOv5 batch size to use `fraction` of available CUDA memory
    # Usage:
    #     import torch
    #     from utils.autobatch import autobatch
    #     model = torch.hub.load('ultralytics/yolov5', 'yolov5s', autoshape=False)
    #     print(autobatch(model))
    #
    # Arguments:
    #     model:      model whose parameters determine the device to profile on
    #     imgsz:      square image size used for the profiling inputs
    #     fraction:   share of the computed free memory the chosen batch size may use
    #     batch_size: fallback returned on CPU, with cudnn.benchmark enabled, when profiling fails,
    #                 or when the estimate falls outside 1..1024
    # Returns:
    #     int batch size

    # Check device
    prefix = colorstr('AutoBatch: ')
    LOGGER.info(f'{prefix}Computing optimal batch size for --imgsz {imgsz}')
    device = next(model.parameters()).device  # get model device
    if device.type == 'cpu':
        LOGGER.info(f'{prefix}CUDA not detected, using default CPU batch-size {batch_size}')
        return batch_size
    if torch.backends.cudnn.benchmark:
        LOGGER.info(f'{prefix} ⚠️ Requires torch.backends.cudnn.benchmark=False, using default batch-size {batch_size}')
        return batch_size

    # Inspect CUDA memory
    gb = 1 << 30  # bytes to GiB (1024 ** 3)
    d = str(device).upper()  # 'CUDA:0'
    properties = torch.cuda.get_device_properties(device)  # device properties
    t = properties.total_memory / gb  # GiB total
    r = torch.cuda.memory_reserved(device) / gb  # GiB reserved
    a = torch.cuda.memory_allocated(device) / gb  # GiB allocated
    f = t - (r + a)  # GiB free
    LOGGER.info(f'{prefix}{d} ({properties.name}) {t:.2f}G total, {r:.2f}G reserved, {a:.2f}G allocated, {f:.2f}G free')

    # Profile batch sizes
    batch_sizes = [1, 2, 4, 8, 16]
    try:
        img = [torch.empty(b, 3, imgsz, imgsz) for b in batch_sizes]
        results = profile(img, model, n=3, device=device)
    except Exception as e:
        # Without profiling results nothing below can run (`results` would be unbound and raise
        # NameError), so fall back to the caller-supplied default instead of crashing.
        LOGGER.warning(f'{prefix}WARNING ⚠️ error detected: {e}, using default batch-size {batch_size}')
        return batch_size

    # Fit a solution
    y = [x[2] for x in results if x]  # memory [2]
    if len(y) < 2:  # a first-degree fit needs at least two successful measurements
        LOGGER.warning(f'{prefix}WARNING ⚠️ too few successful profiles, using default batch-size {batch_size}')
        return batch_size
    p = np.polyfit(batch_sizes[:len(y)], y, deg=1)  # first degree polynomial fit
    b = int((f * fraction - p[1]) / p[0])  # y intercept (optimal batch size)
    if None in results:  # some sizes failed
        i = results.index(None)  # first fail index
        if b >= batch_sizes[i]:  # y intercept above failure point
            b = batch_sizes[max(i - 1, 0)]  # select prior safe point
    if b < 1 or b > 1024:  # b outside of safe range
        b = batch_size
        LOGGER.warning(f'{prefix}WARNING ⚠️ CUDA anomaly detected, recommend restart environment and retry command.')

    fraction = (np.polyval(p, b) + r + a) / t  # actual fraction predicted
    LOGGER.info(f'{prefix}Using batch-size {b} for {d} {t * fraction:.2f}G/{t:.2f}G ({fraction * 100:.0f}%) ✅')
    return b
# Resume all interrupted trainings in yolov5/ dir including DDP trainings
# Usage: $ python utils/aws/resume.py
#
# Walks the current working directory for `last.pt` checkpoints, skips those whose 'optimizer'
# entry is None, and relaunches `train.py --resume` for each remaining one as a background
# shell command.

import os
import sys
from pathlib import Path

import torch
import yaml

FILE = Path(__file__).resolve()
ROOT = FILE.parents[2]  # YOLOv5 root directory
if str(ROOT) not in sys.path:
    sys.path.append(str(ROOT))  # add ROOT to PATH

port = 0  # --master_port
path = Path('').resolve()  # search root is the current working directory
for last in path.rglob('*/**/last.pt'):
    ckpt = torch.load(last)
    if ckpt['optimizer'] is None:  # nothing to resume from this checkpoint
        continue

    # Load opt.yaml
    # (expected two directory levels above last.pt, i.e. <run>/opt.yaml for <run>/<subdir>/last.pt)
    with open(last.parent.parent / 'opt.yaml', errors='ignore') as f:
        opt = yaml.safe_load(f)

    # Get device count
    d = opt['device'].split(',')  # devices
    nd = len(d)  # number of devices
    # NOTE(review): str.split(',') never returns an empty list ('' -> ['']), so `nd == 0` looks
    # unreachable and an empty device string takes the single-GPU branch -- confirm intent.
    ddp = nd > 1 or (nd == 0 and torch.cuda.device_count() > 1)  # distributed data parallel

    if ddp:  # multi-GPU
        port += 1  # each DDP relaunch gets its own master port
        cmd = f'python -m torch.distributed.run --nproc_per_node {nd} --master_port {port} train.py --resume {last}'
    else:  # single-GPU
        cmd = f'python train.py --resume {last}'

    cmd += ' > /dev/null 2>&1 &'  # redirect output to dev/null and run in daemon thread
    print(cmd)
    os.system(cmd)
class Callbacks:
    """
    Registry of callback actions for the YOLOv5 training/validation hooks
    """

    def __init__(self):
        # Every supported hook starts out with an empty list of actions
        hook_names = (
            'on_pretrain_routine_start',
            'on_pretrain_routine_end',
            'on_train_start',
            'on_train_epoch_start',
            'on_train_batch_start',
            'optimizer_step',
            'on_before_zero_grad',
            'on_train_batch_end',
            'on_train_epoch_end',
            'on_val_start',
            'on_val_batch_start',
            'on_val_image_end',
            'on_val_batch_end',
            'on_val_end',
            'on_fit_epoch_end',  # fit = train + val
            'on_model_save',
            'on_train_end',
            'on_params_update',
            'teardown',
        )
        self._callbacks = {hook_name: [] for hook_name in hook_names}
        self.stop_training = False  # set True to interrupt training

    def register_action(self, hook, name='', callback=None):
        """
        Append an action to the list stored for a hook

        Args:
            hook: Name of an existing hook to attach the action to
            name: Label stored alongside the callback for later reference
            callback: Callable fired when the hook runs
        """
        registry = self._callbacks
        assert hook in registry, f"hook '{hook}' not found in callbacks {registry}"
        assert callable(callback), f"callback '{callback}' is not callable"
        action = {'name': name, 'callback': callback}
        registry[hook].append(action)

    def get_registered_actions(self, hook=None):
        """
        Return the actions registered for one hook, or the whole hook dictionary

        Args:
            hook: Name of the hook to look up; a falsy value returns all hooks
        """
        if hook:
            return self._callbacks[hook]
        return self._callbacks

    def run(self, hook, *args, thread=False, **kwargs):
        """
        Fire every action registered for a hook, in registration order

        Args:
            hook: Name of the hook whose actions are fired
            args: Positional arguments forwarded to each callback
            thread: (boolean) Start each callback in its own daemon thread instead of calling it inline
            kwargs: Keyword arguments forwarded to each callback
        """
        assert hook in self._callbacks, f"hook '{hook}' not found in callbacks {self._callbacks}"
        for action in self._callbacks[hook]:
            fire = action['callback']
            if not thread:
                fire(*args, **kwargs)
                continue
            threading.Thread(target=fire, args=args, kwargs=kwargs, daemon=True).start()
def get_hash(paths):
    # Returns one SHA-256 hex digest for a list of path strings (files or dirs).
    # The digest covers the summed on-disk size of the paths that exist, followed by all path
    # strings concatenated in order.
    total_bytes = 0
    for entry in paths:
        if os.path.exists(entry):  # missing paths contribute no size
            total_bytes += os.path.getsize(entry)
    digest = hashlib.sha256()
    digest.update(str(total_bytes).encode())  # hash sizes
    digest.update(''.join(paths).encode())  # hash paths
    return digest.hexdigest()
""" exif = image.getexif() orientation = exif.get(0x0112, 1) # default 1 if orientation > 1: method = { 2: Image.FLIP_LEFT_RIGHT, 3: Image.ROTATE_180, 4: Image.FLIP_TOP_BOTTOM, 5: Image.TRANSPOSE, 6: Image.ROTATE_270, 7: Image.TRANSVERSE, 8: Image.ROTATE_90}.get(orientation) if method is not None: image = image.transpose(method) del exif[0x0112] image.info['exif'] = exif.tobytes() return image def seed_worker(worker_id): # Set dataloader worker seed https://pytorch.org/docs/stable/notes/randomness.html#dataloader worker_seed = torch.initial_seed() % 2 ** 32 np.random.seed(worker_seed) random.seed(worker_seed) def create_dataloader(path, imgsz, batch_size, stride, single_cls=False, hyp=None, augment=False, cache=False, pad=0.0, rect=False, rank=-1, workers=8, image_weights=False, quad=False, prefix='', shuffle=False, seed=0): if rect and shuffle: LOGGER.warning('WARNING ⚠️ --rect is incompatible with DataLoader shuffle, setting shuffle=False') shuffle = False with torch_distributed_zero_first(rank): # init dataset *.cache only once if DDP dataset = LoadImagesAndLabels( path, imgsz, batch_size, augment=augment, # augmentation hyp=hyp, # hyperparameters rect=rect, # rectangular batches cache_images=cache, single_cls=single_cls, stride=int(stride), pad=pad, image_weights=image_weights, prefix=prefix) batch_size = min(batch_size, len(dataset)) nd = torch.cuda.device_count() # number of CUDA devices nw = min([os.cpu_count() // max(nd, 1), batch_size if batch_size > 1 else 0, workers]) # number of workers sampler = None if rank == -1 else distributed.DistributedSampler(dataset, shuffle=shuffle) loader = DataLoader if image_weights else InfiniteDataLoader # only DataLoader allows for attribute updates generator = torch.Generator() generator.manual_seed(6148914691236517205 + seed + RANK) return loader(dataset, batch_size=batch_size, shuffle=shuffle and sampler is None, num_workers=nw, sampler=sampler, pin_memory=PIN_MEMORY, collate_fn=LoadImagesAndLabels.collate_fn4 if quad 
else LoadImagesAndLabels.collate_fn, worker_init_fn=seed_worker, generator=generator), dataset class InfiniteDataLoader(dataloader.DataLoader): """ Dataloader that reuses workers Uses same syntax as vanilla DataLoader """ def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) object.__setattr__(self, 'batch_sampler', _RepeatSampler(self.batch_sampler)) self.iterator = super().__iter__() def __len__(self): return len(self.batch_sampler.sampler) def __iter__(self): for _ in range(len(self)): yield next(self.iterator) class _RepeatSampler: """ Sampler that repeats forever Args: sampler (Sampler) """ def __init__(self, sampler): self.sampler = sampler def __iter__(self): while True: yield from iter(self.sampler) class LoadScreenshots: # YOLOv5 screenshot dataloader, i.e. `python detect.py --source "screen 0 100 100 512 256"` def __init__(self, source, img_size=640, stride=32, auto=True, transforms=None): # source = [screen_number left top width height] (pixels) check_requirements('mss') import mss source, *params = source.split() self.screen, left, top, width, height = 0, None, None, None, None # default to full screen 0 if len(params) == 1: self.screen = int(params[0]) elif len(params) == 4: left, top, width, height = (int(x) for x in params) elif len(params) == 5: self.screen, left, top, width, height = (int(x) for x in params) self.img_size = img_size self.stride = stride self.transforms = transforms self.auto = auto self.mode = 'stream' self.frame = 0 self.sct = mss.mss() # Parse monitor shape monitor = self.sct.monitors[self.screen] self.top = monitor['top'] if top is None else (monitor['top'] + top) self.left = monitor['left'] if left is None else (monitor['left'] + left) self.width = width or monitor['width'] self.height = height or monitor['height'] self.monitor = {'left': self.left, 'top': self.top, 'width': self.width, 'height': self.height} def __iter__(self): return self def __next__(self): # mss screen capture: get raw pixels from the 
class LoadImages:
    # YOLOv5 image/video dataloader, i.e. `python detect.py --source image.jpg/vid.mp4`
    #
    # Iterating yields (path, im, im0, cap, s): the file path, the preprocessed image, the raw image
    # as read by cv2, the current cv2.VideoCapture (None when no video was found) and a log prefix.
    def __init__(self, path, img_size=640, stride=32, auto=True, transforms=None, vid_stride=1):
        # path:       file, directory, glob pattern, list/tuple of those, or a *.txt listing them
        # img_size,
        # stride,
        # auto:       forwarded to letterbox() when no `transforms` is given
        # transforms: optional callable applied to the raw image instead of letterbox()
        # vid_stride: number of frames grabbed per yielded video frame
        if isinstance(path, str) and Path(path).suffix == '.txt':  # *.txt file with img/vid/dir on each line
            path = Path(path).read_text().rsplit()
        files = []
        for p in sorted(path) if isinstance(path, (list, tuple)) else [path]:
            p = str(Path(p).resolve())
            if '*' in p:
                files.extend(sorted(glob.glob(p, recursive=True)))  # glob
            elif os.path.isdir(p):
                files.extend(sorted(glob.glob(os.path.join(p, '*.*'))))  # dir
            elif os.path.isfile(p):
                files.append(p)  # files
            else:
                raise FileNotFoundError(f'{p} does not exist')

        # Split by file extension; files matching neither format list are dropped
        images = [x for x in files if x.split('.')[-1].lower() in IMG_FORMATS]
        videos = [x for x in files if x.split('.')[-1].lower() in VID_FORMATS]
        ni, nv = len(images), len(videos)

        self.img_size = img_size
        self.stride = stride
        self.files = images + videos  # all images first, then all videos
        self.nf = ni + nv  # number of files
        self.video_flag = [False] * ni + [True] * nv  # parallel to self.files
        self.mode = 'image'
        self.auto = auto
        self.transforms = transforms  # optional
        self.vid_stride = vid_stride  # video frame-rate stride
        if any(videos):
            self._new_video(videos[0])  # new video
        else:
            self.cap = None
        # `p` here is the last source processed by the loop above
        assert self.nf > 0, f'No images or videos found in {p}. ' \
                            f'Supported formats are:\nimages: {IMG_FORMATS}\nvideos: {VID_FORMATS}'

    def __iter__(self):
        self.count = 0  # index into self.files
        return self

    def __next__(self):
        if self.count == self.nf:
            raise StopIteration
        path = self.files[self.count]

        if self.video_flag[self.count]:
            # Read video
            self.mode = 'video'
            for _ in range(self.vid_stride):  # advance vid_stride frames, decode only the last one
                self.cap.grab()
            ret_val, im0 = self.cap.retrieve()
            while not ret_val:  # current video exhausted (or unreadable): move on to the next file
                self.count += 1
                self.cap.release()
                if self.count == self.nf:  # last video
                    raise StopIteration
                path = self.files[self.count]
                self._new_video(path)
                ret_val, im0 = self.cap.read()

            self.frame += 1
            # im0 = self._cv2_rotate(im0)  # for use if cv2 autorotation is False
            s = f'video {self.count + 1}/{self.nf} ({self.frame}/{self.frames}) {path}: '

        else:
            # Read image
            self.count += 1
            im0 = cv2.imread(path)  # BGR
            assert im0 is not None, f'Image Not Found {path}'
            s = f'image {self.count}/{self.nf} {path}: '

        if self.transforms:
            im = self.transforms(im0)  # transforms
        else:
            im = letterbox(im0, self.img_size, stride=self.stride, auto=self.auto)[0]  # padded resize
            im = im.transpose((2, 0, 1))[::-1]  # HWC to CHW, BGR to RGB
            im = np.ascontiguousarray(im)  # contiguous

        return path, im, im0, self.cap, s

    def _new_video(self, path):
        # Create a new video capture object
        # Resets the frame counter and caches the (stride-adjusted) frame count and orientation
        self.frame = 0
        self.cap = cv2.VideoCapture(path)
        self.frames = int(self.cap.get(cv2.CAP_PROP_FRAME_COUNT) / self.vid_stride)
        self.orientation = int(self.cap.get(cv2.CAP_PROP_ORIENTATION_META))  # rotation degrees
        # self.cap.set(cv2.CAP_PROP_ORIENTATION_AUTO, 0)  # disable https://github.com/ultralytics/yolov5/issues/8493

    def _cv2_rotate(self, im):
        # Rotate a cv2 video manually
        # NOTE(review): only referenced from a commented-out line in __next__; the mapping below
        # (orientation 0 -> 90° clockwise, 180 -> 90° counter-clockwise, 90 -> 180°) looks unusual
        # -- confirm before enabling.
        if self.orientation == 0:
            return cv2.rotate(im, cv2.ROTATE_90_CLOCKWISE)
        elif self.orientation == 180:
            return cv2.rotate(im, cv2.ROTATE_90_COUNTERCLOCKWISE)
        elif self.orientation == 90:
            return cv2.rotate(im, cv2.ROTATE_180)
        return im

    def __len__(self):
        return self.nf  # number of files
`python detect.py --source 'rtsp://example.com/media.mp4' # RTSP, RTMP, HTTP streams` def __init__(self, sources='file.streams', img_size=640, stride=32, auto=True, transforms=None, vid_stride=1): torch.backends.cudnn.benchmark = True # faster for fixed-size inference self.mode = 'stream' self.img_size = img_size self.stride = stride self.vid_stride = vid_stride # video frame-rate stride sources = Path(sources).read_text().rsplit() if os.path.isfile(sources) else [sources] n = len(sources) self.sources = [clean_str(x) for x in sources] # clean source names for later self.imgs, self.fps, self.frames, self.threads = [None] * n, [0] * n, [0] * n, [None] * n for i, s in enumerate(sources): # index, source # Start thread to read frames from video stream st = f'{i + 1}/{n}: {s}... ' if urlparse(s).hostname in ('www.youtube.com', 'youtube.com', 'youtu.be'): # if source is YouTube video # YouTube format i.e. 'https://www.youtube.com/watch?v=Zgi9g1ksQHc' or 'https://youtu.be/Zgi9g1ksQHc' check_requirements(('pafy', 'youtube_dl==2020.12.2')) import pafy s = pafy.new(s).getbest(preftype='mp4').url # YouTube URL s = eval(s) if s.isnumeric() else s # i.e. s = '0' local webcam if s == 0: assert not is_colab(), '--source 0 webcam unsupported on Colab. Rerun command in a local environment.' assert not is_kaggle(), '--source 0 webcam unsupported on Kaggle. Rerun command in a local environment.' 
cap = cv2.VideoCapture(s) assert cap.isOpened(), f'{st}Failed to open {s}' w = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH)) h = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT)) fps = cap.get(cv2.CAP_PROP_FPS) # warning: may return 0 or nan self.frames[i] = max(int(cap.get(cv2.CAP_PROP_FRAME_COUNT)), 0) or float('inf') # infinite stream fallback self.fps[i] = max((fps if math.isfinite(fps) else 0) % 100, 0) or 30 # 30 FPS fallback _, self.imgs[i] = cap.read() # guarantee first frame self.threads[i] = Thread(target=self.update, args=([i, cap, s]), daemon=True) LOGGER.info(f'{st} Success ({self.frames[i]} frames {w}x{h} at {self.fps[i]:.2f} FPS)') self.threads[i].start() LOGGER.info('') # newline # check for common shapes s = np.stack([letterbox(x, img_size, stride=stride, auto=auto)[0].shape for x in self.imgs]) self.rect = np.unique(s, axis=0).shape[0] == 1 # rect inference if all shapes equal self.auto = auto and self.rect self.transforms = transforms # optional if not self.rect: LOGGER.warning('WARNING ⚠️ Stream shapes differ. 
def img2label_paths(img_paths):
    # Derive label file paths from image file paths: the LAST '<sep>images<sep>' component is
    # swapped for '<sep>labels<sep>' and the final extension is replaced by '.txt'
    images_dir = os.sep + 'images' + os.sep  # /images/ substring
    labels_dir = os.sep + 'labels' + os.sep  # /labels/ substring
    label_paths = []
    for img_path in img_paths:
        swapped = labels_dir.join(img_path.rsplit(images_dir, 1))
        stem = swapped.rsplit('.', 1)[0]  # drop the extension, if any
        label_paths.append(stem + '.txt')
    return label_paths
min_items=0, prefix=''): self.img_size = img_size self.augment = augment self.hyp = hyp self.image_weights = image_weights self.rect = False if image_weights else rect self.mosaic = self.augment and not self.rect # load 4 images at a time into a mosaic (only during training) self.mosaic_border = [-img_size // 2, -img_size // 2] self.stride = stride self.path = path self.albumentations = Albumentations(size=img_size) if augment else None try: f = [] # image files for p in path if isinstance(path, list) else [path]: p = Path(p) # os-agnostic if p.is_dir(): # dir f += glob.glob(str(p / '**' / '*.*'), recursive=True) # f = list(p.rglob('*.*')) # pathlib elif p.is_file(): # file with open(p) as t: t = t.read().strip().splitlines() parent = str(p.parent) + os.sep f += [x.replace('./', parent, 1) if x.startswith('./') else x for x in t] # to global path # f += [p.parent / x.lstrip(os.sep) for x in t] # to global path (pathlib) else: raise FileNotFoundError(f'{prefix}{p} does not exist') self.im_files = sorted(x.replace('/', os.sep) for x in f if x.split('.')[-1].lower() in IMG_FORMATS) # self.img_files = sorted([x for x in f if x.suffix[1:].lower() in IMG_FORMATS]) # pathlib assert self.im_files, f'{prefix}No images found' except Exception as e: raise Exception(f'{prefix}Error loading data from {path}: {e}\n{HELP_URL}') from e # Check cache self.label_files = img2label_paths(self.im_files) # labels cache_path = (p if p.is_file() else Path(self.label_files[0]).parent).with_suffix('.cache') try: cache, exists = np.load(cache_path, allow_pickle=True).item(), True # load dict assert cache['version'] == self.cache_version # matches current version assert cache['hash'] == get_hash(self.label_files + self.im_files) # identical hash except Exception: cache, exists = self.cache_labels(cache_path, prefix), False # run cache ops # Display cache nf, nm, ne, nc, n = cache.pop('results') # found, missing, empty, corrupt, total if exists and LOCAL_RANK in {-1, 0}: d = f'Scanning 
{cache_path}... {nf} images, {nm + ne} backgrounds, {nc} corrupt' tqdm(None, desc=prefix + d, total=n, initial=n, bar_format=TQDM_BAR_FORMAT) # display cache results if cache['msgs']: LOGGER.info('\n'.join(cache['msgs'])) # display warnings assert nf > 0 or not augment, f'{prefix}No labels found in {cache_path}, can not start training. {HELP_URL}' # Read cache [cache.pop(k) for k in ('hash', 'version', 'msgs')] # remove items labels, shapes, self.segments = zip(*cache.values()) nl = len(np.concatenate(labels, 0)) # number of labels assert nl > 0 or not augment, f'{prefix}All labels empty in {cache_path}, can not start training. {HELP_URL}' self.labels = list(labels) self.shapes = np.array(shapes) self.im_files = list(cache.keys()) # update self.label_files = img2label_paths(cache.keys()) # update # Filter images if min_items: include = np.array([len(x) >= min_items for x in self.labels]).nonzero()[0].astype(int) LOGGER.info(f'{prefix}{n - len(include)}/{n} images filtered from dataset') self.im_files = [self.im_files[i] for i in include] self.label_files = [self.label_files[i] for i in include] self.labels = [self.labels[i] for i in include] self.segments = [self.segments[i] for i in include] self.shapes = self.shapes[include] # wh # Create indices n = len(self.shapes) # number of images bi = np.floor(np.arange(n) / batch_size).astype(int) # batch index nb = bi[-1] + 1 # number of batches self.batch = bi # batch index of image self.n = n self.indices = range(n) # Update labels include_class = [] # filter labels to include only these classes (optional) include_class_array = np.array(include_class).reshape(1, -1) for i, (label, segment) in enumerate(zip(self.labels, self.segments)): if include_class: j = (label[:, 0:1] == include_class_array).any(1) self.labels[i] = label[j] if segment: self.segments[i] = segment[j] if single_cls: # single-class training, merge all classes into 0 self.labels[i][:, 0] = 0 # Rectangular Training if self.rect: # Sort by aspect ratio s = 
self.shapes # wh ar = s[:, 1] / s[:, 0] # aspect ratio irect = ar.argsort() self.im_files = [self.im_files[i] for i in irect] self.label_files = [self.label_files[i] for i in irect] self.labels = [self.labels[i] for i in irect] self.segments = [self.segments[i] for i in irect] self.shapes = s[irect] # wh ar = ar[irect] # Set training image shapes shapes = [[1, 1]] * nb for i in range(nb): ari = ar[bi == i] mini, maxi = ari.min(), ari.max() if maxi < 1: shapes[i] = [maxi, 1] elif mini > 1: shapes[i] = [1, 1 / mini] self.batch_shapes = np.ceil(np.array(shapes) * img_size / stride + pad).astype(int) * stride # Cache images into RAM/disk for faster training if cache_images == 'ram' and not self.check_cache_ram(prefix=prefix): cache_images = False self.ims = [None] * n self.npy_files = [Path(f).with_suffix('.npy') for f in self.im_files] if cache_images: b, gb = 0, 1 << 30 # bytes of cached images, bytes per gigabytes self.im_hw0, self.im_hw = [None] * n, [None] * n fcn = self.cache_images_to_disk if cache_images == 'disk' else self.load_image results = ThreadPool(NUM_THREADS).imap(fcn, range(n)) pbar = tqdm(enumerate(results), total=n, bar_format=TQDM_BAR_FORMAT, disable=LOCAL_RANK > 0) for i, x in pbar: if cache_images == 'disk': b += self.npy_files[i].stat().st_size else: # 'ram' self.ims[i], self.im_hw0[i], self.im_hw[i] = x # im, hw_orig, hw_resized = load_image(self, i) b += self.ims[i].nbytes pbar.desc = f'{prefix}Caching images ({b / gb:.1f}GB {cache_images})' pbar.close() def check_cache_ram(self, safety_margin=0.1, prefix=''): # Check image caching requirements vs available memory b, gb = 0, 1 << 30 # bytes of cached images, bytes per gigabytes n = min(self.n, 30) # extrapolate from 30 random images for _ in range(n): im = cv2.imread(random.choice(self.im_files)) # sample image ratio = self.img_size / max(im.shape[0], im.shape[1]) # max(h, w) # ratio b += im.nbytes * ratio ** 2 mem_required = b * self.n / n # GB required to cache dataset into RAM mem = 
psutil.virtual_memory() cache = mem_required * (1 + safety_margin) < mem.available # to cache or not to cache, that is the question if not cache: LOGGER.info(f'{prefix}{mem_required / gb:.1f}GB RAM required, ' f'{mem.available / gb:.1f}/{mem.total / gb:.1f}GB available, ' f"{'caching images ✅' if cache else 'not caching images ⚠️'}") return cache def cache_labels(self, path=Path('./labels.cache'), prefix=''): # Cache dataset labels, check images and read shapes x = {} # dict nm, nf, ne, nc, msgs = 0, 0, 0, 0, [] # number missing, found, empty, corrupt, messages desc = f'{prefix}Scanning {path.parent / path.stem}...' with Pool(NUM_THREADS) as pool: pbar = tqdm(pool.imap(verify_image_label, zip(self.im_files, self.label_files, repeat(prefix))), desc=desc, total=len(self.im_files), bar_format=TQDM_BAR_FORMAT) for im_file, lb, shape, segments, nm_f, nf_f, ne_f, nc_f, msg in pbar: nm += nm_f nf += nf_f ne += ne_f nc += nc_f if im_file: x[im_file] = [lb, shape, segments] if msg: msgs.append(msg) pbar.desc = f'{desc} {nf} images, {nm + ne} backgrounds, {nc} corrupt' pbar.close() if msgs: LOGGER.info('\n'.join(msgs)) if nf == 0: LOGGER.warning(f'{prefix}WARNING ⚠️ No labels found in {path}. 
{HELP_URL}') x['hash'] = get_hash(self.label_files + self.im_files) x['results'] = nf, nm, ne, nc, len(self.im_files) x['msgs'] = msgs # warnings x['version'] = self.cache_version # cache version try: np.save(path, x) # save cache for next time path.with_suffix('.cache.npy').rename(path) # remove .npy suffix LOGGER.info(f'{prefix}New cache created: {path}') except Exception as e: LOGGER.warning(f'{prefix}WARNING ⚠️ Cache directory {path.parent} is not writeable: {e}') # not writeable return x def __len__(self): return len(self.im_files) # def __iter__(self): # self.count = -1 # print('ran dataset iter') # #self.shuffled_vector = np.random.permutation(self.nF) if self.augment else np.arange(self.nF) # return self def __getitem__(self, index): index = self.indices[index] # linear, shuffled, or image_weights hyp = self.hyp mosaic = self.mosaic and random.random() < hyp['mosaic'] if mosaic: # Load mosaic img, labels = self.load_mosaic(index) shapes = None # MixUp augmentation if random.random() < hyp['mixup']: img, labels = mixup(img, labels, *self.load_mosaic(random.randint(0, self.n - 1))) else: # Load image img, (h0, w0), (h, w) = self.load_image(index) # Letterbox shape = self.batch_shapes[self.batch[index]] if self.rect else self.img_size # final letterboxed shape img, ratio, pad = letterbox(img, shape, auto=False, scaleup=self.augment) shapes = (h0, w0), ((h / h0, w / w0), pad) # for COCO mAP rescaling labels = self.labels[index].copy() if labels.size: # normalized xywh to pixel xyxy format labels[:, 1:] = xywhn2xyxy(labels[:, 1:], ratio[0] * w, ratio[1] * h, padw=pad[0], padh=pad[1]) if self.augment: img, labels = random_perspective(img, labels, degrees=hyp['degrees'], translate=hyp['translate'], scale=hyp['scale'], shear=hyp['shear'], perspective=hyp['perspective']) nl = len(labels) # number of labels if nl: labels[:, 1:5] = xyxy2xywhn(labels[:, 1:5], w=img.shape[1], h=img.shape[0], clip=True, eps=1E-3) if self.augment: # Albumentations img, labels = 
def load_image(self, i):
    # Load the image at dataset index 'i' and return (im, original hw, resized hw).
    # Source priority: RAM cache (self.ims), then the *.npy file next to the image, then the image file itself.
    cached = self.ims[i]
    img_path = self.im_files[i]
    npy_path = self.npy_files[i]

    if cached is not None:  # already cached in RAM together with its recorded shapes
        return self.ims[i], self.im_hw0[i], self.im_hw[i]  # im, hw_original, hw_resized

    if npy_path.exists():  # load npy
        image = np.load(npy_path)
    else:  # read image
        image = cv2.imread(img_path)  # BGR
        assert image is not None, f'Image Not Found {img_path}'

    orig_h, orig_w = image.shape[:2]  # orig hw
    scale = self.img_size / max(orig_h, orig_w)  # ratio that maps the longer side onto img_size
    if scale != 1:  # if sizes are not equal
        if self.augment or scale > 1:
            interp = cv2.INTER_LINEAR
        else:
            interp = cv2.INTER_AREA
        target_wh = (math.ceil(orig_w * scale), math.ceil(orig_h * scale))
        image = cv2.resize(image, target_wh, interpolation=interp)
    return image, (orig_h, orig_w), image.shape[:2]  # im, hw_original, hw_resized
Loads 1 image + 3 random images into a 4-image mosaic labels4, segments4 = [], [] s = self.img_size yc, xc = (int(random.uniform(-x, 2 * s + x)) for x in self.mosaic_border) # mosaic center x, y indices = [index] + random.choices(self.indices, k=3) # 3 additional image indices random.shuffle(indices) for i, index in enumerate(indices): # Load image img, _, (h, w) = self.load_image(index) # place img in img4 if i == 0: # top left img4 = np.full((s * 2, s * 2, img.shape[2]), 114, dtype=np.uint8) # base image with 4 tiles x1a, y1a, x2a, y2a = max(xc - w, 0), max(yc - h, 0), xc, yc # xmin, ymin, xmax, ymax (large image) x1b, y1b, x2b, y2b = w - (x2a - x1a), h - (y2a - y1a), w, h # xmin, ymin, xmax, ymax (small image) elif i == 1: # top right x1a, y1a, x2a, y2a = xc, max(yc - h, 0), min(xc + w, s * 2), yc x1b, y1b, x2b, y2b = 0, h - (y2a - y1a), min(w, x2a - x1a), h elif i == 2: # bottom left x1a, y1a, x2a, y2a = max(xc - w, 0), yc, xc, min(s * 2, yc + h) x1b, y1b, x2b, y2b = w - (x2a - x1a), 0, w, min(y2a - y1a, h) elif i == 3: # bottom right x1a, y1a, x2a, y2a = xc, yc, min(xc + w, s * 2), min(s * 2, yc + h) x1b, y1b, x2b, y2b = 0, 0, min(w, x2a - x1a), min(y2a - y1a, h) img4[y1a:y2a, x1a:x2a] = img[y1b:y2b, x1b:x2b] # img4[ymin:ymax, xmin:xmax] padw = x1a - x1b padh = y1a - y1b # Labels labels, segments = self.labels[index].copy(), self.segments[index].copy() if labels.size: labels[:, 1:] = xywhn2xyxy(labels[:, 1:], w, h, padw, padh) # normalized xywh to pixel xyxy format segments = [xyn2xy(x, w, h, padw, padh) for x in segments] labels4.append(labels) segments4.extend(segments) # Concat/clip labels labels4 = np.concatenate(labels4, 0) for x in (labels4[:, 1:], *segments4): np.clip(x, 0, 2 * s, out=x) # clip when using random_perspective() # img4, labels4 = replicate(img4, labels4) # replicate # Augment img4, labels4, segments4 = copy_paste(img4, labels4, segments4, p=self.hyp['copy_paste']) img4, labels4 = random_perspective(img4, labels4, segments4, 
degrees=self.hyp['degrees'], translate=self.hyp['translate'], scale=self.hyp['scale'], shear=self.hyp['shear'], perspective=self.hyp['perspective'], border=self.mosaic_border) # border to remove return img4, labels4 def load_mosaic9(self, index): # YOLOv5 9-mosaic loader. Loads 1 image + 8 random images into a 9-image mosaic labels9, segments9 = [], [] s = self.img_size indices = [index] + random.choices(self.indices, k=8) # 8 additional image indices random.shuffle(indices) hp, wp = -1, -1 # height, width previous for i, index in enumerate(indices): # Load image img, _, (h, w) = self.load_image(index) # place img in img9 if i == 0: # center img9 = np.full((s * 3, s * 3, img.shape[2]), 114, dtype=np.uint8) # base image with 4 tiles h0, w0 = h, w c = s, s, s + w, s + h # xmin, ymin, xmax, ymax (base) coordinates elif i == 1: # top c = s, s - h, s + w, s elif i == 2: # top right c = s + wp, s - h, s + wp + w, s elif i == 3: # right c = s + w0, s, s + w0 + w, s + h elif i == 4: # bottom right c = s + w0, s + hp, s + w0 + w, s + hp + h elif i == 5: # bottom c = s + w0 - w, s + h0, s + w0, s + h0 + h elif i == 6: # bottom left c = s + w0 - wp - w, s + h0, s + w0 - wp, s + h0 + h elif i == 7: # left c = s - w, s + h0 - h, s, s + h0 elif i == 8: # top left c = s - w, s + h0 - hp - h, s, s + h0 - hp padx, pady = c[:2] x1, y1, x2, y2 = (max(x, 0) for x in c) # allocate coords # Labels labels, segments = self.labels[index].copy(), self.segments[index].copy() if labels.size: labels[:, 1:] = xywhn2xyxy(labels[:, 1:], w, h, padx, pady) # normalized xywh to pixel xyxy format segments = [xyn2xy(x, w, h, padx, pady) for x in segments] labels9.append(labels) segments9.extend(segments) # Image img9[y1:y2, x1:x2] = img[y1 - pady:, x1 - padx:] # img9[ymin:ymax, xmin:xmax] hp, wp = h, w # height, width previous # Offset yc, xc = (int(random.uniform(0, s)) for _ in self.mosaic_border) # mosaic center x, y img9 = img9[yc:yc + 2 * s, xc:xc + 2 * s] # Concat/clip labels labels9 = 
@staticmethod
def collate_fn(batch):
    # Merge a list of (im, label, path, shapes) samples into one batch.
    # Column 0 of every label tensor is overwritten in place with the sample's position in the batch.
    images, targets, paths, shapes = zip(*batch)  # transposed
    for image_index, target in enumerate(targets):
        target[:, 0] = image_index  # add target image index for build_targets()
    stacked_images = torch.stack(images, 0)
    all_targets = torch.cat(targets, 0)
    return stacked_images, all_targets, paths, shapes
def extract_boxes(path=DATASETS_DIR / 'coco128'):  # from utils.dataloaders import *; extract_boxes()
    """
    Convert a detection dataset into a classification dataset, with one directory per class.

    Every labelled box of every image found recursively under `path` is padded (x1.2 + 3 px), clipped to the image
    and written to `path/classifier/<class>/<path.stem>_<image stem>_<box index>.jpg`.

    Arguments
        path: Dataset directory that is searched recursively for images

    Raises
        AssertionError: if an image with a label file cannot be read or a crop cannot be written
    """
    path = Path(path)  # images dir
    out_dir = path / 'classifier'  # directory the crops are written to
    if out_dir.is_dir():
        # Remove crops of a previous run. The directory removed must be the one written below, otherwise old crops
        # pile up and are picked up again by the recursive scan.
        shutil.rmtree(out_dir)

    files = list(path.rglob('*.*'))
    for im_file in tqdm(files, total=len(files)):
        if im_file.suffix[1:].lower() not in IMG_FORMATS:  # case-insensitive, as in autosplit()
            continue

        # labels
        lb_file = Path(img2label_paths([str(im_file)])[0])
        if not lb_file.exists():
            continue  # nothing to crop, so the image is not even decoded

        # image, kept in OpenCV's native BGR order because cv2.imwrite() interprets its input as BGR
        im = cv2.imread(str(im_file))
        assert im is not None, f'Image Not Found {im_file}'
        h, w = im.shape[:2]

        with open(lb_file) as f:
            lb = np.array([x.split() for x in f.read().strip().splitlines()], dtype=np.float32)  # labels

        for j, x in enumerate(lb):
            c = int(x[0])  # class
            crop_file = out_dir / f'{c}' / f'{path.stem}_{im_file.stem}_{j}.jpg'  # new filename
            crop_file.parent.mkdir(parents=True, exist_ok=True)

            b = x[1:] * [w, h, w, h]  # box, normalized xywh to pixels
            # b[2:] = b[2:].max()  # rectangle to square
            b[2:] = b[2:] * 1.2 + 3  # pad
            b = xywh2xyxy(b.reshape(-1, 4)).ravel().astype(int)

            b[[0, 2]] = np.clip(b[[0, 2]], 0, w)  # clip boxes outside of image
            b[[1, 3]] = np.clip(b[[1, 3]], 0, h)
            assert cv2.imwrite(str(crop_file), im[b[1]:b[3], b[0]:b[2]]), f'box failure in {crop_file}'
def verify_image_label(args):
    """
    Verify one image-label pair.

    Arguments
        args: 3-tuple (im_file, lb_file, prefix); packed into one argument so the function can be mapped over a pool
              (see LoadImagesAndLabels.cache_labels). prefix is prepended to warning messages.

    Returns
        On success the tuple (im_file, lb, shape, segments, nm, nf, ne, nc, msg), where lb is a float32 array with
        5 columns (zero rows when the label file is missing or empty) and nm/nf/ne/nc are 0/1 flags for label
        missing / found / empty and image-or-label corrupt.
        On any exception the list [None, None, None, None, nm, nf, ne, nc, msg] with nc = 1 and msg describing the
        error; nothing is raised to the caller.
    """
    # Verify one image-label pair
    im_file, lb_file, prefix = args
    nm, nf, ne, nc, msg, segments = 0, 0, 0, 0, '', []  # number (missing, found, empty, corrupt), message, segments
    try:
        # verify images
        im = Image.open(im_file)
        im.verify()  # PIL verify
        shape = exif_size(im)  # image size
        assert (shape[0] > 9) & (shape[1] > 9), f'image size {shape} <10 pixels'
        assert im.format.lower() in IMG_FORMATS, f'invalid image format {im.format}'
        if im.format.lower() in ('jpg', 'jpeg'):
            with open(im_file, 'rb') as f:
                f.seek(-2, 2)  # position 2 bytes before the end of the file
                if f.read() != b'\xff\xd9':  # corrupt JPEG
                    # Re-encode the image in place (overwrites im_file) and report it through msg
                    ImageOps.exif_transpose(Image.open(im_file)).save(im_file, 'JPEG', subsampling=0, quality=100)
                    msg = f'{prefix}WARNING ⚠️ {im_file}: corrupt JPEG restored and saved'

        # verify labels
        if os.path.isfile(lb_file):
            nf = 1  # label found
            with open(lb_file) as f:
                lb = [x.split() for x in f.read().strip().splitlines() if len(x)]  # one token list per non-empty row
                if any(len(x) > 6 for x in lb):  # is segment
                    classes = np.array([x[0] for x in lb], dtype=np.float32)
                    segments = [np.array(x[1:], dtype=np.float32).reshape(-1, 2) for x in lb]  # (cls, xy1...)
                    lb = np.concatenate((classes.reshape(-1, 1), segments2boxes(segments)), 1)  # (cls, xywh)
                lb = np.array(lb, dtype=np.float32)
            nl = len(lb)
            if nl:
                assert lb.shape[1] == 5, f'labels require 5 columns, {lb.shape[1]} columns detected'
                assert (lb >= 0).all(), f'negative label values {lb[lb < 0]}'
                assert (lb[:, 1:] <= 1).all(), f'non-normalized or out of bounds coordinates {lb[:, 1:][lb[:, 1:] > 1]}'
                _, i = np.unique(lb, axis=0, return_index=True)  # i: index of one representative per distinct row
                if len(i) < nl:  # duplicate row check
                    lb = lb[i]  # remove duplicates
                    if segments:
                        segments = [segments[x] for x in i]  # keep segments aligned with the surviving rows
                    msg = f'{prefix}WARNING ⚠️ {im_file}: {nl - len(i)} duplicate labels removed'
            else:
                ne = 1  # label empty
                lb = np.zeros((0, 5), dtype=np.float32)
        else:
            nm = 1  # label missing
            lb = np.zeros((0, 5), dtype=np.float32)
        return im_file, lb, shape, segments, nm, nf, ne, nc, msg
    except Exception as e:
        # Any failure above (including the asserts) marks the pair as corrupt instead of propagating
        nc = 1
        msg = f'{prefix}WARNING ⚠️ {im_file}: ignoring corrupt image/label: {e}'
        return [None, None, None, None, nm, nf, ne, nc, msg]
@staticmethod
def _find_yaml(dir):
    # Return the single dataset *.yaml inside 'dir'.
    # Top-level files win over nested ones; among several candidates only one named like the directory is accepted.
    files = list(dir.glob('*.yaml'))  # try root level first
    if not files:
        files = list(dir.rglob('*.yaml'))  # and then recursive
    assert files, f'No *.yaml file found in {dir}'

    if len(files) > 1:
        wanted_stem = dir.stem
        files = list(filter(lambda candidate: candidate.stem == wanted_stem, files))  # prefer dir-name matches
        assert files, f'Multiple *.yaml files found in {dir}, only 1 *.yaml file allowed'

    assert len(files) == 1, f'Multiple *.yaml files found: {files}, only 1 *.yaml file allowed in {dir}'
    (yaml_file,) = files
    return yaml_file
def _hub_ops(self, f, max_dim=1920):
    # HUB ops for 1 image 'f': shrink it so its longer side is at most max_dim and save it under self.im_dir.
    # PIL is tried first (JPEG, quality 50); if anything in that path raises, OpenCV is used instead.
    target = self.im_dir / Path(f).name  # dataset-hub image filename
    try:  # use PIL
        picture = Image.open(f)
        scale = max_dim / max(picture.height, picture.width)  # ratio
        if scale < 1.0:  # image too large
            new_size = (int(picture.width * scale), int(picture.height * scale))
            picture = picture.resize(new_size)
        picture.save(target, 'JPEG', quality=50, optimize=True)  # save
    except Exception as e:  # use OpenCV
        LOGGER.info(f'WARNING ⚠️ HUB ops PIL failure {f}: {e}')
        picture = cv2.imread(f)
        height, width = picture.shape[:2]
        scale = max_dim / max(height, width)  # ratio
        if scale < 1.0:  # image too large
            new_size = (int(width * scale), int(height * scale))
            picture = cv2.resize(picture, new_size, interpolation=cv2.INTER_AREA)
        cv2.imwrite(str(target), picture)
def process_images(self):
    """
    Compress the images of every configured split for Ultralytics HUB.

    For each of 'train', 'val' and 'test' present in self.data, the split is loaded with LoadImagesAndLabels and
    each image file is passed to self._hub_ops from a pool of NUM_THREADS worker threads.

    Returns
        self.im_dir, the directory the processed images are saved to
    """
    for split in 'train', 'val', 'test':
        if self.data.get(split) is None:
            continue  # split not defined for this dataset
        dataset = LoadImagesAndLabels(self.data[split])  # load dataset
        desc = f'{split} images'
        # The context manager tears the pool down after each split, so worker threads are not left behind
        with ThreadPool(NUM_THREADS) as pool:
            for _ in tqdm(pool.imap(self._hub_ops, dataset.im_files), total=dataset.n, desc=desc):
                pass  # results are unused; iterating drives the pool and the progress bar
    print(f'Done. All images saved to {self.im_dir}')
    return self.im_dir
def create_classification_dataloader(path,
                                     imgsz=224,
                                     batch_size=16,
                                     augment=True,
                                     cache=False,
                                     rank=-1,
                                     workers=8,
                                     shuffle=True):
    """
    Return an InfiniteDataLoader over a ClassificationDataset, to be used with the YOLOv5 classifier.

    Arguments
        path: Dataset root passed to ClassificationDataset
        imgsz: Image size passed to ClassificationDataset
        batch_size: Requested batch size, capped at the dataset length
        augment: Passed to ClassificationDataset
        cache: Passed to ClassificationDataset
        rank: -1 for non-distributed use; any other value enables a DistributedSampler
        workers: Upper bound on the number of dataloader workers
        shuffle: Shuffle samples (delegated to the sampler when one is used)
    """
    with torch_distributed_zero_first(rank):  # init dataset *.cache only once if DDP
        dataset = ClassificationDataset(root=path, imgsz=imgsz, augment=augment, cache=cache)
    batch_size = min(batch_size, len(dataset))
    nd = torch.cuda.device_count()
    cpus = os.cpu_count() or 1  # os.cpu_count() returns None when the count cannot be determined
    nw = min([cpus // max(nd, 1), batch_size if batch_size > 1 else 0, workers])  # number of workers
    sampler = None if rank == -1 else distributed.DistributedSampler(dataset, shuffle=shuffle)
    generator = torch.Generator()
    generator.manual_seed(6148914691236517205 + RANK)  # fixed base seed offset by RANK
    return InfiniteDataLoader(dataset,
                              batch_size=batch_size,
                              shuffle=shuffle and sampler is None,  # DataLoader forbids shuffle together with sampler
                              num_workers=nw,
                              sampler=sampler,
                              pin_memory=PIN_MEMORY,
                              worker_init_fn=seed_worker,
                              generator=generator)  # or DataLoader(persistent_workers=True)
================================================ FILE: yolo-improve/yolov5-AUX/utils/docker/Dockerfile ================================================ # YOLOv5 🚀 by Ultralytics, GPL-3.0 license # Builds ultralytics/yolov5:latest image on DockerHub https://hub.docker.com/r/ultralytics/yolov5 # Image is CUDA-optimized for YOLOv5 single/multi-GPU training and inference # Start FROM NVIDIA PyTorch image https://ngc.nvidia.com/catalog/containers/nvidia:pytorch # FROM docker.io/pytorch/pytorch:latest FROM pytorch/pytorch:latest # Downloads to user config dir ADD https://ultralytics.com/assets/Arial.ttf https://ultralytics.com/assets/Arial.Unicode.ttf /root/.config/Ultralytics/ # Install linux packages ENV DEBIAN_FRONTEND noninteractive RUN apt update RUN TZ=Etc/UTC apt install -y tzdata RUN apt install --no-install-recommends -y gcc git zip curl htop libgl1-mesa-glx libglib2.0-0 libpython3-dev gnupg # RUN alias python=python3 # Security updates # https://security.snyk.io/vuln/SNYK-UBUNTU1804-OPENSSL-3314796 RUN apt upgrade --no-install-recommends -y openssl # Create working directory RUN rm -rf /usr/src/app && mkdir -p /usr/src/app WORKDIR /usr/src/app # Copy contents # COPY . /usr/src/app (issues as not a .git directory) RUN git clone https://github.com/ultralytics/yolov5 /usr/src/app # Install pip packages COPY requirements.txt . RUN python3 -m pip install --upgrade pip wheel RUN pip install --no-cache -r requirements.txt albumentations comet gsutil notebook \ coremltools onnx onnx-simplifier onnxruntime 'openvino-dev>=2022.3' # tensorflow tensorflowjs \ # Set environment variables ENV OMP_NUM_THREADS=1 # Cleanup ENV DEBIAN_FRONTEND teletype # Usage Examples ------------------------------------------------------------------------------------------------------- # Build and Push # t=ultralytics/yolov5:latest && sudo docker build -f utils/docker/Dockerfile -t $t . 
&& sudo docker push $t # Pull and Run # t=ultralytics/yolov5:latest && sudo docker pull $t && sudo docker run -it --ipc=host --gpus all $t # Pull and Run with local directory access # t=ultralytics/yolov5:latest && sudo docker pull $t && sudo docker run -it --ipc=host --gpus all -v "$(pwd)"/datasets:/usr/src/datasets $t # Kill all # sudo docker kill $(sudo docker ps -q) # Kill all image-based # sudo docker kill $(sudo docker ps -qa --filter ancestor=ultralytics/yolov5:latest) # DockerHub tag update # t=ultralytics/yolov5:latest tnew=ultralytics/yolov5:v6.2 && sudo docker pull $t && sudo docker tag $t $tnew && sudo docker push $tnew # Clean up # sudo docker system prune -a --volumes # Update Ubuntu drivers # https://www.maketecheasier.com/install-nvidia-drivers-ubuntu/ # DDP test # python -m torch.distributed.run --nproc_per_node 2 --master_port 1 train.py --epochs 3 # GCP VM from Image # docker.io/ultralytics/yolov5:latest ================================================ FILE: yolo-improve/yolov5-AUX/utils/docker/Dockerfile-arm64 ================================================ # YOLOv5 🚀 by Ultralytics, GPL-3.0 license # Builds ultralytics/yolov5:latest-arm64 image on DockerHub https://hub.docker.com/r/ultralytics/yolov5 # Image is aarch64-compatible for Apple M1 and other ARM architectures i.e. Jetson Nano and Raspberry Pi # Start FROM Ubuntu image https://hub.docker.com/_/ubuntu FROM arm64v8/ubuntu:rolling # Downloads to user config dir ADD https://ultralytics.com/assets/Arial.ttf https://ultralytics.com/assets/Arial.Unicode.ttf /root/.config/Ultralytics/ # Install linux packages ENV DEBIAN_FRONTEND noninteractive RUN apt update RUN TZ=Etc/UTC apt install -y tzdata RUN apt install --no-install-recommends -y python3-pip git zip curl htop gcc libgl1-mesa-glx libglib2.0-0 libpython3-dev # RUN alias python=python3 # Install pip packages COPY requirements.txt . 
RUN python3 -m pip install --upgrade pip wheel RUN pip install --no-cache -r requirements.txt albumentations gsutil notebook \ coremltools onnx onnxruntime # tensorflow-aarch64 tensorflowjs \ # Create working directory RUN mkdir -p /usr/src/app WORKDIR /usr/src/app # Copy contents # COPY . /usr/src/app (issues as not a .git directory) RUN git clone https://github.com/ultralytics/yolov5 /usr/src/app ENV DEBIAN_FRONTEND teletype # Usage Examples ------------------------------------------------------------------------------------------------------- # Build and Push # t=ultralytics/yolov5:latest-arm64 && sudo docker build --platform linux/arm64 -f utils/docker/Dockerfile-arm64 -t $t . && sudo docker push $t # Pull and Run # t=ultralytics/yolov5:latest-arm64 && sudo docker pull $t && sudo docker run -it --ipc=host -v "$(pwd)"/datasets:/usr/src/datasets $t ================================================ FILE: yolo-improve/yolov5-AUX/utils/docker/Dockerfile-cpu ================================================ # YOLOv5 🚀 by Ultralytics, GPL-3.0 license # Builds ultralytics/yolov5:latest-cpu image on DockerHub https://hub.docker.com/r/ultralytics/yolov5 # Image is CPU-optimized for ONNX, OpenVINO and PyTorch YOLOv5 deployments # Start FROM Ubuntu image https://hub.docker.com/_/ubuntu FROM ubuntu:rolling # Downloads to user config dir ADD https://ultralytics.com/assets/Arial.ttf https://ultralytics.com/assets/Arial.Unicode.ttf /root/.config/Ultralytics/ # Install linux packages ENV DEBIAN_FRONTEND noninteractive RUN apt update RUN TZ=Etc/UTC apt install -y tzdata RUN apt install --no-install-recommends -y python3-pip git zip curl htop libgl1-mesa-glx libglib2.0-0 libpython3-dev gnupg # RUN alias python=python3 # Install pip packages COPY requirements.txt . 
def is_url(url, check=True):
    """Return True if `url` parses as a URL and, when `check` is set, answers with HTTP 200.

    Args:
        url: Any object; it is converted with str() before parsing.
        check: When True the URL is opened and a 200 status code is required.

    Returns:
        bool: False for strings without scheme/netloc and for URLs that cannot be opened.
    """
    # Import the submodules explicitly: a bare `import urllib` does not guarantee that
    # urllib.parse / urllib.request / urllib.error have been loaded.
    import urllib.error
    import urllib.parse
    import urllib.request

    try:
        url = str(url)
        result = urllib.parse.urlparse(url)
        if not (result.scheme and result.netloc):  # not a URL
            return False
        if not check:
            return True
        with urllib.request.urlopen(url) as response:  # check if exists online
            return response.getcode() == 200
    except (ValueError, urllib.error.URLError):  # URLError is the base class of HTTPError
        return False


def gsutil_getsize(url=''):
    """Return the size in bytes reported by `gsutil du` for a gs://bucket/file url, or 0 if nothing is printed.

    https://cloud.google.com/storage/docs/gsutil/commands/du
    """
    # With shell=True on POSIX only the first list item is executed as the command, so 'du' and the
    # url would never reach gsutil. The shell is kept on Windows only, where an argument list is
    # joined into one command line (and where gsutil is typically a .cmd wrapper).
    output = subprocess.check_output(['gsutil', 'du', url], shell=os.name == 'nt', encoding='utf-8')
    if output:
        return int(output.split()[0])
    return 0


def url_getsize(url='https://ultralytics.com/images/bus.jpg'):
    """Return the 'content-length' header of a HEAD request to `url` as int, or -1 when the header is absent."""
    response = requests.head(url, allow_redirects=True)
    return int(response.headers.get('content-length', -1))


def curl_download(url, filename, *, silent: bool = False) -> bool:
    """
    Download a file from a url to a filename using curl.

    curl is asked to retry up to 9 times and to resume partial downloads ('-C -').
    Returns True when curl exits with status 0.
    """
    silent_option = 'sS' if silent else ''  # silent
    proc = subprocess.run([
        'curl',
        '-#',
        f'-{silent_option}L',
        url,
        '--output',
        filename,
        '--retry',
        '9',
        '-C',
        '-',])
    return proc.returncode == 0


def safe_download(file, url, url2=None, min_bytes=1E0, error_msg=''):
    """Download `url` to `file`, falling back to curl on `url2 or url`, and delete results smaller than `min_bytes`.

    Failures are logged, not raised: after the call the file either exists with at least
    `min_bytes` bytes or has been removed.
    """
    from utils.general import LOGGER

    file = Path(file)
    assert_msg = f"Downloaded file '{file}' does not exist or size is < min_bytes={min_bytes}"
    try:  # url1
        LOGGER.info(f'Downloading {url} to {file}...')
        torch.hub.download_url_to_file(url, str(file), progress=LOGGER.level <= logging.INFO)
        assert file.exists() and file.stat().st_size > min_bytes, assert_msg  # check
    except Exception as e:  # url2
        if file.exists():
            file.unlink()  # remove partial downloads
        LOGGER.info(f'ERROR: {e}\nRe-attempting {url2 or url} to {file}...')
        # curl download, retry and resume on fail
        curl_download(url2 or url, file)
    finally:
        if not file.exists() or file.stat().st_size < min_bytes:  # check
            if file.exists():
                file.unlink()  # remove partial downloads
            LOGGER.info(f'ERROR: {assert_msg}\n{error_msg}')
        LOGGER.info('')


def attempt_download(file, repo='ultralytics/yolov5', release='v7.0'):
    """Attempt to download `file` if it is not found locally and return its path as str.

    A http(s) url is downloaded to its basename in the current directory. Any other missing
    name is looked up among the GitHub release assets of `repo` (release = 'latest', 'v7.0', etc.).
    """
    import urllib.parse  # submodule is not guaranteed to be loaded by a bare `import urllib`

    from utils.general import LOGGER

    def github_assets(repository, version='latest'):
        # Return GitHub repo tag (i.e. 'v7.0') and assets (i.e. ['yolov5s.pt', 'yolov5m.pt', ...])
        if version != 'latest':
            version = f'tags/{version}'  # i.e. tags/v7.0
        response = requests.get(f'https://api.github.com/repos/{repository}/releases/{version}').json()  # github api
        return response['tag_name'], [x['name'] for x in response['assets']]  # tag, assets

    file = Path(str(file).strip().replace("'", ''))
    if not file.exists():
        # URL specified
        name = Path(urllib.parse.unquote(str(file))).name  # decode '%2F' to '/' etc.
        if str(file).startswith(('http:/', 'https:/')):  # download
            url = str(file).replace(':/', '://')  # Pathlib turns :// -> :/
            file = name.split('?')[0]  # parse authentication https://url.com/file.txt?auth...
            if Path(file).is_file():
                LOGGER.info(f'Found {url} locally at {file}')  # file already exists
            else:
                safe_download(file=file, url=url, min_bytes=1E5)
            return file

        # GitHub assets
        assets = [f'yolov5{size}{suffix}.pt' for size in 'nsmlx' for suffix in ('', '6', '-cls', '-seg')]  # default
        try:
            tag, assets = github_assets(repo, release)
        except Exception:
            try:
                tag, assets = github_assets(repo)  # latest release
            except Exception:
                try:
                    tag = subprocess.check_output('git tag', shell=True, stderr=subprocess.STDOUT).decode().split()[-1]
                except Exception:
                    tag = release  # last resort: trust the requested release name

        file.parent.mkdir(parents=True, exist_ok=True)  # make parent dir (if required)
        if name in assets:
            safe_download(file,
                          url=f'https://github.com/{repo}/releases/download/{tag}/{name}',
                          min_bytes=1E5,
                          error_msg=f'{file} missing, try downloading from https://github.com/{repo}/releases/{tag}')

    return str(file)
app = Flask(__name__)
models = {}  # model name -> loaded model, filled in the __main__ section below

# The <model> URL variable is what Flask passes to predict() as its `model` argument;
# without it the view would be called with no arguments and fail.
DETECTION_URL = '/v1/object-detection/<model>'


@app.route(DETECTION_URL, methods=['POST'])
def predict(model):
    """Run the model named in the URL on the uploaded 'image' file and return detections as JSON records.

    Responds with 400 when no 'image' file part is sent and 404 when `model` is not a key of `models`.
    """
    im_file = request.files.get('image')
    if not im_file:
        return {'error': "no 'image' file in request"}, 400
    if model not in models:
        return {'error': f"model '{model}' is not loaded"}, 404

    im = Image.open(io.BytesIO(im_file.read()))
    results = models[model](im, size=640)  # reduce size=320 for faster inference
    return results.pandas().xyxy[0].to_json(orient='records')


if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Flask API exposing YOLOv5 model')
    parser.add_argument('--port', default=5000, type=int, help='port number')
    parser.add_argument('--model', nargs='+', default=['yolov5s'], help='model(s) to run, i.e. --model yolov5n yolov5s')
    opt = parser.parse_args()

    # Load every requested model once at startup so requests only pay for inference
    for m in opt.model:
        models[m] = torch.hub.load('ultralytics/yolov5', m, force_reload=True, skip_validation=True)

    app.run(host='0.0.0.0', port=opt.port)  # debug=True causes Restarting with stat
FILE = Path(__file__).resolve()
ROOT = FILE.parents[1]  # YOLOv5 root directory
RANK = int(os.getenv('RANK', -1))

# Settings
# os.cpu_count() may return None when the count cannot be determined, so fall back to 1
NUM_THREADS = min(8, max(1, (os.cpu_count() or 1) - 1))  # number of YOLOv5 multiprocessing threads
DATASETS_DIR = Path(os.getenv('YOLOv5_DATASETS_DIR', ROOT.parent / 'datasets'))  # global datasets directory
AUTOINSTALL = str(os.getenv('YOLOv5_AUTOINSTALL', True)).lower() == 'true'  # global auto-install mode
VERBOSE = str(os.getenv('YOLOv5_VERBOSE', True)).lower() == 'true'  # global verbose mode
TQDM_BAR_FORMAT = '{l_bar}{bar:10}{r_bar}'  # tqdm bar format
FONT = 'Arial.ttf'  # https://ultralytics.com/assets/Arial.ttf

torch.set_printoptions(linewidth=320, precision=5, profile='long')
np.set_printoptions(linewidth=320, formatter={'float_kind': '{:11.5g}'.format})  # format short g, %precision=5
pd.options.display.max_columns = 10
cv2.setNumThreads(0)  # prevent OpenCV from multithreading (incompatible with PyTorch DataLoader)
os.environ['NUMEXPR_MAX_THREADS'] = str(NUM_THREADS)  # NumExpr max threads
# platform.system() reports macOS as 'Darwin' (capitalised), so the lowercase comparison never matched
os.environ['OMP_NUM_THREADS'] = '1' if platform.system() == 'Darwin' else str(NUM_THREADS)  # OpenMP (PyTorch and SciPy)


def is_ascii(s=''):
    """Return True if str(s) consists of ASCII characters only (note str().isascii() introduced in python 3.7)."""
    s = str(s)  # convert list, tuple, None, etc. to str
    # Non-ASCII bytes are dropped by the 'ignore' handler, so the lengths differ if any were present
    return len(s.encode().decode('ascii', 'ignore')) == len(s)


def is_chinese(s='人工智能'):
    """Return True if str(s) contains at least one character in the range U+4E00..U+9FFF."""
    return bool(re.search('[\u4e00-\u9fff]', str(s)))


def is_colab():
    """Return True if the 'google.colab' module has been imported, i.e. the environment is a Google Colab instance."""
    return 'google.colab' in sys.modules
def is_kaggle():
    """Return True if PWD and KAGGLE_URL_BASE hold the values set inside a Kaggle Notebook."""
    return os.environ.get('PWD') == '/kaggle/working' and os.environ.get('KAGGLE_URL_BASE') == 'https://www.kaggle.com'


def is_docker() -> bool:
    """Check if the process runs inside a docker container."""
    if Path('/.dockerenv').exists():
        return True
    try:  # check if docker is in control groups
        with open('/proc/self/cgroup') as file:
            return any('docker' in line for line in file)
    except OSError:
        return False


def is_writeable(dir, test=False):
    """Return True if directory `dir` has write permissions.

    With test=False this is os.access(dir, os.W_OK); with test=True a file 'tmp.txt' is
    actually created inside `dir` and removed again.
    """
    if not test:
        return os.access(dir, os.W_OK)  # possible issues on Windows
    file = Path(dir) / 'tmp.txt'
    try:
        with open(file, 'w'):  # open file with write permissions
            pass
        file.unlink()  # remove file
        return True
    except OSError:
        return False


LOGGING_NAME = 'yolov5'


def set_logging(name=LOGGING_NAME, verbose=True):
    """Configure a stream-handler logger called `name`.

    The level is INFO when `verbose` is set and the RANK environment variable is -1 or 0
    (or unset), otherwise ERROR.
    """
    rank = int(os.getenv('RANK', -1))  # rank in world for Multi-GPU trainings
    level = logging.INFO if verbose and rank in {-1, 0} else logging.ERROR
    logging.config.dictConfig({
        'version': 1,
        'disable_existing_loggers': False,
        'formatters': {
            name: {
                'format': '%(message)s'}},
        'handlers': {
            name: {
                'class': 'logging.StreamHandler',
                'formatter': name,
                'level': level,}},
        'loggers': {
            name: {
                'level': level,
                'handlers': [name],
                'propagate': False,}}})


set_logging(LOGGING_NAME)  # run before defining LOGGER
LOGGER = logging.getLogger(LOGGING_NAME)  # define globally (used in train.py, val.py, detect.py, etc.)
if platform.system() == 'Windows':
    for fn in LOGGER.info, LOGGER.warning:
        # Bind fn as a default argument. A plain closure looks `fn` up when the lambda is called,
        # i.e. after the loop has finished, so both wrappers would forward to LOGGER.warning.
        setattr(LOGGER, fn.__name__, lambda x, fn=fn: fn(emojis(x)))  # emoji safe logging
class Profile(contextlib.ContextDecorator):
    """YOLOv5 Profile class. Usage: @Profile() decorator or 'with Profile():' context manager.

    After each use `dt` holds the duration of the last block and `t` the accumulated total (seconds).
    """

    def __init__(self, t=0.0):
        self.t = t  # accumulated time
        self.cuda = torch.cuda.is_available()

    def __enter__(self):
        self.start = self.time()
        return self

    def __exit__(self, type, value, traceback):
        self.dt = self.time() - self.start  # delta-time
        self.t += self.dt  # accumulate dt

    def time(self):
        if self.cuda:
            torch.cuda.synchronize()  # wait for queued CUDA work so it is included in the measurement
        return time.time()


class Timeout(contextlib.ContextDecorator):
    """YOLOv5 Timeout class. Usage: @Timeout(seconds) decorator or 'with Timeout(seconds):' context manager.

    Uses SIGALRM, so it has no effect on Windows. The SIGALRM handler that was installed
    before entering is put back on exit.
    """

    def __init__(self, seconds, *, timeout_msg='', suppress_timeout_errors=True):
        self.seconds = int(seconds)
        self.timeout_message = timeout_msg
        self.suppress = bool(suppress_timeout_errors)
        self._previous_handler = None  # handler to restore in __exit__

    def _timeout_handler(self, signum, frame):
        raise TimeoutError(self.timeout_message)

    def __enter__(self):
        if platform.system() != 'Windows':  # not supported on Windows
            # signal.signal() returns the handler it replaces; remember it for __exit__
            self._previous_handler = signal.signal(signal.SIGALRM, self._timeout_handler)  # Set handler for SIGALRM
            signal.alarm(self.seconds)  # start countdown for SIGALRM to be raised

    def __exit__(self, exc_type, exc_val, exc_tb):
        if platform.system() != 'Windows':
            signal.alarm(0)  # Cancel SIGALRM if it's scheduled
            if self._previous_handler is not None:  # None means the old handler was not installed from Python
                signal.signal(signal.SIGALRM, self._previous_handler)
                self._previous_handler = None
            if self.suppress and exc_type is TimeoutError:  # Suppress TimeoutError
                return True


class WorkingDirectory(contextlib.ContextDecorator):
    """Usage: @WorkingDirectory(dir) decorator or 'with WorkingDirectory(dir):' context manager.

    Changes into `new_dir` on entry and back to the directory that was current when the
    object was constructed on exit.
    """

    def __init__(self, new_dir):
        self.dir = new_dir  # new dir
        self.cwd = Path.cwd().resolve()  # current dir

    def __enter__(self):
        os.chdir(self.dir)

    def __exit__(self, exc_type, exc_val, exc_tb):
        os.chdir(self.cwd)


def methods(instance):
    """Return the names of all callable attributes of `instance` that do not start with '__'."""
    return [f for f in dir(instance) if callable(getattr(instance, f)) and not f.startswith('__')]


def print_args(args: Optional[dict] = None, show_file=True, show_func=False):
    """Log the arguments of the calling function (or the optional `args` dict) as 'k=v' pairs.

    `show_file` / `show_func` prefix the message with the caller's file (relative to ROOT
    when possible) and function name.
    """
    x = inspect.currentframe().f_back  # previous frame
    file, _, func, _, _ = inspect.getframeinfo(x)
    if args is None:  # get args automatically
        args, _, _, frm = inspect.getargvalues(x)
        args = {k: v for k, v in frm.items() if k in args}
    try:
        file = Path(file).resolve().relative_to(ROOT).with_suffix('')
    except ValueError:  # caller lives outside ROOT
        file = Path(file).stem
    s = (f'{file}: ' if show_file else '') + (f'{func}: ' if show_func else '')
    LOGGER.info(colorstr(s) + ', '.join(f'{k}={v}' for k, v in args.items()))
def intersect_dicts(da, db, exclude=()):
    """Return the items of `da` whose key is also in `db`, contains no `exclude` substring and whose
    value has the same .shape as the value in `db`."""
    shared = {}
    for key, value in da.items():
        if key not in db:
            continue
        if any(pattern in key for pattern in exclude):
            continue
        if value.shape == db[key].shape:
            shared[key] = value  # values are taken from da
    return shared


def get_default_args(func):
    """Return a dict mapping each parameter of `func` that has a default to that default value."""
    defaults = {}
    for name, param in inspect.signature(func).parameters.items():
        if param.default is not inspect.Parameter.empty:
            defaults[name] = param.default
    return defaults


def get_latest_run(search_dir='.'):
    """Return the path of the most recent 'last*.pt' found recursively under `search_dir` (i.e. to --resume
    from), or '' if there is none."""
    candidates = glob.glob(f'{search_dir}/**/last*.pt', recursive=True)
    if not candidates:
        return ''
    return max(candidates, key=os.path.getctime)


def file_age(path=__file__):
    """Return the number of whole days since `path` was last modified."""
    modified = datetime.fromtimestamp(Path(path).stat().st_mtime)
    elapsed = datetime.now() - modified  # delta
    return elapsed.days  # + elapsed.seconds / 86400  # fractional days
def file_size(path):
    """Return file/dir size (MB), or 0.0 if `path` is neither a file nor a directory."""
    mb = 1 << 20  # bytes to MiB (1024 ** 2)
    path = Path(path)
    if path.is_file():
        return path.stat().st_size / mb
    elif path.is_dir():
        return sum(f.stat().st_size for f in path.glob('**/*') if f.is_file()) / mb
    else:
        return 0.0


def check_online():
    """Check internet connectivity by opening a TCP connection to 1.1.1.1:443 (5 s timeout)."""
    import socket

    def run_once():
        # Check once
        try:
            # create_connection() returns an open socket; close it so the probe does not leak a descriptor
            socket.create_connection(('1.1.1.1', 443), 5).close()  # check host accessibility
            return True
        except OSError:
            return False

    return run_once() or run_once()  # check twice to increase robustness to intermittent connectivity issues


def git_describe(path=ROOT):  # path must be a directory
    """Return human-readable git description, i.e. v5.0-5-g3e25f1e, or '' on any failure.

    https://git-scm.com/docs/git-describe
    """
    try:
        assert (Path(path) / '.git').is_dir()
        # An argument list (no shell) keeps paths containing spaces or shell metacharacters intact
        cmd = ['git', '-C', str(path), 'describe', '--tags', '--long', '--always']
        return check_output(cmd).decode()[:-1]  # strip trailing newline
    except Exception:
        return ''
@WorkingDirectory(ROOT)
def check_git_info(path='.'):
    # YOLOv5 git info check, return {remote, branch, commit}
    # Runs with ROOT as working directory (see decorator), so the default path '.' refers to ROOT.
    # All three values are None when `path` is not a git repository.
    check_requirements('gitpython')
    import git
    try:
        repo = git.Repo(path)
        remote = repo.remotes.origin.url.replace('.git', '')  # i.e. 'https://github.com/ultralytics/yolov5'
        commit = repo.head.commit.hexsha  # i.e. '3134699c73af83aac2a481435550b968d5792c0d'
        try:
            branch = repo.active_branch.name  # i.e. 'main'
        except TypeError:  # not on any branch
            branch = None  # i.e. 'detached HEAD' state
        return {'remote': remote, 'branch': branch, 'commit': commit}
    except git.exc.InvalidGitRepositoryError:  # path is not a git dir
        return {'remote': None, 'branch': None, 'commit': None}


def check_python(minimum='3.7.0'):
    # Check current python version vs. required python version
    # hard=True: check_version() asserts, so an older interpreter raises AssertionError
    check_version(platform.python_version(), minimum, name='Python ', hard=True)


def check_version(current='0.0.0', minimum='0.0.0', name='version ', pinned=False, hard=False, verbose=False):
    # Check version vs. required version
    # Returns True when current == minimum (pinned=True) or current >= minimum (pinned=False).
    # hard=True asserts the result; verbose=True logs a warning when the check fails.
    current, minimum = (pkg.parse_version(x) for x in (current, minimum))
    result = (current == minimum) if pinned else (current >= minimum)  # bool
    s = f'WARNING ⚠️ {name}{minimum} is required by YOLOv5, but {name}{current} is currently installed'  # string
    if hard:
        assert result, emojis(s)  # assert min requirements met
    if verbose and not result:
        LOGGER.warning(s)
    return result


@TryExcept()
def check_requirements(requirements=ROOT / 'requirements.txt', exclude=(), install=True, cmds=''):
    # Check installed dependencies meet YOLOv5 requirements (pass *.txt file or list of packages or single package str)
    # Unmet requirements are collected and, when `install` and the AUTOINSTALL setting are both true,
    # installed with a single `pip install` call; `cmds` is appended verbatim to that command line.
    prefix = colorstr('red', 'bold', 'requirements:')
    check_python()  # check python version
    if isinstance(requirements, Path):  # requirements.txt file
        file = requirements.resolve()
        assert file.exists(), f'{prefix} {file} not found, check failed.'
        with file.open() as f:
            requirements = [f'{x.name}{x.specifier}' for x in pkg.parse_requirements(f) if x.name not in exclude]
    elif isinstance(requirements, str):
        requirements = [requirements]

    s = ''  # space-separated, double-quoted requirement strings that are not satisfied
    n = 0  # number of unsatisfied requirements
    for r in requirements:
        try:
            pkg.require(r)
        except (pkg.VersionConflict, pkg.DistributionNotFound):  # exception if requirements not met
            s += f'"{r}" '
            n += 1

    if s and install and AUTOINSTALL:  # check environment variable
        LOGGER.info(f"{prefix} YOLOv5 requirement{'s' * (n > 1)} {s}not found, attempting AutoUpdate...")
        try:
            # assert check_online(), "AutoUpdate skipped (offline)"
            LOGGER.info(check_output(f'pip install {s} {cmds}', shell=True).decode())
            # NOTE: `file` only exists as a local when a Path was passed in, hence the locals() lookup;
            # do not introduce another local called `file` in this function
            source = file if 'file' in locals() else requirements
            s = f"{prefix} {n} package{'s' * (n > 1)} updated per {source}\n" \
                f"{prefix} ⚠️ {colorstr('bold', 'Restart runtime or rerun command for updates to take effect')}\n"
            LOGGER.info(s)
        except Exception as e:  # installation problems are logged, not raised
            LOGGER.warning(f'{prefix} ❌ {e}')
def check_imshow(warn=False):
    """Return True if a test window can be shown with cv2.imshow() outside notebooks and docker, else False
    (optionally logging a warning)."""
    try:
        assert not is_notebook()
        assert not is_docker()
        cv2.imshow('test', np.zeros((1, 1, 3)))
        cv2.waitKey(1)
        cv2.destroyAllWindows()
        cv2.waitKey(1)
    except Exception as e:
        if warn:
            LOGGER.warning(f'WARNING ⚠️ Environment does not support cv2.imshow() or PIL Image.show()\n{e}')
        return False
    else:
        return True


def check_suffix(file='yolov5s.pt', suffix=('.pt',), msg=''):
    """Assert that every file in `file` (single name, list or tuple) that has a suffix uses one of `suffix`."""
    if not (file and suffix):
        return
    if isinstance(suffix, str):
        suffix = [suffix]
    candidates = file if isinstance(file, (list, tuple)) else [file]
    for f in candidates:
        s = Path(f).suffix.lower()  # file suffix
        if len(s):  # names without suffix are accepted
            assert s in suffix, f'{msg}{f} acceptable suffix is {suffix}'


def check_yaml(file, suffix=('.yaml', '.yml')):
    """Search/download YAML file (if necessary) and return path, checking suffix."""
    return check_file(file, suffix)


def check_file(file, suffix=''):
    """Search/download file (if necessary) and return path.

    Existing paths and empty names are returned unchanged, http(s) urls are downloaded to
    their basename, 'clearml://' ids are passed through, anything else must match exactly
    one file under ROOT/data, ROOT/models or ROOT/utils.
    """
    check_suffix(file, suffix)  # optional
    file = str(file)  # convert to str()

    if not file or os.path.isfile(file):  # exists
        return file

    if file.startswith(('http:/', 'https:/')):  # download
        url = file  # warning: Pathlib turns :// -> :/
        file = Path(urllib.parse.unquote(file).split('?')[0]).name  # '%2F' to '/', split https://url.com/file.txt?auth
        if os.path.isfile(file):
            LOGGER.info(f'Found {url} locally at {file}')  # file already exists
        else:
            LOGGER.info(f'Downloading {url} to {file}...')
            torch.hub.download_url_to_file(url, file)
            assert Path(file).exists() and Path(file).stat().st_size > 0, f'File download failed: {url}'  # check
        return file

    if file.startswith('clearml://'):  # ClearML Dataset ID
        assert 'clearml' in sys.modules, "ClearML is not installed, so cannot use ClearML dataset. Try running 'pip install clearml'."
        return file

    # search
    files = []
    for d in 'data', 'models', 'utils':  # search directories
        files.extend(glob.glob(str(ROOT / d / '**' / file), recursive=True))  # find file
    assert len(files), f'File not found: {file}'  # assert file was found
    assert len(files) == 1, f"Multiple files match '{file}', specify exact path: {files}"  # assert unique
    return files[0]  # return file


def check_font(font=FONT, progress=False):
    """Download `font` into CONFIG_DIR unless it exists at the given path or in CONFIG_DIR already."""
    font = Path(font)
    file = CONFIG_DIR / font.name
    if font.exists() or file.exists():
        return
    url = f'https://ultralytics.com/assets/{font.name}'
    LOGGER.info(f'Downloading {url} to {file}...')
    torch.hub.download_url_to_file(url, str(file), progress=progress)
def check_amp(model):
    """Check PyTorch Automatic Mixed Precision (AMP) functionality. Return True on correct operation.

    Always returns False for models on 'cpu' or 'mps' devices. Otherwise FP32 and AMP
    inference results on a test image are compared, first for a copy of `model` and,
    if that comparison fails, for 'yolov5n.pt'.
    """
    from models.common import AutoShape, DetectMultiBackend

    def amp_allclose(model, im):
        # All close FP32 vs AMP results
        m = AutoShape(model, verbose=False)  # model
        a = m(im).xywhn[0]  # FP32 inference
        m.amp = True
        b = m(im).xywhn[0]  # AMP inference
        return a.shape == b.shape and torch.allclose(a, b, atol=0.1)  # close to 10% absolute tolerance

    prefix = colorstr('AMP: ')
    device = next(model.parameters()).device  # get model device
    if device.type in ('cpu', 'mps'):
        return False  # AMP only used on CUDA devices
    f = ROOT / 'data' / 'images' / 'bus.jpg'  # image to check
    # local image, else the online copy, else a synthetic all-ones image when offline
    im = f if f.exists() else 'https://ultralytics.com/images/bus.jpg' if check_online() else np.ones((640, 640, 3))
    try:
        assert amp_allclose(deepcopy(model), im) or amp_allclose(DetectMultiBackend('yolov5n.pt', device), im)
        LOGGER.info(f'{prefix}checks passed ✅')
        return True
    except Exception:
        help_url = 'https://github.com/ultralytics/yolov5/issues/7908'
        LOGGER.warning(f'{prefix}checks failed ❌, disabling Automatic Mixed Precision. See {help_url}')
        return False


def yaml_load(file='data.yaml'):
    """Single-line safe yaml loading; undecodable bytes in the file are ignored."""
    with open(file, errors='ignore') as f:
        return yaml.safe_load(f)


def yaml_save(file='data.yaml', data=None):
    """Single-line safe yaml saving. Path values are written as str; `data=None` saves an empty mapping."""
    data = {} if data is None else data  # avoid a shared mutable default argument
    with open(file, 'w') as f:
        yaml.safe_dump({k: str(v) if isinstance(v, Path) else v for k, v in data.items()}, f, sort_keys=False)


def unzip_file(file, path=None, exclude=('.DS_Store', '__MACOSX')):
    """Unzip a *.zip file to path/, excluding files containing strings in exclude list.

    `path` defaults to the directory that contains `file`.
    """
    if path is None:
        path = Path(file).parent  # default path
    with ZipFile(file) as zipObj:
        for f in zipObj.namelist():  # list all archived filenames in the zip
            if all(x not in f for x in exclude):
                zipObj.extract(f, path=path)
https://url.com/file.txt?auth -> file.txt url = str(Path(url)).replace(':/', '://') # Pathlib turns :// -> :/ return Path(urllib.parse.unquote(url)).name.split('?')[0] # '%2F' to '/', split https://url.com/file.txt?auth def download(url, dir='.', unzip=True, delete=True, curl=False, threads=1, retry=3): # Multithreaded file download and unzip function, used in data.yaml for autodownload def download_one(url, dir): # Download 1 file success = True if os.path.isfile(url): f = Path(url) # filename else: # does not exist f = dir / Path(url).name LOGGER.info(f'Downloading {url} to {f}...') for i in range(retry + 1): if curl: success = curl_download(url, f, silent=(threads > 1)) else: torch.hub.download_url_to_file(url, f, progress=threads == 1) # torch download success = f.is_file() if success: break elif i < retry: LOGGER.warning(f'⚠️ Download failure, retrying {i + 1}/{retry} {url}...') else: LOGGER.warning(f'❌ Failed to download {url}...') if unzip and success and (f.suffix == '.gz' or is_zipfile(f) or is_tarfile(f)): LOGGER.info(f'Unzipping {f}...') if is_zipfile(f): unzip_file(f, dir) # unzip elif is_tarfile(f): subprocess.run(['tar', 'xf', f, '--directory', f.parent], check=True) # unzip elif f.suffix == '.gz': subprocess.run(['tar', 'xfz', f, '--directory', f.parent], check=True) # unzip if delete: f.unlink() # remove zip dir = Path(dir) dir.mkdir(parents=True, exist_ok=True) # make directory if threads > 1: pool = ThreadPool(threads) pool.imap(lambda x: download_one(*x), zip(url, repeat(dir))) # multithreaded pool.close() pool.join() else: for u in [url] if isinstance(url, (str, Path)) else url: download_one(u, dir) def make_divisible(x, divisor): # Returns nearest x divisible by divisor if isinstance(divisor, torch.Tensor): divisor = int(divisor.max()) # to int return math.ceil(x / divisor) * divisor def clean_str(s): # Cleans a string by replacing special characters with underscore _ return re.sub(pattern='[|@#!¡·$€%&()=?¿^*;:,¨´><+]', repl='_', string=s) def 
def labels_to_class_weights(labels, nc=80):
    """Get normalized class weights (inverse frequency) from training labels.

    Args:
        labels: Sequence of per-image label arrays whose column 0 holds the class index
            (rows are [class xywh]).
        nc: Number of classes; the minimum length of the returned weight vector.

    Returns:
        1-D float32 torch tensor of weights that sums to 1, or an empty tensor when no
        labels are available (empty sequence or first entry is None).
    """
    if len(labels) == 0 or labels[0] is None:  # no labels loaded
        return torch.Tensor()

    labels = np.concatenate(labels, 0)  # labels.shape = (866643, 5) for COCO
    classes = labels[:, 0].astype(int)  # labels = [class xywh]
    weights = np.bincount(classes, minlength=nc)  # occurrences per class

    # Prepend gridpoint count (for uCE training)
    # gpi = ((320 / 32 * np.array([1, 2, 4])) ** 2 * 3).sum()  # gridpoints per image
    # weights = np.hstack([gpi * len(labels) - weights.sum() * 9, weights * 9]) ** 0.5  # prepend gridpoints to start

    weights[weights == 0] = 1  # replace empty bins with 1 so the inverse below is finite
    weights = 1 / weights  # inverse frequency per class
    weights /= weights.sum()  # normalize
    return torch.from_numpy(weights).float()
def xywh2xyxy(x):
    """Convert boxes from [x, y, w, h] (centre, size) to [x1, y1, x2, y2] (top-left, bottom-right).

    The box lives on the last axis. A copy of `x` (torch tensor or numpy array) is returned with
    its first four entries rewritten; the input itself is not modified.
    """
    out = x.clone() if isinstance(x, torch.Tensor) else np.copy(x)
    half_w = x[..., 2] / 2
    half_h = x[..., 3] / 2
    out[..., 0] = x[..., 0] - half_w  # top left x
    out[..., 1] = x[..., 1] - half_h  # top left y
    out[..., 2] = x[..., 0] + half_w  # bottom right x
    out[..., 3] = x[..., 1] + half_h  # bottom right y
    return out
def segment2box(segment, width=640, height=640):
    """Convert 1 segment label to 1 box label, applying the inside-image constraint.

    Args:
        segment: (n, 2) array of xy points, i.e. (xy1, xy2, ...).
        width: Image width; points with x outside [0, width] are discarded.
        height: Image height; points with y outside [0, height] are discarded.

    Returns:
        Array of shape (4,) holding (x_min, y_min, x_max, y_max) of the points that lie
        inside the image, or four zeros when no point is inside.
    """
    x, y = segment.T  # segment xy
    inside = (x >= 0) & (y >= 0) & (x <= width) & (y <= height)
    x, y = x[inside], y[inside]
    # Test for "no points left" by count: any(x) is also False when every surviving x equals 0
    if len(x) == 0:
        return np.zeros(4)  # same (4,) shape as the regular return value
    return np.array([x.min(), y.min(), x.max(), y.max()])  # xyxy
def resample_segments(segments, n=1000):
    """Up-sample each (m,2) segment in `segments` to n points, in place, and return the list.

    Every polygon is first closed by appending its own first point; x and y are then linearly
    interpolated at n evenly spaced positions along the vertex index.
    """
    for idx, seg in enumerate(segments):
        closed = np.concatenate((seg, seg[0:1, :]), axis=0)  # repeat first point to close the outline
        knots = np.arange(len(closed))
        samples = np.linspace(0, len(closed) - 1, n)
        xs = np.interp(samples, knots, closed[:, 0])
        ys = np.interp(samples, knots, closed[:, 1])
        segments[idx] = np.concatenate([xs, ys]).reshape(2, -1).T  # segment xy
    return segments
def non_max_suppression(
        prediction,
        conf_thres=0.25,
        iou_thres=0.45,
        classes=None,
        agnostic=False,
        multi_label=False,
        labels=(),
        max_det=300,
        nm=0,  # number of masks
):
    """Non-Maximum Suppression (NMS) on inference results to reject overlapping detections

    Args:
        prediction: Tensor indexed as (batch, boxes, 5 + nc + nm) with columns
            [x, y, w, h, obj_conf, class scores..., mask coefficients...], or a list/tuple
            whose first element is that tensor.
        conf_thres: Confidence threshold in [0, 1] applied to objectness and to the final score.
        iou_thres: IoU threshold in [0, 1] passed to torchvision.ops.nms().
        classes: Optional collection of class indices to keep.
        agnostic: If True, boxes of different classes suppress each other.
        multi_label: Allow several labels per box (only effective when nc > 1).
        labels: Optional per-image apriori labels (cls, xywh) appended as confident detections.
        max_det: Maximum number of detections kept per image.
        nm: Number of mask coefficients stored after the class scores.

    Returns:
         list of detections, on (n,6 + nm) tensor per image [xyxy, conf, cls, masks]

    Raises:
        AssertionError: If conf_thres or iou_thres lies outside [0, 1].
    """

    # Checks
    assert 0 <= conf_thres <= 1, f'Invalid Confidence threshold {conf_thres}, valid values are between 0.0 and 1.0'
    assert 0 <= iou_thres <= 1, f'Invalid IoU {iou_thres}, valid values are between 0.0 and 1.0'
    if isinstance(prediction, (list, tuple)):  # YOLOv5 model in validation model, output = (inference_out, loss_out)
        prediction = prediction[0]  # select only inference output

    device = prediction.device
    mps = 'mps' in device.type  # Apple MPS
    if mps:  # MPS not fully supported yet, convert tensors to CPU before NMS
        prediction = prediction.cpu()
    bs = prediction.shape[0]  # batch size
    nc = prediction.shape[2] - nm - 5  # number of classes
    xc = prediction[..., 4] > conf_thres  # candidates

    # Settings
    # min_wh = 2  # (pixels) minimum box width and height
    max_wh = 7680  # (pixels) maximum box width and height
    max_nms = 30000  # maximum number of boxes into torchvision.ops.nms()
    time_limit = 0.5 + 0.05 * bs  # seconds to quit after
    redundant = True  # require redundant detections
    multi_label &= nc > 1  # multiple labels per box (adds 0.5ms/img)
    merge = False  # use merge-NMS

    t = time.time()
    mi = 5 + nc  # mask start index
    # Placeholders live on the caller's device so empty and non-empty results agree (matters on MPS,
    # where `prediction` was moved to CPU above but real results are moved back below)
    output = [torch.zeros((0, 6 + nm), device=device)] * bs
    for xi, x in enumerate(prediction):  # image index, image inference
        # Apply constraints
        # x[((x[..., 2:4] < min_wh) | (x[..., 2:4] > max_wh)).any(1), 4] = 0  # width-height
        x = x[xc[xi]]  # confidence

        # Cat apriori labels if autolabelling
        if labels and len(labels[xi]):
            lb = labels[xi]
            v = torch.zeros((len(lb), nc + nm + 5), device=x.device)
            v[:, :4] = lb[:, 1:5]  # box
            v[:, 4] = 1.0  # conf
            v[range(len(lb)), lb[:, 0].long() + 5] = 1.0  # cls
            x = torch.cat((x, v), 0)

        # If none remain process next image
        if not x.shape[0]:
            continue

        # Compute conf
        x[:, 5:] *= x[:, 4:5]  # conf = obj_conf * cls_conf

        # Box/Mask
        box = xywh2xyxy(x[:, :4])  # center_x, center_y, width, height) to (x1, y1, x2, y2)
        mask = x[:, mi:]  # zero columns if no masks

        # Detections matrix nx6 (xyxy, conf, cls)
        if multi_label:
            i, j = (x[:, 5:mi] > conf_thres).nonzero(as_tuple=False).T
            x = torch.cat((box[i], x[i, 5 + j, None], j[:, None].float(), mask[i]), 1)
        else:  # best class only
            conf, j = x[:, 5:mi].max(1, keepdim=True)
            x = torch.cat((box, conf, j.float(), mask), 1)[conf.view(-1) > conf_thres]

        # Filter by class
        if classes is not None:
            x = x[(x[:, 5:6] == torch.tensor(classes, device=x.device)).any(1)]

        # Apply finite constraint
        # if not torch.isfinite(x).all():
        #     x = x[torch.isfinite(x).all(1)]

        # Check shape
        n = x.shape[0]  # number of boxes
        if not n:  # no boxes
            continue
        x = x[x[:, 4].argsort(descending=True)[:max_nms]]  # sort by confidence and remove excess boxes

        # Batched NMS
        c = x[:, 5:6] * (0 if agnostic else max_wh)  # classes
        boxes, scores = x[:, :4] + c, x[:, 4]  # boxes (offset by class), scores
        i = torchvision.ops.nms(boxes, scores, iou_thres)  # NMS
        i = i[:max_det]  # limit detections
        if merge and (1 < n < 3E3):  # Merge NMS (boxes merged using weighted mean)
            # update boxes as boxes(i,4) = weights(i,n) * boxes(n,4)
            iou = box_iou(boxes[i], boxes) > iou_thres  # iou matrix
            weights = iou * scores[None]  # box weights
            x[i, :4] = torch.mm(weights, x[:, :4]).float() / weights.sum(1, keepdim=True)  # merged boxes
            if redundant:
                i = i[iou.sum(1) > 1]  # require redundancy

        output[xi] = x[i]
        if mps:
            output[xi] = output[xi].to(device)
        if (time.time() - t) > time_limit:
            LOGGER.warning(f'WARNING ⚠️ NMS time limit {time_limit:.3f}s exceeded')
            break  # time limit exceeded

    return output
def increment_path(path, exist_ok=False, sep='', mkdir=False):
    """Increment file or directory path, i.e. runs/exp --> runs/exp{sep}2, runs/exp{sep}3, ... etc.

    Args:
        path: File or directory path (str or Path).
        exist_ok: If True, return `path` unchanged even when it already exists.
        sep: Separator inserted between the original name and the counter.
        mkdir: If True, create the returned path as a directory (with parents).

    Returns:
        `path` itself when it does not exist (or exist_ok is set); otherwise the first
        '{path}{sep}{n}{suffix}' with n >= 2 that does not exist yet.
    """
    path = Path(path)  # os-agnostic
    if path.exists() and not exist_ok:
        # For files the counter goes before the extension: res.txt -> res2.txt
        path, suffix = (path.with_suffix(''), path.suffix) if path.is_file() else (path, '')

        # Search without an upper bound so an already existing path is never handed back
        n = 2
        while os.path.exists(f'{path}{sep}{n}{suffix}'):
            n += 1
        path = Path(f'{path}{sep}{n}{suffix}')  # increment path

    if mkdir:
        path.mkdir(parents=True, exist_ok=True)  # make directory

    return path
ENV VIRTUAL_ENV /env ENV PATH /env/bin:$PATH RUN apt-get update && apt-get install -y python-opencv # Copy the application's requirements.txt and run pip to install all # dependencies into the virtualenv. ADD requirements.txt /app/requirements.txt RUN pip install -r /app/requirements.txt # Add the application source code. ADD . /app # Run a WSGI server to serve the application. gunicorn must be declared as # a dependency in requirements.txt. CMD gunicorn -b :$PORT main:app ================================================ FILE: yolo-improve/yolov5-AUX/utils/google_app_engine/additional_requirements.txt ================================================ # add these requirements in your app on top of the existing ones pip==21.1 Flask==1.0.2 gunicorn==19.10.0 werkzeug>=2.2.3 # not directly required, pinned by Snyk to avoid a vulnerability ================================================ FILE: yolo-improve/yolov5-AUX/utils/google_app_engine/app.yaml ================================================ runtime: custom env: flex service: yolov5app liveness_check: initial_delay_sec: 600 manual_scaling: instances: 1 resources: cpu: 1 memory_gb: 4 disk_size_gb: 20 ================================================ FILE: yolo-improve/yolov5-AUX/utils/loggers/__init__.py ================================================ # YOLOv5 🚀 by Ultralytics, GPL-3.0 license """ Logging utils """ import os import warnings from pathlib import Path import pkg_resources as pkg import torch from torch.utils.tensorboard import SummaryWriter from utils.general import LOGGER, colorstr, cv2 from utils.loggers.clearml.clearml_utils import ClearmlLogger from utils.loggers.wandb.wandb_utils import WandbLogger from utils.plots import plot_images, plot_labels, plot_results from utils.torch_utils import de_parallel LOGGERS = ('csv', 'tb', 'wandb', 'clearml', 'comet') # *.csv, TensorBoard, Weights & Biases, ClearML RANK = int(os.getenv('RANK', -1)) try: import wandb assert hasattr(wandb, '__version__') # verify 
package import not local dir if pkg.parse_version(wandb.__version__) >= pkg.parse_version('0.12.2') and RANK in {0, -1}: try: wandb_login_success = wandb.login(timeout=30) except wandb.errors.UsageError: # known non-TTY terminal issue wandb_login_success = False if not wandb_login_success: wandb = None except (ImportError, AssertionError): wandb = None try: import clearml assert hasattr(clearml, '__version__') # verify package import not local dir except (ImportError, AssertionError): clearml = None try: if RANK not in [0, -1]: comet_ml = None else: import comet_ml assert hasattr(comet_ml, '__version__') # verify package import not local dir from utils.loggers.comet import CometLogger except (ModuleNotFoundError, ImportError, AssertionError): comet_ml = None class Loggers(): # YOLOv5 Loggers class def __init__(self, save_dir=None, weights=None, opt=None, hyp=None, logger=None, include=LOGGERS): self.save_dir = save_dir self.weights = weights self.opt = opt self.hyp = hyp self.plots = not opt.noplots # plot results self.logger = logger # for printing results to console self.include = include self.keys = [ 'train/box_loss', 'train/obj_loss', 'train/cls_loss', # train loss 'metrics/precision', 'metrics/recall', 'metrics/mAP_0.5', 'metrics/mAP_0.5:0.95', # metrics 'val/box_loss', 'val/obj_loss', 'val/cls_loss', # val loss 'x/lr0', 'x/lr1', 'x/lr2'] # params self.best_keys = ['best/epoch', 'best/precision', 'best/recall', 'best/mAP_0.5', 'best/mAP_0.5:0.95'] for k in LOGGERS: setattr(self, k, None) # init empty logger dictionary self.csv = True # always log to csv # Messages if not clearml: prefix = colorstr('ClearML: ') s = f"{prefix}run 'pip install clearml' to automatically track, visualize and remotely train YOLOv5 🚀 in ClearML" self.logger.info(s) if not comet_ml: prefix = colorstr('Comet: ') s = f"{prefix}run 'pip install comet_ml' to automatically track and visualize YOLOv5 🚀 runs in Comet" self.logger.info(s) # TensorBoard s = self.save_dir if 'tb' in 
def on_pretrain_routine_end(self, labels, names):
    """Callback runs on pre-train routine end: plot the label statistics and hand them to the loggers.

    Args:
        labels: Training labels forwarded to plot_labels().
        names: Class names forwarded to plot_labels().
    """
    if self.plots:
        plot_labels(labels, names, self.save_dir)
        # Materialize the glob: it is a one-shot generator, so without this the W&B branch would
        # consume it and Comet would receive an exhausted iterator
        paths = sorted(self.save_dir.glob('*labels*.jpg'))  # training labels
        if self.wandb:
            self.wandb.log({'Labels': [wandb.Image(str(x), caption=x.name) for x in paths]})
        # if self.clearml:
        #    pass  # ClearML saves these images automatically using hooks
        if self.comet_logger:
            self.comet_logger.on_pretrain_routine_end(paths)
def on_model_save(self, last, epoch, final_epoch, best_fitness, fi):
    """Callback runs on model save event.

    Args:
        last: Path of the most recent checkpoint.
        epoch: Zero-based index of the epoch that just finished.
        final_epoch: True on the last epoch; periodic uploads are skipped then.
        best_fitness: Best fitness value so far.
        fi: Fitness of the current epoch.
    """
    period = self.opt.save_period
    # Test `period > 0` first: a period of 0 would otherwise raise ZeroDivisionError in the modulo,
    # and any non-positive period means periodic saving is disabled
    if period > 0 and (epoch + 1) % period == 0 and not final_epoch:
        if self.wandb:
            self.wandb.log_model(last.parent, self.opt, epoch, fi, best_model=best_fitness == fi)
        if self.clearml:
            self.clearml.task.update_output_model(model_path=str(last),
                                                  model_name='Latest Model',
                                                  auto_delete_file=False)

    if self.comet_logger:
        self.comet_logger.on_model_save(last, epoch, final_epoch, best_fitness, fi)
class GenericLogger:
    """
    YOLOv5 General purpose logger for non-task specific logging
    Usage: from utils.loggers import GenericLogger; logger = GenericLogger(...)
    Arguments
        opt:             Run arguments
        console_logger:  Console logger
        include:         loggers to include
    """

    def __init__(self, opt, console_logger, include=('tb', 'wandb')):
        # init default loggers
        self.save_dir = Path(opt.save_dir)
        self.include = include
        self.console_logger = console_logger
        self.csv = self.save_dir / 'results.csv'  # CSV logger

        # Always define the attribute: the log_* methods test `self.tb` even when TensorBoard is excluded
        self.tb = None
        if 'tb' in self.include:
            prefix = colorstr('TensorBoard: ')
            self.console_logger.info(
                f"{prefix}Start with 'tensorboard --logdir {self.save_dir.parent}', view at http://localhost:6006/")
            self.tb = SummaryWriter(str(self.save_dir))

        if 'wandb' in self.include and wandb:
            self.wandb = wandb.init(project=web_project_name(str(opt.project)),
                                    name=None if opt.name == 'exp' else opt.name,
                                    config=opt)
        else:
            self.wandb = None

    def log_metrics(self, metrics, epoch):
        # Log metrics dictionary to all loggers (CSV row, TensorBoard scalars, W&B)
        if self.csv:
            keys, vals = list(metrics.keys()), list(metrics.values())
            n = len(metrics) + 1  # number of cols
            s = '' if self.csv.exists() else (('%23s,' * n % tuple(['epoch'] + keys)).rstrip(',') + '\n')  # header
            with open(self.csv, 'a') as f:
                f.write(s + ('%23.5g,' * n % tuple([epoch] + vals)).rstrip(',') + '\n')

        if self.tb:
            for k, v in metrics.items():
                self.tb.add_scalar(k, v, epoch)

        if self.wandb:
            self.wandb.log(metrics, step=epoch)

    def log_images(self, files, name='Images', epoch=0):
        # Log images to all loggers; `files` may be a single path or a tuple/list of paths
        files = [Path(f) for f in (files if isinstance(files, (tuple, list)) else [files])]  # to Path
        files = [f for f in files if f.exists()]  # filter by exists

        if self.tb:
            for f in files:
                # [..., ::-1] reverses the channel axis of the image returned by cv2.imread
                self.tb.add_image(f.stem, cv2.imread(str(f))[..., ::-1], epoch, dataformats='HWC')

        if self.wandb:
            self.wandb.log({name: [wandb.Image(str(f), caption=f.name) for f in files]}, step=epoch)

    def log_graph(self, model, imgsz=(640, 640)):
        # Log model graph to all loggers
        if self.tb:
            log_tensorboard_graph(self.tb, model, imgsz)

    def log_model(self, model_path, epoch=0, metadata=None):
        # Log model to all loggers; `metadata` defaults to an empty dict created per call
        if self.wandb:
            metadata = {} if metadata is None else metadata
            art = wandb.Artifact(name=f'run_{wandb.run.id}_model', type='model', metadata=metadata)
            art.add_file(str(model_path))
            wandb.log_artifact(art)

    def update_params(self, params):
        # Update the parameters logged
        if self.wandb:
            wandb.run.config.update(params, allow_val_change=True)
And so much more. It's up to you how many of these tools you want to use, you can stick to the experiment manager, or chain them all together into an impressive pipeline!

![ClearML scalars dashboard](https://github.com/thepycoder/clearml_screenshots/raw/main/experiment_manager_with_compare.gif)

## 🦾 Setting Things Up To keep track of your experiments and/or data, ClearML needs to communicate to a server. You have 2 options to get one: Either sign up for free to the [ClearML Hosted Service](https://cutt.ly/yolov5-tutorial-clearml) or you can set up your own server, see [here](https://clear.ml/docs/latest/docs/deploying_clearml/clearml_server). Even the server is open-source, so even if you're dealing with sensitive data, you should be good to go! 1. Install the `clearml` python package: ```bash pip install clearml ``` 1. Connect the ClearML SDK to the server by [creating credentials](https://app.clear.ml/settings/workspace-configuration) (go right top to Settings -> Workspace -> Create new credentials), then execute the command below and follow the instructions: ```bash clearml-init ``` That's it! You're done 😎
## 🚀 Training YOLOv5 With ClearML To enable ClearML experiment tracking, simply install the ClearML pip package. ```bash pip install "clearml>=1.2.0" ``` This will enable integration with the YOLOv5 training script. Every training run from now on will be captured and stored by the ClearML experiment manager. If you want to change the `project_name` or `task_name`, use the `--project` and `--name` arguments of the `train.py` script; by default the project will be called `YOLOv5` and the task `Training`. PLEASE NOTE: ClearML uses `/` as a delimiter for subprojects, so be careful when using `/` in your project name! ```bash python train.py --img 640 --batch 16 --epochs 3 --data coco128.yaml --weights yolov5s.pt --cache ``` or with custom project and task name: ```bash python train.py --project my_project --name my_training --img 640 --batch 16 --epochs 3 --data coco128.yaml --weights yolov5s.pt --cache ``` This will capture: - Source code + uncommitted changes - Installed packages - (Hyper)parameters - Model files (use `--save-period n` to save a checkpoint every n epochs) - Console output - Scalars (mAP_0.5, mAP_0.5:0.95, precision, recall, losses, learning rates, ...) - General info such as machine details, runtime, creation date etc. - All produced plots such as label correlogram and confusion matrix - Images with bounding boxes per epoch - Mosaic per epoch - Validation images per epoch - ... That's a lot, right? 🤯 Now, we can visualize all of this information in the ClearML UI to get an overview of our training progress. Add custom columns to the table view (e.g. mAP_0.5) so you can easily sort on the best performing model. Or select multiple experiments and directly compare them! There is even more we can do with all of this information, like hyperparameter optimization and remote execution, so keep reading if you want to see how that works!
## 🔗 Dataset Version Management Versioning your data separately from your code is generally a good idea and makes it easy to acquire the latest version too. This repository supports supplying a dataset version ID, and it will make sure to get the data if it's not there yet. Next to that, this workflow also saves the used dataset ID as part of the task parameters, so you will always know for sure which data was used in which experiment! ![ClearML Dataset Interface](https://github.com/thepycoder/clearml_screenshots/raw/main/clearml_data.gif) ### Prepare Your Dataset The YOLOv5 repository supports a number of different datasets by using yaml files containing their information. By default datasets are downloaded to the `../datasets` folder in relation to the repository root folder. So if you downloaded the `coco128` dataset using the link in the yaml or with the scripts provided by yolov5, you get this folder structure: ``` .. |_ yolov5 |_ datasets |_ coco128 |_ images |_ labels |_ LICENSE |_ README.txt ``` But this can be any dataset you wish. Feel free to use your own, as long as you keep to this folder structure. Next, ⚠️**copy the corresponding yaml file to the root of the dataset folder**⚠️. This yaml files contains the information ClearML will need to properly use the dataset. You can make this yourself too, of course, just follow the structure of the example yamls. Basically we need the following keys: `path`, `train`, `test`, `val`, `nc`, `names`. ``` .. |_ yolov5 |_ datasets |_ coco128 |_ images |_ labels |_ coco128.yaml # <---- HERE! |_ LICENSE |_ README.txt ``` ### Upload Your Dataset To get this dataset into ClearML as a versioned dataset, go to the dataset root folder and run the following command: ```bash cd coco128 clearml-data sync --project YOLOv5 --name coco128 --folder . ``` The command `clearml-data sync` is actually a shorthand command. 
You could also run these commands one after the other: ```bash # Optionally add --parent <parent_dataset_id> if you want to base # this version on another dataset version, so no duplicate files are uploaded! clearml-data create --name coco128 --project YOLOv5 clearml-data add --files . clearml-data close ``` ### Run Training Using A ClearML Dataset Now that you have a ClearML dataset, you can very simply use it to train custom YOLOv5 🚀 models! ```bash python train.py --img 640 --batch 16 --epochs 3 --data clearml://<your_dataset_id> --weights yolov5s.pt --cache ```
## 👀 Hyperparameter Optimization Now that we have our experiments and data versioned, it's time to take a look at what we can build on top! Using the code information, installed packages and environment details, the experiment itself is now **completely reproducible**. In fact, ClearML allows you to clone an experiment and even change its parameters. We can then just rerun it with these new parameters automatically, this is basically what HPO does! To **run hyperparameter optimization locally**, we've included a pre-made script for you. Just make sure a training task has been run at least once, so it is in the ClearML experiment manager, we will essentially clone it and change its hyperparameters. You'll need to fill in the ID of this `template task` in the script found at `utils/loggers/clearml/hpo.py` and then just run it :) You can change `task.execute_locally()` to `task.execute()` to put it in a ClearML queue and have a remote agent work on it instead. ```bash # To use optuna, install it first, otherwise you can change the optimizer to just be RandomSearch pip install optuna python utils/loggers/clearml/hpo.py ``` ![HPO](https://github.com/thepycoder/clearml_screenshots/raw/main/hpo.png) ## 🤯 Remote Execution (advanced) Running HPO locally is really handy, but what if we want to run our experiments on a remote machine instead? Maybe you have access to a very powerful GPU machine on-site, or you have some budget to use cloud GPUs. This is where the ClearML Agent comes into play. Check out what the agent can do here: - [YouTube video](https://youtu.be/MX3BrXnaULs) - [Documentation](https://clear.ml/docs/latest/docs/clearml_agent) In short: every experiment tracked by the experiment manager contains enough information to reproduce it on a different machine (installed packages, uncommitted changes etc.). 
So a ClearML agent does just that: it listens to a queue for incoming tasks and when it finds one, it recreates the environment and runs it while still reporting scalars, plots etc. to the experiment manager. You can turn any machine (a cloud VM, a local GPU machine, your own laptop ... ) into a ClearML agent by simply running: ```bash clearml-agent daemon --queue <queues_to_listen_to> [--docker] ``` ### Cloning, Editing And Enqueuing With our agent running, we can give it some work. Remember from the HPO section that we can clone a task and edit the hyperparameters? We can do that from the interface too! 🪄 Clone the experiment by right-clicking it 🎯 Edit the hyperparameters to what you wish them to be ⏳ Enqueue the task to any of the queues by right-clicking it ![Enqueue a task from the UI](https://github.com/thepycoder/clearml_screenshots/raw/main/enqueue.gif) ### Executing A Task Remotely Now you can clone a task like we explained above, or simply mark your current script by adding `task.execute_remotely()` and on execution it will be put into a queue, for the agent to start working on! To run the YOLOv5 training script remotely, all you have to do is add this line to the `train.py` script after the ClearML logger has been instantiated: ```python # ... # Loggers data_dict = None if RANK in {-1, 0}: loggers = Loggers(save_dir, weights, opt, hyp, LOGGER) # loggers instance if loggers.clearml: loggers.clearml.task.execute_remotely(queue="my_queue") # <------ ADD THIS LINE # data_dict is either None if the user did not choose a ClearML dataset, or is filled in by ClearML data_dict = loggers.clearml.data_dict # ... ``` When running the training script after this change, python will run the script up until that line, after which it will package the code and send it to the queue instead! ### Autoscaling workers ClearML comes with autoscalers too!
This tool will automatically spin up new remote machines in the cloud of your choice (AWS, GCP, Azure) and turn them into ClearML agents for you whenever there are experiments detected in the queue. Once the tasks are processed, the autoscaler will automatically shut down the remote machines, and you stop paying! Check out the autoscalers getting started video below. [![Watch the video](https://img.youtube.com/vi/j4XVMAaUt3E/0.jpg)](https://youtu.be/j4XVMAaUt3E) ================================================ FILE: yolo-improve/yolov5-AUX/utils/loggers/clearml/__init__.py ================================================ ================================================ FILE: yolo-improve/yolov5-AUX/utils/loggers/clearml/clearml_utils.py ================================================ """Main Logger class for ClearML experiment tracking.""" import glob import re from pathlib import Path import numpy as np import yaml from utils.plots import Annotator, colors try: import clearml from clearml import Dataset, Task assert hasattr(clearml, '__version__') # verify package import not local dir except (ImportError, AssertionError): clearml = None def construct_dataset(clearml_info_string): """Load in a clearml dataset and fill the internal data_dict with its contents. 
class ClearmlLogger:
    """Log training runs, datasets, models, and predictions to ClearML.

    This logger sends information to ClearML at app.clear.ml or to your own hosted server. By default,
    this information includes hyperparameters, system configuration and metrics, model metrics, code information and
    basic data metrics and analyses.

    By providing additional command line arguments to train.py, datasets,
    models and predictions can also be logged.
    """

    def __init__(self, opt, hyp):
        """
        - Initialize ClearML Task, this object will capture the experiment
        - Resolve a ClearML dataset into ``data_dict`` when ``opt.data`` starts with 'clearml://'

        When the ``clearml`` package is not importable, ``task`` and ``data_dict`` stay None.

        arguments:
        opt (namespace) -- Commandline arguments for this run (``opt.data`` is replaced by the
                           resolved data dict when a ClearML dataset is used)
        hyp (dict) -- Hyperparameters for this run
        """
        self.current_epoch = 0
        # Keep track of the images already logged this epoch, to enforce a limit and deduplicate
        self.current_epoch_logged_images = set()
        # Maximum number of images to log to ClearML per epoch
        self.max_imgs_to_log_per_epoch = 16
        # Get the interval of epochs when bounding box images should be logged
        self.bbox_interval = opt.bbox_interval
        self.clearml = clearml
        self.task = None
        self.data_dict = None
        if self.clearml:
            self.task = Task.init(
                project_name=opt.project if opt.project != 'runs/train' else 'YOLOv5',
                task_name=opt.name if opt.name != 'exp' else 'Training',
                tags=['YOLOv5'],
                output_uri=True,
                reuse_last_task_id=opt.exist_ok,
                # We disconnect pytorch auto-detection, because we added manual model save points in the code
                auto_connect_frameworks={'pytorch': False},
            )
            # ClearML's hooks will already grab all general parameters
            # Only the hyperparameters coming from the yaml config file
            # will have to be added manually!
            self.task.connect(hyp, name='Hyperparameters')
            self.task.connect(opt, name='Args')

            # Make sure the code is easily remotely runnable by setting the docker image to use by the remote agent
            self.task.set_base_docker('ultralytics/yolov5:latest',
                                      docker_arguments='--ipc=host -e="CLEARML_AGENT_SKIP_PYTHON_ENV_INSTALL=1"',
                                      docker_setup_bash_script='pip install clearml')

            # Get ClearML Dataset Version if requested
            if opt.data.startswith('clearml://'):
                # data_dict should have the following keys:
                # names, nc (number of classes), test, train, val (all three relative paths to ../datasets)
                self.data_dict = construct_dataset(opt.data)
                # Set data to data_dict because wandb will crash without this information and opt is the best way
                # to give it to them
                opt.data = self.data_dict

    def log_debug_samples(self, files, title='Debug Samples'):
        """
        Log files (images) as debug samples in the ClearML task.

        A ``_batch<N>`` marker in the file name is used as the iteration number and removed from
        the series name; files without such a marker are reported at iteration 0 under their
        unmodified name. Paths that do not exist are skipped.

        arguments:
        files (List(PosixPath)) a list of file paths in PosixPath format
        title (str) A title that groups together images with the same values
        """
        for f in files:
            if not f.exists():
                continue
            match = re.search(r'_batch(\d+)', f.name)
            if match:
                iteration = int(match.group(1))
                series = f.name.replace(match.group(), '')
            else:
                # No batch marker: previously this path crashed on `None.group()`
                iteration = 0
                series = f.name
            self.task.get_logger().report_image(title=title,
                                                series=series,
                                                local_path=str(f),
                                                iteration=iteration)

    def log_image_with_boxes(self, image_path, boxes, class_names, image, conf_threshold=0.25):
        """
        Draw the bounding boxes on a single image and report the result as a ClearML debug sample.

        Nothing is logged once ``max_imgs_to_log_per_epoch`` images were reported for the current
        epoch, when the epoch is not a multiple of ``bbox_interval``, or when ``image_path`` was
        already logged this epoch.

        arguments:
        image_path (PosixPath) the path the original image file
        boxes (list): list of scaled predictions in the format - [xmin, ymin, xmax, ymax, confidence, class]
        class_names (dict): dict containing mapping of class int to class name
        image (Tensor): A torch tensor containing the actual image data
        conf_threshold (float): only boxes with a confidence above this value are drawn
        """
        if len(self.current_epoch_logged_images) >= self.max_imgs_to_log_per_epoch or self.current_epoch < 0:
            return
        # Log every bbox_interval epochs and deduplicate for any intermittent extra eval runs
        if self.current_epoch % self.bbox_interval != 0 or image_path in self.current_epoch_logged_images:
            return

        # Scale to 0-255 bytes and move the first axis last (presumably CHW -> HWC) for the annotator
        im = np.ascontiguousarray(np.moveaxis(image.mul(255).clamp(0, 255).byte().cpu().numpy(), 0, 2))
        annotator = Annotator(im=im, pil=True)
        for i, (conf, class_nr, box) in enumerate(zip(boxes[:, 4], boxes[:, 5], boxes[:, :4])):
            color = colors(i)

            class_name = class_names[int(class_nr)]
            confidence_percentage = round(float(conf) * 100, 2)
            label = f'{class_name}: {confidence_percentage}%'

            if conf > conf_threshold:
                annotator.rectangle(box.cpu().numpy(), outline=color)
                annotator.box_label(box.cpu().numpy(), label=label, color=color)

        annotated_image = annotator.result()
        self.task.get_logger().report_image(title='Bounding Boxes',
                                            series=image_path.name,
                                            iteration=self.current_epoch,
                                            image=annotated_image)
        self.current_epoch_logged_images.add(image_path)
optimize base_task_id='', # here we define the hyper-parameters to optimize # Notice: The parameter name should exactly match what you see in the UI: / # For Example, here we see in the base experiment a section Named: "General" # under it a parameter named "batch_size", this becomes "General/batch_size" # If you have `argparse` for example, then arguments will appear under the "Args" section, # and you should instead pass "Args/batch_size" hyper_parameters=[ UniformParameterRange('Hyperparameters/lr0', min_value=1e-5, max_value=1e-1), UniformParameterRange('Hyperparameters/lrf', min_value=0.01, max_value=1.0), UniformParameterRange('Hyperparameters/momentum', min_value=0.6, max_value=0.98), UniformParameterRange('Hyperparameters/weight_decay', min_value=0.0, max_value=0.001), UniformParameterRange('Hyperparameters/warmup_epochs', min_value=0.0, max_value=5.0), UniformParameterRange('Hyperparameters/warmup_momentum', min_value=0.0, max_value=0.95), UniformParameterRange('Hyperparameters/warmup_bias_lr', min_value=0.0, max_value=0.2), UniformParameterRange('Hyperparameters/box', min_value=0.02, max_value=0.2), UniformParameterRange('Hyperparameters/cls', min_value=0.2, max_value=4.0), UniformParameterRange('Hyperparameters/cls_pw', min_value=0.5, max_value=2.0), UniformParameterRange('Hyperparameters/obj', min_value=0.2, max_value=4.0), UniformParameterRange('Hyperparameters/obj_pw', min_value=0.5, max_value=2.0), UniformParameterRange('Hyperparameters/iou_t', min_value=0.1, max_value=0.7), UniformParameterRange('Hyperparameters/anchor_t', min_value=2.0, max_value=8.0), UniformParameterRange('Hyperparameters/fl_gamma', min_value=0.0, max_value=4.0), UniformParameterRange('Hyperparameters/hsv_h', min_value=0.0, max_value=0.1), UniformParameterRange('Hyperparameters/hsv_s', min_value=0.0, max_value=0.9), UniformParameterRange('Hyperparameters/hsv_v', min_value=0.0, max_value=0.9), UniformParameterRange('Hyperparameters/degrees', min_value=0.0, max_value=45.0), 
UniformParameterRange('Hyperparameters/translate', min_value=0.0, max_value=0.9), UniformParameterRange('Hyperparameters/scale', min_value=0.0, max_value=0.9), UniformParameterRange('Hyperparameters/shear', min_value=0.0, max_value=10.0), UniformParameterRange('Hyperparameters/perspective', min_value=0.0, max_value=0.001), UniformParameterRange('Hyperparameters/flipud', min_value=0.0, max_value=1.0), UniformParameterRange('Hyperparameters/fliplr', min_value=0.0, max_value=1.0), UniformParameterRange('Hyperparameters/mosaic', min_value=0.0, max_value=1.0), UniformParameterRange('Hyperparameters/mixup', min_value=0.0, max_value=1.0), UniformParameterRange('Hyperparameters/copy_paste', min_value=0.0, max_value=1.0)], # this is the objective metric we want to maximize/minimize objective_metric_title='metrics', objective_metric_series='mAP_0.5', # now we decide if we want to maximize it or minimize it (accuracy we maximize) objective_metric_sign='max', # let us limit the number of concurrent experiments, # this in turn will make sure we do dont bombard the scheduler with experiments. 
# if we have an auto-scaler connected, this, by proxy, will limit the number of machine max_number_of_concurrent_tasks=1, # this is the optimizer class (actually doing the optimization) # Currently, we can choose from GridSearch, RandomSearch or OptimizerBOHB (Bayesian optimization Hyper-Band) optimizer_class=OptimizerOptuna, # If specified only the top K performing Tasks will be kept, the others will be automatically archived save_top_k_tasks_only=5, # 5, compute_time_limit=None, total_max_jobs=20, min_iteration_per_job=None, max_iteration_per_job=None, ) # report every 10 seconds, this is way too often, but we are testing here optimizer.set_report_period(10 / 60) # You can also use the line below instead to run all the optimizer tasks locally, without using queues or agent # an_optimizer.start_locally(job_complete_callback=job_complete_callback) # set the time limit for the optimization process (2 hours) optimizer.set_time_limit(in_minutes=120.0) # Start the optimization process in the local environment optimizer.start_locally() # wait until process is done (notice we are controlling the optimization process in the background) optimizer.wait() # make sure background optimization stopped optimizer.stop() print('We are done, good bye') ================================================ FILE: yolo-improve/yolov5-AUX/utils/loggers/comet/README.md ================================================ # YOLOv5 with Comet This guide will cover how to use YOLOv5 with [Comet](https://bit.ly/yolov5-readme-comet2) # About Comet Comet builds tools that help data scientists, engineers, and team leaders accelerate and optimize machine learning and deep learning models. 
Track and visualize model metrics in real time, save your hyperparameters, datasets, and model checkpoints, and visualize your model predictions with [Comet Custom Panels](https://www.comet.com/docs/v2/guides/comet-dashboard/code-panels/about-panels/?utm_source=yolov5&utm_medium=partner&utm_campaign=partner_yolov5_2022&utm_content=github)! Comet makes sure you never lose track of your work and makes it easy to share results and collaborate across teams of all sizes! # Getting Started ## Install Comet ```shell pip install comet_ml ``` ## Configure Comet Credentials There are two ways to configure Comet with YOLOv5. You can either set your credentials through environment variables **Environment Variables** ```shell export COMET_API_KEY= export COMET_PROJECT_NAME= # This will default to 'yolov5' ``` Or create a `.comet.config` file in your working directory and set your credentials there. **Comet Configuration File** ``` [comet] api_key= project_name= # This will default to 'yolov5' ``` ## Run the Training Script ```shell # Train YOLOv5s on COCO128 for 5 epochs python train.py --img 640 --batch 16 --epochs 5 --data coco128.yaml --weights yolov5s.pt ``` That's it! Comet will automatically log your hyperparameters, command line arguments, training and validation metrics. You can visualize and analyze your runs in the Comet UI yolo-ui # Try out an Example! 
Check out an example of a [completed run here](https://www.comet.com/examples/comet-example-yolov5/a0e29e0e9b984e4a822db2a62d0cb357?experiment-tab=chart&showOutliers=true&smoothing=0&transformY=smoothing&xAxis=step&utm_source=yolov5&utm_medium=partner&utm_campaign=partner_yolov5_2022&utm_content=github) Or better yet, try it out yourself in this Colab Notebook [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1RG0WOQyxlDlo5Km8GogJpIEJlg_5lyYO?usp=sharing) # Log automatically By default, Comet will log the following items ## Metrics - Box Loss, Object Loss, Classification Loss for the training and validation data - mAP_0.5, mAP_0.5:0.95 metrics for the validation data. - Precision and Recall for the validation data ## Parameters - Model Hyperparameters - All parameters passed through the command line options ## Visualizations - Confusion Matrix of the model predictions on the validation data - Plots for the PR and F1 curves across all classes - Correlogram of the Class Labels # Configure Comet Logging Comet can be configured to log additional data either through command line flags passed to the training script or through environment variables. ```shell export COMET_MODE=online # Set whether to run Comet in 'online' or 'offline' mode. Defaults to online export COMET_MODEL_NAME= #Set the name for the saved model. Defaults to yolov5 export COMET_LOG_CONFUSION_MATRIX=false # Set to disable logging a Comet Confusion Matrix. Defaults to true export COMET_MAX_IMAGE_UPLOADS= # Controls how many total image predictions to log to Comet. Defaults to 100. export COMET_LOG_PER_CLASS_METRICS=true # Set to log evaluation metrics for each detected class at the end of training. Defaults to false export COMET_DEFAULT_CHECKPOINT_FILENAME= # Set this if you would like to resume training from a different checkpoint. 
Defaults to 'last.pt' export COMET_LOG_BATCH_LEVEL_METRICS=true # Set this if you would like to log training metrics at the batch level. Defaults to false. export COMET_LOG_PREDICTIONS=true # Set this to false to disable logging model predictions ``` ## Logging Checkpoints with Comet Logging Models to Comet is disabled by default. To enable it, pass the `save-period` argument to the training script. This will save the logged checkpoints to Comet based on the interval value provided by `save-period` ```shell python train.py \ --img 640 \ --batch 16 \ --epochs 5 \ --data coco128.yaml \ --weights yolov5s.pt \ --save-period 1 ``` ## Logging Model Predictions By default, model predictions (images, ground truth labels and bounding boxes) will be logged to Comet. You can control the frequency of logged predictions and the associated images by passing the `bbox_interval` command line argument. Predictions can be visualized using Comet's Object Detection Custom Panel. This frequency corresponds to every Nth batch of data per epoch. In the example below, we are logging every 2nd batch of data for each epoch. **Note:** The YOLOv5 validation dataloader will default to a batch size of 32, so you will have to set the logging frequency accordingly. Here is an [example project using the Panel](https://www.comet.com/examples/comet-example-yolov5?shareable=YcwMiJaZSXfcEXpGOHDD12vA1&utm_source=yolov5&utm_medium=partner&utm_campaign=partner_yolov5_2022&utm_content=github) ```shell python train.py \ --img 640 \ --batch 16 \ --epochs 5 \ --data coco128.yaml \ --weights yolov5s.pt \ --bbox_interval 2 ``` ### Controlling the number of Prediction Images logged to Comet When logging predictions from YOLOv5, Comet will log the images associated with each set of predictions. By default a maximum of 100 validation images are logged. You can increase or decrease this number using the `COMET_MAX_IMAGE_UPLOADS` environment variable. 
```shell env COMET_MAX_IMAGE_UPLOADS=200 python train.py \ --img 640 \ --batch 16 \ --epochs 5 \ --data coco128.yaml \ --weights yolov5s.pt \ --bbox_interval 1 ``` ### Logging Class Level Metrics Use the `COMET_LOG_PER_CLASS_METRICS` environment variable to log mAP, precision, recall, f1 for each class. ```shell env COMET_LOG_PER_CLASS_METRICS=true python train.py \ --img 640 \ --batch 16 \ --epochs 5 \ --data coco128.yaml \ --weights yolov5s.pt ``` ## Uploading a Dataset to Comet Artifacts If you would like to store your data using [Comet Artifacts](https://www.comet.com/docs/v2/guides/data-management/using-artifacts/#learn-more?utm_source=yolov5&utm_medium=partner&utm_campaign=partner_yolov5_2022&utm_content=github), you can do so using the `upload_dataset` flag. The dataset must be organized in the way described in the [YOLOv5 documentation](https://docs.ultralytics.com/tutorials/train-custom-datasets/#3-organize-directories). The dataset config `yaml` file must follow the same format as that of the `coco128.yaml` file. ```shell python train.py \ --img 640 \ --batch 16 \ --epochs 5 \ --data coco128.yaml \ --weights yolov5s.pt \ --upload_dataset ``` You can find the uploaded dataset in the Artifacts tab in your Comet Workspace artifact-1 You can preview the data directly in the Comet UI. artifact-2 Artifacts are versioned and also support adding metadata about the dataset. Comet will automatically log the metadata from your dataset `yaml` file artifact-3 ### Using a saved Artifact If you would like to use a dataset from Comet Artifacts, set the `path` variable in your dataset `yaml` file to point to the following Artifact resource URL.
``` # contents of artifact.yaml file path: "comet://<workspace name>/<artifact name>:<artifact version or alias>" ``` Then pass this file to your training script in the following way ```shell python train.py \ --img 640 \ --batch 16 \ --epochs 5 \ --data artifact.yaml \ --weights yolov5s.pt ``` Artifacts also allow you to track the lineage of data as it flows through your Experimentation workflow. Here you can see a graph that shows you all the experiments that have used your uploaded dataset. artifact-4 ## Resuming a Training Run If your training run is interrupted for any reason, e.g. disrupted internet connection, you can resume the run using the `resume` flag and the Comet Run Path. The Run Path has the following format `comet://<your workspace name>/<your project name>/<your experiment id>`. This will restore the run to its state before the interruption, which includes restoring the model from a checkpoint, restoring all hyperparameters and training arguments and downloading Comet dataset Artifacts if they were used in the original run. The resumed run will continue logging to the existing Experiment in the Comet UI ```shell python train.py \ --resume "comet://<your run path>" ``` ## Hyperparameter Search with the Comet Optimizer YOLOv5 is also integrated with Comet's Optimizer, making it simple to visualize hyperparameter sweeps in the Comet UI. ### Configuring an Optimizer Sweep To configure the Comet Optimizer, you will have to create a JSON file with the information about the sweep. An example file has been provided in `utils/loggers/comet/optimizer_config.json` ```shell python utils/loggers/comet/hpo.py \ --comet_optimizer_config "utils/loggers/comet/optimizer_config.json" ``` The `hpo.py` script accepts the same arguments as `train.py`. If you wish to pass additional arguments to your sweep simply add them after the script.
```shell python utils/loggers/comet/hpo.py \ --comet_optimizer_config "utils/loggers/comet/optimizer_config.json" \ --save-period 1 \ --bbox_interval 1 ``` ### Running a Sweep in Parallel ```shell comet optimizer -j utils/loggers/comet/hpo.py \ utils/loggers/comet/optimizer_config.json" ``` ### Visualizing Results Comet provides a number of ways to visualize the results of your sweep. Take a look at a [project with a completed sweep here](https://www.comet.com/examples/comet-example-yolov5/view/PrlArHGuuhDTKC1UuBmTtOSXD/panels?utm_source=yolov5&utm_medium=partner&utm_campaign=partner_yolov5_2022&utm_content=github) hyperparameter-yolo ================================================ FILE: yolo-improve/yolov5-AUX/utils/loggers/comet/__init__.py ================================================ import glob import json import logging import os import sys from pathlib import Path logger = logging.getLogger(__name__) FILE = Path(__file__).resolve() ROOT = FILE.parents[3] # YOLOv5 root directory if str(ROOT) not in sys.path: sys.path.append(str(ROOT)) # add ROOT to PATH try: import comet_ml # Project Configuration config = comet_ml.config.get_config() COMET_PROJECT_NAME = config.get_string(os.getenv('COMET_PROJECT_NAME'), 'comet.project_name', default='yolov5') except (ModuleNotFoundError, ImportError): comet_ml = None COMET_PROJECT_NAME = None import PIL import torch import torchvision.transforms as T import yaml from utils.dataloaders import img2label_paths from utils.general import check_dataset, scale_boxes, xywh2xyxy from utils.metrics import box_iou COMET_PREFIX = 'comet://' COMET_MODE = os.getenv('COMET_MODE', 'online') # Model Saving Settings COMET_MODEL_NAME = os.getenv('COMET_MODEL_NAME', 'yolov5') # Dataset Artifact Settings COMET_UPLOAD_DATASET = os.getenv('COMET_UPLOAD_DATASET', 'false').lower() == 'true' # Evaluation Settings COMET_LOG_CONFUSION_MATRIX = os.getenv('COMET_LOG_CONFUSION_MATRIX', 'true').lower() == 'true' COMET_LOG_PREDICTIONS = 
class CometLogger:
    """Log metrics, parameters, source code, models and much more with Comet.

    The instance owns one Comet experiment object (online or offline) and
    exposes thin ``log_*`` wrappers around it plus a set of ``on_*`` callback
    hooks. The hooks are presumably registered with the YOLOv5 callback
    system by the caller; that wiring is not part of this class.
    """

    def __init__(self, opt, hyp, run_id=None, job_type='Training', **experiment_kwargs) -> None:
        """Create (or re-attach to) a Comet experiment and log the run setup.

        Args:
            opt: Namespace of run options. Attributes read here include
                ``save_period``, ``upload_dataset``, ``resume``, ``data``,
                ``hyp``, ``save_dir``, ``bbox_interval`` and ``epochs``.
            hyp: Hyperparameters; logged on ``on_train_start``.
            run_id: Existing experiment id to re-attach to, or ``None`` to
                start a new experiment.
            job_type: Stored on the instance; not otherwise used here.
            **experiment_kwargs: Extra keyword arguments forwarded to the
                Comet experiment constructor.
        """
        self.job_type = job_type
        self.opt = opt
        self.hyp = hyp

        # Comet Flags
        self.comet_mode = COMET_MODE

        self.save_model = opt.save_period > -1
        self.model_name = COMET_MODEL_NAME

        # Batch Logging Settings
        self.log_batch_metrics = COMET_LOG_BATCH_METRICS
        # The module constant comes from os.getenv() and is therefore a str
        # whenever the environment variable is set; it is used as a modulus
        # in on_train_batch_end, so normalise it to int once here.
        self.comet_log_batch_interval = int(COMET_BATCH_LOGGING_INTERVAL)

        # Dataset Artifact Settings
        self.upload_dataset = self.opt.upload_dataset if self.opt.upload_dataset else COMET_UPLOAD_DATASET
        self.resume = self.opt.resume

        # Default parameters to pass to Experiment objects
        self.default_experiment_kwargs = {
            'log_code': False,
            'log_env_gpu': True,
            'log_env_cpu': True,
            'project_name': COMET_PROJECT_NAME,
        }
        self.default_experiment_kwargs.update(experiment_kwargs)
        self.experiment = self._get_experiment(self.comet_mode, run_id)

        self.data_dict = self.check_dataset(self.opt.data)
        self.class_names = self.data_dict['names']
        self.num_classes = self.data_dict['nc']

        self.logged_images_count = 0
        self.max_images = COMET_MAX_IMAGE_UPLOADS

        if run_id is None:
            # Fresh experiment: record provenance and the full run configuration.
            self.experiment.log_other('Created from', 'YOLOv5')
            if not isinstance(self.experiment, comet_ml.OfflineExperiment):
                workspace, project_name, experiment_id = self.experiment.url.split('/')[-3:]
                self.experiment.log_other(
                    'Run Path',
                    f'{workspace}/{project_name}/{experiment_id}',
                )
            self.log_parameters(vars(opt))
            self.log_parameters(self.opt.hyp)
            self.log_asset_data(
                self.opt.hyp,
                name='hyperparameters.json',
                metadata={'type': 'hyp-config-file'},
            )
            self.log_asset(
                f'{self.opt.save_dir}/opt.yaml',
                metadata={'type': 'opt-config-file'},
            )

        self.comet_log_confusion_matrix = COMET_LOG_CONFUSION_MATRIX

        # Prefer thresholds carried by opt, fall back to the env-configured defaults.
        self.conf_thres = getattr(self.opt, 'conf_thres', CONF_THRES)
        self.iou_thres = getattr(self.opt, 'iou_thres', IOU_THRES)

        self.log_parameters({'val_iou_threshold': self.iou_thres, 'val_conf_threshold': self.conf_thres})

        self.comet_log_predictions = COMET_LOG_PREDICTIONS
        if self.opt.bbox_interval == -1:
            self.comet_log_prediction_interval = 1 if self.opt.epochs < 10 else self.opt.epochs // 10
        else:
            self.comet_log_prediction_interval = self.opt.bbox_interval

        if self.comet_log_predictions:
            self.metadata_dict = {}
            self.logged_image_names = []

        self.comet_log_per_class_metrics = COMET_LOG_PER_CLASS_METRICS

        self.experiment.log_others({
            'comet_mode': COMET_MODE,
            'comet_max_image_uploads': COMET_MAX_IMAGE_UPLOADS,
            'comet_log_per_class_metrics': COMET_LOG_PER_CLASS_METRICS,
            'comet_log_batch_metrics': COMET_LOG_BATCH_METRICS,
            'comet_log_confusion_matrix': COMET_LOG_CONFUSION_MATRIX,
            'comet_model_name': COMET_MODEL_NAME,
        })

        # Check if running the Experiment with the Comet Optimizer
        if hasattr(self.opt, 'comet_optimizer_id'):
            self.experiment.log_other('optimizer_id', self.opt.comet_optimizer_id)
            self.experiment.log_other('optimizer_objective', self.opt.comet_optimizer_objective)
            self.experiment.log_other('optimizer_metric', self.opt.comet_optimizer_metric)
            self.experiment.log_other('optimizer_parameters', json.dumps(self.hyp))

    def _get_experiment(self, mode, experiment_id=None):
        """Return a Comet experiment object for ``mode``.

        ``mode == 'offline'`` yields an offline experiment; any other value
        tries an online one and falls back to offline when the constructor
        raises ``ValueError``. When ``experiment_id`` is given the
        ``Existing*`` variant is used so logging continues in that experiment.
        """
        if mode == 'offline':
            if experiment_id is not None:
                return comet_ml.ExistingOfflineExperiment(
                    previous_experiment=experiment_id,
                    **self.default_experiment_kwargs,
                )
            return comet_ml.OfflineExperiment(**self.default_experiment_kwargs)

        try:
            if experiment_id is not None:
                return comet_ml.ExistingExperiment(
                    previous_experiment=experiment_id,
                    **self.default_experiment_kwargs,
                )
            return comet_ml.Experiment(**self.default_experiment_kwargs)
        except ValueError:
            logger.warning('COMET WARNING: '
                           'Comet credentials have not been set. '
                           'Comet will default to offline logging. '
                           'Please set your credentials to enable online logging.')
            return self._get_experiment('offline', experiment_id)

    def log_metrics(self, log_dict, **kwargs):
        """Forward ``log_dict`` to ``experiment.log_metrics``."""
        self.experiment.log_metrics(log_dict, **kwargs)

    def log_parameters(self, log_dict, **kwargs):
        """Forward ``log_dict`` to ``experiment.log_parameters``."""
        self.experiment.log_parameters(log_dict, **kwargs)

    def log_asset(self, asset_path, **kwargs):
        """Forward the file at ``asset_path`` to ``experiment.log_asset``."""
        self.experiment.log_asset(asset_path, **kwargs)

    def log_asset_data(self, asset, **kwargs):
        """Forward in-memory ``asset`` data to ``experiment.log_asset_data``."""
        self.experiment.log_asset_data(asset, **kwargs)

    def log_image(self, img, **kwargs):
        """Forward ``img`` to ``experiment.log_image``."""
        self.experiment.log_image(img, **kwargs)

    def log_model(self, path, opt, epoch, fitness_score, best_model=False):
        """Upload every ``*.pt`` file found in directory ``path`` as a model asset.

        Does nothing when model saving is disabled (``save_period`` below 0
        at construction time). ``fitness_score`` must be indexable; its last
        element is stored in the asset metadata. ``best_model`` is accepted
        for interface compatibility and is not used.
        """
        if not self.save_model:
            return

        model_metadata = {
            'fitness_score': fitness_score[-1],
            'epochs_trained': epoch + 1,
            'save_period': opt.save_period,
            'total_epochs': opt.epochs,
        }

        for model_path in glob.glob(f'{path}/*.pt'):
            self.experiment.log_model(
                self.model_name,
                file_or_folder=model_path,
                file_name=Path(model_path).name,
                metadata=model_metadata,
                overwrite=True,
            )

    def check_dataset(self, data_file):
        """Resolve the dataset description in ``data_file`` to a data dict.

        If the YAML's ``path`` entry starts with the Comet prefix, the
        referenced dataset Artifact is downloaded and its metadata returned.
        Otherwise the YAML is logged as an asset and handed to the module
        level ``check_dataset`` helper.
        """
        with open(data_file) as f:
            data_config = yaml.safe_load(f)

        # 'path' may be absent from a dataset YAML; treat that as "not a Comet artifact"
        # instead of failing with KeyError.
        dataset_path = data_config.get('path')
        if isinstance(dataset_path, str) and dataset_path.startswith(COMET_PREFIX):
            return self.download_dataset_artifact(dataset_path.replace(COMET_PREFIX, ''))

        self.log_asset(self.opt.data, metadata={'type': 'data-config-file'})

        return check_dataset(data_file)

    def log_predictions(self, image, labelsn, path, shape, predn):
        """Log one validation image and the metadata of its matched boxes.

        Args:
            image: Unused here; kept for interface compatibility.
            labelsn: Ground-truth rows ``(cls, x1, y1, x2, y2)``.
            path: Path of the source image; opened with PIL for upload.
            shape: Unused here; kept for interface compatibility.
            predn: Prediction rows ``(x1, y1, x2, y2, conf, cls)``.
        """
        if self.logged_images_count >= self.max_images:
            return
        detections = predn[predn[:, 4] > self.conf_thres]
        iou = box_iou(labelsn[:, 1:], detections[:, :4])
        # Assuming box_iou returns an (n_labels, n_detections) matrix, the first index
        # tensor addresses labels and the second addresses detections. The previous
        # code indexed detections with the label indices, pairing the wrong boxes.
        label_idx, detection_idx = torch.where(iou > self.iou_thres)
        if len(label_idx) == 0:
            return

        filtered_detections = detections[detection_idx]
        filtered_labels = labelsn[label_idx]

        image_id = path.split('/')[-1].split('.')[0]
        image_name = f'{image_id}_curr_epoch_{self.experiment.curr_epoch}'
        if image_name not in self.logged_image_names:
            native_scale_image = PIL.Image.open(path)
            self.log_image(native_scale_image, name=image_name)
            self.logged_image_names.append(image_name)

        metadata = []
        for cls, *xyxy in filtered_labels.tolist():
            metadata.append({
                'label': f'{self.class_names[int(cls)]}-gt',
                'score': 100,
                'box': {
                    'x': xyxy[0],
                    'y': xyxy[1],
                    'x2': xyxy[2],
                    'y2': xyxy[3]},
            })
        for *xyxy, conf, cls in filtered_detections.tolist():
            metadata.append({
                'label': f'{self.class_names[int(cls)]}',
                'score': conf * 100,
                'box': {
                    'x': xyxy[0],
                    'y': xyxy[1],
                    'x2': xyxy[2],
                    'y2': xyxy[3]},
            })

        self.metadata_dict[image_name] = metadata
        self.logged_images_count += 1

    def preprocess_prediction(self, image, labels, shape, pred):
        """Map predictions and labels of one image to native image space.

        Returns:
            ``(predn, labelsn)`` where ``predn`` is a rescaled clone of
            ``pred`` and ``labelsn`` is ``None`` when ``labels`` is empty,
            otherwise rows of ``(cls, x1, y1, x2, y2)``.
        """
        nl = labels.shape[0]

        # Predictions
        if self.opt.single_cls:
            pred[:, 5] = 0

        predn = pred.clone()
        # scale_boxes is used for its in-place effect (its return value is ignored, as for
        # tbox below), so predn must be rescaled exactly once. The previous code repeated
        # this call inside the label branch, rescaling predictions twice.
        scale_boxes(image.shape[1:], predn[:, :4], shape[0], shape[1])  # native-space pred

        labelsn = None
        if nl:
            tbox = xywh2xyxy(labels[:, 1:5])  # target boxes
            scale_boxes(image.shape[1:], tbox, shape[0], shape[1])  # native-space labels
            labelsn = torch.cat((labels[:, 0:1], tbox), 1)  # native-space labels

        return predn, labelsn

    def add_assets_to_artifact(self, artifact, path, asset_path, split):
        """Add every image under ``asset_path`` and its label file to ``artifact``.

        Logical paths are made relative to ``path``. A pair that the artifact
        rejects with ``ValueError`` is logged and skipped. Returns ``artifact``.
        """
        img_paths = sorted(glob.glob(f'{asset_path}/*'))
        label_paths = img2label_paths(img_paths)

        for image_file, label_file in zip(img_paths, label_paths):
            image_logical_path = os.path.relpath(image_file, path)
            label_logical_path = os.path.relpath(label_file, path)

            try:
                artifact.add(image_file, logical_path=image_logical_path, metadata={'split': split})
                artifact.add(label_file, logical_path=label_logical_path, metadata={'split': split})
            except ValueError as e:
                logger.error('COMET ERROR: Error adding file to Artifact. Skipping file.')
                logger.error(f'COMET ERROR: {e}')
                continue

        return artifact

    def upload_dataset_artifact(self):
        """Package the dataset splits into a Comet Artifact and log it.

        When ``self.upload_dataset`` is a string only the split with that
        name is uploaded; otherwise all of ``train``/``val``/``test`` that
        are present are uploaded.
        """
        dataset_name = self.data_dict.get('dataset_name', 'yolov5-dataset')
        path = str((ROOT / Path(self.data_dict['path'])).resolve())

        metadata = self.data_dict.copy()
        for key in ['train', 'val', 'test']:
            split_path = metadata.get(key)
            if split_path is not None:
                # store split locations relative to the dataset root
                metadata[key] = split_path.replace(path, '')

        artifact = comet_ml.Artifact(name=dataset_name, artifact_type='dataset', metadata=metadata)
        for key in metadata.keys():
            if key in ['train', 'val', 'test']:
                if isinstance(self.upload_dataset, str) and (key != self.upload_dataset):
                    continue

                asset_path = self.data_dict.get(key)
                if asset_path is not None:
                    artifact = self.add_assets_to_artifact(artifact, path, asset_path, key)

        self.experiment.log_artifact(artifact)

    def download_dataset_artifact(self, artifact_path):
        """Download a dataset Artifact and build a data dict from its metadata.

        The artifact is saved below ``opt.save_dir``; the returned dict is a
        copy of the artifact metadata whose ``path`` points at that directory,
        whose ``names`` is an ``int -> name`` dict, and whose split entries
        are prefixed with the download directory.

        Raises:
            ValueError: If the metadata ``names`` entry is neither a list nor
                a dict.
        """
        logged_artifact = self.experiment.get_artifact(artifact_path)
        artifact_save_dir = str(Path(self.opt.save_dir) / logged_artifact.name)
        logged_artifact.download(artifact_save_dir)

        metadata = logged_artifact.metadata
        data_dict = metadata.copy()
        data_dict['path'] = artifact_save_dir

        metadata_names = metadata.get('names')
        if isinstance(metadata_names, dict):
            data_dict['names'] = {int(k): v for k, v in metadata_names.items()}
        elif isinstance(metadata_names, list):
            data_dict['names'] = dict(enumerate(metadata_names))
        else:
            # Raising a plain string is itself a TypeError in Python 3; use a real exception.
            raise ValueError("Invalid 'names' field in dataset yaml file. Please use a list or dictionary")

        data_dict = self.update_data_paths(data_dict)
        return data_dict

    def update_data_paths(self, data_dict):
        """Prefix the ``train``/``val``/``test`` entries with ``data_dict['path']``.

        A split may be a single string or a list of strings; missing or empty
        splits are left untouched. ``data_dict`` is modified in place and
        returned.
        """
        path = data_dict.get('path', '')

        for split in ['train', 'val', 'test']:
            split_path = data_dict.get(split)
            if split_path:
                # Dispatch on the split *value*; the key is always a str, which previously
                # caused list-valued splits to be formatted as one bogus string.
                if isinstance(split_path, str):
                    data_dict[split] = f'{path}/{split_path}'
                else:
                    data_dict[split] = [f'{path}/{x}' for x in split_path]

        return data_dict

    def on_pretrain_routine_end(self, paths):
        """Log the given files and, for a non-resumed run, upload the dataset if requested."""
        if self.opt.resume:
            return

        for path in paths:
            self.log_asset(str(path))

        if self.upload_dataset and not self.resume:
            self.upload_dataset_artifact()

    def on_train_start(self):
        """Log the hyperparameters."""
        self.log_parameters(self.hyp)

    def on_train_epoch_start(self):
        """No-op hook."""
        return

    def on_train_epoch_end(self, epoch):
        """Record ``epoch`` as the experiment's current epoch."""
        self.experiment.curr_epoch = epoch

    def on_train_batch_start(self):
        """No-op hook."""
        return

    def on_train_batch_end(self, log_dict, step):
        """Record ``step`` and, if batch logging is enabled, log ``log_dict`` every N steps."""
        self.experiment.curr_step = step
        if self.log_batch_metrics and (step % self.comet_log_batch_interval == 0):
            self.log_metrics(log_dict, step=step)

    def on_train_end(self, files, save_dir, last, best, epoch, results):
        """Upload final artifacts of the run and end the experiment.

        Logs the collected prediction metadata (if enabled), the given result
        files, ``results.csv`` and, unless evolving, the best (or else last)
        checkpoint. When run under the Comet Optimizer the optimised metric
        value is looked up in ``results`` and logged as well.
        """
        if self.comet_log_predictions:
            curr_epoch = self.experiment.curr_epoch
            self.experiment.log_asset_data(self.metadata_dict, 'image-metadata.json', epoch=curr_epoch)

        for f in files:
            self.log_asset(f, metadata={'epoch': epoch})
        self.log_asset(f'{save_dir}/results.csv', metadata={'epoch': epoch})

        if not self.opt.evolve:
            model_path = str(best if best.exists() else last)
            name = Path(model_path).name
            if self.save_model:
                self.experiment.log_model(
                    self.model_name,
                    file_or_folder=model_path,
                    file_name=name,
                    overwrite=True,
                )

        # Check if running Experiment with Comet Optimizer
        if hasattr(self.opt, 'comet_optimizer_id'):
            metric = results.get(self.opt.comet_optimizer_metric)
            self.experiment.log_other('optimizer_metric_value', metric)

        self.finish_run()

    def on_val_start(self):
        """No-op hook."""
        return

    def on_val_batch_start(self):
        """No-op hook."""
        return

    def on_val_batch_end(self, batch_i, images, targets, paths, shapes, outputs):
        """Log predictions for every N-th validation batch.

        Images without predictions or without labels are skipped. ``targets``
        rows are selected per image by their first column (the image index
        within the batch).
        """
        if not (self.comet_log_predictions and ((batch_i + 1) % self.comet_log_prediction_interval == 0)):
            return

        for si, pred in enumerate(outputs):
            if len(pred) == 0:
                continue

            image = images[si]
            labels = targets[targets[:, 0] == si, 1:]
            shape = shapes[si]
            path = paths[si]
            predn, labelsn = self.preprocess_prediction(image, labels, shape, pred)
            if labelsn is not None:
                self.log_predictions(image, labelsn, path, shape, predn)

    def on_val_end(self, nt, tp, fp, p, r, f1, ap, ap50, ap_class, confusion_matrix):
        """Log per-class metrics and the confusion matrix, each if enabled."""
        if self.comet_log_per_class_metrics:
            if self.num_classes > 1:
                for i, c in enumerate(ap_class):
                    class_name = self.class_names[c]
                    self.experiment.log_metrics(
                        {
                            'mAP@.5': ap50[i],
                            'mAP@.5:.95': ap[i],
                            'precision': p[i],
                            'recall': r[i],
                            'f1': f1[i],
                            'true_positives': tp[i],
                            'false_positives': fp[i],
                            'support': nt[c]},
                        prefix=class_name)

        if self.comet_log_confusion_matrix:
            epoch = self.experiment.curr_epoch
            class_names = list(self.class_names.values())
            class_names.append('background')
            num_classes = len(class_names)

            self.experiment.log_confusion_matrix(
                matrix=confusion_matrix.matrix,
                max_categories=num_classes,
                labels=class_names,
                epoch=epoch,
                column_label='Actual Category',
                row_label='Predicted Category',
                file_name=f'confusion-matrix-epoch-{epoch}.json',
            )

    def on_fit_epoch_end(self, result, epoch):
        """Log the epoch's result metrics."""
        self.log_metrics(result, epoch=epoch)

    def on_model_save(self, last, epoch, final_epoch, best_fitness, fi):
        """Upload checkpoints every ``save_period`` epochs, except on the final epoch."""
        save_period = self.opt.save_period
        # Check "enabled" before using the period as a modulus: save_period == 0
        # previously raised ZeroDivisionError, and negative values never logged anyway.
        if save_period < 1:
            return
        if (epoch + 1) % save_period == 0 and not final_epoch:
            self.log_model(last.parent, self.opt, epoch, fi, best_model=best_fitness == fi)

    def on_params_update(self, params):
        """Log updated parameters."""
        self.log_parameters(params)

    def finish_run(self):
        """End the Comet experiment."""
        self.experiment.end()
def download_model_checkpoint(opt, experiment):
    """Fetch a checkpoint logged to a Comet experiment and point ``opt.weights`` at it.

    The checkpoint file name is taken from the query part of ``opt.weights``
    (``...?<file name>``); when the query is empty the default checkpoint
    file name is used. The file is written to ``<opt.project>/<experiment.name>/``.

    Args:
        opt: Options namespace; ``project`` and ``weights`` are read and
            ``weights`` is overwritten with the local path on success.
        experiment: Comet API experiment exposing ``name``,
            ``get_model_asset_list`` and ``get_asset``.

    Returns:
        None. Failures (no assets, unknown checkpoint name, download or write
        error) are logged and leave ``opt.weights`` unchanged.
    """
    target_dir = f'{opt.project}/{experiment.name}'
    os.makedirs(target_dir, exist_ok=True)

    model_name = COMET_MODEL_NAME
    assets = experiment.get_model_asset_list(model_name)
    if len(assets) == 0:
        logger.error(f'COMET ERROR: No checkpoints found for model name : {model_name}')
        return

    # Assets are visited from highest to lowest step; on duplicate file names the
    # entry visited last is the one that stays in the map.
    asset_id_by_name = {}
    for asset in sorted(assets, key=lambda entry: entry['step'], reverse=True):
        asset_id_by_name[asset['fileName']] = asset['assetId']

    # An empty query string means "use the default checkpoint".
    checkpoint_filename = urlparse(opt.weights).query or COMET_DEFAULT_CHECKPOINT_FILENAME
    asset_id = asset_id_by_name.get(checkpoint_filename)
    if asset_id is None:
        logger.error(f'COMET ERROR: Checkpoint {checkpoint_filename} not found in the given Experiment')
        return

    try:
        logger.info(f'COMET INFO: Downloading checkpoint {checkpoint_filename}')
        payload = experiment.get_asset(asset_id, return_type='binary', stream=False)
        destination = f'{target_dir}/{checkpoint_filename}'
        with open(destination, 'wb') as fh:
            fh.write(payload)
        opt.weights = destination
    except Exception as e:
        logger.warning('COMET WARNING: Unable to download checkpoint from Comet')
        logger.exception(e)
def check_comet_weights(opt):
    """Download model weights from Comet when ``opt.weights`` is a Comet URL.

    Args:
        opt (argparse.Namespace): Command line arguments passed to the YOLOv5
            training script; ``weights`` is read and may be rewritten by the
            checkpoint download.

    Returns:
        None/bool: ``True`` when ``opt.weights`` was a Comet URL and the
        checkpoint download was attempted (the outcome of the download is
        not inspected); ``None`` when comet_ml is unavailable or the weights
        are not a Comet URL.
    """
    if comet_ml is None:
        return None

    weights = opt.weights
    if not (isinstance(weights, str) and weights.startswith(COMET_PREFIX)):
        return None

    api = comet_ml.API()
    parsed = urlparse(weights)
    # netloc + path spell out the experiment path that follows the comet:// scheme
    experiment = api.get(f'{parsed.netloc}{parsed.path}')
    download_model_checkpoint(opt, experiment)
    return True


def check_comet_resume(opt):
    """Restore run options and checkpoint when ``opt.resume`` is a Comet URL.

    Args:
        opt (argparse.Namespace): Command line arguments passed to the YOLOv5
            training script; updated in place from the logged experiment.

    Returns:
        None/bool: ``True`` when ``opt.resume`` was a Comet URL and the
        restore steps were run; ``None`` when comet_ml is unavailable or
        ``opt.resume`` is not a Comet URL.
    """
    if comet_ml is None:
        return None

    resume = opt.resume
    if not (isinstance(resume, str) and resume.startswith(COMET_PREFIX)):
        return None

    api = comet_ml.API()
    parsed = urlparse(resume)
    experiment = api.get(f'{parsed.netloc}{parsed.path}')
    # Options first, then the checkpoint.
    set_opt_parameters(opt, experiment)
    download_model_checkpoint(opt, experiment)
    return True
'--img-size', type=int, default=640, help='train, val image size (pixels)') parser.add_argument('--rect', action='store_true', help='rectangular training') parser.add_argument('--resume', nargs='?', const=True, default=False, help='resume most recent training') parser.add_argument('--nosave', action='store_true', help='only save final checkpoint') parser.add_argument('--noval', action='store_true', help='only validate final epoch') parser.add_argument('--noautoanchor', action='store_true', help='disable AutoAnchor') parser.add_argument('--noplots', action='store_true', help='save no plot files') parser.add_argument('--evolve', type=int, nargs='?', const=300, help='evolve hyperparameters for x generations') parser.add_argument('--bucket', type=str, default='', help='gsutil bucket') parser.add_argument('--cache', type=str, nargs='?', const='ram', help='--cache images in "ram" (default) or "disk"') parser.add_argument('--image-weights', action='store_true', help='use weighted image selection for training') parser.add_argument('--device', default='', help='cuda device, i.e. 
0 or 0,1,2,3 or cpu') parser.add_argument('--multi-scale', action='store_true', help='vary img-size +/- 50%%') parser.add_argument('--single-cls', action='store_true', help='train multi-class data as single-class') parser.add_argument('--optimizer', type=str, choices=['SGD', 'Adam', 'AdamW'], default='SGD', help='optimizer') parser.add_argument('--sync-bn', action='store_true', help='use SyncBatchNorm, only available in DDP mode') parser.add_argument('--workers', type=int, default=8, help='max dataloader workers (per RANK in DDP mode)') parser.add_argument('--project', default=ROOT / 'runs/train', help='save to project/name') parser.add_argument('--name', default='exp', help='save to project/name') parser.add_argument('--exist-ok', action='store_true', help='existing project/name ok, do not increment') parser.add_argument('--quad', action='store_true', help='quad dataloader') parser.add_argument('--cos-lr', action='store_true', help='cosine LR scheduler') parser.add_argument('--label-smoothing', type=float, default=0.0, help='Label smoothing epsilon') parser.add_argument('--patience', type=int, default=100, help='EarlyStopping patience (epochs without improvement)') parser.add_argument('--freeze', nargs='+', type=int, default=[0], help='Freeze layers: backbone=10, first3=0 1 2') parser.add_argument('--save-period', type=int, default=-1, help='Save checkpoint every x epochs (disabled if < 1)') parser.add_argument('--seed', type=int, default=0, help='Global training seed') parser.add_argument('--local_rank', type=int, default=-1, help='Automatic DDP Multi-GPU argument, do not modify') # Weights & Biases arguments parser.add_argument('--entity', default=None, help='W&B: Entity') parser.add_argument('--upload_dataset', nargs='?', const=True, default=False, help='W&B: Upload data, "val" option') parser.add_argument('--bbox_interval', type=int, default=-1, help='W&B: Set bounding-box image logging interval') parser.add_argument('--artifact_alias', type=str, 
def run(parameters, opt):
    """Run one training job for a single parameter set.

    ``epochs`` and ``batch_size`` are taken out of ``parameters`` and written
    to ``opt`` (``None`` when absent); every other entry is passed to
    ``train`` as the hyperparameter dict. ``opt.save_dir`` is set to a fresh
    run directory below ``opt.project`` / ``opt.name``.
    """
    run_level_keys = ('epochs', 'batch_size')
    hyp_dict = {}
    for key, value in parameters.items():
        if key not in run_level_keys:
            hyp_dict[key] = value

    run_dir = increment_path(Path(opt.project) / opt.name, exist_ok=opt.exist_ok or opt.evolve)
    opt.save_dir = str(run_dir)
    opt.batch_size = parameters.get('batch_size')
    opt.epochs = parameters.get('epochs')

    device = select_device(opt.device, batch_size=opt.batch_size)
    train(hyp_dict, opt, device, callbacks=Callbacks())
================================================ { "algorithm": "random", "parameters": { "anchor_t": { "type": "discrete", "values": [ 2, 8 ] }, "batch_size": { "type": "discrete", "values": [ 16, 32, 64 ] }, "box": { "type": "discrete", "values": [ 0.02, 0.2 ] }, "cls": { "type": "discrete", "values": [ 0.2 ] }, "cls_pw": { "type": "discrete", "values": [ 0.5 ] }, "copy_paste": { "type": "discrete", "values": [ 1 ] }, "degrees": { "type": "discrete", "values": [ 0, 45 ] }, "epochs": { "type": "discrete", "values": [ 5 ] }, "fl_gamma": { "type": "discrete", "values": [ 0 ] }, "fliplr": { "type": "discrete", "values": [ 0 ] }, "flipud": { "type": "discrete", "values": [ 0 ] }, "hsv_h": { "type": "discrete", "values": [ 0 ] }, "hsv_s": { "type": "discrete", "values": [ 0 ] }, "hsv_v": { "type": "discrete", "values": [ 0 ] }, "iou_t": { "type": "discrete", "values": [ 0.7 ] }, "lr0": { "type": "discrete", "values": [ 1e-05, 0.1 ] }, "lrf": { "type": "discrete", "values": [ 0.01, 1 ] }, "mixup": { "type": "discrete", "values": [ 1 ] }, "momentum": { "type": "discrete", "values": [ 0.6 ] }, "mosaic": { "type": "discrete", "values": [ 0 ] }, "obj": { "type": "discrete", "values": [ 0.2 ] }, "obj_pw": { "type": "discrete", "values": [ 0.5 ] }, "optimizer": { "type": "categorical", "values": [ "SGD", "Adam", "AdamW" ] }, "perspective": { "type": "discrete", "values": [ 0 ] }, "scale": { "type": "discrete", "values": [ 0 ] }, "shear": { "type": "discrete", "values": [ 0 ] }, "translate": { "type": "discrete", "values": [ 0 ] }, "warmup_bias_lr": { "type": "discrete", "values": [ 0, 0.2 ] }, "warmup_epochs": { "type": "discrete", "values": [ 5 ] }, "warmup_momentum": { "type": "discrete", "values": [ 0, 0.95 ] }, "weight_decay": { "type": "discrete", "values": [ 0, 0.001 ] } }, "spec": { "maxCombo": 0, "metric": "metrics/mAP_0.5", "objective": "maximize" }, "trials": 1 } ================================================ FILE: yolo-improve/yolov5-AUX/utils/loss.py 
class BCEBlurWithLogitsLoss(nn.Module):
    """BCE-with-logits loss with reduced missing-label effects.

    Each element's BCE loss is scaled by
    ``1 - exp((sigmoid(pred) - true - 1) / (alpha + 1e-4))`` and the mean over
    all elements is returned. The factor approaches 0 as ``sigmoid(pred) - true``
    approaches 1, i.e. for a confident prediction whose label is 0.
    """

    def __init__(self, alpha=0.05):
        super().__init__()
        self.alpha = alpha
        # element-wise loss is required so every element can be re-weighted
        self.loss_fcn = nn.BCEWithLogitsLoss(reduction='none')

    def forward(self, pred, true):
        """Return the mean damped BCE loss for logits ``pred`` and targets ``true``."""
        elementwise = self.loss_fcn(pred, true)
        # signed residual: reduces only missing-label effects
        # (taking its absolute value would also damp false-label effects)
        residual = torch.sigmoid(pred) - true
        damping = 1 - torch.exp((residual - 1) / (self.alpha + 1e-4))
        return (elementwise * damping).mean()
class QFocalLoss(nn.Module):
    """Quality focal loss wrapped around an existing element-wise criterion.

    Usage: ``criteria = QFocalLoss(nn.BCEWithLogitsLoss(), gamma=1.5)``.

    The wrapped criterion's own ``reduction`` is remembered and then switched
    to ``'none'`` so the focal weighting can be applied per element; the
    remembered reduction is applied to the weighted loss afterwards.
    """

    def __init__(self, loss_fcn, gamma=1.5, alpha=0.25):
        super().__init__()
        self.gamma = gamma
        self.alpha = alpha
        self.reduction = loss_fcn.reduction
        loss_fcn.reduction = 'none'  # required to apply FL to each element
        self.loss_fcn = loss_fcn  # must be nn.BCEWithLogitsLoss()

    def forward(self, pred, true):
        """Return the quality-focal-weighted loss for logits ``pred`` and targets ``true``."""
        base = self.loss_fcn(pred, true)

        prob = torch.sigmoid(pred)  # prob from logits
        class_balance = true * self.alpha + (1 - true) * (1 - self.alpha)
        # distance between target and predicted probability, raised to gamma
        quality_gap = torch.abs(true - prob) ** self.gamma
        weighted = base * (class_balance * quality_gap)

        if self.reduction == 'mean':
            return weighted.mean()
        if self.reduction == 'sum':
            return weighted.sum()
        return weighted  # 'none'
device=self.device) # box loss lobj = torch.zeros(1, device=self.device) # object loss tcls, tbox, indices, anchors = self.build_targets(p, targets) # targets # Losses for i, pi in enumerate(p): # layer index, layer predictions b, a, gj, gi = indices[i] # image, anchor, gridy, gridx tobj = torch.zeros(pi.shape[:4], dtype=pi.dtype, device=self.device) # target obj n = b.shape[0] # number of targets if n: # pxy, pwh, _, pcls = pi[b, a, gj, gi].tensor_split((2, 4, 5), dim=1) # faster, requires torch 1.8.0 pxy, pwh, _, pcls = pi[b, a, gj, gi].split((2, 2, 1, self.nc), 1) # target-subset of predictions # Regression pxy = pxy.sigmoid() * 2 - 0.5 pwh = (pwh.sigmoid() * 2) ** 2 * anchors[i] pbox = torch.cat((pxy, pwh), 1) # predicted box iou = bbox_iou(pbox, tbox[i], CIoU=True).squeeze() # iou(prediction, target) lbox += (1.0 - iou).mean() # iou loss # Objectness iou = iou.detach().clamp(0).type(tobj.dtype) if self.sort_obj_iou: j = iou.argsort() b, a, gj, gi, iou = b[j], a[j], gj[j], gi[j], iou[j] if self.gr < 1: iou = (1.0 - self.gr) + self.gr * iou tobj[b, a, gj, gi] = iou # iou ratio # Classification if self.nc > 1: # cls loss (only if multiple classes) t = torch.full_like(pcls, self.cn, device=self.device) # targets t[range(n), tcls[i]] = self.cp lcls += self.BCEcls(pcls, t) # BCE # Append targets to text file # with open('targets.txt', 'a') as file: # [file.write('%11.5g ' * 4 % tuple(x) + '\n') for x in torch.cat((txy[i], twh[i]), 1)] obji = self.BCEobj(pi[..., 4], tobj) lobj += obji * self.balance[i] # obj loss if self.autobalance: self.balance[i] = self.balance[i] * 0.9999 + 0.0001 / obji.detach().item() if self.autobalance: self.balance = [x / self.balance[self.ssi] for x in self.balance] lbox *= self.hyp['box'] lobj *= self.hyp['obj'] lcls *= self.hyp['cls'] bs = tobj.shape[0] # batch size return (lbox + lobj + lcls) * bs, torch.cat((lbox, lobj, lcls)).detach() def build_targets(self, p, targets): # Build targets for compute_loss(), input 
class ComputeLossAuxOTA:
    """Detection loss for a model with lead and auxiliary heads, using cost-based target assignment.

    ``__call__`` expects ``p`` to hold ``2 * nl`` prediction tensors: ``p[:nl]`` are the lead-head outputs
    and ``p[nl:]`` the auxiliary-head outputs. Both sets of targets are assigned from the lead-head
    predictions; auxiliary loss terms are weighted by 0.25.
    """

    def __init__(self, model, autobalance=False):
        """Read hyperparameters and Detect() attributes from ``model`` and build the BCE criteria."""
        super().__init__()
        device = next(model.parameters()).device  # get model device
        h = model.hyp  # hyperparameters

        # Define criteria
        BCEcls = nn.BCEWithLogitsLoss(pos_weight=torch.tensor([h['cls_pw']], device=device))
        BCEobj = nn.BCEWithLogitsLoss(pos_weight=torch.tensor([h['obj_pw']], device=device))

        # Class label smoothing https://arxiv.org/pdf/1902.04103.pdf eqn 3
        self.cp, self.cn = smooth_BCE(eps=h.get('label_smoothing', 0.0))  # positive, negative BCE targets

        # Focal loss
        g = h['fl_gamma']  # focal loss gamma
        if g > 0:
            BCEcls, BCEobj = FocalLoss(BCEcls, g), FocalLoss(BCEobj, g)

        det = de_parallel(model).model[-1]  # Detect() module
        self.balance = {3: [4.0, 1.0, 0.4]}.get(det.nl, [4.0, 1.0, 0.25, 0.06, .02])  # P3-P7
        self.ssi = list(det.stride).index(16) if autobalance else 0  # stride 16 index
        self.BCEcls, self.BCEobj, self.gr, self.hyp, self.autobalance = BCEcls, BCEobj, 1.0, h, autobalance
        for k in 'na', 'nc', 'nl', 'anchors', 'stride':
            setattr(self, k, getattr(det, k))

    def __call__(self, p, targets, imgs):
        """Compute the total loss.

        Args:
            p: list of ``2 * nl`` prediction tensors (lead heads first, then auxiliary heads).
            targets: tensor of rows ``(image, class, x, y, w, h)``.
            imgs: the image batch; only ``imgs[k].shape[1]`` is read, to scale targets to pixels.

        Returns:
            ``(loss * batch_size, torch.cat((lbox, lobj, lcls)).detach())``.
        """
        device = targets.device
        lcls = torch.zeros(1, device=device)
        lbox = torch.zeros(1, device=device)
        lobj = torch.zeros(1, device=device)

        lead = p[:self.nl]
        bs_aux, as_aux, gjs_aux, gis_aux, targets_aux, anchors_aux = self.build_targets2(lead, targets, imgs)
        bs, as_, gjs, gis, targets, anchors = self.build_targets(lead, targets, imgs)
        # Grid-size gains (w, h, w, h) per layer; the same gains are used for the auxiliary heads.
        gains = [torch.tensor(pp.shape, device=device)[[3, 2, 3, 2]] for pp in lead]

        # Losses
        for i in range(self.nl):
            pi, pi_aux = p[i], p[i + self.nl]
            box_i, cls_i, tobj = self._head_loss(
                pi, bs[i], as_[i], gjs[i], gis[i], targets[i], anchors[i], gains[i], device)
            box_a, cls_a, tobj_aux = self._head_loss(
                pi_aux, bs_aux[i], as_aux[i], gjs_aux[i], gis_aux[i], targets_aux[i], anchors_aux[i], gains[i],
                device)

            lbox += box_i
            lbox += 0.25 * box_a
            lcls += cls_i
            lcls += 0.25 * cls_a

            obji = self.BCEobj(pi[..., 4], tobj)
            obji_aux = self.BCEobj(pi_aux[..., 4], tobj_aux)
            lobj += obji * self.balance[i] + 0.25 * obji_aux * self.balance[i]  # obj loss
            if self.autobalance:
                self.balance[i] = self.balance[i] * 0.9999 + 0.0001 / obji.detach().item()

        if self.autobalance:
            self.balance = [x / self.balance[self.ssi] for x in self.balance]
        lbox *= self.hyp['box']
        lobj *= self.hyp['obj']
        lcls *= self.hyp['cls']
        bs = tobj.shape[0]  # batch size

        loss = lbox + lobj + lcls
        return loss * bs, torch.cat((lbox, lobj, lcls)).detach()

    def _head_loss(self, pi, b, a, gj, gi, layer_targets, layer_anchors, gain, device):
        """Return ``(box loss, class loss, objectness target)`` for one prediction layer of one head."""
        tobj = torch.zeros_like(pi[..., 0], device=device)  # target obj
        lbox = torch.zeros(1, device=device)
        lcls = torch.zeros(1, device=device)
        n = b.shape[0]  # number of targets
        if n:
            ps = pi[b, a, gj, gi]  # prediction subset corresponding to targets

            # Regression
            grid = torch.stack([gi, gj], dim=1)
            pxy = ps[:, :2].sigmoid() * 2. - 0.5
            pwh = (ps[:, 2:4].sigmoid() * 2) ** 2 * layer_anchors
            pbox = torch.cat((pxy, pwh), 1)  # predicted box
            tbox = layer_targets[:, 2:6] * gain
            tbox[:, :2] -= grid  # make the target centre relative to its assigned cell
            # NOTE(review): the chunk-based bbox_iou in utils/metrics.py returns (n, 1); squeeze the last
            # dim so the result can be written into tobj[b, a, gj, gi] (shape (n,)), as ComputeLoss does.
            iou = bbox_iou(pbox, tbox, CIoU=True).squeeze(-1)  # iou(prediction, target)
            lbox = lbox + (1.0 - iou).mean()  # iou loss

            # Objectness
            tobj[b, a, gj, gi] = (1.0 - self.gr) + self.gr * iou.detach().clamp(0).type(tobj.dtype)  # iou ratio

            # Classification
            if self.nc > 1:  # cls loss (only if multiple classes)
                t = torch.full_like(ps[:, 5:], self.cn, device=device)  # targets
                t[range(n), layer_targets[:, 1].long()] = self.cp
                lcls = lcls + self.BCEcls(ps[:, 5:], t)  # BCE
        return lbox, lcls, tobj

    def build_targets(self, p, targets, imgs):
        """Assign targets for the lead heads, starting from the ``find_3_positive`` candidates."""
        return self._assign_targets(p, targets, imgs, self.find_3_positive)

    def build_targets2(self, p, targets, imgs):
        """Assign targets for the auxiliary heads, starting from the ``find_5_positive`` candidates."""
        return self._assign_targets(p, targets, imgs, self.find_5_positive)

    def _assign_targets(self, p, targets, imgs, find_positive):
        """Select, per image, the candidate predictions with the lowest matching cost for each target.

        Candidates come from ``find_positive``. For every image a cost matrix (class BCE + 3 * -log(IoU))
        is built between targets and candidate predictions; each target keeps its ``dynamic_k`` cheapest
        candidates, and a candidate claimed by several targets goes to the cheapest one.

        Returns:
            Tuple of six lists (one entry per layer): image indices, anchor indices, grid y, grid x,
            matched target rows and matched anchors. Layers without matches hold empty int64 tensors on
            ``targets.device``.
        """
        device = targets.device
        indices, anch = find_positive(p, targets)
        nl = len(p)
        # matched[k][i]: per-image pieces of field k for layer i, in the order
        # (image idx, anchor idx, grid y, grid x, target rows, anchors).
        matched = [[[] for _ in range(nl)] for _ in range(6)]

        for batch_idx in range(p[0].shape[0]):
            this_target = targets[targets[:, 0] == batch_idx]
            if this_target.shape[0] == 0:
                continue

            # NOTE(review): x/y/w/h are all scaled by imgs[batch_idx].shape[1] — presumably square inputs.
            txywh = this_target[:, 2:6] * imgs[batch_idx].shape[1]
            txyxy = xywh2xyxy(txywh)

            pxyxys, p_cls, p_obj, from_which_layer = [], [], [], []
            all_b, all_a, all_gj, all_gi, all_anch = [], [], [], [], []

            for i, pi in enumerate(p):
                b, a, gj, gi = indices[i]
                idx = b == batch_idx
                b, a, gj, gi = b[idx], a[idx], gj[idx], gi[idx]
                all_b.append(b)
                all_a.append(a)
                all_gj.append(gj)
                all_gi.append(gi)
                all_anch.append(anch[i][idx])
                from_which_layer.append(torch.full((len(b),), float(i), device=device))

                fg_pred = pi[b, a, gj, gi]
                p_obj.append(fg_pred[:, 4:5])
                p_cls.append(fg_pred[:, 5:])

                grid = torch.stack([gi, gj], dim=1)
                pxy = (fg_pred[:, :2].sigmoid() * 2. - 0.5 + grid) * self.stride[i]
                pwh = (fg_pred[:, 2:4].sigmoid() * 2) ** 2 * anch[i][idx] * self.stride[i]
                pxywh = torch.cat([pxy, pwh], dim=-1)
                pxyxys.append(xywh2xyxy(pxywh))

            pxyxys = torch.cat(pxyxys, dim=0)
            if pxyxys.shape[0] == 0:
                continue
            p_obj = torch.cat(p_obj, dim=0)
            p_cls = torch.cat(p_cls, dim=0)
            from_which_layer = torch.cat(from_which_layer, dim=0)
            all_b = torch.cat(all_b, dim=0)
            all_a = torch.cat(all_a, dim=0)
            all_gj = torch.cat(all_gj, dim=0)
            all_gi = torch.cat(all_gi, dim=0)
            all_anch = torch.cat(all_anch, dim=0)

            pair_wise_iou = box_iou(txyxy, pxyxys)
            pair_wise_iou_loss = -torch.log(pair_wise_iou + 1e-8)

            # dynamic_k: sum of each target's top IoUs (at most 20), truncated to int, at least 1
            top_k, _ = torch.topk(pair_wise_iou, min(20, pair_wise_iou.shape[1]), dim=1)
            dynamic_ks = torch.clamp(top_k.sum(1).int(), min=1)

            gt_cls_per_image = (
                F.one_hot(this_target[:, 1].to(torch.int64), self.nc)
                .float()
                .unsqueeze(1)
                .repeat(1, pxyxys.shape[0], 1)
            )

            num_gt = this_target.shape[0]
            cls_preds_ = (
                p_cls.float().unsqueeze(0).repeat(num_gt, 1, 1).sigmoid_()
                * p_obj.unsqueeze(0).repeat(num_gt, 1, 1).sigmoid_()
            )

            y = cls_preds_.sqrt_()
            pair_wise_cls_loss = F.binary_cross_entropy_with_logits(
                torch.log(y / (1 - y)), gt_cls_per_image, reduction="none"
            ).sum(-1)
            del cls_preds_

            cost = pair_wise_cls_loss + 3.0 * pair_wise_iou_loss

            matching_matrix = torch.zeros_like(cost)
            for gt_idx in range(num_gt):
                _, pos_idx = torch.topk(cost[gt_idx], k=dynamic_ks[gt_idx].item(), largest=False)
                matching_matrix[gt_idx][pos_idx] = 1.0
            del top_k, dynamic_ks

            # A candidate matched to more than one target is kept only for its lowest-cost target.
            anchor_matching_gt = matching_matrix.sum(0)
            if (anchor_matching_gt > 1).sum() > 0:
                _, cost_argmin = torch.min(cost[:, anchor_matching_gt > 1], dim=0)
                matching_matrix[:, anchor_matching_gt > 1] *= 0.0
                matching_matrix[cost_argmin, anchor_matching_gt > 1] = 1.0
            fg_mask_inboxes = matching_matrix.sum(0) > 0.0
            matched_gt_inds = matching_matrix[:, fg_mask_inboxes].argmax(0)

            from_which_layer = from_which_layer[fg_mask_inboxes]
            per_image = (
                all_b[fg_mask_inboxes],
                all_a[fg_mask_inboxes],
                all_gj[fg_mask_inboxes],
                all_gi[fg_mask_inboxes],
                this_target[matched_gt_inds],
                all_anch[fg_mask_inboxes],
            )

            for i in range(nl):
                layer_idx = from_which_layer == i
                for store, values in zip(matched, per_image):
                    store[i].append(values[layer_idx])

        for store in matched:
            for i in range(nl):
                if store[i]:
                    store[i] = torch.cat(store[i], dim=0)
                else:
                    # Empty placeholder on the targets' device (was hard-coded to 'cuda:0').
                    store[i] = torch.tensor([], device=device, dtype=torch.int64)

        return tuple(matched)

    def find_5_positive(self, p, targets):
        """Candidate cells with offset bias ``g = 1.0``.

        With ``g = 1.0`` the ``% 1 < g`` test always holds, so every in-range 4-neighbour of the target's
        cell is added alongside the cell itself.
        """
        return self._find_positive(p, targets, 1.0)

    def find_3_positive(self, p, targets):
        """Candidate cells with offset bias ``g = 0.5``.

        A neighbouring cell is added only when the target centre lies within 0.5 of that side of its cell
        (typically the cell itself plus two neighbours).
        """
        return self._find_positive(p, targets, 0.5)

    def _find_positive(self, p, targets, g):
        """Build candidate (image, anchor, grid y, grid x) indices per layer.

        Args:
            p: list of per-layer prediction tensors; dims 2 and 3 of each give the grid height and width.
            targets: tensor of rows ``(image, class, x, y, w, h)``, multiplied here by the grid size.
            g: offset bias controlling which neighbouring cells are added.

        Returns:
            ``(indices, anch)``: per layer, a tuple ``(b, a, gj, gi)`` of index tensors and the matching
            anchor sizes ``anchors[a]``.
        """
        na, nt = self.na, targets.shape[0]  # number of anchors, targets
        indices, anch = [], []
        gain = torch.ones(7, device=targets.device).long()  # normalized to gridspace gain
        ai = torch.arange(na, device=targets.device).float().view(na, 1).repeat(1, nt)
        targets = torch.cat((targets.repeat(na, 1, 1), ai[:, :, None]), 2)  # append anchor indices

        off = torch.tensor(
            [[0, 0], [1, 0], [0, 1], [-1, 0], [0, -1]],  # centre, j, k, l, m
            device=targets.device).float() * g  # offsets

        for i in range(self.nl):
            anchors, shape = self.anchors[i], p[i].shape
            gain[2:6] = torch.tensor(shape)[[3, 2, 3, 2]]  # xyxy gain

            # Match targets to anchors
            t = targets * gain
            if nt:
                # Keep (anchor, target) pairs whose width/height ratio is below anchor_t in both directions
                r = t[:, :, 4:6] / anchors[:, None]  # wh ratio
                j = torch.max(r, 1. / r).max(2)[0] < self.hyp['anchor_t']  # compare
                t = t[j]  # filter

                # Offsets
                gxy = t[:, 2:4]  # grid xy
                gxi = gain[[2, 3]] - gxy  # inverse
                j, k = ((gxy % 1. < g) & (gxy > 1.)).T
                l, m = ((gxi % 1. < g) & (gxi > 1.)).T
                j = torch.stack((torch.ones_like(j), j, k, l, m))
                t = t.repeat((5, 1, 1))[j]
                offsets = (torch.zeros_like(gxy)[None] + off[:, None])[j]
            else:
                t = targets[0]
                offsets = 0

            # Define
            b = t[:, 0].long()  # image
            gxy = t[:, 2:4]  # grid xy
            gij = (gxy - offsets).long()
            gi, gj = gij.T  # grid xy indices
            a = t[:, 6].long()  # anchor indices

            # Append, clamping the cells to the layer grid
            indices.append((b, a, gj.clamp_(0, shape[2] - 1), gi.clamp_(0, shape[3] - 1)))
            anch.append(anchors[a])  # anchors

        return indices, anch
conf: Objectness value from 0-1 (nparray). pred_cls: Predicted object classes (nparray). target_cls: True object classes (nparray). plot: Plot precision-recall curve at mAP@0.5 save_dir: Plot save directory # Returns The average precision as computed in py-faster-rcnn. """ # Sort by objectness i = np.argsort(-conf) tp, conf, pred_cls = tp[i], conf[i], pred_cls[i] # Find unique classes unique_classes, nt = np.unique(target_cls, return_counts=True) nc = unique_classes.shape[0] # number of classes, number of detections # Create Precision-Recall curve and compute AP for each class px, py = np.linspace(0, 1, 1000), [] # for plotting ap, p, r = np.zeros((nc, tp.shape[1])), np.zeros((nc, 1000)), np.zeros((nc, 1000)) for ci, c in enumerate(unique_classes): i = pred_cls == c n_l = nt[ci] # number of labels n_p = i.sum() # number of predictions if n_p == 0 or n_l == 0: continue # Accumulate FPs and TPs fpc = (1 - tp[i]).cumsum(0) tpc = tp[i].cumsum(0) # Recall recall = tpc / (n_l + eps) # recall curve r[ci] = np.interp(-px, -conf[i], recall[:, 0], left=0) # negative x, xp because xp decreases # Precision precision = tpc / (tpc + fpc) # precision curve p[ci] = np.interp(-px, -conf[i], precision[:, 0], left=1) # p at pr_score # AP from recall-precision curve for j in range(tp.shape[1]): ap[ci, j], mpre, mrec = compute_ap(recall[:, j], precision[:, j]) if plot and j == 0: py.append(np.interp(px, mrec, mpre)) # precision at mAP@0.5 # Compute F1 (harmonic mean of precision and recall) f1 = 2 * p * r / (p + r + eps) names = [v for k, v in names.items() if k in unique_classes] # list: only classes that have data names = dict(enumerate(names)) # to dict if plot: plot_pr_curve(px, py, ap, Path(save_dir) / f'{prefix}PR_curve.png', names) plot_mc_curve(px, f1, Path(save_dir) / f'{prefix}F1_curve.png', names, ylabel='F1') plot_mc_curve(px, p, Path(save_dir) / f'{prefix}P_curve.png', names, ylabel='Precision') plot_mc_curve(px, r, Path(save_dir) / f'{prefix}R_curve.png', names, 
def compute_ap(recall, precision, method='interp'):
    """Compute the average precision, given the recall and precision curves.

    # Arguments
        recall:    The recall curve (list)
        precision: The precision curve (list)
        method:    Integration method. 'interp' (default) integrates the precision envelope sampled at
                   101 evenly spaced recall points (COCO style); 'continuous' sums the area of the
                   envelope over the points where recall changes.
    # Returns
        Average precision, precision curve, recall curve
    # Raises
        ValueError: if ``method`` is neither 'interp' nor 'continuous'.
    """
    # Append sentinel values to beginning and end
    mrec = np.concatenate(([0.0], recall, [1.0]))
    mpre = np.concatenate(([1.0], precision, [0.0]))

    # Compute the precision envelope (running maximum taken from the right)
    mpre = np.flip(np.maximum.accumulate(np.flip(mpre)))

    # Integrate area under curve
    if method == 'interp':
        x = np.linspace(0, 1, 101)  # 101-point interp (COCO)
        # np.trapz is deprecated in NumPy 2 in favour of np.trapezoid; use whichever is available.
        trapezoid = getattr(np, 'trapezoid', None) or np.trapz
        ap = trapezoid(np.interp(x, mrec, mpre), x)  # integrate
    elif method == 'continuous':
        i = np.where(mrec[1:] != mrec[:-1])[0]  # points where x axis (recall) changes
        ap = np.sum((mrec[i + 1] - mrec[i]) * mpre[i + 1])  # area under curve
    else:
        raise ValueError(f"method must be 'interp' or 'continuous', got {method!r}")

    return ap, mpre, mrec
Arguments: detections (Array[N, 6]), x1, y1, x2, y2, conf, class labels (Array[M, 5]), class, x1, y1, x2, y2 Returns: None, updates confusion matrix accordingly """ if detections is None: gt_classes = labels.int() for gc in gt_classes: self.matrix[self.nc, gc] += 1 # background FN return detections = detections[detections[:, 4] > self.conf] gt_classes = labels[:, 0].int() detection_classes = detections[:, 5].int() iou = box_iou(labels[:, 1:], detections[:, :4]) x = torch.where(iou > self.iou_thres) if x[0].shape[0]: matches = torch.cat((torch.stack(x, 1), iou[x[0], x[1]][:, None]), 1).cpu().numpy() if x[0].shape[0] > 1: matches = matches[matches[:, 2].argsort()[::-1]] matches = matches[np.unique(matches[:, 1], return_index=True)[1]] matches = matches[matches[:, 2].argsort()[::-1]] matches = matches[np.unique(matches[:, 0], return_index=True)[1]] else: matches = np.zeros((0, 3)) n = matches.shape[0] > 0 m0, m1, _ = matches.transpose().astype(int) for i, gc in enumerate(gt_classes): j = m0 == i if n and sum(j) == 1: self.matrix[detection_classes[m1[j]], gc] += 1 # correct else: self.matrix[self.nc, gc] += 1 # true background if n: for i, dc in enumerate(detection_classes): if not any(m1 == i): self.matrix[dc, self.nc] += 1 # predicted background def tp_fp(self): tp = self.matrix.diagonal() # true positives fp = self.matrix.sum(1) - tp # false positives # fn = self.matrix.sum(0) - tp # false negatives (missed detections) return tp[:-1], fp[:-1] # remove background class @TryExcept('WARNING ⚠️ ConfusionMatrix plot failure') def plot(self, normalize=True, save_dir='', names=()): import seaborn as sn array = self.matrix / ((self.matrix.sum(0).reshape(1, -1) + 1E-9) if normalize else 1) # normalize columns array[array < 0.005] = np.nan # don't annotate (would appear as 0.00) fig, ax = plt.subplots(1, 1, figsize=(12, 9), tight_layout=True) nc, nn = self.nc, len(names) # number of classes, names sn.set(font_scale=1.0 if nc < 50 else 0.8) # for label size labels = (0 < nn < 
def bbox_iou(box1, box2, xywh=True, GIoU=False, DIoU=False, CIoU=False, eps=1e-7):
    """Return the IoU (or GIoU / DIoU / CIoU variant) between box1 and box2.

    Boxes are split into four components along the last dimension, so the result keeps a trailing
    dimension of size 1. With ``xywh=True`` boxes are (centre x, centre y, width, height); otherwise
    they are (x1, y1, x2, y2). When several variant flags are set, CIoU takes precedence over DIoU,
    which takes precedence over GIoU.
    """
    # Corner coordinates, widths and heights of both boxes
    if not xywh:
        b1_x1, b1_y1, b1_x2, b1_y2 = box1.chunk(4, -1)
        b2_x1, b2_y1, b2_x2, b2_y2 = box2.chunk(4, -1)
        w1 = b1_x2 - b1_x1
        h1 = (b1_y2 - b1_y1).clamp(eps)
        w2 = b2_x2 - b2_x1
        h2 = (b2_y2 - b2_y1).clamp(eps)
    else:
        cx1, cy1, w1, h1 = box1.chunk(4, -1)
        cx2, cy2, w2, h2 = box2.chunk(4, -1)
        half_w1, half_h1 = w1 / 2, h1 / 2
        half_w2, half_h2 = w2 / 2, h2 / 2
        b1_x1, b1_x2 = cx1 - half_w1, cx1 + half_w1
        b1_y1, b1_y2 = cy1 - half_h1, cy1 + half_h1
        b2_x1, b2_x2 = cx2 - half_w2, cx2 + half_w2
        b2_y1, b2_y2 = cy2 - half_h2, cy2 + half_h2

    # Intersection and union areas
    overlap_w = (torch.minimum(b1_x2, b2_x2) - torch.maximum(b1_x1, b2_x1)).clamp(0)
    overlap_h = (torch.minimum(b1_y2, b2_y2) - torch.maximum(b1_y1, b2_y1)).clamp(0)
    inter = overlap_w * overlap_h
    union = w1 * h1 + w2 * h2 - inter + eps

    iou = inter / union
    if not (CIoU or DIoU or GIoU):
        return iou  # IoU

    # Smallest enclosing (convex) box
    cw = torch.maximum(b1_x2, b2_x2) - torch.minimum(b1_x1, b2_x1)  # convex width
    ch = torch.maximum(b1_y2, b2_y2) - torch.minimum(b1_y1, b2_y1)  # convex height

    if not (CIoU or DIoU):
        c_area = cw * ch + eps  # convex area
        return iou - (c_area - union) / c_area  # GIoU https://arxiv.org/pdf/1902.09630.pdf

    # Distance or Complete IoU https://arxiv.org/abs/1911.08287v1
    c2 = cw ** 2 + ch ** 2 + eps  # convex diagonal squared
    rho2 = ((b2_x1 + b2_x2 - b1_x1 - b1_x2) ** 2 + (b2_y1 + b2_y2 - b1_y1 - b1_y2) ** 2) / 4  # center dist ** 2
    if not CIoU:
        return iou - rho2 / c2  # DIoU

    # Aspect-ratio consistency term; alpha is computed without gradient tracking
    v = (4 / math.pi ** 2) * (torch.atan(w2 / h2) - torch.atan(w1 / h1)).pow(2)
    with torch.no_grad():
        alpha = v / (v - iou + (1 + eps))
    return iou - (rho2 / c2 + v * alpha)  # CIoU
@threaded
def plot_pr_curve(px, py, ap, save_dir=Path('pr_curve.png'), names=()):
    """Draw the precision-recall curve and save it to ``save_dir``.

    ``py`` is a sequence of per-class precision curves sampled at the recall values ``px``; ``ap`` is
    indexed as ``ap[class, 0]`` for the legend values. A per-class legend is drawn only when between
    1 and 20 names are given; otherwise all class curves are drawn in grey.
    """
    fig, ax = plt.subplots(1, 1, figsize=(9, 6), tight_layout=True)
    curves = np.stack(py, axis=1)  # one column per class

    show_legend = 0 < len(names) < 21  # display per-class legend if < 21 classes
    if show_legend:
        for cls_idx, curve in enumerate(curves.T):
            ax.plot(px, curve, linewidth=1, label=f'{names[cls_idx]} {ap[cls_idx, 0]:.3f}')  # plot(recall, precision)
    else:
        ax.plot(px, curves, linewidth=1, color='grey')  # plot(recall, precision)

    # Mean curve over all classes
    mean_label = 'all classes %.3f mAP@0.5' % ap[:, 0].mean()
    ax.plot(px, curves.mean(1), linewidth=3, color='blue', label=mean_label)

    ax.set_xlabel('Recall')
    ax.set_ylabel('Precision')
    ax.set_xlim(0, 1)
    ax.set_ylim(0, 1)
    ax.legend(bbox_to_anchor=(1.04, 1), loc='upper left')
    ax.set_title('Precision-Recall Curve')
    fig.savefig(save_dir, dpi=250)
    plt.close(fig)
class Colors:
    """Fixed 20-colour palette (Ultralytics, https://ultralytics.com/) addressed by integer index."""

    def __init__(self):
        """Convert the hex palette to RGB tuples and record its length in ``self.n``."""
        hex_codes = ('FF3838', 'FF9D97', 'FF701F', 'FFB21D', 'CFD231', '48F90A', '92CC17', '3DDB86', '1A9334',
                     '00D4BB', '2C99A8', '00C2FF', '344593', '6473FF', '0018EC', '8438FF', '520085', 'CB38FF',
                     'FF95C8', 'FF37C7')
        self.palette = []
        for code in hex_codes:
            self.palette.append(self.hex2rgb(f'#{code}'))
        self.n = len(self.palette)

    def __call__(self, i, bgr=False):
        """Return palette colour ``int(i) % n`` as an RGB tuple, or as BGR when ``bgr`` is true."""
        rgb = self.palette[int(i) % self.n]
        if not bgr:
            return rgb
        red, green, blue = rgb[0], rgb[1], rgb[2]
        return (blue, green, red)

    @staticmethod
    def hex2rgb(h):
        """Convert a ``'#RRGGBB'`` string to an ``(r, g, b)`` tuple of ints (rgb order, as PIL uses)."""
        channels = (h[1:3], h[3:5], h[5:7])
        return tuple(int(pair, 16) for pair in channels)
class Annotator:
    """
    YOLOv5 Annotator for train/val mosaics and jpgs and detect/hub inference annotations.

    Draws either through PIL (``self.pil`` is True: ``self.im`` is a PIL image, ``self.draw`` an
    ``ImageDraw`` and ``self.font`` a PIL font) or through cv2 (``self.im`` is the array passed in).
    """

    def __init__(self, im, line_width=None, font_size=None, font='Arial.ttf', pil=False, example='abc'):
        """
        im: image to annotate (must expose a contiguous ``.data`` buffer).
        line_width: box line width; derived from the image size when falsy.
        font_size: PIL font size; derived from the image size when falsy.
        font: font file used in PIL mode for ASCII labels.
        pil: force PIL drawing; PIL is also forced when ``example`` contains non-ASCII characters.
        example: sample label text used only to decide whether a unicode font is needed.
        """
        assert im.data.contiguous, 'Image not contiguous. Apply np.ascontiguousarray(im) to Annotator() input images.'
        non_ascii = not is_ascii(example)  # non-latin labels, i.e. asian, arabic, cyrillic
        self.pil = pil or non_ascii
        if self.pil:  # use PIL
            self.im = im if isinstance(im, Image.Image) else Image.fromarray(im)
            self.draw = ImageDraw.Draw(self.im)
            self.font = check_pil_font(font='Arial.Unicode.ttf' if non_ascii else font,
                                       size=font_size or max(round(sum(self.im.size) / 2 * 0.035), 12))
        else:  # use cv2
            self.im = im
        # set for both back-ends: the PIL branch of box_label() also reads self.lw
        self.lw = line_width or max(round(sum(im.shape) / 2 * 0.003), 2)  # line width

    def _text_size(self, text):
        """Return (width, height) of ``text`` rendered with ``self.font``.

        ``getsize`` was deprecated in Pillow 9.2.0 and removed in 10.0.0, so ``getbbox`` is preferred
        and ``getsize`` is only used for font objects that do not provide ``getbbox``.
        """
        if hasattr(self.font, 'getbbox'):
            _, _, w, h = self.font.getbbox(text)
            return w, h
        return self.font.getsize(text)

    def box_label(self, box, label='', color=(128, 128, 128), txt_color=(255, 255, 255)):
        """Add one xyxy box to the image, with an optional filled label tag."""
        if self.pil or not is_ascii(label):
            self.draw.rectangle(box, width=self.lw, outline=color)  # box
            if label:
                w, h = self._text_size(label)  # text width, height
                outside = box[1] - h >= 0  # label fits outside box
                self.draw.rectangle(
                    (box[0], box[1] - h if outside else box[1], box[0] + w + 1,
                     box[1] + 1 if outside else box[1] + h + 1),
                    fill=color,
                )
                # self.draw.text((box[0], box[1]), label, fill=txt_color, font=self.font, anchor='ls')  # for PIL>8.0
                self.draw.text((box[0], box[1] - h if outside else box[1]), label, fill=txt_color, font=self.font)
        else:  # cv2
            p1, p2 = (int(box[0]), int(box[1])), (int(box[2]), int(box[3]))
            cv2.rectangle(self.im, p1, p2, color, thickness=self.lw, lineType=cv2.LINE_AA)
            if label:
                tf = max(self.lw - 1, 1)  # font thickness
                w, h = cv2.getTextSize(label, 0, fontScale=self.lw / 3, thickness=tf)[0]  # text width, height
                outside = p1[1] - h >= 3
                p2 = p1[0] + w, p1[1] - h - 3 if outside else p1[1] + h + 3
                cv2.rectangle(self.im, p1, p2, color, -1, cv2.LINE_AA)  # filled
                cv2.putText(self.im,
                            label, (p1[0], p1[1] - 2 if outside else p1[1] + h + 2),
                            0,
                            self.lw / 3,
                            txt_color,
                            thickness=tf,
                            lineType=cv2.LINE_AA)

    def masks(self, masks, colors, im_gpu, alpha=0.5, retina_masks=False):
        """Plot masks at once.
        Args:
            masks (tensor): predicted masks on cuda, shape: [n, h, w]
            colors (List[List[Int]]): colors for predicted masks, [[r, g, b] * n]
            im_gpu (tensor): img is in cuda, shape: [3, h, w], range: [0, 1]
            alpha (float): mask transparency: 0.0 fully transparent, 1.0 opaque
            retina_masks (bool): when True the blended image is written as-is, otherwise it is passed
                through scale_image() to match ``self.im``'s shape first
        """
        if self.pil:
            # convert to numpy first
            self.im = np.asarray(self.im).copy()
        if len(masks) == 0:
            self.im[:] = im_gpu.permute(1, 2, 0).contiguous().cpu().numpy() * 255
            # Nothing to blend: stop here, otherwise inv_alph_masks[-1] below indexes an empty tensor.
            if self.pil:
                self.fromarray(self.im)
            return
        colors = torch.tensor(colors, device=im_gpu.device, dtype=torch.float32) / 255.0
        colors = colors[:, None, None]  # shape(n,1,1,3)
        masks = masks.unsqueeze(3)  # shape(n,h,w,1)
        masks_color = masks * (colors * alpha)  # shape(n,h,w,3)

        inv_alph_masks = (1 - masks * alpha).cumprod(0)  # shape(n,h,w,1)
        mcs = (masks_color * inv_alph_masks).sum(0) * 2  # mask color summand shape(n,h,w,3)

        im_gpu = im_gpu.flip(dims=[0])  # flip channel
        im_gpu = im_gpu.permute(1, 2, 0).contiguous()  # shape(h,w,3)
        im_gpu = im_gpu * inv_alph_masks[-1] + mcs
        im_mask = (im_gpu * 255).byte().cpu().numpy()
        self.im[:] = im_mask if retina_masks else scale_image(im_gpu.shape, im_mask, self.im.shape)
        if self.pil:
            # convert im back to PIL and update draw
            self.fromarray(self.im)

    def rectangle(self, xy, fill=None, outline=None, width=1):
        """Add rectangle to image (PIL-only)."""
        self.draw.rectangle(xy, fill, outline, width)

    def text(self, xy, text, txt_color=(255, 255, 255), anchor='top'):
        """Add text to image (PIL-only).

        xy: (x, y) position; any indexable pair (list or tuple) is accepted and is never modified.
        anchor: 'bottom' shifts y up by the text height so that ``xy`` marks the font bottom.
        """
        if anchor == 'bottom':  # start y from font bottom
            _, h = self._text_size(text)  # text width, height
            # build a new position instead of 'xy[1] += ...', which failed on tuples and
            # silently changed the caller's list
            xy = (xy[0], xy[1] + 1 - h)
        self.draw.text(xy, text, fill=txt_color, font=self.font)

    def fromarray(self, im):
        """Update self.im (and the matching ImageDraw) from a numpy array or PIL image."""
        self.im = im if isinstance(im, Image.Image) else Image.fromarray(im)
        self.draw = ImageDraw.Draw(self.im)

    def result(self):
        """Return annotated image as array."""
        return np.asarray(self.im)
def hist2d(x, y, n=100):
    """
    2d histogram used in labels.png and evolve.png.

    Bins the (x, y) points on an n-edge grid spanning each array's min..max and returns, for every
    input point, the natural log of the count of the bin that point falls into.
    """
    x_edges = np.linspace(x.min(), x.max(), n)
    y_edges = np.linspace(y.min(), y.max(), n)
    counts, x_edges, y_edges = np.histogram2d(x, y, (x_edges, y_edges))

    # digitize() is 1-based and puts the maximum past the last bin, hence the shift and the clip
    last_row, last_col = counts.shape[0] - 1, counts.shape[1] - 1
    rows = np.clip(np.digitize(x, x_edges) - 1, 0, last_row)
    cols = np.clip(np.digitize(y, y_edges) - 1, 0, last_col)
    return np.log(counts[rows, cols])
4x4 bs, _, h, w = images.shape # batch size, _, height, width bs = min(bs, max_subplots) # limit plot images ns = np.ceil(bs ** 0.5) # number of subplots (square) if np.max(images[0]) <= 1: images *= 255 # de-normalise (optional) # Build Image mosaic = np.full((int(ns * h), int(ns * w), 3), 255, dtype=np.uint8) # init for i, im in enumerate(images): if i == max_subplots: # if last batch has fewer images than we expect break x, y = int(w * (i // ns)), int(h * (i % ns)) # block origin im = im.transpose(1, 2, 0) mosaic[y:y + h, x:x + w, :] = im # Resize (optional) scale = max_size / ns / max(h, w) if scale < 1: h = math.ceil(scale * h) w = math.ceil(scale * w) mosaic = cv2.resize(mosaic, tuple(int(x * ns) for x in (w, h))) # Annotate fs = int((h + w) * ns * 0.01) # font size annotator = Annotator(mosaic, line_width=round(fs / 10), font_size=fs, pil=True, example=names) for i in range(i + 1): x, y = int(w * (i // ns)), int(h * (i % ns)) # block origin annotator.rectangle([x, y, x + w, y + h], None, (255, 255, 255), width=2) # borders if paths: annotator.text((x + 5, y + 5), text=Path(paths[i]).name[:40], txt_color=(220, 220, 220)) # filenames if len(targets) > 0: ti = targets[targets[:, 0] == i] # image targets boxes = xywh2xyxy(ti[:, 2:6]).T classes = ti[:, 1].astype('int') labels = ti.shape[1] == 6 # labels if no conf column conf = None if labels else ti[:, 6] # check for confidence presence (label vs pred) if boxes.shape[1]: if boxes.max() <= 1.01: # if normalized with tolerance 0.01 boxes[[0, 2]] *= w # scale to pixels boxes[[1, 3]] *= h elif scale < 1: # absolute coords need scale if image scales boxes *= scale boxes[[0, 2]] += x boxes[[1, 3]] += y for j, box in enumerate(boxes.T.tolist()): cls = classes[j] color = colors(cls) cls = names[cls] if names else cls if labels or conf[j] > 0.25: # 0.25 conf thresh label = f'{cls}' if labels else f'{cls} {conf[j]:.1f}' annotator.box_label(box, label, color=color) annotator.im.save(fname) # save def 
def plot_targets_txt():  # from utils.plots import *; plot_targets_txt()
    """
    Plot targets.txt histograms.

    Loads 'targets.txt' from the working directory, histograms its first four columns in a 2x2 grid
    (each legend shows the column's mean +/- std) and saves the figure as 'targets.jpg'.
    """
    columns = np.loadtxt('targets.txt', dtype=np.float32).T
    titles = ['x targets', 'y targets', 'width targets', 'height targets']
    fig, grid = plt.subplots(2, 2, figsize=(8, 8), tight_layout=True)
    panels = grid.ravel()
    for k, title in enumerate(titles):
        values = columns[k]
        panel = panels[k]
        panel.hist(values, bins=100, label=f'{values.mean():.3g} +/- {values.std():.3g}')
        panel.legend()
        panel.set_title(title)
    plt.savefig('targets.jpg', dpi=200)
@TryExcept()  # known issue https://github.com/ultralytics/yolov5/issues/5395
def plot_labels(labels, names=(), save_dir=Path('')):
    """
    Plot dataset label statistics.

    labels: 2-D array whose column 0 is the class id and columns 1: are read as x, y, width, height.
            The array is NOT modified (the rectangle panel works on a copy).
    names: class names, either a dict (its values are used) or a sequence; used as x tick labels when
           there are 1-29 of them.
    save_dir: directory (Path) receiving 'labels_correlogram.jpg' and 'labels.jpg'.
    """
    LOGGER.info(f"Plotting labels to {save_dir / 'labels.jpg'}... ")
    c, b = labels[:, 0], labels[:, 1:].transpose()  # classes, boxes
    nc = int(c.max() + 1)  # number of classes
    x = pd.DataFrame(b.transpose(), columns=['x', 'y', 'width', 'height'])

    # seaborn correlogram
    sn.pairplot(x, corner=True, diag_kind='auto', kind='hist', diag_kws=dict(bins=50), plot_kws=dict(pmax=0.9))
    plt.savefig(save_dir / 'labels_correlogram.jpg', dpi=200)
    plt.close()

    # matplotlib labels
    matplotlib.use('svg')  # faster
    try:
        ax = plt.subplots(2, 2, figsize=(8, 8), tight_layout=True)[1].ravel()
        y = ax[0].hist(c, bins=np.linspace(0, nc, nc + 1) - 0.5, rwidth=0.8)
        with contextlib.suppress(Exception):  # color histogram bars by class, known issue #3195
            for i in range(nc):
                y[2].patches[i].set_color([channel / 255 for channel in colors(i)])
        ax[0].set_ylabel('instances')
        if 0 < len(names) < 30:
            # names may be a {index: name} dict or a plain sequence (the default is a tuple)
            tick_labels = list(names.values()) if isinstance(names, dict) else list(names)
            ax[0].set_xticks(range(len(names)))
            ax[0].set_xticklabels(tick_labels, rotation=90, fontsize=10)
        else:
            ax[0].set_xlabel('classes')
        sn.histplot(x, x='x', y='y', ax=ax[2], bins=50, pmax=0.9)
        sn.histplot(x, x='width', y='height', ax=ax[3], bins=50, pmax=0.9)

        # rectangles: work on a copy of the first 1000 rows so the caller's labels stay untouched
        rects = labels[:1000].copy()
        rects[:, 1:3] = 0.5  # center
        rects[:, 1:] = xywh2xyxy(rects[:, 1:]) * 2000
        img = Image.fromarray(np.ones((2000, 2000, 3), dtype=np.uint8) * 255)
        draw = ImageDraw.Draw(img)  # one drawing context for all rectangles
        for cls, *box in rects:
            draw.rectangle(box, width=1, outline=colors(cls))  # plot
        ax[1].imshow(img)
        ax[1].axis('off')

        for a in [0, 1, 2, 3]:
            for s in ['top', 'right', 'left', 'bottom']:
                ax[a].spines[s].set_visible(False)

        plt.savefig(save_dir / 'labels.jpg', dpi=200)
    finally:
        # always switch back, even if plotting failed (TryExcept would otherwise leave 'svg' active)
        matplotlib.use('Agg')
    plt.close()
def plot_evolve(evolve_csv='path/to/evolve.csv'):  # from utils.plots import *; plot_evolve()
    """
    Plot evolve.csv hyp evolution results.

    Reads the CSV, scores every row with fitness(), and for each column from index 7 onward draws a
    scatter of value vs. fitness (coloured by hist2d density) with the best row marked by a '+'.
    The figure is saved next to the CSV with a '.png' suffix; the best values are printed.
    """
    csv_path = Path(evolve_csv)
    table = pd.read_csv(csv_path)
    headers = [name.strip() for name in table.columns]
    values = table.values
    scores = fitness(values)
    best_row = np.argmax(scores)  # max fitness index
    plt.figure(figsize=(10, 12), tight_layout=True)
    matplotlib.rc('font', **{'size': 8})
    print(f'Best results from row {best_row} of {csv_path}:')
    for col, key in enumerate(headers[7:], start=7):
        panel = col - 7  # zero-based subplot position
        series = values[:, col]
        best_value = series[best_row]  # best single result
        plt.subplot(6, 5, panel + 1)
        plt.scatter(series, scores, c=hist2d(series, scores, 20), cmap='viridis', alpha=.8, edgecolors='none')
        plt.plot(best_value, scores.max(), 'k+', markersize=15)
        plt.title(f'{key} = {best_value:.3g}', fontdict={'size': 9})  # limit to 40 characters
        if panel % 5 != 0:
            # only the first panel of each row of five keeps its y ticks
            plt.yticks([])
        print(f'{key:>15}: {best_value:.3g}')
    out_file = csv_path.with_suffix('.png')  # filename
    plt.savefig(out_file, dpi=200)
    plt.close()
    print(f'Saved {out_file}')
def profile_idetection(start=0, stop=0, labels=(), save_dir=''):
    """
    Plot iDetection '*.txt' per-image logs. from utils.plots import *; profile_idetection()

    start, stop: index window applied to each log after its first 90 and last 30 entries are dropped;
                 a falsy ``stop`` means "to the end".
    labels: optional legend label per file; when empty the file stem without 'frames_' is used.
    save_dir: directory searched for 'frames*.txt' and receiving 'idetection_profile.png'.
    """
    axes = plt.subplots(2, 4, figsize=(12, 6), tight_layout=True)[1].ravel()
    titles = ['Images', 'Free Storage (GB)', 'RAM Usage (GB)', 'Battery', 'dt_raw (ms)', 'dt_smooth (ms)',
              'real-world FPS']
    log_files = list(Path(save_dir).glob('frames*.txt'))
    for file_idx, log_file in enumerate(log_files):
        try:
            data = np.loadtxt(log_file, ndmin=2).T[:, 90:-30]  # clip first and last rows
            count = data.shape[1]  # number of rows
            upper = min(stop, count) if stop else count
            window = np.arange(start, upper)
            data = data[:, window]
            elapsed = data[0] - data[0].min()  # set t0=0s
            data[0] = window
            for row, panel in enumerate(axes):
                if row >= len(data):
                    # more panels than logged series: drop the unused axes
                    panel.remove()
                    continue
                legend_text = labels[file_idx] if len(labels) else log_file.stem.replace('frames_', '')
                panel.plot(elapsed, data[row], marker='.', label=legend_text, linewidth=1, markersize=5)
                panel.set_title(titles[row])
                panel.set_xlabel('time (s)')
                # if file_idx == len(log_files) - 1:
                #     panel.set_ylim(bottom=0)
                for side in ('top', 'right'):
                    panel.spines[side].set_visible(False)
        except Exception as e:
            # best effort: report the file that failed and keep plotting the others
            print(f'Warning: Plotting error for {log_file}; {e}')
    axes[1].legend()
    plt.savefig(Path(save_dir) / 'idetection_profile.png', dpi=200)
def mixup(im, labels, segments, im2, labels2, segments2):
    """
    Applies MixUp augmentation https://arxiv.org/pdf/1710.09412.pdf

    Blends ``im`` and ``im2`` with a ratio drawn from Beta(32, 32) and returns the uint8 blend together
    with the two label arrays and the two segment collections, each concatenated along axis 0.
    """
    ratio = np.random.beta(32.0, 32.0)  # mixup ratio, alpha=beta=32.0
    blended = im * ratio + im2 * (1 - ratio)
    merged_labels = np.concatenate((labels, labels2), 0)
    merged_segments = np.concatenate((segments, segments2), 0)
    return blended.astype(np.uint8), merged_labels, merged_segments
-im.shape[1] / 2 # x translation (pixels) C[1, 2] = -im.shape[0] / 2 # y translation (pixels) # Perspective P = np.eye(3) P[2, 0] = random.uniform(-perspective, perspective) # x perspective (about y) P[2, 1] = random.uniform(-perspective, perspective) # y perspective (about x) # Rotation and Scale R = np.eye(3) a = random.uniform(-degrees, degrees) # a += random.choice([-180, -90, 0, 90]) # add 90deg rotations to small rotations s = random.uniform(1 - scale, 1 + scale) # s = 2 ** random.uniform(-scale, scale) R[:2] = cv2.getRotationMatrix2D(angle=a, center=(0, 0), scale=s) # Shear S = np.eye(3) S[0, 1] = math.tan(random.uniform(-shear, shear) * math.pi / 180) # x shear (deg) S[1, 0] = math.tan(random.uniform(-shear, shear) * math.pi / 180) # y shear (deg) # Translation T = np.eye(3) T[0, 2] = (random.uniform(0.5 - translate, 0.5 + translate) * width) # x translation (pixels) T[1, 2] = (random.uniform(0.5 - translate, 0.5 + translate) * height) # y translation (pixels) # Combined rotation matrix M = T @ S @ R @ P @ C # order of operations (right to left) is IMPORTANT if (border[0] != 0) or (border[1] != 0) or (M != np.eye(3)).any(): # image changed if perspective: im = cv2.warpPerspective(im, M, dsize=(width, height), borderValue=(114, 114, 114)) else: # affine im = cv2.warpAffine(im, M[:2], dsize=(width, height), borderValue=(114, 114, 114)) # Visualize # import matplotlib.pyplot as plt # ax = plt.subplots(1, 2, figsize=(12, 6))[1].ravel() # ax[0].imshow(im[:, :, ::-1]) # base # ax[1].imshow(im2[:, :, ::-1]) # warped # Transform label coordinates n = len(targets) new_segments = [] if n: new = np.zeros((n, 4)) segments = resample_segments(segments) # upsample for i, segment in enumerate(segments): xy = np.ones((len(segment), 3)) xy[:, :2] = segment xy = xy @ M.T # transform xy = (xy[:, :2] / xy[:, 2:3] if perspective else xy[:, :2]) # perspective rescale or affine # clip new[i] = segment2box(xy, width, height) new_segments.append(xy) # filter candidates i = 
def create_dataloader(path,
                      imgsz,
                      batch_size,
                      stride,
                      single_cls=False,
                      hyp=None,
                      augment=False,
                      cache=False,
                      pad=0.0,
                      rect=False,
                      rank=-1,
                      workers=8,
                      image_weights=False,
                      quad=False,
                      prefix='',
                      shuffle=False,
                      mask_downsample_ratio=1,
                      overlap_mask=False,
                      seed=0):
    """
    Build a LoadImagesAndLabelsAndMasks dataset and a loader over it.

    Most arguments are forwarded unchanged to the dataset constructor (``cache`` as ``cache_images``,
    ``mask_downsample_ratio`` as ``downsample_ratio``, ``overlap_mask`` as ``overlap``).
    rank: -1 means no DistributedSampler; any other value creates one.
    workers: upper bound on the number of loader workers.
    image_weights: selects the plain DataLoader instead of InfiniteDataLoader.
    quad: selects ``collate_fn4`` instead of ``collate_fn``.
    seed: added to a fixed constant and RANK to seed the loader's torch.Generator.

    Returns a ``(loader, dataset)`` tuple.
    """
    if rect and shuffle:
        LOGGER.warning('WARNING ⚠️ --rect is incompatible with DataLoader shuffle, setting shuffle=False')
        shuffle = False
    with torch_distributed_zero_first(rank):  # init dataset *.cache only once if DDP
        dataset = LoadImagesAndLabelsAndMasks(
            path,
            imgsz,
            batch_size,
            augment=augment,  # augmentation
            hyp=hyp,  # hyperparameters
            rect=rect,  # rectangular batches
            cache_images=cache,
            single_cls=single_cls,
            stride=int(stride),
            pad=pad,
            image_weights=image_weights,
            prefix=prefix,
            downsample_ratio=mask_downsample_ratio,
            overlap=overlap_mask)

    batch_size = min(batch_size, len(dataset))
    nd = torch.cuda.device_count()  # number of CUDA devices
    # os.cpu_count() returns None when the CPU count cannot be determined; treat that as one CPU
    # instead of failing on 'None // int'
    cpus = os.cpu_count() or 1
    nw = min([cpus // max(nd, 1), batch_size if batch_size > 1 else 0, workers])  # number of workers
    sampler = None if rank == -1 else distributed.DistributedSampler(dataset, shuffle=shuffle)
    loader = DataLoader if image_weights else InfiniteDataLoader  # only DataLoader allows for attribute updates
    generator = torch.Generator()
    generator.manual_seed(6148914691236517205 + seed + RANK)
    return loader(
        dataset,
        batch_size=batch_size,
        shuffle=shuffle and sampler is None,  # a sampler and shuffle=True are mutually exclusive
        num_workers=nw,
        sampler=sampler,
        pin_memory=True,
        collate_fn=LoadImagesAndLabelsAndMasks.collate_fn4 if quad else LoadImagesAndLabelsAndMasks.collate_fn,
        worker_init_fn=seed_worker,
        generator=generator,
    ), dataset
padw=pad[0], padh=pad[1], ) if labels.size: # normalized xywh to pixel xyxy format labels[:, 1:] = xywhn2xyxy(labels[:, 1:], ratio[0] * w, ratio[1] * h, padw=pad[0], padh=pad[1]) if self.augment: img, labels, segments = random_perspective(img, labels, segments=segments, degrees=hyp['degrees'], translate=hyp['translate'], scale=hyp['scale'], shear=hyp['shear'], perspective=hyp['perspective']) nl = len(labels) # number of labels if nl: labels[:, 1:5] = xyxy2xywhn(labels[:, 1:5], w=img.shape[1], h=img.shape[0], clip=True, eps=1e-3) if self.overlap: masks, sorted_idx = polygons2masks_overlap(img.shape[:2], segments, downsample_ratio=self.downsample_ratio) masks = masks[None] # (640, 640) -> (1, 640, 640) labels = labels[sorted_idx] else: masks = polygons2masks(img.shape[:2], segments, color=1, downsample_ratio=self.downsample_ratio) masks = (torch.from_numpy(masks) if len(masks) else torch.zeros(1 if self.overlap else nl, img.shape[0] // self.downsample_ratio, img.shape[1] // self.downsample_ratio)) # TODO: albumentations support if self.augment: # Albumentations # there are some augmentation that won't change boxes and masks, # so just be it for now. 
img, labels = self.albumentations(img, labels) nl = len(labels) # update after albumentations # HSV color-space augment_hsv(img, hgain=hyp['hsv_h'], sgain=hyp['hsv_s'], vgain=hyp['hsv_v']) # Flip up-down if random.random() < hyp['flipud']: img = np.flipud(img) if nl: labels[:, 2] = 1 - labels[:, 2] masks = torch.flip(masks, dims=[1]) # Flip left-right if random.random() < hyp['fliplr']: img = np.fliplr(img) if nl: labels[:, 1] = 1 - labels[:, 1] masks = torch.flip(masks, dims=[2]) # Cutouts # labels = cutout(img, labels, p=0.5) labels_out = torch.zeros((nl, 6)) if nl: labels_out[:, 1:] = torch.from_numpy(labels) # Convert img = img.transpose((2, 0, 1))[::-1] # HWC to CHW, BGR to RGB img = np.ascontiguousarray(img) return (torch.from_numpy(img), labels_out, self.im_files[index], shapes, masks) def load_mosaic(self, index): # YOLOv5 4-mosaic loader. Loads 1 image + 3 random images into a 4-image mosaic labels4, segments4 = [], [] s = self.img_size yc, xc = (int(random.uniform(-x, 2 * s + x)) for x in self.mosaic_border) # mosaic center x, y # 3 additional image indices indices = [index] + random.choices(self.indices, k=3) # 3 additional image indices for i, index in enumerate(indices): # Load image img, _, (h, w) = self.load_image(index) # place img in img4 if i == 0: # top left img4 = np.full((s * 2, s * 2, img.shape[2]), 114, dtype=np.uint8) # base image with 4 tiles x1a, y1a, x2a, y2a = max(xc - w, 0), max(yc - h, 0), xc, yc # xmin, ymin, xmax, ymax (large image) x1b, y1b, x2b, y2b = w - (x2a - x1a), h - (y2a - y1a), w, h # xmin, ymin, xmax, ymax (small image) elif i == 1: # top right x1a, y1a, x2a, y2a = xc, max(yc - h, 0), min(xc + w, s * 2), yc x1b, y1b, x2b, y2b = 0, h - (y2a - y1a), min(w, x2a - x1a), h elif i == 2: # bottom left x1a, y1a, x2a, y2a = max(xc - w, 0), yc, xc, min(s * 2, yc + h) x1b, y1b, x2b, y2b = w - (x2a - x1a), 0, w, min(y2a - y1a, h) elif i == 3: # bottom right x1a, y1a, x2a, y2a = xc, yc, min(xc + w, s * 2), min(s * 2, yc + h) x1b, y1b, 
def polygon2mask(img_size, polygons, color=1, downsample_ratio=1):
    """
    Rasterise polygons into a single uint8 mask.

    Args:
        img_size (tuple): The image size.
        polygons (np.ndarray): [N, M], N is the number of polygons,
            M is the number of points(Be divided by 2).
        color: fill value written inside the polygons.
        downsample_ratio (int): both mask sides are integer-divided by this before the final resize.
    """
    mask = np.zeros(img_size, dtype=np.uint8)
    polygons = np.asarray(polygons)
    polygons = polygons.astype(np.int32)
    shape = polygons.shape
    polygons = polygons.reshape(shape[0], -1, 2)  # flat x,y,x,y,... -> (N, points, 2)
    cv2.fillPoly(mask, polygons, color=color)
    nh, nw = (img_size[0] // downsample_ratio, img_size[1] // downsample_ratio)
    # NOTE: fillPoly firstly then resize is trying the keep the same way
    # of loss calculation when mask-ratio=1.
    mask = cv2.resize(mask, (nw, nh))
    return mask


def polygons2masks(img_size, polygons, color, downsample_ratio=1):
    """
    Rasterise every polygon into its own mask and stack them.

    Args:
        img_size (tuple): The image size.
        polygons (list[np.ndarray]): one array of points per instance; each is flattened before
            being passed to polygon2mask().
        color: fill value forwarded to polygon2mask().
        downsample_ratio (int): forwarded to polygon2mask().
    """
    masks = [polygon2mask(img_size, [polygon.reshape(-1)], color, downsample_ratio) for polygon in polygons]
    return np.array(masks)


def polygons2masks_overlap(img_size, segments, downsample_ratio=1):
    """
    Return a single overlap mask plus the area ordering used to build it.

    Each segment is rasterised with polygon2mask(); the masks are ordered by ``np.argsort(-areas)``
    and painted in that order with the values 1..len(segments), later masks overwriting earlier ones.

    Returns:
        masks: (img_h // downsample_ratio, img_w // downsample_ratio) array, uint8 for up to 255
            segments and int32 otherwise; 0 is background.
        index: the ordering applied to ``segments``.
    """
    masks = np.zeros((img_size[0] // downsample_ratio, img_size[1] // downsample_ratio),
                     dtype=np.int32 if len(segments) > 255 else np.uint8)
    areas = []
    ms = []
    for segment in segments:
        mask = polygon2mask(
            img_size,
            [segment.reshape(-1)],
            downsample_ratio=downsample_ratio,
            color=1,
        )
        ms.append(mask)
        areas.append(mask.sum())
    areas = np.asarray(areas)
    index = np.argsort(-areas)
    ms = np.array(ms)[index]
    for i in range(len(segments)):
        # Write the instance id directly. The previous 'masks + ms[i] * (i + 1)' followed by a clip
        # wrapped around in uint8 once overlapping ids summed past 255, and the uint8 * int product
        # itself overflows for ids > 255 under NumPy 2 promotion rules.
        masks[ms[i] != 0] = i + 1
    return masks, index
def _assemble_masks(protos, masks_in):
    """Combine coefficients with prototypes: (n, c) @ (c, mh * mw) -> sigmoid -> (n, mh, mw)."""
    c, mh, mw = protos.shape  # CHW
    return (masks_in @ protos.float().view(c, -1)).sigmoid().view(-1, mh, mw)


def process_mask_upsample(protos, masks_in, bboxes, shape):
    """
    Build masks, upsample them to `shape`, then crop each one to its box.

    protos: [mask_dim, mask_h, mask_w]
    masks_in: [n, mask_dim], n is number of masks after nms
    bboxes: [n, 4], n is number of masks after nms
    shape: input_image_size, (h, w)

    return: [n, h, w] tensor, thresholded in place at 0.5
    """
    masks = _assemble_masks(protos, masks_in)
    masks = F.interpolate(masks[None], shape, mode='bilinear', align_corners=False)[0]  # CHW
    return crop_mask(masks, bboxes).gt_(0.5)


def process_mask(protos, masks_in, bboxes, shape, upsample=False):
    """
    Build masks and crop them at prototype resolution, optionally upsampling afterwards.

    protos: [mask_dim, mask_h, mask_w]
    masks_in: [n, mask_dim], n is number of masks after nms
    bboxes: [n, 4], n is number of masks after nms (left untouched; a scaled clone is used)
    shape: input_image_size, (h, w)

    return: [n, mask_h, mask_w] tensor, or [n, h, w] when upsample=True, thresholded in place at 0.5
    """
    _, mh, mw = protos.shape  # CHW
    ih, iw = shape
    masks = _assemble_masks(protos, masks_in)  # CHW

    # bring the boxes from input-image pixels down to prototype pixels
    scaled_boxes = bboxes.clone()
    scaled_boxes[:, [0, 2]] *= mw / iw
    scaled_boxes[:, [1, 3]] *= mh / ih

    masks = crop_mask(masks, scaled_boxes)  # CHW
    if upsample:
        masks = F.interpolate(masks[None], shape, mode='bilinear', align_corners=False)[0]  # CHW
    return masks.gt_(0.5)


def process_mask_native(protos, masks_in, bboxes, shape):
    """
    Build masks, strip the padding computed from `shape`, upsample to `shape`, then crop to the boxes.

    protos: [mask_dim, mask_h, mask_w]
    masks_in: [n, mask_dim], n is number of masks after nms
    bboxes: [n, 4], n is number of masks after nms
    shape: input_image_size, (h, w)

    return: [n, h, w] tensor, thresholded in place at 0.5
    """
    _, mh, mw = protos.shape  # CHW
    masks = _assemble_masks(protos, masks_in)
    gain = min(mh / shape[0], mw / shape[1])  # gain = old / new
    pad_w = (mw - shape[1] * gain) / 2
    pad_h = (mh - shape[0] * gain) / 2
    top, left = int(pad_h), int(pad_w)  # y, x
    bottom, right = int(mh - pad_h), int(mw - pad_w)
    masks = masks[:, top:bottom, left:right]

    masks = F.interpolate(masks[None], shape, mode='bilinear', align_corners=False)[0]  # CHW
    return crop_mask(masks, bboxes).gt_(0.5)


def scale_image(im1_shape, masks, im0_shape, ratio_pad=None):
    """
    Remove padding from `masks` and resize them to the original image size.

    im1_shape: model input shape, [h, w]
    im0_shape: origin pic shape, [h, w, 3]
    masks: [h, w, num]
    ratio_pad: optional pair whose second item is the (w, h) padding; computed from the shapes if None

    return: resized masks, always with a trailing channel axis
    raises: ValueError if `masks` has fewer than 2 dimensions
    """
    if ratio_pad is not None:
        pad = ratio_pad[1]
    else:  # calculate from im0_shape
        gain = min(im1_shape[0] / im0_shape[0], im1_shape[1] / im0_shape[1])  # gain = old / new
        pad = (im1_shape[1] - im0_shape[1] * gain) / 2, (im1_shape[0] - im0_shape[0] * gain) / 2  # wh padding
    pad_w, pad_h = pad[0], pad[1]
    top, left = int(pad_h), int(pad_w)  # y, x
    bottom, right = int(im1_shape[0] - pad_h), int(im1_shape[1] - pad_w)

    ndim = len(masks.shape)
    if ndim < 2:
        raise ValueError(f'"len of masks shape" should be 2 or 3, but got {ndim}')
    cropped = masks[top:bottom, left:right]
    resized = cv2.resize(cropped, (im0_shape[1], im0_shape[0]))

    # cv2.resize hands back a 2-D array in the single-mask case; restore the channel axis
    return resized[:, :, None] if len(resized.shape) == 2 else resized


def mask_iou(mask1, mask2, eps=1e-7):
    """
    Pairwise IoU between two sets of flattened masks.

    mask1: [N, n] N is the number of predicted objects
    mask2: [M, n] M is the number of gt objects
    Note: n means image_w x image_h

    return: masks iou, [N, M]
    """
    inter = torch.matmul(mask1, mask2.t()).clamp(0)
    area1 = mask1.sum(1)[:, None]
    area2 = mask2.sum(1)[None]
    union = (area1 + area2) - inter  # (area1 + area2) - intersection
    return inter / (union + eps)


def masks_iou(mask1, mask2, eps=1e-7):
    """
    Element-wise IoU between matching rows of two mask sets.

    mask1: [N, n] N is the number of predicted objects
    mask2: [N, n] one gt mask per prediction
    Note: n means image_w x image_h

    return: masks iou of shape (1, N) - the `[None]` applied to the summed areas adds a leading axis
    """
    inter = (mask1 * mask2).sum(1).clamp(0)  # (N, )
    total_area = mask1.sum(1) + mask2.sum(1)
    union = total_area[None] - inter  # (area1 + area2) - intersection
    return inter / (union + eps)


def masks2segments(masks, strategy='largest'):
    """
    Convert masks(n,h,w) into a list of n float32 contour arrays of shape (k, 2).

    strategy: 'concat' joins every external contour, 'largest' keeps the contour with the most points.
    A mask without contours yields an empty (0, 2) array.
    """
    segments = []
    for mask in masks.int().cpu().numpy().astype('uint8'):
        contours = cv2.findContours(mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)[0]
        if not contours:
            seg = np.zeros((0, 2))  # no segments found
        elif strategy == 'concat':  # concatenate all segments
            seg = np.concatenate([pts.reshape(-1, 2) for pts in contours])
        elif strategy == 'largest':  # select largest segment
            longest = np.array([len(pts) for pts in contours]).argmax()
            seg = np.array(contours[longest]).reshape(-1, 2)
        else:
            seg = contours  # unrecognised strategy: contours are passed through unconverted
        segments.append(seg.astype('float32'))
    return segments
def __call__(self, preds, targets, masks):  # predictions, targets, model
    """
    Compute the combined box / objectness / class / mask loss for one batch.

    Args:
        preds: pair ``(p, proto)``; ``p`` is iterated per detection layer and ``proto`` is unpacked as
            (batch size, number of masks, mask height, mask width).
        targets: target rows forwarded to ``build_targets`` (image, class, x, y, w, h).
        masks: ground-truth masks; resized with nearest interpolation when their last two
            dimensions differ from the prototype resolution.

    Returns:
        tuple: ``(loss * bs, torch.cat((lbox, lseg, lobj, lcls)).detach())``.
    """
    p, proto = preds
    bs, nm, mask_h, mask_w = proto.shape  # batch size, number of masks, mask height, mask width
    lcls = torch.zeros(1, device=self.device)
    lbox = torch.zeros(1, device=self.device)
    lobj = torch.zeros(1, device=self.device)
    lseg = torch.zeros(1, device=self.device)
    tcls, tbox, indices, anchors, tidxs, xywhn = self.build_targets(p, targets)  # targets

    # Losses
    for i, pi in enumerate(p):  # layer index, layer predictions
        b, a, gj, gi = indices[i]  # image, anchor, gridy, gridx
        tobj = torch.zeros(pi.shape[:4], dtype=pi.dtype, device=self.device)  # target obj

        n = b.shape[0]  # number of targets
        if n:
            # split each matched prediction into xy, wh, objectness (unused here), class logits, mask coefficients
            pxy, pwh, _, pcls, pmask = pi[b, a, gj, gi].split((2, 2, 1, self.nc, nm), 1)  # subset of predictions

            # Box regression
            pxy = pxy.sigmoid() * 2 - 0.5
            pwh = (pwh.sigmoid() * 2) ** 2 * anchors[i]
            pbox = torch.cat((pxy, pwh), 1)  # predicted box
            iou = bbox_iou(pbox, tbox[i], CIoU=True).squeeze()  # iou(prediction, target)
            lbox += (1.0 - iou).mean()  # iou loss

            # Objectness
            iou = iou.detach().clamp(0).type(tobj.dtype)
            if self.sort_obj_iou:
                j = iou.argsort()
                b, a, gj, gi, iou = b[j], a[j], gj[j], gi[j], iou[j]
            if self.gr < 1:
                iou = (1.0 - self.gr) + self.gr * iou
            tobj[b, a, gj, gi] = iou  # iou ratio

            # Classification
            if self.nc > 1:  # cls loss (only if multiple classes)
                t = torch.full_like(pcls, self.cn, device=self.device)  # targets
                t[range(n), tcls[i]] = self.cp
                lcls += self.BCEcls(pcls, t)  # BCE

            # Mask regression
            if tuple(masks.shape[-2:]) != (mask_h, mask_w):  # downsample
                masks = F.interpolate(masks[None], (mask_h, mask_w), mode='nearest')[0]
            marea = xywhn[i][:, 2:].prod(1)  # mask width, height normalized
            mxyxy = xywh2xyxy(xywhn[i] * torch.tensor([mask_w, mask_h, mask_w, mask_h], device=self.device))
            for bi in b.unique():  # accumulate the mask loss one image at a time
                j = b == bi  # matching index
                if self.overlap:
                    # overlap mode: masks[bi] is compared against each target index to recover one binary mask per target
                    mask_gti = torch.where(masks[bi][None] == tidxs[i][j].view(-1, 1, 1), 1.0, 0.0)
                else:
                    mask_gti = masks[tidxs[i]][j]
                lseg += self.single_mask_loss(mask_gti, pmask[j], proto[bi], mxyxy[j], marea[j])

        obji = self.BCEobj(pi[..., 4], tobj)
        lobj += obji * self.balance[i]  # obj loss
        if self.autobalance:
            self.balance[i] = self.balance[i] * 0.9999 + 0.0001 / obji.detach().item()

    if self.autobalance:
        # renormalise so the layer at index self.ssi keeps a balance of 1
        self.balance = [x / self.balance[self.ssi] for x in self.balance]
    lbox *= self.hyp['box']
    lobj *= self.hyp['obj']
    lcls *= self.hyp['cls']
    lseg *= self.hyp['box'] / bs  # the mask loss reuses the 'box' gain, divided by batch size

    loss = lbox + lobj + lcls + lseg
    return loss * bs, torch.cat((lbox, lseg, lobj, lcls)).detach()


def single_mask_loss(self, gt_mask, pred, proto, xyxy, area):
    """
    Mask loss for one image.

    The per-pixel BCE-with-logits loss is cropped to each box (`xyxy`), averaged over the
    spatial dimensions, divided by `area`, and finally averaged over the targets.
    """
    pred_mask = (pred @ proto.view(self.nm, -1)).view(-1, *proto.shape[1:])  # (n,32) @ (32,80,80) -> (n,80,80)
    loss = F.binary_cross_entropy_with_logits(pred_mask, gt_mask, reduction='none')
    return (crop_mask(loss, xyxy).mean(dim=(1, 2)) / area).mean()


def build_targets(self, p, targets):
    """
    Build targets for compute_loss(), input targets(image,class,x,y,w,h).

    Returns:
        tuple: per-layer lists ``(tcls, tbox, indices, anch, tidxs, xywhn)`` - classes, box targets
        relative to their grid cell, (image, anchor, gridy, gridx) indices, matched anchors,
        target/mask indices and normalized xywh boxes.
    """
    na, nt = self.na, targets.shape[0]  # number of anchors, targets
    tcls, tbox, indices, anch, tidxs, xywhn = [], [], [], [], [], []
    gain = torch.ones(8, device=self.device)  # normalized to gridspace gain
    ai = torch.arange(na, device=self.device).float().view(na, 1).repeat(1, nt)  # same as .repeat_interleave(nt)
    if self.overlap:
        # overlap mode: target indices restart at 1 for every image in the batch
        batch = p[0].shape[0]
        ti = []
        for i in range(batch):
            num = (targets[:, 0] == i).sum()  # find number of targets of each image
            ti.append(torch.arange(num, device=self.device).float().view(1, num).repeat(na, 1) + 1)  # (na, num)
        ti = torch.cat(ti, 1)  # (na, nt)
    else:
        ti = torch.arange(nt, device=self.device).float().view(1, nt).repeat(na, 1)
    targets = torch.cat((targets.repeat(na, 1, 1), ai[..., None], ti[..., None]), 2)  # append anchor indices

    g = 0.5  # bias
    off = torch.tensor(
        [
            [0, 0],
            [1, 0],
            [0, 1],
            [-1, 0],
            [0, -1],  # j,k,l,m
            # [1, 1], [1, -1], [-1, 1], [-1, -1],  # jk,jm,lk,lm
        ],
        device=self.device).float() * g  # offsets

    for i in range(self.nl):
        anchors, shape = self.anchors[i], p[i].shape
        gain[2:6] = torch.tensor(shape)[[3, 2, 3, 2]]  # xyxy gain

        # Match targets to anchors
        t = targets * gain  # scale the xywh columns to this layer's grid; other columns keep gain 1
        if nt:
            # Matches
            r = t[..., 4:6] / anchors[:, None]  # wh ratio
            j = torch.max(r, 1 / r).max(2)[0] < self.hyp['anchor_t']  # compare
            # j = wh_iou(anchors, t[:, 4:6]) > model.hyp['iou_t']  # iou(3,n)=wh_iou(anchors(3,2), gwh(n,2))
            t = t[j]  # filter

            # Offsets
            gxy = t[:, 2:4]  # grid xy
            gxi = gain[[2, 3]] - gxy  # inverse
            j, k = ((gxy % 1 < g) & (gxy > 1)).T
            l, m = ((gxi % 1 < g) & (gxi > 1)).T
            j = torch.stack((torch.ones_like(j), j, k, l, m))
            t = t.repeat((5, 1, 1))[j]
            offsets = (torch.zeros_like(gxy)[None] + off[:, None])[j]
        else:
            t = targets[0]
            offsets = 0

        # Define
        bc, gxy, gwh, at = t.chunk(4, 1)  # (image, class), grid xy, grid wh, anchors
        (a, tidx), (b, c) = at.long().T, bc.long().T  # anchors, image, class
        gij = (gxy - offsets).long()
        gi, gj = gij.T  # grid indices

        # Append
        indices.append((b, a, gj.clamp_(0, shape[2] - 1), gi.clamp_(0, shape[3] - 1)))  # image, anchor, grid
        tbox.append(torch.cat((gxy - gij, gwh), 1))  # box
        anch.append(anchors[a])  # anchors
        tcls.append(c)  # class
        tidxs.append(tidx)
        xywhn.append(torch.cat((gxy, gwh), 1) / gain[2:6])  # xywh normalized

    return tcls, tbox, indices, anch, tidxs, xywhn
def fitness(x):
    """Model fitness: weighted sum of the first 8 metric columns (0.1 * mAP@0.5 + 0.9 * mAP@0.5:0.95, box and mask)."""
    weights = [0.0, 0.0, 0.1, 0.9, 0.0, 0.0, 0.1, 0.9]
    weighted = x[:, :8] * weights
    return weighted.sum(1)


def ap_per_class_box_and_mask(
        tp_m,
        tp_b,
        conf,
        pred_cls,
        target_cls,
        plot=False,
        save_dir='.',
        names=(),
):
    """
    Run `ap_per_class` once for boxes and once for masks.

    Args:
        tp_b: tp of boxes.
        tp_m: tp of masks.
        other arguments see `func: ap_per_class`.

    Returns:
        dict: {'boxes': {...}, 'masks': {...}}, each with keys p, r, ap, f1, ap_class (in that order).
    """

    def evaluate(tp, prefix):
        # the first two items returned by ap_per_class are dropped
        out = ap_per_class(tp,
                           conf,
                           pred_cls,
                           target_cls,
                           plot=plot,
                           save_dir=save_dir,
                           names=names,
                           prefix=prefix)[2:]
        # key order matters: Metrics.update() consumes the values positionally
        return {'p': out[0], 'r': out[1], 'ap': out[3], 'f1': out[2], 'ap_class': out[4]}

    boxes = evaluate(tp_b, 'Box')
    masks = evaluate(tp_m, 'Mask')
    return {'boxes': boxes, 'masks': masks}


class Metric:
    """Holds per-class precision, recall, F1 and AP values and derives their means."""

    def __init__(self) -> None:
        self.p = []  # (nc, )
        self.r = []  # (nc, )
        self.f1 = []  # (nc, )
        self.all_ap = []  # (nc, 10)
        self.ap_class_index = []  # (nc, )

    @property
    def ap50(self):
        """AP@0.5 of all classes.
        Return:
            (nc, ) or [].
        """
        if not len(self.all_ap):
            return []
        return self.all_ap[:, 0]

    @property
    def ap(self):
        """AP@0.5:0.95
        Return:
            (nc, ) or [].
        """
        if not len(self.all_ap):
            return []
        return self.all_ap.mean(1)

    @property
    def mp(self):
        """mean precision of all classes.
        Return:
            float.
        """
        if not len(self.p):
            return 0.0
        return self.p.mean()

    @property
    def mr(self):
        """mean recall of all classes.
        Return:
            float.
        """
        if not len(self.r):
            return 0.0
        return self.r.mean()

    @property
    def map50(self):
        """Mean AP@0.5 of all classes.
        Return:
            float.
        """
        if not len(self.all_ap):
            return 0.0
        return self.all_ap[:, 0].mean()

    @property
    def map(self):
        """Mean AP@0.5:0.95 of all classes.
        Return:
            float.
        """
        if not len(self.all_ap):
            return 0.0
        return self.all_ap.mean()

    def mean_results(self):
        """Mean of results, return mp, mr, map50, map"""
        return self.mp, self.mr, self.map50, self.map

    def class_result(self, i):
        """class-aware result, return p[i], r[i], ap50[i], ap[i]"""
        return self.p[i], self.r[i], self.ap50[i], self.ap[i]

    def get_maps(self, nc):
        """Per-class mAP array of length nc; classes without a result fall back to the overall mAP."""
        maps = np.full(nc, self.map, dtype=np.float64)
        per_class_ap = self.ap
        for i, c in enumerate(self.ap_class_index):
            maps[c] = per_class_ap[i]
        return maps

    def update(self, results):
        """
        Args:
            results: tuple(p, r, ap, f1, ap_class)
        """
        self.p, self.r, self.all_ap, self.f1, self.ap_class_index = results


class Metrics:
    """Metric for boxes and masks."""

    def __init__(self) -> None:
        self.metric_box = Metric()
        self.metric_mask = Metric()

    def update(self, results):
        """
        Args:
            results: Dict{'boxes': Dict{}, 'masks': Dict{}}
        """
        for metric, key in ((self.metric_box, 'boxes'), (self.metric_mask, 'masks')):
            metric.update(list(results[key].values()))

    def mean_results(self):
        """Box mean results followed by mask mean results (8 values)."""
        box, mask = self.metric_box, self.metric_mask
        return box.mean_results() + mask.mean_results()

    def class_result(self, i):
        """Box class result followed by mask class result for class position i (8 values)."""
        box, mask = self.metric_box, self.metric_mask
        return box.class_result(i) + mask.class_result(i)

    def get_maps(self, nc):
        """Element-wise sum of the box and mask per-class mAP arrays."""
        box_maps = self.metric_box.get_maps(nc)
        mask_maps = self.metric_mask.get_maps(nc)
        return box_maps + mask_maps

    @property
    def ap_class_index(self):
        # boxes and masks have the same ap_class_index
        return self.metric_box.ap_class_index
'best/precision(B)', 'best/recall(B)', 'best/mAP_0.5(B)', 'best/mAP_0.5:0.95(B)', 'best/precision(M)', 'best/recall(M)', 'best/mAP_0.5(M)', 'best/mAP_0.5:0.95(M)',] ================================================ FILE: yolo-improve/yolov5-AUX/utils/segment/plots.py ================================================ import contextlib import math from pathlib import Path import cv2 import matplotlib.pyplot as plt import numpy as np import pandas as pd import torch from .. import threaded from ..general import xywh2xyxy from ..plots import Annotator, colors @threaded def plot_images_and_masks(images, targets, masks, paths=None, fname='images.jpg', names=None): # Plot image grid with labels if isinstance(images, torch.Tensor): images = images.cpu().float().numpy() if isinstance(targets, torch.Tensor): targets = targets.cpu().numpy() if isinstance(masks, torch.Tensor): masks = masks.cpu().numpy().astype(int) max_size = 1920 # max image size max_subplots = 16 # max image subplots, i.e. 4x4 bs, _, h, w = images.shape # batch size, _, height, width bs = min(bs, max_subplots) # limit plot images ns = np.ceil(bs ** 0.5) # number of subplots (square) if np.max(images[0]) <= 1: images *= 255 # de-normalise (optional) # Build Image mosaic = np.full((int(ns * h), int(ns * w), 3), 255, dtype=np.uint8) # init for i, im in enumerate(images): if i == max_subplots: # if last batch has fewer images than we expect break x, y = int(w * (i // ns)), int(h * (i % ns)) # block origin im = im.transpose(1, 2, 0) mosaic[y:y + h, x:x + w, :] = im # Resize (optional) scale = max_size / ns / max(h, w) if scale < 1: h = math.ceil(scale * h) w = math.ceil(scale * w) mosaic = cv2.resize(mosaic, tuple(int(x * ns) for x in (w, h))) # Annotate fs = int((h + w) * ns * 0.01) # font size annotator = Annotator(mosaic, line_width=round(fs / 10), font_size=fs, pil=True, example=names) for i in range(i + 1): x, y = int(w * (i // ns)), int(h * (i % ns)) # block origin annotator.rectangle([x, y, x + w, y + h], 
def plot_results_with_masks(file='path/to/results.csv', dir='', best=True):
    """
    Plot every results*.csv found next to `file` (or inside `dir`) and save the figure as results.png.

    Usage: from utils.plots import *; plot_results('path/to/results.csv')

    With best=True each curve is starred at the row maximising
    0.9 * col8 + 0.1 * col7 + 0.9 * col12 + 0.1 * col11; otherwise the last row is starred.
    A file that fails to plot is reported with a printed warning and skipped.
    """
    save_dir = Path(file).parent if file else Path(dir)
    fig, axes = plt.subplots(2, 8, figsize=(18, 6), tight_layout=True)
    axes = axes.ravel()
    csv_files = list(save_dir.glob('results*.csv'))
    assert len(csv_files), f'No results.csv files found in {save_dir.resolve()}, nothing to plot.'
    column_order = [1, 2, 3, 4, 5, 6, 9, 10, 13, 14, 15, 16, 7, 8, 11, 12]  # csv column shown in each subplot
    for f in csv_files:
        try:
            data = pd.read_csv(f)
            values = data.values
            index = np.argmax(0.9 * values[:, 8] + 0.1 * values[:, 7] + 0.9 * values[:, 12] + 0.1 * values[:, 11])
            titles = [name.strip() for name in data.columns]
            x = values[:, 0]
            for axis, j in zip(axes, column_order):
                y = values[:, j]
                axis.plot(x, y, marker='.', label=f.stem, linewidth=2, markersize=2)
                if best:
                    star_x, star_y, star_label = index, y[index], f'best:{index}'
                else:
                    star_x, star_y, star_label = x[-1], y[-1], 'last'
                axis.scatter(star_x, star_y, color='r', label=star_label, marker='*', linewidth=3)
                axis.set_title(titles[j] + f'\n{round(star_y, 5)}')
        except Exception as e:
            print(f'Warning: Plotting error for {f}: {e}')
    axes[1].legend()
    fig.savefig(save_dir / 'results.png', dpi=200)
    plt.close()
def smart_inference_mode(torch_1_9=check_version(torch.__version__, '1.9.0')):
    """Return a decorator applying torch.inference_mode() if torch>=1.9.0, else torch.no_grad()."""

    def decorate(fn):
        # torch.inference_mode is only looked up when the version check passed
        if torch_1_9:
            context = torch.inference_mode()
        else:
            context = torch.no_grad()
        return context(fn)

    return decorate


def smartCrossEntropyLoss(label_smoothing=0.0):
    """Return nn.CrossEntropyLoss, with label smoothing enabled when torch>=1.10.0 supports it."""
    supports_smoothing = check_version(torch.__version__, '1.10.0')
    if not supports_smoothing:
        if label_smoothing > 0:
            LOGGER.warning(f'WARNING ⚠️ label smoothing {label_smoothing} requires torch>=1.10.0')
        return nn.CrossEntropyLoss()
    return nn.CrossEntropyLoss(label_smoothing=label_smoothing)
def reshape_classifier_output(model, n=1000):
    """
    Update a classification model in place so that its last child outputs `n` classes.

    Handles a YOLOv5 Classify() head, a bare nn.Linear, and an nn.Sequential whose first
    nn.Linear (or, failing that, first nn.Conv2d) is the output layer. Nothing is replaced
    when the layer already has `n` outputs.
    """
    from models.common import Classify
    root = model.model if hasattr(model, 'model') else model
    name, m = list(root.named_children())[-1]  # last module
    if isinstance(m, Classify):  # YOLOv5 Classify() head
        if m.linear.out_features != n:
            m.linear = nn.Linear(m.linear.in_features, n)
        return
    if isinstance(m, nn.Linear):  # ResNet, EfficientNet
        if m.out_features != n:
            setattr(model, name, nn.Linear(m.in_features, n))
        return
    if not isinstance(m, nn.Sequential):
        return
    types = [type(x) for x in m]
    if nn.Linear in types:
        i = types.index(nn.Linear)  # nn.Linear index
        if m[i].out_features != n:
            m[i] = nn.Linear(m[i].in_features, n)
    elif nn.Conv2d in types:
        i = types.index(nn.Conv2d)  # nn.Conv2d index
        if m[i].out_channels != n:
            m[i] = nn.Conv2d(m[i].in_channels, n, m[i].kernel_size, m[i].stride, bias=m[i].bias is not None)


@contextmanager
def torch_distributed_zero_first(local_rank: int):
    """
    Context manager that makes all processes in distributed training wait for each local master.

    Ranks other than -1 and 0 block on a barrier before entering the body; rank 0 runs the body
    first and then releases them with its own barrier. Rank -1 never touches a barrier.
    """
    if local_rank not in [-1, 0]:
        dist.barrier(device_ids=[local_rank])
    yield
    if local_rank == 0:
        dist.barrier(device_ids=[0])


def device_count():
    """
    Return the number of CUDA devices reported by `nvidia-smi -L`. Supports Linux and Windows.

    Returns 0 if the command fails or its output cannot be parsed.
    """
    assert platform.system() in ('Linux', 'Windows'), 'device_count() only supported on Linux or Windows'
    try:
        cmd = 'nvidia-smi -L | wc -l' if platform.system() == 'Linux' else 'nvidia-smi -L | find /c /v ""'  # Windows
        return int(subprocess.run(cmd, shell=True, capture_output=True, check=True).stdout.decode().split()[-1])
    except Exception:
        return 0


def select_device(device='', batch_size=0, newline=True):
    """
    Resolve a device request into a torch.device and log a one-line environment summary.

    Args:
        device: None or 'cpu' or 'mps' or 0 or '0' or '0,1,2,3' (a 'cuda:' prefix is stripped).
        batch_size: when > 0 and several GPUs are requested, must be a multiple of the GPU count.
        newline: keep the trailing newline of the logged summary.

    Returns:
        torch.device: 'cuda:0', 'mps' or 'cpu'.

    Raises:
        AssertionError: if the requested CUDA devices are unavailable or batch_size does not divide evenly.
    """
    s = f'YOLOv5 🚀 {git_describe() or file_date()} Python-{platform.python_version()} torch-{torch.__version__} '
    device = str(device).strip().lower().replace('cuda:', '').replace('none', '')  # to string, 'cuda:0' to '0'
    cpu = device == 'cpu'
    mps = device == 'mps'  # Apple Metal Performance Shaders (MPS)
    if cpu or mps:
        os.environ['CUDA_VISIBLE_DEVICES'] = '-1'  # force torch.cuda.is_available() = False
    elif device:  # non-cpu device requested
        os.environ['CUDA_VISIBLE_DEVICES'] = device  # set environment variable - must be before assert is_available()
        # count comma-separated ids, not characters, so that two-digit ids such as '10' count as one device
        requested = [d for d in device.split(',') if d]
        assert torch.cuda.is_available() and torch.cuda.device_count() >= len(requested), \
            f"Invalid CUDA '--device {device}' requested, use '--device cpu' or pass valid CUDA device(s)"

    if not cpu and not mps and torch.cuda.is_available():  # prefer GPU if available
        devices = device.split(',') if device else '0'  # range(torch.cuda.device_count())  # i.e. 0,1,6,7
        n = len(devices)  # device count
        if n > 1 and batch_size > 0:  # check batch_size is divisible by device_count
            assert batch_size % n == 0, f'batch-size {batch_size} not multiple of GPU count {n}'
        space = ' ' * (len(s) + 1)
        for i, d in enumerate(devices):
            # visible devices are re-indexed from 0, hence properties are queried by position i
            p = torch.cuda.get_device_properties(i)
            s += f"{'' if i == 0 else space}CUDA:{d} ({p.name}, {p.total_memory / (1 << 20):.0f}MiB)\n"  # bytes to MB
        arg = 'cuda:0'
    elif mps and getattr(torch, 'has_mps', False) and torch.backends.mps.is_available():  # prefer MPS if available
        s += 'MPS\n'
        arg = 'mps'
    else:  # revert to CPU
        s += 'CPU\n'
        arg = 'cpu'

    if not newline:
        s = s.rstrip()
    LOGGER.info(s)
    return torch.device(arg)


def time_sync():
    """Return time.time() after synchronising CUDA (when available) so pending GPU work is included."""
    if torch.cuda.is_available():
        torch.cuda.synchronize()
    return time.time()
def is_parallel(model):
    """Return True if model is exactly of type DP or DDP (subclasses are not matched)."""
    return type(model) in (nn.parallel.DataParallel, nn.parallel.DistributedDataParallel)


def de_parallel(model):
    """De-parallelize a model: return the wrapped single-GPU model if model is of type DP or DDP."""
    return model.module if is_parallel(model) else model


def initialize_weights(model):
    """Set BatchNorm2d eps/momentum and switch the listed activations to in-place mode; Conv2d is left untouched."""
    inplace_activations = (nn.Hardswish, nn.LeakyReLU, nn.ReLU, nn.ReLU6, nn.SiLU)
    for m in model.modules():
        kind = type(m)  # exact type match on purpose: subclasses keep their own settings
        if kind is nn.BatchNorm2d:
            m.eps = 1e-3
            m.momentum = 0.03
        elif kind in inplace_activations:
            m.inplace = True


def find_modules(model, mclass=nn.Conv2d):
    """Return the indices of the layers in model.module_list that are instances of 'mclass'."""
    return [i for i, m in enumerate(model.module_list) if isinstance(m, mclass)]


def sparsity(model):
    """
    Return global model sparsity: the fraction of parameter elements equal to zero.

    A model without any parameter elements has sparsity 0 (previously this raised
    ZeroDivisionError). The result is a 0-dim tensor.
    """
    total, zeros = 0, 0
    for p in model.parameters():
        total += p.numel()
        zeros += (p == 0).sum()
    if total == 0:
        return torch.tensor(0.0)
    return zeros / total


def prune(model, amount=0.3):
    """Prune every Conv2d weight to the requested sparsity with L1 unstructured pruning, made permanent."""
    import torch.nn.utils.prune as torch_prune  # aliased so the import does not shadow this function's name
    for m in model.modules():
        if isinstance(m, nn.Conv2d):
            torch_prune.l1_unstructured(m, name='weight', amount=amount)  # prune
            torch_prune.remove(m, 'weight')  # make permanent
    LOGGER.info(f'Model pruned to {sparsity(model):.3g} global sparsity')


def fuse_conv_and_bn(conv, bn):
    """
    Fuse Conv2d() and BatchNorm2d() layers https://tehnokv.com/posts/fusing-batchnorm-and-conv/

    Returns a new Conv2d (bias enabled, requires_grad=False, on conv's device) whose weight and
    bias fold in the BatchNorm scale and shift computed from its running statistics.
    """
    fusedconv = nn.Conv2d(conv.in_channels,
                          conv.out_channels,
                          kernel_size=conv.kernel_size,
                          stride=conv.stride,
                          padding=conv.padding,
                          dilation=conv.dilation,
                          groups=conv.groups,
                          bias=True).requires_grad_(False).to(conv.weight.device)

    # Prepare filters: scale every output channel by gamma / sqrt(var + eps)
    w_conv = conv.weight.clone().view(conv.out_channels, -1)
    w_bn = torch.diag(bn.weight.div(torch.sqrt(bn.eps + bn.running_var)))
    fusedconv.weight.copy_(torch.mm(w_bn, w_conv).view(fusedconv.weight.shape))

    # Prepare spatial bias: a conv without bias contributes zeros
    b_conv = torch.zeros(conv.weight.size(0), device=conv.weight.device) if conv.bias is None else conv.bias
    b_bn = bn.bias - bn.weight.mul(bn.running_mean).div(torch.sqrt(bn.running_var + bn.eps))
    fusedconv.bias.copy_(torch.mm(w_bn, b_conv.reshape(-1, 1)).reshape(-1) + b_bn)

    return fusedconv
Path(model.yaml_file).stem.replace('yolov5', 'YOLOv5') if hasattr(model, 'yaml_file') else 'Model' LOGGER.info(f'{name} summary: {len(list(model.modules()))} layers, {n_p} parameters, {n_g} gradients{fs}') def scale_img(img, ratio=1.0, same_shape=False, gs=32): # img(16,3,256,416) # Scales img(bs,3,y,x) by ratio constrained to gs-multiple if ratio == 1.0: return img h, w = img.shape[2:] s = (int(h * ratio), int(w * ratio)) # new size img = F.interpolate(img, size=s, mode='bilinear', align_corners=False) # resize if not same_shape: # pad/crop img h, w = (math.ceil(x * ratio / gs) * gs for x in (h, w)) return F.pad(img, [0, w - s[1], 0, h - s[0]], value=0.447) # value = imagenet mean def copy_attr(a, b, include=(), exclude=()): # Copy attributes from b to a, options to only include [...] and to exclude [...] for k, v in b.__dict__.items(): if (len(include) and k not in include) or k.startswith('_') or k in exclude: continue else: setattr(a, k, v) def smart_optimizer(model, name='Adam', lr=0.001, momentum=0.9, decay=1e-5): # YOLOv5 3-param group optimizer: 0) weights with decay, 1) weights no decay, 2) biases no decay g = [], [], [] # optimizer parameter groups bn = tuple(v for k, v in nn.__dict__.items() if 'Norm' in k) # normalization layers, i.e. 
def smart_resume(ckpt, optimizer, ema=None, weights='yolov5s.pt', epochs=300, resume=True):
    """Restore optimizer/EMA state from a checkpoint and work out the epoch range to train.

    Args:
        ckpt: Mapping read with the keys 'epoch' and 'optimizer', plus 'best_fitness' when an
            optimizer state is present and 'ema'/'updates' when EMA is restored.
        optimizer: Object whose ``load_state_dict`` receives ``ckpt['optimizer']`` (if not None).
        ema: Optional EMA wrapper; its ``.ema`` weights and ``.updates`` counter are restored.
        weights: Checkpoint name, used only in messages.
        epochs: Requested total number of epochs.
        resume: When true, assert that there is something left to resume and log it.

    Returns:
        Tuple ``(best_fitness, start_epoch, epochs)``.
    """
    start_epoch = ckpt['epoch'] + 1

    # Optimizer state and best fitness travel together in the checkpoint
    optimizer_state = ckpt['optimizer']
    if optimizer_state is None:
        best_fitness = 0.0
    else:
        optimizer.load_state_dict(optimizer_state)
        best_fitness = ckpt['best_fitness']

    # EMA weights and update counter
    if ema and ckpt.get('ema'):
        ema_state = ckpt['ema'].float().state_dict()
        ema.ema.load_state_dict(ema_state)
        ema.updates = ckpt['updates']

    if resume:
        assert start_epoch > 0, (f'{weights} training to {epochs} epochs is finished, nothing to resume.\n'
                                 f"Start a new training without --resume, i.e. 'python train.py --weights {weights}'")
        LOGGER.info(f'Resuming training from {weights} from epoch {start_epoch} to {epochs} total epochs')

    # Checkpoint already trained past the requested total: treat `epochs` as additional epochs
    if epochs < start_epoch:
        LOGGER.info(f"{weights} has been trained for {ckpt['epoch']} epochs. Fine-tuning for {epochs} more epochs.")
        epochs = epochs + ckpt['epoch']  # finetune additional epochs

    return best_fitness, start_epoch, epochs
class TritonRemoteModel:
    """
    A wrapper over a model served by the Triton Inference Server.

    It can be configured to communicate over GRPC or HTTP. It accepts Torch Tensors as input and returns them as
    outputs.
    """

    def __init__(self, url: str):
        """
        Connect to the server and read the metadata of the first model in its repository index.

        Keyword arguments:
        url: Fully qualified address of the Triton server - for e.g. grpc://localhost:8000
             A 'grpc' scheme selects the GRPC client; any other scheme uses the HTTP client.
        """
        parsed_url = urlparse(url)
        if parsed_url.scheme == 'grpc':
            from tritonclient.grpc import InferenceServerClient, InferInput

            self.client = InferenceServerClient(parsed_url.netloc)  # Triton GRPC client
            model_repository = self.client.get_model_repository_index()
            self.model_name = model_repository.models[0].name
            self.metadata = self.client.get_model_metadata(self.model_name, as_json=True)
        else:
            from tritonclient.http import InferenceServerClient, InferInput

            self.client = InferenceServerClient(parsed_url.netloc)  # Triton HTTP client
            model_repository = self.client.get_model_repository_index()
            self.model_name = model_repository[0]['name']
            self.metadata = self.client.get_model_metadata(self.model_name)

        # The factory is identical for both transports; only the InferInput class bound above differs.
        def create_input_placeholders() -> typing.List[InferInput]:
            return [
                InferInput(i['name'], [int(s) for s in i['shape']], i['datatype']) for i in self.metadata['inputs']]

        self._create_input_placeholders_fn = create_input_placeholders

    @property
    def runtime(self):
        """Returns the model runtime"""
        return self.metadata.get('backend', self.metadata.get('platform'))

    def __call__(self, *args, **kwargs) -> typing.Union[torch.Tensor, typing.Tuple[torch.Tensor, ...]]:
        """
        Invokes the model. Parameters can be provided via args or kwargs.
        args, if provided, are assumed to match the order of inputs of the model.
        kwargs are matched with the model input names.

        Returns a single tensor when the model metadata lists one output, otherwise a list of tensors in the
        order of the metadata's 'outputs' entries.
        """
        inputs = self._create_inputs(*args, **kwargs)
        response = self.client.infer(model_name=self.model_name, inputs=inputs)
        result = [torch.as_tensor(response.as_numpy(output['name'])) for output in self.metadata['outputs']]
        return result[0] if len(result) == 1 else result

    @staticmethod
    def _placeholder_name(placeholder):
        """Return the input name of a placeholder, whether `name` is exposed as a method or a plain attribute."""
        name = placeholder.name
        # tritonclient's InferInput exposes name() as a method; using the bound method itself as a
        # dict key can never match a keyword argument.
        return name() if callable(name) else name

    def _create_inputs(self, *args, **kwargs):
        """
        Build fresh input placeholders and fill them from positional OR keyword tensors.

        Raises:
            RuntimeError: no inputs given, args and kwargs mixed, or wrong number of positional inputs.
            KeyError: a model input name is missing from kwargs.
        """
        args_len, kwargs_len = len(args), len(kwargs)
        if not args_len and not kwargs_len:
            raise RuntimeError('No inputs provided.')
        if args_len and kwargs_len:
            raise RuntimeError('Cannot specify args and kwargs at the same time')

        placeholders = self._create_input_placeholders_fn()
        if args_len:
            if args_len != len(placeholders):
                raise RuntimeError(f'Expected {len(placeholders)} inputs, got {args_len}.')
            values = args
        else:
            values = [kwargs[self._placeholder_name(placeholder)] for placeholder in placeholders]

        for placeholder, value in zip(placeholders, values):
            placeholder.set_data_from_numpy(value.cpu().numpy())
        return placeholders
def save_one_txt(predn, save_conf, shape, file):
    """Append the detections of one image to a text label file.

    Args:
        predn: Detections; each row unpacks as (*xyxy, conf, cls).
        save_conf: When true, the confidence is appended as the last column of every line.
        shape: Image shape indexed as [1, 0, 1, 0] to build the whwh normalization gain.
        file: Path of the label file; opened in append mode.

    Each line is "cls x y w h[ conf]" with the box converted by xyxy2xywh and divided by the gain.
    The file is opened once per call rather than once per detection, and is not created at all
    when there are no detections.
    """
    gn = torch.tensor(shape)[[1, 0, 1, 0]]  # normalization gain whwh
    rows = []
    for *xyxy, conf, cls in predn.tolist():
        xywh = (xyxy2xywh(torch.tensor(xyxy).view(1, 4)) / gn).view(-1).tolist()  # normalized xywh
        line = (cls, *xywh, conf) if save_conf else (cls, *xywh)  # label format
        rows.append(('%g ' * len(line)).rstrip() % line + '\n')
    if rows:  # no detections -> leave the filesystem untouched
        with open(file, 'a') as f:
            f.writelines(rows)
@smart_inference_mode()
def run(
        data,
        weights=None,  # model.pt path(s)
        batch_size=32,  # batch size
        imgsz=640,  # inference size (pixels)
        conf_thres=0.001,  # confidence threshold
        iou_thres=0.6,  # NMS IoU threshold
        max_det=300,  # maximum detections per image
        task='val',  # train, val, test, speed or study
        device='',  # cuda device, i.e. 0 or 0,1,2,3 or cpu
        workers=8,  # max dataloader workers (per RANK in DDP mode)
        single_cls=False,  # treat as single-class dataset
        augment=False,  # augmented inference
        verbose=False,  # verbose output
        save_txt=False,  # save results to *.txt
        save_hybrid=False,  # save label+prediction hybrid results to *.txt
        save_conf=False,  # save confidences in --save-txt labels
        save_json=False,  # save a COCO-JSON results file
        project=ROOT / 'runs/val',  # save to project/name
        name='exp',  # save to project/name
        exist_ok=False,  # existing project/name ok, do not increment
        half=True,  # use FP16 half-precision inference
        dnn=False,  # use OpenCV DNN for ONNX inference
        model=None,
        dataloader=None,
        save_dir=Path(''),
        plots=True,
        callbacks=Callbacks(),
        compute_loss=None,
):
    """Validate a detection model on a dataset and return metrics, per-class mAP and timings.

    Two calling modes are distinguished by `model`:
      * `model` given (training mode): the passed model and `dataloader` are used as-is.
      * `model` is None (standalone mode): the device, output directory, model (from `weights`),
        dataset dict and dataloader are all created here.

    Returns:
        A 3-tuple `((mp, mr, map50, map, *loss), maps, t)` where `loss` is the accumulated loss divided
        by `len(dataloader)`, `maps` is an array of length `nc` pre-filled with `map` and overwritten
        per evaluated class with that class's AP, and `t` holds the three profiler times (pre-process,
        inference, NMS) in ms per image.

    NOTE(review): the default `callbacks=Callbacks()` is evaluated once at definition time, so all calls
    that omit `callbacks` share one instance - confirm that is intended.
    """
    # Initialize/load model and set device
    training = model is not None
    if training:  # called by train.py
        device, pt, jit, engine = next(model.parameters()).device, True, False, False  # get model device, PyTorch model
        half &= device.type != 'cpu'  # half precision only supported on CUDA
        model.half() if half else model.float()
    else:  # called directly
        device = select_device(device, batch_size=batch_size)

        # Directories
        save_dir = increment_path(Path(project) / name, exist_ok=exist_ok)  # increment run
        (save_dir / 'labels' if save_txt else save_dir).mkdir(parents=True, exist_ok=True)  # make dir

        # Load model
        model = DetectMultiBackend(weights, device=device, dnn=dnn, data=data, fp16=half)
        stride, pt, jit, engine = model.stride, model.pt, model.jit, model.engine
        imgsz = check_img_size(imgsz, s=stride)  # check image size
        half = model.fp16  # FP16 supported on limited backends with CUDA
        if engine:
            batch_size = model.batch_size
        else:
            device = model.device
            if not (pt or jit):
                batch_size = 1  # export.py models default to batch-size 1
                LOGGER.info(f'Forcing --batch-size 1 square inference (1,3,{imgsz},{imgsz}) for non-PyTorch models')

        # Data
        data = check_dataset(data)  # check

    # Configure
    model.eval()
    cuda = device.type != 'cpu'
    is_coco = isinstance(data.get('val'), str) and data['val'].endswith(f'coco{os.sep}val2017.txt')  # COCO dataset
    nc = 1 if single_cls else int(data['nc'])  # number of classes
    iouv = torch.linspace(0.5, 0.95, 10, device=device)  # iou vector for mAP@0.5:0.95
    niou = iouv.numel()

    # Dataloader (only built in standalone mode; in training mode the caller's dataloader is used)
    if not training:
        if pt and not single_cls:  # check --weights are trained on --data
            ncm = model.model.nc
            assert ncm == nc, f'{weights} ({ncm} classes) trained on different --data than what you passed ({nc} ' \
                              f'classes). Pass correct combination of --weights and --data that are trained together.'
        model.warmup(imgsz=(1 if pt else batch_size, 3, imgsz, imgsz))  # warmup
        pad, rect = (0.0, False) if task == 'speed' else (0.5, pt)  # square inference for benchmarks
        task = task if task in ('train', 'val', 'test') else 'val'  # path to train/val/test images
        dataloader = create_dataloader(data[task],
                                       imgsz,
                                       batch_size,
                                       stride,
                                       single_cls,
                                       pad=pad,
                                       rect=rect,
                                       workers=workers,
                                       prefix=colorstr(f'{task}: '))[0]

    seen = 0
    confusion_matrix = ConfusionMatrix(nc=nc)
    names = model.names if hasattr(model, 'names') else model.module.names  # get class names
    if isinstance(names, (list, tuple)):  # old format
        names = dict(enumerate(names))
    class_map = coco80_to_coco91_class() if is_coco else list(range(1000))
    s = ('%22s' + '%11s' * 6) % ('Class', 'Images', 'Instances', 'P', 'R', 'mAP50', 'mAP50-95')
    # NOTE: `map` (and `eval` further down) shadow Python builtins inside this function
    tp, fp, p, r, f1, mp, mr, map50, ap50, map = 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0
    dt = Profile(), Profile(), Profile()  # profiling times
    loss = torch.zeros(3, device=device)
    jdict, stats, ap, ap_class = [], [], [], []
    callbacks.run('on_val_start')
    pbar = tqdm(dataloader, desc=s, bar_format=TQDM_BAR_FORMAT)  # progress bar
    for batch_i, (im, targets, paths, shapes) in enumerate(pbar):
        callbacks.run('on_val_batch_start')
        with dt[0]:
            if cuda:
                im = im.to(device, non_blocking=True)
                targets = targets.to(device)
            im = im.half() if half else im.float()  # uint8 to fp16/32
            im /= 255  # 0 - 255 to 0.0 - 1.0
            nb, _, height, width = im.shape  # batch size, channels, height, width

        # Inference
        with dt[1]:
            preds, train_out = model(im) if compute_loss else (model(im, augment=augment), None)

        # Loss
        if compute_loss:
            loss += compute_loss(train_out, targets)[1]  # box, obj, cls

        # NMS
        # targets is modified in place here; everything below sees pixel-space target boxes
        targets[:, 2:] *= torch.tensor((width, height, width, height), device=device)  # to pixels
        lb = [targets[targets[:, 0] == i, 1:] for i in range(nb)] if save_hybrid else []  # for autolabelling
        with dt[2]:
            preds = non_max_suppression(preds,
                                        conf_thres,
                                        iou_thres,
                                        labels=lb,
                                        multi_label=True,
                                        agnostic=single_cls,
                                        max_det=max_det)

        # Metrics
        for si, pred in enumerate(preds):
            labels = targets[targets[:, 0] == si, 1:]
            nl, npr = labels.shape[0], pred.shape[0]  # number of labels, predictions
            path, shape = Path(paths[si]), shapes[si][0]
            correct = torch.zeros(npr, niou, dtype=torch.bool, device=device)  # init
            seen += 1

            if npr == 0:
                # No predictions: only record the image if it has labels (they all count as misses)
                if nl:
                    stats.append((correct, *torch.zeros((2, 0), device=device), labels[:, 0]))
                    if plots:
                        confusion_matrix.process_batch(detections=None, labels=labels[:, 0])
                continue

            # Predictions
            if single_cls:
                pred[:, 5] = 0
            predn = pred.clone()
            scale_boxes(im[si].shape[1:], predn[:, :4], shape, shapes[si][1])  # native-space pred

            # Evaluate
            if nl:
                tbox = xywh2xyxy(labels[:, 1:5])  # target boxes
                scale_boxes(im[si].shape[1:], tbox, shape, shapes[si][1])  # native-space labels
                labelsn = torch.cat((labels[:, 0:1], tbox), 1)  # native-space labels
                correct = process_batch(predn, labelsn, iouv)
                if plots:
                    confusion_matrix.process_batch(predn, labelsn)
            stats.append((correct, pred[:, 4], pred[:, 5], labels[:, 0]))  # (correct, conf, pcls, tcls)

            # Save/log
            if save_txt:
                save_one_txt(predn, save_conf, shape, file=save_dir / 'labels' / f'{path.stem}.txt')
            if save_json:
                save_one_json(predn, jdict, path, class_map)  # append to COCO-JSON dictionary
            callbacks.run('on_val_image_end', pred, predn, path, names, im[si])

        # Plot images (first three batches only)
        if plots and batch_i < 3:
            plot_images(im, targets, paths, save_dir / f'val_batch{batch_i}_labels.jpg', names)  # labels
            plot_images(im, output_to_target(preds), paths, save_dir / f'val_batch{batch_i}_pred.jpg', names)  # pred

        callbacks.run('on_val_batch_end', batch_i, im, targets, paths, shapes, preds)

    # Compute metrics
    stats = [torch.cat(x, 0).cpu().numpy() for x in zip(*stats)]  # to numpy
    if len(stats) and stats[0].any():
        tp, fp, p, r, f1, ap, ap_class = ap_per_class(*stats, plot=plots, save_dir=save_dir, names=names)
        ap50, ap = ap[:, 0], ap.mean(1)  # AP@0.5, AP@0.5:0.95
        mp, mr, map50, map = p.mean(), r.mean(), ap50.mean(), ap.mean()
    # NOTE(review): if nothing was appended to `stats` above, `stats` is an empty list here and
    # `stats[3]` raises IndexError - confirm callers never hit that case.
    nt = np.bincount(stats[3].astype(int), minlength=nc)  # number of targets per class

    # Print results
    pf = '%22s' + '%11i' * 2 + '%11.3g' * 4  # print format
    LOGGER.info(pf % ('all', seen, nt.sum(), mp, mr, map50, map))
    if nt.sum() == 0:
        LOGGER.warning(f'WARNING ⚠️ no labels found in {task} set, can not compute metrics without labels')

    # Print results per class
    if (verbose or (nc < 50 and not training)) and nc > 1 and len(stats):
        for i, c in enumerate(ap_class):
            LOGGER.info(pf % (names[c], seen, nt[c], p[i], r[i], ap50[i], ap[i]))

    # Print speeds
    # NOTE(review): divides by `seen`; presumably fails with ZeroDivisionError when no image was
    # processed - verify against Profile.t's type.
    t = tuple(x.t / seen * 1E3 for x in dt)  # speeds per image
    if not training:
        shape = (batch_size, 3, imgsz, imgsz)
        LOGGER.info(f'Speed: %.1fms pre-process, %.1fms inference, %.1fms NMS per image at shape {shape}' % t)

    # Plots
    if plots:
        confusion_matrix.plot(save_dir=save_dir, names=list(names.values()))
        callbacks.run('on_val_end', nt, tp, fp, p, r, f1, ap, ap50, ap_class, confusion_matrix)

    # Save JSON
    if save_json and len(jdict):
        w = Path(weights[0] if isinstance(weights, list) else weights).stem if weights is not None else ''  # weights
        anno_json = str(Path('../datasets/coco/annotations/instances_val2017.json'))  # annotations
        pred_json = str(save_dir / f'{w}_predictions.json')  # predictions
        LOGGER.info(f'\nEvaluating pycocotools mAP... saving {pred_json}...')
        with open(pred_json, 'w') as f:
            json.dump(jdict, f)

        # Best effort: any failure in the pycocotools evaluation is logged and the in-house metrics are kept
        try:  # https://github.com/cocodataset/cocoapi/blob/master/PythonAPI/pycocoEvalDemo.ipynb
            check_requirements('pycocotools>=2.0.6')
            from pycocotools.coco import COCO
            from pycocotools.cocoeval import COCOeval

            anno = COCO(anno_json)  # init annotations api
            pred = anno.loadRes(pred_json)  # init predictions api
            eval = COCOeval(anno, pred, 'bbox')
            if is_coco:
                eval.params.imgIds = [int(Path(x).stem) for x in dataloader.dataset.im_files]  # image IDs to evaluate
            eval.evaluate()
            eval.accumulate()
            eval.summarize()
            map, map50 = eval.stats[:2]  # update results (mAP@0.5:0.95, mAP@0.5)
        except Exception as e:
            LOGGER.info(f'pycocotools unable to run: {e}')

    # Return results
    model.float()  # for training
    if not training:
        s = f"\n{len(list(save_dir.glob('labels/*.txt')))} labels saved to {save_dir / 'labels'}" if save_txt else ''
        LOGGER.info(f"Results saved to {colorstr('bold', save_dir)}{s}")
    maps = np.zeros(nc) + map
    for i, c in enumerate(ap_class):
        maps[c] = ap[i]
    return (mp, mr, map50, map, *(loss.cpu() / len(dataloader)).tolist()), maps, t
def main(opt):
    """Dispatch a validation run according to ``opt.task``.

    * 'train' / 'val' / 'test': a single ``run`` with the parsed options.
    * 'speed': one ``run`` per weights file with conf/iou thresholds 0.25/0.45, no JSON, no plots.
    * 'study': for every weights file, ``run`` at image sizes 256..1536 (step 128), save the results
      and timings to ``study_<data>_<weights>.txt``, then zip the study files and plot them.

    Raises:
        NotImplementedError: for any other task name.
    """
    check_requirements(exclude=('tensorboard', 'thop'))

    if opt.task in ('train', 'val', 'test'):  # run normally
        if opt.conf_thres > 0.001:  # https://github.com/ultralytics/yolov5/issues/1466
            LOGGER.info(f'WARNING ⚠️ confidence threshold {opt.conf_thres} > 0.001 produces invalid results')
        if opt.save_hybrid:
            LOGGER.info('WARNING ⚠️ --save-hybrid will return high mAP from hybrid labels, not from predictions alone')
        run(**vars(opt))

    else:
        weights = opt.weights if isinstance(opt.weights, list) else [opt.weights]
        opt.half = torch.cuda.is_available() and opt.device != 'cpu'  # FP16 for fastest results
        if opt.task == 'speed':  # speed benchmarks
            # python val.py --task speed --data coco.yaml --batch 1 --weights yolov5n.pt yolov5s.pt...
            opt.conf_thres, opt.iou_thres, opt.save_json = 0.25, 0.45, False
            for opt.weights in weights:
                run(**vars(opt), plots=False)

        elif opt.task == 'study':  # speed vs mAP benchmarks
            # python val.py --task study --data coco.yaml --iou 0.7 --weights yolov5n.pt yolov5s.pt...
            for opt.weights in weights:
                f = f'study_{Path(opt.data).stem}_{Path(opt.weights).stem}.txt'  # filename to save to
                x, y = list(range(256, 1536 + 128, 128)), []  # x axis (image sizes), y axis
                for opt.imgsz in x:  # img-size
                    LOGGER.info(f'\nRunning {f} --imgsz {opt.imgsz}...')
                    r, _, t = run(**vars(opt), plots=False)
                    y.append(r + t)  # results and times
                np.savetxt(f, y, fmt='%10.4g')  # save
            # No shell is involved when an argument list is passed, so 'study_*.txt' would reach zip as a
            # literal file name; expand the pattern here instead.
            study_files = sorted(str(p) for p in Path('.').glob('study_*.txt'))
            if study_files:
                subprocess.run(['zip', '-r', 'study.zip', *study_files])
            plot_val_study(x=x)  # plot
        else:
            raise NotImplementedError(f'--task {opt.task} not in ("train", "val", "test", "speed", "study")')
class RFEM(nn.Module):
    """Stack of TridentBlocks whose three branch outputs are summed with the input, then BN + SiLU."""

    def __init__(self, c1, c2, n=1, e=0.5, stride=1):
        """Build one entry TridentBlock (c=True, so it takes a single tensor) followed by n-1 default blocks.

        Args:
            c1: Input channels of the first block.
            c2: Output channels of every block and of the final BatchNorm.
            n: Total number of TridentBlocks.
            e: Expansion ratio forwarded to the first block only.
            stride: Stride forwarded to the first block only.
        """
        super().__init__()
        entry_block = TridentBlock(c1, c2, stride=stride, c=True, e=e)
        # Follow-up blocks use TridentBlock defaults and map c2 -> c2
        follow_ups = [TridentBlock(c2, c2) for _ in range(n - 1)]
        self.layer = nn.Sequential(entry_block, *follow_ups)
        self.bn = nn.BatchNorm2d(c2)
        self.act = nn.SiLU()

    def forward(self, x):
        """Sum the three branch outputs and the input (residual), then normalize and activate."""
        branches = self.layer(x)
        fused = branches[0] + branches[1] + branches[2] + x
        return self.act(self.bn(fused))
class CARAFE(nn.Module):
    """Unofficial CARAFE upsampling module, see https://arxiv.org/abs/1905.02188."""

    def __init__(self, c, k_enc=3, k_up=5, c_mid=64, scale=2):
        """
        Args:
            c: The channel number of the input and the output.
            k_enc: The kernel size of the encoder.
            k_up: The size of the reassembly kernel.
            c_mid: The channel number after compression.
            scale: The expected upsample scale.
        """
        super().__init__()
        self.scale = scale

        # Kernel-prediction branch: compress -> encode to (scale*k_up)^2 channels -> pixel shuffle
        self.comp = Conv(c, c_mid)
        self.enc = Conv(c_mid, (scale * k_up) ** 2, k=k_enc, act=False)
        self.pix_shf = nn.PixelShuffle(scale)

        # Feature branch: nearest upsample, then gather a dilated k_up x k_up neighbourhood per position
        self.upsmp = nn.Upsample(scale_factor=scale, mode='nearest')
        self.unfold = nn.Unfold(kernel_size=k_up, dilation=scale, padding=k_up // 2 * scale)

    def forward(self, X):
        """Upsample X (b, c, h, w) to (b, c, h*scale, w*scale) with content-aware reassembly kernels."""
        batch, channels, height, width = X.size()
        out_h = height * self.scale
        out_w = width * self.scale

        # Per-position reassembly kernels, normalized over the kernel dimension: b * k_up^2 * h_ * w_
        kernels = torch.softmax(self.pix_shf(self.enc(self.comp(X))), dim=1)

        # Neighbourhoods of the upsampled features, reshaped to b * c * k_up^2 * h_ * w_
        patches = self.unfold(self.upsmp(X)).view(batch, channels, -1, out_h, out_w)

        # Weighted sum over the kernel dimension -> b * c * h_ * w_
        return torch.einsum('bkhw,bckhw->bchw', [kernels, patches])
class RepConv(nn.Module):
    """
    RepVGG-style re-parameterizable 3x3 convolution block (used in RT-DETR).

    Training form: ``act(conv3x3_bn(x) + conv1x1_bn(x) [+ bn(x)])``.
    ``fuse_convs`` folds every branch into one biased 3x3 convolution stored as ``self.conv``.
    Based on https://github.com/DingXiaoH/RepVGG/blob/main/repvgg.py
    """
    default_act = nn.SiLU()  # default activation

    def __init__(self, c1, c2, k=3, s=1, p=1, g=1, d=1, act=True, bn=False, deploy=False):
        """
        Build the 3x3 and 1x1 Conv+BN branches and the optional identity BatchNorm branch.

        Args:
            c1: Input channels.
            c2: Output channels.
            k: Kernel size of the main branch; must be 3.
            s: Stride shared by both convolution branches.
            p: Padding of the main branch; must be 1.
            g: Convolution groups.
            d: Accepted for signature compatibility; not used by this block.
            act: ``True`` selects ``default_act``, an ``nn.Module`` is used as is, anything else
                disables the activation.
            bn: Add an identity BatchNorm branch (only created when ``c1 == c2`` and ``s == 1``).
            deploy: Accepted for signature compatibility; not used by this block.
        """
        super().__init__()
        assert k == 3 and p == 1
        self.g = g
        self.c1 = c1
        self.c2 = c2
        self.act = self.default_act if act is True else act if isinstance(act, nn.Module) else nn.Identity()

        self.bn = nn.BatchNorm2d(num_features=c1) if bn and c2 == c1 and s == 1 else None
        self.conv1 = Conv(c1, c2, k, s, p=p, g=g, act=False)
        self.conv2 = Conv(c1, c2, 1, s, p=(p - k // 2), g=g, act=False)

    def forward_fuse(self, x):
        """Forward pass through the single fused convolution (valid after ``fuse_convs``)."""
        return self.act(self.conv(x))

    def forward(self, x):
        """Forward pass; uses the fused convolution once ``fuse_convs`` has been called."""
        # fuse_convs() deletes conv1/conv2/bn, so the multi-branch path would raise
        # AttributeError afterwards; dispatch to the fused kernel instead.
        if hasattr(self, 'conv'):
            return self.forward_fuse(x)
        id_out = 0 if self.bn is None else self.bn(x)
        return self.act(self.conv1(x) + self.conv2(x) + id_out)

    def get_equivalent_kernel_bias(self):
        """Return the 3x3 kernel and bias equivalent to the sum of all training branches."""
        kernel3x3, bias3x3 = self._fuse_bn_tensor(self.conv1)
        kernel1x1, bias1x1 = self._fuse_bn_tensor(self.conv2)
        kernelid, biasid = self._fuse_bn_tensor(self.bn)
        return kernel3x3 + self._pad_1x1_to_3x3_tensor(kernel1x1) + kernelid, bias3x3 + bias1x1 + biasid

    def _pad_1x1_to_3x3_tensor(self, kernel1x1):
        """Zero-pad a 1x1 kernel to 3x3; returns 0 when the kernel is ``None``."""
        if kernel1x1 is None:
            return 0
        return torch.nn.functional.pad(kernel1x1, [1, 1, 1, 1])

    def _fuse_bn_tensor(self, branch):
        """
        Fold a branch's BatchNorm into a (kernel, bias) pair.

        ``branch`` is ``None`` (returns ``(0, 0)``), a bare ``nn.BatchNorm2d`` (identity branch)
        or a Conv wrapper exposing ``.conv`` and ``.bn``.
        """
        if branch is None:
            return 0, 0
        if isinstance(branch, nn.BatchNorm2d):
            if not hasattr(self, 'id_tensor'):
                # Identity expressed as a grouped 3x3 kernel: a single 1 at the centre tap.
                input_dim = self.c1 // self.g
                kernel_value = torch.zeros((self.c1, input_dim, 3, 3), dtype=torch.float32,
                                           device=branch.weight.device)
                idx = torch.arange(self.c1, device=branch.weight.device)
                kernel_value[idx, idx % input_dim, 1, 1] = 1
                self.id_tensor = kernel_value
            kernel = self.id_tensor
            bn = branch
        else:
            kernel = branch.conv.weight
            bn = branch.bn
        running_mean = bn.running_mean
        running_var = bn.running_var
        gamma = bn.weight
        beta = bn.bias
        eps = bn.eps
        std = (running_var + eps).sqrt()
        t = (gamma / std).reshape(-1, 1, 1, 1)
        return kernel * t, beta - running_mean * gamma / std

    def fuse_convs(self):
        """Replace the training branches with one equivalent convolution and drop the originals."""
        if hasattr(self, 'conv'):
            return
        kernel, bias = self.get_equivalent_kernel_bias()
        self.conv = nn.Conv2d(in_channels=self.conv1.conv.in_channels,
                              out_channels=self.conv1.conv.out_channels,
                              kernel_size=self.conv1.conv.kernel_size,
                              stride=self.conv1.conv.stride,
                              padding=self.conv1.conv.padding,
                              dilation=self.conv1.conv.dilation,
                              groups=self.conv1.conv.groups,
                              bias=True).requires_grad_(False)
        self.conv.weight.data = kernel
        self.conv.bias.data = bias
        for para in self.parameters():
            para.detach_()
        self.__delattr__('conv1')
        self.__delattr__('conv2')
        for name in ('nm', 'bn', 'id_tensor'):
            if hasattr(self, name):
                self.__delattr__(name)


class RepC3(nn.Module):
    """Rep C3: two 1x1 projections of the input, a RepConv stack on the first, summed and projected."""

    def __init__(self, c1, c2, n=3, e=1.0):
        """
        Args:
            c1: Input channels.
            c2: Output channels.
            n: Number of stacked RepConv blocks.
            e: Hidden-channel ratio; hidden width is ``int(c2 * e)``.
        """
        super().__init__()
        c_ = int(c2 * e)  # hidden channels
        self.cv1 = Conv(c1, c_, 1, 1)
        self.cv2 = Conv(c1, c_, 1, 1)
        self.m = nn.Sequential(*[RepConv(c_, c_) for _ in range(n)])
        # The output projection is only needed when the hidden width differs from c2.
        self.cv3 = Conv(c_, c2, 1, 1) if c_ != c2 else nn.Identity()

    def forward(self, x):
        """Forward pass of the RT-DETR neck layer."""
        return self.cv3(self.m(self.cv1(x)) + self.cv2(x))
class ContextAggregation(nn.Module):
    """
    Context Aggregation Block.

    A softmax-weighted spatial pooling of a value projection is passed through an output
    projection, gated by a per-pixel sigmoid map and added back to the input.

    Args:
        in_channels (int): Number of input channels.
        reduction (int, optional): Channel reduction ratio. Default: 1.
    """

    def __init__(self, in_channels, reduction=1):
        super(ContextAggregation, self).__init__()
        self.in_channels = in_channels
        self.reduction = reduction
        self.inter_channels = max(in_channels // reduction, 1)

        def pointwise(cin, cout):
            # 1x1 ConvModule without activation.
            return ConvModule(cin, cout, kernel_size=1, act_cfg=None)

        self.a = pointwise(in_channels, 1)                    # gate logits
        self.k = pointwise(in_channels, 1)                    # spatial attention logits
        self.v = pointwise(in_channels, self.inter_channels)  # values
        self.m = pointwise(self.inter_channels, in_channels)  # output projection

        self.init_weights()

    def init_weights(self):
        """Xavier-initialise a/k/v; zero the output projection's conv weights."""
        for branch in (self.a, self.k, self.v):
            caffe2_xavier_init(branch.conv)
        constant_init(self.m.conv, 0)

    def forward(self, x):
        batch, mid = x.size(0), self.inter_channels

        # gate: [N, 1, H, W]
        gate = self.a(x).sigmoid()

        # attention over positions: [N, 1, HW, 1]
        attn = self.k(x).view(batch, 1, -1, 1).softmax(2)

        # values: [N, 1, C, HW]
        values = self.v(x).view(batch, 1, mid, -1)

        # pooled context: [N, C, 1, 1]
        context = torch.matmul(values, attn).view(batch, mid, 1, 1)

        gated = self.m(context) * gate
        return x + gated
class AddCoords(nn.Module):
    """Append normalised coordinate channels (and optionally a radius channel) to a feature map."""

    def __init__(self, with_r=False):
        """
        Args:
            with_r: Also append a radius channel computed from the two coordinate channels.
        """
        super().__init__()
        self.with_r = with_r

    def forward(self, input_tensor):
        """
        Args:
            input_tensor: shape(batch, channel, x_dim, y_dim)

        Returns:
            ``input_tensor`` with 2 extra channels (3 when ``with_r``): the first varies along
            dim 2, the second along dim 3, both scaled to [-1, 1].
        """
        batch_size, _, x_dim, y_dim = input_tensor.size()
        device = input_tensor.device

        # max(..., 1) keeps a size-1 axis finite (it maps to -1) instead of producing 0/0 = NaN.
        xs = torch.arange(x_dim, device=device).float() / max(x_dim - 1, 1)
        ys = torch.arange(y_dim, device=device).float() / max(y_dim - 1, 1)
        xs = xs * 2 - 1
        ys = ys * 2 - 1

        # Broadcast the 1-D ramps to (batch, 1, x_dim, y_dim) without materialising copies.
        xx_channel = xs.view(1, 1, x_dim, 1).expand(batch_size, 1, x_dim, y_dim).type_as(input_tensor)
        yy_channel = ys.view(1, 1, 1, y_dim).expand(batch_size, 1, x_dim, y_dim).type_as(input_tensor)

        ret = torch.cat([input_tensor, xx_channel, yy_channel], dim=1)

        if self.with_r:
            rr = torch.sqrt(torch.pow(xx_channel - 0.5, 2) + torch.pow(yy_channel - 0.5, 2))
            ret = torch.cat([ret, rr], dim=1)

        return ret


class CoordConv(nn.Module):
    """Convolution applied after coordinate channels have been appended to the input."""

    def __init__(self, in_channels, out_channels, kernel_size=1, stride=1, with_r=False):
        """
        Args:
            in_channels: Channels of the incoming feature map (before coordinates are added).
            out_channels: Output channels of the convolution.
            kernel_size: Convolution kernel size.
            stride: Convolution stride.
            with_r: Forwarded to ``AddCoords``; adds one more input channel.
        """
        super().__init__()
        self.addcoords = AddCoords(with_r=with_r)
        in_channels += 2  # the two coordinate channels
        if with_r:
            in_channels += 1  # the radius channel
        self.conv = Conv(in_channels, out_channels, k=kernel_size, s=stride)

    def forward(self, x):
        x = self.addcoords(x)
        x = self.conv(x)
        return x
def transI_fusebn(kernel, bn):
    """Fold BatchNorm ``bn`` (running statistics) into ``kernel``; returns (kernel, bias)."""
    gamma = bn.weight
    std = (bn.running_var + bn.eps).sqrt()
    return kernel * ((gamma / std).reshape(-1, 1, 1, 1)), bn.bias - bn.running_mean * gamma / std


def transII_addbranch(kernels, biases):
    """Merge parallel branches by summing their kernels and their biases."""
    return sum(kernels), sum(biases)


def transIII_1x1_kxk(k1, b1, k2, b2, groups):
    """
    Merge a 1x1 conv (k1, b1) followed by a KxK conv (k2, b2) into one KxK (kernel, bias).

    For ``groups > 1`` the merge is done per group and the results are concatenated.
    """
    if groups == 1:
        k = F.conv2d(k2, k1.permute(1, 0, 2, 3))
        b_hat = (k2 * b1.reshape(1, -1, 1, 1)).sum((1, 2, 3))
    else:
        k_slices = []
        b_slices = []
        k1_T = k1.permute(1, 0, 2, 3)
        k1_group_width = k1.size(0) // groups
        k2_group_width = k2.size(0) // groups
        for g in range(groups):
            k1_T_slice = k1_T[:, g * k1_group_width:(g + 1) * k1_group_width, :, :]
            k2_slice = k2[g * k2_group_width:(g + 1) * k2_group_width, :, :, :]
            k_slices.append(F.conv2d(k2_slice, k1_T_slice))
            b_slices.append(
                (k2_slice * b1[g * k1_group_width:(g + 1) * k1_group_width].reshape(1, -1, 1, 1)).sum((1, 2, 3)))
        k, b_hat = transIV_depthconcat(k_slices, b_slices)
    return k, b_hat + b2


def transIV_depthconcat(kernels, biases):
    """Concatenate kernels along the output-channel axis and biases along their only axis."""
    return torch.cat(kernels, dim=0), torch.cat(biases)


def transV_avg(channels, kernel_size, groups):
    """Return the grouped conv kernel that equals ``kernel_size`` x ``kernel_size`` average pooling."""
    input_dim = channels // groups
    k = torch.zeros((channels, input_dim, kernel_size, kernel_size))
    out_idx = torch.arange(channels)
    # Output channel i reads input channel (i mod input_dim) of its own group.
    k[out_idx, out_idx % input_dim, :, :] = 1.0 / kernel_size ** 2
    return k


def transVI_multiscale(kernel, target_kernel_size):
    """Zero-pad ``kernel`` symmetrically so both spatial sizes become ``target_kernel_size``."""
    H_pixels_to_pad = (target_kernel_size - kernel.size(2)) // 2
    W_pixels_to_pad = (target_kernel_size - kernel.size(3)) // 2
    # F.pad lists the LAST dimension first: (W_left, W_right, H_top, H_bottom).
    return F.pad(kernel, [W_pixels_to_pad, W_pixels_to_pad, H_pixels_to_pad, H_pixels_to_pad])


def conv_bn(in_channels, out_channels, kernel_size, stride=1, padding=0, dilation=1, groups=1,
            padding_mode='zeros'):
    """Return ``nn.Sequential`` with a bias-free ``conv`` followed by an affine ``bn``."""
    conv_layer = nn.Conv2d(in_channels=in_channels, out_channels=out_channels, kernel_size=kernel_size,
                           stride=stride, padding=padding, dilation=dilation, groups=groups,
                           bias=False, padding_mode=padding_mode)
    bn_layer = nn.BatchNorm2d(num_features=out_channels, affine=True)
    se = nn.Sequential()
    se.add_module('conv', conv_layer)
    se.add_module('bn', bn_layer)
    return se


class IdentityBasedConv1x1(nn.Conv2d):
    """1x1 grouped convolution whose effective kernel is ``identity + weight`` (weight starts at 0)."""

    def __init__(self, channels, groups=1):
        super(IdentityBasedConv1x1, self).__init__(in_channels=channels, out_channels=channels,
                                                   kernel_size=1, stride=1, padding=0,
                                                   groups=groups, bias=False)

        assert channels % groups == 0
        input_dim = channels // groups
        id_value = torch.zeros(channels, input_dim, 1, 1, dtype=self.weight.dtype,
                               device=self.weight.device)
        out_idx = torch.arange(channels)
        id_value[out_idx, out_idx % input_dim, 0, 0] = 1
        # Kept as a plain attribute (not a parameter or buffer), as before.
        self.id_tensor = id_value
        nn.init.zeros_(self.weight)

    def forward(self, input):
        kernel = self.weight + self.id_tensor.to(self.weight.device).type_as(self.weight)
        result = F.conv2d(input, kernel, None, stride=1, padding=0, dilation=self.dilation,
                          groups=self.groups)
        return result

    def get_actual_kernel(self):
        """Return the effective kernel (identity + learned weight), as used by ``forward``."""
        return self.weight + self.id_tensor.to(self.weight.device).type_as(self.weight)
class BNAndPadLayer(nn.Module):
    """BatchNorm2d followed by a border of ``pad_pixels`` filled with what the BN maps zero to."""

    def __init__(self,
                 pad_pixels,
                 num_features,
                 eps=1e-5,
                 momentum=0.1,
                 affine=True,
                 track_running_stats=True):
        super(BNAndPadLayer, self).__init__()
        self.bn = nn.BatchNorm2d(num_features, eps, momentum, affine, track_running_stats)
        self.pad_pixels = pad_pixels

    def forward(self, input):
        output = self.bn(input)
        p = self.pad_pixels
        if p <= 0:
            return output

        # Border value = BN output for an all-zero input, computed from the running statistics.
        if self.bn.affine:
            pad_values = self.bn.bias.detach() - self.bn.running_mean * self.bn.weight.detach() / torch.sqrt(
                self.bn.running_var + self.bn.eps)
        else:
            pad_values = - self.bn.running_mean / torch.sqrt(self.bn.running_var + self.bn.eps)

        output = F.pad(output, [p] * 4)
        pad_values = pad_values.view(1, -1, 1, 1)
        output[:, :, 0:p, :] = pad_values
        output[:, :, -p:, :] = pad_values
        output[:, :, :, 0:p] = pad_values
        output[:, :, :, -p:] = pad_values
        return output

    @property
    def weight(self):
        return self.bn.weight

    @property
    def bias(self):
        return self.bn.bias

    @property
    def running_mean(self):
        return self.bn.running_mean

    @property
    def running_var(self):
        return self.bn.running_var

    @property
    def eps(self):
        return self.bn.eps


class DiverseBranchBlock(nn.Module):
    """
    Diverse Branch Block: parallel KxK, 1x1, 1x1->KxK and 1x1->avg-pool branches that
    ``switch_to_deploy`` collapses into a single biased convolution (``dbb_reparam``).
    """

    def __init__(self, in_channels, out_channels, kernel_size,
                 stride=1, padding=None, dilation=1, groups=1,
                 internal_channels_1x1_3x3=None,
                 deploy=False, single_init=False):
        super(DiverseBranchBlock, self).__init__()
        self.deploy = deploy

        self.nonlinear = Conv.default_act

        self.kernel_size = kernel_size
        self.out_channels = out_channels
        self.groups = groups

        if padding is None:
            padding = autopad(kernel_size, padding, dilation)
        assert padding == kernel_size // 2

        if deploy:
            self.dbb_reparam = nn.Conv2d(in_channels=in_channels, out_channels=out_channels,
                                         kernel_size=kernel_size, stride=stride, padding=padding,
                                         dilation=dilation, groups=groups, bias=True)
        else:
            self._build_training_branches(in_channels, out_channels, kernel_size, stride, padding,
                                          dilation, groups, internal_channels_1x1_3x3)

        # The experiments reported in the paper used the default initialization of bn.weight (all as 1).
        # But changing the initialization may be useful in some cases.
        if single_init:
            # Initialize the bn.weight of dbb_origin as 1 and others as 0. This is not the default setting.
            self.single_init()

    def _build_training_branches(self, in_channels, out_channels, kernel_size, stride, padding,
                                 dilation, groups, internal_channels_1x1_3x3):
        """Create the multi-branch (training-time) sub-modules."""
        not_depthwise = groups < out_channels

        self.dbb_origin = conv_bn(in_channels=in_channels, out_channels=out_channels,
                                  kernel_size=kernel_size, stride=stride, padding=padding,
                                  dilation=dilation, groups=groups)

        self.dbb_avg = nn.Sequential()
        if not_depthwise:
            self.dbb_avg.add_module('conv',
                                    nn.Conv2d(in_channels=in_channels, out_channels=out_channels,
                                              kernel_size=1, stride=1, padding=0, groups=groups,
                                              bias=False))
            self.dbb_avg.add_module('bn', BNAndPadLayer(pad_pixels=padding, num_features=out_channels))
            self.dbb_avg.add_module('avg', nn.AvgPool2d(kernel_size=kernel_size, stride=stride, padding=0))
            self.dbb_1x1 = conv_bn(in_channels=in_channels, out_channels=out_channels, kernel_size=1,
                                   stride=stride, padding=0, groups=groups)
        else:
            self.dbb_avg.add_module('avg', nn.AvgPool2d(kernel_size=kernel_size, stride=stride,
                                                        padding=padding))

        self.dbb_avg.add_module('avgbn', nn.BatchNorm2d(out_channels))

        if internal_channels_1x1_3x3 is None:
            # For mobilenet, it is better to have 2X internal channels
            internal_channels_1x1_3x3 = in_channels if not_depthwise else 2 * in_channels

        self.dbb_1x1_kxk = nn.Sequential()
        if internal_channels_1x1_3x3 == in_channels:
            self.dbb_1x1_kxk.add_module('idconv1', IdentityBasedConv1x1(channels=in_channels, groups=groups))
        else:
            self.dbb_1x1_kxk.add_module('conv1', nn.Conv2d(in_channels=in_channels,
                                                           out_channels=internal_channels_1x1_3x3,
                                                           kernel_size=1, stride=1, padding=0,
                                                           groups=groups, bias=False))
        self.dbb_1x1_kxk.add_module('bn1', BNAndPadLayer(pad_pixels=padding,
                                                         num_features=internal_channels_1x1_3x3,
                                                         affine=True))
        self.dbb_1x1_kxk.add_module('conv2', nn.Conv2d(in_channels=internal_channels_1x1_3x3,
                                                       out_channels=out_channels,
                                                       kernel_size=kernel_size, stride=stride,
                                                       padding=0, groups=groups, bias=False))
        self.dbb_1x1_kxk.add_module('bn2', nn.BatchNorm2d(out_channels))

    def get_equivalent_kernel_bias(self):
        """Return the single (kernel, bias) equivalent to the sum of all training branches."""
        k_origin, b_origin = transI_fusebn(self.dbb_origin.conv.weight, self.dbb_origin.bn)

        if hasattr(self, 'dbb_1x1'):
            k_1x1, b_1x1 = transI_fusebn(self.dbb_1x1.conv.weight, self.dbb_1x1.bn)
            k_1x1 = transVI_multiscale(k_1x1, self.kernel_size)
        else:
            k_1x1, b_1x1 = 0, 0

        seq = self.dbb_1x1_kxk
        if hasattr(seq, 'idconv1'):
            k_1x1_kxk_first = seq.idconv1.get_actual_kernel()
        else:
            k_1x1_kxk_first = seq.conv1.weight
        k_1x1_kxk_first, b_1x1_kxk_first = transI_fusebn(k_1x1_kxk_first, seq.bn1)
        k_1x1_kxk_second, b_1x1_kxk_second = transI_fusebn(seq.conv2.weight, seq.bn2)
        k_1x1_kxk_merged, b_1x1_kxk_merged = transIII_1x1_kxk(k_1x1_kxk_first, b_1x1_kxk_first,
                                                              k_1x1_kxk_second, b_1x1_kxk_second,
                                                              groups=self.groups)

        k_avg = transV_avg(self.out_channels, self.kernel_size, self.groups)
        k_1x1_avg_second, b_1x1_avg_second = transI_fusebn(k_avg.to(self.dbb_avg.avgbn.weight.device),
                                                           self.dbb_avg.avgbn)
        if hasattr(self.dbb_avg, 'conv'):
            k_1x1_avg_first, b_1x1_avg_first = transI_fusebn(self.dbb_avg.conv.weight, self.dbb_avg.bn)
            k_1x1_avg_merged, b_1x1_avg_merged = transIII_1x1_kxk(k_1x1_avg_first, b_1x1_avg_first,
                                                                  k_1x1_avg_second, b_1x1_avg_second,
                                                                  groups=self.groups)
        else:
            k_1x1_avg_merged, b_1x1_avg_merged = k_1x1_avg_second, b_1x1_avg_second

        kernels = (k_origin, k_1x1, k_1x1_kxk_merged, k_1x1_avg_merged)
        biases = (b_origin, b_1x1, b_1x1_kxk_merged, b_1x1_avg_merged)
        return transII_addbranch(kernels, biases)

    def switch_to_deploy(self):
        """Collapse all branches into ``dbb_reparam`` and delete the training branches."""
        if hasattr(self, 'dbb_reparam'):
            return
        kernel, bias = self.get_equivalent_kernel_bias()
        origin_conv = self.dbb_origin.conv
        self.dbb_reparam = nn.Conv2d(in_channels=origin_conv.in_channels,
                                     out_channels=origin_conv.out_channels,
                                     kernel_size=origin_conv.kernel_size, stride=origin_conv.stride,
                                     padding=origin_conv.padding, dilation=origin_conv.dilation,
                                     groups=origin_conv.groups, bias=True)
        self.dbb_reparam.weight.data = kernel
        self.dbb_reparam.bias.data = bias
        for para in self.parameters():
            para.detach_()
        self.__delattr__('dbb_origin')
        self.__delattr__('dbb_avg')
        if hasattr(self, 'dbb_1x1'):
            self.__delattr__('dbb_1x1')
        self.__delattr__('dbb_1x1_kxk')

    def forward(self, inputs):
        if hasattr(self, 'dbb_reparam'):
            return self.nonlinear(self.dbb_reparam(inputs))

        out = self.dbb_origin(inputs)
        if hasattr(self, 'dbb_1x1'):
            out += self.dbb_1x1(inputs)
        out += self.dbb_avg(inputs)
        out += self.dbb_1x1_kxk(inputs)
        return self.nonlinear(out)

    def init_gamma(self, gamma_value):
        """Set the weight of each branch's final BatchNorm to ``gamma_value``."""
        if hasattr(self, "dbb_origin"):
            torch.nn.init.constant_(self.dbb_origin.bn.weight, gamma_value)
        if hasattr(self, "dbb_1x1"):
            torch.nn.init.constant_(self.dbb_1x1.bn.weight, gamma_value)
        if hasattr(self, "dbb_avg"):
            torch.nn.init.constant_(self.dbb_avg.avgbn.weight, gamma_value)
        if hasattr(self, "dbb_1x1_kxk"):
            torch.nn.init.constant_(self.dbb_1x1_kxk.bn2.weight, gamma_value)

    def single_init(self):
        """Zero every branch's final BN weight, then set ``dbb_origin``'s back to 1."""
        self.init_gamma(0.0)
        if hasattr(self, "dbb_origin"):
            torch.nn.init.constant_(self.dbb_origin.bn.weight, 1.0)


class Bottleneck_DBB(nn.Module):
    """Standard bottleneck whose 3x3 convolution is a DiverseBranchBlock."""

    def __init__(self, c1, c2, shortcut=True, g=1, e=0.5):  # ch_in, ch_out, shortcut, groups, expansion
        super().__init__()
        c_ = int(c2 * e)  # hidden channels
        self.cv1 = Conv(c1, c_, 1, 1)
        self.cv2 = DiverseBranchBlock(c_, c2, 3, 1, groups=g)
        self.add = shortcut and c1 == c2

    def forward(self, x):
        y = self.cv2(self.cv1(x))
        return x + y if self.add else y


class C3_DBB(C3):
    """CSP Bottleneck with 3 convolutions whose inner blocks are Bottleneck_DBB."""

    def __init__(self, c1, c2, n=1, shortcut=True, g=1, e=0.5):
        super().__init__(c1, c2, n, shortcut, g, e)
        c_ = int(c2 * e)  # hidden channels
        self.m = nn.Sequential(*(Bottleneck_DBB(c_, c_, shortcut, g, e=1.0) for _ in range(n)))
class DCNv2(nn.Module):
    """
    Modulated deformable convolution (DCNv2) followed by BatchNorm and the default activation.

    Offsets and modulation masks are predicted from the input by ``conv_offset_mask`` and passed
    to ``torch.ops.torchvision.deform_conv2d``.
    """

    def __init__(self, in_channels, out_channels, kernel_size, stride=1,
                 padding=1, dilation=1, groups=1, deformable_groups=1):
        """
        Args:
            in_channels: Input channels.
            out_channels: Output channels.
            kernel_size: Square kernel size.
            stride: Stride, applied in both directions.
            padding: Padding, applied in both directions.
            dilation: Dilation, applied in both directions.
            groups: Weight groups of the deformable convolution.
            deformable_groups: Number of offset/mask groups.
        """
        super(DCNv2, self).__init__()

        self.in_channels = in_channels
        self.out_channels = out_channels
        self.kernel_size = (kernel_size, kernel_size)
        self.stride = (stride, stride)
        self.padding = (padding, padding)
        self.dilation = (dilation, dilation)
        self.groups = groups
        self.deformable_groups = deformable_groups

        # A grouped convolution holds in_channels // groups input planes per filter.
        self.weight = nn.Parameter(
            torch.empty(out_channels, in_channels // groups, *self.kernel_size)
        )
        self.bias = nn.Parameter(torch.empty(out_channels))

        # Per deformable group and kernel tap: 2 offset values + 1 mask value.
        out_channels_offset_mask = (self.deformable_groups * 3 *
                                    self.kernel_size[0] * self.kernel_size[1])
        # Same stride/padding/dilation as the deformable conv so the offset map has exactly
        # the spatial size of the output.
        self.conv_offset_mask = nn.Conv2d(
            self.in_channels,
            out_channels_offset_mask,
            kernel_size=self.kernel_size,
            stride=self.stride,
            padding=self.padding,
            dilation=self.dilation,
            bias=True,
        )
        self.bn = nn.BatchNorm2d(out_channels)
        self.act = Conv.default_act
        self.reset_parameters()

    def forward(self, x):
        offset_mask = self.conv_offset_mask(x)
        o1, o2, mask = torch.chunk(offset_mask, 3, dim=1)
        offset = torch.cat((o1, o2), dim=1)
        mask = torch.sigmoid(mask)
        x = torch.ops.torchvision.deform_conv2d(
            x,
            self.weight,
            offset,
            mask,
            self.bias,
            self.stride[0], self.stride[1],
            self.padding[0], self.padding[1],
            self.dilation[0], self.dilation[1],
            self.groups,
            self.deformable_groups,
            True
        )
        x = self.bn(x)
        x = self.act(x)
        return x

    def reset_parameters(self):
        """Uniform-init the weight, zero the bias, and zero the offset/mask predictor."""
        n = self.in_channels
        for k in self.kernel_size:
            n *= k
        std = 1. / math.sqrt(n)
        self.weight.data.uniform_(-std, std)
        self.bias.data.zero_()
        # Zero offsets and zero mask logits: sampling starts on the regular grid.
        self.conv_offset_mask.weight.data.zero_()
        self.conv_offset_mask.bias.data.zero_()


class Bottleneck_DCN(nn.Module):
    """Standard bottleneck whose 3x3 convolution is DCNv2."""

    def __init__(self, c1, c2, shortcut=True, g=1, e=0.5):  # ch_in, ch_out, shortcut, groups, expansion
        super().__init__()
        c_ = int(c2 * e)  # hidden channels
        self.cv1 = Conv(c1, c_, 1, 1)
        self.cv2 = DCNv2(c_, c2, 3, 1, groups=g)
        self.add = shortcut and c1 == c2

    def forward(self, x):
        return x + self.cv2(self.cv1(x)) if self.add else self.cv2(self.cv1(x))


class C3_DCN(C3):
    """C3 module with DCNv2 bottlenecks."""

    def __init__(self, c1, c2, n=1, shortcut=True, g=1, e=0.5):
        super().__init__(c1, c2, n, shortcut, g, e)
        c_ = int(c2 * e)
        self.m = nn.Sequential(*(Bottleneck_DCN(c_, c_, shortcut, g, e=1.0) for _ in range(n)))


# --- from yolo-improve/yolov5-DCNV3/commod.py ---
class DCNV3_YoLo(nn.Module):
    """1x1 Conv, then DCNv3 on a channels-last view, then BatchNorm and the default activation."""

    def __init__(self, inc, ouc, k=1, s=1, p=None, g=1, d=1, act=True):
        """``p`` and ``act`` are accepted but not used by this module."""
        super().__init__()

        self.conv = Conv(inc, ouc, k=1)
        self.dcnv3 = DCNv3(ouc, kernel_size=k, stride=s, group=g, dilation=d)
        self.bn = nn.BatchNorm2d(ouc)
        self.act = Conv.default_act

    def forward(self, x):
        x = self.conv(x)
        x = x.permute(0, 2, 3, 1)  # channels-first -> channels-last for DCNv3
        x = self.dcnv3(x)
        x = x.permute(0, 3, 1, 2)  # back to channels-first
        x = self.act(self.bn(x))
        return x


class Bottleneck_DCNV3(nn.Module):
    """Standard bottleneck whose 3x3 convolution is DCNV3_YoLo."""

    def __init__(self, c1, c2, shortcut=True, g=1, e=0.5):  # ch_in, ch_out, shortcut, groups, expansion
        super().__init__()
        c_ = int(c2 * e)  # hidden channels
        self.cv1 = Conv(c1, c_, 1, 1)
        self.cv2 = DCNV3_YoLo(c_, c2, 3, 1, g=g)
        self.add = shortcut and c1 == c2

    def forward(self, x):
        return x + self.cv2(self.cv1(x)) if self.add else self.cv2(self.cv1(x))


class C3_DCNV3(nn.Module):
    """CSP Bottleneck with 3 convolutions and DCNv3 bottlenecks."""

    def __init__(self, c1, c2, n=1, shortcut=True, g=1, e=0.5):  # ch_in, ch_out, number, shortcut, groups, expansion
        super().__init__()
        c_ = int(c2 * e)  # hidden channels
        self.cv1 = Conv(c1, c_, 1, 1)
        self.cv2 = Conv(c1, c_, 1, 1)
        self.cv3 = Conv(2 * c_, c2, 1)  # optional act=FReLU(c2)
        self.m = nn.Sequential(*(Bottleneck_DCNV3(c_, c_, shortcut, g, e=1.0) for _ in range(n)))

    def forward(self, x):
        return self.cv3(torch.cat((self.m(self.cv1(x)), self.cv2(x)), 1))
expansion super().__init__() c_ = int(c2 * e) # hidden channels self.cv1 = Conv(c1, c_, 1, 1) self.cv2 = Conv(c1, c_, 1, 1) self.cv3 = Conv(2 * c_, c2, 1) # optional act=FReLU(c2) self.m = nn.Sequential(*(Bottleneck_DCNV3(c_, c_, shortcut, g, e=1.0) for _ in range(n))) def forward(self, x): return self.cv3(torch.cat((self.m(self.cv1(x)), self.cv2(x)), 1)) # models/yolo.py DetectionModel class self.model.to(torch.device('cuda')) m.stride = torch.tensor([s / x.shape[-2] for x in forward(torch.zeros(1, ch, s, s).to(torch.device('cuda')))]).cpu() # forward self.model.cpu() ================================================ FILE: yolo-improve/yolov5-DCNV3/ops_dcnv3/functions/__init__.py ================================================ # -------------------------------------------------------- # InternImage # Copyright (c) 2022 OpenGVLab # Licensed under The MIT License [see LICENSE for details] # -------------------------------------------------------- from .dcnv3_func import DCNv3Function, dcnv3_core_pytorch ================================================ FILE: yolo-improve/yolov5-DCNV3/ops_dcnv3/functions/dcnv3_func.py ================================================ # -------------------------------------------------------- # InternImage # Copyright (c) 2022 OpenGVLab # Licensed under The MIT License [see LICENSE for details] # -------------------------------------------------------- from __future__ import absolute_import from __future__ import print_function from __future__ import division import torch import torch.nn.functional as F from torch.autograd import Function from torch.autograd.function import once_differentiable from torch.cuda.amp import custom_bwd, custom_fwd import DCNv3 class DCNv3Function(Function): @staticmethod @custom_fwd def forward( ctx, input, offset, mask, kernel_h, kernel_w, stride_h, stride_w, pad_h, pad_w, dilation_h, dilation_w, group, group_channels, offset_scale, im2col_step): ctx.kernel_h = kernel_h ctx.kernel_w = kernel_w ctx.stride_h 
= stride_h ctx.stride_w = stride_w ctx.pad_h = pad_h ctx.pad_w = pad_w ctx.dilation_h = dilation_h ctx.dilation_w = dilation_w ctx.group = group ctx.group_channels = group_channels ctx.offset_scale = offset_scale ctx.im2col_step = im2col_step output = DCNv3.dcnv3_forward( input, offset, mask, kernel_h, kernel_w, stride_h, stride_w, pad_h, pad_w, dilation_h, dilation_w, group, group_channels, offset_scale, ctx.im2col_step) ctx.save_for_backward(input, offset, mask) return output @staticmethod @once_differentiable @custom_bwd def backward(ctx, grad_output): input, offset, mask = ctx.saved_tensors grad_input, grad_offset, grad_mask = \ DCNv3.dcnv3_backward( input, offset, mask, ctx.kernel_h, ctx.kernel_w, ctx.stride_h, ctx.stride_w, ctx.pad_h, ctx.pad_w, ctx.dilation_h, ctx.dilation_w, ctx.group, ctx.group_channels, ctx.offset_scale, grad_output.contiguous(), ctx.im2col_step) return grad_input, grad_offset, grad_mask, \ None, None, None, None, None, None, None, None, None, None, None, None @staticmethod def symbolic(g, input, offset, mask, kernel_h, kernel_w, stride_h, stride_w, pad_h, pad_w, dilation_h, dilation_w, group, group_channels, offset_scale, im2col_step): """Symbolic function for mmdeploy::DCNv3. Returns: DCNv3 op for onnx. 
""" return g.op( 'mmdeploy::TRTDCNv3', input, offset, mask, kernel_h_i=int(kernel_h), kernel_w_i=int(kernel_w), stride_h_i=int(stride_h), stride_w_i=int(stride_w), pad_h_i=int(pad_h), pad_w_i=int(pad_w), dilation_h_i=int(dilation_h), dilation_w_i=int(dilation_w), group_i=int(group), group_channels_i=int(group_channels), offset_scale_f=float(offset_scale), im2col_step_i=int(im2col_step), ) def _get_reference_points(spatial_shapes, device, kernel_h, kernel_w, dilation_h, dilation_w, pad_h=0, pad_w=0, stride_h=1, stride_w=1): _, H_, W_, _ = spatial_shapes H_out = (H_ - (dilation_h * (kernel_h - 1) + 1)) // stride_h + 1 W_out = (W_ - (dilation_w * (kernel_w - 1) + 1)) // stride_w + 1 ref_y, ref_x = torch.meshgrid( torch.linspace( # pad_h + 0.5, # H_ - pad_h - 0.5, (dilation_h * (kernel_h - 1)) // 2 + 0.5, (dilation_h * (kernel_h - 1)) // 2 + 0.5 + (H_out - 1) * stride_h, H_out, dtype=torch.float32, device=device), torch.linspace( # pad_w + 0.5, # W_ - pad_w - 0.5, (dilation_w * (kernel_w - 1)) // 2 + 0.5, (dilation_w * (kernel_w - 1)) // 2 + 0.5 + (W_out - 1) * stride_w, W_out, dtype=torch.float32, device=device)) ref_y = ref_y.reshape(-1)[None] / H_ ref_x = ref_x.reshape(-1)[None] / W_ ref = torch.stack((ref_x, ref_y), -1).reshape( 1, H_out, W_out, 1, 2) return ref def _generate_dilation_grids(spatial_shapes, kernel_h, kernel_w, dilation_h, dilation_w, group, device): _, H_, W_, _ = spatial_shapes points_list = [] x, y = torch.meshgrid( torch.linspace( -((dilation_w * (kernel_w - 1)) // 2), -((dilation_w * (kernel_w - 1)) // 2) + (kernel_w - 1) * dilation_w, kernel_w, dtype=torch.float32, device=device), torch.linspace( -((dilation_h * (kernel_h - 1)) // 2), -((dilation_h * (kernel_h - 1)) // 2) + (kernel_h - 1) * dilation_h, kernel_h, dtype=torch.float32, device=device)) points_list.extend([x / W_, y / H_]) grid = torch.stack(points_list, -1).reshape(-1, 1, 2).\ repeat(1, group, 1).permute(1, 0, 2) grid = grid.reshape(1, 1, 1, group * kernel_h * kernel_w, 2) return 
def dcnv3_core_pytorch(
        input, offset, mask, kernel_h,
        kernel_w, stride_h, stride_w, pad_h,
        pad_w, dilation_h, dilation_w, group,
        group_channels, offset_scale):
    """
    Pure-PyTorch reference implementation of the DCNv3 core operator.

    For debug and test only; the CUDA version should be used instead.

    Args:
        input: Channels-last features, (N, H, W, group * group_channels).
        offset: Sampling offsets, (N, H_out, W_out, group * kernel_h * kernel_w * 2).
        mask: Modulation weights, (N, H_out, W_out, group * kernel_h * kernel_w).
        kernel_h, kernel_w, stride_h, stride_w, pad_h, pad_w, dilation_h, dilation_w:
            Convolution geometry.
        group: Number of groups.
        group_channels: Channels per group.
        offset_scale: Multiplier applied to the kernel grid and to the offsets.

    Returns:
        Tensor of shape (N, H_out, W_out, group * group_channels).
    """
    # F.pad lists the LAST dimension first, so for an (N, H, W, C) tensor the order is
    # (C_before, C_after, W_before, W_after, H_before, H_after).
    input = F.pad(
        input,
        [0, 0, pad_w, pad_w, pad_h, pad_h])
    N_, H_in, W_in, _ = input.shape
    _, H_out, W_out, _ = offset.shape

    ref = _get_reference_points(
        input.shape, input.device, kernel_h, kernel_w, dilation_h, dilation_w,
        pad_h, pad_w, stride_h, stride_w)
    grid = _generate_dilation_grids(
        input.shape, kernel_h, kernel_w, dilation_h, dilation_w, group, input.device)
    # Offsets are expressed in pixels; divide by (W, H) to match the normalised grid.
    spatial_norm = torch.tensor([W_in, H_in]).reshape(1, 1, 1, 2).\
        repeat(1, 1, 1, group * kernel_h * kernel_w).to(input.device)

    sampling_locations = (ref + grid * offset_scale).repeat(N_, 1, 1, 1, 1).flatten(3, 4) + \
        offset * offset_scale / spatial_norm

    P_ = kernel_h * kernel_w
    # grid_sample expects coordinates in [-1, 1].
    sampling_grids = 2 * sampling_locations - 1
    # N_, H_in, W_in, group*group_channels -> N_, H_in*W_in, group*group_channels
    # -> N_, group*group_channels, H_in*W_in -> N_*group, group_channels, H_in, W_in
    input_ = input.view(N_, H_in * W_in, group * group_channels).transpose(1, 2).\
        reshape(N_ * group, group_channels, H_in, W_in)
    # N_, H_out, W_out, group*P_*2 -> N_, H_out*W_out, group, P_, 2
    # -> N_, group, H_out*W_out, P_, 2 -> N_*group, H_out*W_out, P_, 2
    sampling_grid_ = sampling_grids.view(N_, H_out * W_out, group, P_, 2).transpose(1, 2).\
        flatten(0, 1)
    # N_*group, group_channels, H_out*W_out, P_
    sampling_input_ = F.grid_sample(
        input_, sampling_grid_, mode='bilinear', padding_mode='zeros', align_corners=False)

    # (N_, H_out, W_out, group*P_) -> N_, H_out*W_out, group, P_
    # -> (N_, group, H_out*W_out, P_) -> (N_*group, 1, H_out*W_out, P_)
    mask = mask.view(N_, H_out * W_out, group, P_).transpose(1, 2).\
        reshape(N_ * group, 1, H_out * W_out, P_)
    output = (sampling_input_ * mask).sum(-1).view(N_, group * group_channels, H_out * W_out)

    return output.transpose(1, 2).reshape(N_, H_out, W_out, -1).contiguous()
def autopad(k, p=None, d=1):  # kernel, padding, dilation
    """Return ``p`` if given, otherwise the padding that keeps the output shape 'same'."""
    if d > 1:
        # Effective kernel size once dilation is applied.
        if isinstance(k, int):
            k = d * (k - 1) + 1
        else:
            k = [d * (x - 1) + 1 for x in k]
    if p is not None:
        return p
    if isinstance(k, int):
        return k // 2
    return [x // 2 for x in k]


class Conv(nn.Module):
    """Standard convolution with args(ch_in, ch_out, kernel, stride, padding, groups, dilation, activation)."""
    default_act = nn.SiLU()  # default activation

    def __init__(self, c1, c2, k=1, s=1, p=None, g=1, d=1, act=True):
        super().__init__()
        pad = autopad(k, p, d)
        self.conv = nn.Conv2d(c1, c2, k, s, pad, groups=g, dilation=d, bias=False)
        self.bn = nn.BatchNorm2d(c2)
        # True -> shared default activation, a module -> that module, anything else -> identity.
        if act is True:
            self.act = self.default_act
        elif isinstance(act, nn.Module):
            self.act = act
        else:
            self.act = nn.Identity()

    def forward(self, x):
        """Convolution, batch norm, activation."""
        y = self.conv(x)
        y = self.bn(y)
        return self.act(y)

    def forward_fuse(self, x):
        """Convolution and activation only (no batch norm)."""
        y = self.conv(x)
        return self.act(y)
g=1, d=1, act=True): super().__init__() self.conv = nn.Conv2d(c1, c2, k, s, autopad(k, p, d), groups=g, dilation=d, bias=False) self.bn = nn.BatchNorm2d(c2) self.act = self.default_act if act is True else act if isinstance(act, nn.Module) else nn.Identity() def forward(self, x): return self.act(self.bn(self.conv(x))) def forward_fuse(self, x): return self.act(self.conv(x)) def _is_power_of_2(n): if (not isinstance(n, int)) or (n < 0): raise ValueError( "invalid input for _is_power_of_2: {} (type: {})".format(n, type(n))) return (n & (n-1) == 0) and n != 0 class DCNv3(nn.Module): def __init__( self, channels=64, kernel_size=3, stride=1, pad=1, dilation=1, group=4, offset_scale=1.0, act_layer='GELU', norm_layer='LN'): """ DCNv3 Module :param channels :param kernel_size :param stride :param pad :param dilation :param group :param offset_scale :param act_layer :param norm_layer """ super().__init__() if channels % group != 0: raise ValueError( f'channels must be divisible by group, but got {channels} and {group}') _d_per_group = channels // group # you'd better set _d_per_group to a power of 2 which is more efficient in our CUDA implementation if not _is_power_of_2(_d_per_group): warnings.warn( "You'd better set channels in DCNv3 to make the dimension of each attention head a power of 2 " "which is more efficient in our CUDA implementation.") self.offset_scale = offset_scale self.channels = channels self.kernel_size = kernel_size self.stride = stride self.dilation = 1 self.pad = pad self.group = group self.group_channels = channels // group self.offset_scale = offset_scale self.dw_conv = Conv(channels, channels, kernel_size, g=channels) self.offset = nn.Linear( channels, group * kernel_size * kernel_size * 2) self.mask = nn.Linear( channels, group * kernel_size * kernel_size) self.input_proj = nn.Linear(channels, channels) self.output_proj = nn.Linear(channels, channels) self._reset_parameters() def _reset_parameters(self): constant_(self.offset.weight.data, 0.) 
constant_(self.offset.bias.data, 0.) constant_(self.mask.weight.data, 0.) constant_(self.mask.bias.data, 0.) xavier_uniform_(self.input_proj.weight.data) constant_(self.input_proj.bias.data, 0.) xavier_uniform_(self.output_proj.weight.data) constant_(self.output_proj.bias.data, 0.) def forward(self, input): """ :param query (N, H, W, C) :return output (N, H, W, C) """ N, H, W, _ = input.shape x = self.input_proj(input) dtype = x.dtype x1 = input.permute(0, 3, 1, 2) x1 = self.dw_conv(x1).permute(0, 2, 3, 1) offset = self.offset(x1) mask = self.mask(x1).reshape(N, H, W, self.group, -1) mask = F.softmax(mask, -1).reshape(N, H, W, -1).type(dtype) x = DCNv3Function.apply( x, offset, mask, self.kernel_size, self.kernel_size, self.stride, self.stride, self.pad, self.pad, self.dilation, self.dilation, self.group, self.group_channels, self.offset_scale, 256) x = self.output_proj(x) return x ================================================ FILE: yolo-improve/yolov5-DCNV3/ops_dcnv3/setup.py ================================================ # -------------------------------------------------------- # InternImage # Copyright (c) 2022 OpenGVLab # Licensed under The MIT License [see LICENSE for details] # -------------------------------------------------------- import os import glob import torch from torch.utils.cpp_extension import CUDA_HOME from torch.utils.cpp_extension import CppExtension from torch.utils.cpp_extension import CUDAExtension from setuptools import find_packages from setuptools import setup requirements = ["torch", "torchvision"] def get_extensions(): this_dir = os.path.dirname(os.path.abspath(__file__)) extensions_dir = os.path.join(this_dir, "src") main_file = glob.glob(os.path.join(extensions_dir, "*.cpp")) source_cpu = glob.glob(os.path.join(extensions_dir, "cpu", "*.cpp")) source_cuda = glob.glob(os.path.join(extensions_dir, "cuda", "*.cu")) sources = main_file + source_cpu extension = CppExtension extra_compile_args = {"cxx": []} define_macros = [] if 
torch.cuda.is_available() and CUDA_HOME is not None: extension = CUDAExtension sources += source_cuda define_macros += [("WITH_CUDA", None)] extra_compile_args["nvcc"] = [ # "-DCUDA_HAS_FP16=1", # "-D__CUDA_NO_HALF_OPERATORS__", # "-D__CUDA_NO_HALF_CONVERSIONS__", # "-D__CUDA_NO_HALF2_OPERATORS__", ] else: raise NotImplementedError('Cuda is not availabel') sources = [os.path.join(extensions_dir, s) for s in sources] include_dirs = [extensions_dir] ext_modules = [ extension( "DCNv3", sources, include_dirs=include_dirs, define_macros=define_macros, extra_compile_args=extra_compile_args, ) ] return ext_modules setup( name="DCNv3", version="1.0", author="InternImage", url="https://github.com/OpenGVLab/InternImage", description= "PyTorch Wrapper for CUDA Functions of DCNv3", packages=find_packages(exclude=( "configs", "tests", )), ext_modules=get_extensions(), cmdclass={"build_ext": torch.utils.cpp_extension.BuildExtension}, ) ================================================ FILE: yolo-improve/yolov5-DCNV3/ops_dcnv3/src/cpu/dcnv3_cpu.cpp ================================================ /*! 
// CPU entry point for the DCNv3 forward pass.
// No CPU implementation exists: the body unconditionally raises through
// AT_ERROR, so none of the arguments are read. The parameter list mirrors the
// CUDA forward entry point (dcnv3_cuda_forward).
at::Tensor dcnv3_cpu_forward(const at::Tensor &input, const at::Tensor &offset,
                             const at::Tensor &mask, const int kernel_h,
                             const int kernel_w, const int stride_h,
                             const int stride_w, const int pad_h,
                             const int pad_w, const int dilation_h,
                             const int dilation_w, const int group,
                             const int group_channels,
                             const float offset_scale, const int im2col_step) {
    AT_ERROR("Not implement on cpu");
}

// CPU entry point for the DCNv3 backward pass.
// Like the forward stub, it always raises through AT_ERROR.
// NOTE(review): the return type reads as a bare `std::vector`; its template
// argument appears to have been lost (the CUDA backward returns three
// tensors) — confirm it should be a vector of at::Tensor.
std::vector
dcnv3_cpu_backward(const at::Tensor &input, const at::Tensor &offset,
                   const at::Tensor &mask, const int kernel_h,
                   const int kernel_w, const int stride_h, const int stride_w,
                   const int pad_h, const int pad_w, const int dilation_h,
                   const int dilation_w, const int group,
                   const int group_channels, const float offset_scale,
                   const at::Tensor &grad_output, const int im2col_step) {
    AT_ERROR("Not implement on cpu");
}
************************************************************************************************** * InternImage * Copyright (c) 2022 OpenGVLab * Licensed under The MIT License [see LICENSE for details] ************************************************************************************************** * Modified from *https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 ************************************************************************************************** */ #pragma once #include at::Tensor dcnv3_cpu_forward(const at::Tensor &input, const at::Tensor &offset, const at::Tensor &mask, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, const int pad_h, const int pad_w, const int dilation_h, const int dilation_w, const int group, const int group_channels, const float offset_scale, const int im2col_step); std::vector dcnv3_cpu_backward(const at::Tensor &input, const at::Tensor &offset, const at::Tensor &mask, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, const int pad_h, const int pad_w, const int dilation_h, const int dilation_w, const int group, const int group_channels, const float offset_scale, const at::Tensor &grad_output, const int im2col_step); ================================================ FILE: yolo-improve/yolov5-DCNV3/ops_dcnv3/src/cuda/dcnv3_cuda.cu ================================================ /*! 
// Host-side launcher for the DCNv3 forward pass.
//
// Reads the batch size, height, width and channel count from dimensions
// 0..3 of `input`, derives the output height/width from pad, dilation,
// kernel size and stride, allocates a zero-filled
// (batch, height_out, width_out, group * group_channels) output and fills it
// in chunks of `im2col_step_` samples by calling dcnv3_im2col_cuda.
//
// Requirements enforced by the assertions below: input/offset/mask are
// contiguous CUDA tensors, the batch size is a multiple of the effective
// step, and channels == group * group_channels.
at::Tensor dcnv3_cuda_forward(const at::Tensor &input, const at::Tensor &offset,
                              const at::Tensor &mask, const int kernel_h,
                              const int kernel_w, const int stride_h,
                              const int stride_w, const int pad_h,
                              const int pad_w, const int dilation_h,
                              const int dilation_w, const int group,
                              const int group_channels,
                              const float offset_scale, const int im2col_step) {
    AT_ASSERTM(input.is_contiguous(), "input tensor has to be contiguous");
    AT_ASSERTM(offset.is_contiguous(), "offset tensor has to be contiguous");
    AT_ASSERTM(mask.is_contiguous(), "mask tensor has to be contiguous");
    AT_ASSERTM(input.type().is_cuda(), "input must be a CUDA tensor");
    AT_ASSERTM(offset.type().is_cuda(), "offset must be a CUDA tensor");
    AT_ASSERTM(mask.type().is_cuda(), "mask must be a CUDA tensor");

    const int batch = input.size(0);
    const int height_in = input.size(1);
    const int width_in = input.size(2);
    const int channels = input.size(3);
    // Usual convolution output-size formula with symmetric padding.
    const int height_out =
        (height_in + 2 * pad_h - (dilation_h * (kernel_h - 1) + 1)) / stride_h +
        1;
    const int width_out =
        (width_in + 2 * pad_w - (dilation_w * (kernel_w - 1) + 1)) / stride_w +
        1;
    // Never process more samples per launch than the batch contains.
    // NOTE(review): an empty batch makes this 0 and the modulo below divides
    // by zero — confirm callers never pass batch == 0.
    const int im2col_step_ = std::min(batch, im2col_step);

    // The condition checks that the step divides the batch; the message text
    // states it the other way round.
    AT_ASSERTM(batch % im2col_step_ == 0,
               "batch(%d) must divide im2col_step(%d)", batch, im2col_step_);
    AT_ASSERTM(
        channels == (group * group_channels),
        "Input channels and group times group channels wont match: (%d vs %d).",
        channels, group * group_channels);

    auto output =
        at::zeros({batch, height_out, width_out, group * group_channels},
                  input.options());

    const int batch_n = im2col_step_;
    // View the output as (num_chunks, batch_n, ...) so each chunk can be
    // selected by index.
    auto output_n = output.view({batch / batch_n, batch_n, height_out,
                                 width_out, group * group_channels});
    // Element counts of one sample in each tensor, used as pointer strides.
    auto per_input_size = height_in * width_in * group * group_channels;
    auto per_offset_size =
        height_out * width_out * group * kernel_h * kernel_w * 2;
    auto per_mask_size = height_out * width_out * group * kernel_h * kernel_w;
    for (int n = 0; n < batch / im2col_step_; ++n) {
        auto columns = output_n.select(0, n);
        // NOTE(review): the `.data()` calls below carry no element-type
        // argument; it looks like it was stripped from this copy — confirm.
        // AT_DISPATCH_FLOATING_TYPES(
        AT_DISPATCH_FLOATING_TYPES_AND_HALF(
            input.type(), "ms_deform_attn_forward_cuda", ([&] {
                dcnv3_im2col_cuda(
                    at::cuda::getCurrentCUDAStream(),
                    input.data() +
                        n * im2col_step_ * per_input_size,
                    offset.data() +
                        n * im2col_step_ * per_offset_size,
                    mask.data() + n * im2col_step_ * per_mask_size,
                    columns.data(), kernel_h, kernel_w, stride_h,
                    stride_w, pad_h, pad_w, dilation_h, dilation_w, group,
                    group_channels, batch_n, height_in, width_in, height_out,
                    width_out, offset_scale);
            }));
    }

    return output;
}

// Host-side launcher for the DCNv3 backward pass.
//
// Performs the same validation and shape derivation as the forward launcher
// (plus the checks on `grad_output`), allocates zero-filled gradient buffers
// shaped like input/offset/mask and fills them chunk by chunk through
// dcnv3_col2im_cuda. Returns {grad_input, grad_offset, grad_mask}.
//
// When the input dtype is half, the gradient buffers are allocated as float
// and converted back to half just before returning.
// NOTE(review): the return type reads as a bare `std::vector`; the template
// argument appears to be missing from this copy — confirm.
std::vector
dcnv3_cuda_backward(const at::Tensor &input, const at::Tensor &offset,
                    const at::Tensor &mask, const int kernel_h,
                    const int kernel_w, const int stride_h, const int stride_w,
                    const int pad_h, const int pad_w, const int dilation_h,
                    const int dilation_w, const int group,
                    const int group_channels, const float offset_scale,
                    const at::Tensor &grad_output, const int im2col_step) {

    AT_ASSERTM(input.is_contiguous(), "input tensor has to be contiguous");
    AT_ASSERTM(offset.is_contiguous(), "offset tensor has to be contiguous");
    AT_ASSERTM(mask.is_contiguous(), "mask tensor has to be contiguous");
    AT_ASSERTM(grad_output.is_contiguous(),
               "grad_output tensor has to be contiguous");
    AT_ASSERTM(input.type().is_cuda(), "input must be a CUDA tensor");
    AT_ASSERTM(offset.type().is_cuda(), "offset must be a CUDA tensor");
    AT_ASSERTM(mask.type().is_cuda(), "mask must be a CUDA tensor");
    AT_ASSERTM(grad_output.type().is_cuda(),
               "grad_output must be a CUDA tensor");

    const int batch = input.size(0);
    const int height_in = input.size(1);
    const int width_in = input.size(2);
    const int channels = input.size(3);
    const int height_out =
        (height_in + 2 * pad_h - (dilation_h * (kernel_h - 1) + 1)) / stride_h +
        1;
    const int width_out =
        (width_in + 2 * pad_w - (dilation_w * (kernel_w - 1) + 1)) / stride_w +
        1;
    const int im2col_step_ = std::min(batch, im2col_step);

    AT_ASSERTM(batch % im2col_step_ == 0,
               "batch(%d) must divide im2col_step(%d)", batch, im2col_step_);
    AT_ASSERTM(
        channels == (group * group_channels),
        "Input channels and group times group channels wont match: (%d vs %d).",
        channels, group * group_channels);

    // Gradients for half inputs are accumulated in float buffers.
    auto dtype = input.dtype();
    if (dtype == at::kHalf) {
        dtype = at::kFloat;
    }

    auto grad_input = at::zeros_like(input, dtype);
    auto grad_offset = at::zeros_like(offset, dtype);
    auto grad_mask = at::zeros_like(mask, dtype);

    const int batch_n = im2col_step_;
    // Element counts of one sample in each tensor, used as pointer strides.
    auto per_input_size = height_in * width_in * group * group_channels;
    auto per_offset_size =
        height_out * width_out * group * kernel_h * kernel_w * 2;
    auto per_mask_size = height_out * width_out * group * kernel_h * kernel_w;
    // (num_chunks, batch_n, height_out * width_out, group, group_channels)
    auto grad_output_n =
        grad_output.view({batch / im2col_step_, batch_n, height_out * width_out,
                          group, group_channels});

    for (int n = 0; n < batch / im2col_step_; ++n) {
        auto grad_output_g = grad_output_n.select(0, n);
        // AT_DISPATCH_FLOATING_TYPES(
        AT_DISPATCH_FLOATING_TYPES_AND_HALF(
            input.type(), "ms_deform_attn_backward_cuda", ([&] {
                dcnv3_col2im_cuda(
                    at::cuda::getCurrentCUDAStream(),
                    grad_output_g.data(),
                    input.data() +
                        n * im2col_step_ * per_input_size,
                    offset.data() +
                        n * im2col_step_ * per_offset_size,
                    mask.data() + n * im2col_step_ * per_mask_size,
                    kernel_h, kernel_w, stride_h, stride_w, pad_h, pad_w,
                    dilation_h, dilation_w, group, group_channels, batch_n,
                    height_in, width_in, height_out, width_out, offset_scale,
                    grad_input.data() +
                        n * im2col_step_ * per_input_size,
                    grad_offset.data() +
                        n * im2col_step_ * per_offset_size,
                    grad_mask.data() +
                        n * im2col_step_ * per_mask_size);
            }));
    }

    // Hand gradients back in the caller's precision.
    if (input.dtype() == torch::kHalf) {
        return {grad_input.to(torch::kHalf), grad_offset.to(torch::kHalf),
                grad_mask.to(torch::kHalf)};
    } else {
        return {grad_input, grad_offset, grad_mask};
    }
}
// Bilinearly interpolates one channel of `bottom_data` at the fractional
// location (h, w).
//
// Elements are addressed as
//   (row * width + col) * (group * group_channels) + g * group_channels + c,
// i.e. a (height, width, group, group_channels) layout for one sample.
// A corner only contributes when it passes the bound check next to it;
// otherwise its value stays 0. Only the bounds shown are tested (e.g. h_low
// is not checked against the upper edge); the kernel in this file that calls
// this function rejects locations outside (-1, height) x (-1, width) first.
// NOTE(review): `template` has no parameter list in this copy; `scalar_t`
// is presumably its type parameter — confirm.
template 
__device__ opmath_t dcnv3_im2col_bilinear(
    const scalar_t *&bottom_data, const int &height, const int &width,
    const int &group, const int &group_channels, const opmath_t &h,
    const opmath_t &w, const int &g, const int &c) {
    // Integer corners surrounding (h, w).
    const int h_low = floor(h);
    const int w_low = floor(w);
    const int h_high = h_low + 1;
    const int w_high = w_low + 1;

    // Fractional distances to the low corner and their complements.
    const opmath_t lh = h - h_low;
    const opmath_t lw = w - w_low;
    const opmath_t hh = 1 - lh, hw = 1 - lw;

    // Element strides for one column step and one row step.
    const int w_stride = group * group_channels;
    const int h_stride = width * w_stride;
    const int h_low_ptr_offset = h_low * h_stride;
    const int h_high_ptr_offset = h_low_ptr_offset + h_stride;
    const int w_low_ptr_offset = w_low * w_stride;
    const int w_high_ptr_offset = w_low_ptr_offset + w_stride;
    // Offset of channel c of group g within one pixel.
    const int base_ptr = g * group_channels + c;

    opmath_t v1 = 0;
    if (h_low >= 0 && w_low >= 0) {
        const int ptr1 = h_low_ptr_offset + w_low_ptr_offset + base_ptr;
        v1 = bottom_data[ptr1];
    }
    opmath_t v2 = 0;
    if (h_low >= 0 && w_high <= width - 1) {
        const int ptr2 = h_low_ptr_offset + w_high_ptr_offset + base_ptr;
        v2 = bottom_data[ptr2];
    }
    opmath_t v3 = 0;
    if (h_high <= height - 1 && w_low >= 0) {
        const int ptr3 = h_high_ptr_offset + w_low_ptr_offset + base_ptr;
        v3 = bottom_data[ptr3];
    }
    opmath_t v4 = 0;
    if (h_high <= height - 1 && w_high <= width - 1) {
        const int ptr4 = h_high_ptr_offset + w_high_ptr_offset + base_ptr;
        v4 = bottom_data[ptr4];
    }
    // Standard bilinear weights for the four corners.
    const opmath_t w1 = hh * hw, w2 = hh * lw, w3 = lh * hw, w4 = lh * lw;

    const opmath_t val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4);
    return val;
}

// Backward counterpart of the bilinear sampler for one sampling point.
//
// Using the same addressing and corner checks as the forward sampler, it:
//   * atomically adds `weight_k * top_grad * mask` to `grad_im` for every
//     corner that passes its bound check,
//   * stores `top_grad * val` (val = interpolated value) into `*grad_mask`,
//   * stores the offset gradients into `grad_offset[0]` (w direction) and
//     `grad_offset[1]` (h direction), each scaled by `offset_scale`.
// grad_mask / grad_offset are written with plain stores, not accumulated.
// NOTE(review): `template` has no parameter list in this copy — confirm.
template 
__device__ void dcnv3_col2im_bilinear(
    const scalar_t *&bottom_data, const int &height, const int &width,
    const int &nheads, const int &group_channels, const opmath_t &h,
    const opmath_t &w, const int &m, const int &c, const opmath_t offset_scale,
    const opmath_t &top_grad, const opmath_t &mask, opmath_t *&grad_im,
    opmath_t *grad_offset, opmath_t *grad_mask) {
    const int h_low = floor(h);
    const int w_low = floor(w);
    const int h_high = h_low + 1;
    const int w_high = w_low + 1;

    const opmath_t lh = h - h_low;
    const opmath_t lw = w - w_low;
    const opmath_t hh = 1 - lh, hw = 1 - lw;

    const int w_stride = nheads * group_channels;
    const int h_stride = width * w_stride;
    const int h_low_ptr_offset = h_low * h_stride;
    const int h_high_ptr_offset = h_low_ptr_offset + h_stride;
    const int w_low_ptr_offset = w_low * w_stride;
    const int w_high_ptr_offset = w_low_ptr_offset + w_stride;
    const int base_ptr = m * group_channels + c;

    const opmath_t w1 = hh * hw, w2 = hh * lw, w3 = lh * hw, w4 = lh * lw;
    // Gradient reaching the sampled image value: upstream grad times mask.
    const opmath_t top_grad_im = top_grad * mask;
    // Partial derivatives of the interpolated value with respect to h and w,
    // accumulated corner by corner.
    opmath_t grad_h_weight = 0, grad_w_weight = 0;

    opmath_t v1 = 0;
    if (h_low >= 0 && w_low >= 0) {
        const int ptr1 = h_low_ptr_offset + w_low_ptr_offset + base_ptr;
        v1 = bottom_data[ptr1];
        grad_h_weight -= hw * v1;
        grad_w_weight -= hh * v1;
        atomicAdd(grad_im + ptr1, w1 * top_grad_im);
    }
    opmath_t v2 = 0;
    if (h_low >= 0 && w_high <= width - 1) {
        const int ptr2 = h_low_ptr_offset + w_high_ptr_offset + base_ptr;
        v2 = bottom_data[ptr2];
        grad_h_weight -= lw * v2;
        grad_w_weight += hh * v2;
        atomicAdd(grad_im + ptr2, w2 * top_grad_im);
    }
    opmath_t v3 = 0;
    if (h_high <= height - 1 && w_low >= 0) {
        const int ptr3 = h_high_ptr_offset + w_low_ptr_offset + base_ptr;
        v3 = bottom_data[ptr3];
        grad_h_weight += hw * v3;
        grad_w_weight -= lh * v3;
        atomicAdd(grad_im + ptr3, w3 * top_grad_im);
    }
    opmath_t v4 = 0;
    if (h_high <= height - 1 && w_high <= width - 1) {
        const int ptr4 = h_high_ptr_offset + w_high_ptr_offset + base_ptr;
        v4 = bottom_data[ptr4];
        grad_h_weight += lw * v4;
        grad_w_weight += lh * v4;
        atomicAdd(grad_im + ptr4, w4 * top_grad_im);
    }

    const opmath_t val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4);
    *grad_mask = top_grad * val;
    // Offsets are stored as (w, h) pairs.
    *grad_offset = offset_scale * grad_w_weight * top_grad_im;
    *(grad_offset + 1) = offset_scale * grad_h_weight * top_grad_im;
}
// Forward kernel: each loop index produces one output element.
//
// `index` is decomposed, fastest-varying first, into
//   channel-in-group, group, output column, output row, batch sample.
// For that element the kernel walks all kernel_w * kernel_h sampling points,
// reads one (w, h) offset pair and one mask weight per point, bilinearly
// samples the input at the resulting location and accumulates
// `sample * weight`. The sum is written to `data_col[index]`.
// NOTE(review): `template` has no parameter list in this copy; `scalar_t`
// is presumably its type parameter — confirm.
template 
__global__ void dcnv3_im2col_gpu_kernel(
    const int num_kernels, const scalar_t *data_im, const scalar_t *data_offset,
    const scalar_t *data_mask, scalar_t *data_col, const int kernel_h,
    const int kernel_w, const int stride_h, const int stride_w, const int pad_h,
    const int pad_w, const int dilation_h, const int dilation_w,
    const int group, const int group_channels, const int height_in,
    const int width_in, const int height_out, const int width_out,
    const opmath_t offset_scale) {
    CUDA_KERNEL_LOOP(index, num_kernels) {
        int _temp = index;
        const int c_col = _temp % group_channels;
        _temp /= group_channels;
        // Index of the (sample, row, col, group) tuple; every channel of a
        // group shares the same offsets and mask weights.
        const int sampling_index = _temp;
        const int g_col = _temp % group;
        _temp /= group;
        // Input-space position of the kernel centre for this output pixel.
        const int p0_w = ((dilation_w * (kernel_w - 1)) >> 1) - pad_w +
                         (_temp % width_out) * stride_w;
        _temp /= width_out;
        const int p0_h = ((dilation_h * (kernel_h - 1)) >> 1) - pad_h +
                         (_temp % height_out) * stride_h;
        _temp /= height_out;
        const int b_col = _temp;

        const int input_size = height_in * width_in;
        scalar_t *data_col_ptr = data_col + index;
        const int kernel_size = kernel_h * kernel_w;
        // One mask weight per sampling point, two offset values per point.
        int data_weight_ptr = sampling_index * kernel_size;
        int data_loc_w_ptr = data_weight_ptr << 1;
        const int qid_stride = group * group_channels;
        opmath_t col = 0;
        // Start of sample b_col inside the input buffer.
        const scalar_t *data_im_ptr = data_im + b_col * input_size * qid_stride;
        // top-left
        const opmath_t p0_w_ =
            p0_w - ((dilation_w * (kernel_w - 1)) >> 1) * offset_scale;
        const opmath_t p0_h_ =
            p0_h - ((dilation_h * (kernel_h - 1)) >> 1) * offset_scale;
        // Width index is the outer loop, so points are visited column by
        // column; offsets and mask weights are consumed in that same order.
        for (int i = 0; i < kernel_w; ++i) {
            for (int j = 0; j < kernel_h; ++j) {
                const opmath_t offset_w = data_offset[data_loc_w_ptr];
                const opmath_t offset_h = data_offset[data_loc_w_ptr + 1];
                const opmath_t loc_w =
                    p0_w_ + (i * dilation_w + offset_w) * offset_scale;
                const opmath_t loc_h =
                    p0_h_ + (j * dilation_h + offset_h) * offset_scale;
                const opmath_t weight = data_mask[data_weight_ptr];
                // Locations entirely outside the image contribute nothing.
                if (loc_h > -1 && loc_w > -1 && loc_h < height_in &&
                    loc_w < width_in) {
                    col += dcnv3_im2col_bilinear(
                               data_im_ptr, height_in, width_in, group,
                               group_channels, loc_h, loc_w, g_col, c_col) *
                           weight;
                }
                data_weight_ptr += 1;
                data_loc_w_ptr += 2;
            }
        }
        *data_col_ptr = col;
    }
}
const int c_col = _temp % group_channels; _temp /= group_channels; const int sampling_index = _temp; const int g_col = _temp % group; _temp /= group; const int p0_w = ((dilation_w * (kernel_w - 1)) >> 1) - pad_w + (_temp % width_out) * stride_w; _temp /= width_out; const int p0_h = ((dilation_h * (kernel_h - 1)) >> 1) - pad_h + (_temp % height_out) * stride_h; _temp /= height_out; const int b_col = _temp; const opmath_t top_grad = grad_col[index]; const int input_size = height_in * width_in; const int kernel_size = kernel_h * kernel_w; int data_weight_ptr = sampling_index * kernel_size; int data_loc_w_ptr = data_weight_ptr << 1; const int grad_sampling_ptr = data_weight_ptr; grad_offset += grad_sampling_ptr << 1; grad_mask += grad_sampling_ptr; const int qid_stride = group * group_channels; const int im_ptr_offset = b_col * input_size * qid_stride; const scalar_t *data_im_ptr = data_im + im_ptr_offset; opmath_t *grad_im_ptr = grad_im + im_ptr_offset; const opmath_t p0_w_ = p0_w - ((dilation_w * (kernel_w - 1)) >> 1) * offset_scale; const opmath_t p0_h_ = p0_h - ((dilation_h * (kernel_h - 1)) >> 1) * offset_scale; for (int i = 0; i < kernel_w; ++i) { for (int j = 0; j < kernel_h; ++j) { const opmath_t offset_w = data_offset[data_loc_w_ptr]; const opmath_t offset_h = data_offset[data_loc_w_ptr + 1]; const opmath_t loc_w = p0_w_ + (i * dilation_w + offset_w) * offset_scale; const opmath_t loc_h = p0_h_ + (j * dilation_h + offset_h) * offset_scale; const opmath_t weight = data_mask[data_weight_ptr]; *(cache_grad_offset + (threadIdx.x << 1)) = 0; *(cache_grad_offset + ((threadIdx.x << 1) + 1)) = 0; *(cache_grad_mask + threadIdx.x) = 0; if (loc_h > -1 && loc_w > -1 && loc_h < height_in && loc_w < width_in) { dcnv3_col2im_bilinear( data_im_ptr, height_in, width_in, group, group_channels, loc_h, loc_w, g_col, c_col, offset_scale, top_grad, weight, grad_im_ptr, cache_grad_offset + (threadIdx.x << 1), cache_grad_mask + threadIdx.x); } __syncthreads(); if (tid == 0) { 
// Backward kernel that reduces per-thread offset/mask gradients through
// statically sized shared-memory arrays and a halving reduction.
//
// Each loop index handles one (sample, row, col, group, channel) element of
// `grad_col`. For every sampling point the thread:
//   1. zeroes its own slots in the shared caches,
//   2. lets dcnv3_col2im_bilinear accumulate into `grad_im` (atomically) and
//      write its offset/mask gradient contribution into those slots,
//   3. takes part in a block-wide reduction over the caches,
// after which thread 0 stores the reduced values to grad_offset / grad_mask.
//
// NOTE(review): `template` has no parameter list in this copy; `scalar_t`
// and `blockSize` are presumably its parameters — confirm.
// NOTE(review): the halving loop only visits every slot when blockSize is a
// power of two — confirm how the launcher chooses it.
// NOTE(review): `grad_offset` / `grad_mask` are kernel parameters advanced
// inside the loop body; if a thread ever ran more than one loop iteration the
// advances would compound — confirm the launch gives each thread at most one.
template 
__global__ void dcnv3_col2im_gpu_kernel_shm_blocksize_aware_reduce_v2(
    const int num_kernels, const scalar_t *grad_col, const scalar_t *data_im,
    const scalar_t *data_offset, const scalar_t *data_mask, const int kernel_h,
    const int kernel_w, const int stride_h, const int stride_w, const int pad_h,
    const int pad_w, const int dilation_h, const int dilation_w,
    const int group, const int group_channels, const int height_in,
    const int width_in, const int height_out, const int width_out,
    const opmath_t offset_scale, opmath_t *grad_im, opmath_t *grad_offset,
    opmath_t *grad_mask) {
    CUDA_KERNEL_LOOP(index, num_kernels) {
        // Two offset slots (w, h) and one mask slot per thread of the block.
        __shared__ opmath_t cache_grad_offset[blockSize * 2];
        __shared__ opmath_t cache_grad_mask[blockSize];
        unsigned int tid = threadIdx.x;
        // Decompose index: channel-in-group fastest, then group, output
        // column, output row, batch sample.
        int _temp = index;
        const int c_col = _temp % group_channels;
        _temp /= group_channels;
        const int sampling_index = _temp;
        const int g_col = _temp % group;
        _temp /= group;
        const int p0_w = ((dilation_w * (kernel_w - 1)) >> 1) - pad_w +
                         (_temp % width_out) * stride_w;
        _temp /= width_out;
        const int p0_h = ((dilation_h * (kernel_h - 1)) >> 1) - pad_h +
                         (_temp % height_out) * stride_h;
        _temp /= height_out;
        const int b_col = _temp;

        const opmath_t top_grad = grad_col[index];
        const int input_size = height_in * width_in;
        const int kernel_size = kernel_h * kernel_w;
        int data_weight_ptr = sampling_index * kernel_size;
        int data_loc_w_ptr = data_weight_ptr << 1;
        // Move the output pointers to this element's first sampling point.
        const int grad_sampling_ptr = data_weight_ptr;
        grad_offset += grad_sampling_ptr << 1;
        grad_mask += grad_sampling_ptr;
        const int qid_stride = group * group_channels;
        const int im_ptr_offset = b_col * input_size * qid_stride;
        const scalar_t *data_im_ptr = data_im + im_ptr_offset;
        opmath_t *grad_im_ptr = grad_im + im_ptr_offset;
        const opmath_t p0_w_ =
            p0_w - ((dilation_w * (kernel_w - 1)) >> 1) * offset_scale;
        const opmath_t p0_h_ =
            p0_h - ((dilation_h * (kernel_h - 1)) >> 1) * offset_scale;
        for (int i = 0; i < kernel_w; ++i) {
            for (int j = 0; j < kernel_h; ++j) {
                const opmath_t offset_w = data_offset[data_loc_w_ptr];
                const opmath_t offset_h = data_offset[data_loc_w_ptr + 1];
                const opmath_t loc_w =
                    p0_w_ + (i * dilation_w + offset_w) * offset_scale;
                const opmath_t loc_h =
                    p0_h_ + (j * dilation_h + offset_h) * offset_scale;
                const opmath_t weight = data_mask[data_weight_ptr];
                // Clear this thread's slots so an out-of-range sampling
                // location contributes zero to the reduction.
                *(cache_grad_offset + (threadIdx.x << 1)) = 0;
                *(cache_grad_offset + ((threadIdx.x << 1) + 1)) = 0;
                *(cache_grad_mask + threadIdx.x) = 0;
                if (loc_h > -1 && loc_w > -1 && loc_h < height_in &&
                    loc_w < width_in) {
                    dcnv3_col2im_bilinear(
                        data_im_ptr, height_in, width_in, group, group_channels,
                        loc_h, loc_w, g_col, c_col, offset_scale, top_grad,
                        weight, grad_im_ptr,
                        cache_grad_offset + (threadIdx.x << 1),
                        cache_grad_mask + threadIdx.x);
                }

                // All slots must be written before any thread reads them.
                __syncthreads();

                // Halving reduction: each step folds the upper half of the
                // active range onto the lower half.
                for (unsigned int s = blockSize / 2; s > 0; s >>= 1) {
                    if (tid < s) {
                        const unsigned int xid1 = tid << 1;
                        const unsigned int xid2 = (tid + s) << 1;
                        cache_grad_mask[tid] += cache_grad_mask[tid + s];
                        cache_grad_offset[xid1] += cache_grad_offset[xid2];
                        cache_grad_offset[xid1 + 1] +=
                            cache_grad_offset[xid2 + 1];
                    }
                    __syncthreads();
                }

                // Slot 0 now holds the block-wide sums.
                if (tid == 0) {
                    *grad_offset = cache_grad_offset[0];
                    *(grad_offset + 1) = cache_grad_offset[1];
                    *grad_mask = cache_grad_mask[0];
                }
                // Keep the caches intact until thread 0 has read them.
                __syncthreads();

                // Advance to the next sampling point.
                data_weight_ptr += 1;
                data_loc_w_ptr += 2;
                grad_mask += 1;
                grad_offset += 2;
            }
        }
    }
}
const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, const int pad_h, const int pad_w, const int dilation_h, const int dilation_w, const int group, const int group_channels, const int height_in, const int width_in, const int height_out, const int width_out, const opmath_t offset_scale, opmath_t *grad_im, opmath_t *grad_offset, opmath_t *grad_mask) { CUDA_KERNEL_LOOP(index, num_kernels) { extern __shared__ int _s[]; opmath_t *cache_grad_offset = (opmath_t *)_s; opmath_t *cache_grad_mask = cache_grad_offset + 2 * blockDim.x; unsigned int tid = threadIdx.x; int _temp = index; const int c_col = _temp % group_channels; _temp /= group_channels; const int sampling_index = _temp; const int g_col = _temp % group; _temp /= group; const int p0_w = ((dilation_w * (kernel_w - 1)) >> 1) - pad_w + (_temp % width_out) * stride_w; _temp /= width_out; const int p0_h = ((dilation_h * (kernel_h - 1)) >> 1) - pad_h + (_temp % height_out) * stride_h; _temp /= height_out; const int b_col = _temp; const opmath_t top_grad = grad_col[index]; const int input_size = height_in * width_in; const int kernel_size = kernel_h * kernel_w; int data_weight_ptr = sampling_index * kernel_size; int data_loc_w_ptr = data_weight_ptr << 1; const int grad_sampling_ptr = data_weight_ptr; grad_offset += grad_sampling_ptr << 1; grad_mask += grad_sampling_ptr; const int qid_stride = group * group_channels; const int im_ptr_offset = b_col * input_size * qid_stride; const scalar_t *data_im_ptr = data_im + im_ptr_offset; opmath_t *grad_im_ptr = grad_im + im_ptr_offset; const opmath_t p0_w_ = p0_w - ((dilation_w * (kernel_w - 1)) >> 1) * offset_scale; const opmath_t p0_h_ = p0_h - ((dilation_h * (kernel_h - 1)) >> 1) * offset_scale; for (int i = 0; i < kernel_w; ++i) { for (int j = 0; j < kernel_h; ++j) { const opmath_t offset_w = data_offset[data_loc_w_ptr]; const opmath_t offset_h = data_offset[data_loc_w_ptr + 1]; const opmath_t loc_w = p0_w_ + (i * dilation_w + offset_w) * 
offset_scale; const opmath_t loc_h = p0_h_ + (j * dilation_h + offset_h) * offset_scale; const opmath_t weight = data_mask[data_weight_ptr]; *(cache_grad_offset + (threadIdx.x << 1)) = 0; *(cache_grad_offset + ((threadIdx.x << 1) + 1)) = 0; *(cache_grad_mask + threadIdx.x) = 0; if (loc_h > -1 && loc_w > -1 && loc_h < height_in && loc_w < width_in) { dcnv3_col2im_bilinear( data_im_ptr, height_in, width_in, group, group_channels, loc_h, loc_w, g_col, c_col, offset_scale, top_grad, weight, grad_im_ptr, cache_grad_offset + (threadIdx.x << 1), cache_grad_mask + threadIdx.x); } __syncthreads(); if (tid == 0) { opmath_t _grad_w = cache_grad_offset[0], _grad_h = cache_grad_offset[1], _grad_a = cache_grad_mask[0]; int sid = 2; for (unsigned int tid = 1; tid < blockDim.x; ++tid) { _grad_w += cache_grad_offset[sid]; _grad_h += cache_grad_offset[sid + 1]; _grad_a += cache_grad_mask[tid]; sid += 2; } *grad_offset = _grad_w; *(grad_offset + 1) = _grad_h; *grad_mask = _grad_a; } __syncthreads(); data_weight_ptr += 1; data_loc_w_ptr += 2; grad_mask += 1; grad_offset += 2; } } } } template __global__ void dcnv3_col2im_gpu_kernel_shm_reduce_v2( const int num_kernels, const scalar_t *grad_col, const scalar_t *data_im, const scalar_t *data_offset, const scalar_t *data_mask, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, const int pad_h, const int pad_w, const int dilation_h, const int dilation_w, const int group, const int group_channels, const int height_in, const int width_in, const int height_out, const int width_out, const opmath_t offset_scale, opmath_t *grad_im, opmath_t *grad_offset, opmath_t *grad_mask) { CUDA_KERNEL_LOOP(index, num_kernels) { extern __shared__ int _s[]; opmath_t *cache_grad_offset = (opmath_t *)_s; opmath_t *cache_grad_mask = cache_grad_offset + 2 * blockDim.x; unsigned int tid = threadIdx.x; int _temp = index; const int c_col = _temp % group_channels; _temp /= group_channels; const int sampling_index = _temp; const int g_col 
= _temp % group; _temp /= group; const int p0_w = ((dilation_w * (kernel_w - 1)) >> 1) - pad_w + (_temp % width_out) * stride_w; _temp /= width_out; const int p0_h = ((dilation_h * (kernel_h - 1)) >> 1) - pad_h + (_temp % height_out) * stride_h; _temp /= height_out; const int b_col = _temp; const opmath_t top_grad = grad_col[index]; const int input_size = height_in * width_in; const int kernel_size = kernel_h * kernel_w; int data_weight_ptr = sampling_index * kernel_size; int data_loc_w_ptr = data_weight_ptr << 1; const int grad_sampling_ptr = data_weight_ptr; grad_offset += grad_sampling_ptr << 1; grad_mask += grad_sampling_ptr; const int qid_stride = group * group_channels; const int im_ptr_offset = b_col * input_size * qid_stride; const scalar_t *data_im_ptr = data_im + im_ptr_offset; opmath_t *grad_im_ptr = grad_im + im_ptr_offset; const opmath_t p0_w_ = p0_w - ((dilation_w * (kernel_w - 1)) >> 1) * offset_scale; const opmath_t p0_h_ = p0_h - ((dilation_h * (kernel_h - 1)) >> 1) * offset_scale; for (int i = 0; i < kernel_w; ++i) { for (int j = 0; j < kernel_h; ++j) { const opmath_t offset_w = data_offset[data_loc_w_ptr]; const opmath_t offset_h = data_offset[data_loc_w_ptr + 1]; const opmath_t loc_w = p0_w_ + (i * dilation_w + offset_w) * offset_scale; const opmath_t loc_h = p0_h_ + (j * dilation_h + offset_h) * offset_scale; const opmath_t weight = data_mask[data_weight_ptr]; *(cache_grad_offset + (threadIdx.x << 1)) = 0; *(cache_grad_offset + ((threadIdx.x << 1) + 1)) = 0; *(cache_grad_mask + threadIdx.x) = 0; if (loc_h > -1 && loc_w > -1 && loc_h < height_in && loc_w < width_in) { dcnv3_col2im_bilinear( data_im_ptr, height_in, width_in, group, group_channels, loc_h, loc_w, g_col, c_col, offset_scale, top_grad, weight, grad_im_ptr, cache_grad_offset + (threadIdx.x << 1), cache_grad_mask + threadIdx.x); } __syncthreads(); for (unsigned int s = blockDim.x / 2, spre = blockDim.x; s > 0; s >>= 1, spre >>= 1) { if (tid < s) { const unsigned int xid1 = tid << 1; 
const unsigned int xid2 = (tid + s) << 1; cache_grad_mask[tid] += cache_grad_mask[tid + s]; cache_grad_offset[xid1] += cache_grad_offset[xid2]; cache_grad_offset[xid1 + 1] += cache_grad_offset[xid2 + 1]; if (tid + (s << 1) < spre) { cache_grad_mask[tid] += cache_grad_mask[tid + (s << 1)]; cache_grad_offset[xid1] += cache_grad_offset[xid2 + (s << 1)]; cache_grad_offset[xid1 + 1] += cache_grad_offset[xid2 + 1 + (s << 1)]; } } __syncthreads(); } if (tid == 0) { *grad_offset = cache_grad_offset[0]; *(grad_offset + 1) = cache_grad_offset[1]; *grad_mask = cache_grad_mask[0]; } __syncthreads(); data_weight_ptr += 1; data_loc_w_ptr += 2; grad_mask += 1; grad_offset += 2; } } } } template __global__ void dcnv3_col2im_gpu_kernel_shm_reduce_v2_multi_blocks( const int num_kernels, const scalar_t *grad_col, const scalar_t *data_im, const scalar_t *data_offset, const scalar_t *data_mask, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, const int pad_h, const int pad_w, const int dilation_h, const int dilation_w, const int group, const int group_channels, const int height_in, const int width_in, const int height_out, const int width_out, const opmath_t offset_scale, opmath_t *grad_im, opmath_t *grad_offset, opmath_t *grad_mask) { CUDA_KERNEL_LOOP(index, num_kernels) { extern __shared__ int _s[]; opmath_t *cache_grad_offset = (opmath_t *)_s; opmath_t *cache_grad_mask = cache_grad_offset + 2 * blockDim.x; unsigned int tid = threadIdx.x; int _temp = index; const int c_col = _temp % group_channels; _temp /= group_channels; const int sampling_index = _temp; const int g_col = _temp % group; _temp /= group; const int p0_w = ((dilation_w * (kernel_w - 1)) >> 1) - pad_w + (_temp % width_out) * stride_w; _temp /= width_out; const int p0_h = ((dilation_h * (kernel_h - 1)) >> 1) - pad_h + (_temp % height_out) * stride_h; _temp /= height_out; const int b_col = _temp; const opmath_t top_grad = grad_col[index]; const int input_size = height_in * width_in; const 
int kernel_size = kernel_h * kernel_w; int data_weight_ptr = sampling_index * kernel_size; int data_loc_w_ptr = data_weight_ptr << 1; const int grad_sampling_ptr = data_weight_ptr; grad_offset += grad_sampling_ptr << 1; grad_mask += grad_sampling_ptr; const int qid_stride = group * group_channels; const int im_ptr_offset = b_col * input_size * qid_stride; const scalar_t *data_im_ptr = data_im + im_ptr_offset; opmath_t *grad_im_ptr = grad_im + im_ptr_offset; const opmath_t p0_w_ = p0_w - ((dilation_w * (kernel_w - 1)) >> 1) * offset_scale; const opmath_t p0_h_ = p0_h - ((dilation_h * (kernel_h - 1)) >> 1) * offset_scale; for (int i = 0; i < kernel_w; ++i) { for (int j = 0; j < kernel_h; ++j) { const opmath_t offset_w = data_offset[data_loc_w_ptr]; const opmath_t offset_h = data_offset[data_loc_w_ptr + 1]; const opmath_t loc_w = p0_w_ + (i * dilation_w + offset_w) * offset_scale; const opmath_t loc_h = p0_h_ + (j * dilation_h + offset_h) * offset_scale; const opmath_t weight = data_mask[data_weight_ptr]; *(cache_grad_offset + (threadIdx.x << 1)) = 0; *(cache_grad_offset + ((threadIdx.x << 1) + 1)) = 0; *(cache_grad_mask + threadIdx.x) = 0; if (loc_h > -1 && loc_w > -1 && loc_h < height_in && loc_w < width_in) { dcnv3_col2im_bilinear( data_im_ptr, height_in, width_in, group, group_channels, loc_h, loc_w, g_col, c_col, offset_scale, top_grad, weight, grad_im_ptr, cache_grad_offset + (threadIdx.x << 1), cache_grad_mask + threadIdx.x); } __syncthreads(); for (unsigned int s = blockDim.x / 2, spre = blockDim.x; s > 0; s >>= 1, spre >>= 1) { if (tid < s) { const unsigned int xid1 = tid << 1; const unsigned int xid2 = (tid + s) << 1; cache_grad_mask[tid] += cache_grad_mask[tid + s]; cache_grad_offset[xid1] += cache_grad_offset[xid2]; cache_grad_offset[xid1 + 1] += cache_grad_offset[xid2 + 1]; if (tid + (s << 1) < spre) { cache_grad_mask[tid] += cache_grad_mask[tid + (s << 1)]; cache_grad_offset[xid1] += cache_grad_offset[xid2 + (s << 1)]; cache_grad_offset[xid1 + 1] += 
cache_grad_offset[xid2 + 1 + (s << 1)]; } } __syncthreads(); } if (tid == 0) { atomicAdd(grad_offset, cache_grad_offset[0]); atomicAdd(grad_offset + 1, cache_grad_offset[1]); atomicAdd(grad_mask, cache_grad_mask[0]); } __syncthreads(); data_weight_ptr += 1; data_loc_w_ptr += 2; grad_mask += 1; grad_offset += 2; } } } } template __global__ void dcnv3_col2im_gpu_kernel_gm( const int num_kernels, const scalar_t *grad_col, const scalar_t *data_im, const scalar_t *data_offset, const scalar_t *data_mask, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, const int pad_h, const int pad_w, const int dilation_h, const int dilation_w, const int group, const int group_channels, const int height_in, const int width_in, const int height_out, const int width_out, const opmath_t offset_scale, opmath_t *grad_im, opmath_t *grad_offset, opmath_t *grad_mask) { CUDA_KERNEL_LOOP(index, num_kernels) { int _temp = index; const int c_col = _temp % group_channels; _temp /= group_channels; const int sampling_index = _temp; const int g_col = _temp % group; _temp /= group; const int p0_w = ((dilation_w * (kernel_w - 1)) >> 1) - pad_w + (_temp % width_out) * stride_w; _temp /= width_out; const int p0_h = ((dilation_h * (kernel_h - 1)) >> 1) - pad_h + (_temp % height_out) * stride_h; _temp /= height_out; const int b_col = _temp; const opmath_t top_grad = grad_col[index]; const int input_size = height_in * width_in; const int kernel_size = kernel_h * kernel_w; int data_weight_ptr = sampling_index * kernel_size; int data_loc_w_ptr = data_weight_ptr << 1; const int grad_sampling_ptr = data_weight_ptr; grad_offset += grad_sampling_ptr << 1; grad_mask += grad_sampling_ptr; const int qid_stride = group * group_channels; const int im_ptr_offset = b_col * input_size * qid_stride; const scalar_t *data_im_ptr = data_im + im_ptr_offset; opmath_t *grad_im_ptr = grad_im + im_ptr_offset; const opmath_t p0_w_ = p0_w - ((dilation_w * (kernel_w - 1)) >> 1) * offset_scale; const 
opmath_t p0_h_ = p0_h - ((dilation_h * (kernel_h - 1)) >> 1) * offset_scale; for (int i = 0; i < kernel_w; ++i) { for (int j = 0; j < kernel_h; ++j) { const opmath_t offset_w = data_offset[data_loc_w_ptr]; const opmath_t offset_h = data_offset[data_loc_w_ptr + 1]; const opmath_t loc_w = p0_w_ + (i * dilation_w + offset_w) * offset_scale; const opmath_t loc_h = p0_h_ + (j * dilation_h + offset_h) * offset_scale; const opmath_t weight = data_mask[data_weight_ptr]; if (loc_h > -1 && loc_w > -1 && loc_h < height_in && loc_w < width_in) { dcnv3_col2im_bilinear_gm( data_im_ptr, height_in, width_in, group, group_channels, loc_h, loc_w, g_col, c_col, offset_scale, top_grad, weight, grad_im_ptr, grad_offset, grad_mask); } data_weight_ptr += 1; data_loc_w_ptr += 2; grad_mask += 1; grad_offset += 2; } } } } template void dcnv3_im2col_cuda(cudaStream_t stream, const scalar_t *data_im, const scalar_t *data_offset, const scalar_t *data_mask, scalar_t *data_col, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, const int pad_h, const int pad_w, const int dilation_h, const int dilation_w, const int group, const int group_channels, const int batch_n, const int height_in, const int width_in, const int height_out, const int width_out, const opmath_t offset_scale) { const int num_kernels = batch_n * height_out * width_out * group * group_channels; const int num_actual_kernels = batch_n * height_out * width_out * group * group_channels; const int num_threads = CUDA_NUM_THREADS; dcnv3_im2col_gpu_kernel <<>>(num_kernels, data_im, data_offset, data_mask, data_col, kernel_h, kernel_w, stride_h, stride_w, pad_h, pad_w, dilation_h, dilation_w, group, group_channels, height_in, width_in, height_out, width_out, offset_scale); cudaError_t err = cudaGetLastError(); if (err != cudaSuccess) { printf("error in dcnv3_im2col_cuda: %s\n", cudaGetErrorString(err)); } } template void dcnv3_col2im_cuda( cudaStream_t stream, const scalar_t *grad_col, const scalar_t *data_im, 
const scalar_t *data_offset, const scalar_t *data_mask, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, const int pad_h, const int pad_w, const int dilation_h, const int dilation_w, const int group, const int group_channels, const int batch_n, const int height_in, const int width_in, const int height_out, const int width_out, const opmath_t offset_scale, opmath_t *grad_im, opmath_t *grad_offset, opmath_t *grad_mask) { const int num_threads = (group_channels > CUDA_NUM_THREADS) ? CUDA_NUM_THREADS : group_channels; const int num_kernels = batch_n * height_out * width_out * group * group_channels; const int num_actual_kernels = batch_n * height_out * width_out * group * group_channels; if (group_channels > 1024) { if ((group_channels & 1023) == 0) { dcnv3_col2im_gpu_kernel_shm_reduce_v2_multi_blocks <<>>( num_kernels, grad_col, data_im, data_offset, data_mask, kernel_h, kernel_w, stride_h, stride_w, pad_h, pad_w, dilation_h, dilation_w, group, group_channels, height_in, width_in, height_out, width_out, offset_scale, grad_im, grad_offset, grad_mask); } else { dcnv3_col2im_gpu_kernel_gm <<>>(num_kernels, grad_col, data_im, data_offset, data_mask, kernel_h, kernel_w, stride_h, stride_w, pad_h, pad_w, dilation_h, dilation_w, group, group_channels, height_in, width_in, height_out, width_out, offset_scale, grad_im, grad_offset, grad_mask); } } else { switch (group_channels) { case 1: dcnv3_col2im_gpu_kernel_shm_blocksize_aware_reduce_v1 <<>>(num_kernels, grad_col, data_im, data_offset, data_mask, kernel_h, kernel_w, stride_h, stride_w, pad_h, pad_w, dilation_h, dilation_w, group, group_channels, height_in, width_in, height_out, width_out, offset_scale, grad_im, grad_offset, grad_mask); break; case 2: dcnv3_col2im_gpu_kernel_shm_blocksize_aware_reduce_v1 <<>>(num_kernels, grad_col, data_im, data_offset, data_mask, kernel_h, kernel_w, stride_h, stride_w, pad_h, pad_w, dilation_h, dilation_w, group, group_channels, height_in, width_in, 
height_out, width_out, offset_scale, grad_im, grad_offset, grad_mask); break; case 4: dcnv3_col2im_gpu_kernel_shm_blocksize_aware_reduce_v1 <<>>(num_kernels, grad_col, data_im, data_offset, data_mask, kernel_h, kernel_w, stride_h, stride_w, pad_h, pad_w, dilation_h, dilation_w, group, group_channels, height_in, width_in, height_out, width_out, offset_scale, grad_im, grad_offset, grad_mask); break; case 8: dcnv3_col2im_gpu_kernel_shm_blocksize_aware_reduce_v1 <<>>(num_kernels, grad_col, data_im, data_offset, data_mask, kernel_h, kernel_w, stride_h, stride_w, pad_h, pad_w, dilation_h, dilation_w, group, group_channels, height_in, width_in, height_out, width_out, offset_scale, grad_im, grad_offset, grad_mask); break; case 16: dcnv3_col2im_gpu_kernel_shm_blocksize_aware_reduce_v1 <<>>(num_kernels, grad_col, data_im, data_offset, data_mask, kernel_h, kernel_w, stride_h, stride_w, pad_h, pad_w, dilation_h, dilation_w, group, group_channels, height_in, width_in, height_out, width_out, offset_scale, grad_im, grad_offset, grad_mask); break; case 32: dcnv3_col2im_gpu_kernel_shm_blocksize_aware_reduce_v1 <<>>(num_kernels, grad_col, data_im, data_offset, data_mask, kernel_h, kernel_w, stride_h, stride_w, pad_h, pad_w, dilation_h, dilation_w, group, group_channels, height_in, width_in, height_out, width_out, offset_scale, grad_im, grad_offset, grad_mask); break; case 64: dcnv3_col2im_gpu_kernel_shm_blocksize_aware_reduce_v2 <<>>(num_kernels, grad_col, data_im, data_offset, data_mask, kernel_h, kernel_w, stride_h, stride_w, pad_h, pad_w, dilation_h, dilation_w, group, group_channels, height_in, width_in, height_out, width_out, offset_scale, grad_im, grad_offset, grad_mask); break; case 128: dcnv3_col2im_gpu_kernel_shm_blocksize_aware_reduce_v2 <<>>(num_kernels, grad_col, data_im, data_offset, data_mask, kernel_h, kernel_w, stride_h, stride_w, pad_h, pad_w, dilation_h, dilation_w, group, group_channels, height_in, width_in, height_out, width_out, offset_scale, grad_im, 
grad_offset, grad_mask); break; case 256: dcnv3_col2im_gpu_kernel_shm_blocksize_aware_reduce_v2 <<>>(num_kernels, grad_col, data_im, data_offset, data_mask, kernel_h, kernel_w, stride_h, stride_w, pad_h, pad_w, dilation_h, dilation_w, group, group_channels, height_in, width_in, height_out, width_out, offset_scale, grad_im, grad_offset, grad_mask); break; case 512: dcnv3_col2im_gpu_kernel_shm_blocksize_aware_reduce_v2 <<>>(num_kernels, grad_col, data_im, data_offset, data_mask, kernel_h, kernel_w, stride_h, stride_w, pad_h, pad_w, dilation_h, dilation_w, group, group_channels, height_in, width_in, height_out, width_out, offset_scale, grad_im, grad_offset, grad_mask); break; case 1024: dcnv3_col2im_gpu_kernel_shm_blocksize_aware_reduce_v2 <<>>(num_kernels, grad_col, data_im, data_offset, data_mask, kernel_h, kernel_w, stride_h, stride_w, pad_h, pad_w, dilation_h, dilation_w, group, group_channels, height_in, width_in, height_out, width_out, offset_scale, grad_im, grad_offset, grad_mask); break; default: if (group_channels < 64) { dcnv3_col2im_gpu_kernel_shm_reduce_v1 <<>>( num_kernels, grad_col, data_im, data_offset, data_mask, kernel_h, kernel_w, stride_h, stride_w, pad_h, pad_w, dilation_h, dilation_w, group, group_channels, height_in, width_in, height_out, width_out, offset_scale, grad_im, grad_offset, grad_mask); } else { dcnv3_col2im_gpu_kernel_shm_reduce_v2 <<>>( num_kernels, grad_col, data_im, data_offset, data_mask, kernel_h, kernel_w, stride_h, stride_w, pad_h, pad_w, dilation_h, dilation_w, group, group_channels, height_in, width_in, height_out, width_out, offset_scale, grad_im, grad_offset, grad_mask); } } } cudaError_t err = cudaGetLastError(); if (err != cudaSuccess) { printf("error in dcnv3_col2im_cuda: %s\n", cudaGetErrorString(err)); } } ================================================ FILE: yolo-improve/yolov5-DCNV3/ops_dcnv3/src/dcnv3.h ================================================ /*! 
************************************************************************************************** * InternImage * Copyright (c) 2022 OpenGVLab * Licensed under The MIT License [see LICENSE for details] ************************************************************************************************** * Modified from *https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 ************************************************************************************************** */ #pragma once #include "cpu/dcnv3_cpu.h" #ifdef WITH_CUDA #include "cuda/dcnv3_cuda.h" #endif at::Tensor dcnv3_forward(const at::Tensor &input, const at::Tensor &offset, const at::Tensor &mask, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, const int pad_h, const int pad_w, const int dilation_h, const int dilation_w, const int group, const int group_channels, const float offset_scale, const int im2col_step) { if (input.type().is_cuda()) { #ifdef WITH_CUDA return dcnv3_cuda_forward(input, offset, mask, kernel_h, kernel_w, stride_h, stride_w, pad_h, pad_w, dilation_h, dilation_w, group, group_channels, offset_scale, im2col_step); #else AT_ERROR("Not compiled with GPU support"); #endif } AT_ERROR("Not implemented on the CPU"); } std::vector dcnv3_backward(const at::Tensor &input, const at::Tensor &offset, const at::Tensor &mask, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, const int pad_h, const int pad_w, const int dilation_h, const int dilation_w, const int group, const int group_channels, const float offset_scale, const at::Tensor &grad_output, const int im2col_step) { if (input.type().is_cuda()) { #ifdef WITH_CUDA return dcnv3_cuda_backward(input, offset, mask, kernel_h, kernel_w, stride_h, stride_w, pad_h, pad_w, dilation_h, dilation_w, group, group_channels, offset_scale, grad_output, im2col_step); #else AT_ERROR("Not compiled with GPU support"); #endif } AT_ERROR("Not implemented on the CPU"); } 
================================================ FILE: yolo-improve/yolov5-DCNV3/ops_dcnv3/src/vision.cpp ================================================ /*! ************************************************************************************************** * InternImage * Copyright (c) 2022 OpenGVLab * Licensed under The MIT License [see LICENSE for details] ************************************************************************************************** * Modified from *https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 ************************************************************************************************** */ #include "dcnv3.h" PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { m.def("dcnv3_forward", &dcnv3_forward, "dcnv3_forward"); m.def("dcnv3_backward", &dcnv3_backward, "dcnv3_backward"); } ================================================ FILE: yolo-improve/yolov5-DCNV3/ops_dcnv3/test.py ================================================ # -------------------------------------------------------- # InternImage # Copyright (c) 2022 OpenGVLab # Licensed under The MIT License [see LICENSE for details] # -------------------------------------------------------- from __future__ import absolute_import from __future__ import print_function from __future__ import division import time import torch import torch.nn as nn import math from torch.autograd import gradcheck from functions.dcnv3_func import DCNv3Function, dcnv3_core_pytorch H_in, W_in = 8, 8 N, M, D = 2, 4, 16 Kh, Kw = 3, 3 P = Kh * Kw offset_scale = 2.0 pad = 1 dilation = 1 stride = 1 H_out = (H_in + 2 * pad - (dilation * (Kh - 1) + 1)) // stride + 1 W_out = (W_in + 2 * pad - (dilation * (Kw - 1) + 1)) // stride + 1 torch.manual_seed(3) @torch.no_grad() def check_forward_equal_with_pytorch_double(): input = torch.rand(N, H_in, W_in, M*D).cuda() * 0.01 offset = torch.rand(N, H_out, W_out, M*P*2).cuda() * 10 mask = torch.rand(N, H_out, W_out, M, P).cuda() + 1e-5 mask /= 
mask.sum(-1, keepdim=True) mask = mask.reshape(N, H_out, W_out, M*P) output_pytorch = dcnv3_core_pytorch( input.double(), offset.double(), mask.double(), Kh, Kw, stride, stride, Kh // 2, Kw // 2, dilation, dilation, M, D, offset_scale).detach().cpu() im2col_step = 2 output_cuda = DCNv3Function.apply( input.double(), offset.double(), mask.double(), Kh, Kw, stride, stride, Kh // 2, Kw // 2, dilation, dilation, M, D, offset_scale, im2col_step).detach().cpu() fwdok = torch.allclose(output_cuda, output_pytorch) max_abs_err = (output_cuda - output_pytorch).abs().max() max_rel_err = ((output_cuda - output_pytorch).abs() / output_pytorch.abs()).max() print('>>> forward double') print(f'* {fwdok} check_forward_equal_with_pytorch_double: max_abs_err {max_abs_err:.2e} max_rel_err {max_rel_err:.2e}') @torch.no_grad() def check_forward_equal_with_pytorch_float(): input = torch.rand(N, H_in, W_in, M*D).cuda() * 0.01 offset = torch.rand(N, H_out, W_out, M*P*2).cuda() * 10 mask = torch.rand(N, H_out, W_out, M, P).cuda() + 1e-5 mask /= mask.sum(-1, keepdim=True) mask = mask.reshape(N, H_out, W_out, M*P) output_pytorch = dcnv3_core_pytorch( input, offset, mask, Kh, Kw, stride, stride, Kh // 2, Kw // 2, dilation, dilation, M, D, offset_scale).detach().cpu() im2col_step = 2 output_cuda = DCNv3Function.apply( input, offset, mask, Kh, Kw, stride, stride, Kh // 2, Kw // 2, dilation, dilation, M, D, offset_scale, im2col_step).detach().cpu() fwdok = torch.allclose(output_cuda, output_pytorch, rtol=1e-2, atol=1e-3) max_abs_err = (output_cuda - output_pytorch).abs().max() max_rel_err = ((output_cuda - output_pytorch).abs() / output_pytorch.abs()).max() print('>>> forward float') print(f'* {fwdok} check_forward_equal_with_pytorch_float: max_abs_err {max_abs_err:.2e} max_rel_err {max_rel_err:.2e}') def check_backward_equal_with_pytorch_double(channels=4, grad_input=True, grad_offset=True, grad_mask=True): # H_in, W_in = 4, 4 N = 2 M = 2 H_out = (H_in + 2 * pad - (dilation * (Kh - 1) + 1)) // 
stride + 1 W_out = (W_in + 2 * pad - (dilation * (Kw - 1) + 1)) // stride + 1 D = channels input0 = torch.rand(N, H_in, W_in, M*D).cuda() * 0.01 offset0 = torch.rand(N, H_out, W_out, M*P*2).cuda() * 10 mask0 = torch.rand(N, H_out, W_out, M, P).cuda() + 1e-5 mask0 /= mask0.sum(-1, keepdim=True) mask0 = mask0.reshape(N, H_out, W_out, M*P) input0.requires_grad = grad_input offset0.requires_grad = grad_offset mask0.requires_grad = grad_mask output_pytorch = dcnv3_core_pytorch( input0.double(), offset0.double(), mask0.double(), Kh, Kw, stride, stride, Kh // 2, Kw // 2, dilation, dilation, M, D, offset_scale) output_pytorch.sum().backward() input1 = input0.detach() offset1 = offset0.detach() mask1 = mask0.detach() input1.requires_grad = grad_input offset1.requires_grad = grad_offset mask1.requires_grad = grad_mask im2col_step = 2 output_cuda = DCNv3Function.apply( input1.double(), offset1.double(), mask1.double(), Kh, Kw, stride, stride, Kh // 2, Kw // 2, dilation, dilation, M, D, offset_scale, im2col_step) output_cuda.sum().backward() print(f'>>> backward double: channels {D}') bwdok = torch.allclose(input0.grad, input1.grad, rtol=1e-2, atol=1e-3) max_abs_err = (input0.grad - input1.grad).abs().max() max_rel_err = ((input0.grad - input1.grad).abs() / input0.grad.abs()).max() print( f'* {bwdok} input_grad check_backward_equal_with_pytorch_double: max_abs_err {max_abs_err:.2e} max_rel_err {max_rel_err:.2e}') bwdok = torch.allclose(offset0.grad, offset1.grad, rtol=1e-2, atol=1e-3) max_abs_err = (offset0.grad - offset1.grad).abs().max() max_rel_err = ((offset0.grad - offset1.grad).abs() / offset0.grad.abs()).max() print( f'* {bwdok} offset_grad check_backward_equal_with_pytorch_double: max_abs_err {max_abs_err:.2e} max_rel_err {max_rel_err:.2e}') bwdok = torch.allclose(mask0.grad, mask1.grad, rtol=1e-2, atol=1e-3) max_abs_err = (mask0.grad - mask1.grad).abs().max() max_rel_err = ((mask0.grad - mask1.grad).abs() / mask0.grad.abs()).max() print( f'* {bwdok} mask_grad 
check_backward_equal_with_pytorch_double: max_abs_err {max_abs_err:.2e} max_rel_err {max_rel_err:.2e}') def check_backward_equal_with_pytorch_float(channels=4, grad_input=True, grad_offset=True, grad_mask=True): # H_in, W_in = 4, 4 N = 2 M = 2 H_out = (H_in + 2 * pad - (dilation * (Kh - 1) + 1)) // stride + 1 W_out = (W_in + 2 * pad - (dilation * (Kw - 1) + 1)) // stride + 1 D = channels input0 = torch.rand(N, H_in, W_in, M*D).cuda() * 0.01 offset0 = torch.rand(N, H_out, W_out, M*P*2).cuda() * 10 mask0 = torch.rand(N, H_out, W_out, M, P).cuda() + 1e-5 mask0 /= mask0.sum(-1, keepdim=True) mask0 = mask0.reshape(N, H_out, W_out, M*P) input0.requires_grad = grad_input offset0.requires_grad = grad_offset mask0.requires_grad = grad_mask output_pytorch = dcnv3_core_pytorch( input0, offset0, mask0, Kh, Kw, stride, stride, Kh // 2, Kw // 2, dilation, dilation, M, D, offset_scale) output_pytorch.sum().backward() input1 = input0.detach() offset1 = offset0.detach() mask1 = mask0.detach() input1.requires_grad = grad_input offset1.requires_grad = grad_offset mask1.requires_grad = grad_mask im2col_step = 2 output_cuda = DCNv3Function.apply( input1, offset1, mask1, Kh, Kw, stride, stride, Kh // 2, Kw // 2, dilation, dilation, M, D, offset_scale, im2col_step) output_cuda.sum().backward() print(f'>>> backward float: channels {D}') bwdok = torch.allclose(input0.grad, input1.grad, rtol=1e-2, atol=1e-3) max_abs_err = (input0.grad - input1.grad).abs().max() max_rel_err = ((input0.grad - input1.grad).abs() / input0.grad.abs()).max() print( f'* {bwdok} input_grad check_backward_equal_with_pytorch_float: max_abs_err {max_abs_err:.2e} max_rel_err {max_rel_err:.2e}') bwdok = torch.allclose(offset0.grad, offset1.grad, rtol=1e-2, atol=1e-3) max_abs_err = (offset0.grad - offset1.grad).abs().max() max_rel_err = ((offset0.grad - offset1.grad).abs() / offset0.grad.abs()).max() print( f'* {bwdok} offset_grad check_backward_equal_with_pytorch_float: max_abs_err {max_abs_err:.2e} max_rel_err 
{max_rel_err:.2e}') bwdok = torch.allclose(mask0.grad, mask1.grad, rtol=1e-2, atol=1e-3) max_abs_err = (mask0.grad - mask1.grad).abs().max() max_rel_err = ((mask0.grad - mask1.grad).abs() / mask0.grad.abs()).max() print( f'* {bwdok} mask_grad check_backward_equal_with_pytorch_float: max_abs_err {max_abs_err:.2e} max_rel_err {max_rel_err:.2e}') @torch.no_grad() def check_time_cost(im2col_step=128): N = 512 H_in, W_in = 64, 64 H_out = (H_in + 2 * pad - (dilation * (Kh - 1) + 1)) // stride + 1 W_out = (W_in + 2 * pad - (dilation * (Kw - 1) + 1)) // stride + 1 input = torch.rand(N, H_in, W_in, M*D).cuda() * 0.01 offset = torch.rand(N, H_out, W_out, M*P*2).cuda() * 10 mask = torch.rand(N, H_out, W_out, M, P).cuda() + 1e-5 mask /= mask.sum(-1, keepdim=True) mask = mask.reshape(N, H_out, W_out, M*P) print( f'>>> time cost: im2col_step {im2col_step}; input {input.shape}; points {P} ') repeat = 100 for i in range(repeat): output_cuda = DCNv3Function.apply( input, offset, mask, Kh, Kw, stride, stride, Kh // 2, Kw // 2, dilation, dilation, M, D, 1.0, im2col_step) torch.cuda.synchronize() start = time.time() for i in range(repeat): output_cuda = DCNv3Function.apply( input, offset, mask, Kh, Kw, stride, stride, Kh // 2, Kw // 2, dilation, dilation, M, D, 1.0, im2col_step) torch.cuda.synchronize() print(f'foward time cost: {(time.time() - start) / repeat}') if __name__ == '__main__': check_forward_equal_with_pytorch_double() check_forward_equal_with_pytorch_float() for channels in [1, 16, 30, 32, 64, 71, 1025]: check_backward_equal_with_pytorch_double(channels, True, True, True) for channels in [1, 16, 30, 32, 64, 71, 1025]: check_backward_equal_with_pytorch_float(channels, True, True, True) for i in range(3): im2col_step = 128 * (2 ** i) check_time_cost(im2col_step) ================================================ FILE: yolo-improve/yolov5-DSConv.py ================================================ import torch.nn.functional as F from torch.nn.modules.conv import _ConvNd from 
class DSConv(_ConvNd):
    """2-D convolution carrying the auxiliary DSConv (KDS/CDS) tensors.

    ``forward`` is a plain ``F.conv2d`` with ``self.weight``; the KDS scaling
    is only available through :meth:`get_weight_res`.

    Args:
        in_channels, out_channels, kernel_size, stride, dilation, groups,
        padding_mode, bias: as for ``nn.Conv2d``.
        padding: explicit padding, or ``None`` to let ``autopad`` choose it.
        block_size: number of input channels sharing one ``alpha`` entry.
        KDSBias: also allocate the per-block bias tensor ``KDSb``.
        CDS: also allocate the per-output-channel tensors ``CDSw``/``CDSb``.
    """

    def __init__(self, in_channels, out_channels, kernel_size, stride=1, padding=None, dilation=1, groups=1,
                 padding_mode='zeros', bias=False, block_size=32, KDSBias=False, CDS=False):
        padding = _pair(autopad(kernel_size, padding, dilation))
        kernel_size = _pair(kernel_size)
        stride = _pair(stride)
        dilation = _pair(dilation)

        blck_numb = math.ceil(in_channels / (block_size * groups))
        super(DSConv, self).__init__(
            in_channels, out_channels, kernel_size, stride, padding, dilation, False, _pair(0), groups, bias,
            padding_mode)

        self.KDSBias = KDSBias
        self.CDS = CDS

        # Previously these were uninitialised ``torch.Tensor(...)`` attributes:
        # arbitrary memory that did not follow ``.to(device)``/``.half()``.
        # Non-persistent buffers keep the state_dict keys unchanged.
        self.register_buffer('intweight', torch.zeros(out_channels, in_channels, *kernel_size), persistent=False)
        self.register_buffer('alpha', torch.ones(out_channels, blck_numb, *kernel_size), persistent=False)
        if KDSBias:
            self.register_buffer('KDSb', torch.zeros(out_channels, blck_numb, *kernel_size), persistent=False)
        if CDS:
            self.register_buffer('CDSw', torch.ones(out_channels), persistent=False)
            self.register_buffer('CDSb', torch.zeros(out_channels), persistent=False)

        self.reset_parameters()

    def _expand_per_block(self, per_block):
        """Broadcast a ``(out, blocks, kh, kw)`` tensor to the shape of ``self.weight``."""
        nmb_blocks = per_block.shape[1]
        total_depth = self.weight.shape[1]
        bs = total_depth // nmb_blocks
        expanded = torch.zeros_like(self.weight)
        for i in range(nmb_blocks):
            # the last block absorbs the channels left over by the integer division
            stop = total_depth if i == nmb_blocks - 1 else (i + 1) * bs
            expanded[:, i * bs:stop, ...] = per_block[:, i:i + 1, ...]
        return expanded

    def get_weight_res(self):
        """Return ``alpha * weight`` (plus ``KDSb`` when ``KDSBias`` is set).

        ``alpha`` and ``KDSb`` are expanded block-wise over the input-channel
        axis. The CDS tensors are not applied here.
        """
        weight_res = torch.mul(self._expand_per_block(self.alpha), self.weight)
        if self.KDSBias:
            weight_res = torch.add(weight_res, self._expand_per_block(self.KDSb))
        return weight_res

    def forward(self, input):
        # The KDS-scaled weight (get_weight_res) is intentionally not used here.
        return F.conv2d(input, self.weight, self.bias,
                        self.stride, self.padding, self.dilation, self.groups)


class DSConv2D(Conv):
    """``Conv`` block (conv + norm + activation) whose conv layer is a ``DSConv``."""

    def __init__(self, inc, ouc, k=1, s=1, p=None, g=1, d=1, act=True):
        super().__init__(inc, ouc, k, s, p, g, d, act)
        # DSConv's positional order is (..., padding, dilation, groups); passing
        # ``g, d`` positionally swapped groups and dilation, so use keywords.
        self.conv = DSConv(inc, ouc, k, s, p, dilation=d, groups=g)


class Bottleneck_DSConv(nn.Module):
    # Standard bottleneck
    def __init__(self, c1, c2, shortcut=True, g=1, e=0.5):  # ch_in, ch_out, shortcut, groups, expansion
        super().__init__()
        c_ = int(c2 * e)  # hidden channels
        self.cv1 = DSConv2D(c1, c_, 1, 1)
        self.cv2 = DSConv2D(c_, c2, 3, 1, g=g)
        self.add = shortcut and c1 == c2  # residual only when shapes match

    def forward(self, x):
        return x + self.cv2(self.cv1(x)) if self.add else self.cv2(self.cv1(x))


class C3_DSConv(C3):
    # C3 module with dsconv
    def __init__(self, c1, c2, n=1, shortcut=True, g=1, e=0.5):
        super().__init__(c1, c2, n, shortcut, g, e)
        c_ = int(c2 * e)
        self.m = nn.Sequential(*(Bottleneck_DSConv(c_, c_, shortcut, g, e=1.0) for _ in range(n)))
inplace=True): # detection layer super().__init__() self.nc = nc # number of classes self.no = nc + 5 # number of outputs per anchor self.nl = len(anchors) # number of detection layers self.na = len(anchors[0]) // 2 # number of anchors self.grid = [torch.empty(0) for _ in range(self.nl)] # init grid self.anchor_grid = [torch.empty(0) for _ in range(self.nl)] # init anchor grid self.register_buffer('anchors', torch.tensor(anchors).float().view(self.nl, -1, 2)) # shape(nl,na,2) self.m_stem = nn.ModuleList(Conv(x, x, 1) for x in ch) # stem conv self.m_cls = nn.ModuleList(nn.Sequential(Conv(x, x, 3), nn.Conv2d(x, self.na * self.nc, 1)) for x in ch) # cls conv self.m_reg_conf = nn.ModuleList(Conv(x, x, 3) for x in ch) # reg_conf stem conv self.m_reg = nn.ModuleList(nn.Conv2d(x, self.na * 4, 1) for x in ch) # reg conv self.m_conf = nn.ModuleList(nn.Conv2d(x, self.na * 1, 1) for x in ch) # conf conv self.inplace = inplace # use inplace ops (e.g. slice assignment) def forward(self, x): z = [] # inference output for i in range(self.nl): x[i] = self.m_stem[i](x[i]) # conv bs, _, ny, nx = x[i].shape x_cls = self.m_cls[i](x[i]).view(bs, self.na, self.nc, ny, nx).permute(0, 1, 3, 4, 2).contiguous() x_reg_conf = self.m_reg_conf[i](x[i]) x_reg = self.m_reg[i](x_reg_conf).view(bs, self.na, 4, ny, nx).permute(0, 1, 3, 4, 2).contiguous() x_conf = self.m_conf[i](x_reg_conf).view(bs, self.na, 1, ny, nx).permute(0, 1, 3, 4, 2).contiguous() x[i] = torch.cat([x_reg, x_conf, x_cls], dim=4) if not self.training: # inference if self.dynamic or self.grid[i].shape[2:4] != x[i].shape[2:4]: self.grid[i], self.anchor_grid[i] = self._make_grid(nx, ny, i) if isinstance(self, Segment): # (boxes + masks) xy, wh, conf, mask = x[i].split((2, 2, self.nc + 1, self.no - self.nc - 5), 4) xy = (xy.sigmoid() * 2 + self.grid[i]) * self.stride[i] # xy wh = (wh.sigmoid() * 2) ** 2 * self.anchor_grid[i] # wh y = torch.cat((xy, wh, conf.sigmoid(), mask), 4) else: # Detect (boxes only) xy, wh, conf = 
x[i].sigmoid().split((2, 2, self.nc + 1), 4) xy = (xy * 2 + self.grid[i]) * self.stride[i] # xy wh = (wh * 2) ** 2 * self.anchor_grid[i] # wh y = torch.cat((xy, wh, conf), 4) z.append(y.view(bs, self.na * nx * ny, self.no)) return x if self.training else (torch.cat(z, 1),) if self.export else (torch.cat(z, 1), x) def _make_grid(self, nx=20, ny=20, i=0, torch_1_10=check_version(torch.__version__, '1.10.0')): d = self.anchors[i].device t = self.anchors[i].dtype shape = 1, self.na, ny, nx, 2 # grid shape y, x = torch.arange(ny, device=d, dtype=t), torch.arange(nx, device=d, dtype=t) yv, xv = torch.meshgrid(y, x, indexing='ij') if torch_1_10 else torch.meshgrid(y, x) # torch>=0.7 compatibility grid = torch.stack((xv, yv), 2).expand(shape) - 0.5 # add grid offset, i.e. y = 2.0 * x - 0.5 anchor_grid = (self.anchors[i] * self.stride[i]).view((1, self.na, 1, 1, 2)).expand(shape) return grid, anchor_grid def _initialize_biases(self, cf=None): # initialize biases into Detect(), cf is class frequency # https://arxiv.org/abs/1708.02002 section 3.3 # cf = torch.bincount(torch.tensor(np.concatenate(dataset.labels, 0)[:, 0]).long(), minlength=nc) + 1. 
class DySnakeConv(nn.Module):
    """Fuse a regular conv with x-axis and y-axis snake convs via a 1x1 conv."""

    def __init__(self, inc, ouc, k=3, act=True) -> None:
        super().__init__()

        self.conv_0 = Conv(inc, ouc, k, act=act)          # standard branch
        self.conv_x = DSConv(inc, ouc, 0, k)              # snake conv, morph 0
        self.conv_y = DSConv(inc, ouc, 1, k)              # snake conv, morph 1
        self.conv_1x1 = Conv(ouc * 3, ouc, 1, act=act)    # fuse the three branches

    def forward(self, x):
        branches = [self.conv_0(x), self.conv_x(x), self.conv_y(x)]
        stacked = torch.cat(branches, dim=1)
        return self.conv_1x1(stacked)
# Core code, for ease of understanding, we mark the dimensions of input and output next to the code
class DSC(object):
    """Sampling-coordinate generation and bilinear gathering for snake convolution.

    Naming follows the original code: dim 2 of the feature map is called
    "width" (W, indexed by ``y``) and dim 3 "height" (H, indexed by ``x``).

    Args:
        input_shape: shape ``(B, C, W, H)`` of the feature map to be sampled.
        kernel_size: number of sampling points K along the snake.
        extend_scope: multiplier applied to the accumulated offsets.
        morph: 0 spreads the kernel along ``x`` and deforms ``y``; any other
            value spreads along ``y`` and deforms ``x``.
    """

    def __init__(self, input_shape, kernel_size, extend_scope, morph):
        self.num_points = kernel_size
        self.width = input_shape[2]
        self.height = input_shape[3]
        self.morph = morph
        self.extend_scope = extend_scope  # offset (-1 ~ 1) * extend_scope

        # define feature map shape
        self.num_batch = input_shape[0]
        self.num_channels = input_shape[1]

    def _accumulate_offset(self, offset):
        """Turn per-point offsets ``[B, K, W, H]`` into cumulative ones.

        The centre point stays fixed and every other point adds its own offset
        to that of its inner neighbour. The loop must reach ``center`` itself:
        stopping one step short left the outermost points un-accumulated and,
        for K = 3, never re-attached the detached copy to the graph, so the
        offset branch received no gradient.
        """
        offset = offset.permute(1, 0, 2, 3)  # [K, B, W, H]
        accumulated = offset.detach().clone()
        center = int(self.num_points // 2)
        accumulated[center] = 0
        for index in range(1, center + 1):
            upper = center + index
            if upper < self.num_points:  # even K has no mirrored outermost point
                accumulated[upper] = accumulated[upper - 1] + offset[upper]
            lower = center - index
            accumulated[lower] = accumulated[lower + 1] + offset[lower]
        return accumulated.permute(1, 0, 2, 3)

    def _coordinate_map_3D(self, offset, if_offset):
        """Build the sampling coordinate maps.

        Args:
            offset: ``[B, 2*K, W, H]``; the first K channels are y offsets and
                the last K channels x offsets.
            if_offset: when false the offsets are ignored (regular kernel).

        Returns:
            ``(y, x)``, each ``[B, K*W, H]`` for ``morph == 0`` and
            ``[B, W, K*H]`` otherwise.
        """
        device = offset.device
        B, K, W, H = self.num_batch, self.num_points, self.width, self.height
        y_offset, x_offset = torch.split(offset, K, dim=1)

        # centre coordinate of every sampling point, built directly on `device`
        y_center = torch.arange(W, dtype=torch.float32, device=device).view(1, 1, W, 1).expand(1, K, W, H)
        x_center = torch.arange(H, dtype=torch.float32, device=device).view(1, 1, 1, H).expand(1, K, W, H)
        # straight-kernel spread: -K//2 ... K//2 along the kernel axis
        spread = torch.linspace(-int(K // 2), int(K // 2), int(K)).to(device).view(1, K, 1, 1)

        if self.morph == 0:
            y_new = y_center.repeat(B, 1, 1, 1)
            x_new = (x_center + spread).repeat(B, 1, 1, 1)
            if if_offset:
                y_new = y_new.add(self._accumulate_offset(y_offset).to(device).mul(self.extend_scope))
            # [B, K, W, H] -> [B, W, K, H] -> [B, K*W, H]
            y_new = y_new.permute(0, 2, 1, 3).reshape([B, K * W, H])
            x_new = x_new.permute(0, 2, 1, 3).reshape([B, K * W, H])
            return y_new, x_new

        y_new = (y_center + spread).repeat(B, 1, 1, 1)
        x_new = x_center.repeat(B, 1, 1, 1)
        if if_offset:
            x_new = x_new.add(self._accumulate_offset(x_offset).to(device).mul(self.extend_scope))
        # [B, K, W, H] -> [B, W, H, K] -> [B, W, K*H]
        y_new = y_new.permute(0, 2, 3, 1).reshape([B, W, K * H])
        x_new = x_new.permute(0, 2, 3, 1).reshape([B, W, K * H])
        return y_new, x_new

    def _bilinear_interpolate_3D(self, input_feature, y, x):
        """Bilinearly sample ``input_feature`` at the coordinates ``(y, x)``.

        Args:
            input_feature: feature map ``[B, C, W, H]``.
            y, x: coordinate maps from :meth:`_coordinate_map_3D`.

        Returns:
            ``[B, C, K*W, H]`` for ``morph == 0``, else ``[B, C, W, K*H]``.
        """
        device = input_feature.device
        B, C, K, W, H = self.num_batch, self.num_channels, self.num_points, self.width, self.height

        y = y.reshape([-1]).float().to(device)
        x = x.reshape([-1]).float().to(device)
        max_y = W - 1
        max_x = H - 1

        y_floor = torch.floor(y).long()
        x_floor = torch.floor(x).long()

        # gather indices are clipped onto the feature map
        y0 = torch.clamp(y_floor, 0, max_y)
        y1 = torch.clamp(y_floor + 1, 0, max_y)
        x0 = torch.clamp(x_floor, 0, max_x)
        x1 = torch.clamp(x_floor + 1, 0, max_x)

        # [B, C, W, H] -> [B*W*H, C]; row index = b*W*H + y*H + x
        flat = input_feature.reshape(B, C, W, H).permute(0, 2, 3, 1).reshape(-1, C)

        # Start row of each sample. It has to stay in the index: the previous
        # `base_y0 - base` cancelled it, so every sample read batch element 0.
        base = (torch.arange(B, device=device) * (W * H)).repeat_interleave(K * W * H)
        row0 = base + y0 * H
        row1 = base + y1 * H

        value_a0 = flat[row0 + x0]
        value_c0 = flat[row0 + x1]
        value_a1 = flat[row1 + x0]
        value_c1 = flat[row1 + x1]

        # interpolation weights use corners clipped one cell beyond the border
        y0_w = torch.clamp(y_floor, 0, max_y + 1).float()
        y1_w = torch.clamp(y_floor + 1, 0, max_y + 1).float()
        x0_w = torch.clamp(x_floor, 0, max_x + 1).float()
        x1_w = torch.clamp(x_floor + 1, 0, max_x + 1).float()

        vol_a0 = ((y1_w - y) * (x1_w - x)).unsqueeze(-1)
        vol_c0 = ((y1_w - y) * (x - x0_w)).unsqueeze(-1)
        vol_a1 = ((y - y0_w) * (x1_w - x)).unsqueeze(-1)
        vol_c1 = ((y - y0_w) * (x - x0_w)).unsqueeze(-1)

        outputs = (value_a0 * vol_a0 + value_c0 * vol_c0 + value_a1 * vol_a1 + value_c1 * vol_c1)

        if self.morph == 0:
            outputs = outputs.reshape([B, K * W, 1 * H, C])
        else:
            outputs = outputs.reshape([B, 1 * W, K * H, C])
        return outputs.permute(0, 3, 1, 2)

    def deform_conv(self, input, offset, if_offset):
        """Return ``input`` resampled along the snake described by ``offset``."""
        y, x = self._coordinate_map_3D(offset, if_offset)
        deformed_feature = self._bilinear_interpolate_3D(input, y, x)
        return deformed_feature
# LVC
class Encoding(nn.Module):
    """Learnable codebook encoding of a feature map.

    Args:
        in_channels: channel count C of the input ``[B, C, H, W]``.
        num_codes: number of codewords K. The value is now honoured; it used
            to be silently replaced by 64.

    ``forward`` returns a tensor of shape ``[B, K, C]``.
    """

    def __init__(self, in_channels, num_codes):
        super(Encoding, self).__init__()
        # init codewords and smoothing factor
        self.in_channels, self.num_codes = in_channels, num_codes
        std = 1. / ((num_codes * in_channels) ** 0.5)
        # [num_codes, channels]
        self.codewords = nn.Parameter(
            torch.empty(num_codes, in_channels, dtype=torch.float).uniform_(-std, std), requires_grad=True)
        # [num_codes]
        self.scale = nn.Parameter(torch.empty(num_codes, dtype=torch.float).uniform_(-1, 0), requires_grad=True)

    @staticmethod
    def scaled_l2(x, codewords, scale):
        """Return ``scale_k * ||x_n - codeword_k||^2`` with shape ``[B, N, K]``."""
        num_codes, in_channels = codewords.size()
        b = x.size(0)
        expanded_x = x.unsqueeze(2).expand((b, x.size(1), num_codes, in_channels))

        # codebook: [K, C] -> [1, 1, K, C]
        reshaped_codewords = codewords.view((1, 1, num_codes, in_channels))

        # scale: [K] -> [1, 1, K], broadcast over batch and positions
        reshaped_scale = scale.view((1, 1, num_codes))

        # scaled squared residual between each position and each codeword
        scaled_l2_norm = reshaped_scale * (expanded_x - reshaped_codewords).pow(2).sum(dim=3)
        return scaled_l2_norm

    @staticmethod
    def aggregate(assignment_weights, x, codewords):
        """Sum the weighted residuals over positions; returns ``[B, K, C]``."""
        num_codes, in_channels = codewords.size()

        # codebook: [K, C] -> [1, 1, K, C]
        reshaped_codewords = codewords.view((1, 1, num_codes, in_channels))
        b = x.size(0)

        # features: [B, N, C] -> [B, N, K, C]
        expanded_x = x.unsqueeze(2).expand((b, x.size(1), num_codes, in_channels))

        # weights: [B, N, K] -> [B, N, K, 1]
        assignment_weights = assignment_weights.unsqueeze(3)

        # weighted residuals, accumulated over the N positions
        encoded_feat = (assignment_weights * (expanded_x - reshaped_codewords)).sum(1)
        return encoded_feat

    def forward(self, x):
        assert x.dim() == 4 and x.size(1) == self.in_channels
        b = x.size(0)

        # [B, C, H, W] -> [B, H*W, C]
        x = x.view(b, self.in_channels, -1).transpose(1, 2).contiguous()

        # assignment_weights: [B, H*W, num_codes], softmax over the codewords
        assignment_weights = torch.softmax(self.scaled_l2(x, self.codewords, self.scale), dim=2)

        # aggregate
        encoded_feat = self.aggregate(assignment_weights, x, self.codewords)
        return encoded_feat


class Mlp(nn.Module):
    """
    Implementation of MLP with 1*1 convolutions. Input: tensor with shape [B, C, H, W]
    """

    def __init__(self, in_features, hidden_features=None, out_features=None, act_layer=nn.GELU, drop=0.):
        super().__init__()
        out_features = out_features or in_features
        hidden_features = hidden_features or in_features
        self.fc1 = nn.Conv2d(in_features, hidden_features, 1)
        self.act = act_layer()
        self.fc2 = nn.Conv2d(hidden_features, out_features, 1)
        self.drop = nn.Dropout(drop)
        self.apply(self._init_weights)

    def _init_weights(self, m):
        # truncated-normal weights and zero bias for every 1x1 conv
        if isinstance(m, nn.Conv2d):
            trunc_normal_(m.weight, std=.02)
            if m.bias is not None:
                nn.init.constant_(m.bias, 0)

    def forward(self, x):
        x = self.fc1(x)
        x = self.act(x)
        x = self.drop(x)
        x = self.fc2(x)
        x = self.drop(x)
        return x
class Mean(nn.Module):
    """Module wrapper around ``Tensor.mean`` over a fixed dimension."""

    def __init__(self, dim, keep_dim=False):
        super(Mean, self).__init__()
        self.dim = dim
        self.keep_dim = keep_dim

    def forward(self, input):
        return input.mean(self.dim, self.keep_dim)


class LVCBlock(nn.Module):
    """Codebook-based channel gating: ``relu(x + x * gate)``.

    Args:
        in_channels: channels of the input and of the output.
        out_channels: stored on the instance only.
        num_codes: codebook size requested from ``Encoding``. It used to be
            overridden by a hard-coded 64.
        channel_ratio, base_channel: accepted for compatibility, unused.
    """

    def __init__(self, in_channels, out_channels, num_codes, channel_ratio=0.25, base_channel=64):
        super(LVCBlock, self).__init__()
        self.out_channels = out_channels
        self.num_codes = num_codes

        self.conv_1 = ConvBlock(in_channels=in_channels, out_channels=in_channels, res_conv=True, stride=1)

        stem = Conv(in_channels, in_channels, 1, act=nn.ReLU())
        encoding = Encoding(in_channels=in_channels, num_codes=num_codes)
        # Size the norm from the codebook the encoder actually built, so the two
        # always agree regardless of how Encoding interprets ``num_codes``.
        code_count = encoding.codewords.size(0)
        self.LVC = nn.Sequential(
            stem,
            encoding,
            nn.BatchNorm1d(code_count),
            nn.ReLU(inplace=True),
            Mean(dim=1))  # average over the codewords -> [B, C]
        self.fc = nn.Sequential(nn.Linear(in_channels, in_channels), nn.Sigmoid())

    def forward(self, x):
        x = self.conv_1(x, return_x_2=False)
        en = self.LVC(x)
        gam = self.fc(en)  # per-channel gate in (0, 1)
        b, in_channels, _, _ = x.size()
        y = gam.view(b, in_channels, 1, 1)
        x = F.relu_(x + x * y)
        return x
class DWConv_LMLP(nn.Module):
    """Depthwise Conv + Conv"""

    def __init__(self, in_channels, out_channels, ksize, stride=1, act="silu"):
        super().__init__()
        # depthwise: one filter per input channel (groups == in_channels)
        self.dconv = Conv(in_channels, in_channels, k=ksize, s=stride, g=in_channels)
        # pointwise: 1x1 conv mixing channels
        self.pconv = Conv(in_channels, out_channels, k=1, s=1, g=1)

    def forward(self, x):
        depthwise_out = self.dconv(x)
        return self.pconv(depthwise_out)
# EVCBlock
class EVCBlock(nn.Module):
    """Stem conv + max-pool feeding an LVC branch and a light-MLP branch.

    The two branch outputs are concatenated on the channel axis and fused by a
    1x1 convolution. ``channel_ratio`` and ``base_channel`` are unused.
    """

    def __init__(self, in_channels, out_channels, channel_ratio=4, base_channel=16):
        super().__init__()
        expansion = 2
        ch = out_channels * expansion

        # Stem stage: get the feature maps by conv block (copied from resnet.py);
        # processing applied before entering the conformer framework
        self.conv1 = Conv(in_channels, in_channels, k=7, act=nn.ReLU())
        self.maxpool = nn.MaxPool2d(kernel_size=3, stride=1, padding=1)  # 1 / 4 [56, 56]

        # LVC (the c1 value is not settled yet)
        self.lvc = LVCBlock(in_channels=in_channels, out_channels=out_channels, num_codes=64)

        # LightMLPBlock
        self.l_MLP = LightMLPBlock(
            in_channels, out_channels, ksize=1, stride=1, act="silu", act_layer=nn.GELU, mlp_ratio=4.,
            drop=0., use_layer_scale=True, layer_scale_init_value=1e-5, drop_path=0., norm_layer=GroupNorm)

        # fuse the concatenated branches
        self.cnv1 = nn.Conv2d(ch, out_channels, kernel_size=1, stride=1, padding=0)

    def forward(self, x):
        stem = self.maxpool(self.conv1(x))
        lvc_out = self.lvc(stem)      # LVCBlock
        mlp_out = self.l_MLP(stem)    # LightMLPBlock
        fused = torch.cat((lvc_out, mlp_out), dim=1)  # concat
        return self.cnv1(fused)

# parse_model registration that accompanies this block:
#     elif m is EVCBlock:
#         c2 = ch[f]
#         args = [c2, c2]
class Faster_Block(nn.Module):
    """Partial-conv spatial mixing followed by a 1x1 MLP, with a residual.

    Args:
        inc: input channels; a 1x1 ``Conv`` maps them to ``dim`` when different.
        dim: working/output channels.
        n_div: channel divisor handed to ``Partial_conv3``.
        mlp_ratio: hidden width of the MLP relative to ``dim``.
        drop_path: stochastic-depth rate (``nn.Identity`` when 0).
        layer_scale_init_value: if > 0, scale the MLP branch by a learnable
            per-channel factor initialised to this value.
        pconv_fw_type: forward variant name handed to ``Partial_conv3``.
    """

    def __init__(self, inc, dim, n_div=4, mlp_ratio=2, drop_path=0.1, layer_scale_init_value=0.0,
                 pconv_fw_type='split_cat'):
        super().__init__()
        self.dim = dim
        self.mlp_ratio = mlp_ratio
        self.drop_path = DropPath(drop_path) if drop_path > 0. else nn.Identity()
        self.n_div = n_div

        mlp_hidden_dim = int(dim * mlp_ratio)
        self.mlp = nn.Sequential(
            Conv(dim, mlp_hidden_dim, 1),
            nn.Conv2d(mlp_hidden_dim, dim, 1, bias=False))

        self.spatial_mixing = Partial_conv3(dim, n_div, pconv_fw_type)

        self.adjust_channel = Conv(inc, dim, 1) if inc != dim else None

        if layer_scale_init_value > 0:
            self.layer_scale = nn.Parameter(layer_scale_init_value * torch.ones((dim)), requires_grad=True)
            self.forward = self.forward_layer_scale
        # otherwise the class-level ``forward`` is used as is

    def forward(self, x):
        if self.adjust_channel is not None:
            x = self.adjust_channel(x)
        shortcut = x
        x = self.spatial_mixing(x)
        x = shortcut + self.drop_path(self.mlp(x))
        return x

    def forward_layer_scale(self, x):
        # The channel adapter was skipped on this path, so inc != dim together
        # with layer scale fed the wrong channel count to spatial_mixing.
        if self.adjust_channel is not None:
            x = self.adjust_channel(x)
        shortcut = x
        x = self.spatial_mixing(x)
        x = shortcut + self.drop_path(
            self.layer_scale.unsqueeze(-1).unsqueeze(-1) * self.mlp(x))
        return x
def conv_bn(in_channels, out_channels, kernel_size, stride, padding, groups=1):
    '''Basic cell for rep-style block, including conv and bn'''
    result = nn.Sequential()
    result.add_module(
        'conv',
        nn.Conv2d(in_channels=in_channels, out_channels=out_channels, kernel_size=kernel_size,
                  stride=stride, padding=padding, groups=groups, bias=False))
    result.add_module('bn', nn.BatchNorm2d(num_features=out_channels))
    return result


class RepConv(nn.Module):
    '''RepConv is a basic rep-style block, including training and deploy status
    Code is based on https://github.com/DingXiaoH/RepVGG/blob/main/repvgg.py

    In training form the block sums a 3x3 conv+bn and a 1x1 conv+bn branch;
    ``switch_to_deploy`` folds both into a single biased 3x3 conv.
    ``dilation`` and ``padding_mode`` only affect the ``deploy=True``
    construction path; ``norm`` is unused.
    '''

    def __init__(self, in_channels, out_channels, kernel_size=3, stride=1, padding=1, dilation=1, groups=1,
                 padding_mode='zeros', deploy=False, act='relu', norm=None):
        super(RepConv, self).__init__()
        self.deploy = deploy
        self.groups = groups
        self.in_channels = in_channels
        self.out_channels = out_channels

        assert kernel_size == 3
        assert padding == 1

        padding_11 = padding - kernel_size // 2  # padding of the 1x1 branch

        if isinstance(act, str):
            self.nonlinearity = get_activation(act)
        else:
            self.nonlinearity = act

        if deploy:
            self.rbr_reparam = nn.Conv2d(in_channels=in_channels, out_channels=out_channels,
                                         kernel_size=kernel_size, stride=stride, padding=padding,
                                         dilation=dilation, groups=groups, bias=True,
                                         padding_mode=padding_mode)
        else:
            self.rbr_identity = None
            self.rbr_dense = conv_bn(in_channels=in_channels, out_channels=out_channels,
                                     kernel_size=kernel_size, stride=stride, padding=padding,
                                     groups=groups)
            self.rbr_1x1 = conv_bn(in_channels=in_channels, out_channels=out_channels, kernel_size=1,
                                   stride=stride, padding=padding_11, groups=groups)

    def forward(self, inputs):
        '''Forward process'''
        if hasattr(self, 'rbr_reparam'):
            return self.nonlinearity(self.rbr_reparam(inputs))

        if self.rbr_identity is None:
            id_out = 0
        else:
            id_out = self.rbr_identity(inputs)

        return self.nonlinearity(
            self.rbr_dense(inputs) + self.rbr_1x1(inputs) + id_out)

    def get_equivalent_kernel_bias(self):
        '''Return the single 3x3 kernel and bias equivalent to all branches.'''
        kernel3x3, bias3x3 = self._fuse_bn_tensor(self.rbr_dense)
        kernel1x1, bias1x1 = self._fuse_bn_tensor(self.rbr_1x1)
        kernelid, biasid = self._fuse_bn_tensor(self.rbr_identity)
        return kernel3x3 + self._pad_1x1_to_3x3_tensor(
            kernel1x1) + kernelid, bias3x3 + bias1x1 + biasid

    def _pad_1x1_to_3x3_tensor(self, kernel1x1):
        '''Zero-pad a 1x1 kernel to 3x3 (``None`` maps to 0).'''
        if kernel1x1 is None:
            return 0
        else:
            return torch.nn.functional.pad(kernel1x1, [1, 1, 1, 1])

    def _fuse_bn_tensor(self, branch):
        '''Fold the BatchNorm of ``branch`` into a ``(kernel, bias)`` pair.

        ``branch`` is ``None`` (returns ``(0, 0)``), a ``conv_bn`` sequential,
        or a bare ``nn.BatchNorm2d`` acting as identity branch.
        '''
        if branch is None:
            return 0, 0
        if isinstance(branch, nn.Sequential):
            kernel = branch.conv.weight
            running_mean = branch.bn.running_mean
            running_var = branch.bn.running_var
            gamma = branch.bn.weight
            beta = branch.bn.bias
            eps = branch.bn.eps
        else:
            assert isinstance(branch, nn.BatchNorm2d)
            if not hasattr(self, 'id_tensor'):
                # Identity expressed as a 3x3 kernel with a single centre tap.
                # Built with torch: this module never imported numpy, so the
                # former ``np.zeros`` raised NameError.
                input_dim = self.in_channels // self.groups
                kernel_value = torch.zeros((self.in_channels, input_dim, 3, 3), dtype=torch.float32)
                channel = torch.arange(self.in_channels)
                kernel_value[channel, channel % input_dim, 1, 1] = 1
                self.id_tensor = kernel_value.to(branch.weight.device)
            kernel = self.id_tensor
            running_mean = branch.running_mean
            running_var = branch.running_var
            gamma = branch.weight
            beta = branch.bias
            eps = branch.eps
        std = (running_var + eps).sqrt()
        t = (gamma / std).reshape(-1, 1, 1, 1)
        return kernel * t, beta - running_mean * gamma / std

    def switch_to_deploy(self):
        '''Replace the training branches by one fused conv (idempotent).'''
        if hasattr(self, 'rbr_reparam'):
            return
        kernel, bias = self.get_equivalent_kernel_bias()
        self.rbr_reparam = nn.Conv2d(
            in_channels=self.rbr_dense.conv.in_channels,
            out_channels=self.rbr_dense.conv.out_channels,
            kernel_size=self.rbr_dense.conv.kernel_size,
            stride=self.rbr_dense.conv.stride,
            padding=self.rbr_dense.conv.padding,
            dilation=self.rbr_dense.conv.dilation,
            groups=self.rbr_dense.conv.groups,
            bias=True)
        self.rbr_reparam.weight.data = kernel
        self.rbr_reparam.bias.data = bias
        for para in self.parameters():
            para.detach_()
        self.__delattr__('rbr_dense')
        self.__delattr__('rbr_1x1')
        if hasattr(self, 'rbr_identity'):
            self.__delattr__('rbr_identity')
        if hasattr(self, 'id_tensor'):
            self.__delattr__('id_tensor')
        self.deploy = True


class Swish(nn.Module):
    '''``x * sigmoid(x)``; with ``inplace=True`` the input tensor is overwritten.'''

    def __init__(self, inplace=True):
        super(Swish, self).__init__()
        self.inplace = inplace

    def forward(self, x):
        if self.inplace:
            x.mul_(torch.sigmoid(x))
            return x
        else:
            return x * torch.sigmoid(x)


def get_activation(name='silu', inplace=True):
    '''Return an activation module for ``name``.

    ``None`` gives ``nn.Identity``; an ``nn.Module`` is returned unchanged;
    an unknown string or any other type raises ``AttributeError``.
    '''
    if name is None:
        return nn.Identity()

    if isinstance(name, str):
        factories = {
            'silu': lambda: nn.SiLU(inplace=inplace),
            'relu': lambda: nn.ReLU(inplace=inplace),
            'lrelu': lambda: nn.LeakyReLU(0.1, inplace=inplace),
            'swish': lambda: Swish(inplace=inplace),
            'hardsigmoid': lambda: nn.Hardsigmoid(inplace=inplace),
            'identity': lambda: nn.Identity(),
        }
        if name not in factories:
            raise AttributeError('Unsupported act type: {}'.format(name))
        return factories[name]()
    elif isinstance(name, nn.Module):
        return name
    else:
        raise AttributeError('Unsupported act type: {}'.format(name))


def get_norm(name, out_channels, inplace=True):
    '''Return the normalisation layer called ``name`` (only ``'bn'`` is supported).'''
    if name == 'bn':
        module = nn.BatchNorm2d(out_channels)
    else:
        raise NotImplementedError
    return module
act is not None def forward(self, x): x = self.conv(x) if self.with_norm: x = self.bn(x) if self.with_act: x = self.act(x) return x def fuseforward(self, x): return self.act(self.conv(x)) class BasicBlock_3x3_Reverse(nn.Module): def __init__(self, ch_in, ch_hidden_ratio, ch_out, act='relu', shortcut=True): super(BasicBlock_3x3_Reverse, self).__init__() assert ch_in == ch_out ch_hidden = int(ch_in * ch_hidden_ratio) self.conv1 = ConvBNAct(ch_hidden, ch_out, 3, stride=1, act=act) self.conv2 = RepConv(ch_in, ch_hidden, 3, stride=1, act=act) self.shortcut = shortcut def forward(self, x): y = self.conv2(x) y = self.conv1(y) if self.shortcut: return x + y else: return y class SPP(nn.Module): def __init__( self, ch_in, ch_out, k, pool_size, act='swish', ): super(SPP, self).__init__() self.pool = [] for i, size in enumerate(pool_size): pool = nn.MaxPool2d(kernel_size=size, stride=1, padding=size // 2, ceil_mode=False) self.add_module('pool{}'.format(i), pool) self.pool.append(pool) self.conv = ConvBNAct(ch_in, ch_out, k, act=act) def forward(self, x): outs = [x] for pool in self.pool: outs.append(pool(x)) y = torch.cat(outs, axis=1) y = self.conv(y) return y class CSPStage(nn.Module): def __init__(self, ch_in, ch_out, n, block_fn='BasicBlock_3x3_Reverse', ch_hidden_ratio=1.0, act='silu', spp=False): super(CSPStage, self).__init__() split_ratio = 2 ch_first = int(ch_out // split_ratio) ch_mid = int(ch_out - ch_first) self.conv1 = ConvBNAct(ch_in, ch_first, 1, act=act) self.conv2 = ConvBNAct(ch_in, ch_mid, 1, act=act) self.convs = nn.Sequential() next_ch_in = ch_mid for i in range(n): if block_fn == 'BasicBlock_3x3_Reverse': self.convs.add_module( str(i), BasicBlock_3x3_Reverse(next_ch_in, ch_hidden_ratio, ch_mid, act=act, shortcut=True)) else: raise NotImplementedError if i == (n - 1) // 2 and spp: self.convs.add_module( 'spp', SPP(ch_mid * 4, ch_mid, 1, [5, 9, 13], act=act)) next_ch_in = ch_mid self.conv3 = ConvBNAct(ch_mid * n + ch_first, ch_out, 1, act=act) def 
forward(self, x): y1 = self.conv1(x) y2 = self.conv2(x) mid_out = [y1] for conv in self.convs: y2 = conv(y2) mid_out.append(y2) y = torch.cat(mid_out, axis=1) y = self.conv3(y) return y ================================================ FILE: yolo-improve/yolov5-GFPN/yolov5_GFPN.yaml ================================================ # YOLOv5 🚀 by Ultralytics, GPL-3.0 license # Parameters nc: 80 # number of classes depth_multiple: 0.33 # model depth multiple width_multiple: 0.25 # layer channel multiple anchors: - [10,13, 16,30, 33,23] # P3/8 - [30,61, 62,45, 59,119] # P4/16 - [116,90, 156,198, 373,326] # P5/32 # YOLOv5 v6.0 backbone backbone: # [from, number, module, args] [[-1, 1, Conv, [64, 6, 2, 2]], # 0-P1/2 [-1, 1, Conv, [128, 3, 2]], # 1-P2/4 [-1, 3, C3, [128]], [-1, 1, Conv, [256, 3, 2]], # 3-P3/8 [-1, 6, C3, [256]], [-1, 1, Conv, [512, 3, 2]], # 5-P4/16 [-1, 9, C3, [512]], [-1, 1, Conv, [1024, 3, 2]], # 7-P5/32 [-1, 3, C3, [1024]], [-1, 1, SPPF, [1024, 5]], # 9 ] # DAMO-YOLO GFPN Head head: [[-1, 1, Conv, [512, 1, 1]], # 10 [6, 1, Conv, [512, 3, 2]], [[-1, 10], 1, Concat, [1]], [-1, 3, CSPStage, [512]], # 13 [-1, 1, nn.Upsample, [None, 2, 'nearest']], #14 [4, 1, Conv, [256, 3, 2]], # 15 [[14, -1, 6], 1, Concat, [1]], [-1, 3, CSPStage, [512]], # 17 [-1, 1, nn.Upsample, [None, 2, 'nearest']], [[-1, 4], 1, Concat, [1]], [-1, 3, CSPStage, [256]], # 20 [-1, 1, Conv, [256, 3, 2]], [[-1, 17], 1, Concat, [1]], [-1, 3, CSPStage, [512]], # 23 [17, 1, Conv, [256, 3, 2]], # 24 [23, 1, Conv, [256, 3, 2]], # 25 [[13, 24, -1], 1, Concat, [1]], [-1, 3, CSPStage, [1024]], # 27 [[20, 23, 27], 1, Detect, [nc, anchors]], # Detect(P3, P4, P5) ] ================================================ FILE: yolo-improve/yolov5-GOLDYOLO/common.py ================================================ import torch.nn.functional as F def conv_bn(in_channels, out_channels, kernel_size, stride, padding, groups=1, bias=False): '''Basic cell for rep-style block, including conv and bn''' result = 
def conv_bn(in_channels, out_channels, kernel_size, stride, padding, groups=1, bias=False):
    '''Build the conv + batch-norm pair used by every rep-style branch.

    The returned ``nn.Sequential`` registers the layers under the names
    ``conv`` and ``bn``.
    '''
    conv = nn.Conv2d(in_channels=in_channels, out_channels=out_channels,
                     kernel_size=kernel_size, stride=stride, padding=padding,
                     groups=groups, bias=bias)
    norm = nn.BatchNorm2d(num_features=out_channels)
    branch = nn.Sequential()
    for layer_name, layer in (('conv', conv), ('bn', norm)):
        branch.add_module(layer_name, layer)
    return branch


class RepVGGBlock(nn.Module):
    '''Rep-style block with a multi-branch training form and a single-conv deploy form.

    While training, the output is ``ReLU(3x3 conv-bn + 1x1 conv-bn + identity bn)``;
    ``switch_to_deploy`` folds the branches into one 3x3 convolution.
    This code is based on https://github.com/DingXiaoH/RepVGG/blob/main/repvgg.py
    '''

    def __init__(self, in_channels, out_channels, kernel_size=3, stride=1,
                 padding=1, dilation=1, groups=1, padding_mode='zeros',
                 deploy=False, use_se=False):
        """Initialization of the class.

        Args:
            in_channels (int): Number of channels in the input image
            out_channels (int): Number of channels produced by the convolution
            kernel_size (int or tuple): Size of the convolving kernel (must be 3)
            stride (int or tuple, optional): Stride of the convolution. Default: 1
            padding (int or tuple, optional): Zero-padding added to both sides
                of the input (must be 1). Default: 1
            dilation (int or tuple, optional): Spacing between kernel elements;
                only used by the deploy-mode convolution. Default: 1
            groups (int, optional): Number of blocked connections from input
                channels to output channels. Default: 1
            padding_mode (string, optional): Only used by the deploy-mode
                convolution. Default: 'zeros'
            deploy: Whether to build the fused (deploy) form directly. Default: False
            use_se: Not supported; raises NotImplementedError when True. Default: False
        """
        super(RepVGGBlock, self).__init__()
        self.deploy = deploy
        self.groups = groups
        self.in_channels = in_channels
        self.out_channels = out_channels

        assert kernel_size == 3
        assert padding == 1

        # Padding for the 1x1 branch so it stays aligned with the 3x3 branch.
        padding_11 = padding - kernel_size // 2

        self.nonlinearity = nn.ReLU()

        if use_se:
            raise NotImplementedError("se block not supported yet")
        self.se = nn.Identity()

        if deploy:
            self.rbr_reparam = nn.Conv2d(in_channels=in_channels, out_channels=out_channels,
                                         kernel_size=kernel_size, stride=stride,
                                         padding=padding, dilation=dilation, groups=groups,
                                         bias=True, padding_mode=padding_mode)
            return

        # The identity branch only exists when input and output shapes match.
        keeps_shape = out_channels == in_channels and stride == 1
        self.rbr_identity = nn.BatchNorm2d(num_features=in_channels) if keeps_shape else None
        self.rbr_dense = conv_bn(in_channels=in_channels, out_channels=out_channels,
                                 kernel_size=kernel_size, stride=stride,
                                 padding=padding, groups=groups)
        self.rbr_1x1 = conv_bn(in_channels=in_channels, out_channels=out_channels,
                               kernel_size=1, stride=stride,
                               padding=padding_11, groups=groups)

    def forward(self, inputs):
        '''Forward process'''
        if hasattr(self, 'rbr_reparam'):
            fused = self.rbr_reparam(inputs)
            return self.nonlinearity(self.se(fused))

        identity = 0 if self.rbr_identity is None else self.rbr_identity(inputs)
        summed = self.rbr_dense(inputs) + self.rbr_1x1(inputs) + identity
        return self.nonlinearity(self.se(summed))

    def get_equivalent_kernel_bias(self):
        '''Return the single 3x3 kernel and bias equivalent to all branches.'''
        dense_k, dense_b = self._fuse_bn_tensor(self.rbr_dense)
        point_k, point_b = self._fuse_bn_tensor(self.rbr_1x1)
        ident_k, ident_b = self._fuse_bn_tensor(self.rbr_identity)
        kernel = dense_k + self._pad_1x1_to_3x3_tensor(point_k) + ident_k
        bias = dense_b + point_b + ident_b
        return kernel, bias

    def _pad_1x1_to_3x3_tensor(self, kernel1x1):
        '''Zero-pad a 1x1 kernel to 3x3 (``None`` maps to 0).'''
        if kernel1x1 is None:
            return 0
        return torch.nn.functional.pad(kernel1x1, [1, 1, 1, 1])

    def _fuse_bn_tensor(self, branch):
        '''Fold one branch's batch norm into a ``(kernel, bias)`` pair.'''
        if branch is None:
            return 0, 0

        if isinstance(branch, nn.Sequential):
            weight, bn = branch.conv.weight, branch.bn
        else:
            assert isinstance(branch, nn.BatchNorm2d)
            bn = branch
            if not hasattr(self, 'id_tensor'):
                # Identity expressed as a 3x3 kernel: a 1 at the centre tap
                # of each channel's own slot inside its group.
                per_group = self.in_channels // self.groups
                identity = np.zeros((self.in_channels, per_group, 3, 3), dtype=np.float32)
                channel_ids = np.arange(self.in_channels)
                identity[channel_ids, channel_ids % per_group, 1, 1] = 1
                self.id_tensor = torch.from_numpy(identity).to(bn.weight.device)
            weight = self.id_tensor

        std = (bn.running_var + bn.eps).sqrt()
        scale = (bn.weight / std).reshape(-1, 1, 1, 1)
        return weight * scale, bn.bias - bn.running_mean * bn.weight / std

    def switch_to_deploy(self):
        '''Replace the training branches with one fused 3x3 convolution.'''
        if hasattr(self, 'rbr_reparam'):
            return

        kernel, bias = self.get_equivalent_kernel_bias()
        dense = self.rbr_dense.conv
        self.rbr_reparam = nn.Conv2d(in_channels=dense.in_channels,
                                     out_channels=dense.out_channels,
                                     kernel_size=dense.kernel_size, stride=dense.stride,
                                     padding=dense.padding, dilation=dense.dilation,
                                     groups=dense.groups, bias=True)
        self.rbr_reparam.weight.data = kernel
        self.rbr_reparam.bias.data = bias
        for para in self.parameters():
            para.detach_()

        # Drop everything that only the training form needs.
        delattr(self, 'rbr_dense')
        delattr(self, 'rbr_1x1')
        for leftover in ('rbr_identity', 'id_tensor'):
            if hasattr(self, leftover):
                delattr(self, leftover)
        self.deploy = True
class SimFusion_4in(nn.Module):
    """Fuse four feature maps at the resolution of the third one.

    The first two inputs are adaptively average-pooled and the fourth is
    bilinearly resized to the spatial size of the third; all four are then
    concatenated along the channel axis.
    """

    def __init__(self):
        super().__init__()
        self.avg_pool = nn.functional.adaptive_avg_pool2d

    def forward(self, x):
        feat_l, feat_m, feat_s, feat_n = x
        _, _, height, width = feat_s.shape
        target = np.array([height, width])

        # During ONNX export switch to the AvgPool2d-based replacement.
        if torch.onnx.is_in_onnx_export():
            self.avg_pool = onnx_AdaptiveAvgPool2d

        pooled = [self.avg_pool(feat, target) for feat in (feat_l, feat_m)]
        resized = F.interpolate(feat_n, size=(height, width), mode='bilinear',
                                align_corners=False)
        return torch.cat([pooled[0], pooled[1], feat_s, resized], 1)
def get_shape(tensor):
    """Return ``tensor.shape``, converted to NumPy values during ONNX export.

    Outside of ONNX export the shape object is returned unchanged; during
    export every dimension is moved to the CPU and turned into a NumPy value.
    """
    dims = tensor.shape
    if not torch.onnx.is_in_onnx_export():
        return dims
    return [dim.cpu().numpy() for dim in dims]
def drop_path(x, drop_prob: float = 0., training: bool = False):
    """Drop paths (Stochastic Depth) per sample.

    Intended for the main path of residual blocks: during training each
    sample in the batch is zeroed with probability ``drop_prob`` and the
    surviving samples are rescaled by ``1 / (1 - drop_prob)``.

    Args:
        x (Tensor): Input whose first dimension is the batch.
        drop_prob (float | None): Probability of dropping a sample. ``0`` and
            ``None`` (the default used by ``DropPath``) disable dropping.
        training (bool): Dropping is only applied when True.

    Returns:
        Tensor: ``x`` itself when dropping is disabled, otherwise a new
        tensor of the same shape.
    """
    # `not drop_prob` covers both 0.0 and None; None previously reached
    # `1 - drop_prob` below and raised TypeError in training mode.
    if not drop_prob or not training:
        return x
    keep_prob = 1 - drop_prob
    if keep_prob <= 0:
        # Every path is dropped; avoid the division by zero (NaN output).
        return torch.zeros_like(x)
    # One Bernoulli draw per sample, broadcast over all remaining dims so the
    # function works for tensors of any rank, not just 2D ConvNets.
    shape = (x.shape[0],) + (1,) * (x.ndim - 1)
    random_tensor = keep_prob + torch.rand(shape, dtype=x.dtype, device=x.device)
    random_tensor.floor_()  # binarize
    return x.div(keep_prob) * random_tensor


class DropPath(nn.Module):
    """Module wrapper around ``drop_path`` (Stochastic Depth per sample).

    Args:
        drop_prob (float | None): Drop probability forwarded to ``drop_path``;
            ``None`` behaves like ``0`` (no dropping).
    """

    def __init__(self, drop_prob=None):
        super(DropPath, self).__init__()
        self.drop_prob = drop_prob

    def forward(self, x):
        return drop_path(x, self.drop_prob, self.training)
class AdvPoolFusion(nn.Module):
    """Pool the first input to the second input's size and concatenate them.

    ``forward`` takes a pair ``(x1, x2)``; ``x1`` is adaptively
    average-pooled to the height and width of ``x2`` and the two are joined
    along the channel axis.
    """

    def forward(self, x):
        source, reference = x

        # The pooling callable is re-selected on every call so that an ONNX
        # export picks up the AvgPool2d-based replacement.
        exporting = torch.onnx.is_in_onnx_export()
        self.pool = onnx_AdaptiveAvgPool2d if exporting else nn.functional.adaptive_avg_pool2d

        _, _, rows, cols = reference.shape
        pooled = self.pool(source, np.array([rows, cols]))
        return torch.cat([pooled, reference], 1)
================================================ # YOLOv5 🚀 by Ultralytics, AGPL-3.0 license # Parameters nc: 80 # number of classes depth_multiple: 0.33 # model depth multiple width_multiple: 0.25 # layer channel multiple anchors: - [10,13, 16,30, 33,23] # P3/8 - [30,61, 62,45, 59,119] # P4/16 - [116,90, 156,198, 373,326] # P5/32 # YOLOv5 v6.0 backbone backbone: # [from, number, module, args] [[-1, 1, Conv, [64, 6, 2, 2]], # 0-P1/2 [-1, 1, Conv, [128, 3, 2]], # 1-P2/4 [-1, 3, C3, [128]], [-1, 1, Conv, [256, 3, 2]], # 3-P3/8 [-1, 6, C3, [256]], [-1, 1, Conv, [512, 3, 2]], # 5-P4/16 [-1, 9, C3, [512]], [-1, 1, Conv, [1024, 3, 2]], # 7-P5/32 [-1, 3, C3, [1024]], [-1, 1, SPPF, [1024, 5]], # 9 ] # YOLOv5 v6.0 head head: [[[2, 4, 6, 9], 1, SimFusion_4in, []], # 10 [-1, 1, IFM, [[64, 32]]], # 11 [9, 1, Conv, [512, 1, 1]], # 12 [[4, 6, -1], 1, SimFusion_3in, [512]], # 13 [[-1, 11], 1, InjectionMultiSum_Auto_pool, [512, [64, 32], 0]], # 14 [-1, 3, C3, [512, False]], # 15 [6, 1, Conv, [256, 1, 1]], # 16 [[2, 4, -1], 1, SimFusion_3in, [256]], # 17 [[-1, 11], 1, InjectionMultiSum_Auto_pool, [256, [64, 32], 1]], # 18 [-1, 3, C3, [256, False]], # 19 [[19, 15, 9], 1, PyramidPoolAgg, [352, 2]], # 20 [-1, 1, TopBasicLayer, [352, [64, 128]]], # 21 [[19, 16], 1, AdvPoolFusion, []], # 22 [[-1, 21], 1, InjectionMultiSum_Auto_pool, [256, [64, 128], 0]], # 23 [-1, 3, C3, [256, False]], # 24 [[-1, 12], 1, AdvPoolFusion, []], # 25 [[-1, 21], 1, InjectionMultiSum_Auto_pool, [512, [64, 128], 1]], # 26 [-1, 3, C3, [512, False]], # 27 [[19, 24, 27], 1, Detect, [nc, anchors]] # 28 ] ================================================ FILE: yolo-improve/yolov5-GOLDYOLO/yolov7-goldyolo.yaml ================================================ # parameters nc: 80 # number of classes depth_multiple: 1.0 # model depth multiple width_multiple: 1.0 # layer channel multiple # anchors anchors: - [12,16, 19,36, 40,28] # P3/8 - [36,75, 76,55, 72,146] # P4/16 - [142,110, 192,243, 459,401] # P5/32 # yolov7 
backbone backbone: # [from, number, module, args] [[-1, 1, Conv, [32, 3, 1]], # 0 [-1, 1, Conv, [64, 3, 2]], # 1-P1/2 [-1, 1, Conv, [64, 3, 1]], [-1, 1, Conv, [128, 3, 2]], # 3-P2/4 [-1, 1, Yolov7_E_ELAN, [256, 64]], # 4 [-1, 1, V7DownSampling, [128]], # 5-P3/8 [-1, 1, Yolov7_E_ELAN, [512, 128]], # 6 [-1, 1, V7DownSampling, [256]], # 7-P4/16 [-1, 1, Yolov7_E_ELAN, [1024, 256]], # 8 [-1, 1, V7DownSampling, [512]], # 9-P5/32 [-1, 1, Yolov7_E_ELAN, [1024, 256]], # 10 ] # yolov7 head head: [[-1, 1, SPPCSPC, [512]], # 11-SPPCSPC [[4, 6, 8, 11], 1, SimFusion_4in, []], # 12 [-1, 1, IFM, [[64, 32]]], # 13 [11, 1, Conv, [1024, 1, 1]], # 14 [[6, 8, -1], 1, SimFusion_3in, [256]], # 15 [[-1, 13], 1, InjectionMultiSum_Auto_pool, [256, [64, 32], 0]], # 16 [-1, 1, Yolov7_E_ELAN_NECK, [256, 128]], # 17 [8, 1, Conv, [128, 1, 1]], # 18 [[4, 6, -1], 1, SimFusion_3in, [128]], # 19 [[-1, 13], 1, InjectionMultiSum_Auto_pool, [128, [64, 32], 1]], # 20 [-1, 1, Yolov7_E_ELAN_NECK, [128, 64]], # 21 [[21, 17, 11], 1, PyramidPoolAgg, [352, 2]], # 22 [-1, 1, TopBasicLayer, [352, [64, 128]]], # 23 [[21, 18], 1, AdvPoolFusion, []], # 24 [[-1, 23], 1, InjectionMultiSum_Auto_pool, [256, [64, 128], 0]], # 25 [-1, 1, Yolov7_E_ELAN_NECK, [256, 128]], # 26 [[-1, 14], 1, AdvPoolFusion, []], # 27 [[-1, 23], 1, InjectionMultiSum_Auto_pool, [512, [64, 128], 1]], # 28 [-1, 1, Yolov7_E_ELAN_NECK, [512, 256]], # 29 [21, 1, RepConv, [256, 3, 1]], # 30-P3 [26, 1, RepConv, [512, 3, 1]], # 31-P4 [29, 1, RepConv, [1024, 3, 1]], # 32-P5 [[30, 31, 32], 1, IDetect, [nc, anchors]] # 33 ] ================================================ FILE: yolo-improve/yolov5-GOLDYOLO/yolov7-tiny-goldyolo.yaml ================================================ # parameters nc: 80 # number of classes depth_multiple: 1.0 # model depth multiple width_multiple: 1.0 # layer channel multiple # anchors anchors: - [10,13, 16,30, 33,23] # P3/8 - [30,61, 62,45, 59,119] # P4/16 - [116,90, 156,198, 373,326] # P5/32 # yolov7-tiny backbone
def wasserstein_loss(pred, target, eps=1e-7, constant=12.8):
    r"""Normalized Gaussian Wasserstein Distance (NWD) similarity of box pairs.

    Each box ``(cx, cy, w, h)`` is treated as a 2-D Gaussian with mean
    ``(cx, cy)`` and half-extents ``(w / 2, h / 2)``. The squared
    2-Wasserstein distance between two such Gaussians is

    .. math::
        W_2^2 = (cx_1 - cx_2)^2 + (cy_1 - cy_2)^2
                + \frac{(w_1 - w_2)^2 + (h_1 - h_2)^2}{4}

    and the value returned is ``exp(-sqrt(W_2^2) / constant)``.

    Despite the function name this is a *similarity* in ``(0, 1]`` (close to
    1 for identical boxes); callers turn it into a loss with ``1 - nwd``.

    Args:
        pred (Tensor): Predicted boxes ``(x_center, y_center, w, h)`` with
            shape ``(..., 4)``; extra trailing columns are ignored.
        target (Tensor): Ground-truth boxes, same layout and shape as ``pred``.
        eps (float): Added to the squared centre distance so the square root
            stays differentiable when the boxes coincide.
        constant (float): Normalisation constant ``C`` in the exponent; it
            sets the distance scale at which the similarity decays.

    Return:
        Tensor: NWD similarity with shape ``pred.shape[:-1]``.
    """
    # Ellipsis indexing keeps the (n, 4) behaviour and also accepts any
    # number of leading batch dimensions.
    center_delta = pred[..., :2] - target[..., :2]
    center_distance = (center_delta ** 2).sum(dim=-1) + eps

    # The original added eps to every width/height before subtracting; it
    # cancels in the difference, so the sizes are subtracted directly.
    wh_delta = pred[..., 2:4] - target[..., 2:4]
    wh_distance = (wh_delta ** 2).sum(dim=-1) / 4

    wasserstein_2 = center_distance + wh_distance
    return torch.exp(-torch.sqrt(wasserstein_2) / constant)
BCEcls, BCEobj = FocalLoss(BCEcls, g), FocalLoss(BCEobj, g) det = de_parallel(model).model[-1] # Detect() module self.balance = {3: [4.0, 1.0, 0.4]}.get(det.nl, [4.0, 1.0, 0.25, 0.06, .02]) # P3-P7 self.ssi = list(det.stride).index(16) if autobalance else 0 # stride 16 index self.BCEcls, self.BCEobj, self.gr, self.hyp, self.autobalance = BCEcls, BCEobj, 1.0, h, autobalance for k in 'na', 'nc', 'nl', 'anchors', 'stride': setattr(self, k, getattr(det, k)) def __call__(self, p, targets, imgs): # predictions, targets, model device = targets.device lcls, lbox, lobj = torch.zeros(1, device=device), torch.zeros(1, device=device), torch.zeros(1, device=device) bs, as_, gjs, gis, targets, anchors = self.build_targets(p, targets, imgs) pre_gen_gains = [torch.tensor(pp.shape, device=device)[[3, 2, 3, 2]] for pp in p] # Losses for i, pi in enumerate(p): # layer index, layer predictions b, a, gj, gi = bs[i], as_[i], gjs[i], gis[i] # image, anchor, gridy, gridx tobj = torch.zeros_like(pi[..., 0], device=device) # target obj n = b.shape[0] # number of targets if n: ps = pi[b, a, gj, gi] # prediction subset corresponding to targets # Regression grid = torch.stack([gi, gj], dim=1) pxy = ps[:, :2].sigmoid() * 2. - 0.5 #pxy = ps[:, :2].sigmoid() * 3. - 1. 
pwh = (ps[:, 2:4].sigmoid() * 2) ** 2 * anchors[i] pbox = torch.cat((pxy, pwh), 1) # predicted box selected_tbox = targets[i][:, 2:6] * pre_gen_gains[i] selected_tbox[:, :2] -= grid iou = bbox_iou(pbox, selected_tbox, CIoU=True) # iou(prediction, target) if type(iou) is tuple: lbox += (iou[1].detach() * (1 - iou[0])).mean() iou = iou[0] else: lbox += (1.0 - iou).mean() # iou loss # Objectness tobj[b, a, gj, gi] = (1.0 - self.gr) + self.gr * iou.detach().clamp(0).type(tobj.dtype) # iou ratio # Classification selected_tcls = targets[i][:, 1].long() if self.nc > 1: # cls loss (only if multiple classes) t = torch.full_like(ps[:, 5:], self.cn, device=device) # targets t[range(n), selected_tcls] = self.cp lcls += self.BCEcls(ps[:, 5:], t) # BCE # Append targets to text file # with open('targets.txt', 'a') as file: # [file.write('%11.5g ' * 4 % tuple(x) + '\n') for x in torch.cat((txy[i], twh[i]), 1)] obji = self.BCEobj(pi[..., 4], tobj) lobj += obji * self.balance[i] # obj loss if self.autobalance: self.balance[i] = self.balance[i] * 0.9999 + 0.0001 / obji.detach().item() if self.autobalance: self.balance = [x / self.balance[self.ssi] for x in self.balance] lbox *= self.hyp['box'] lobj *= self.hyp['obj'] lcls *= self.hyp['cls'] bs = tobj.shape[0] # batch size loss = lbox + lobj + lcls return loss * bs, torch.cat((lbox, lobj, lcls)).detach() def build_targets(self, p, targets, imgs): indices, anch = self.find_3_positive(p, targets) device = torch.device(targets.device) matching_bs = [[] for pp in p] matching_as = [[] for pp in p] matching_gjs = [[] for pp in p] matching_gis = [[] for pp in p] matching_targets = [[] for pp in p] matching_anchs = [[] for pp in p] nl = len(p) for batch_idx in range(p[0].shape[0]): b_idx = targets[:, 0]==batch_idx this_target = targets[b_idx] if this_target.shape[0] == 0: continue txywh = this_target[:, 2:6] * imgs[batch_idx].shape[1] txyxy = xywh2xyxy(txywh) pxyxys = [] p_cls = [] p_obj = [] from_which_layer = [] all_b = [] all_a = [] all_gj 
= [] all_gi = [] all_anch = [] for i, pi in enumerate(p): b, a, gj, gi = indices[i] idx = (b == batch_idx) b, a, gj, gi = b[idx], a[idx], gj[idx], gi[idx] all_b.append(b) all_a.append(a) all_gj.append(gj) all_gi.append(gi) all_anch.append(anch[i][idx]) from_which_layer.append((torch.ones(size=(len(b),)) * i).to(device)) fg_pred = pi[b, a, gj, gi] p_obj.append(fg_pred[:, 4:5]) p_cls.append(fg_pred[:, 5:]) grid = torch.stack([gi, gj], dim=1) pxy = (fg_pred[:, :2].sigmoid() * 2. - 0.5 + grid) * self.stride[i] #/ 8. #pxy = (fg_pred[:, :2].sigmoid() * 3. - 1. + grid) * self.stride[i] pwh = (fg_pred[:, 2:4].sigmoid() * 2) ** 2 * anch[i][idx] * self.stride[i] #/ 8. pxywh = torch.cat([pxy, pwh], dim=-1) pxyxy = xywh2xyxy(pxywh) pxyxys.append(pxyxy) pxyxys = torch.cat(pxyxys, dim=0) if pxyxys.shape[0] == 0: continue p_obj = torch.cat(p_obj, dim=0) p_cls = torch.cat(p_cls, dim=0) from_which_layer = torch.cat(from_which_layer, dim=0) all_b = torch.cat(all_b, dim=0) all_a = torch.cat(all_a, dim=0) all_gj = torch.cat(all_gj, dim=0) all_gi = torch.cat(all_gi, dim=0) all_anch = torch.cat(all_anch, dim=0) pair_wise_iou = box_iou(txyxy, pxyxys) pair_wise_iou_loss = -torch.log(pair_wise_iou + 1e-8) top_k, _ = torch.topk(pair_wise_iou, min(10, pair_wise_iou.shape[1]), dim=1) dynamic_ks = torch.clamp(top_k.sum(1).int(), min=1) gt_cls_per_image = ( F.one_hot(this_target[:, 1].to(torch.int64), self.nc) .float() .unsqueeze(1) .repeat(1, pxyxys.shape[0], 1) ) num_gt = this_target.shape[0] cls_preds_ = ( p_cls.float().unsqueeze(0).repeat(num_gt, 1, 1).sigmoid_() * p_obj.unsqueeze(0).repeat(num_gt, 1, 1).sigmoid_() ) y = cls_preds_.sqrt_() pair_wise_cls_loss = F.binary_cross_entropy_with_logits( torch.log(y/(1-y)) , gt_cls_per_image, reduction="none" ).sum(-1) del cls_preds_ cost = ( pair_wise_cls_loss + 3.0 * pair_wise_iou_loss ) matching_matrix = torch.zeros_like(cost, device=device) for gt_idx in range(num_gt): _, pos_idx = torch.topk( cost[gt_idx], k=dynamic_ks[gt_idx].item(), 
def find_3_positive(self, p, targets):
    """Collect candidate positive grid cells for every target on every layer.

    Args:
        p: per-layer predictions. Only ``p[i].shape`` is read; entries 3 and 2
            of that shape scale the x and y coordinates into grid units.
        targets: tensor with one row per target laid out as
            ``(image, class, x, y, w, h)``.

    Returns:
        ``(indices, anch)``: two lists with one entry per layer.
        ``indices[i]`` is a tuple ``(image, anchor, grid_y, grid_x)`` of index
        tensors and ``anch[i]`` holds the anchor selected for each candidate.
    """
    device = targets.device
    num_anchors, num_targets = self.na, targets.shape[0]
    indices, anch = [], []

    # Per-column multiplier; columns 2..5 are overwritten with the grid size below.
    gain = torch.ones(7, device=device).long()

    # Repeat the targets once per anchor and append the anchor id as column 6.
    anchor_ids = torch.arange(num_anchors, device=device).float().view(num_anchors, 1).repeat(1, num_targets)
    targets = torch.cat((targets.repeat(num_anchors, 1, 1), anchor_ids[:, :, None]), 2)

    bias = 0.5
    # The cell itself plus its four direct neighbours, scaled to half a cell.
    off = torch.tensor([[0, 0], [1, 0], [0, 1], [-1, 0], [0, -1]], device=device).float() * bias

    for i in range(self.nl):
        anchors = self.anchors[i]
        gain[2:6] = torch.tensor(p[i].shape)[[3, 2, 3, 2]]  # xyxy gain

        t = targets * gain
        if num_targets:
            # Keep (anchor, target) pairs whose w/h ratio stays below hyp['anchor_t'].
            ratio = t[:, :, 4:6] / anchors[:, None]
            keep = torch.max(ratio, 1. / ratio).max(2)[0] < self.hyp['anchor_t']
            t = t[keep]

            # Flag which neighbouring cells each target also falls close to.
            gxy = t[:, 2:4]
            gxi = gain[[2, 3]] - gxy  # distance measured from the opposite border
            lo_x, lo_y = ((gxy % 1. < bias) & (gxy > 1.)).T
            hi_x, hi_y = ((gxi % 1. < bias) & (gxi > 1.)).T
            select = torch.stack((torch.ones_like(lo_x), lo_x, lo_y, hi_x, hi_y))
            t = t.repeat((5, 1, 1))[select]
            offsets = (torch.zeros_like(gxy)[None] + off[:, None])[select]
        else:
            t = targets[0]
            offsets = 0

        b, _ = t[:, :2].long().T  # image index (class column is not needed here)
        gij = (t[:, 2:4] - offsets).long()
        gi, gj = gij.T  # grid x / grid y indices
        a = t[:, 6].long()  # anchor indices

        indices.append((b, a, gj.clamp_(0, gain[3] - 1), gi.clamp_(0, gain[2] - 1)))
        anch.append(anchors[a])

    return indices, anch
torch.zeros((channels, input_dim, kernel_size, kernel_size)) k[np.arange(channels), np.tile(np.arange(input_dim), groups), :, :] = 1.0 / kernel_size ** 2 return k def _pad_1x1_to_3x3_tensor(self, kernel1x1): if kernel1x1 is None: return 0 else: return torch.nn.functional.pad(kernel1x1, [1, 1, 1, 1]) def _fuse_bn_tensor(self, branch): if branch is None: return 0, 0 if isinstance(branch, Conv): kernel = branch.conv.weight running_mean = branch.bn.running_mean running_var = branch.bn.running_var gamma = branch.bn.weight beta = branch.bn.bias eps = branch.bn.eps elif isinstance(branch, nn.BatchNorm2d): if not hasattr(self, 'id_tensor'): input_dim = self.c1 // self.g kernel_value = np.zeros((self.c1, input_dim, 3, 3), dtype=np.float32) for i in range(self.c1): kernel_value[i, i % input_dim, 1, 1] = 1 self.id_tensor = torch.from_numpy(kernel_value).to(branch.weight.device) kernel = self.id_tensor running_mean = branch.running_mean running_var = branch.running_var gamma = branch.weight beta = branch.bias eps = branch.eps std = (running_var + eps).sqrt() t = (gamma / std).reshape(-1, 1, 1, 1) return kernel * t, beta - running_mean * gamma / std def fuse_convs(self): if hasattr(self, 'conv'): return kernel, bias = self.get_equivalent_kernel_bias() self.conv = nn.Conv2d(in_channels=self.conv1.conv.in_channels, out_channels=self.conv1.conv.out_channels, kernel_size=self.conv1.conv.kernel_size, stride=self.conv1.conv.stride, padding=self.conv1.conv.padding, dilation=self.conv1.conv.dilation, groups=self.conv1.conv.groups, bias=True).requires_grad_(False) self.conv.weight.data = kernel self.conv.bias.data = bias for para in self.parameters(): para.detach_() self.__delattr__('conv1') self.__delattr__('conv2') if hasattr(self, 'nm'): self.__delattr__('nm') if hasattr(self, 'bn'): self.__delattr__('bn') if hasattr(self, 'id_tensor'): self.__delattr__('id_tensor') class RepNBottleneck(nn.Module): # Standard bottleneck def __init__(self, c1, c2, shortcut=True, g=1, k=(3, 3), e=0.5): 
class RepNCSPELAN4(nn.Module):
    """CSP-ELAN block built from two chained RepNCSP branches.

    Args:
        c1: input channels.
        c2: output channels.
        c3: channels produced by the first 1x1 conv; split into two halves.
        c4: channels of each RepNCSP branch.
        c5: repeat count forwarded to every RepNCSP.
    """

    def __init__(self, c1, c2, c3, c4, c5=1):
        super().__init__()
        half = c3 // 2
        self.c = half
        self.cv1 = Conv(c1, c3, 1, 1)
        self.cv2 = nn.Sequential(RepNCSP(half, c4, c5), Conv(c4, c4, 3, 1))
        self.cv3 = nn.Sequential(RepNCSP(c4, c4, c5), Conv(c4, c4, 3, 1))
        self.cv4 = Conv(c3 + (2 * c4), c2, 1, 1)

    def _aggregate(self, parts):
        """Run the two branches in sequence and fuse all intermediate maps."""
        feats = list(parts)
        for branch in (self.cv2, self.cv3):
            # Each branch consumes the most recently produced feature map.
            feats.append(branch(feats[-1]))
        return self.cv4(torch.cat(feats, 1))

    def forward(self, x):
        """Split the stem output with ``chunk`` and aggregate the branches."""
        return self._aggregate(self.cv1(x).chunk(2, 1))

    def forward_split(self, x):
        """Same as ``forward`` but splits with explicit sizes ``(self.c, self.c)``."""
        return self._aggregate(self.cv1(x).split((self.c, self.c), 1))
class ConvAWS2d(nn.Conv2d):
    """Conv2d whose kernel is standardized per output channel, then rescaled.

    The effective kernel is ``weight_gamma * (w - mean) / std + weight_beta``,
    where mean and std are computed over each output channel of ``w`` and
    ``weight_gamma`` / ``weight_beta`` are buffers of shape
    ``(out_channels, 1, 1, 1)``.

    Constructor arguments are those of ``nn.Conv2d``.
    """

    def __init__(self, in_channels, out_channels, kernel_size, stride=1, padding=0, dilation=1, groups=1, bias=True):
        super().__init__(
            in_channels,
            out_channels,
            kernel_size,
            stride=stride,
            padding=padding,
            dilation=dilation,
            groups=groups,
            bias=bias)
        self.register_buffer('weight_gamma', torch.ones(self.out_channels, 1, 1, 1))
        self.register_buffer('weight_beta', torch.zeros(self.out_channels, 1, 1, 1))

    @staticmethod
    def _channel_mean(weight):
        """Mean over every dimension except the output-channel one (dims kept)."""
        return weight.mean(dim=1, keepdim=True).mean(dim=2, keepdim=True).mean(dim=3, keepdim=True)

    @staticmethod
    def _channel_std(weight):
        """Per-output-channel std (with 1e-5 added to the variance), shaped (out, 1, 1, 1)."""
        return torch.sqrt(weight.view(weight.size(0), -1).var(dim=1) + 1e-5).view(-1, 1, 1, 1)

    def _get_weight(self, weight):
        """Return the standardized, gamma/beta-rescaled version of ``weight``."""
        weight = weight - self._channel_mean(weight)
        weight = weight / self._channel_std(weight)
        return self.weight_gamma * weight + self.weight_beta

    def forward(self, x):
        """Convolve ``x`` with the standardized kernel.

        The layer's own bias is applied; it is ``None`` when the layer was
        built with ``bias=False``. Previously the bias parameter was created
        but never used.
        """
        weight = self._get_weight(self.weight)
        return super()._conv_forward(x, weight, self.bias)

    def _load_from_state_dict(self, state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys,
                              error_msgs):
        """Load parameters; derive gamma/beta from the kernel if the checkpoint lacks them.

        ``weight_gamma`` is set to -1 before loading as a sentinel: if its mean
        is still non-positive afterwards, the checkpoint did not provide it and
        gamma/beta are initialised from the loaded kernel's per-channel std and
        mean, so that ``_get_weight`` reproduces the loaded kernel.
        """
        self.weight_gamma.data.fill_(-1)
        super()._load_from_state_dict(state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys,
                                      error_msgs)
        if self.weight_gamma.data.mean() > 0:
            return
        weight = self.weight.data
        self.weight_beta.data.copy_(self._channel_mean(weight.data))
        self.weight_gamma.data.copy_(self._channel_std(weight))
class TSCODE_Detect(nn.Module):
    """YOLOv5 detection head with separate context encoders for classification and localization.

    For each detection level the classification branch reads ``SCE`` features
    (this level and the next one) and the regression/objectness branch reads
    ``DPE`` features (previous, current and next level). ``ch`` therefore has
    one more entry on each side than there are detection levels: level ``i``
    uses ``ch[i]``, ``ch[i + 1]`` and ``ch[i + 2]``.

    Args:
        nc: number of classes.
        anchors: per-level flat anchor lists ``(w0, h0, w1, h1, ...)``.
        ch: channel counts of all incoming feature maps.
        inplace: stored on the module; not read by this class.
    """
    stride = None  # strides computed during build
    dynamic = False  # force grid reconstruction
    export = False  # export mode

    def __init__(self, nc=80, anchors=(), ch=(), inplace=True):  # detection layer
        super().__init__()
        self.nc = nc  # number of classes
        self.no = nc + 5  # number of outputs per anchor
        self.nl = len(anchors)  # number of detection layers
        self.na = len(anchors[0]) // 2  # number of anchors
        self.grid = [torch.empty(0) for _ in range(self.nl)]  # init grid
        self.anchor_grid = [torch.empty(0) for _ in range(self.nl)]  # init anchor grid
        self.register_buffer('anchors', torch.tensor(anchors).float().view(self.nl, -1, 2))  # shape(nl,na,2)

        # The class branch predicts at the next (coarser) level's resolution and is
        # unfolded by ph x pw in forward(), so it emits ph * pw times more channels.
        self.ph, self.pw = 2, 2
        levels = range(1, len(ch) - 1)  # indices into ch that own a detection level

        self.m_sce = nn.ModuleList(SCE(ch[k:k + 2]) for k in levels)
        self.m_dpe = nn.ModuleList(DPE(ch[k - 1:k + 2], ch[k]) for k in levels)
        self.m_cls = nn.ModuleList(
            nn.Sequential(Conv(sum(ch[k:k + 2]), ch[k], 1),
                          Conv(ch[k], ch[k], 3),
                          nn.Conv2d(ch[k], self.na * self.nc * self.ph * self.pw, 1))
            for k in levels)  # cls conv
        self.m_reg_conf = nn.ModuleList(
            nn.Sequential(*[Conv(ch[k], ch[k], 3) for _ in range(2)]) for k in levels)  # reg_conf stem conv
        self.m_reg = nn.ModuleList(nn.Conv2d(ch[k], self.na * 4, 1) for k in levels)  # reg conv
        self.m_conf = nn.ModuleList(nn.Conv2d(ch[k], self.na * 1, 1) for k in levels)  # conf conv
        self.inplace = inplace  # use inplace ops (e.g. slice assignment)

    def forward(self, x_):
        """Run the head on the list of feature maps ``x_``.

        Returns the list of raw per-level tensors ``(bs, na, ny, nx, no)`` in
        training mode; otherwise ``(decoded,)`` when ``export`` is set or
        ``(decoded, raw_list)``, where ``decoded`` concatenates all levels.
        """
        x, z = [], []  # raw outputs, inference output
        for i, idx in enumerate(range(1, self.nl + 1)):
            bs, _, ny, nx = x_[idx].shape
            x_sce, x_dpe = self.m_sce[i](x_[idx:idx + 2]), self.m_dpe[i](x_[idx - 1:idx + 2])

            # Unfold the (na * ph * pw * nc) channels into per-anchor class maps.
            # The leading factor is the number of anchors (na), not the number
            # of detection layers: using nl only worked when nl == na.
            x_cls = rearrange(self.m_cls[i](x_sce), 'bs (na ph pw nc) h w -> bs na nc (h ph) (w pw)',
                              na=self.na, ph=self.ph, pw=self.pw, nc=self.nc)
            x_cls = x_cls.permute(0, 1, 3, 4, 2).contiguous()

            x_reg_conf = self.m_reg_conf[i](x_dpe)
            x_reg = self.m_reg[i](x_reg_conf).view(bs, self.na, 4, ny, nx).permute(0, 1, 3, 4, 2).contiguous()
            x_conf = self.m_conf[i](x_reg_conf).view(bs, self.na, 1, ny, nx).permute(0, 1, 3, 4, 2).contiguous()
            x.append(torch.cat([x_reg, x_conf, x_cls], dim=4))

            if not self.training:  # inference
                if self.dynamic or self.grid[i].shape[2:4] != x[i].shape[2:4]:
                    self.grid[i], self.anchor_grid[i] = self._make_grid(nx, ny, i)

                if isinstance(self, Segment):  # (boxes + masks)
                    xy, wh, conf, mask = x[i].split((2, 2, self.nc + 1, self.no - self.nc - 5), 4)
                    xy = (xy.sigmoid() * 2 + self.grid[i]) * self.stride[i]  # xy
                    wh = (wh.sigmoid() * 2) ** 2 * self.anchor_grid[i]  # wh
                    y = torch.cat((xy, wh, conf.sigmoid(), mask), 4)
                else:  # Detect (boxes only)
                    xy, wh, conf = x[i].sigmoid().split((2, 2, self.nc + 1), 4)
                    xy = (xy * 2 + self.grid[i]) * self.stride[i]  # xy
                    wh = (wh * 2) ** 2 * self.anchor_grid[i]  # wh
                    y = torch.cat((xy, wh, conf), 4)
                z.append(y.view(bs, self.na * nx * ny, self.no))

        return x if self.training else (torch.cat(z, 1),) if self.export else (torch.cat(z, 1), x)

    def _make_grid(self, nx=20, ny=20, i=0, torch_1_10=check_version(torch.__version__, '1.10.0')):
        """Build the cell-offset grid and the stride-scaled anchor grid for level ``i``."""
        d = self.anchors[i].device
        t = self.anchors[i].dtype
        shape = 1, self.na, ny, nx, 2  # grid shape
        y, x = torch.arange(ny, device=d, dtype=t), torch.arange(nx, device=d, dtype=t)
        yv, xv = torch.meshgrid(y, x, indexing='ij') if torch_1_10 else torch.meshgrid(y, x)  # torch>=0.7 compatibility
        grid = torch.stack((xv, yv), 2).expand(shape) - 0.5  # add grid offset, i.e. y = 2.0 * x - 0.5
        anchor_grid = (self.anchors[i] * self.stride[i]).view((1, self.na, 1, 1, 2)).expand(shape)
        return grid, anchor_grid
slice assignment) def forward(self, x): z = [] # inference output for i in range(self.nl): x[i] = self.m_stem[i](x[i]) # conv bs, _, ny, nx = x[i].shape x_cls = self.m_cls[i](x[i]).view(bs, self.na, self.nc, ny, nx).permute(0, 1, 3, 4, 2).contiguous() x_reg_conf = self.m_reg_conf[i](x[i]) x_reg = self.m_reg[i](x_reg_conf).view(bs, self.na, 4, ny, nx).permute(0, 1, 3, 4, 2).contiguous() x_conf = self.m_conf[i](x_reg_conf).view(bs, self.na, 1, ny, nx).permute(0, 1, 3, 4, 2).contiguous() x[i] = torch.cat([x_reg, x_conf, x_cls], dim=4) if not self.training: # inference if self.dynamic or self.grid[i].shape[2:4] != x[i].shape[2:4]: self.grid[i], self.anchor_grid[i] = self._make_grid(nx, ny, i) if isinstance(self, Segment): # (boxes + masks) xy, wh, conf, mask = x[i].split((2, 2, self.nc + 1, self.no - self.nc - 5), 4) xy = (xy.sigmoid() * 2 + self.grid[i]) * self.stride[i] # xy wh = (wh.sigmoid() * 2) ** 2 * self.anchor_grid[i] # wh y = torch.cat((xy, wh, conf.sigmoid(), mask), 4) else: # Detect (boxes only) xy, wh, conf = x[i].sigmoid().split((2, 2, self.nc + 1), 4) xy = (xy * 2 + self.grid[i]) * self.stride[i] # xy wh = (wh * 2) ** 2 * self.anchor_grid[i] # wh y = torch.cat((xy, wh, conf), 4) z.append(y.view(bs, self.na * nx * ny, self.no)) return x if self.training else (torch.cat(z, 1),) if self.export else (torch.cat(z, 1), x) def _make_grid(self, nx=20, ny=20, i=0, torch_1_10=check_version(torch.__version__, '1.10.0')): d = self.anchors[i].device t = self.anchors[i].dtype shape = 1, self.na, ny, nx, 2 # grid shape y, x = torch.arange(ny, device=d, dtype=t), torch.arange(nx, device=d, dtype=t) yv, xv = torch.meshgrid(y, x, indexing='ij') if torch_1_10 else torch.meshgrid(y, x) # torch>=0.7 compatibility grid = torch.stack((xv, yv), 2).expand(shape) - 0.5 # add grid offset, i.e. 
class DPE(nn.Module):
    """Fuse three neighbouring feature levels into one map at the middle level's size.

    Args:
        c1: channel counts of the three inputs, in the order they are passed
            to ``forward``.
        c2: channel count of the fused output.
    """

    def __init__(self, c1, c2):
        super().__init__()
        ch_first, ch_mid, ch_last = c1[0], c1[1], c1[2]
        self.adjust_channel_forp1 = Conv(ch_first, c2, k=1)
        self.adjust_channel_forp2 = Conv(ch_mid, c2, k=1)
        self.up_forp2 = nn.Sequential(
            nn.Upsample(scale_factor=2),
            Conv(c2, c2, k=1),
        )
        self.up_forp3 = nn.Sequential(
            nn.Upsample(scale_factor=2),
            Conv(ch_last, c2, k=1),
        )
        self.down = Conv(c2, c2, k=3, s=2)
        # Not used in forward(); kept so existing checkpoints keep loading.
        self.middle = Conv(c2, c2, k=1)

    def forward(self, x):
        """Return the sum of the three aligned feature maps.

        ``x[0]`` is merged with the upsampled middle map and then downsampled
        by the stride-2 conv, ``x[1]`` is only channel-adjusted, and ``x[2]``
        is upsampled by 2.
        """
        mid = self.adjust_channel_forp2(x[1])
        merged = self.adjust_channel_forp1(x[0]) + self.up_forp2(mid)
        first = self.down(merged)
        last = self.up_forp3(x[2])
        return first + mid + last
class aLRPLoss(torch.autograd.Function):
    """aLRP ranking-based classification loss with a hand-written gradient.

    ``forward`` computes the loss value together with the gradient w.r.t. the
    logits and stores that gradient; ``backward`` only rescales it.
    All work tensors are created on the device of ``logits``, so the function
    runs on CPU as well as on any accelerator.
    """

    @staticmethod
    def forward(ctx, logits, targets, regression_losses, delta=1., eps=1e-5):
        """Compute the loss.

        Args:
            logits: 1-D tensor of classification logits.
            targets: tensor shaped like ``logits``; entries equal to 1 are
                foreground, entries equal to 0 are background.
            regression_losses: one localisation loss per foreground logit, in
                the order the foreground entries appear in ``logits``.
            delta: half-width of the piecewise-linear step used for ranking.
            eps: minimum soft false-positive count for a foreground example
                to contribute gradients.

        Returns:
            ``(cls_loss, rank, order)``: the scalar loss ``1 - mean precision``,
            the soft rank of every foreground example, and the ascending
            argsort of the foreground logits. With no foreground example the
            loss is 0 and ``rank`` / ``order`` are empty.
        """
        device = logits.device
        classification_grads = torch.zeros(logits.shape, device=device)

        # Filter fg logits
        fg_labels = (targets == 1)
        fg_logits = logits[fg_labels]
        fg_num = len(fg_logits)

        if fg_num == 0:
            # Nothing to rank: min() below would raise and the final division
            # by fg_num would be 0/0, so return a zero loss with zero gradient.
            ctx.save_for_backward(classification_grads)
            return (torch.zeros((), device=device),
                    torch.zeros(0, device=device),
                    torch.zeros(0, dtype=torch.long, device=device))

        # Do not use bg with scores less than minimum fg logit
        # since changing its score does not have an effect on precision
        threshold_logit = torch.min(fg_logits) - delta

        # Get valid bg logits
        relevant_bg_labels = ((targets == 0) & (logits >= threshold_logit))
        relevant_bg_logits = logits[relevant_bg_labels]
        relevant_bg_grad = torch.zeros(len(relevant_bg_logits), device=device)
        rank = torch.zeros(fg_num, device=device)
        prec = torch.zeros(fg_num, device=device)
        fg_grad = torch.zeros(fg_num, device=device)

        # sort the fg logits
        order = torch.argsort(fg_logits)
        # Loops over each positive following the order
        for ii in order:
            # x_ij s as score differences with fgs
            fg_relations = fg_logits - fg_logits[ii]
            # Apply piecewise linear function and determine relations with fgs
            fg_relations = torch.clamp(fg_relations / (2 * delta) + 0.5, min=0, max=1)
            # Discard i=j in the summation in rank_pos
            fg_relations[ii] = 0

            # x_ij s as score differences with bgs
            bg_relations = relevant_bg_logits - fg_logits[ii]
            # Apply piecewise linear function and determine relations with bgs
            bg_relations = torch.clamp(bg_relations / (2 * delta) + 0.5, min=0, max=1)

            # Compute the rank of the example within fgs and number of bgs with larger scores
            rank_pos = 1 + torch.sum(fg_relations)
            FP_num = torch.sum(bg_relations)
            # Store the total since it is normalizer also for aLRP Regression error
            rank[ii] = rank_pos + FP_num

            # Compute precision for this example to compute classification loss
            prec[ii] = rank_pos / rank[ii]
            # For stability, only compute grads when the soft FP count exceeds eps
            if FP_num > eps:
                fg_grad[ii] = -(torch.sum(fg_relations * regression_losses) + FP_num) / rank[ii]
                relevant_bg_grad += (bg_relations * (-fg_grad[ii] / FP_num))

        # aLRP with grad formulation fg gradient
        classification_grads[fg_labels] = fg_grad
        # aLRP with grad formulation bg gradient
        classification_grads[relevant_bg_labels] = relevant_bg_grad

        classification_grads /= (fg_num)

        cls_loss = 1 - prec.mean()
        ctx.save_for_backward(classification_grads)

        return cls_loss, rank, order

    @staticmethod
    def backward(ctx, out_grad1, out_grad2, out_grad3):
        """Return the stored logits gradient scaled by the incoming loss gradient.

        Only ``logits`` receives a gradient; the other forward arguments get ``None``.
        """
        g1, = ctx.saved_tensors
        return g1 * out_grad1, None, None, None, None
torch.zeros(pi.shape[:4], dtype=pi.dtype, device=self.device) # target obj n = b.shape[0] # number of targets if n: # pxy, pwh, _, pcls = pi[b, a, gj, gi].tensor_split((2, 4, 5), dim=1) # faster, requires torch 1.8.0 pxy, pwh, _, pcls = pi[b, a, gj, gi].split((2, 2, 1, self.nc), 1) # target-subset of predictions # Regression pxy = pxy.sigmoid() * 2 - 0.5 pwh = (pwh.sigmoid() * 2) ** 2 * anchors[i] pbox = torch.cat((pxy, pwh), 1) # predicted box iou = bbox_iou(pbox, tbox[i], CIoU=True).squeeze() # iou(prediction, target) # Classification if self.nc > 1: # cls loss (only if multiple classes) t = torch.full_like(pcls, self.cn, device=self.device) # targets t[range(n), tcls[i]] = self.cp # lcls += self.BCEcls(pcls, t) # BCE lbox_temp = 1.0 - iou losses_cls, rank, order = self.aLRP_Loss.apply(pcls.reshape(-1), t.reshape(-1), lbox_temp.detach()) ordered_losses_bbox = lbox_temp[order.detach()].flip(dims=[0]) losses_bbox = (torch.cumsum(ordered_losses_bbox,dim=0)/rank[order.detach()].detach().flip(dims=[0])).mean() self.cls_LRP_hist.append(float(losses_cls.item())) self.reg_LRP_hist.append(float(losses_bbox.item())) self.counter += 1 if self.counter == self.period: self.SB_weight = (np.mean(self.reg_LRP_hist)+np.mean(self.cls_LRP_hist))/np.mean(self.reg_LRP_hist) self.cls_LRP_hist.clear() self.reg_LRP_hist.clear() self.counter=0 lbox += losses_bbox * self.SB_weight # iou loss lcls += losses_cls # Objectness iou = iou.detach().clamp(0).type(tobj.dtype) if self.sort_obj_iou: j = iou.argsort() b, a, gj, gi, iou = b[j], a[j], gj[j], gi[j], iou[j] if self.gr < 1: iou = (1.0 - self.gr) + self.gr * iou tobj[b, a, gj, gi] = iou # iou ratio # Append targets to text file # with open('targets.txt', 'a') as file: # [file.write('%11.5g ' * 4 % tuple(x) + '\n') for x in torch.cat((txy[i], twh[i]), 1)] obji = self.BCEobj(pi[..., 4], tobj) lobj += obji * self.balance[i] # obj loss if self.autobalance: self.balance[i] = self.balance[i] * 0.9999 + 0.0001 / obji.detach().item() if 
class Zoom_cat(nn.Module):
    """Bring three feature maps to a common resolution and concatenate them."""

    def __init__(self):
        super().__init__()

    def forward(self, x):
        """Merge the large, medium and small scale maps at the medium scale.

        ``x[0]`` (large) is reduced by the sum of adaptive max and average
        pooling, ``x[2]`` (small) is resized with nearest-neighbour
        interpolation, and both are concatenated with ``x[1]`` (medium) along
        the channel dimension.
        """
        large, medium, small = x[0], x[1], x[2]
        target_hw = medium.shape[2:]
        pooled_max = F.adaptive_max_pool2d(large, target_hw)
        pooled_avg = F.adaptive_avg_pool2d(large, target_hw)
        resized_small = F.interpolate(small, target_hw, mode='nearest')
        return torch.cat([pooled_max + pooled_avg, medium, resized_small], dim=1)
class local_att(nn.Module):
    """Gate a feature map with separate per-row and per-column sigmoid weights.

    Args:
        channel: number of input (and output) channels.
        reduction: the shared 1x1 bottleneck has ``channel // reduction`` channels.
    """

    def __init__(self, channel, reduction=16):
        super(local_att, self).__init__()
        squeezed = channel // reduction

        self.conv_1x1 = nn.Conv2d(in_channels=channel, out_channels=squeezed, kernel_size=1, stride=1, bias=False)

        self.relu = nn.ReLU()
        self.bn = nn.BatchNorm2d(squeezed)

        self.F_h = nn.Conv2d(in_channels=squeezed, out_channels=channel, kernel_size=1, stride=1, bias=False)
        self.F_w = nn.Conv2d(in_channels=squeezed, out_channels=channel, kernel_size=1, stride=1, bias=False)

        self.sigmoid_h = nn.Sigmoid()
        self.sigmoid_w = nn.Sigmoid()

    def forward(self, x):
        """Return ``x`` scaled by the height gate and the width gate."""
        _, _, height, width = x.shape

        # Average over width -> (b, c, h, 1), laid out as (b, c, 1, h).
        row_profile = x.mean(dim=3, keepdim=True).transpose(2, 3)
        # Average over height -> (b, c, 1, w).
        col_profile = x.mean(dim=2, keepdim=True)

        # Both profiles share one bottleneck, so they are processed as one strip.
        strip = torch.cat((row_profile, col_profile), 3)
        strip = self.relu(self.bn(self.conv_1x1(strip)))
        row_feat, col_feat = torch.split(strip, [height, width], 3)

        gate_h = self.sigmoid_h(self.F_h(row_feat.transpose(2, 3)))
        gate_w = self.sigmoid_w(self.F_w(col_feat))

        return x * gate_h.expand_as(x) * gate_w.expand_as(x)
YOLOv5 🚀 by Ultralytics, AGPL-3.0 license # Parameters nc: 80 # number of classes depth_multiple: 0.33 # model depth multiple width_multiple: 0.25 # layer channel multiple anchors: - [10,13, 16,30, 33,23] # P3/8 - [30,61, 62,45, 59,119] # P4/16 - [116,90, 156,198, 373,326] # P5/32 # YOLOv5 v6.0 backbone backbone: # [from, number, module, args] [[-1, 1, Conv, [64, 6, 2, 2]], # 0-P1/2 [-1, 1, Conv, [128, 3, 2]], # 1-P2/4 [-1, 3, C3, [128]], [-1, 1, Conv, [256, 3, 2]], # 3-P3/8 [-1, 6, C3, [256]], [-1, 1, Conv, [512, 3, 2]], # 5-P4/16 [-1, 9, C3, [512]], [-1, 1, Conv, [1024, 3, 2]], # 7-P5/32 [-1, 3, C3, [1024]], [-1, 1, SPPF, [1024, 5]], # 9 ] # YOLOv5 v6.0 head head: [[-1, 1, Conv, [512, 1, 1]], #10 [4, 1, Conv, [512, 1, 1]], #11 [[-1, 6, -2], 1, Zoom_cat, []], # 12 cat backbone P4 [-1, 3, C3, [512, False]], # 13 [-1, 1, Conv, [256, 1, 1]], #14 [2, 1, Conv, [256, 1, 1]], #15 [[-1, 4, -2], 1, Zoom_cat, []], #16 cat backbone P3 [-1, 3, C3, [256, False]], # 17 (P3/8-small) [-1, 1, Conv, [256, 3, 2]], #18 [[-1, 14], 1, Concat, [1]], #19 cat head P4 [-1, 3, C3, [512, False]], # 20 (P4/16-medium) [-1, 1, Conv, [512, 3, 2]], #21 [[-1, 10], 1, Concat, [1]], #22 cat head P5 [-1, 3, C3, [1024, False]], # 23 (P5/32-large) [[4, 6, 8], 1, ScalSeq, [256]], #24 args[inchane] [[17, -1], 1, attention_model, []], #25 [[25, 20, 23], 1, Detect, [nc, anchors]], # Detect(P3, P4, P5) ] ================================================ FILE: yolo-improve/yolov5-backbone/CVPR2023-EfficientViT/EfficientViT.py ================================================ # -------------------------------------------------------- # EfficientViT Model Architecture for Downstream Tasks # Copyright (c) 2022 Microsoft # Written by: Xinyu Liu # -------------------------------------------------------- import torch import torch.nn as nn import torch.nn.functional as F import torch.utils.checkpoint as checkpoint import itertools from timm.models.layers import SqueezeExcite import numpy as np import itertools 
class Conv2d_BN(torch.nn.Sequential):
    """Bias-free ``Conv2d`` (sub-module ``c``) followed by ``BatchNorm2d`` (sub-module ``bn``).

    Args:
        a: Input channels.
        b: Output channels.
        ks: Convolution kernel size.
        stride: Convolution stride.
        pad: Convolution padding.
        dilation: Convolution dilation.
        groups: Convolution groups.
        bn_weight_init: Constant used to initialise the batch-norm weight.
        resolution: Accepted for signature compatibility; not used here.
    """

    def __init__(self, a, b, ks=1, stride=1, pad=0, dilation=1,
                 groups=1, bn_weight_init=1, resolution=-10000):
        super().__init__()
        self.add_module('c', torch.nn.Conv2d(
            a, b, ks, stride, pad, dilation, groups, bias=False))
        self.add_module('bn', torch.nn.BatchNorm2d(b))
        torch.nn.init.constant_(self.bn.weight, bn_weight_init)
        torch.nn.init.constant_(self.bn.bias, 0)

    @torch.no_grad()
    def fuse(self):
        """Return one ``Conv2d`` equivalent to conv + batch norm using the running statistics.

        The fused layer is created on the same device and with the same dtype
        as the source convolution, so fusing a model that was moved to another
        device or cast to another precision gives a directly usable layer.
        """
        c, bn = self._modules.values()
        w = bn.weight / (bn.running_var + bn.eps) ** 0.5
        w = c.weight * w[:, None, None, None]
        b = bn.bias - bn.running_mean * bn.weight / \
            (bn.running_var + bn.eps) ** 0.5
        m = torch.nn.Conv2d(w.size(1) * c.groups, w.size(0), w.shape[2:],
                            stride=c.stride, padding=c.padding,
                            dilation=c.dilation, groups=c.groups)
        # A freshly built Conv2d is CPU/float32 by default; follow the source weights.
        m = m.to(device=w.device, dtype=w.dtype)
        m.weight.data.copy_(w)
        m.bias.data.copy_(b)
        return m


def replace_batchnorm(net):
    """Fold batch norms away in place, recursively over ``net``'s children.

    A child exposing ``fuse`` is replaced by ``child.fuse()``; a bare
    ``BatchNorm2d`` child is replaced by ``Identity``; any other child is
    descended into.
    """
    for child_name, child in net.named_children():
        if hasattr(child, 'fuse'):
            setattr(net, child_name, child.fuse())
        elif isinstance(child, torch.nn.BatchNorm2d):
            setattr(net, child_name, torch.nn.Identity())
        else:
            replace_batchnorm(child)
class FFN(torch.nn.Module):
    """Two point-wise ``Conv2d_BN`` layers with a ReLU in between.

    Args:
        ed: Input and output channel count.
        h: Hidden channel count.
        resolution: Forwarded to both ``Conv2d_BN`` layers.
    """

    def __init__(self, ed, h, resolution):
        super().__init__()
        self.pw1 = Conv2d_BN(ed, h, resolution=resolution)
        self.act = torch.nn.ReLU()
        # The second batch norm starts with zero weight.
        self.pw2 = Conv2d_BN(h, ed, bn_weight_init=0, resolution=resolution)

    def forward(self, x):
        """Apply expand -> ReLU -> project."""
        hidden = self.pw1(x)
        hidden = self.act(hidden)
        return self.pw2(hidden)
class LocalWindowAttention(torch.nn.Module):
    r""" Local Window Attention.

    Args:
        dim (int): Number of input channels.
        key_dim (int): The dimension for query and key.
        num_heads (int): Number of attention heads.
        attn_ratio (int): Multiplier for the query dim for value dimension.
        resolution (int): Input resolution.
        window_resolution (int): Local window resolution.
        kernels (List[int]): The kernel size of the dw conv on query.
    """
    def __init__(self, dim, key_dim, num_heads=8,
                 attn_ratio=4,
                 resolution=14,
                 window_resolution=7,
                 kernels=[5, 5, 5, 5],):
        super().__init__()
        self.dim = dim
        self.num_heads = num_heads
        self.resolution = resolution
        assert window_resolution > 0, 'window_size must be greater than 0'
        self.window_resolution = window_resolution

        # The attention is built for window_resolution x window_resolution inputs.
        self.attn = CascadedGroupAttention(dim, key_dim, num_heads,
                                           attn_ratio=attn_ratio,
                                           resolution=window_resolution,
                                           kernels=kernels,)

    def forward(self, x):
        """Apply windowed attention to ``x`` of shape (B, C, H, W); the shape is preserved.

        Inputs that are exactly one window in size go straight to the
        attention.  Everything else, including inputs smaller than a window,
        is zero-padded on the bottom/right to a multiple of the window size,
        split into windows, attended per window, reassembled and cropped back.
        """
        B, C, H, W = x.shape
        win = self.window_resolution

        # The attention module was built for win x win inputs, so only an
        # exact match may skip the padding/partition path.
        if H == win and W == win:
            return self.attn(x)

        x = x.permute(0, 2, 3, 1)
        pad_b = (win - H % win) % win
        pad_r = (win - W % win) % win
        padding = pad_b > 0 or pad_r > 0

        if padding:
            x = torch.nn.functional.pad(x, (0, 0, 0, pad_r, 0, pad_b))

        pH, pW = H + pad_b, W + pad_r
        nH = pH // win
        nW = pW // win
        # window partition, BHWC -> B(nHh)(nWw)C -> BnHnWhwC -> (BnHnW)hwC -> (BnHnW)Chw
        x = x.view(B, nH, win, nW, win, C).transpose(2, 3).reshape(
            B * nH * nW, win, win, C
        ).permute(0, 3, 1, 2)
        x = self.attn(x)
        # window reverse, (BnHnW)Chw -> (BnHnW)hwC -> BnHnWhwC -> B(nHh)(nWw)C -> BHWC
        x = x.permute(0, 2, 3, 1).view(B, nH, nW, win, win,
                                       C).transpose(2, 3).reshape(B, pH, pW, C)

        if padding:
            x = x[:, :H, :W].contiguous()

        return x.permute(0, 3, 1, 2)
class EfficientViT(torch.nn.Module):
    """Three-stage EfficientViT backbone returning one feature map per stage.

    Args:
        img_size: Nominal input size, used only to derive the ``resolution``
            hints passed to the sub-modules.
        patch_size: Divisor applied to ``img_size`` for the stage-1 hint.
        frozen_stages: Accepted for signature compatibility; not used here.
        in_chans: Number of input image channels.
        stages: Token-mixer type per stage.
        embed_dim: Channel width per stage.
        key_dim: Query/key dimension per stage.
        depth: Number of ``EfficientViTBlock`` per stage.
        num_heads: Attention heads per stage.
        window_size: Local window size per stage.
        kernels: Depth-wise kernel sizes forwarded to every block.
        down_ops: Per-stage ``['subsample', stride]`` to downsample into the
            next stage, or ``['']`` for none.
        pretrained: Accepted for signature compatibility; not used here.
        distillation: Accepted for signature compatibility; not used here.

    Attributes:
        channel: Channel count of each of the three outputs, measured with a
            probe forward pass at construction time.
    """

    def __init__(self, img_size=400,
                 patch_size=16,
                 frozen_stages=0,
                 in_chans=3,
                 stages=['s', 's', 's'],
                 embed_dim=[64, 128, 192],
                 key_dim=[16, 16, 16],
                 depth=[1, 2, 3],
                 num_heads=[4, 4, 4],
                 window_size=[7, 7, 7],
                 kernels=[5, 5, 5, 5],
                 down_ops=[['subsample', 2], ['subsample', 2], ['']],
                 pretrained=None,
                 distillation=False,):
        super().__init__()

        resolution = img_size
        self.patch_embed = torch.nn.Sequential(
            Conv2d_BN(in_chans, embed_dim[0] // 8, 3, 2, 1, resolution=resolution), torch.nn.ReLU(),
            Conv2d_BN(embed_dim[0] // 8, embed_dim[0] // 4, 3, 2, 1, resolution=resolution // 2), torch.nn.ReLU(),
            Conv2d_BN(embed_dim[0] // 4, embed_dim[0] // 2, 3, 2, 1, resolution=resolution // 4), torch.nn.ReLU(),
            Conv2d_BN(embed_dim[0] // 2, embed_dim[0], 3, 1, 1, resolution=resolution // 8))

        resolution = img_size // patch_size
        attn_ratio = [embed_dim[i] / (key_dim[i] * num_heads[i]) for i in range(len(embed_dim))]

        # Plain lists, one per stage, indexed directly instead of resolving
        # attribute names through eval().
        stage_blocks = [[], [], []]

        for i, (stg, ed, kd, dpth, nh, ar, wd, do) in enumerate(
                zip(stages, embed_dim, key_dim, depth, num_heads, attn_ratio, window_size, down_ops)):
            for _ in range(dpth):
                stage_blocks[i].append(EfficientViTBlock(stg, ed, kd, nh, ar, resolution, wd, kernels))
            if do[0] == 'subsample':
                # The downsampling layers open the NEXT stage.
                blk = stage_blocks[i + 1]
                resolution_ = (resolution - 1) // do[1] + 1
                blk.append(torch.nn.Sequential(
                    Residual(Conv2d_BN(embed_dim[i], embed_dim[i], 3, 1, 1, groups=embed_dim[i], resolution=resolution)),
                    Residual(FFN(embed_dim[i], int(embed_dim[i] * 2), resolution)),))
                blk.append(PatchMerging(*embed_dim[i:i + 2], resolution))
                resolution = resolution_
                blk.append(torch.nn.Sequential(
                    Residual(Conv2d_BN(embed_dim[i + 1], embed_dim[i + 1], 3, 1, 1, groups=embed_dim[i + 1], resolution=resolution)),
                    Residual(FFN(embed_dim[i + 1], int(embed_dim[i + 1] * 2), resolution)),))

        self.blocks1 = torch.nn.Sequential(*stage_blocks[0])
        self.blocks2 = torch.nn.Sequential(*stage_blocks[1])
        self.blocks3 = torch.nn.Sequential(*stage_blocks[2])

        # Probe pass to record output widths; it must use the configured
        # number of input channels and needs no autograd graph.
        with torch.no_grad():
            self.channel = [feat.size(1) for feat in self.forward(torch.randn(1, in_chans, 640, 640))]

    def forward(self, x):
        """Return ``[stage1, stage2, stage3]`` feature maps for image batch ``x``."""
        outs = []
        x = self.patch_embed(x)
        x = self.blocks1(x)
        outs.append(x)
        x = self.blocks2(x)
        outs.append(x)
        x = self.blocks3(x)
        outs.append(x)
        return outs
224, 'patch_size': 16, 'embed_dim': [192, 288, 384], 'depth': [1, 3, 4], 'num_heads': [3, 3, 4], 'window_size': [7, 7, 7], 'kernels': [7, 5, 3, 3], } def EfficientViT_M0(pretrained='', frozen_stages=0, distillation=False, fuse=False, pretrained_cfg=None, model_cfg=EfficientViT_m0): model = EfficientViT(frozen_stages=frozen_stages, distillation=distillation, pretrained=pretrained, **model_cfg) if pretrained: model.load_state_dict(update_weight(model.state_dict(), torch.load(pretrained)['model'])) if fuse: replace_batchnorm(model) return model def EfficientViT_M1(pretrained='', frozen_stages=0, distillation=False, fuse=False, pretrained_cfg=None, model_cfg=EfficientViT_m1): model = EfficientViT(frozen_stages=frozen_stages, distillation=distillation, pretrained=pretrained, **model_cfg) if pretrained: model.load_state_dict(update_weight(model.state_dict(), torch.load(pretrained)['model'])) if fuse: replace_batchnorm(model) return model def EfficientViT_M2(pretrained='', frozen_stages=0, distillation=False, fuse=False, pretrained_cfg=None, model_cfg=EfficientViT_m2): model = EfficientViT(frozen_stages=frozen_stages, distillation=distillation, pretrained=pretrained, **model_cfg) if pretrained: model.load_state_dict(update_weight(model.state_dict(), torch.load(pretrained)['model'])) if fuse: replace_batchnorm(model) return model def EfficientViT_M3(pretrained='', frozen_stages=0, distillation=False, fuse=False, pretrained_cfg=None, model_cfg=EfficientViT_m3): model = EfficientViT(frozen_stages=frozen_stages, distillation=distillation, pretrained=pretrained, **model_cfg) if pretrained: model.load_state_dict(update_weight(model.state_dict(), torch.load(pretrained)['model'])) if fuse: replace_batchnorm(model) return model def EfficientViT_M4(pretrained='', frozen_stages=0, distillation=False, fuse=False, pretrained_cfg=None, model_cfg=EfficientViT_m4): model = EfficientViT(frozen_stages=frozen_stages, distillation=distillation, pretrained=pretrained, **model_cfg) if 
def update_weight(model_dict, weight_dict):
    """Copy shape-compatible entries of ``weight_dict`` into ``model_dict``.

    An entry is taken only when its key already exists in ``model_dict`` and
    both values report the same ``np.shape``.  ``model_dict`` is updated in
    place and returned; a one-line summary of how many entries were taken is
    printed.
    """
    matched = {
        name: value
        for name, value in weight_dict.items()
        if name in model_dict and np.shape(model_dict[name]) == np.shape(value)
    }
    model_dict.update(matched)
    print(f'loading weights... {len(matched)}/{len(model_dict)} items')
    return model_dict
class ConvBN(torch.nn.Sequential):
    """``Conv2d`` registered as ``conv``, optionally followed by ``BatchNorm2d`` as ``bn``.

    Args:
        in_planes: Input channels.
        out_planes: Output channels.
        kernel_size: Convolution kernel size.
        stride: Convolution stride.
        padding: Convolution padding.
        dilation: Convolution dilation.
        groups: Convolution groups.
        with_bn: When true, append a batch norm with weight 1 and bias 0.
    """

    def __init__(self, in_planes, out_planes, kernel_size=1, stride=1, padding=0, dilation=1, groups=1, with_bn=True):
        super().__init__()
        conv = torch.nn.Conv2d(
            in_planes,
            out_planes,
            kernel_size=kernel_size,
            stride=stride,
            padding=padding,
            dilation=dilation,
            groups=groups,
        )
        self.add_module('conv', conv)
        if not with_bn:
            return
        norm = torch.nn.BatchNorm2d(out_planes)
        torch.nn.init.constant_(norm.weight, 1)
        torch.nn.init.constant_(norm.bias, 0)
        self.add_module('bn', norm)
class StarNet(nn.Module):
    """StarNet backbone returning the stem output followed by every stage output.

    Args:
        base_dim: Channel width of the first stage; stage ``i`` uses
            ``base_dim * 2 ** i`` channels.
        depths: Number of ``Block`` modules in each stage.
        mlp_ratio: Expansion ratio forwarded to every ``Block``.
        drop_path_rate: Final stochastic-depth rate; per-block rates increase
            linearly from 0 to this value.
        num_classes: Stored on the instance; no classifier is built here.
        **kwargs: Accepted and ignored.

    Attributes:
        channel: Channel count of each returned feature map, measured with a
            probe forward pass at construction time.
    """

    def __init__(self, base_dim=32, depths=[3, 3, 12, 5], mlp_ratio=4, drop_path_rate=0.0, num_classes=1000, **kwargs):
        super().__init__()
        self.num_classes = num_classes
        self.in_channel = 32
        # stem layer
        self.stem = nn.Sequential(ConvBN(3, self.in_channel, kernel_size=3, stride=2, padding=1), nn.ReLU6())
        dpr = [x.item() for x in torch.linspace(0, drop_path_rate, sum(depths))]  # stochastic depth
        # build stages
        self.stages = nn.ModuleList()
        cur = 0
        for i_layer, stage_depth in enumerate(depths):
            embed_dim = base_dim * 2 ** i_layer
            down_sampler = ConvBN(self.in_channel, embed_dim, 3, 2, 1)
            self.in_channel = embed_dim
            blocks = [Block(self.in_channel, mlp_ratio, dpr[cur + i]) for i in range(stage_depth)]
            cur += stage_depth
            self.stages.append(nn.Sequential(down_sampler, *blocks))

        self.channel = [i.size(1) for i in self.forward(torch.randn(1, 3, 640, 640))]
        self.apply(self._init_weights)

    def _init_weights(self, m):
        """Initialise one sub-module; intended for use with ``nn.Module.apply``.

        Linear and Conv2d weights get a truncated normal (std 0.02); Linear
        biases are zeroed.  LayerNorm and BatchNorm2d are reset to weight 1 and
        bias 0.  Other modules are left untouched.
        """
        # isinstance needs a tuple: `A or B` evaluates to just `A`.
        if isinstance(m, (nn.Linear, nn.Conv2d)):
            trunc_normal_(m.weight, std=.02)
            if isinstance(m, nn.Linear) and m.bias is not None:
                nn.init.constant_(m.bias, 0)
        elif isinstance(m, (nn.LayerNorm, nn.BatchNorm2d)):
            nn.init.constant_(m.bias, 0)
            nn.init.constant_(m.weight, 1.0)

    def forward(self, x):
        """Return ``[stem_out, stage1_out, ..., stageN_out]`` for image batch ``x``."""
        features = []
        x = self.stem(x)
        features.append(x)
        for stage in self.stages:
            x = stage(x)
            features.append(x)
        return features
class LayerNorm(nn.Module):
    """Layer normalisation over the channel axis for two memory layouts.

    ``channels_last`` (default) expects inputs shaped
    (batch_size, height, width, channels); ``channels_first`` expects
    (batch_size, channels, height, width).  Any other ``data_format`` raises
    ``NotImplementedError`` at construction time.
    """

    def __init__(self, normalized_shape, eps=1e-6, data_format="channels_last"):
        super().__init__()
        self.weight = nn.Parameter(torch.ones(normalized_shape))
        self.bias = nn.Parameter(torch.zeros(normalized_shape))
        self.eps = eps
        self.data_format = data_format
        supported = ("channels_last", "channels_first")
        if self.data_format not in supported:
            raise NotImplementedError
        self.normalized_shape = (normalized_shape, )

    def forward(self, x):
        """Normalise ``x`` along its channel axis and apply the affine parameters."""
        if self.data_format == "channels_last":
            return F.layer_norm(x, self.normalized_shape, self.weight, self.bias, self.eps)
        if self.data_format == "channels_first":
            # Manual normalisation over dim 1 (biased variance).
            mean = x.mean(1, keepdim=True)
            var = (x - mean).pow(2).mean(1, keepdim=True)
            normed = (x - mean) / torch.sqrt(var + self.eps)
            return self.weight[:, None, None] * normed + self.bias[:, None, None]


class GRN(nn.Module):
    """GRN (Global Response Normalization) layer for channels-last inputs.

    ``gamma`` and ``beta`` are created as zeros with shape (1, 1, 1, dim).
    """

    def __init__(self, dim):
        super().__init__()
        self.gamma = nn.Parameter(torch.zeros(1, 1, 1, dim))
        self.beta = nn.Parameter(torch.zeros(1, 1, 1, dim))

    def forward(self, x):
        """Return ``gamma * (x * Nx) + beta + x``.

        ``Nx`` is the per-channel L2 norm over dims 1 and 2 divided by its
        mean across channels (plus 1e-6).
        """
        global_norm = torch.norm(x, p=2, dim=(1, 2), keepdim=True)
        channel_mean = global_norm.mean(dim=-1, keepdim=True)
        relative = global_norm / (channel_mean + 1e-6)
        return self.gamma * (x * relative) + self.beta + x
class ConvNeXtV2(nn.Module):
    """ ConvNeXt V2 backbone returning one feature map per stage.

    Args:
        in_chans (int): Number of input image channels. Default: 3
        num_classes (int): Number of classes for classification head. Default: 1000
        depths (tuple(int)): Number of blocks at each stage. Default: [3, 3, 9, 3]
        dims (int): Feature dimension at each stage. Default: [96, 192, 384, 768]
        drop_path_rate (float): Stochastic depth rate. Default: 0.
        head_init_scale (float): Init scaling value for classifier weights and biases. Default: 1.
            Accepted but not used in this class.

    Attributes:
        channel: Channel count of each of the four outputs, measured with a
            probe forward pass at construction time.
    """
    def __init__(self, in_chans=3, num_classes=1000,
                 depths=[3, 3, 9, 3], dims=[96, 192, 384, 768],
                 drop_path_rate=0., head_init_scale=1.
                 ):
        super().__init__()
        self.depths = depths
        self.downsample_layers = nn.ModuleList()  # stem and 3 intermediate downsampling conv layers
        stem = nn.Sequential(
            nn.Conv2d(in_chans, dims[0], kernel_size=4, stride=4),
            LayerNorm(dims[0], eps=1e-6, data_format="channels_first")
        )
        self.downsample_layers.append(stem)
        for i in range(3):
            downsample_layer = nn.Sequential(
                LayerNorm(dims[i], eps=1e-6, data_format="channels_first"),
                nn.Conv2d(dims[i], dims[i + 1], kernel_size=2, stride=2),
            )
            self.downsample_layers.append(downsample_layer)

        self.stages = nn.ModuleList()  # 4 feature resolution stages, each consisting of multiple residual blocks
        dp_rates = [x.item() for x in torch.linspace(0, drop_path_rate, sum(depths))]
        cur = 0
        for i in range(4):
            stage = nn.Sequential(
                *[Block(dim=dims[i], drop_path=dp_rates[cur + j]) for j in range(depths[i])]
            )
            self.stages.append(stage)
            cur += depths[i]

        # norm and head are created but forward() does not call them.
        self.norm = nn.LayerNorm(dims[-1], eps=1e-6)  # final norm layer
        self.head = nn.Linear(dims[-1], num_classes)

        self.apply(self._init_weights)
        # Probe pass to record output widths; it must use the configured
        # number of input channels and needs no autograd graph.
        with torch.no_grad():
            self.channel = [feat.size(1) for feat in self.forward(torch.randn(1, in_chans, 640, 640))]

    def _init_weights(self, m):
        """Truncated-normal weights (std 0.02) and zero bias for Conv2d and Linear."""
        if isinstance(m, (nn.Conv2d, nn.Linear)):
            trunc_normal_(m.weight, std=.02)
            nn.init.constant_(m.bias, 0)

    def forward(self, x):
        """Return the list of the four stage outputs for image batch ``x``."""
        res = []
        for i in range(4):
            x = self.downsample_layers[i](x)
            x = self.stages[i](x)
            res.append(x)
        return res
convnextv2_large(weights='', **kwargs): model = ConvNeXtV2(depths=[3, 3, 27, 3], dims=[192, 384, 768, 1536], **kwargs) if weights: model.load_state_dict(update_weight(model.state_dict(), torch.load(weights)['model'])) return model def convnextv2_huge(weights='', **kwargs): model = ConvNeXtV2(depths=[3, 3, 27, 3], dims=[352, 704, 1408, 2816], **kwargs) if weights: model.load_state_dict(update_weight(model.state_dict(), torch.load(weights)['model'])) return model ================================================ FILE: yolo-improve/yolov5-backbone/EMO/emo.py ================================================ import math import numpy as np import torch.nn as nn from einops import rearrange, reduce from timm.models.layers.activations import * from timm.models.layers import DropPath, trunc_normal_, create_attn from timm.models.efficientnet_blocks import num_groups, SqueezeExcite as SE from functools import partial __all__ = ['EMO_1M', 'EMO_2M', 'EMO_5M', 'EMO_6M'] inplace = True def get_act(act_layer='relu'): act_dict = { 'none': nn.Identity, 'sigmoid': Sigmoid, 'swish': Swish, 'mish': Mish, 'hsigmoid': HardSigmoid, 'hswish': HardSwish, 'hmish': HardMish, 'tanh': Tanh, 'relu': nn.ReLU, 'relu6': nn.ReLU6, 'prelu': PReLU, 'gelu': GELU, 'silu': nn.SiLU } return act_dict[act_layer] class LayerNorm2d(nn.Module): def __init__(self, normalized_shape, eps=1e-6, elementwise_affine=True): super().__init__() self.norm = nn.LayerNorm(normalized_shape, eps, elementwise_affine) def forward(self, x): x = rearrange(x, 'b c h w -> b h w c').contiguous() x = self.norm(x) x = rearrange(x, 'b h w c -> b c h w').contiguous() return x def get_norm(norm_layer='in_1d'): eps = 1e-6 norm_dict = { 'none': nn.Identity, 'in_1d': partial(nn.InstanceNorm1d, eps=eps), 'in_2d': partial(nn.InstanceNorm2d, eps=eps), 'in_3d': partial(nn.InstanceNorm3d, eps=eps), 'bn_1d': partial(nn.BatchNorm1d, eps=eps), 'bn_2d': partial(nn.BatchNorm2d, eps=eps), 'bn_3d': partial(nn.BatchNorm3d, eps=eps), 'gn': 
class ConvNormAct(nn.Module):
    """Conv2d -> normalisation -> activation, with an optional residual connection.

    The norm and activation layers are looked up by name through ``get_norm``
    and ``get_act``.  The residual path is used only when ``skip`` is true and
    ``dim_in == dim_out``; in that case the main branch passes through
    drop-path before the shortcut is added.
    """

    def __init__(self, dim_in, dim_out, kernel_size, stride=1, dilation=1, groups=1, bias=False,
                 skip=False, norm_layer='bn_2d', act_layer='relu', inplace=True, drop_path_rate=0.):
        super(ConvNormAct, self).__init__()
        self.has_skip = skip and dim_in == dim_out
        padding = math.ceil((kernel_size - stride) / 2)
        self.conv = nn.Conv2d(dim_in, dim_out, kernel_size, stride, padding, dilation, groups, bias)
        norm_cls = get_norm(norm_layer)
        self.norm = norm_cls(dim_out)
        act_cls = get_act(act_layer)
        self.act = act_cls(inplace=inplace)
        if drop_path_rate:
            self.drop_path = DropPath(drop_path_rate)
        else:
            self.drop_path = nn.Identity()

    def forward(self, x):
        """Run the conv/norm/act branch and add the shortcut when enabled."""
        out = self.act(self.norm(self.conv(x)))
        if not self.has_skip:
            return out
        return self.drop_path(out) + x
v_proj=True, dw_ks=3, stride=1, dilation=1, se_ratio=0.0, dim_head=64, window_size=7, attn_s=True, qkv_bias=False, attn_drop=0., drop=0., drop_path=0., v_group=False, attn_pre=False): super().__init__() self.norm = get_norm(norm_layer)(dim_in) if norm_in else nn.Identity() dim_mid = int(dim_in * exp_ratio) self.has_skip = (dim_in == dim_out and stride == 1) and has_skip self.attn_s = attn_s if self.attn_s: assert dim_in % dim_head == 0, 'dim should be divisible by num_heads' self.dim_head = dim_head self.window_size = window_size self.num_head = dim_in // dim_head self.scale = self.dim_head ** -0.5 self.attn_pre = attn_pre self.qk = ConvNormAct(dim_in, int(dim_in * 2), kernel_size=1, bias=qkv_bias, norm_layer='none', act_layer='none') self.v = ConvNormAct(dim_in, dim_mid, kernel_size=1, groups=self.num_head if v_group else 1, bias=qkv_bias, norm_layer='none', act_layer=act_layer, inplace=inplace) self.attn_drop = nn.Dropout(attn_drop) else: if v_proj: self.v = ConvNormAct(dim_in, dim_mid, kernel_size=1, bias=qkv_bias, norm_layer='none', act_layer=act_layer, inplace=inplace) else: self.v = nn.Identity() self.conv_local = ConvNormAct(dim_mid, dim_mid, kernel_size=dw_ks, stride=stride, dilation=dilation, groups=dim_mid, norm_layer='bn_2d', act_layer='silu', inplace=inplace) self.se = SE(dim_mid, rd_ratio=se_ratio, act_layer=get_act(act_layer)) if se_ratio > 0.0 else nn.Identity() self.proj_drop = nn.Dropout(drop) self.proj = ConvNormAct(dim_mid, dim_out, kernel_size=1, norm_layer='none', act_layer='none', inplace=inplace) self.drop_path = DropPath(drop_path) if drop_path else nn.Identity() def forward(self, x): shortcut = x x = self.norm(x) B, C, H, W = x.shape if self.attn_s: # padding if self.window_size <= 0: window_size_W, window_size_H = W, H else: window_size_W, window_size_H = self.window_size, self.window_size pad_l, pad_t = 0, 0 pad_r = (window_size_W - W % window_size_W) % window_size_W pad_b = (window_size_H - H % window_size_H) % window_size_H x = F.pad(x, 
class EMO(nn.Module):
    """EMO backbone returning one feature map per stage.

    The network is a stem (``stage0``) followed by four stages of ``iRMB``
    blocks (``stage1`` .. ``stage4``). The first block of every stage uses
    stride 2, no skip connection, no attention and a doubled expansion ratio.
    ``forward`` returns the five stage outputs as a list; the last one is
    passed through ``self.norm``.

    Args:
        dim_in: Number of input image channels.
        num_classes: Stored on the model; must be positive.
        img_size: Accepted for signature compatibility; not used here.
        depths: Number of blocks in each of the four stages.
        stem_dim: Channel width of the stem.
        embed_dims: Channel width of each stage.
        exp_ratios: Expansion ratio of each stage.
        norm_layers: ``get_norm`` key for each stage.
        act_layers: ``get_act`` key for each stage.
        dw_kss: Depth-wise kernel size of each stage.
        se_ratios: Squeeze-excite ratio of each stage.
        dim_heads: Attention head width of each stage.
        window_sizes: Attention window size of each stage.
        attn_ss: Whether the non-first blocks of each stage use attention.
        qkv_bias, attn_drop, drop, v_group, attn_pre: Forwarded to ``iRMB``.
        drop_path: Maximum drop-path rate; per-block rates are spaced linearly
            from 0 to this value over all blocks.
        pre_dim: Input width used by ``reset_classifier``; a non-positive value
            falls back to ``embed_dims[-1]``.

    Attributes:
        channel: Channel count of every returned feature map, measured with a
            dummy 640x640 forward pass at construction time.
    """

    def __init__(self, dim_in=3, num_classes=1000, img_size=224,
                 depths=[1, 2, 4, 2], stem_dim=16, embed_dims=[64, 128, 256, 512], exp_ratios=[4., 4., 4., 4.],
                 norm_layers=['bn_2d', 'bn_2d', 'bn_2d', 'bn_2d'], act_layers=['relu', 'relu', 'relu', 'relu'],
                 dw_kss=[3, 3, 5, 5], se_ratios=[0.0, 0.0, 0.0, 0.0], dim_heads=[32, 32, 32, 32],
                 window_sizes=[7, 7, 7, 7], attn_ss=[False, False, True, True], qkv_bias=True,
                 attn_drop=0., drop=0., drop_path=0., v_group=False, attn_pre=False, pre_dim=0):
        super().__init__()
        self.num_classes = num_classes
        assert num_classes > 0
        # reset_classifier() reads self.pre_dim, which was previously never set.
        self.pre_dim = pre_dim if pre_dim > 0 else embed_dims[-1]
        dprs = [x.item() for x in torch.linspace(0, drop_path, sum(depths))]
        self.stage0 = nn.ModuleList([
            MSPatchEmb(  # down to 112
                dim_in, stem_dim, kernel_size=dw_kss[0], c_group=1, stride=2, dilations=[1],
                norm_layer=norm_layers[0], act_layer='none'),
            iRMB(  # ds
                stem_dim, stem_dim, norm_in=False, has_skip=False, exp_ratio=1,
                norm_layer=norm_layers[0], act_layer=act_layers[0], v_proj=False, dw_ks=dw_kss[0],
                stride=1, dilation=1, se_ratio=1,
                dim_head=dim_heads[0], window_size=window_sizes[0], attn_s=False,
                qkv_bias=qkv_bias, attn_drop=attn_drop, drop=drop, drop_path=0.,
                attn_pre=attn_pre
            )
        ])
        emb_dim_pre = stem_dim
        for i in range(len(depths)):
            layers = []
            dpr = dprs[sum(depths[:i]):sum(depths[:i + 1])]
            for j in range(depths[i]):
                if j == 0:
                    # First block of a stage down-samples and widens instead of attending.
                    stride, has_skip, attn_s, exp_ratio = 2, False, False, exp_ratios[i] * 2
                else:
                    stride, has_skip, attn_s, exp_ratio = 1, True, attn_ss[i], exp_ratios[i]
                layers.append(iRMB(
                    emb_dim_pre, embed_dims[i], norm_in=True, has_skip=has_skip, exp_ratio=exp_ratio,
                    norm_layer=norm_layers[i], act_layer=act_layers[i], v_proj=True, dw_ks=dw_kss[i],
                    stride=stride, dilation=1, se_ratio=se_ratios[i],
                    dim_head=dim_heads[i], window_size=window_sizes[i], attn_s=attn_s,
                    qkv_bias=qkv_bias, attn_drop=attn_drop, drop=drop, drop_path=dpr[j], v_group=v_group,
                    attn_pre=attn_pre
                ))
                emb_dim_pre = embed_dims[i]
            setattr(self, f'stage{i + 1}', nn.ModuleList(layers))

        self.norm = get_norm(norm_layers[-1])(embed_dims[-1])
        # Parameter-free placeholder so get_classifier() works before
        # reset_classifier() has been called.
        self.head = nn.Identity()
        self.apply(self._init_weights)
        # Probe with dim_in channels (was hard-coded to 3) and without building
        # an autograd graph for the throw-away forward pass.
        with torch.no_grad():
            self.channel = [feat.size(1) for feat in self.forward(torch.randn(1, dim_in, 640, 640))]

    def _init_weights(self, m):
        if isinstance(m, nn.Linear):
            trunc_normal_(m.weight, std=.02)
            if m.bias is not None:
                nn.init.zeros_(m.bias)
        elif isinstance(m, (nn.LayerNorm, nn.GroupNorm,
                            nn.BatchNorm1d, nn.BatchNorm2d, nn.BatchNorm3d,
                            nn.InstanceNorm1d, nn.InstanceNorm2d, nn.InstanceNorm3d)):
            # Norm layers built without affine parameters have weight/bias set to None.
            if m.bias is not None:
                nn.init.zeros_(m.bias)
            if m.weight is not None:
                nn.init.ones_(m.weight)

    @torch.jit.ignore
    def no_weight_decay(self):
        return {'token'}

    @torch.jit.ignore
    def no_weight_decay_keywords(self):
        return {'alpha', 'gamma', 'beta'}

    @torch.jit.ignore
    def no_ft_keywords(self):
        # return {'head.weight', 'head.bias'}
        return {}

    @torch.jit.ignore
    def ft_head_keywords(self):
        return {'head.weight', 'head.bias'}, self.num_classes

    def get_classifier(self):
        return self.head

    def reset_classifier(self, num_classes):
        """Replace ``self.head`` with a fresh linear layer (identity if ``num_classes <= 0``)."""
        self.num_classes = num_classes
        self.head = nn.Linear(self.pre_dim, num_classes) if num_classes > 0 else nn.Identity()

    def check_bn(self):
        """Replace NaN/inf entries in the running statistics of norm layers."""
        for name, m in self.named_modules():
            if isinstance(m, nn.modules.batchnorm._NormBase):
                # Layers that do not track running statistics hold None here.
                if m.running_mean is not None:
                    m.running_mean = torch.nan_to_num(m.running_mean, nan=0, posinf=1, neginf=-1)
                if m.running_var is not None:
                    m.running_var = torch.nan_to_num(m.running_var, nan=0, posinf=1, neginf=-1)

    def forward_features(self, x):
        """Run all stages and return the output of each one, stem first."""
        for blk in self.stage0:
            x = blk(x)
        x1 = x
        for blk in self.stage1:
            x = blk(x)
        x2 = x
        for blk in self.stage2:
            x = blk(x)
        x3 = x
        for blk in self.stage3:
            x = blk(x)
        x4 = x
        for blk in self.stage4:
            x = blk(x)
        x5 = x
        return [x1, x2, x3, x4, x5]

    def forward(self, x):
        x = self.forward_features(x)
        # Only the deepest feature map goes through the final norm.
        x[-1] = self.norm(x[-1])
        return x
def EMO_1M(weights='', **kwargs):
    """Build the EMO-1M backbone and optionally load a checkpoint.

    Args:
        weights: Path to a checkpoint readable by ``torch.load``; an empty
            string skips loading. Only entries whose name and shape match the
            model are used (see ``update_weight``).
        **kwargs: Extra keyword arguments forwarded to ``EMO``.

    Returns:
        The constructed ``EMO`` model.
    """
    model = EMO(
        # dim_in=3, num_classes=1000, img_size=224,
        depths=[2, 2, 8, 3], stem_dim=24, embed_dims=[32, 48, 80, 168], exp_ratios=[2., 2.5, 3.0, 3.5],
        norm_layers=['bn_2d', 'bn_2d', 'ln_2d', 'ln_2d'], act_layers=['silu', 'silu', 'gelu', 'gelu'],
        dw_kss=[3, 3, 5, 5], dim_heads=[16, 16, 20, 21], window_sizes=[7, 7, 7, 7], attn_ss=[False, False, True, True],
        qkv_bias=True, attn_drop=0., drop=0., drop_path=0.04036, v_group=False, attn_pre=True, pre_dim=0,
        **kwargs)
    if weights:
        # Map to CPU so a checkpoint saved on a GPU still loads on a host without
        # one; load_state_dict copies the values into the model's own parameters.
        pretrained_weight = torch.load(weights, map_location='cpu')
        model.load_state_dict(update_weight(model.state_dict(), pretrained_weight))
    return model
**kwargs): model = EMO( # dim_in=3, num_classes=1000, img_size=224, depths=[3, 3, 9, 3], stem_dim=24, embed_dims=[48, 72, 160, 320], exp_ratios=[2., 3., 4., 5.], norm_layers=['bn_2d', 'bn_2d', 'ln_2d', 'ln_2d'], act_layers=['silu', 'silu', 'gelu', 'gelu'], dw_kss=[3, 3, 5, 5], dim_heads=[16, 24, 20, 32], window_sizes=[7, 7, 7, 7], attn_ss=[False, False, True, True], qkv_bias=True, attn_drop=0., drop=0., drop_path=0.05, v_group=False, attn_pre=True, pre_dim=0, **kwargs) if weights: pretrained_weight = torch.load(weights) model.load_state_dict(update_weight(model.state_dict(), pretrained_weight)) return model if __name__ == '__main__': model = EMO_1M('EMO_1M/net.pth') model = EMO_2M('EMO_2M/net.pth') model = EMO_5M('EMO_5M/net.pth') model = EMO_6M('EMO_6M/net.pth') ================================================ FILE: yolo-improve/yolov5-backbone/EfficientFormerV2/EfficientFormerV2.py ================================================ """ EfficientFormer_v2 """ import os import copy import torch import torch.nn as nn import torch.nn.functional as F import math from typing import Dict import itertools import numpy as np from timm.models.layers import DropPath, trunc_normal_, to_2tuple __all__ = ['efficientformerv2_s0', 'efficientformerv2_s1', 'efficientformerv2_s2', 'efficientformerv2_l'] EfficientFormer_width = { 'L': [40, 80, 192, 384], # 26m 83.3% 6attn 'S2': [32, 64, 144, 288], # 12m 81.6% 4attn dp0.02 'S1': [32, 48, 120, 224], # 6.1m 79.0 'S0': [32, 48, 96, 176], # 75.0 75.7 } EfficientFormer_depth = { 'L': [5, 5, 15, 10], # 26m 83.3% 'S2': [4, 4, 12, 8], # 12m 'S1': [3, 3, 9, 6], # 79.0 'S0': [2, 2, 6, 4], # 75.7 } # 26m expansion_ratios_L = { '0': [4, 4, 4, 4, 4], '1': [4, 4, 4, 4, 4], '2': [4, 4, 4, 4, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4], '3': [4, 4, 4, 3, 3, 3, 3, 4, 4, 4], } # 12m expansion_ratios_S2 = { '0': [4, 4, 4, 4], '1': [4, 4, 4, 4], '2': [4, 4, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4], '3': [4, 4, 3, 3, 3, 3, 4, 4], } # 6.1m expansion_ratios_S1 = { '0': [4, 4, 4], 
class Attention4D(torch.nn.Module):
    """Multi-head self-attention over a 2D feature map with learned relative position biases.

    Queries, keys and values are produced by 1x1 convolutions. The attention
    logits get a per-head bias indexed by the absolute (row, column) offset
    between two positions, and are mixed across heads by two 1x1
    "talking head" convolutions, one before and one after the softmax. A
    depth-wise 3x3 convolution of the values is added to the attention output
    before the final projection.

    Args:
        dim: Number of input and output channels.
        key_dim: Channels per head for queries and keys.
        num_heads: Number of attention heads.
        attn_ratio: Value channels per head are ``attn_ratio * key_dim``.
        resolution: Side length of the (square) input feature map.
        act_layer: Activation class applied before the output projection.
        stride: If given, the input is down-sampled by this stride before
            attention and the result is bilinearly up-sampled by the same factor.
    """

    def __init__(self, dim=384, key_dim=32, num_heads=8, attn_ratio=4, resolution=7, act_layer=nn.ReLU, stride=None):
        super().__init__()
        self.num_heads = num_heads
        self.scale = key_dim ** -0.5
        self.key_dim = key_dim
        self.nh_kd = key_dim * num_heads

        if stride is not None:
            self.resolution = math.ceil(resolution / stride)
            self.stride_conv = nn.Sequential(
                nn.Conv2d(dim, dim, kernel_size=3, stride=stride, padding=1, groups=dim),
                nn.BatchNorm2d(dim),
            )
            self.upsample = nn.Upsample(scale_factor=stride, mode='bilinear')
        else:
            self.resolution = resolution
            self.stride_conv = None
            self.upsample = None

        self.N = self.resolution ** 2
        self.N2 = self.N
        self.d = int(attn_ratio * key_dim)
        self.dh = int(attn_ratio * key_dim) * num_heads
        self.attn_ratio = attn_ratio
        self.q = nn.Sequential(nn.Conv2d(dim, self.num_heads * self.key_dim, 1),
                               nn.BatchNorm2d(self.num_heads * self.key_dim), )
        self.k = nn.Sequential(nn.Conv2d(dim, self.num_heads * self.key_dim, 1),
                               nn.BatchNorm2d(self.num_heads * self.key_dim), )
        self.v = nn.Sequential(nn.Conv2d(dim, self.num_heads * self.d, 1),
                               nn.BatchNorm2d(self.num_heads * self.d),
                               )
        self.v_local = nn.Sequential(nn.Conv2d(self.num_heads * self.d, self.num_heads * self.d,
                                               kernel_size=3, stride=1, padding=1, groups=self.num_heads * self.d),
                                     nn.BatchNorm2d(self.num_heads * self.d), )
        self.talking_head1 = nn.Conv2d(self.num_heads, self.num_heads, kernel_size=1, stride=1, padding=0)
        self.talking_head2 = nn.Conv2d(self.num_heads, self.num_heads, kernel_size=1, stride=1, padding=0)

        self.proj = nn.Sequential(act_layer(),
                                  nn.Conv2d(self.dh, dim, 1),
                                  nn.BatchNorm2d(dim), )

        # Every distinct absolute (row, col) offset between two positions gets its
        # own learnable bias slot; idxs maps each (query, key) pair to its slot.
        points = list(itertools.product(range(self.resolution), range(self.resolution)))
        N = len(points)
        attention_offsets = {}
        idxs = []
        for p1 in points:
            for p2 in points:
                offset = (abs(p1[0] - p2[0]), abs(p1[1] - p2[1]))
                if offset not in attention_offsets:
                    attention_offsets[offset] = len(attention_offsets)
                idxs.append(attention_offsets[offset])
        self.attention_biases = torch.nn.Parameter(
            torch.zeros(num_heads, len(attention_offsets)))
        self.register_buffer('attention_bias_idxs',
                             torch.LongTensor(idxs).view(N, N))

    @torch.no_grad()
    def train(self, mode=True):
        """Switch mode and maintain the cached eval-time bias ``self.ab``.

        Returns ``self`` like ``nn.Module.train`` so that ``.train()`` and
        ``.eval()`` can be chained.
        """
        super().train(mode)
        if mode:
            # The cache is only valid for frozen biases; drop it while training.
            if hasattr(self, 'ab'):
                del self.ab
        else:
            self.ab = self.attention_biases[:, self.attention_bias_idxs]
        return self

    def _attention_bias(self):
        """Return the (heads, N, N) bias: live while training, cached otherwise."""
        if self.training:
            return self.attention_biases[:, self.attention_bias_idxs]
        cached = getattr(self, 'ab', None)
        table = self.attention_biases
        # The cache is a plain attribute, so .to()/.half() do not move it and it is
        # absent when eval mode was entered without going through train(False).
        if cached is None or cached.device != table.device or cached.dtype != table.dtype:
            cached = table[:, self.attention_bias_idxs].detach()
            self.ab = cached
        return cached

    def forward(self, x):  # x (B, C, H, W)
        B, C, H, W = x.shape
        if self.stride_conv is not None:
            x = self.stride_conv(x)

        # q: (B, heads, N, key_dim); k: (B, heads, key_dim, N); v: (B, heads, N, d)
        q = self.q(x).flatten(2).reshape(B, self.num_heads, -1, self.N).permute(0, 1, 3, 2)
        k = self.k(x).flatten(2).reshape(B, self.num_heads, -1, self.N)
        v = self.v(x)
        v_local = self.v_local(v)
        v = v.flatten(2).reshape(B, self.num_heads, -1, self.N).permute(0, 1, 3, 2)

        attn = (q @ k) * self.scale + self._attention_bias()
        attn = self.talking_head1(attn)
        attn = attn.softmax(dim=-1)
        attn = self.talking_head2(attn)

        x = (attn @ v)

        out = x.transpose(2, 3).reshape(B, self.dh, self.resolution, self.resolution) + v_local
        if self.upsample is not None:
            out = self.upsample(out)

        out = self.proj(out)
        return out
class Attention4DDownsample(torch.nn.Module):
    """Attention block whose output has half the spatial resolution of its input.

    Queries come from ``LGQuery`` at the reduced resolution, while keys and
    values are computed at full resolution, so each of the ``N2`` output
    positions attends over all ``N`` input positions. A stride-2 depth-wise
    convolution of the values is added to the attention output before the
    final projection to ``out_dim`` channels.

    Args:
        dim: Number of input channels.
        key_dim: Channels per head for queries and keys.
        num_heads: Number of attention heads.
        attn_ratio: Value channels per head are ``attn_ratio * key_dim``.
        resolution: Side length of the (square) input feature map.
        out_dim: Number of output channels; defaults to ``dim``.
        act_layer: Activation class applied before the output projection;
            ``None`` means no activation.
    """

    def __init__(self, dim=384, key_dim=16, num_heads=8,
                 attn_ratio=4,
                 resolution=7,
                 out_dim=None,
                 act_layer=None,
                 ):
        super().__init__()
        self.num_heads = num_heads
        self.scale = key_dim ** -0.5
        self.key_dim = key_dim
        self.nh_kd = key_dim * num_heads

        self.resolution = resolution

        self.d = int(attn_ratio * key_dim)
        self.dh = int(attn_ratio * key_dim) * num_heads
        self.attn_ratio = attn_ratio

        self.out_dim = out_dim if out_dim is not None else dim
        self.resolution2 = math.ceil(self.resolution / 2)
        self.q = LGQuery(dim, self.num_heads * self.key_dim, self.resolution, self.resolution2)

        self.N = self.resolution ** 2
        self.N2 = self.resolution2 ** 2

        self.k = nn.Sequential(nn.Conv2d(dim, self.num_heads * self.key_dim, 1),
                               nn.BatchNorm2d(self.num_heads * self.key_dim), )
        self.v = nn.Sequential(nn.Conv2d(dim, self.num_heads * self.d, 1),
                               nn.BatchNorm2d(self.num_heads * self.d),
                               )
        self.v_local = nn.Sequential(nn.Conv2d(self.num_heads * self.d, self.num_heads * self.d,
                                               kernel_size=3, stride=2, padding=1, groups=self.num_heads * self.d),
                                     nn.BatchNorm2d(self.num_heads * self.d), )

        # The default act_layer=None used to be called unconditionally and raised
        # TypeError. An identity keeps the Sequential indices, and therefore the
        # state-dict keys of the conv and norm, unchanged.
        activation = act_layer() if act_layer is not None else nn.Identity()
        self.proj = nn.Sequential(
            activation,
            nn.Conv2d(self.dh, self.out_dim, 1),
            nn.BatchNorm2d(self.out_dim), )

        # Bias slots are keyed by the absolute offset between an output position
        # (scaled back to input coordinates) and an input position.
        points = list(itertools.product(range(self.resolution), range(self.resolution)))
        points_ = list(itertools.product(
            range(self.resolution2), range(self.resolution2)))
        N = len(points)
        N_ = len(points_)
        scale_up = math.ceil(self.resolution / self.resolution2)
        size = 1
        attention_offsets = {}
        idxs = []
        for p1 in points_:
            for p2 in points:
                offset = (
                    abs(p1[0] * scale_up - p2[0] + (size - 1) / 2),
                    abs(p1[1] * scale_up - p2[1] + (size - 1) / 2))
                if offset not in attention_offsets:
                    attention_offsets[offset] = len(attention_offsets)
                idxs.append(attention_offsets[offset])
        self.attention_biases = torch.nn.Parameter(
            torch.zeros(num_heads, len(attention_offsets)))
        self.register_buffer('attention_bias_idxs',
                             torch.LongTensor(idxs).view(N_, N))

    @torch.no_grad()
    def train(self, mode=True):
        """Switch mode and maintain the cached eval-time bias ``self.ab``.

        Returns ``self`` like ``nn.Module.train`` so that ``.train()`` and
        ``.eval()`` can be chained.
        """
        super().train(mode)
        if mode:
            # The cache is only valid for frozen biases; drop it while training.
            if hasattr(self, 'ab'):
                del self.ab
        else:
            self.ab = self.attention_biases[:, self.attention_bias_idxs]
        return self

    def _attention_bias(self):
        """Return the (heads, N2, N) bias: live while training, cached otherwise."""
        if self.training:
            return self.attention_biases[:, self.attention_bias_idxs]
        cached = getattr(self, 'ab', None)
        table = self.attention_biases
        # The cache is a plain attribute, so .to()/.half() do not move it and it is
        # absent when eval mode was entered without going through train(False).
        if cached is None or cached.device != table.device or cached.dtype != table.dtype:
            cached = table[:, self.attention_bias_idxs].detach()
            self.ab = cached
        return cached

    def forward(self, x):  # x (B, C, H, W)
        B, C, H, W = x.shape

        # q: (B, heads, N2, key_dim); k: (B, heads, key_dim, N); v: (B, heads, N, d)
        q = self.q(x).flatten(2).reshape(B, self.num_heads, -1, self.N2).permute(0, 1, 3, 2)
        k = self.k(x).flatten(2).reshape(B, self.num_heads, -1, self.N)
        v = self.v(x)
        v_local = self.v_local(v)
        v = v.flatten(2).reshape(B, self.num_heads, -1, self.N).permute(0, 1, 3, 2)

        attn = (q @ k) * self.scale + self._attention_bias()
        attn = attn.softmax(dim=-1)
        x = (attn @ v).transpose(2, 3)
        out = x.reshape(B, self.dh, self.resolution2, self.resolution2) + v_local

        out = self.proj(out)
        return out
class Mlp(nn.Module):
    """
    Two-layer MLP built from 1x1 convolutions, each followed by BatchNorm.
    With ``mid_conv`` a depth-wise 3x3 convolution (plus BatchNorm and
    activation) sits between the two layers.
    Input: tensor with shape [B, C, H, W]
    """

    def __init__(self, in_features, hidden_features=None,
                 out_features=None, act_layer=nn.GELU, drop=0., mid_conv=False):
        super().__init__()
        if not out_features:
            out_features = in_features
        if not hidden_features:
            hidden_features = in_features
        self.mid_conv = mid_conv
        self.fc1 = nn.Conv2d(in_features, hidden_features, 1)
        self.act = act_layer()
        self.fc2 = nn.Conv2d(hidden_features, out_features, 1)
        self.drop = nn.Dropout(drop)
        # Runs before the mid conv exists, so only fc1/fc2 get the truncated-normal init.
        self.apply(self._init_weights)

        if self.mid_conv:
            self.mid = nn.Conv2d(hidden_features, hidden_features, kernel_size=3, stride=1, padding=1,
                                 groups=hidden_features)
            self.mid_norm = nn.BatchNorm2d(hidden_features)

        self.norm1 = nn.BatchNorm2d(hidden_features)
        self.norm2 = nn.BatchNorm2d(out_features)

    def _init_weights(self, m):
        if not isinstance(m, nn.Conv2d):
            return
        trunc_normal_(m.weight, std=.02)
        if m.bias is not None:
            nn.init.constant_(m.bias, 0)

    def forward(self, x):
        hidden = self.act(self.norm1(self.fc1(x)))

        if self.mid_conv:
            hidden = self.act(self.mid_norm(self.mid(hidden)))

        hidden = self.drop(hidden)

        out = self.norm2(self.fc2(hidden))

        return self.drop(out)
resolution=7, stride=None): super().__init__() self.token_mixer = Attention4D(dim, resolution=resolution, act_layer=act_layer, stride=stride) mlp_hidden_dim = int(dim * mlp_ratio) self.mlp = Mlp(in_features=dim, hidden_features=mlp_hidden_dim, act_layer=act_layer, drop=drop, mid_conv=True) self.drop_path = DropPath(drop_path) if drop_path > 0. \ else nn.Identity() self.use_layer_scale = use_layer_scale if use_layer_scale: self.layer_scale_1 = nn.Parameter( layer_scale_init_value * torch.ones(dim).unsqueeze(-1).unsqueeze(-1), requires_grad=True) self.layer_scale_2 = nn.Parameter( layer_scale_init_value * torch.ones(dim).unsqueeze(-1).unsqueeze(-1), requires_grad=True) def forward(self, x): if self.use_layer_scale: x = x + self.drop_path(self.layer_scale_1 * self.token_mixer(x)) x = x + self.drop_path(self.layer_scale_2 * self.mlp(x)) else: x = x + self.drop_path(self.token_mixer(x)) x = x + self.drop_path(self.mlp(x)) return x class FFN(nn.Module): def __init__(self, dim, pool_size=3, mlp_ratio=4., act_layer=nn.GELU, drop=0., drop_path=0., use_layer_scale=True, layer_scale_init_value=1e-5): super().__init__() mlp_hidden_dim = int(dim * mlp_ratio) self.mlp = Mlp(in_features=dim, hidden_features=mlp_hidden_dim, act_layer=act_layer, drop=drop, mid_conv=True) self.drop_path = DropPath(drop_path) if drop_path > 0. 
def eformer_block(dim, index, layers,
                  pool_size=3, mlp_ratio=4.,
                  act_layer=nn.GELU, norm_layer=nn.LayerNorm,
                  drop_rate=.0, drop_path_rate=0.,
                  use_layer_scale=True, layer_scale_init_value=1e-5, vit_num=1, resolution=7, e_ratios=None):
    """Build one stage of the network as an ``nn.Sequential`` of FFN / AttnFFN blocks.

    The last ``vit_num`` blocks of stages with ``index >= 2`` are ``AttnFFN``
    blocks (with ``stride=2`` in stage 2 and no stride otherwise); every other
    block is an ``FFN``.

    Args:
        dim: Channel width of the stage.
        index: Stage index into ``layers``.
        layers: Number of blocks in every stage of the network.
        pool_size: Forwarded to ``FFN``.
        mlp_ratio: Expansion ratio used for every block when ``e_ratios`` is None.
        act_layer, norm_layer, drop_rate, use_layer_scale,
        layer_scale_init_value, resolution: Forwarded to the blocks.
        drop_path_rate: Maximum drop-path rate; each block's rate grows linearly
            with its position among all blocks of the network.
        vit_num: Number of trailing attention blocks in stages 2 and later.
        e_ratios: Optional mapping ``str(stage index) -> per-block expansion ratios``.

    Returns:
        ``nn.Sequential`` holding the stage's blocks.
    """
    preceding_blocks = sum(layers[:index])
    # A single-block network would otherwise divide 0 by 0.
    dpr_denominator = max(sum(layers) - 1, 1)
    stage_ratios = e_ratios[str(index)] if e_ratios is not None else None

    blocks = []
    for block_idx in range(layers[index]):
        block_dpr = drop_path_rate * (block_idx + preceding_blocks) / dpr_denominator
        # Per-block ratios win when given; otherwise fall back to the mlp_ratio
        # argument instead of subscripting None.
        block_mlp_ratio = stage_ratios[block_idx] if stage_ratios is not None else mlp_ratio

        if index >= 2 and block_idx > layers[index] - 1 - vit_num:
            stride = 2 if index == 2 else None
            blocks.append(AttnFFN(
                dim, mlp_ratio=block_mlp_ratio,
                act_layer=act_layer, norm_layer=norm_layer,
                drop=drop_rate, drop_path=block_dpr,
                use_layer_scale=use_layer_scale,
                layer_scale_init_value=layer_scale_init_value,
                resolution=resolution,
                stride=stride,
            ))
        else:
            blocks.append(FFN(
                dim, pool_size=pool_size, mlp_ratio=block_mlp_ratio,
                act_layer=act_layer,
                drop=drop_rate, drop_path=block_dpr,
                use_layer_scale=use_layer_scale,
                layer_scale_init_value=layer_scale_init_value,
            ))
    return nn.Sequential(*blocks)
def update_weight(model_dict, weight_dict):
    """Copy shape-compatible entries of ``weight_dict`` into ``model_dict``.

    An entry is copied only when its key already exists in ``model_dict`` and
    ``np.shape`` of both values is equal. ``model_dict`` is updated in place,
    a one-line summary is printed, and the same dict is returned.
    """
    compatible = {
        name: value
        for name, value in weight_dict.items()
        if name in model_dict and np.shape(model_dict[name]) == np.shape(value)
    }
    model_dict.update(compatible)
    print(f'loading weights... {len(compatible)}/{len(model_dict)} items')
    return model_dict
def build_kwargs_from_config(config: Dict, target_func: Callable) -> Dict[str, Any]:
    """Return the entries of ``config`` whose keys are parameter names of ``target_func``.

    Keys keep the order in which they appear in ``config``.
    """
    accepted = signature(target_func).parameters
    return {key: config[key] for key in config if key in accepted}
def list_sum(x: List) -> Any:
    """Add up the elements of a non-empty sequence with ``+``.

    The elements are combined right-to-left, i.e. ``x[0] + (x[1] + (... + x[-1]))``,
    which is the association order of the previous recursive version. This
    version is iterative, so long inputs neither hit the recursion limit nor
    copy the tail of the list on every step.

    Raises:
        IndexError: If ``x`` is empty.
    """
    total = x[-1]
    for item in reversed(x[:-1]):
        total = item + total
    return total
class LinearLayer(nn.Module):
    """Fully connected layer with optional dropout, normalization and activation.

    Inputs with more than two dimensions are flattened from dimension 1 onward
    before the linear projection. The stages run in the order
    dropout -> linear -> norm -> activation; absent stages are skipped.
    """

    def __init__(
        self,
        in_features: int,
        out_features: int,
        use_bias=True,
        dropout_rate=0,
        norm=None,
        act_func=None,
    ):
        super().__init__()

        # Dropout is only instantiated for a strictly positive rate.
        if dropout_rate > 0:
            self.dropout = nn.Dropout(dropout_rate, inplace=False)
        else:
            self.dropout = None
        self.linear = nn.Linear(in_features, out_features, use_bias)
        # Both builders return None for names they do not know.
        self.norm = build_norm(norm, num_features=out_features)
        self.act = build_act(act_func)

    def _try_squeeze(self, x: torch.Tensor) -> torch.Tensor:
        """Flatten everything after the first dimension when ``x`` has more than 2 dims."""
        return torch.flatten(x, start_dim=1) if x.dim() > 2 else x

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        out = self._try_squeeze(x)
        if self.dropout:
            out = self.dropout(out)
        out = self.linear(out)
        # Optional post-processing stages, applied in declaration order.
        for post_op in (self.norm, self.act):
            if post_op:
                out = post_op(out)
        return out
class LiteMSA(nn.Module):
    r"""Lightweight multi-scale attention.

    A 1x1 convolution produces stacked query/key/value channels. For every
    entry of ``scales`` an extra copy of those channels is produced by a
    depthwise convolution followed by a grouped 1x1 convolution. All copies
    are concatenated and attended with a kernelised (linear) attention: the
    ``kernel_func`` activation is applied to q and k, ``k^T v`` is formed
    first and then multiplied by q, so no token-by-token attention matrix is
    materialised. The result is projected to ``out_channels`` by a 1x1
    ``ConvLayer``.

    Args:
        in_channels: Channels of the input feature map.
        out_channels: Channels produced by the final projection.
        heads: Number of heads; when falsy it is derived as
            ``int(in_channels // dim * heads_ratio)``.
        heads_ratio: Multiplier used when ``heads`` is derived.
        dim: Channels per head for each of q, k and v.
        use_bias: Bias flag(s); expanded to a 2-tuple (qkv/aggregation, projection).
        norm: Norm name(s); expanded to a 2-tuple (qkv, projection).
        act_func: Activation name(s); expanded to a 2-tuple (qkv, projection).
        kernel_func: Activation name applied to q and k before attention.
        scales: Kernel sizes of the additional multi-scale aggregation branches.
    """

    def __init__(
        self,
        in_channels: int,
        out_channels: int,
        heads: Optional[int] = None,
        heads_ratio: float = 1.0,
        dim=8,
        use_bias=False,
        norm=(None, "bn2d"),
        act_func=(None, None),
        kernel_func="relu",
        scales: Tuple[int, ...] = (5,),
    ):
        super(LiteMSA, self).__init__()
        # Derive the head count from the channel budget unless given explicitly.
        heads = heads or int(in_channels // dim * heads_ratio)

        total_dim = heads * dim

        # Index 0 configures the qkv (and aggregation) convs, index 1 the projection.
        use_bias = val2tuple(use_bias, 2)
        norm = val2tuple(norm, 2)
        act_func = val2tuple(act_func, 2)

        self.dim = dim
        # 1x1 conv emitting q, k and v stacked along the channel axis.
        self.qkv = ConvLayer(
            in_channels,
            3 * total_dim,
            1,
            use_bias=use_bias[0],
            norm=norm[0],
            act_func=act_func[0],
        )
        # One branch per scale: depthwise conv with that kernel size, then a
        # 1x1 conv grouped so that each head's q, k and v are mixed separately.
        self.aggreg = nn.ModuleList(
            [
                nn.Sequential(
                    nn.Conv2d(
                        3 * total_dim,
                        3 * total_dim,
                        scale,
                        padding=get_same_padding(scale),
                        groups=3 * total_dim,
                        bias=use_bias[0],
                    ),
                    nn.Conv2d(3 * total_dim, 3 * total_dim, 1, groups=3 * heads, bias=use_bias[0]),
                )
                for scale in scales
            ]
        )
        self.kernel_func = build_act(kernel_func, inplace=False)

        # The projection consumes the v-sized output of the base branch plus
        # one per scale, hence total_dim * (1 + len(scales)) input channels.
        self.proj = ConvLayer(
            total_dim * (1 + len(scales)),
            out_channels,
            1,
            use_bias=use_bias[1],
            norm=norm[1],
            act_func=act_func[1],
        )

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Apply multi-scale linear attention to a ``(B, C, H, W)`` feature map."""
        B, _, H, W = list(x.size())

        # generate multi-scale q, k, v
        qkv = self.qkv(x)
        multi_scale_qkv = [qkv]
        for op in self.aggreg:
            multi_scale_qkv.append(op(qkv))
        multi_scale_qkv = torch.cat(multi_scale_qkv, dim=1)

        # Split channels into groups of 3 * dim (one q/k/v triple per head and
        # branch) and flatten the spatial grid into a token axis.
        multi_scale_qkv = torch.reshape(
            multi_scale_qkv,
            (
                B,
                -1,
                3 * self.dim,
                H * W,
            ),
        )
        # -> (B, groups, H * W, 3 * dim): tokens become rows, features columns.
        multi_scale_qkv = torch.transpose(multi_scale_qkv, -1, -2)
        q, k, v = (
            multi_scale_qkv[..., 0 : self.dim].clone(),
            multi_scale_qkv[..., self.dim : 2 * self.dim].clone(),
            multi_scale_qkv[..., 2 * self.dim :].clone(),
        )

        # lightweight global attention
        q = self.kernel_func(q)
        k = self.kernel_func(k)

        trans_k = k.transpose(-1, -2)

        # Append a column of ones to v so that the last column of q @ (k^T v)
        # carries the normaliser sum_j q . k_j alongside the weighted values.
        v = F.pad(v, (0, 1), mode="constant", value=1)
        kv = torch.matmul(trans_k, v)
        out = torch.matmul(q, kv)
        # Divide the weighted values by the normaliser column.
        # NOTE(review): 1e-15 is below the smallest positive float16 value, so
        # the epsilon may vanish if this runs in half precision - confirm
        # whether this path is ever executed under fp16/AMP.
        out = out[..., :-1] / (out[..., -1:] + 1e-15)

        # final projection
        out = torch.transpose(out, -1, -2)
        out = torch.reshape(out, (B, -1, H, W))
        out = self.proj(out)

        return out
class ResidualBlock(nn.Module):
    """Wrap a main branch with an optional shortcut, pre-norm and post-activation.

    Args:
        main: Main branch; when ``None`` the block passes its input through.
        shortcut: Skip branch added to the main output; ``None`` disables it.
        post_act: Name of an activation applied to the combined result
            (resolved with ``build_act``).
        pre_norm: Optional module applied to the input of the main branch only.
    """

    def __init__(
        self,
        main: Optional[nn.Module],
        shortcut: Optional[nn.Module],
        post_act=None,
        pre_norm: Optional[nn.Module] = None,
    ):
        super().__init__()

        self.pre_norm = pre_norm
        self.main = main
        self.shortcut = shortcut
        self.post_act = build_act(post_act)

    def forward_main(self, x: torch.Tensor) -> torch.Tensor:
        """Run the main branch, normalising its input first when configured."""
        branch_input = x if self.pre_norm is None else self.pre_norm(x)
        return self.main(branch_input)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        if self.main is None:
            # No main branch: identity (the shortcut is not consulted).
            out = x
        else:
            out = self.forward_main(x)
            if self.shortcut is not None:
                out = out + self.shortcut(x)
        if self.post_act:
            out = self.post_act(out)
        return out
class EfficientViTBackbone(nn.Module):
    """EfficientViT feature extractor returning one feature map per stage.

    The network is a stride-2 convolutional stem followed by stages that each
    start with a stride-2 block. Stages built from ``width_list[1:3]`` use
    convolutional blocks only; stages built from ``width_list[3:]`` use one
    down-sampling block followed by ``EfficientViTBlock`` layers.

    Args:
        width_list: Output channels of the stem and of every stage.
        depth_list: Number of blocks in the stem and in every stage.
        in_channels: Channels of the input image.
        dim: Per-head channel count passed to every ``EfficientViTBlock``.
        expand_ratio: Expansion ratio of the MBConv blocks.
        norm: Normalisation layer name.
        act_func: Activation name.

    Attributes:
        width_list: Channel count after the stem and after each stage.
        channel: Channel count of every tensor returned by ``forward``,
            measured with a dummy forward pass at construction time.
    """

    def __init__(self, width_list: List[int], depth_list: List[int], in_channels=3, dim=32, expand_ratio=4, norm="bn2d", act_func="hswish") -> None:
        super().__init__()

        # `in_channels` is reused below as the running channel count, so keep
        # the image channel count under its own name for the shape probe.
        image_channels = in_channels

        self.width_list = []
        # input stem
        stem_ops = [
            ConvLayer(
                # Honour the constructor argument instead of assuming RGB input.
                in_channels=image_channels,
                out_channels=width_list[0],
                stride=2,
                norm=norm,
                act_func=act_func,
            )
        ]
        for _ in range(depth_list[0]):
            block = self.build_local_block(
                in_channels=width_list[0],
                out_channels=width_list[0],
                stride=1,
                expand_ratio=1,
                norm=norm,
                act_func=act_func,
            )
            stem_ops.append(ResidualBlock(block, IdentityLayer()))
        in_channels = width_list[0]
        self.input_stem = OpSequential(stem_ops)
        self.width_list.append(in_channels)

        # stages
        stages = []
        # Convolution-only stages.
        for w, d in zip(width_list[1:3], depth_list[1:3]):
            stage = []
            for i in range(d):
                # Only the first block of a stage down-samples; it changes the
                # tensor shape and therefore cannot carry an identity shortcut.
                stride = 2 if i == 0 else 1
                block = self.build_local_block(
                    in_channels=in_channels,
                    out_channels=w,
                    stride=stride,
                    expand_ratio=expand_ratio,
                    norm=norm,
                    act_func=act_func,
                )
                block = ResidualBlock(block, IdentityLayer() if stride == 1 else None)
                stage.append(block)
                in_channels = w
            stages.append(OpSequential(stage))
            self.width_list.append(in_channels)

        # Attention stages: one down-sampling block, then d EfficientViT blocks.
        for w, d in zip(width_list[3:], depth_list[3:]):
            stage = []
            block = self.build_local_block(
                in_channels=in_channels,
                out_channels=w,
                stride=2,
                expand_ratio=expand_ratio,
                norm=norm,
                act_func=act_func,
                fewer_norm=True,
            )
            stage.append(ResidualBlock(block, None))
            in_channels = w

            for _ in range(d):
                stage.append(
                    EfficientViTBlock(
                        in_channels=in_channels,
                        dim=dim,
                        expand_ratio=expand_ratio,
                        norm=norm,
                        act_func=act_func,
                    )
                )
            stages.append(OpSequential(stage))
            self.width_list.append(in_channels)
        self.stages = nn.ModuleList(stages)

        # Measure the output channels with a dummy pass. Run it in eval mode
        # and without autograd so the random probe neither updates BatchNorm
        # running statistics nor builds a graph; then restore the mode.
        was_training = self.training
        self.eval()
        with torch.no_grad():
            probe = torch.randn(1, image_channels, 224, 224)
            self.channel = [feat.size(1) for feat in self.forward(probe)]
        self.train(was_training)

    @staticmethod
    def build_local_block(in_channels: int, out_channels: int, stride: int, expand_ratio: float, norm: str, act_func: str, fewer_norm: bool = False) -> nn.Module:
        """Build a DSConv (``expand_ratio == 1``) or MBConv local block.

        With ``fewer_norm`` only the last convolution keeps its normalisation
        layer; the preceding ones use a bias instead.
        """
        if expand_ratio == 1:
            block = DSConv(
                in_channels=in_channels,
                out_channels=out_channels,
                stride=stride,
                use_bias=(True, False) if fewer_norm else False,
                norm=(None, norm) if fewer_norm else norm,
                act_func=(act_func, None),
            )
        else:
            block = MBConv(
                in_channels=in_channels,
                out_channels=out_channels,
                stride=stride,
                expand_ratio=expand_ratio,
                use_bias=(True, True, False) if fewer_norm else False,
                norm=(None, None, norm) if fewer_norm else norm,
                act_func=(act_func, act_func, None),
            )
        return block

    def forward(self, x: torch.Tensor) -> List[torch.Tensor]:
        """Return the stem output followed by the output of every stage."""
        x = self.input_stem(x)
        res = [x]
        for stage in self.stages:
            x = stage(x)
            res.append(x)
        return res
def _load_backbone_weights(backbone, weights):
    """Load the checkpoint at path ``weights`` into ``backbone`` and return it.

    Nothing is loaded when ``weights`` is empty. Only entries accepted by
    ``update_weight`` (matching name and shape) overwrite the freshly
    initialised parameters.
    """
    if weights:
        # map_location='cpu' lets a checkpoint saved from a GPU be read on a
        # host without that device; load_state_dict then copies the values
        # onto whatever device the backbone parameters live on.
        checkpoint = torch.load(weights, map_location='cpu')
        backbone.load_state_dict(update_weight(backbone.state_dict(), checkpoint['state_dict']))
    return backbone


def efficientvit_b0(weights='', **kwargs) -> EfficientViTBackbone:
    """Build the EfficientViT-B0 backbone, optionally loading weights from ``weights``."""
    backbone = EfficientViTBackbone(
        width_list=[8, 16, 32, 64, 128],
        depth_list=[1, 2, 2, 2, 2],
        dim=16,
        **build_kwargs_from_config(kwargs, EfficientViTBackbone),
    )
    return _load_backbone_weights(backbone, weights)


def efficientvit_b1(weights='', **kwargs) -> EfficientViTBackbone:
    """Build the EfficientViT-B1 backbone, optionally loading weights from ``weights``."""
    backbone = EfficientViTBackbone(
        width_list=[16, 32, 64, 128, 256],
        depth_list=[1, 2, 3, 3, 4],
        dim=16,
        **build_kwargs_from_config(kwargs, EfficientViTBackbone),
    )
    return _load_backbone_weights(backbone, weights)


def efficientvit_b2(weights='', **kwargs) -> EfficientViTBackbone:
    """Build the EfficientViT-B2 backbone, optionally loading weights from ``weights``."""
    backbone = EfficientViTBackbone(
        width_list=[24, 48, 96, 192, 384],
        depth_list=[1, 3, 4, 4, 6],
        dim=32,
        **build_kwargs_from_config(kwargs, EfficientViTBackbone),
    )
    return _load_backbone_weights(backbone, weights)


def efficientvit_b3(weights='', **kwargs) -> EfficientViTBackbone:
    """Build the EfficientViT-B3 backbone, optionally loading weights from ``weights``."""
    backbone = EfficientViTBackbone(
        width_list=[32, 64, 128, 256, 512],
        depth_list=[1, 4, 6, 6, 9],
        dim=32,
        **build_kwargs_from_config(kwargs, EfficientViTBackbone),
    )
    return _load_backbone_weights(backbone, weights)
def update_weight(model_dict, weight_dict):
    """Copy compatible checkpoint entries into ``model_dict``.

    An entry of ``weight_dict`` is compatible when its key exists in
    ``model_dict`` and both values have the same shape. ``model_dict`` is
    updated in place and returned; a summary line is printed.
    """
    compatible = {
        name: value
        for name, value in weight_dict.items()
        if name in model_dict and np.shape(model_dict[name]) == np.shape(value)
    }
    model_dict.update(compatible)
    print(f'loading weights... {len(compatible)}/{len(model_dict)} items')
    return model_dict
class FocalNetBlock(nn.Module):
    r"""One FocalNet block: focal modulation followed by an MLP, both residual.

    The spatial size of the token grid is not a constructor argument; the
    caller must set ``self.H`` and ``self.W`` before every forward pass.

    Args:
        dim (int): Number of input channels.
        input_resolution (tuple[int]): Input resolution (used by ``flops`` only).
        mlp_ratio (float): Ratio of MLP hidden dim to embedding dim.
        drop (float, optional): Dropout rate. Default: 0.0
        drop_path (float, optional): Stochastic depth rate. Default: 0.0
        act_layer (nn.Module, optional): Activation layer. Default: nn.GELU
        norm_layer (nn.Module, optional): Normalization layer. Default: nn.LayerNorm
        focal_level (int): Number of focal levels.
        focal_window (int): Focal window size at the first focal level.
        use_layerscale (bool): Whether to scale both residual branches with learnable vectors.
        layerscale_value (float): Initial layerscale value.
        use_postln (bool): Apply the norms after (instead of before) modulation and MLP.
        use_postln_in_modulation (bool): Forwarded to ``FocalModulation``.
        normalize_modulator (bool): Forwarded to ``FocalModulation``.
    """

    def __init__(self, dim, input_resolution, mlp_ratio=4., drop=0., drop_path=0.,
                 act_layer=nn.GELU, norm_layer=nn.LayerNorm,
                 focal_level=1, focal_window=3,
                 use_layerscale=False, layerscale_value=1e-4,
                 use_postln=False, use_postln_in_modulation=False,
                 normalize_modulator=False):
        super().__init__()
        self.dim = dim
        self.input_resolution = input_resolution
        self.mlp_ratio = mlp_ratio

        self.focal_window = focal_window
        self.focal_level = focal_level
        self.use_postln = use_postln

        self.norm1 = norm_layer(dim)
        self.modulation = FocalModulation(
            dim,
            proj_drop=drop,
            focal_window=focal_window,
            focal_level=self.focal_level,
            use_postln_in_modulation=use_postln_in_modulation,
            normalize_modulator=normalize_modulator,
        )

        # Stochastic depth is only instantiated for a positive rate.
        if drop_path > 0.:
            self.drop_path = DropPath(drop_path)
        else:
            self.drop_path = nn.Identity()
        self.norm2 = norm_layer(dim)
        self.mlp = Mlp(in_features=dim, hidden_features=int(dim * mlp_ratio), act_layer=act_layer, drop=drop)

        # Residual branch scales: learnable per-channel vectors with
        # layerscale, plain constants otherwise.
        if use_layerscale:
            self.gamma_1 = nn.Parameter(layerscale_value * torch.ones((dim)), requires_grad=True)
            self.gamma_2 = nn.Parameter(layerscale_value * torch.ones((dim)), requires_grad=True)
        else:
            self.gamma_1 = 1.0
            self.gamma_2 = 1.0

        # Token grid size, injected by the owning layer before forward().
        self.H = None
        self.W = None

    def forward(self, x):
        """Process tokens of shape ``(B, H * W, C)`` and return the same shape."""
        height, width = self.H, self.W
        batch, _, channels = x.shape
        residual = x

        # Focal modulation on the (B, H, W, C) grid, with norm1 either before
        # (pre-LN) or after (post-LN) the modulation.
        if self.use_postln:
            modulated = self.modulation(x.view(batch, height, width, channels))
            modulated = self.norm1(modulated.view(batch, height * width, channels))
        else:
            modulated = self.modulation(self.norm1(x).view(batch, height, width, channels))
            modulated = modulated.view(batch, height * width, channels)
        x = residual + self.drop_path(self.gamma_1 * modulated)

        # Feed-forward branch with the matching norm placement.
        if self.use_postln:
            ffn_out = self.norm2(self.mlp(x))
        else:
            ffn_out = self.mlp(self.norm2(x))
        return x + self.drop_path(self.gamma_2 * ffn_out)

    def extra_repr(self) -> str:
        return f"dim={self.dim}, input_resolution={self.input_resolution}, " \
               f"mlp_ratio={self.mlp_ratio}"

    def flops(self):
        """Estimate the FLOPs of one block at ``input_resolution``."""
        H, W = self.input_resolution
        norm_cost = self.dim * H * W
        # norm1
        total = norm_cost
        # focal modulation
        total += self.modulation.flops(H * W)
        # mlp
        total += 2 * H * W * self.dim * self.dim * self.mlp_ratio
        # norm2
        total += norm_cost
        return total
Default: nn.LayerNorm downsample (nn.Module | None, optional): Downsample layer at the end of the layer. Default: None use_checkpoint (bool): Whether to use checkpointing to save memory. Default: False. focal_level (int): Number of focal levels focal_window (int): Focal window size at first focal level use_layerscale (bool): Whether use layerscale layerscale_value (float): Initial layerscale value use_postln (bool): Whether use layernorm after modulation """ def __init__(self, dim, out_dim, input_resolution, depth, mlp_ratio=4., drop=0., drop_path=0., norm_layer=nn.LayerNorm, downsample=None, use_checkpoint=False, focal_level=1, focal_window=1, use_conv_embed=False, use_layerscale=False, layerscale_value=1e-4, use_postln=False, use_postln_in_modulation=False, normalize_modulator=False): super().__init__() self.dim = dim self.input_resolution = input_resolution self.depth = depth self.use_checkpoint = use_checkpoint # build blocks self.blocks = nn.ModuleList([ FocalNetBlock( dim=dim, input_resolution=input_resolution, mlp_ratio=mlp_ratio, drop=drop, drop_path=drop_path[i] if isinstance(drop_path, list) else drop_path, norm_layer=norm_layer, focal_level=focal_level, focal_window=focal_window, use_layerscale=use_layerscale, layerscale_value=layerscale_value, use_postln=use_postln, use_postln_in_modulation=use_postln_in_modulation, normalize_modulator=normalize_modulator, ) for i in range(depth)]) if downsample is not None: self.downsample = downsample( img_size=input_resolution, patch_size=2, in_chans=dim, embed_dim=out_dim, use_conv_embed=use_conv_embed, norm_layer=norm_layer, is_stem=False ) else: self.downsample = None def forward(self, x, H, W): for blk in self.blocks: blk.H, blk.W = H, W if self.use_checkpoint: x = checkpoint.checkpoint(blk, x) else: x = blk(x) if self.downsample is not None: x = x.transpose(1, 2).reshape(x.shape[0], -1, H, W) x, Ho, Wo = self.downsample(x) else: Ho, Wo = H, W return x, Ho, Wo def extra_repr(self) -> str: return f"dim={self.dim}, 
class FocalNet(nn.Module):
    r"""Focal Modulation Network (FocalNet) used as a multi-scale feature extractor.

    ``forward`` returns four ``(B, C, H, W)`` feature maps whose height is
    1/4, 1/8, 1/16 and 1/32 of the input height. The first map is the patch
    embedding output; the others are the outputs of the layers that reach the
    corresponding scale (a later layer at the same scale replaces an earlier
    one).

    Args:
        img_size (int | tuple(int)): Nominal input image size forwarded to the patch embedding. Default: 224
        patch_size (int | tuple(int)): Patch size. Default: 4
        in_chans (int): Number of input image channels. Default: 3
        num_classes (int): Number of classes, only used by ``flops``. Default: 1000
        embed_dim (int): Patch embedding dimension; doubled at every layer. Default: 96
        depths (tuple(int)): Number of blocks in each layer.
        mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. Default: 4
        drop_rate (float): Dropout rate. Default: 0
        drop_path_rate (float): Stochastic depth rate. Default: 0.1
        norm_layer (nn.Module): Normalization layer. Default: nn.LayerNorm.
        patch_norm (bool): If True, add normalization after patch embedding. Default: True
        use_checkpoint (bool): Whether to use checkpointing to save memory. Default: False
        focal_levels (list): Number of focal levels at each layer. Default: [2, 2, 2, 2]
        focal_windows (list): Focal window size at each layer. Default: [3, 3, 3, 3]
        use_conv_embed (bool): Whether to use overlapping convolutional embedding. Default: False
        use_layerscale (bool): Whether to use layerscale. Default: False
        layerscale_value (float): Initial value for layer scale. Default: 1e-4
        use_postln (bool): Whether to apply layernorm after modulation. Default: False
        use_postln_in_modulation (bool): Forwarded to every block. Default: False
        normalize_modulator (bool): Forwarded to every block. Default: False

    Attributes:
        channel (list[int]): Channel count of every feature map returned by
            ``forward``, measured with a dummy pass at construction time.
    """

    def __init__(self,
                 img_size=224,
                 patch_size=4,
                 in_chans=3,
                 num_classes=1000,
                 embed_dim=96,
                 depths=[2, 2, 6, 2],
                 mlp_ratio=4.,
                 drop_rate=0.,
                 drop_path_rate=0.1,
                 norm_layer=nn.LayerNorm,
                 patch_norm=True,
                 use_checkpoint=False,
                 focal_levels=[2, 2, 2, 2],
                 focal_windows=[3, 3, 3, 3],
                 use_conv_embed=False,
                 use_layerscale=False,
                 layerscale_value=1e-4,
                 use_postln=False,
                 use_postln_in_modulation=False,
                 normalize_modulator=False,
                 **kwargs):
        super().__init__()

        self.num_layers = len(depths)
        # Channel width doubles at every layer.
        embed_dim = [embed_dim * (2 ** i) for i in range(self.num_layers)]

        self.num_classes = num_classes
        self.embed_dim = embed_dim
        self.patch_norm = patch_norm
        self.num_features = embed_dim[-1]
        self.mlp_ratio = mlp_ratio

        # split image into patches using either non-overlapped embedding or overlapped embedding
        self.patch_embed = PatchEmbed(
            img_size=to_2tuple(img_size),
            patch_size=patch_size,
            in_chans=in_chans,
            embed_dim=embed_dim[0],
            use_conv_embed=use_conv_embed,
            norm_layer=norm_layer if self.patch_norm else None,
            is_stem=True)

        patches_resolution = self.patch_embed.patches_resolution
        self.patches_resolution = patches_resolution

        self.pos_drop = nn.Dropout(p=drop_rate)

        # stochastic depth decay rule: rate grows linearly over all blocks
        dpr = [x.item() for x in torch.linspace(0, drop_path_rate, sum(depths))]

        # build layers
        self.layers = nn.ModuleList()
        for i_layer in range(self.num_layers):
            is_last = i_layer == self.num_layers - 1
            layer = BasicLayer(dim=embed_dim[i_layer],
                               out_dim=None if is_last else embed_dim[i_layer + 1],
                               input_resolution=(patches_resolution[0] // (2 ** i_layer),
                                                 patches_resolution[1] // (2 ** i_layer)),
                               depth=depths[i_layer],
                               mlp_ratio=self.mlp_ratio,
                               drop=drop_rate,
                               drop_path=dpr[sum(depths[:i_layer]):sum(depths[:i_layer + 1])],
                               norm_layer=norm_layer,
                               # every layer but the last ends with a 2x down-sampling
                               downsample=None if is_last else PatchEmbed,
                               focal_level=focal_levels[i_layer],
                               focal_window=focal_windows[i_layer],
                               use_conv_embed=use_conv_embed,
                               use_checkpoint=use_checkpoint,
                               use_layerscale=use_layerscale,
                               layerscale_value=layerscale_value,
                               use_postln=use_postln,
                               use_postln_in_modulation=use_postln_in_modulation,
                               normalize_modulator=normalize_modulator
                               )
            self.layers.append(layer)

        # Kept for state-dict compatibility; forward() does not apply it.
        self.norm = norm_layer(self.num_features)
        self.apply(self._init_weights)

        # Measure the output channels with a dummy pass. The probe must have
        # `in_chans` channels (not a fixed 3) and needs no autograd graph.
        with torch.no_grad():
            probe = torch.randn(1, in_chans, 640, 640)
            self.channel = [feat.size(1) for feat in self.forward(probe)]

    def _init_weights(self, m):
        """Truncated-normal init for linear layers, unit/zero init for LayerNorm."""
        if isinstance(m, nn.Linear):
            trunc_normal_(m.weight, std=.02)
            if m.bias is not None:
                nn.init.constant_(m.bias, 0)
        elif isinstance(m, nn.LayerNorm):
            nn.init.constant_(m.bias, 0)
            nn.init.constant_(m.weight, 1.0)

    @torch.jit.ignore
    def no_weight_decay(self):
        return {''}

    @torch.jit.ignore
    def no_weight_decay_keywords(self):
        return {''}

    def forward(self, x):
        """Return the 1/4, 1/8, 1/16 and 1/32 scale feature maps as ``(B, C, H, W)`` tensors.

        Raises:
            ValueError: If one of the four scales is never produced, e.g.
                because the input height is not reduced by exactly those factors.
        """
        input_size = x.size(2)
        scale = [4, 8, 16, 32]
        x, H, W = self.patch_embed(x)
        x = self.pos_drop(x)

        # Remember the token grid of every feature: the sequence length alone
        # cannot be turned back into (H, W) for non-square inputs.
        features = [(x, H, W), None, None, None]
        for layer in self.layers:
            x, H, W = layer(x, H, W)
            ratio = input_size // H
            if ratio in scale:
                features[scale.index(ratio)] = (x, H, W)

        missing = [s for s, feat in zip(scale, features) if feat is None]
        if missing:
            raise ValueError(
                f"FocalNet produced no feature map for stride(s) {missing} "
                f"with input height {input_size}"
            )

        # (B, L, C) tokens -> (B, C, H, W) maps using the recorded grid size.
        outputs = []
        for feat, feat_h, feat_w in features:
            outputs.append(torch.transpose(feat, dim0=2, dim1=1).view(-1, feat.size(2), feat_h, feat_w))
        return outputs

    def flops(self):
        """Estimate the FLOPs of the network at the nominal ``img_size``."""
        flops = 0
        flops += self.patch_embed.flops()
        for i, layer in enumerate(self.layers):
            flops += layer.flops()
        flops += self.num_features * self.patches_resolution[0] * self.patches_resolution[1] // (2 ** self.num_layers)
        flops += self.num_features * self.num_classes
        return flops
def focalnet_tiny_srf(pretrained=False, **kwargs):
    """Build FocalNet with depths ``[2, 2, 6, 2]`` and embedding dim 96.

    With ``pretrained=True`` the checkpoint registered under
    ``'focalnet_tiny_srf'`` in ``model_urls`` is downloaded and its compatible
    entries are loaded into the model. Extra keyword arguments are forwarded
    to ``FocalNet``.
    """
    net = FocalNet(depths=[2, 2, 6, 2], embed_dim=96, **kwargs)
    if not pretrained:
        return net

    state = torch.hub.load_state_dict_from_url(
        url=model_urls['focalnet_tiny_srf'], map_location="cpu", check_hash=True
    )
    # Only entries whose name and shape match the model are taken over.
    merged = update_weight(net.state_dict(), state["model"])
    net.load_state_dict(merged)
    return net
def focalnet_tiny_iso(pretrained=False, **kwargs):
    """Build the single-stage FocalNet (12 blocks, patch size 16, embedding dim 192).

    Extra keyword arguments are forwarded to ``FocalNet``.

    Raises:
        KeyError: If ``pretrained`` is true and ``model_urls`` holds no entry
            for ``'focalnet_tiny_iso'``.
    """
    model = FocalNet(depths=[12], patch_size=16, embed_dim=192, **kwargs)
    if pretrained:
        url = model_urls.get('focalnet_tiny_iso')
        if url is None:
            # Say what is wrong instead of surfacing a bare dictionary lookup error.
            raise KeyError("no pretrained checkpoint URL is registered for 'focalnet_tiny_iso' in model_urls")
        checkpoint = torch.hub.load_state_dict_from_url(url=url, map_location="cpu", check_hash=True)
        model.load_state_dict(update_weight(model.state_dict(), checkpoint["model"]))
    return model
def focalnet_large_fl3(pretrained=False, **kwargs):
    """Create the FocalNet variant registered as ``focalnet_large_fl3``.

    Uses depths 2-2-18-2 with embed_dim 192. ``focalnet_large_fl4`` in this
    file is built with the identical arguments; the two differ only in the
    checkpoint URL they look up.

    Args:
        pretrained: If truthy, download ``model_urls['focalnet_large_fl3']``
            (mapped to CPU) and load its ``"model"`` entry via
            ``update_weight``.
        **kwargs: Forwarded unchanged to ``FocalNet``.

    Returns:
        The ``FocalNet`` instance.
    """
    model = FocalNet(depths=[2, 2, 18, 2], embed_dim=192, **kwargs)
    if pretrained:
        checkpoint = torch.hub.load_state_dict_from_url(
            url=model_urls['focalnet_large_fl3'], map_location="cpu"
        )
        current = model.state_dict()
        model.load_state_dict(update_weight(current, checkpoint["model"]))
    return model
class LSKblock(nn.Module):
    """Spatial gating unit that blends two depthwise-conv branches.

    A 5x5 depthwise convolution feeds a 7x7 depthwise convolution with
    dilation 3. Each branch is reduced to ``dim // 2`` channels by a 1x1
    convolution. A 2-channel sigmoid map, computed from the channel-wise mean
    and max of the concatenated branches, weights the two branches before a
    final 1x1 convolution restores ``dim`` channels. The result multiplies
    the input element-wise.
    """

    def __init__(self, dim):
        super().__init__()
        half = dim // 2
        self.conv0 = nn.Conv2d(dim, dim, 5, padding=2, groups=dim)
        self.conv_spatial = nn.Conv2d(dim, dim, 7, stride=1, padding=9, groups=dim, dilation=3)
        self.conv1 = nn.Conv2d(dim, half, 1)
        self.conv2 = nn.Conv2d(dim, half, 1)
        self.conv_squeeze = nn.Conv2d(2, 2, 7, padding=3)
        self.conv = nn.Conv2d(half, dim, 1)

    def forward(self, x):
        """Return ``x`` scaled by the learned spatial attention map."""
        near = self.conv0(x)
        # The dilated branch consumes the 5x5 output *before* it is reduced.
        far = self.conv2(self.conv_spatial(near))
        near = self.conv1(near)

        stacked = torch.cat([near, far], dim=1)
        pooled = torch.cat(
            [
                stacked.mean(dim=1, keepdim=True),
                stacked.max(dim=1, keepdim=True)[0],
            ],
            dim=1,
        )
        gate = torch.sigmoid(self.conv_squeeze(pooled))

        # gate[:, i:i + 1] keeps the channel axis, so it broadcasts over channels.
        blended = near * gate[:, 0:1] + far * gate[:, 1:2]
        return x * self.conv(blended)
class OverlapPatchEmbed(nn.Module):
    """Image to patch embedding: one strided convolution followed by BatchNorm.

    The convolution pads by half the kernel size on each axis.
    ``img_size`` and ``norm_cfg`` are accepted for signature compatibility
    but are not used by this implementation.
    """

    def __init__(self, img_size=224, patch_size=7, stride=4, in_chans=3, embed_dim=768, norm_cfg=None):
        super().__init__()
        kernel_h, kernel_w = to_2tuple(patch_size)
        self.proj = nn.Conv2d(
            in_chans,
            embed_dim,
            kernel_size=(kernel_h, kernel_w),
            stride=stride,
            padding=(kernel_h // 2, kernel_w // 2),
        )
        self.norm = nn.BatchNorm2d(embed_dim)

    def forward(self, x):
        """Return ``(normalised_features, H, W)`` where H, W are the projected spatial dims."""
        projected = self.proj(x)
        height, width = projected.shape[2:]
        return self.norm(projected), height, width
class DWConv(nn.Module):
    """3x3 depthwise convolution (stride 1, padding 1, with bias) over ``dim`` channels."""

    def __init__(self, dim=768):
        super().__init__()
        self.dwconv = nn.Conv2d(
            in_channels=dim,
            out_channels=dim,
            kernel_size=3,
            stride=1,
            padding=1,
            groups=dim,  # one filter per channel
            bias=True,
        )

    def forward(self, x):
        """Apply the depthwise convolution; spatial size is unchanged."""
        return self.dwconv(x)
def lsknet_t(weights=''):
    """Build the ``lsknet_t`` backbone (embed_dims 32/64/160/256, depths 3/3/5/2).

    Args:
        weights: Optional checkpoint path. When non-empty, the file is read
            with ``torch.load`` and its ``'state_dict'`` entry is merged into
            the model through ``update_weight``, which copies only entries
            whose name and shape match.

    Returns:
        The constructed ``LSKNet``.
    """
    model = LSKNet(embed_dims=[32, 64, 160, 256], depths=[3, 3, 5, 2], drop_rate=0.1, drop_path_rate=0.1)
    if weights:
        # Load onto the CPU so a checkpoint saved from a GPU process can still
        # be read on a host without CUDA; load_state_dict copies the values
        # into the (CPU) parameters afterwards either way.
        checkpoint = torch.load(weights, map_location='cpu')
        model.load_state_dict(update_weight(model.state_dict(), checkpoint['state_dict']))
    return model


def lsknet_s(weights=''):
    """Build the ``lsknet_s`` backbone (embed_dims 64/128/256/512, depths 2/2/4/2).

    Args:
        weights: Optional checkpoint path, handled exactly as in ``lsknet_t``:
            the ``'state_dict'`` entry is merged through ``update_weight``.

    Returns:
        The constructed ``LSKNet``.
    """
    model = LSKNet(embed_dims=[64, 128, 256, 512], depths=[2, 2, 4, 2], drop_rate=0.1, drop_path_rate=0.1)
    if weights:
        # See lsknet_t: map to CPU so GPU-saved checkpoints load everywhere.
        checkpoint = torch.load(weights, map_location='cpu')
        model.load_state_dict(update_weight(model.state_dict(), checkpoint['state_dict']))
    return model
# Layer table for the "MobileNetV4ConvMedium" entry of MODEL_SPECS.
#
# Each stage names a block type and lists one row of positional values per
# block; build_blocks() zips every row with the schema for that block type:
#   convbn   -> [inp, oup, kernel_size, stride]
#   fused_ib -> [inp, oup, stride, expand_ratio, act]
#   uib      -> [inp, oup, start_dw_kernel_size, middle_dw_kernel_size,
#                middle_dw_downsample, stride, expand_ratio]
# For uib rows a kernel size of 0 disables that depthwise conv.
# build_blocks() reads rows 0..num_blocks-1, so num_blocks must not exceed
# the number of rows in block_specs.
MNV4ConvMedium_BLOCK_SPECS = {
    "conv0": {
        "block_name": "convbn",
        "num_blocks": 1,
        "block_specs": [
            [3, 32, 3, 2]
        ]
    },
    "layer1": {
        "block_name": "fused_ib",
        "num_blocks": 1,
        "block_specs": [
            [32, 48, 2, 4.0, True]
        ]
    },
    "layer2": {
        "block_name": "uib",
        "num_blocks": 2,
        "block_specs": [
            [48, 80, 3, 5, True, 2, 4],
            [80, 80, 3, 3, True, 1, 2]
        ]
    },
    "layer3": {
        "block_name": "uib",
        "num_blocks": 8,
        "block_specs": [
            [80, 160, 3, 5, True, 2, 6],
            [160, 160, 3, 3, True, 1, 4],
            [160, 160, 3, 3, True, 1, 4],
            [160, 160, 3, 5, True, 1, 4],
            [160, 160, 3, 3, True, 1, 4],
            [160, 160, 3, 0, True, 1, 4],
            [160, 160, 0, 0, True, 1, 2],
            [160, 160, 3, 0, True, 1, 4]
        ]
    },
    "layer4": {
        "block_name": "uib",
        "num_blocks": 11,
        "block_specs": [
            [160, 256, 5, 5, True, 2, 6],
            [256, 256, 5, 5, True, 1, 4],
            [256, 256, 3, 5, True, 1, 4],
            [256, 256, 3, 5, True, 1, 4],
            [256, 256, 0, 0, True, 1, 4],
            [256, 256, 3, 0, True, 1, 4],
            [256, 256, 3, 5, True, 1, 2],
            [256, 256, 5, 5, True, 1, 4],
            [256, 256, 0, 0, True, 1, 4],
            [256, 256, 0, 0, True, 1, 4],
            [256, 256, 5, 0, True, 1, 2]
        ]
    },
    "layer5": {
        "block_name": "convbn",
        "num_blocks": 2,
        "block_specs": [
            [256, 960, 1, 1],
            [960, 1280, 1, 1]
        ]
    }
}
"num_blocks": 13, "block_specs": [ [192, 512, 5, 5, True, 2, 4], [512, 512, 5, 5, True, 1, 4], [512, 512, 5, 5, True, 1, 4], [512, 512, 5, 5, True, 1, 4], [512, 512, 5, 0, True, 1, 4], [512, 512, 5, 3, True, 1, 4], [512, 512, 5, 0, True, 1, 4], [512, 512, 5, 0, True, 1, 4], [512, 512, 5, 3, True, 1, 4], [512, 512, 5, 5, True, 1, 4], [512, 512, 5, 0, True, 1, 4], [512, 512, 5, 0, True, 1, 4], [512, 512, 5, 0, True, 1, 4] ] }, "layer5": { "block_name": "convbn", "num_blocks": 2, "block_specs": [ [512, 960, 1, 1], [960, 1280, 1, 1] ] } } MNV4HybridConvMedium_BLOCK_SPECS = { } MNV4HybridConvLarge_BLOCK_SPECS = { } MODEL_SPECS = { "MobileNetV4ConvSmall": MNV4ConvSmall_BLOCK_SPECS, "MobileNetV4ConvMedium": MNV4ConvMedium_BLOCK_SPECS, "MobileNetV4ConvLarge": MNV4ConvLarge_BLOCK_SPECS, "MobileNetV4HybridMedium": MNV4HybridConvMedium_BLOCK_SPECS, "MobileNetV4HybridLarge": MNV4HybridConvLarge_BLOCK_SPECS, } def make_divisible( value: float, divisor: int, min_value: Optional[float] = None, round_down_protect: bool = True, ) -> int: """ This function is copied from here "https://github.com/tensorflow/models/blob/master/official/vision/modeling/layers/nn_layers.py" This is to ensure that all layers have channels that are divisible by 8. Args: value: A `float` of original value. divisor: An `int` of the divisor that need to be checked upon. min_value: A `float` of minimum value threshold. round_down_protect: A `bool` indicating whether round down more than 10% will be allowed. Returns: The adjusted value in `int` that is divisible against divisor. """ if min_value is None: min_value = divisor new_value = max(min_value, int(value + divisor / 2) // divisor * divisor) # Make sure that round down does not go down by more than 10%. 
def conv_2d(inp, oup, kernel_size=3, stride=1, groups=1, bias=False, norm=True, act=True):
    """Assemble a Conv2d -> (BatchNorm2d) -> (ReLU6) stack as a named ``nn.Sequential``.

    Args:
        inp: Input channel count.
        oup: Output channel count.
        kernel_size: Square kernel size; padding is ``(kernel_size - 1) // 2``.
        stride: Convolution stride.
        groups: Convolution groups.
        bias: Whether the convolution carries a bias term.
        norm: Append a ``BatchNorm2d`` registered as ``'BatchNorm2d'``.
        act: Append a ``ReLU6`` registered as ``'Activation'``.

    Returns:
        ``nn.Sequential`` whose first child is registered as ``'conv'``.
    """
    stages = [
        ('conv', nn.Conv2d(inp, oup, kernel_size, stride, (kernel_size - 1) // 2,
                           bias=bias, groups=groups)),
    ]
    if norm:
        stages.append(('BatchNorm2d', nn.BatchNorm2d(oup)))
    if act:
        stages.append(('Activation', nn.ReLU6()))

    block = nn.Sequential()
    for label, layer in stages:
        block.add_module(label, layer)
    return block
def build_blocks(layer_spec):
    """Turn one stage description into an ``nn.Sequential`` of blocks.

    Args:
        layer_spec: Mapping with the keys
            ``'block_name'`` (``"convbn"``, ``"uib"`` or ``"fused_ib"``),
            ``'num_blocks'`` (how many rows to instantiate) and
            ``'block_specs'`` (one list of positional values per block).
            A mapping without a truthy ``'block_name'`` yields an empty
            ``nn.Sequential``.

    Returns:
        ``nn.Sequential`` whose children are registered as
        ``"<block_name>_<index>"``.

    Raises:
        NotImplementedError: If ``'block_name'`` is not a supported type.
    """
    block_name = layer_spec.get('block_name')
    if not block_name:
        return nn.Sequential()

    # Keyword names each row of 'block_specs' is zipped with, per block type.
    schemas = {
        "convbn": ('inp', 'oup', 'kernel_size', 'stride'),
        "uib": ('inp', 'oup', 'start_dw_kernel_size', 'middle_dw_kernel_size',
                'middle_dw_downsample', 'stride', 'expand_ratio'),
        "fused_ib": ('inp', 'oup', 'stride', 'expand_ratio', 'act'),
    }
    if not isinstance(block_name, str) or block_name not in schemas:
        raise NotImplementedError(
            f"unsupported block_name {block_name!r}; expected one of {sorted(schemas)}"
        )

    # Resolved only after validation so an unknown name fails with the
    # message above rather than anything related to the builders.
    builders = {
        "convbn": conv_2d,
        "uib": UniversalInvertedBottleneckBlock,
        "fused_ib": InvertedResidual,
    }
    schema, builder = schemas[block_name], builders[block_name]

    layers = nn.Sequential()
    for i in range(layer_spec['num_blocks']):
        kwargs = dict(zip(schema, layer_spec['block_specs'][i]))
        layers.add_module(f"{block_name}_{i}", builder(**kwargs))
    return layers
def MobileNetV4HybridMedium():
    """Build the ``MobileNetV4HybridMedium`` backbone.

    The spec table registered for this variant in ``MODEL_SPECS`` is an empty
    dict in this file, and ``MobileNetV4.__init__`` indexes ``spec['conv0']``,
    so construction used to fail with ``KeyError: 'conv0'``. While the table
    is empty this factory raises an explicit error instead; once the table is
    populated it builds the model normally.

    Returns:
        The constructed ``MobileNetV4``.

    Raises:
        NotImplementedError: If no block specs are defined for this variant.
    """
    if not MODEL_SPECS['MobileNetV4HybridMedium']:
        raise NotImplementedError(
            "MobileNetV4HybridMedium has no block specs defined in MODEL_SPECS"
        )
    return MobileNetV4('MobileNetV4HybridMedium')


def MobileNetV4HybridLarge():
    """Build the ``MobileNetV4HybridLarge`` backbone.

    Behaves like ``MobileNetV4HybridMedium``: the registered spec table is
    currently empty, so an explicit error is raised instead of the
    ``KeyError: 'conv0'`` that ``MobileNetV4.__init__`` would produce.

    Returns:
        The constructed ``MobileNetV4``.

    Raises:
        NotImplementedError: If no block specs are defined for this variant.
    """
    if not MODEL_SPECS['MobileNetV4HybridLarge']:
        raise NotImplementedError(
            "MobileNetV4HybridLarge has no block specs defined in MODEL_SPECS"
        )
    return MobileNetV4('MobileNetV4HybridLarge')
from functools import partial import numpy as np import torch import torch.utils.checkpoint as checkpoint from einops import rearrange from timm.models.layers import DropPath, trunc_normal_ from torch import nn __all__ = ['nextvit_small', 'nextvit_base', 'nextvit_large'] NORM_EPS = 1e-5 class ConvBNReLU(nn.Module): def __init__( self, in_channels, out_channels, kernel_size, stride, groups=1): super(ConvBNReLU, self).__init__() self.conv = nn.Conv2d(in_channels, out_channels, kernel_size=kernel_size, stride=stride, padding=1, groups=groups, bias=False) self.norm = nn.BatchNorm2d(out_channels, eps=NORM_EPS) self.act = nn.ReLU(inplace=True) def forward(self, x): x = self.conv(x) x = self.norm(x) x = self.act(x) return x def _make_divisible(v, divisor, min_value=None): if min_value is None: min_value = divisor new_v = max(min_value, int(v + divisor / 2) // divisor * divisor) # Make sure that round down does not go down by more than 10%. if new_v < 0.9 * v: new_v += divisor return new_v class PatchEmbed(nn.Module): def __init__(self, in_channels, out_channels, stride=1): super(PatchEmbed, self).__init__() norm_layer = partial(nn.BatchNorm2d, eps=NORM_EPS) if stride == 2: self.avgpool = nn.AvgPool2d((2, 2), stride=2, ceil_mode=True, count_include_pad=False) self.conv = nn.Conv2d(in_channels, out_channels, kernel_size=1, stride=1, bias=False) self.norm = norm_layer(out_channels) elif in_channels != out_channels: self.avgpool = nn.Identity() self.conv = nn.Conv2d(in_channels, out_channels, kernel_size=1, stride=1, bias=False) self.norm = norm_layer(out_channels) else: self.avgpool = nn.Identity() self.conv = nn.Identity() self.norm = nn.Identity() def forward(self, x): return self.norm(self.conv(self.avgpool(x))) class MHCA(nn.Module): """ Multi-Head Convolutional Attention """ def __init__(self, out_channels, head_dim): super(MHCA, self).__init__() norm_layer = partial(nn.BatchNorm2d, eps=NORM_EPS) self.group_conv3x3 = nn.Conv2d(out_channels, out_channels, kernel_size=3, 
class E_MHSA(nn.Module):
    """
    Efficient Multi-Head Self Attention.

    Queries come from the full token sequence. When ``sr_ratio > 1`` the
    keys and values come from a copy of the sequence that is average-pooled
    along the token axis by ``sr_ratio ** 2`` and batch-normalised.
    """

    def __init__(self, dim, out_dim=None, head_dim=32, qkv_bias=True, qk_scale=None,
                 attn_drop=0, proj_drop=0., sr_ratio=1):
        super().__init__()
        self.dim = dim
        self.out_dim = dim if out_dim is None else out_dim
        self.num_heads = self.dim // head_dim
        self.scale = qk_scale or head_dim ** -0.5

        self.q = nn.Linear(dim, self.dim, bias=qkv_bias)
        self.k = nn.Linear(dim, self.dim, bias=qkv_bias)
        self.v = nn.Linear(dim, self.dim, bias=qkv_bias)
        self.proj = nn.Linear(self.dim, self.out_dim)
        self.attn_drop = nn.Dropout(attn_drop)
        self.proj_drop = nn.Dropout(proj_drop)

        self.sr_ratio = sr_ratio
        self.N_ratio = sr_ratio ** 2
        if sr_ratio > 1:
            # Token-axis pooling and its norm exist only when reduction is requested.
            self.sr = nn.AvgPool1d(kernel_size=self.N_ratio, stride=self.N_ratio)
            self.norm = nn.BatchNorm1d(dim, eps=NORM_EPS)
        self.is_bn_merged = False

    def forward(self, x):
        """Attend over ``x`` of shape (B, N, C); the attention output is reshaped to (B, N, C) before ``proj``."""
        B, N, C = x.shape
        heads = self.num_heads
        per_head = int(C // heads)

        q = self.q(x).reshape(B, N, heads, per_head).permute(0, 2, 1, 3)

        kv_source = x
        if self.sr_ratio > 1:
            kv_source = self.sr(x.transpose(1, 2))
            # The norm is skipped during ONNX export or once BN has been merged.
            if not (torch.onnx.is_in_onnx_export() or self.is_bn_merged):
                kv_source = self.norm(kv_source)
            kv_source = kv_source.transpose(1, 2)

        # k is laid out (B, heads, per_head, tokens) so that q @ k gives the scores.
        k = self.k(kv_source).reshape(B, -1, heads, per_head).permute(0, 2, 3, 1)
        v = self.v(kv_source).reshape(B, -1, heads, per_head).permute(0, 2, 1, 3)

        scores = ((q @ k) * self.scale).softmax(dim=-1)
        scores = self.attn_drop(scores)

        out = (scores @ v).transpose(1, 2).reshape(B, N, C)
        return self.proj_drop(self.proj(out))
_make_divisible(int(out_channels * mix_block_ratio), 32) self.mhca_out_channels = out_channels - self.mhsa_out_channels self.patch_embed = PatchEmbed(in_channels, self.mhsa_out_channels, stride) self.norm1 = norm_func(self.mhsa_out_channels) self.e_mhsa = E_MHSA(self.mhsa_out_channels, head_dim=head_dim, sr_ratio=sr_ratio, attn_drop=attn_drop, proj_drop=drop) self.mhsa_path_dropout = DropPath(path_dropout * mix_block_ratio) self.projection = PatchEmbed(self.mhsa_out_channels, self.mhca_out_channels, stride=1) self.mhca = MHCA(self.mhca_out_channels, head_dim=head_dim) self.mhca_path_dropout = DropPath(path_dropout * (1 - mix_block_ratio)) self.norm2 = norm_func(out_channels) self.mlp = Mlp(out_channels, mlp_ratio=mlp_ratio, drop=drop) self.mlp_path_dropout = DropPath(path_dropout) self.is_bn_merged = False def forward(self, x): x = self.patch_embed(x) B, C, H, W = x.shape if not torch.onnx.is_in_onnx_export() and not self.is_bn_merged: out = self.norm1(x) else: out = x out = rearrange(out, "b c h w -> b (h w) c") # b n c out = self.mhsa_path_dropout(self.e_mhsa(out)) x = x + rearrange(out, "b (h w) c -> b c h w", h=H) out = self.projection(x) out = out + self.mhca_path_dropout(self.mhca(out)) x = torch.cat([x, out], dim=1) if not torch.onnx.is_in_onnx_export() and not self.is_bn_merged: out = self.norm2(x) else: out = x x = x + self.mlp_path_dropout(self.mlp(out)) return x class NextViT(nn.Module): def __init__(self, stem_chs, depths, path_dropout, attn_drop=0, drop=0, num_classes=1000, strides=[1, 2, 2, 2], sr_ratios=[8, 4, 2, 1], head_dim=32, mix_block_ratio=0.75, use_checkpoint=False): super(NextViT, self).__init__() self.use_checkpoint = use_checkpoint self.stage_out_channels = [[96] * (depths[0]), [192] * (depths[1] - 1) + [256], [384, 384, 384, 384, 512] * (depths[2] // 5), [768] * (depths[3] - 1) + [1024]] # Next Hybrid Strategy self.stage_block_types = [[NCB] * depths[0], [NCB] * (depths[1] - 1) + [NTB], [NCB, NCB, NCB, NCB, NTB] * (depths[2] // 5), [NCB] * 
(depths[3] - 1) + [NTB]] self.stem = nn.Sequential( ConvBNReLU(3, stem_chs[0], kernel_size=3, stride=2), ConvBNReLU(stem_chs[0], stem_chs[1], kernel_size=3, stride=1), ConvBNReLU(stem_chs[1], stem_chs[2], kernel_size=3, stride=1), ConvBNReLU(stem_chs[2], stem_chs[2], kernel_size=3, stride=2), ) input_channel = stem_chs[-1] features = [] idx = 0 dpr = [x.item() for x in torch.linspace(0, path_dropout, sum(depths))] # stochastic depth decay rule for stage_id in range(len(depths)): numrepeat = depths[stage_id] output_channels = self.stage_out_channels[stage_id] block_types = self.stage_block_types[stage_id] for block_id in range(numrepeat): if strides[stage_id] == 2 and block_id == 0: stride = 2 else: stride = 1 output_channel = output_channels[block_id] block_type = block_types[block_id] if block_type is NCB: layer = NCB(input_channel, output_channel, stride=stride, path_dropout=dpr[idx + block_id], drop=drop, head_dim=head_dim) features.append(layer) elif block_type is NTB: layer = NTB(input_channel, output_channel, path_dropout=dpr[idx + block_id], stride=stride, sr_ratio=sr_ratios[stage_id], head_dim=head_dim, mix_block_ratio=mix_block_ratio, attn_drop=attn_drop, drop=drop) features.append(layer) input_channel = output_channel idx += numrepeat self.features = nn.Sequential(*features) self.norm = nn.BatchNorm2d(output_channel, eps=NORM_EPS) self.stage_out_idx = [sum(depths[:idx + 1]) - 1 for idx in range(len(depths))] self.channel = [i.size(1) for i in self.forward(torch.randn(1, 3, 640, 640))] self._initialize_weights() def _initialize_weights(self): for n, m in self.named_modules(): if isinstance(m, (nn.BatchNorm2d, nn.GroupNorm, nn.LayerNorm, nn.BatchNorm1d)): nn.init.constant_(m.weight, 1.0) nn.init.constant_(m.bias, 0) elif isinstance(m, nn.Linear): trunc_normal_(m.weight, std=.02) if hasattr(m, 'bias') and m.bias is not None: nn.init.constant_(m.bias, 0) elif isinstance(m, nn.Conv2d): trunc_normal_(m.weight, std=.02) if hasattr(m, 'bias') and m.bias is not 
def update_weight(model_dict, weight_dict):
    """Copy every compatible entry of ``weight_dict`` into ``model_dict``.

    An entry is compatible when its key already exists in ``model_dict`` and
    both values have the same shape. ``model_dict`` is updated in place and
    returned; a one-line summary of how many entries were taken is printed.
    """
    matched = {
        name: value
        for name, value in weight_dict.items()
        if name in model_dict and np.shape(model_dict[name]) == np.shape(value)
    }
    idx = len(matched)
    model_dict.update(matched)
    print(f'loading weights... {idx}/{len(model_dict)} items')
    return model_dict
class ConvBNReLU(nn.Sequential):
    """Bias-free Conv2d, then ``norm_layer(out_planes)``, then in-place ReLU6.

    Padding is ``(kernel_size - 1) // 2``. Children are positional, so
    state-dict keys start with ``0.``, ``1.``, ``2.``.
    """

    def __init__(self, in_planes, out_planes, kernel_size=3, stride=1, groups=1, norm_layer=nn.BatchNorm2d):
        conv = nn.Conv2d(
            in_planes,
            out_planes,
            kernel_size,
            stride,
            (kernel_size - 1) // 2,
            groups=groups,
            bias=False,
        )
        super().__init__(conv, norm_layer(out_planes), nn.ReLU6(inplace=True))
def forward(self, x):
    """Apply ``self.conv``; add the input back when ``self.use_res_connect`` is set."""
    out = self.conv(x)
    return x + out if self.use_res_connect else out
def forward(self, x):
    """Collect the feature maps found at strides 4, 8, 16 and 32.

    Every layer of ``self.features`` is applied in order.  After each layer
    the ratio ``input_height // current_height`` is computed; when it equals
    one of the tracked strides the current tensor overwrites that stride's
    slot, so a slot ends up holding the last tensor produced at its stride.
    A slot is left as ``None`` when no layer output matches it.

    Returns:
        list: four entries ordered by stride ``[4, 8, 16, 32]``.
    """
    strides = (4, 8, 16, 32)
    outputs = [None] * len(strides)
    base_height = x.size(2)
    for stage in self.features:
        x = stage(x)
        ratio = base_height // x.size(2)
        if ratio in strides:
            outputs[strides.index(ratio)] = x
    return outputs
def od_mobilenetv2_050(weights=None, kernel_num=1):
    """Build an ``OD_MobileNetV2`` with ``width_mult=0.5``.

    Args:
        weights: optional checkpoint path.  When given, the file is loaded on
            the CPU, its ``'state_dict'`` entry is merged into the model's
            own state dict by ``update_weight`` and the result is loaded.
        kernel_num: forwarded to ``OD_MobileNetV2``.

    Returns:
        The constructed model.
    """
    net = OD_MobileNetV2(width_mult=0.5, kernel_num=kernel_num)
    if weights is None:
        return net
    # NOTE(review): torch.load unpickles the file -- only pass checkpoints
    # that come from a trusted source.
    pretrained = torch.load(weights, map_location='cpu')['state_dict']
    merged = update_weight(net.state_dict(), pretrained)
    net.load_state_dict(merged)
    return net
class Bottleneck(nn.Module):
    """Residual block made of a 1x1, a 3x3 and a 1x1 ODConv layer.

    The 3x3 convolution carries the stride and the final 1x1 convolution
    widens the output to ``planes * expansion`` channels.  ``downsample``,
    when provided, is applied to the input before it is added to the output.
    """

    expansion = 4

    def __init__(self, inplanes, planes, stride=1, downsample=None, reduction=0.0625, kernel_num=1):
        super(Bottleneck, self).__init__()
        od_kwargs = dict(reduction=reduction, kernel_num=kernel_num)
        wide_planes = planes * self.expansion
        # Sub-modules are created in the same order as before so that
        # registration order and random initialization are unchanged.
        self.conv1 = odconv1x1(inplanes, planes, **od_kwargs)
        self.bn1 = nn.BatchNorm2d(planes)
        self.conv2 = odconv3x3(planes, planes, stride, **od_kwargs)
        self.bn2 = nn.BatchNorm2d(planes)
        self.conv3 = odconv1x1(planes, wide_planes, **od_kwargs)
        self.bn3 = nn.BatchNorm2d(wide_planes)
        self.relu = nn.ReLU(inplace=True)
        self.downsample = downsample
        self.stride = stride

    def forward(self, x):
        out = self.relu(self.bn1(self.conv1(x)))
        out = self.relu(self.bn2(self.conv2(out)))
        out = self.bn3(self.conv3(out))

        shortcut = x if self.downsample is None else self.downsample(x)
        out += shortcut
        return self.relu(out)
def _make_layer(self, block, planes, blocks, stride=1, reduction=0.625, kernel_num=1):
    """Stack ``blocks`` instances of ``block`` into one ``nn.Sequential`` stage.

    Only the first block receives ``stride``.  It also receives a shortcut
    projection (1x1 bias-free conv + batch norm) when ``stride`` is not 1 or
    ``self.inplanes`` differs from ``planes * block.expansion``.  As a side
    effect ``self.inplanes`` is set to ``planes * block.expansion``.
    """
    # NOTE(review): the default here is 0.625 while the other ``reduction``
    # defaults in this file are 0.0625.  The constructor always passes the
    # value explicitly, so the default is left untouched -- confirm intent.
    out_planes = planes * block.expansion
    od_kwargs = dict(reduction=reduction, kernel_num=kernel_num)

    shortcut = None
    if not (stride == 1 and self.inplanes == out_planes):
        shortcut = nn.Sequential(
            nn.Conv2d(self.inplanes, out_planes, kernel_size=1, stride=stride, padding=0, bias=False),
            nn.BatchNorm2d(out_planes),
        )

    stage = [block(self.inplanes, planes, stride, shortcut, **od_kwargs)]
    self.inplanes = out_planes
    for _ in range(1, blocks):
        stage.append(block(self.inplanes, planes, **od_kwargs))
    return nn.Sequential(*stage)
def od_resnet18(weights=None, kernel_num=1):
    """Build an ``OD_ResNet`` from ``BasicBlock`` with stage depths ``[2, 2, 2, 2]``.

    If ``weights`` is not ``None`` it is treated as a checkpoint path: the
    file is loaded on the CPU and its ``'state_dict'`` entry is merged into
    the model through ``update_weight`` before being loaded.
    """
    stage_depths = [2, 2, 2, 2]
    model = OD_ResNet(BasicBlock, stage_depths, kernel_num=kernel_num)
    if weights is not None:
        # NOTE(review): torch.load unpickles the file -- only use trusted checkpoints.
        state = torch.load(weights, map_location='cpu')['state_dict']
        model.load_state_dict(update_weight(model.state_dict(), state))
    return model
def get_kernel_attention(self, x):
    """Return softmax-normalized weights along dimension 1 of the kernel logits.

    The output of ``self.kernel_fc`` is reshaped to
    ``(batch, -1, 1, 1, 1, 1)``, divided by ``self.temperature`` and passed
    through a softmax over dimension 1.
    """
    logits = self.kernel_fc(x).view(x.size(0), -1, 1, 1, 1, 1)
    return F.softmax(logits / self.temperature, dim=1)
def __init__(self, in_planes, out_planes, kernel_size, stride=1, padding=0, dilation=1, groups=1,
             reduction=0.0625, kernel_num=4):
    """Create the attention branch and the bank of ``kernel_num`` kernels.

    ``self.weight`` has shape
    ``(kernel_num, out_planes, in_planes // groups, kernel_size, kernel_size)``.
    The forward implementation is chosen once here: the point-wise shortcut
    when both ``kernel_size`` and ``kernel_num`` equal 1, the general path
    otherwise.
    """
    super(ODConv2d, self).__init__()
    # Plain configuration, stored exactly as given.
    self.in_planes, self.out_planes = in_planes, out_planes
    self.kernel_size = kernel_size
    self.stride, self.padding, self.dilation = stride, padding, dilation
    self.groups = groups
    self.kernel_num = kernel_num

    # The attention module is built before the weight so that the order of
    # random initialization stays the same.
    self.attention = Attention(in_planes, out_planes, kernel_size, groups=groups,
                               reduction=reduction, kernel_num=kernel_num)
    weight_shape = (kernel_num, out_planes, in_planes // groups, kernel_size, kernel_size)
    self.weight = nn.Parameter(torch.randn(*weight_shape), requires_grad=True)
    self._initialize_weights()

    single_pointwise = self.kernel_size == 1 and self.kernel_num == 1
    self._forward_impl = self._forward_impl_pw1x if single_pointwise else self._forward_impl_common
def _forward_impl_pw1x(self, x):
    """Convolve with the single stored kernel, scaled by channel and filter attention.

    Only the first two values returned by ``self.attention`` are used: the
    input is multiplied by the channel attention before the convolution and
    the result is multiplied by the filter attention afterwards.  The
    leading dimension of ``self.weight`` is squeezed away to obtain the
    convolution kernel.
    """
    channel_att, filter_att, _, _ = self.attention(x)
    kernel = self.weight.squeeze(dim=0)
    out = F.conv2d(
        x * channel_att,
        weight=kernel,
        bias=None,
        stride=self.stride,
        padding=self.padding,
        dilation=self.dilation,
        groups=self.groups,
    )
    return out * filter_att
class ConvBNReLU(nn.Sequential):
    """Bias-free ``Conv2d`` followed by a normalization layer and ``ReLU6``.

    The convolution is padded by ``(kernel_size - 1) // 2`` on each side and
    ``norm_layer`` is instantiated with ``out_planes`` as its only argument.
    """

    def __init__(self, in_planes, out_planes, kernel_size=3, stride=1, groups=1, norm_layer=nn.BatchNorm2d):
        padding = (kernel_size - 1) // 2
        super(ConvBNReLU, self).__init__(
            nn.Conv2d(in_planes, out_planes, kernel_size, stride, padding, groups=groups, bias=False),
            norm_layer(out_planes),
            nn.ReLU6(inplace=True)
        )

    def fuse(self):
        """Fold the normalization layer into the convolution, in place.

        The previous implementation assigned a new ``nn.Sequential`` to the
        local name ``self``; that only rebinds the name inside the method and
        leaves the module untouched, so nothing was ever fused.  The
        sub-modules are now replaced on the container itself: the convolution
        becomes the result of ``fuse_conv_bn`` and the normalization layer is
        removed, leaving ``[fused conv, ReLU6]``.

        Calling ``fuse`` again on an already fused module does nothing.
        """
        if len(self) < 3:
            # Already fused: only the convolution and the activation remain.
            return
        self[0] = fuse_conv_bn(self[0], self[1])
        del self[1]
def forward(self, x):
    """Return ``self.conv(x)``, plus ``x`` itself when the residual connection is enabled."""
    if not self.use_res_connect:
        return self.conv(x)
    return x + self.conv(x)
def net_update_temperature(self, temperature):
    """Pass ``temperature`` to every sub-module that defines ``update_temperature``."""
    adjustable = (mod for mod in self.modules() if hasattr(mod, "update_temperature"))
    for mod in adjustable:
        mod.update_temperature(temperature)
def odconv3x3(in_planes, out_planes, stride=1, reduction=0.0625, kernel_num=1):
    """Build an ``ODConv2d`` with a 3x3 kernel and padding 1."""
    conv_kwargs = dict(
        kernel_size=3,
        stride=stride,
        padding=1,
        reduction=reduction,
        kernel_num=kernel_num,
    )
    return ODConv2d(in_planes, out_planes, **conv_kwargs)
def forward(self, x):
    """Run conv1-bn1-relu and conv2-bn2, add the shortcut, then apply relu.

    The shortcut is ``self.downsample(x)`` when a downsample module is set
    and ``x`` itself otherwise; it is added to the main branch in place.
    """
    out = self.relu(self.bn1(self.conv1(x)))
    out = self.bn2(self.conv2(out))

    shortcut = self.downsample(x) if self.downsample is not None else x
    out += shortcut
    return self.relu(out)
def forward(self, x):
    """Return the stem activation followed by the outputs of ``layer1``..``layer4``.

    The stem is ``conv1 -> bn1 -> relu``; its output is max-pooled before
    being fed to ``layer1``, and each later layer consumes the previous
    layer's output.

    Returns:
        list: five entries, ``[stem, layer1, layer2, layer3, layer4]``.
    """
    stem = self.relu(self.bn1(self.conv1(x)))
    feats = [stem]
    x = self.maxpool(stem)
    for stage in (self.layer1, self.layer2, self.layer3, self.layer4):
        x = stage(x)
        feats.append(x)
    return feats
def od_resnet50(weights=None, kernel_num=1):
    """Build an ``OD_ResNet`` from ``Bottleneck`` with stage depths ``[3, 4, 6, 3]``.

    Args:
        weights: optional checkpoint path.  When given, the file is loaded on
            the CPU and its ``'state_dict'`` entry is merged into the model's
            state dict by ``update_weight`` before being loaded.
        kernel_num: forwarded to ``OD_ResNet``.
    """
    net = OD_ResNet(Bottleneck, [3, 4, 6, 3], kernel_num=kernel_num)
    if weights is None:
        return net
    # NOTE(review): torch.load unpickles the file -- only use trusted checkpoints.
    pretrained = torch.load(weights, map_location='cpu')['state_dict']
    net.load_state_dict(update_weight(net.state_dict(), pretrained))
    return net
def _initialize_weights(self):
    """Initialize every ``Conv2d`` and ``BatchNorm2d`` found in ``self.modules()``.

    Convolution weights get Kaiming-normal initialization (``fan_out``,
    ``relu``) and their biases, when present, are zeroed.  Batch-norm weights
    are set to 1 and biases to 0.
    """
    for layer in self.modules():
        is_conv = isinstance(layer, nn.Conv2d)
        is_bn = isinstance(layer, nn.BatchNorm2d)
        if is_conv:
            nn.init.kaiming_normal_(layer.weight, mode='fan_out', nonlinearity='relu')
            if layer.bias is not None:
                nn.init.constant_(layer.bias, 0)
        if is_bn:
            nn.init.constant_(layer.weight, 1)
            nn.init.constant_(layer.bias, 0)
class Attention(nn.Module):
    """Attention branch of ODConv.

    A globally pooled descriptor of the input goes through a 1x1 conv,
    batch norm and ReLU, and is then mapped by up to four 1x1 heads to
    channel, filter, spatial and kernel attentions. A head that is not needed
    for the given configuration is replaced by ``skip``, which returns the
    scalar ``1.0``.
    """

    def __init__(self, in_planes, out_planes, kernel_size, groups=1, reduction=0.0625, kernel_num=4, min_channel=16):
        super().__init__()
        hidden = max(int(in_planes * reduction), min_channel)
        self.kernel_size = kernel_size
        self.kernel_num = kernel_num
        self.temperature = 1.0

        self.avgpool = nn.AdaptiveAvgPool2d(1)
        self.fc = nn.Conv2d(in_planes, hidden, 1, bias=False)
        self.bn = nn.BatchNorm2d(hidden)
        self.relu = nn.ReLU(inplace=True)

        self.channel_fc = nn.Conv2d(hidden, in_planes, 1, bias=True)
        self.func_channel = self.get_channel_attention

        # depth-wise convolution: no filter attention
        depthwise = in_planes == groups and in_planes == out_planes
        if not depthwise:
            self.filter_fc = nn.Conv2d(hidden, out_planes, 1, bias=True)
        self.func_filter = self.skip if depthwise else self.get_filter_attention

        # point-wise convolution: no spatial attention
        pointwise = kernel_size == 1
        if not pointwise:
            self.spatial_fc = nn.Conv2d(hidden, kernel_size * kernel_size, 1, bias=True)
        self.func_spatial = self.skip if pointwise else self.get_spatial_attention

        # a single candidate kernel: no kernel attention
        single_kernel = kernel_num == 1
        if not single_kernel:
            self.kernel_fc = nn.Conv2d(hidden, kernel_num, 1, bias=True)
        self.func_kernel = self.skip if single_kernel else self.get_kernel_attention

        self._initialize_weights()

    def _initialize_weights(self):
        """Kaiming-init every conv (zero bias) and reset batch-norm affine terms."""
        for module in self.modules():
            if isinstance(module, nn.Conv2d):
                nn.init.kaiming_normal_(module.weight, mode='fan_out', nonlinearity='relu')
                if module.bias is not None:
                    nn.init.constant_(module.bias, 0)
            elif isinstance(module, nn.BatchNorm2d):
                nn.init.constant_(module.weight, 1)
                nn.init.constant_(module.bias, 0)

    def update_temperature(self, temperature):
        """Set the temperature that divides every attention logit."""
        self.temperature = temperature

    @staticmethod
    def skip(_):
        """Neutral attention: multiplying by it changes nothing."""
        return 1.0

    def get_channel_attention(self, x):
        """Sigmoid gate reshaped to ``(B, -1, 1, 1)`` from ``channel_fc``."""
        logits = self.channel_fc(x).view(x.size(0), -1, 1, 1)
        return torch.sigmoid(logits / self.temperature)

    def get_filter_attention(self, x):
        """Sigmoid gate reshaped to ``(B, -1, 1, 1)`` from ``filter_fc``."""
        logits = self.filter_fc(x).view(x.size(0), -1, 1, 1)
        return torch.sigmoid(logits / self.temperature)

    def get_spatial_attention(self, x):
        """Sigmoid gate reshaped to ``(B, 1, 1, 1, k, k)`` from ``spatial_fc``."""
        k = self.kernel_size
        logits = self.spatial_fc(x).view(x.size(0), 1, 1, 1, k, k)
        return torch.sigmoid(logits / self.temperature)

    def get_kernel_attention(self, x):
        """Softmax over dim 1 of ``kernel_fc`` output viewed as ``(B, -1, 1, 1, 1, 1)``."""
        logits = self.kernel_fc(x).view(x.size(0), -1, 1, 1, 1, 1)
        return F.softmax(logits / self.temperature, dim=1)

    def forward(self, x):
        """Return ``(channel, filter, spatial, kernel)`` attentions for ``x``."""
        feat = self.fc(self.avgpool(x))
        # ``bn`` is removed by ``fuse``; after that ``fc`` already includes it.
        if hasattr(self, 'bn'):
            feat = self.bn(feat)
        feat = self.relu(feat)
        return (
            self.func_channel(feat),
            self.func_filter(feat),
            self.func_spatial(feat),
            self.func_kernel(feat),
        )

    def fuse(self):
        """Fold ``bn`` into ``fc`` and drop the batch-norm module."""
        self.fc = fuse_conv_bn(self.fc, self.bn)
        del self.bn


class ODConv2d(nn.Module):
    """Omni-dimensional dynamic convolution.

    Holds ``kernel_num`` candidate kernels and an ``Attention`` branch whose
    outputs modulate the input channels, the kernel's spatial positions, the
    mixture over candidate kernels and the output filters.
    """

    def __init__(self, in_planes, out_planes, kernel_size, stride=1, padding=0, dilation=1, groups=1,
                 reduction=0.0625, kernel_num=4):
        super().__init__()
        self.in_planes = in_planes
        self.out_planes = out_planes
        self.kernel_size = kernel_size
        self.stride = stride
        self.padding = padding
        self.dilation = dilation
        self.groups = groups
        self.kernel_num = kernel_num
        self.attention = Attention(in_planes, out_planes, kernel_size, groups=groups,
                                   reduction=reduction, kernel_num=kernel_num)
        kernel_shape = (kernel_num, out_planes, in_planes // groups, kernel_size, kernel_size)
        self.weight = nn.Parameter(torch.randn(*kernel_shape), requires_grad=True)
        self._initialize_weights()

        # A single 1x1 kernel needs no per-sample weight aggregation.
        use_fast_path = self.kernel_size == 1 and self.kernel_num == 1
        self._forward_impl = self._forward_impl_pw1x if use_fast_path else self._forward_impl_common

    def _initialize_weights(self):
        """Kaiming-init each candidate kernel separately."""
        for kernel_idx in range(self.kernel_num):
            nn.init.kaiming_normal_(self.weight[kernel_idx], mode='fan_out', nonlinearity='relu')

    def update_temperature(self, temperature):
        """Forward the temperature to the attention branch."""
        self.attention.update_temperature(temperature)

    def _forward_impl_common(self, x):
        """General path: one aggregated kernel per sample, run as a grouped conv."""
        # Multiplying channel attention (or filter attention) to weights and feature maps are equivalent,
        # while we observe that when using the latter method the models will run faster with less gpu memory cost.
        ch_att, flt_att, sp_att, k_att = self.attention(x)
        batch, _, height, width = x.size()
        # Fold the batch into the channel axis so one conv call handles all samples.
        x = (x * ch_att).reshape(1, -1, height, width)
        mixed = sp_att * k_att * self.weight.unsqueeze(dim=0)
        mixed = torch.sum(mixed, dim=1).view(
            [-1, self.in_planes // self.groups, self.kernel_size, self.kernel_size])
        out = F.conv2d(x, weight=mixed, bias=None, stride=self.stride, padding=self.padding,
                       dilation=self.dilation, groups=self.groups * batch)
        out = out.view(batch, self.out_planes, out.size(-2), out.size(-1))
        return out * flt_att

    def _forward_impl_pw1x(self, x):
        """Fast path for a single 1x1 kernel: the static weight is used directly."""
        ch_att, flt_att, _sp_att, _k_att = self.attention(x)
        out = F.conv2d(x * ch_att, weight=self.weight.squeeze(dim=0), bias=None, stride=self.stride,
                       padding=self.padding, dilation=self.dilation, groups=self.groups)
        return out * flt_att

    def forward(self, x):
        """Apply the dynamic convolution to ``x``."""
        return self._forward_impl(x)
def _cfg(url='', **kwargs):
    """Return the default model config dict; ``kwargs`` override its entries."""
    cfg = {
        'url': url,
        'num_classes': 1000,
        'pool_size': None,
        'crop_pct': .95,
        'interpolation': 'bicubic',
        'mean': IMAGENET_DEFAULT_MEAN,
        'std': IMAGENET_DEFAULT_STD,
        'classifier': 'head',
    }
    cfg.update(kwargs)
    return cfg


# Default configs shared by the small ("s") and medium ("m") model families.
default_cfgs = {
    family: _cfg(crop_pct=crop_pct)
    for family, crop_pct in (('poolformer_s', 0.9), ('poolformer_m', 0.95))
}


class PatchEmbed(nn.Module):
    """Patch embedding implemented as a single strided convolution.

    Input: tensor in shape [B, C, H, W]
    Output: tensor in shape [B, C, H/stride, W/stride]
    """

    def __init__(self, patch_size=16, stride=16, padding=0,
                 in_chans=3, embed_dim=768, norm_layer=None):
        super().__init__()
        self.proj = nn.Conv2d(
            in_chans,
            embed_dim,
            kernel_size=to_2tuple(patch_size),
            stride=to_2tuple(stride),
            padding=to_2tuple(padding),
        )
        # No normalisation unless a layer factory is supplied.
        if norm_layer:
            self.norm = norm_layer(embed_dim)
        else:
            self.norm = nn.Identity()

    def forward(self, x):
        """Project ``x`` with the conv and apply the (optional) norm."""
        return self.norm(self.proj(x))
class LayerNormChannel(nn.Module):
    """LayerNorm computed over the channel dimension only.

    Input: tensor in shape [B, C, H, W]
    """

    def __init__(self, num_channels, eps=1e-05):
        super().__init__()
        self.weight = nn.Parameter(torch.ones(num_channels))
        self.bias = nn.Parameter(torch.zeros(num_channels))
        self.eps = eps

    def forward(self, x):
        """Normalise each spatial position across channels, then scale and shift."""
        mean = x.mean(1, keepdim=True)
        var = (x - mean).pow(2).mean(1, keepdim=True)
        normed = (x - mean) / torch.sqrt(var + self.eps)
        # Per-channel affine parameters broadcast over H and W.
        gamma = self.weight.unsqueeze(-1).unsqueeze(-1)
        beta = self.bias.unsqueeze(-1).unsqueeze(-1)
        return gamma * normed + beta


class GroupNorm(nn.GroupNorm):
    """Group Normalization with a single group.

    Input: tensor in shape [B, C, H, W]
    """

    def __init__(self, num_channels, **kwargs):
        super().__init__(1, num_channels, **kwargs)


class Pooling(nn.Module):
    """PoolFormer token mixer: average pooling minus the input.

    --pool_size: pooling size
    """

    def __init__(self, pool_size=3):
        super().__init__()
        self.pool = nn.AvgPool2d(
            pool_size,
            stride=1,
            padding=pool_size // 2,
            count_include_pad=False,
        )

    def forward(self, x):
        """Return ``pool(x) - x``."""
        pooled = self.pool(x)
        return pooled - x


class Mlp(nn.Module):
    """MLP built from two 1x1 convolutions.

    Input: tensor with shape [B, C, H, W]
    """

    def __init__(self, in_features, hidden_features=None,
                 out_features=None, act_layer=nn.GELU, drop=0.):
        super().__init__()
        # Falsy widths fall back to the input width.
        out_features = out_features or in_features
        hidden_features = hidden_features or in_features
        self.fc1 = nn.Conv2d(in_features, hidden_features, 1)
        self.act = act_layer()
        self.fc2 = nn.Conv2d(hidden_features, out_features, 1)
        self.drop = nn.Dropout(drop)
        self.apply(self._init_weights)

    def _init_weights(self, m):
        """Truncated-normal weights (std 0.02) and zero bias for conv layers."""
        if not isinstance(m, nn.Conv2d):
            return
        trunc_normal_(m.weight, std=.02)
        if m.bias is not None:
            nn.init.constant_(m.bias, 0)

    def forward(self, x):
        """fc1 -> act -> dropout -> fc2 -> dropout."""
        for stage in (self.fc1, self.act, self.drop, self.fc2, self.drop):
            x = stage(x)
        return x
class PoolFormerBlock(nn.Module):
    """One PoolFormer block: pooling token mixer and MLP, each in a residual branch.

    --dim: embedding dim
    --pool_size: pooling size
    --mlp_ratio: mlp expansion ratio
    --act_layer: activation
    --norm_layer: normalization
    --drop: dropout rate
    --drop path: Stochastic Depth, refer to https://arxiv.org/abs/1603.09382
    --use_layer_scale, --layer_scale_init_value: LayerScale, refer to https://arxiv.org/abs/2103.17239
    """

    def __init__(self, dim, pool_size=3, mlp_ratio=4.,
                 act_layer=nn.GELU, norm_layer=GroupNorm,
                 drop=0., drop_path=0.,
                 use_layer_scale=True, layer_scale_init_value=1e-5):
        super().__init__()

        self.norm1 = norm_layer(dim)
        self.token_mixer = Pooling(pool_size=pool_size)
        self.norm2 = norm_layer(dim)
        self.mlp = Mlp(in_features=dim, hidden_features=int(dim * mlp_ratio),
                       act_layer=act_layer, drop=drop)

        # The following two techniques are useful to train deep PoolFormers.
        if drop_path > 0.:
            self.drop_path = DropPath(drop_path)
        else:
            self.drop_path = nn.Identity()
        self.use_layer_scale = use_layer_scale
        if use_layer_scale:
            self.layer_scale_1 = nn.Parameter(
                layer_scale_init_value * torch.ones((dim)), requires_grad=True)
            self.layer_scale_2 = nn.Parameter(
                layer_scale_init_value * torch.ones((dim)), requires_grad=True)

    def forward(self, x):
        """Apply the token-mixer branch, then the MLP branch, both residual."""
        mixed = self.token_mixer(self.norm1(x))
        if self.use_layer_scale:
            mixed = self.layer_scale_1.unsqueeze(-1).unsqueeze(-1) * mixed
        x = x + self.drop_path(mixed)

        fed = self.mlp(self.norm2(x))
        if self.use_layer_scale:
            fed = self.layer_scale_2.unsqueeze(-1).unsqueeze(-1) * fed
        x = x + self.drop_path(fed)
        return x
def basic_blocks(dim, index, layers,
                 pool_size=3, mlp_ratio=4.,
                 act_layer=nn.GELU, norm_layer=GroupNorm,
                 drop_rate=.0, drop_path_rate=0.,
                 use_layer_scale=True, layer_scale_init_value=1e-5):
    """Generate the PoolFormer blocks for one stage.

    The stochastic-depth rate grows linearly with the block's global position
    in the network, from 0 for the first block to ``drop_path_rate`` for the
    last one.

    Args:
        dim: Embedding dim of the stage.
        index: Index of the stage inside ``layers``.
        layers: Number of blocks of every stage.
        pool_size, mlp_ratio, act_layer, norm_layer, drop_rate,
        use_layer_scale, layer_scale_init_value: Forwarded to each
            ``PoolFormerBlock``.
        drop_path_rate: Maximum stochastic-depth rate.

    Returns:
        ``nn.Sequential`` holding ``layers[index]`` blocks.
    """
    blocks_before = sum(layers[:index])
    # A network with a single block has nothing to interpolate over; clamping
    # the denominator avoids a ZeroDivisionError and yields a rate of 0.
    last_position = max(sum(layers) - 1, 1)
    blocks = []
    for block_idx in range(layers[index]):
        block_dpr = drop_path_rate * (block_idx + blocks_before) / last_position
        blocks.append(PoolFormerBlock(
            dim, pool_size=pool_size, mlp_ratio=mlp_ratio,
            act_layer=act_layer, norm_layer=norm_layer,
            drop=drop_rate, drop_path=block_dpr,
            use_layer_scale=use_layer_scale,
            layer_scale_init_value=layer_scale_init_value,
        ))
    return nn.Sequential(*blocks)
class PoolFormer(nn.Module):
    """PoolFormer, the main class of the model.

    --layers: [x,x,x,x], number of blocks for the 4 stages
    --embed_dims, --mlp_ratios, --pool_size: the embedding dims, mlp ratios and
        pooling size for the 4 stages
    --downsamples: flags to apply downsampling or not
    --norm_layer, --act_layer: define the types of normalization and activation
    --num_classes: number of classes for the image classification
    --in_patch_size, --in_stride, --in_pad: specify the patch embedding
        for the input image
    --down_patch_size --down_stride --down_pad:
        specify the downsample (patch embed.)
    --fork_feat: whether output features of the 4 stages, for dense prediction
    --init_cfg, --pretrained:
        for mmdetection and mmsegmentation to load pretrained weights

    After construction ``self.channel`` holds the channel count of every
    feature map returned by ``forward`` (empty when ``fork_feat`` is False,
    because ``forward`` only collects the forked features).
    """

    def __init__(self, layers, embed_dims=None,
                 mlp_ratios=None, downsamples=None,
                 pool_size=3,
                 norm_layer=GroupNorm, act_layer=nn.GELU,
                 num_classes=1000,
                 in_patch_size=7, in_stride=4, in_pad=2,
                 down_patch_size=3, down_stride=2, down_pad=1,
                 drop_rate=0., drop_path_rate=0.,
                 use_layer_scale=True, layer_scale_init_value=1e-5,
                 fork_feat=True,
                 init_cfg=None,
                 pretrained=None,
                 **kwargs):

        super().__init__()

        if not fork_feat:
            self.num_classes = num_classes
        self.fork_feat = fork_feat
        # Width of the last stage; ``reset_classifier`` sizes the new head from it.
        self.embed_dim = embed_dims[-1]

        self.patch_embed = PatchEmbed(
            patch_size=in_patch_size, stride=in_stride, padding=in_pad,
            in_chans=3, embed_dim=embed_dims[0])

        # set the main block in network
        network = []
        for i in range(len(layers)):
            stage = basic_blocks(embed_dims[i], i, layers,
                                 pool_size=pool_size, mlp_ratio=mlp_ratios[i],
                                 act_layer=act_layer, norm_layer=norm_layer,
                                 drop_rate=drop_rate,
                                 drop_path_rate=drop_path_rate,
                                 use_layer_scale=use_layer_scale,
                                 layer_scale_init_value=layer_scale_init_value)
            network.append(stage)
            if i >= len(layers) - 1:
                break
            if downsamples[i] or embed_dims[i] != embed_dims[i+1]:
                # downsampling between two stages
                network.append(
                    PatchEmbed(
                        patch_size=down_patch_size, stride=down_stride,
                        padding=down_pad,
                        in_chans=embed_dims[i], embed_dim=embed_dims[i+1]
                    )
                )

        self.network = nn.ModuleList(network)

        if self.fork_feat:
            # add a norm layer for each output
            self.out_indices = [0, 2, 4, 6]
            for i_emb, i_layer in enumerate(self.out_indices):
                if i_emb == 0 and os.environ.get('FORK_LAST3', None):
                    # TODO: more elegant way
                    # For RetinaNet, `start_level=1`. The first norm layer will
                    # not used.
                    # cmd: `FORK_LAST3=1 python -m torch.distributed.launch ...`
                    layer = nn.Identity()
                else:
                    layer = norm_layer(embed_dims[i_emb])
                layer_name = f'norm{i_layer}'
                self.add_module(layer_name, layer)
        else:
            # Classifier head
            self.norm = norm_layer(embed_dims[-1])
            self.head = nn.Linear(
                embed_dims[-1], num_classes) if num_classes > 0 \
                else nn.Identity()

        self.init_cfg = copy.deepcopy(init_cfg)
        # Probe the output widths with a dummy image; no autograd graph is
        # needed for a shape query.
        with torch.no_grad():
            self.channel = [i.size(1) for i in self.forward(torch.randn(1, 3, 224, 224))]

    def reset_classifier(self, num_classes):
        """Replace the classification head with one for ``num_classes`` outputs.

        A non-positive ``num_classes`` installs ``nn.Identity``.
        """
        self.num_classes = num_classes
        self.head = nn.Linear(
            self.embed_dim, num_classes) if num_classes > 0 else nn.Identity()

    def forward_embeddings(self, x):
        """Apply the input patch embedding."""
        x = self.patch_embed(x)
        return x

    def forward_tokens(self, x):
        """Run all stages; return the normalised outputs at ``out_indices``."""
        outs = []
        for idx, block in enumerate(self.network):
            x = block(x)
            if self.fork_feat and idx in self.out_indices:
                norm_layer = getattr(self, f'norm{idx}')
                x_out = norm_layer(x)
                outs.append(x_out)
        return outs

    def forward(self, x):
        """Return the list of forked feature maps for image batch ``x``."""
        # input embedding
        x = self.forward_embeddings(x)
        # through backbone
        x = self.forward_tokens(x)
        return x
# Download locations of the released checkpoints, keyed by model name.
_POOLFORMER_RELEASE = "https://github.com/sail-sg/poolformer/releases/download/v1.0"
model_urls = {
    "poolformer_s12": _POOLFORMER_RELEASE + "/poolformer_s12.pth.tar",
    "poolformer_s24": _POOLFORMER_RELEASE + "/poolformer_s24.pth.tar",
    "poolformer_s36": _POOLFORMER_RELEASE + "/poolformer_s36.pth.tar",
    "poolformer_m36": _POOLFORMER_RELEASE + "/poolformer_m36.pth.tar",
    "poolformer_m48": _POOLFORMER_RELEASE + "/poolformer_m48.pth.tar",
}


def update_weight(model_dict, weight_dict):
    """Copy into ``model_dict`` every checkpoint entry with a matching key and shape.

    ``model_dict`` is updated in place and also returned; a progress line
    with the number of matched entries is printed.
    """
    matched = {
        name: value
        for name, value in weight_dict.items()
        if name in model_dict and np.shape(model_dict[name]) == np.shape(value)
    }
    idx = len(matched)
    model_dict.update(matched)
    print(f'loading weights... {idx}/{len(model_dict)} items')
    return model_dict


def poolformer_s12(pretrained=False, **kwargs):
    """
    PoolFormer-S12 model, Params: 12M
    --layers: [x,x,x,x], numbers of layers for the four stages
    --embed_dims, --mlp_ratios:
        embedding dims and mlp ratios for the four stages
    --downsamples: flags to apply downsampling or not in four blocks
    """
    model = PoolFormer(
        [2, 2, 6, 2],
        embed_dims=[64, 128, 320, 512],
        mlp_ratios=[4, 4, 4, 4],
        downsamples=[True, True, True, True],
        **kwargs)
    model.default_cfg = default_cfgs['poolformer_s']
    if not pretrained:
        return model
    checkpoint = torch.hub.load_state_dict_from_url(
        url=model_urls['poolformer_s12'], map_location="cpu", check_hash=True)
    model.load_state_dict(update_weight(model.state_dict(), checkpoint))
    return model


def poolformer_s24(pretrained=False, **kwargs):
    """
    PoolFormer-S24 model, Params: 21M
    """
    model = PoolFormer(
        [4, 4, 12, 4],
        embed_dims=[64, 128, 320, 512],
        mlp_ratios=[4, 4, 4, 4],
        downsamples=[True, True, True, True],
        **kwargs)
    model.default_cfg = default_cfgs['poolformer_s']
    if not pretrained:
        return model
    checkpoint = torch.hub.load_state_dict_from_url(
        url=model_urls['poolformer_s24'], map_location="cpu", check_hash=True)
    model.load_state_dict(update_weight(model.state_dict(), checkpoint))
    return model
def poolformer_s36(pretrained=False, **kwargs):
    """
    PoolFormer-S36 model, Params: 31M
    """
    model = PoolFormer(
        [6, 6, 18, 6],
        embed_dims=[64, 128, 320, 512],
        mlp_ratios=[4, 4, 4, 4],
        downsamples=[True, True, True, True],
        layer_scale_init_value=1e-6,
        **kwargs)
    model.default_cfg = default_cfgs['poolformer_s']
    if not pretrained:
        return model
    checkpoint = torch.hub.load_state_dict_from_url(
        url=model_urls['poolformer_s36'], map_location="cpu", check_hash=True)
    model.load_state_dict(update_weight(model.state_dict(), checkpoint))
    return model


def poolformer_m36(pretrained=False, **kwargs):
    """
    PoolFormer-M36 model, Params: 56M
    """
    model = PoolFormer(
        [6, 6, 18, 6],
        embed_dims=[96, 192, 384, 768],
        mlp_ratios=[4, 4, 4, 4],
        downsamples=[True, True, True, True],
        layer_scale_init_value=1e-6,
        **kwargs)
    model.default_cfg = default_cfgs['poolformer_m']
    if not pretrained:
        return model
    checkpoint = torch.hub.load_state_dict_from_url(
        url=model_urls['poolformer_m36'], map_location="cpu", check_hash=True)
    model.load_state_dict(update_weight(model.state_dict(), checkpoint))
    return model


@register_model
def poolformer_m48(pretrained=False, **kwargs):
    """
    PoolFormer-M48 model, Params: 73M
    """
    model = PoolFormer(
        [8, 8, 24, 8],
        embed_dims=[96, 192, 384, 768],
        mlp_ratios=[4, 4, 4, 4],
        downsamples=[True, True, True, True],
        layer_scale_init_value=1e-6,
        **kwargs)
    model.default_cfg = default_cfgs['poolformer_m']
    if not pretrained:
        return model
    checkpoint = torch.hub.load_state_dict_from_url(
        url=model_urls['poolformer_m48'], map_location="cpu", check_hash=True)
    model.load_state_dict(update_weight(model.state_dict(), checkpoint))
    return model


if __name__ == '__main__':
    # Smoke test: print the shape of every forked feature map.
    model = poolformer_s12(pretrained=True)
    inputs = torch.randn((1, 3, 640, 640))
    for feature in model(inputs):
        print(feature.size())
class Mlp(nn.Module):
    """MLP implemented with 1*1 convolutions.

    Input: Tensor with shape [B, C, H, W].
    Output: Tensor with shape [B, C, H, W].

    Args:
        in_features (int): Dimension of input features.
        hidden_features (int): Dimension of hidden features.
        out_features (int): Dimension of output features.
        act_cfg (dict): The config dict for activation between pointwise
            convolution. Defaults to ``dict(type='GELU')``.
        drop (float): Dropout rate. Defaults to 0.0.
    """

    def __init__(self,
                 in_features,
                 hidden_features=None,
                 out_features=None,
                 act_cfg=dict(type='GELU'),
                 drop=0.):
        super().__init__()
        # Falsy widths fall back to the input width.
        out_features = out_features or in_features
        hidden_features = hidden_features or in_features
        self.fc1 = nn.Conv2d(in_features, hidden_features, 1)
        self.act = build_activation_layer(act_cfg)
        self.fc2 = nn.Conv2d(hidden_features, out_features, 1)
        self.drop = nn.Dropout(drop)

    def forward(self, x):
        """fc1 -> act -> dropout -> fc2 -> dropout."""
        hidden = self.drop(self.act(self.fc1(x)))
        return self.drop(self.fc2(hidden))
class PatchEmbed(nn.Module):
    """Patch Embedding module implemented by a layer of convolution.

    Input: tensor in shape [B, C, H, W]
    Output: tensor in shape [B, C, H/stride, W/stride]

    Args:
        patch_size (int): Patch size of the patch embedding.
            Defaults to 16.
        stride (int): Stride of the patch embedding. Defaults to 16.
        padding (int): Padding of the patch embedding. Defaults to 0.
        in_chans (int): Input channels. Defaults to 3.
        embed_dim (int): Output dimension of the patch embedding.
            Defaults to 768.
        norm_layer (module): Normalization module.
            Defaults to None (not use).
    """

    def __init__(self,
                 patch_size=16,
                 stride=16,
                 padding=0,
                 in_chans=3,
                 embed_dim=768,
                 norm_layer=None):
        super().__init__()
        self.proj = nn.Conv2d(
            in_chans,
            embed_dim,
            kernel_size=patch_size,
            stride=stride,
            padding=padding)
        if norm_layer:
            self.norm = norm_layer(embed_dim)
        else:
            self.norm = nn.Identity()

    def forward(self, x):
        """Project ``x`` with the conv and apply the (optional) norm."""
        return self.norm(self.proj(x))


class Affine(nn.Module):
    """Affine Transformation module.

    A per-channel scale and shift (a depth-wise 1x1 conv with bias) from
    which the input itself is subtracted.

    Args:
        in_features (int): Input dimension.
    """

    def __init__(self, in_features):
        super().__init__()
        self.affine = nn.Conv2d(
            in_features,
            in_features,
            kernel_size=1,
            stride=1,
            padding=0,
            groups=in_features,
            bias=True)

    def forward(self, x):
        """Return ``affine(x) - x``."""
        transformed = self.affine(x)
        return transformed - x
class RIFormerBlock(BaseModule):
    """RIFormer Block.

    In training structure the token mixer is ``Affine`` applied after
    ``norm1``; ``switch_to_deploy`` folds both into one norm layer
    (``norm_reparam``).

    Args:
        dim (int): Embedding dim.
        mlp_ratio (float): Mlp expansion ratio. Defaults to 4.
        norm_cfg (dict): The config dict for norm layers.
            Defaults to ``dict(type='GN', num_groups=1)``.
        act_cfg (dict): The config dict for activation between pointwise
            convolution. Defaults to ``dict(type='GELU')``.
        drop (float): Dropout rate. Defaults to 0.
        drop_path (float): Stochastic depth rate. Defaults to 0.
        layer_scale_init_value (float): Init value for Layer Scale.
            Defaults to 1e-5.
        deploy (bool): Whether to switch the model structure to
            deployment mode. Default: False.
    """

    def __init__(self,
                 dim,
                 mlp_ratio=4.,
                 norm_cfg=dict(type='GN', num_groups=1),
                 act_cfg=dict(type='GELU'),
                 drop=0.,
                 drop_path=0.,
                 layer_scale_init_value=1e-5,
                 deploy=False):

        super().__init__()

        if not deploy:
            self.norm1 = build_norm_layer(norm_cfg, dim)[1]
            self.token_mixer = Affine(in_features=dim)
        else:
            self.norm_reparam = build_norm_layer(norm_cfg, dim)[1]
        self.norm2 = build_norm_layer(norm_cfg, dim)[1]
        self.mlp = Mlp(
            in_features=dim,
            hidden_features=int(dim * mlp_ratio),
            act_cfg=act_cfg,
            drop=drop)

        # The following two techniques are useful to train deep RIFormers.
        if drop_path > 0.:
            self.drop_path = DropPath(drop_path)
        else:
            self.drop_path = nn.Identity()
        self.layer_scale_1 = nn.Parameter(
            layer_scale_init_value * torch.ones((dim)), requires_grad=True)
        self.layer_scale_2 = nn.Parameter(
            layer_scale_init_value * torch.ones((dim)), requires_grad=True)
        self.norm_cfg = norm_cfg
        self.dim = dim
        self.deploy = deploy

    def forward(self, x):
        """Apply the token-mixing branch, then the MLP branch, both residual."""
        # The re-parameterised norm replaces norm1 + token_mixer when present.
        if hasattr(self, 'norm_reparam'):
            mixed = self.norm_reparam(x)
        else:
            mixed = self.token_mixer(self.norm1(x))
        x = x + self.drop_path(
            self.layer_scale_1.unsqueeze(-1).unsqueeze(-1) * mixed)
        x = x + self.drop_path(
            self.layer_scale_2.unsqueeze(-1).unsqueeze(-1)
            * self.mlp(self.norm2(x)))
        return x

    def fuse_affine(self, norm, token_mixer):
        """Return the (scale, bias) of the norm equivalent to ``token_mixer(norm(x))``."""
        # ``Affine`` computes ``affine(x) - x``, hence the effective scale w - 1.
        gamma_affn = token_mixer.affine.weight.reshape(-1)
        gamma_affn = gamma_affn - torch.ones_like(gamma_affn)
        beta_affn = token_mixer.affine.bias
        fused_scale = norm.weight * gamma_affn
        fused_bias = norm.bias * gamma_affn + beta_affn
        return fused_scale, fused_bias

    def get_equivalent_scale_bias(self):
        """Fuse this block's ``norm1`` and ``token_mixer``."""
        return self.fuse_affine(self.norm1, self.token_mixer)

    def switch_to_deploy(self):
        """Replace ``norm1`` + ``token_mixer`` by a single equivalent norm layer."""
        if self.deploy:
            return
        eq_s, eq_b = self.get_equivalent_scale_bias()
        reparam = build_norm_layer(self.norm_cfg, self.dim)[1]
        self.norm_reparam = reparam
        reparam.weight.data = eq_s
        reparam.bias.data = eq_b
        self.__delattr__('norm1')
        if hasattr(self, 'token_mixer'):
            self.__delattr__('token_mixer')
        self.deploy = True
def basic_blocks(dim,
                 index,
                 layers,
                 mlp_ratio=4.,
                 norm_cfg=dict(type='GN', num_groups=1),
                 act_cfg=dict(type='GELU'),
                 drop_rate=.0,
                 drop_path_rate=0.,
                 layer_scale_init_value=1e-5,
                 deploy=False):
    """Generate the RIFormer blocks for one stage.

    The stochastic-depth rate grows linearly with the block's global position
    in the network, from 0 for the first block to ``drop_path_rate`` for the
    last one.

    Args:
        dim: Embedding dim of the stage.
        index: Index of the stage inside ``layers``.
        layers: Number of blocks of every stage.
        mlp_ratio, norm_cfg, act_cfg, drop_rate, layer_scale_init_value,
        deploy: Forwarded to each ``RIFormerBlock``.
        drop_path_rate: Maximum stochastic-depth rate.

    Returns:
        ``nn.Sequential`` holding ``layers[index]`` blocks.
    """
    blocks_before = sum(layers[:index])
    # A network with a single block has nothing to interpolate over; clamping
    # the denominator avoids a ZeroDivisionError and yields a rate of 0.
    last_position = max(sum(layers) - 1, 1)
    blocks = []
    for block_idx in range(layers[index]):
        block_dpr = drop_path_rate * (block_idx + blocks_before) / last_position
        blocks.append(
            RIFormerBlock(
                dim,
                mlp_ratio=mlp_ratio,
                norm_cfg=norm_cfg,
                act_cfg=act_cfg,
                drop=drop_rate,
                drop_path=block_dpr,
                layer_scale_init_value=layer_scale_init_value,
                deploy=deploy,
            ))
    return nn.Sequential(*blocks)


def update_weight(model_dict, weight_dict):
    """Copy matching checkpoint entries into ``model_dict``.

    A leading ``'backbone.'`` prefix on a checkpoint key is removed before
    matching; keys without the prefix are matched as they are. An entry is
    copied only when the key exists in ``model_dict`` and the shapes agree.

    Args:
        model_dict: Target state dict; updated in place.
        weight_dict: Checkpoint state dict.

    Returns:
        ``model_dict`` after the update.
    """
    prefix = 'backbone.'
    idx, temp_dict = 0, {}
    for k, v in weight_dict.items():
        # Strip only a real prefix; a blind ``k[9:]`` mangles every other key.
        if k.startswith(prefix):
            k = k[len(prefix):]
        if k in model_dict and np.shape(model_dict[k]) == np.shape(v):
            temp_dict[k] = v
            idx += 1
    model_dict.update(temp_dict)
    print(f'loading weights... {idx}/{len(model_dict)} items')
    return model_dict
class RIFormer(nn.Module):
    """RIFormer.

    A PyTorch implementation of RIFormer introduced by:
    `RIFormer: Keep Your Vision Backbone Effective But Removing Token Mixer`

    Args:
        arch (str | dict): The model's architecture. If string, it should be
            one of architecture in ``RIFormer.arch_settings``. And if dict, it
            should include the following keys:

            - layers (list[int]): Number of blocks at each stage.
            - embed_dims (list[int]): The number of channels at each stage.
            - mlp_ratios (list[int]): Expansion ratio of MLPs (optional,
              ``[4, 4, 4, 4]`` when missing).
            - layer_scale_init_value (float): Init value for Layer Scale
              (optional, ``1e-5`` when missing).

            Defaults to 's12'.
        weights (str): Path of a checkpoint whose ``'state_dict'`` entry is
            merged into the model through ``update_weight``. An empty value
            skips loading. Defaults to ''.
        in_channels (int): Number of input image channels. Defaults to 3.
        norm_cfg (dict): The config dict for norm layers.
            Defaults to ``dict(type='GN', num_groups=1)``.
        act_cfg (dict): The config dict for activation between pointwise
            convolution. Defaults to ``dict(type='GELU')``.
        in_patch_size (int): The patch size of input image patch embedding.
            Defaults to 7.
        in_stride (int): The stride of input image patch embedding.
            Defaults to 4.
        in_pad (int): The padding of input image patch embedding.
            Defaults to 2.
        down_patch_size (int): The patch size of downsampling patch embedding.
            Defaults to 3.
        down_stride (int): The stride of downsampling patch embedding.
            Defaults to 2.
        down_pad (int): The padding of downsampling patch embedding.
            Defaults to 1.
        drop_rate (float): Dropout rate. Defaults to 0.
        drop_path_rate (float): Stochastic depth rate. Defaults to 0.
        out_indices (Sequence | int): Output from which network position.
            Index 0-6 respectively corresponds to
            [stage1, downsampling, stage2, downsampling, stage3, downsampling, stage4].
            Negative values count from the end. Defaults to ``[0, 2, 4, 6]``.
        deploy (bool): Whether to switch the model structure to
            deployment mode. Default: False.

    After construction ``self.channel`` holds the channel count of every
    feature map returned by ``forward``.
    """  # noqa: E501

    # --layers: [x,x,x,x], numbers of layers for the four stages
    # --embed_dims, --mlp_ratios:
    #     embedding dims and mlp ratios for the four stages
    # --downsamples: flags to apply downsampling or not in four blocks
    arch_settings = {
        's12': {
            'layers': [2, 2, 6, 2],
            'embed_dims': [64, 128, 320, 512],
            'mlp_ratios': [4, 4, 4, 4],
            'layer_scale_init_value': 1e-5,
        },
        's24': {
            'layers': [4, 4, 12, 4],
            'embed_dims': [64, 128, 320, 512],
            'mlp_ratios': [4, 4, 4, 4],
            'layer_scale_init_value': 1e-5,
        },
        's36': {
            'layers': [6, 6, 18, 6],
            'embed_dims': [64, 128, 320, 512],
            'mlp_ratios': [4, 4, 4, 4],
            'layer_scale_init_value': 1e-6,
        },
        'm36': {
            'layers': [6, 6, 18, 6],
            'embed_dims': [96, 192, 384, 768],
            'mlp_ratios': [4, 4, 4, 4],
            'layer_scale_init_value': 1e-6,
        },
        'm48': {
            'layers': [8, 8, 24, 8],
            'embed_dims': [96, 192, 384, 768],
            'mlp_ratios': [4, 4, 4, 4],
            'layer_scale_init_value': 1e-6,
        },
    }

    def __init__(self,
                 arch='s12',
                 weights = '',
                 in_channels=3,
                 norm_cfg=dict(type='GN', num_groups=1),
                 act_cfg=dict(type='GELU'),
                 in_patch_size=7,
                 in_stride=4,
                 in_pad=2,
                 down_patch_size=3,
                 down_stride=2,
                 down_pad=1,
                 drop_rate=0.,
                 drop_path_rate=0.,
                 out_indices=[0, 2, 4, 6],
                 deploy=False):

        super().__init__()

        if isinstance(arch, str):
            assert arch in self.arch_settings, \
                f'Unavailable arch, please choose from ' \
                f'({set(self.arch_settings)}) or pass a dict.'
            arch = self.arch_settings[arch]
        elif isinstance(arch, dict):
            assert 'layers' in arch and 'embed_dims' in arch, \
                f'The arch dict must have "layers" and "embed_dims", ' \
                f'but got {list(arch.keys())}.'

        layers = arch['layers']
        embed_dims = arch['embed_dims']
        mlp_ratios = arch['mlp_ratios'] \
            if 'mlp_ratios' in arch else [4, 4, 4, 4]
        layer_scale_init_value = arch['layer_scale_init_value'] \
            if 'layer_scale_init_value' in arch else 1e-5

        self.patch_embed = PatchEmbed(
            patch_size=in_patch_size,
            stride=in_stride,
            padding=in_pad,
            in_chans=in_channels,
            embed_dim=embed_dims[0])

        # set the main block in network
        network = []
        for i in range(len(layers)):
            stage = basic_blocks(
                embed_dims[i],
                i,
                layers,
                mlp_ratio=mlp_ratios[i],
                norm_cfg=norm_cfg,
                act_cfg=act_cfg,
                drop_rate=drop_rate,
                drop_path_rate=drop_path_rate,
                layer_scale_init_value=layer_scale_init_value,
                deploy=deploy)
            network.append(stage)
            if i >= len(layers) - 1:
                break
            if embed_dims[i] != embed_dims[i + 1]:
                # downsampling between two stages
                network.append(
                    PatchEmbed(
                        patch_size=down_patch_size,
                        stride=down_stride,
                        padding=down_pad,
                        in_chans=embed_dims[i],
                        embed_dim=embed_dims[i + 1]))

        self.network = nn.ModuleList(network)

        if isinstance(out_indices, int):
            out_indices = [out_indices]
        assert isinstance(out_indices, Sequence), \
            f'"out_indices" must by a sequence or int, ' \
            f'get {type(out_indices)} instead.'
        # Work on a private copy: normalising negative indices must not write
        # into the caller's list or the shared default, and tuples must work.
        out_indices = list(out_indices)
        for i, index in enumerate(out_indices):
            if index < 0:
                out_indices[i] = 7 + index
                assert out_indices[i] >= 0, f'Invalid out_indices {index}'
        self.out_indices = out_indices
        if self.out_indices:
            for i_layer in self.out_indices:
                # Position 2k (stage) and 2k-1 (downsampling) both have the
                # width of stage k.
                layer = build_norm_layer(norm_cfg,
                                         embed_dims[(i_layer + 1) // 2])[1]
                layer_name = f'norm{i_layer}'
                self.add_module(layer_name, layer)

        self.deploy = deploy

        if weights:
            # Load on the CPU so checkpoints saved from a GPU also open on
            # machines without one.
            checkpoint = torch.load(weights, map_location='cpu')
            self.load_state_dict(update_weight(self.state_dict(), checkpoint['state_dict']))
        # Probe the output widths with a dummy image that has the configured
        # number of input channels; no autograd graph is needed.
        with torch.no_grad():
            self.channel = [i.size(1) for i in self.forward(torch.randn(1, in_channels, 640, 640))]

    def forward_embeddings(self, x):
        """Apply the input patch embedding."""
        x = self.patch_embed(x)
        return x

    def forward_tokens(self, x):
        """Run all stages; return the normalised outputs at ``out_indices``."""
        outs = []
        for idx, block in enumerate(self.network):
            x = block(x)
            if idx in self.out_indices:
                norm_layer = getattr(self, f'norm{idx}')
                x_out = norm_layer(x)
                outs.append(x_out)
        return outs

    def forward(self, x):
        """Return the list of selected feature maps for image batch ``x``."""
        # input embedding
        x = self.forward_embeddings(x)
        # through backbone
        x = self.forward_tokens(x)
        return x


if __name__ == '__main__':
    # Smoke test: print the shape of every returned feature map.
    model = RIFormer('s12', 'riformer-s12_32xb128_in1k-384px_20230406-145eda4c.pth')
    inputs = torch.randn((1, 3, 640, 640))
    for i in model(inputs):
        print(i.size())
It ensures that all layers have a channel number that is divisible by 8 It can be seen here: https://github.com/tensorflow/models/blob/master/research/slim/nets/mobilenet/mobilenet.py :param v: :param divisor: :param min_value: :return: """ if min_value is None: min_value = divisor new_v = max(min_value, int(v + divisor / 2) // divisor * divisor) # Make sure that round down does not go down by more than 10%. if new_v < 0.9 * v: new_v += divisor return new_v class Conv2d_BN(torch.nn.Sequential): def __init__(self, a, b, ks=1, stride=1, pad=0, dilation=1, groups=1, bn_weight_init=1, resolution=-10000): super().__init__() self.add_module('c', torch.nn.Conv2d( a, b, ks, stride, pad, dilation, groups, bias=False)) self.add_module('bn', torch.nn.BatchNorm2d(b)) torch.nn.init.constant_(self.bn.weight, bn_weight_init) torch.nn.init.constant_(self.bn.bias, 0) @torch.no_grad() def fuse_self(self): c, bn = self._modules.values() w = bn.weight / (bn.running_var + bn.eps)**0.5 w = c.weight * w[:, None, None, None] b = bn.bias - bn.running_mean * bn.weight / \ (bn.running_var + bn.eps)**0.5 m = torch.nn.Conv2d(w.size(1) * self.c.groups, w.size( 0), w.shape[2:], stride=self.c.stride, padding=self.c.padding, dilation=self.c.dilation, groups=self.c.groups, device=c.weight.device) m.weight.data.copy_(w) m.bias.data.copy_(b) return m class Residual(torch.nn.Module): def __init__(self, m, drop=0.): super().__init__() self.m = m self.drop = drop def forward(self, x): if self.training and self.drop > 0: return x + self.m(x) * torch.rand(x.size(0), 1, 1, 1, device=x.device).ge_(self.drop).div(1 - self.drop).detach() else: return x + self.m(x) @torch.no_grad() def fuse_self(self): if isinstance(self.m, Conv2d_BN): m = self.m.fuse_self() assert(m.groups == m.in_channels) identity = torch.ones(m.weight.shape[0], m.weight.shape[1], 1, 1) identity = torch.nn.functional.pad(identity, [1,1,1,1]) m.weight += identity.to(m.weight.device) return m elif isinstance(self.m, torch.nn.Conv2d): m = 
class RepVGGDW(torch.nn.Module):
    """Depthwise token mixer: 3x3 conv+BN, 1x1 depthwise conv and identity, summed, then BN.

    ``fuse_self`` collapses the three branches and the trailing BatchNorm into
    one 3x3 depthwise convolution using the BatchNorm running statistics.
    """

    def __init__(self, ed) -> None:
        super().__init__()
        channels = ed
        self.conv = Conv2d_BN(channels, channels, 3, 1, 1, groups=channels)
        self.conv1 = torch.nn.Conv2d(channels, channels, 1, 1, 0, groups=channels)
        self.dim = channels
        self.bn = torch.nn.BatchNorm2d(channels)

    def forward(self, x):
        branches = self.conv(x) + self.conv1(x)
        return self.bn(branches + x)

    @torch.no_grad()
    def fuse_self(self):
        pad = torch.nn.functional.pad
        fused = self.conv.fuse_self()
        pointwise = self.conv1

        # Embed the 1x1 kernel and the identity mapping at the centre of a 3x3 kernel.
        pw_kernel = pad(pointwise.weight, [1, 1, 1, 1])
        skip_kernel = pad(
            torch.ones(pw_kernel.shape[0], pw_kernel.shape[1], 1, 1, device=pw_kernel.device),
            [1, 1, 1, 1],
        )
        merged_kernel = fused.weight + pw_kernel + skip_kernel
        merged_bias = fused.bias + pointwise.bias
        fused.weight.data.copy_(merged_kernel)
        fused.bias.data.copy_(merged_bias)

        # Fold the trailing BatchNorm into the merged convolution.
        norm = self.bn
        scale = norm.weight / (norm.running_var + norm.eps) ** 0.5
        folded_kernel = fused.weight * scale[:, None, None, None]
        folded_bias = norm.bias + (fused.bias - norm.running_mean) * norm.weight / \
            (norm.running_var + norm.eps) ** 0.5
        fused.weight.data.copy_(folded_kernel)
        fused.bias.data.copy_(folded_bias)
        return fused
class RepViT(nn.Module):
    """RepViT backbone returning feature maps at down-sampling ratios 4, 8, 16 and 32.

    Args:
        cfgs: sequence of per-block rows ``[k, t, c, use_se, use_hs, s]``
            (kernel size, expansion factor, output channels, SE flag,
            activation flag, stride). ``cfgs[0][2]`` also sets the stem width.

    Attributes:
        features: ``nn.ModuleList`` with the stem followed by one
            ``RepViTBlock`` per config row.
        channel: channel count of each returned feature map, measured once at
            construction with a 1x3x640x640 dummy input.
    """

    def __init__(self, cfgs):
        super(RepViT, self).__init__()
        # setting of inverted residual blocks
        self.cfgs = cfgs

        # building first layer: two stride-2 convs give an overall stride of 4
        input_channel = self.cfgs[0][2]
        patch_embed = torch.nn.Sequential(
            Conv2d_BN(3, input_channel // 2, 3, 2, 1),
            torch.nn.GELU(),
            Conv2d_BN(input_channel // 2, input_channel, 3, 2, 1),
        )
        layers = [patch_embed]

        # building inverted residual blocks
        block = RepViTBlock
        for k, t, c, use_se, use_hs, s in self.cfgs:
            output_channel = _make_divisible(c, 8)
            exp_size = _make_divisible(input_channel * t, 8)
            layers.append(block(input_channel, exp_size, output_channel, k, s, use_se, use_hs))
            input_channel = output_channel
        self.features = nn.ModuleList(layers)
        self.channel = self._probe_channels()

    @torch.no_grad()
    def _probe_channels(self):
        """Measure output channel counts without touching model state.

        The probe runs in eval mode and without autograd so that the random
        dummy input neither updates BatchNorm running statistics nor records
        a computation graph. The previous training flag is restored afterwards.
        """
        was_training = self.training
        self.eval()
        try:
            outputs = self.forward(torch.randn(1, 3, 640, 640))
        finally:
            self.train(was_training)
        return [feat.size(1) for feat in outputs]

    def forward(self, x):
        """Return a 4-element list of feature maps for ratios ``[4, 8, 16, 32]``.

        The ratio is computed from the height only (``x.size(2)``). Each slot
        keeps the output of the last layer producing that ratio; a slot stays
        ``None`` if no layer produces it.
        """
        input_size = x.size(2)
        scale = [4, 8, 16, 32]
        features = [None, None, None, None]
        for f in self.features:
            x = f(x)
            ratio = input_size // x.size(2)
            if ratio in scale:
                features[scale.index(ratio)] = x
        return features

    def switch_to_deploy(self):
        """Fuse all re-parameterizable submodules in place via ``replace_batchnorm``."""
        replace_batchnorm(self)
def repvit_m0_9(weights=''):
    """Construct the RepViT ``m0_9`` backbone.

    Args:
        weights: optional path to a checkpoint file. When non-empty, the file
            is loaded with ``torch.load`` and its ``'model'`` entry is merged
            into the model's state dict through ``update_weight``.

    Returns:
        The constructed ``RepViT`` model.
    """
    cfgs = [
        # k, t, c, SE, HS, s
        [3, 2, 48, 1, 0, 1],
        [3, 2, 48, 0, 0, 1],
        [3, 2, 48, 0, 0, 1],
        [3, 2, 96, 0, 0, 2],
        [3, 2, 96, 1, 0, 1],
        [3, 2, 96, 0, 0, 1],
        [3, 2, 96, 0, 0, 1],
        [3, 2, 192, 0, 1, 2],
        [3, 2, 192, 1, 1, 1],
        [3, 2, 192, 0, 1, 1],
        [3, 2, 192, 1, 1, 1],
        [3, 2, 192, 0, 1, 1],
        [3, 2, 192, 1, 1, 1],
        [3, 2, 192, 0, 1, 1],
        [3, 2, 192, 1, 1, 1],
        [3, 2, 192, 0, 1, 1],
        [3, 2, 192, 1, 1, 1],
        [3, 2, 192, 0, 1, 1],
        [3, 2, 192, 1, 1, 1],
        [3, 2, 192, 0, 1, 1],
        [3, 2, 192, 1, 1, 1],
        [3, 2, 192, 0, 1, 1],
        [3, 2, 192, 0, 1, 1],
        [3, 2, 384, 0, 1, 2],
        [3, 2, 384, 1, 1, 1],
        [3, 2, 384, 0, 1, 1]
    ]
    model = RepViT(cfgs)
    if weights:
        # The freshly built model lives on the CPU; map the checkpoint there so
        # that files saved from a GPU also load on machines without that device.
        checkpoint = torch.load(weights, map_location='cpu')
        model.load_state_dict(update_weight(model.state_dict(), checkpoint['model']))
    return model
def repvit_m1_5(weights=''):
    """Construct the RepViT ``m1_5`` backbone.

    Args:
        weights: optional path to a checkpoint file. When non-empty, the file
            is loaded with ``torch.load`` and its ``'model'`` entry is merged
            into the model's state dict through ``update_weight``.

    Returns:
        The constructed ``RepViT`` model.
    """
    cfgs = [
        # k, t, c, SE, HS, s
        [3, 2, 64, 1, 0, 1],
        [3, 2, 64, 0, 0, 1],
        [3, 2, 64, 1, 0, 1],
        [3, 2, 64, 0, 0, 1],
        [3, 2, 64, 0, 0, 1],
        [3, 2, 128, 0, 0, 2],
        [3, 2, 128, 1, 0, 1],
        [3, 2, 128, 0, 0, 1],
        [3, 2, 128, 1, 0, 1],
        [3, 2, 128, 0, 0, 1],
        [3, 2, 128, 0, 0, 1],
        [3, 2, 256, 0, 1, 2],
        [3, 2, 256, 1, 1, 1],
        [3, 2, 256, 0, 1, 1],
        [3, 2, 256, 1, 1, 1],
        [3, 2, 256, 0, 1, 1],
        [3, 2, 256, 1, 1, 1],
        [3, 2, 256, 0, 1, 1],
        [3, 2, 256, 1, 1, 1],
        [3, 2, 256, 0, 1, 1],
        [3, 2, 256, 1, 1, 1],
        [3, 2, 256, 0, 1, 1],
        [3, 2, 256, 1, 1, 1],
        [3, 2, 256, 0, 1, 1],
        [3, 2, 256, 1, 1, 1],
        [3, 2, 256, 0, 1, 1],
        [3, 2, 256, 1, 1, 1],
        [3, 2, 256, 0, 1, 1],
        [3, 2, 256, 1, 1, 1],
        [3, 2, 256, 0, 1, 1],
        [3, 2, 256, 1, 1, 1],
        [3, 2, 256, 0, 1, 1],
        [3, 2, 256, 1, 1, 1],
        [3, 2, 256, 0, 1, 1],
        [3, 2, 256, 1, 1, 1],
        [3, 2, 256, 0, 1, 1],
        [3, 2, 256, 0, 1, 1],
        [3, 2, 512, 0, 1, 2],
        [3, 2, 512, 1, 1, 1],
        [3, 2, 512, 0, 1, 1],
        [3, 2, 512, 1, 1, 1],
        [3, 2, 512, 0, 1, 1]
    ]
    model = RepViT(cfgs)
    if weights:
        # The freshly built model lives on the CPU; map the checkpoint there so
        # that files saved from a GPU also load on machines without that device.
        checkpoint = torch.load(weights, map_location='cpu')
        model.load_state_dict(update_weight(model.state_dict(), checkpoint['model']))
    return model
0, 0, 1], [3, 2, 320, 0, 1, 2], [3, 2, 320, 1, 1, 1], [3, 2, 320, 0, 1, 1], [3, 2, 320, 1, 1, 1], [3, 2, 320, 0, 1, 1], [3, 2, 320, 1, 1, 1], [3, 2, 320, 0, 1, 1], [3, 2, 320, 1, 1, 1], [3, 2, 320, 0, 1, 1], [3, 2, 320, 1, 1, 1], [3, 2, 320, 0, 1, 1], [3, 2, 320, 1, 1, 1], [3, 2, 320, 0, 1, 1], [3, 2, 320, 1, 1, 1], [3, 2, 320, 0, 1, 1], [3, 2, 320, 1, 1, 1], [3, 2, 320, 0, 1, 1], [3, 2, 320, 1, 1, 1], [3, 2, 320, 0, 1, 1], [3, 2, 320, 1, 1, 1], [3, 2, 320, 0, 1, 1], [3, 2, 320, 1, 1, 1], [3, 2, 320, 0, 1, 1], [3, 2, 320, 1, 1, 1], [3, 2, 320, 0, 1, 1], [3, 2, 320, 1, 1, 1], [3, 2, 320, 0, 1, 1], [3, 2, 320, 1, 1, 1], [3, 2, 320, 0, 1, 1], [3, 2, 320, 1, 1, 1], [3, 2, 320, 0, 1, 1], [3, 2, 320, 1, 1, 1], [3, 2, 320, 0, 1, 1], [3, 2, 320, 1, 1, 1], [3, 2, 320, 0, 1, 1], # [3, 2, 320, 1, 1, 1], # [3, 2, 320, 0, 1, 1], [3, 2, 320, 0, 1, 1], [3, 2, 640, 0, 1, 2], [3, 2, 640, 1, 1, 1], [3, 2, 640, 0, 1, 1], # [3, 2, 640, 1, 1, 1], # [3, 2, 640, 0, 1, 1] ] model = RepViT(cfgs) if weights: model.load_state_dict(update_weight(model.state_dict(), torch.load(weights)['model'])) return model if __name__ == '__main__': model = repvit_m2_3('repvit_m2_3_distill_450e.pth') inputs = torch.randn((1, 3, 640, 640)) res = model(inputs) for i in res: print(i.size()) ================================================ FILE: yolo-improve/yolov5-backbone/SwinTransformer/SwinTransformer.py ================================================ # -------------------------------------------------------- # Swin Transformer # Copyright (c) 2021 Microsoft # Licensed under The MIT License [see LICENSE for details] # Written by Ze Liu, Yutong Lin, Yixuan Wei # -------------------------------------------------------- import torch import torch.nn as nn import torch.nn.functional as F import torch.utils.checkpoint as checkpoint import numpy as np from timm.models.layers import DropPath, to_2tuple, trunc_normal_ __all__ = ['SwinTransformer_Tiny'] class Mlp(nn.Module): """ Multilayer perceptron.""" def 
def window_partition(x, window_size):
    """Split a feature map into non-overlapping square windows.

    Args:
        x: (B, H, W, C)
        window_size (int): window size

    Returns:
        windows: (num_windows*B, window_size, window_size, C), ordered by
        batch, then window row, then window column.
    """
    batch, height, width, channels = x.shape
    rows, cols = height // window_size, width // window_size
    tiled = x.view(batch, rows, window_size, cols, window_size, channels)
    # Bring the two window-grid axes together before flattening them with the batch.
    grouped = tiled.permute(0, 1, 3, 2, 4, 5).contiguous()
    return grouped.view(-1, window_size, window_size, channels)
Default: 0.0 """ def __init__(self, dim, window_size, num_heads, qkv_bias=True, qk_scale=None, attn_drop=0., proj_drop=0.): super().__init__() self.dim = dim self.window_size = window_size # Wh, Ww self.num_heads = num_heads head_dim = dim // num_heads self.scale = qk_scale or head_dim ** -0.5 # define a parameter table of relative position bias self.relative_position_bias_table = nn.Parameter( torch.zeros((2 * window_size[0] - 1) * (2 * window_size[1] - 1), num_heads)) # 2*Wh-1 * 2*Ww-1, nH # get pair-wise relative position index for each token inside the window coords_h = torch.arange(self.window_size[0]) coords_w = torch.arange(self.window_size[1]) coords = torch.stack(torch.meshgrid([coords_h, coords_w])) # 2, Wh, Ww coords_flatten = torch.flatten(coords, 1) # 2, Wh*Ww relative_coords = coords_flatten[:, :, None] - coords_flatten[:, None, :] # 2, Wh*Ww, Wh*Ww relative_coords = relative_coords.permute(1, 2, 0).contiguous() # Wh*Ww, Wh*Ww, 2 relative_coords[:, :, 0] += self.window_size[0] - 1 # shift to start from 0 relative_coords[:, :, 1] += self.window_size[1] - 1 relative_coords[:, :, 0] *= 2 * self.window_size[1] - 1 relative_position_index = relative_coords.sum(-1) # Wh*Ww, Wh*Ww self.register_buffer("relative_position_index", relative_position_index) self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias) self.attn_drop = nn.Dropout(attn_drop) self.proj = nn.Linear(dim, dim) self.proj_drop = nn.Dropout(proj_drop) trunc_normal_(self.relative_position_bias_table, std=.02) self.softmax = nn.Softmax(dim=-1) def forward(self, x, mask=None): """ Forward function. 
Args: x: input features with shape of (num_windows*B, N, C) mask: (0/-inf) mask with shape of (num_windows, Wh*Ww, Wh*Ww) or None """ B_, N, C = x.shape qkv = self.qkv(x).reshape(B_, N, 3, self.num_heads, C // self.num_heads).permute(2, 0, 3, 1, 4) q, k, v = qkv[0], qkv[1], qkv[2] # make torchscript happy (cannot use tensor as tuple) q = q * self.scale attn = (q @ k.transpose(-2, -1)) relative_position_bias = self.relative_position_bias_table[self.relative_position_index.view(-1)].view( self.window_size[0] * self.window_size[1], self.window_size[0] * self.window_size[1], -1) # Wh*Ww,Wh*Ww,nH relative_position_bias = relative_position_bias.permute(2, 0, 1).contiguous() # nH, Wh*Ww, Wh*Ww attn = attn + relative_position_bias.unsqueeze(0) if mask is not None: nW = mask.shape[0] attn = attn.view(B_ // nW, nW, self.num_heads, N, N) + mask.unsqueeze(1).unsqueeze(0) attn = attn.view(-1, self.num_heads, N, N) attn = self.softmax(attn) else: attn = self.softmax(attn) attn = self.attn_drop(attn) x = (attn @ v).transpose(1, 2).reshape(B_, N, C) x = self.proj(x) x = self.proj_drop(x) return x class SwinTransformerBlock(nn.Module): """ Swin Transformer Block. Args: dim (int): Number of input channels. num_heads (int): Number of attention heads. window_size (int): Window size. shift_size (int): Shift size for SW-MSA. mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set. drop (float, optional): Dropout rate. Default: 0.0 attn_drop (float, optional): Attention dropout rate. Default: 0.0 drop_path (float, optional): Stochastic depth rate. Default: 0.0 act_layer (nn.Module, optional): Activation layer. Default: nn.GELU norm_layer (nn.Module, optional): Normalization layer. 
Default: nn.LayerNorm """ def __init__(self, dim, num_heads, window_size=7, shift_size=0, mlp_ratio=4., qkv_bias=True, qk_scale=None, drop=0., attn_drop=0., drop_path=0., act_layer=nn.GELU, norm_layer=nn.LayerNorm): super().__init__() self.dim = dim self.num_heads = num_heads self.window_size = window_size self.shift_size = shift_size self.mlp_ratio = mlp_ratio assert 0 <= self.shift_size < self.window_size, "shift_size must in 0-window_size" self.norm1 = norm_layer(dim) self.attn = WindowAttention( dim, window_size=to_2tuple(self.window_size), num_heads=num_heads, qkv_bias=qkv_bias, qk_scale=qk_scale, attn_drop=attn_drop, proj_drop=drop) self.drop_path = DropPath(drop_path) if drop_path > 0. else nn.Identity() self.norm2 = norm_layer(dim) mlp_hidden_dim = int(dim * mlp_ratio) self.mlp = Mlp(in_features=dim, hidden_features=mlp_hidden_dim, act_layer=act_layer, drop=drop) self.H = None self.W = None def forward(self, x, mask_matrix): """ Forward function. Args: x: Input feature, tensor size (B, H*W, C). H, W: Spatial resolution of the input feature. mask_matrix: Attention mask for cyclic shift. 
""" B, L, C = x.shape H, W = self.H, self.W assert L == H * W, "input feature has wrong size" shortcut = x x = self.norm1(x) x = x.view(B, H, W, C) # pad feature maps to multiples of window size pad_l = pad_t = 0 pad_r = (self.window_size - W % self.window_size) % self.window_size pad_b = (self.window_size - H % self.window_size) % self.window_size x = F.pad(x, (0, 0, pad_l, pad_r, pad_t, pad_b)) _, Hp, Wp, _ = x.shape # cyclic shift if self.shift_size > 0: shifted_x = torch.roll(x, shifts=(-self.shift_size, -self.shift_size), dims=(1, 2)) attn_mask = mask_matrix.type(x.dtype) else: shifted_x = x attn_mask = None # partition windows x_windows = window_partition(shifted_x, self.window_size) # nW*B, window_size, window_size, C x_windows = x_windows.view(-1, self.window_size * self.window_size, C) # nW*B, window_size*window_size, C # W-MSA/SW-MSA attn_windows = self.attn(x_windows, mask=attn_mask) # nW*B, window_size*window_size, C # merge windows attn_windows = attn_windows.view(-1, self.window_size, self.window_size, C) shifted_x = window_reverse(attn_windows, self.window_size, Hp, Wp) # B H' W' C # reverse cyclic shift if self.shift_size > 0: x = torch.roll(shifted_x, shifts=(self.shift_size, self.shift_size), dims=(1, 2)) else: x = shifted_x if pad_r > 0 or pad_b > 0: x = x[:, :H, :W, :].contiguous() x = x.view(B, H * W, C) # FFN x = shortcut + self.drop_path(x) x = x + self.drop_path(self.mlp(self.norm2(x))) return x class PatchMerging(nn.Module): """ Patch Merging Layer Args: dim (int): Number of input channels. norm_layer (nn.Module, optional): Normalization layer. Default: nn.LayerNorm """ def __init__(self, dim, norm_layer=nn.LayerNorm): super().__init__() self.dim = dim self.reduction = nn.Linear(4 * dim, 2 * dim, bias=False) self.norm = norm_layer(4 * dim) def forward(self, x, H, W): """ Forward function. Args: x: Input feature, tensor size (B, H*W, C). H, W: Spatial resolution of the input feature. 
""" B, L, C = x.shape assert L == H * W, "input feature has wrong size" x = x.view(B, H, W, C) # padding pad_input = (H % 2 == 1) or (W % 2 == 1) if pad_input: x = F.pad(x, (0, 0, 0, W % 2, 0, H % 2)) x0 = x[:, 0::2, 0::2, :] # B H/2 W/2 C x1 = x[:, 1::2, 0::2, :] # B H/2 W/2 C x2 = x[:, 0::2, 1::2, :] # B H/2 W/2 C x3 = x[:, 1::2, 1::2, :] # B H/2 W/2 C x = torch.cat([x0, x1, x2, x3], -1) # B H/2 W/2 4*C x = x.view(B, -1, 4 * C) # B H/2*W/2 4*C x = self.norm(x) x = self.reduction(x) return x class BasicLayer(nn.Module): """ A basic Swin Transformer layer for one stage. Args: dim (int): Number of feature channels depth (int): Depths of this stage. num_heads (int): Number of attention head. window_size (int): Local window size. Default: 7. mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. Default: 4. qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set. drop (float, optional): Dropout rate. Default: 0.0 attn_drop (float, optional): Attention dropout rate. Default: 0.0 drop_path (float | tuple[float], optional): Stochastic depth rate. Default: 0.0 norm_layer (nn.Module, optional): Normalization layer. Default: nn.LayerNorm downsample (nn.Module | None, optional): Downsample layer at the end of the layer. Default: None use_checkpoint (bool): Whether to use checkpointing to save memory. Default: False. 
""" def __init__(self, dim, depth, num_heads, window_size=7, mlp_ratio=4., qkv_bias=True, qk_scale=None, drop=0., attn_drop=0., drop_path=0., norm_layer=nn.LayerNorm, downsample=None, use_checkpoint=False): super().__init__() self.window_size = window_size self.shift_size = window_size // 2 self.depth = depth self.use_checkpoint = use_checkpoint # build blocks self.blocks = nn.ModuleList([ SwinTransformerBlock( dim=dim, num_heads=num_heads, window_size=window_size, shift_size=0 if (i % 2 == 0) else window_size // 2, mlp_ratio=mlp_ratio, qkv_bias=qkv_bias, qk_scale=qk_scale, drop=drop, attn_drop=attn_drop, drop_path=drop_path[i] if isinstance(drop_path, list) else drop_path, norm_layer=norm_layer) for i in range(depth)]) # patch merging layer if downsample is not None: self.downsample = downsample(dim=dim, norm_layer=norm_layer) else: self.downsample = None def forward(self, x, H, W): """ Forward function. Args: x: Input feature, tensor size (B, H*W, C). H, W: Spatial resolution of the input feature. 
""" # calculate attention mask for SW-MSA Hp = int(np.ceil(H / self.window_size)) * self.window_size Wp = int(np.ceil(W / self.window_size)) * self.window_size img_mask = torch.zeros((1, Hp, Wp, 1), device=x.device) # 1 Hp Wp 1 h_slices = (slice(0, -self.window_size), slice(-self.window_size, -self.shift_size), slice(-self.shift_size, None)) w_slices = (slice(0, -self.window_size), slice(-self.window_size, -self.shift_size), slice(-self.shift_size, None)) cnt = 0 for h in h_slices: for w in w_slices: img_mask[:, h, w, :] = cnt cnt += 1 mask_windows = window_partition(img_mask, self.window_size) # nW, window_size, window_size, 1 mask_windows = mask_windows.view(-1, self.window_size * self.window_size) attn_mask = mask_windows.unsqueeze(1) - mask_windows.unsqueeze(2) attn_mask = attn_mask.masked_fill(attn_mask != 0, float(-100.0)).masked_fill(attn_mask == 0, float(0.0)) for blk in self.blocks: blk.H, blk.W = H, W if self.use_checkpoint: x = checkpoint.checkpoint(blk, x, attn_mask) else: x = blk(x, attn_mask) if self.downsample is not None: x_down = self.downsample(x, H, W) Wh, Ww = (H + 1) // 2, (W + 1) // 2 return x, H, W, x_down, Wh, Ww else: return x, H, W, x, H, W class PatchEmbed(nn.Module): """ Image to Patch Embedding Args: patch_size (int): Patch token size. Default: 4. in_chans (int): Number of input image channels. Default: 3. embed_dim (int): Number of linear projection output channels. Default: 96. norm_layer (nn.Module, optional): Normalization layer. 
class SwinTransformer(nn.Module):
    """ Swin Transformer backbone.
        A PyTorch impl of : `Swin Transformer: Hierarchical Vision Transformer using Shifted Windows`  -
          https://arxiv.org/pdf/2103.14030

    Args:
        pretrain_img_size (int): Input image size for training the pretrained model,
            used in absolute position embedding. Default 224.
        patch_size (int | tuple(int)): Patch size. Default: 4.
        in_chans (int): Number of input image channels. Default: 3.
        embed_dim (int): Number of linear projection output channels. Default: 96.
        depths (tuple[int]): Depths of each Swin Transformer stage.
        num_heads (tuple[int]): Number of attention head of each stage.
        window_size (int): Window size. Default: 7.
        mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. Default: 4.
        qkv_bias (bool): If True, add a learnable bias to query, key, value. Default: True
        qk_scale (float): Override default qk scale of head_dim ** -0.5 if set.
        drop_rate (float): Dropout rate.
        attn_drop_rate (float): Attention dropout rate. Default: 0.
        drop_path_rate (float): Stochastic depth rate. Default: 0.2.
        norm_layer (nn.Module): Normalization layer. Default: nn.LayerNorm.
        ape (bool): If True, add absolute position embedding to the patch embedding. Default: False.
        patch_norm (bool): If True, add normalization after patch embedding. Default: True.
        out_indices (Sequence[int]): Output from which stages.
        frozen_stages (int): Stored on the instance; this class contains no code that
            acts on it, so no parameters are frozen here.
        use_checkpoint (bool): Whether to use checkpointing to save memory. Default: False.

    Attributes:
        channel (list[int]): Channel count of every returned feature map, measured once
            at construction with a 1x3x640x640 dummy input.
    """

    def __init__(self,
                 pretrain_img_size=224,
                 patch_size=4,
                 in_chans=3,
                 embed_dim=96,
                 depths=[2, 2, 6, 2],
                 num_heads=[3, 6, 12, 24],
                 window_size=7,
                 mlp_ratio=4.,
                 qkv_bias=True,
                 qk_scale=None,
                 drop_rate=0.,
                 attn_drop_rate=0.,
                 drop_path_rate=0.2,
                 norm_layer=nn.LayerNorm,
                 ape=False,
                 patch_norm=True,
                 out_indices=(0, 1, 2, 3),
                 frozen_stages=-1,
                 use_checkpoint=False):
        super().__init__()

        self.pretrain_img_size = pretrain_img_size
        self.num_layers = len(depths)
        self.embed_dim = embed_dim
        self.ape = ape
        self.patch_norm = patch_norm
        self.out_indices = out_indices
        self.frozen_stages = frozen_stages

        # split image into non-overlapping patches
        self.patch_embed = PatchEmbed(
            patch_size=patch_size, in_chans=in_chans, embed_dim=embed_dim,
            norm_layer=norm_layer if self.patch_norm else None)

        # absolute position embedding
        if self.ape:
            pretrain_img_size = to_2tuple(pretrain_img_size)
            patch_size = to_2tuple(patch_size)
            patches_resolution = [pretrain_img_size[0] // patch_size[0], pretrain_img_size[1] // patch_size[1]]

            self.absolute_pos_embed = nn.Parameter(torch.zeros(1, embed_dim, patches_resolution[0], patches_resolution[1]))
            trunc_normal_(self.absolute_pos_embed, std=.02)

        self.pos_drop = nn.Dropout(p=drop_rate)

        # stochastic depth decay rule: linearly increasing drop-path rate per block
        dpr = [x.item() for x in torch.linspace(0, drop_path_rate, sum(depths))]

        # build layers
        self.layers = nn.ModuleList()
        for i_layer in range(self.num_layers):
            layer = BasicLayer(
                dim=int(embed_dim * 2 ** i_layer),
                depth=depths[i_layer],
                num_heads=num_heads[i_layer],
                window_size=window_size,
                mlp_ratio=mlp_ratio,
                qkv_bias=qkv_bias,
                qk_scale=qk_scale,
                drop=drop_rate,
                attn_drop=attn_drop_rate,
                drop_path=dpr[sum(depths[:i_layer]):sum(depths[:i_layer + 1])],
                norm_layer=norm_layer,
                # every stage except the last ends with a patch-merging downsample
                downsample=PatchMerging if (i_layer < self.num_layers - 1) else None,
                use_checkpoint=use_checkpoint)
            self.layers.append(layer)

        num_features = [int(embed_dim * 2 ** i) for i in range(self.num_layers)]
        self.num_features = num_features

        # add a norm layer for each output
        for i_layer in out_indices:
            layer = norm_layer(num_features[i_layer])
            layer_name = f'norm{i_layer}'
            self.add_module(layer_name, layer)

        # Shape probe only: skip autograd so no graph is recorded for the dummy pass.
        with torch.no_grad():
            self.channel = [i.size(1) for i in self.forward(torch.randn(1, 3, 640, 640))]

    def forward(self, x):
        """Return a list with one (B, C, H, W) feature map per stage in ``out_indices``."""
        x = self.patch_embed(x)

        Wh, Ww = x.size(2), x.size(3)
        if self.ape:
            # interpolate the position embedding to the corresponding size
            absolute_pos_embed = F.interpolate(self.absolute_pos_embed, size=(Wh, Ww), mode='bicubic')
            x = (x + absolute_pos_embed).flatten(2).transpose(1, 2)  # B Wh*Ww C
        else:
            x = x.flatten(2).transpose(1, 2)
        x = self.pos_drop(x)

        outs = []
        for i in range(self.num_layers):
            layer = self.layers[i]
            # x_out/H/W: stage output before downsampling; x/Wh/Ww: input to the next stage
            x_out, H, W, x, Wh, Ww = layer(x, Wh, Ww)

            if i in self.out_indices:
                norm_layer = getattr(self, f'norm{i}')
                x_out = norm_layer(x_out)

                out = x_out.view(-1, H, W, self.num_features[i]).permute(0, 3, 1, 2).contiguous()
                outs.append(out)

        return outs
class GRNwithNHWC(nn.Module):
    """ GRN (Global Response Normalization) layer for channels-last input.

    The L2 norm of every channel is taken over the two spatial axes, divided
    by its mean across channels (plus 1e-6), and used to rescale the input:
    ``(gamma * Nx + 1) * x``, plus ``beta`` when ``use_bias`` is set.
    We assume the inputs to this layer are (N, H, W, C)
    """

    def __init__(self, dim, use_bias=True):
        super().__init__()
        self.use_bias = use_bias
        # gamma starts at zero, so the layer is initially the identity (plus beta = 0).
        self.gamma = nn.Parameter(torch.zeros(1, 1, 1, dim))
        if use_bias:
            self.beta = nn.Parameter(torch.zeros(1, 1, 1, dim))

    def forward(self, x):
        spatial_norm = torch.norm(x, p=2, dim=(1, 2), keepdim=True)
        relative = spatial_norm / (spatial_norm.mean(dim=-1, keepdim=True) + 1e-6)
        scaled = (self.gamma * relative + 1) * x
        if not self.use_bias:
            return scaled
        return scaled + self.beta
def convert_dilated_to_nondilated(kernel, dilate_rate):
    """Expand a dilated conv kernel into the equivalent dense (non-dilated) kernel.

    Zeros are inserted between the kernel taps by running a transposed
    convolution with a 1x1 kernel of ones and ``stride=dilate_rate``, so a
    k x k kernel becomes ``dilate_rate * (k - 1) + 1`` wide.

    Args:
        kernel: weight tensor of shape (out_channels, in_channels_per_group, k, k).
        dilate_rate (int): dilation of the convolution the kernel belongs to.

    Returns:
        Tensor with the same first two dimensions, dtype and device as
        ``kernel`` and enlarged spatial size.
    """
    # Match the kernel's dtype as well as its device; a float32 identity would
    # make conv_transpose2d fail on fp16/fp64 kernels.
    identity_kernel = torch.ones((1, 1, 1, 1), dtype=kernel.dtype, device=kernel.device)
    if kernel.size(1) == 1:
        # This is a DW kernel
        return F.conv_transpose2d(kernel, identity_kernel, stride=dilate_rate)
    # This is a dense or group-wise (but not DW) kernel: expand one input-channel slice at a time
    slices = [
        F.conv_transpose2d(kernel[:, i:i + 1, :, :], identity_kernel, stride=dilate_rate)
        for i in range(kernel.size(1))
    ]
    return torch.cat(slices, dim=1)
if kernel_size == 17: self.kernel_sizes = [5, 9, 3, 3, 3] self.dilates = [1, 2, 4, 5, 7] elif kernel_size == 15: self.kernel_sizes = [5, 7, 3, 3, 3] self.dilates = [1, 2, 3, 5, 7] elif kernel_size == 13: self.kernel_sizes = [5, 7, 3, 3, 3] self.dilates = [1, 2, 3, 4, 5] elif kernel_size == 11: self.kernel_sizes = [5, 5, 3, 3, 3] self.dilates = [1, 2, 3, 4, 5] elif kernel_size == 9: self.kernel_sizes = [5, 5, 3, 3] self.dilates = [1, 2, 3, 4] elif kernel_size == 7: self.kernel_sizes = [5, 3, 3] self.dilates = [1, 2, 3] elif kernel_size == 5: self.kernel_sizes = [3, 3] self.dilates = [1, 2] else: raise ValueError('Dilated Reparam Block requires kernel_size >= 5') if not deploy: self.origin_bn = get_bn(channels, use_sync_bn) for k, r in zip(self.kernel_sizes, self.dilates): self.__setattr__('dil_conv_k{}_{}'.format(k, r), nn.Conv2d(in_channels=channels, out_channels=channels, kernel_size=k, stride=1, padding=(r * (k - 1) + 1) // 2, dilation=r, groups=channels, bias=False)) self.__setattr__('dil_bn_k{}_{}'.format(k, r), get_bn(channels, use_sync_bn=use_sync_bn)) def forward(self, x): if not hasattr(self, 'origin_bn'): # deploy mode return self.lk_origin(x) out = self.origin_bn(self.lk_origin(x)) for k, r in zip(self.kernel_sizes, self.dilates): conv = self.__getattr__('dil_conv_k{}_{}'.format(k, r)) bn = self.__getattr__('dil_bn_k{}_{}'.format(k, r)) out = out + bn(conv(x)) return out def merge_dilated_branches(self): if hasattr(self, 'origin_bn'): origin_k, origin_b = fuse_bn(self.lk_origin, self.origin_bn) for k, r in zip(self.kernel_sizes, self.dilates): conv = self.__getattr__('dil_conv_k{}_{}'.format(k, r)) bn = self.__getattr__('dil_bn_k{}_{}'.format(k, r)) branch_k, branch_b = fuse_bn(conv, bn) origin_k = merge_dilated_into_large_kernel(origin_k, branch_k, r) origin_b += branch_b merged_conv = get_conv2d(origin_k.size(0), origin_k.size(0), origin_k.size(2), stride=1, padding=origin_k.size(2)//2, dilation=1, groups=origin_k.size(0), bias=True, 
attempt_use_lk_impl=self.attempt_use_lk_impl) merged_conv.weight.data = origin_k merged_conv.bias.data = origin_b self.lk_origin = merged_conv self.__delattr__('origin_bn') for k, r in zip(self.kernel_sizes, self.dilates): self.__delattr__('dil_conv_k{}_{}'.format(k, r)) self.__delattr__('dil_bn_k{}_{}'.format(k, r)) class UniRepLKNetBlock(nn.Module): def __init__(self, dim, kernel_size, drop_path=0., layer_scale_init_value=1e-6, deploy=False, attempt_use_lk_impl=True, with_cp=False, use_sync_bn=False, ffn_factor=4): super().__init__() self.with_cp = with_cp # if deploy: # print('------------------------------- Note: deploy mode') # if self.with_cp: # print('****** note with_cp = True, reduce memory consumption but may slow down training ******') self.need_contiguous = (not deploy) or kernel_size >= 7 if kernel_size == 0: self.dwconv = nn.Identity() self.norm = nn.Identity() elif deploy: self.dwconv = get_conv2d(dim, dim, kernel_size=kernel_size, stride=1, padding=kernel_size // 2, dilation=1, groups=dim, bias=True, attempt_use_lk_impl=attempt_use_lk_impl) self.norm = nn.Identity() elif kernel_size >= 7: self.dwconv = DilatedReparamBlock(dim, kernel_size, deploy=deploy, use_sync_bn=use_sync_bn, attempt_use_lk_impl=attempt_use_lk_impl) self.norm = get_bn(dim, use_sync_bn=use_sync_bn) elif kernel_size == 1: self.dwconv = nn.Conv2d(dim, dim, kernel_size=kernel_size, stride=1, padding=kernel_size // 2, dilation=1, groups=1, bias=deploy) self.norm = get_bn(dim, use_sync_bn=use_sync_bn) else: assert kernel_size in [3, 5] self.dwconv = nn.Conv2d(dim, dim, kernel_size=kernel_size, stride=1, padding=kernel_size // 2, dilation=1, groups=dim, bias=deploy) self.norm = get_bn(dim, use_sync_bn=use_sync_bn) self.se = SEBlock(dim, dim // 4) ffn_dim = int(ffn_factor * dim) self.pwconv1 = nn.Sequential( NCHWtoNHWC(), nn.Linear(dim, ffn_dim)) self.act = nn.Sequential( nn.GELU(), GRNwithNHWC(ffn_dim, use_bias=not deploy)) if deploy: self.pwconv2 = nn.Sequential( nn.Linear(ffn_dim, 
dim), NHWCtoNCHW()) else: self.pwconv2 = nn.Sequential( nn.Linear(ffn_dim, dim, bias=False), NHWCtoNCHW(), get_bn(dim, use_sync_bn=use_sync_bn)) self.gamma = nn.Parameter(layer_scale_init_value * torch.ones(dim), requires_grad=True) if (not deploy) and layer_scale_init_value is not None \ and layer_scale_init_value > 0 else None self.drop_path = DropPath(drop_path) if drop_path > 0. else nn.Identity() def forward(self, inputs): def _f(x): if self.need_contiguous: x = x.contiguous() y = self.se(self.norm(self.dwconv(x))) y = self.pwconv2(self.act(self.pwconv1(y))) if self.gamma is not None: y = self.gamma.view(1, -1, 1, 1) * y return self.drop_path(y) + x if self.with_cp and inputs.requires_grad: return checkpoint.checkpoint(_f, inputs) else: return _f(inputs) def reparameterize(self): if hasattr(self.dwconv, 'merge_dilated_branches'): self.dwconv.merge_dilated_branches() if hasattr(self.norm, 'running_var') and hasattr(self.dwconv, 'lk_origin'): std = (self.norm.running_var + self.norm.eps).sqrt() self.dwconv.lk_origin.weight.data *= (self.norm.weight / std).view(-1, 1, 1, 1) self.dwconv.lk_origin.bias.data = self.norm.bias + (self.dwconv.lk_origin.bias - self.norm.running_mean) * self.norm.weight / std self.norm = nn.Identity() if self.gamma is not None: final_scale = self.gamma.data self.gamma = None else: final_scale = 1 if self.act[1].use_bias and len(self.pwconv2) == 3: grn_bias = self.act[1].beta.data self.act[1].__delattr__('beta') self.act[1].use_bias = False linear = self.pwconv2[0] grn_bias_projected_bias = (linear.weight.data @ grn_bias.view(-1, 1)).squeeze() bn = self.pwconv2[2] std = (bn.running_var + bn.eps).sqrt() new_linear = nn.Linear(linear.in_features, linear.out_features, bias=True) new_linear.weight.data = linear.weight * (bn.weight / std * final_scale).view(-1, 1) linear_bias = 0 if linear.bias is None else linear.bias.data linear_bias += grn_bias_projected_bias new_linear.bias.data = (bn.bias + (linear_bias - bn.running_mean) * bn.weight / 
class UniRepLKNet(nn.Module):
    r""" UniRepLKNet
        A PyTorch impl of UniRepLKNet, used here as a 4-stage feature extractor.

    After construction ``self.channel`` holds the channel count of each of the
    four feature maps returned by ``forward``.

    Args:
        in_chans (int): Number of input image channels. Default: 3
        num_classes (int): Accepted for signature compatibility; no classification head is built here. Default: 1000
        depths (tuple(int)): Number of blocks at each stage. Default: (3, 3, 27, 3)
        dims (tuple(int)): Feature dimension at each stage. Default: (96, 192, 384, 768)
        drop_path_rate (float): Stochastic depth rate. Default: 0.
        layer_scale_init_value (float): Init value for Layer Scale. Default: 1e-6.
        head_init_scale (float): Accepted for signature compatibility; unused here. Default: 1.
        kernel_sizes (tuple(tuple(int))): Kernel size for each block. None means using the default
            settings looked up from ``depths``. Default: None.
        deploy (bool): deploy = True means using the inference structure. Default: False
        with_cp (bool): with_cp = True means using torch.utils.checkpoint to save GPU memory. Default: False
        init_cfg (dict): Accepted for signature compatibility; unused here. Default: None
        attempt_use_lk_impl (bool): forwarded to every block (request for the iGEMM large-kernel impl). Default: True
        use_sync_bn (bool): use_sync_bn = True means using sync BN. Use it if your batch size is small. Default: False

    Raises:
        ValueError: if ``kernel_sizes`` is None and ``depths`` has no default kernel-size table.
    """

    def __init__(self,
                 in_chans=3,
                 num_classes=1000,
                 depths=(3, 3, 27, 3),
                 dims=(96, 192, 384, 768),
                 drop_path_rate=0.,
                 layer_scale_init_value=1e-6,
                 head_init_scale=1.,
                 kernel_sizes=None,
                 deploy=False,
                 with_cp=False,
                 init_cfg=None,
                 attempt_use_lk_impl=True,
                 use_sync_bn=False,
                 **kwargs
                 ):
        super().__init__()

        depths = tuple(depths)
        if kernel_sizes is None:
            if depths in default_depths_to_kernel_sizes:
                kernel_sizes = default_depths_to_kernel_sizes[depths]
            else:
                raise ValueError('no default kernel size settings for the given depths, '
                                 'please specify kernel sizes for each block, e.g., '
                                 '((3, 3), (13, 13), (13, 13, 13, 13, 13, 13), (13, 13))')
        for i in range(4):
            assert len(kernel_sizes[i]) == depths[i], 'kernel sizes do not match the depths'

        self.with_cp = with_cp

        # Stochastic-depth rate grows linearly from 0 to drop_path_rate over all blocks.
        dp_rates = [x.item() for x in torch.linspace(0, drop_path_rate, sum(depths))]

        # Stem: two stride-2 convs (overall stride 4), then one stride-2 conv per later stage.
        self.downsample_layers = nn.ModuleList()
        self.downsample_layers.append(nn.Sequential(
            nn.Conv2d(in_chans, dims[0] // 2, kernel_size=3, stride=2, padding=1),
            LayerNorm(dims[0] // 2, eps=1e-6, data_format="channels_first"),
            nn.GELU(),
            nn.Conv2d(dims[0] // 2, dims[0], kernel_size=3, stride=2, padding=1),
            LayerNorm(dims[0], eps=1e-6, data_format="channels_first")))

        for i in range(3):
            self.downsample_layers.append(nn.Sequential(
                nn.Conv2d(dims[i], dims[i + 1], kernel_size=3, stride=2, padding=1),
                LayerNorm(dims[i + 1], eps=1e-6, data_format="channels_first")))

        self.stages = nn.ModuleList()

        cur = 0  # index of the first block of the current stage inside dp_rates
        for i in range(4):
            main_stage = nn.Sequential(
                *[UniRepLKNetBlock(dim=dims[i], kernel_size=kernel_sizes[i][j], drop_path=dp_rates[cur + j],
                                   layer_scale_init_value=layer_scale_init_value, deploy=deploy,
                                   attempt_use_lk_impl=attempt_use_lk_impl,
                                   with_cp=with_cp, use_sync_bn=use_sync_bn) for j in
                  range(depths[i])])
            self.stages.append(main_stage)
            cur += depths[i]

        self.output_mode = 'features'
        # One output norm per stage, registered as norm0 .. norm3.
        norm_layer = partial(LayerNorm, eps=1e-6, data_format="channels_first")
        for i_layer in range(4):
            layer = norm_layer(dims[i_layer])
            layer_name = f'norm{i_layer}'
            self.add_module(layer_name, layer)

        # Probe the network once to record the channel count of every output.
        # The dummy input must follow in_chans (a hard-coded 3 breaks any other
        # setting), and no autograd graph is needed for a shape probe.
        with torch.no_grad():
            probe = torch.randn(1, in_chans, 640, 640)
            self.channel = [feat.size(1) for feat in self.forward(probe)]
        self.apply(self._init_weights)

    def _init_weights(self, m):
        """Truncated-normal init for conv/linear weights, zeros for their biases."""
        if isinstance(m, (nn.Conv2d, nn.Linear)):
            trunc_normal_(m.weight, std=.02)
            if hasattr(m, 'bias') and m.bias is not None:
                nn.init.constant_(m.bias, 0)

    def forward(self, x):
        """Return the four normalized stage outputs as a list (``output_mode == 'features'``)."""
        if self.output_mode == 'logits':
            # NOTE(review): self.norm and self.head are not created in __init__ of this
            # class, so this branch only works if a caller attaches them first.
            for stage_idx in range(4):
                x = self.downsample_layers[stage_idx](x)
                x = self.stages[stage_idx](x)
            x = self.norm(x.mean([-2, -1]))
            x = self.head(x)
            return x
        elif self.output_mode == 'features':
            outs = []
            for stage_idx in range(4):
                x = self.downsample_layers[stage_idx](x)
                x = self.stages[stage_idx](x)
                outs.append(getattr(self, f'norm{stage_idx}')(x))
            return outs
        else:
            raise ValueError('Defined new output mode?')

    def switch_to_deploy(self):
        """Call ``reparameterize()`` on every sub-module that provides it."""
        for m in self.modules():
            if hasattr(m, 'reparameterize'):
                m.reparameterize()
def update_weight(model_dict, weight_dict):
    """Copy the compatible entries of a checkpoint into a model state dict.

    An entry of ``weight_dict`` is taken only when its key already exists in
    ``model_dict`` and both values report the same shape. ``model_dict`` is
    updated in place, a one-line summary is printed, and the same dict object
    is returned.
    """
    matched = {
        name: value
        for name, value in weight_dict.items()
        if name in model_dict and np.shape(model_dict[name]) == np.shape(value)
    }
    model_dict.update(matched)
    print(f'loading weights... {len(matched)}/{len(model_dict)} items')
    return model_dict
class activation(nn.ReLU):
    """ReLU followed by a learnable depth-wise conv of size ``2 * act_num + 1``.

    In training structure the conv output goes through a BatchNorm; after
    ``switch_to_deploy`` the BatchNorm is folded into the conv weight and bias.
    """

    def __init__(self, dim, act_num=3, deploy=False):
        super().__init__()
        self.dim = dim
        self.act_num = act_num
        self.deploy = deploy
        side = act_num * 2 + 1
        # One (side x side) filter per channel; re-initialised below.
        self.weight = torch.nn.Parameter(torch.randn(dim, 1, side, side))
        self.bias = None  # only materialised by switch_to_deploy()
        self.bn = nn.BatchNorm2d(dim, eps=1e-6)
        weight_init.trunc_normal_(self.weight, std=.02)

    def forward(self, x):
        rectified = super().forward(x)
        if not self.deploy:
            mixed = torch.nn.functional.conv2d(
                rectified, self.weight, padding=self.act_num, groups=self.dim)
            return self.bn(mixed)
        pad = (self.act_num * 2 + 1) // 2
        return torch.nn.functional.conv2d(
            rectified, self.weight, self.bias, padding=pad, groups=self.dim)

    def _fuse_bn_tensor(self, weight, bn):
        """Return ``(weight, bias)`` of a bias-free conv with ``bn`` folded in."""
        std = (bn.running_var + bn.eps).sqrt()
        per_channel = (bn.weight / std).reshape(-1, 1, 1, 1)
        fused_kernel = weight * per_channel
        # The conv itself has no bias, hence the literal 0.
        fused_bias = bn.bias + (0 - bn.running_mean) * bn.weight / std
        return fused_kernel, fused_bias

    def switch_to_deploy(self):
        """Fold the BatchNorm into the conv once; later calls are no-ops."""
        if self.deploy:
            return
        fused_kernel, fused_bias = self._fuse_bn_tensor(self.weight, self.bn)
        self.weight.data = fused_kernel
        self.bias = torch.nn.Parameter(torch.zeros(self.dim))
        self.bias.data = fused_bias
        del self.bn
        self.deploy = True
class VanillaNet(nn.Module):
    """VanillaNet backbone that returns multi-scale feature maps.

    ``forward`` returns a 4-slot list; slot ``k`` holds the latest feature map
    whose height is ``input_height // (4, 8, 16, 32)[k]`` (or ``None`` if no
    layer produced that scale). ``self.channel`` stores the channel count of
    each slot, measured with a dummy 640x640 input at construction time.

    Args:
        in_chans: number of input image channels.
        num_classes: accepted for signature compatibility; unused here.
        dims: channel widths; stage ``i`` maps ``dims[i]`` to ``dims[i + 1]``,
            so ``len(strides) + 1`` entries are required.
        drop_rate: accepted for signature compatibility; unused here.
        act_num: forwarded to every ``activation`` / ``Block``.
        strides: pooling stride of each stage (one stage per entry).
        deploy: build the fused inference structure directly.
        ada_pool: optional per-stage adaptive-pool output sizes, forwarded to ``Block``.
    """

    def __init__(self, in_chans=3, num_classes=1000, dims=[96, 192, 384, 768],
                 drop_rate=0, act_num=3, strides=[2,2,2,1], deploy=False, ada_pool=None, **kwargs):
        super().__init__()
        self.deploy = deploy
        if self.deploy:
            self.stem = nn.Sequential(
                nn.Conv2d(in_chans, dims[0], kernel_size=4, stride=4),
                activation(dims[0], act_num)
            )
        else:
            self.stem1 = nn.Sequential(
                nn.Conv2d(in_chans, dims[0], kernel_size=4, stride=4),
                nn.BatchNorm2d(dims[0], eps=1e-6),
            )
            self.stem2 = nn.Sequential(
                nn.Conv2d(dims[0], dims[0], kernel_size=1, stride=1),
                nn.BatchNorm2d(dims[0], eps=1e-6),
                activation(dims[0], act_num)
            )

        self.act_learn = 1

        self.stages = nn.ModuleList()
        for i in range(len(strides)):
            if not ada_pool:
                stage = Block(dim=dims[i], dim_out=dims[i+1], act_num=act_num, stride=strides[i], deploy=deploy)
            else:
                stage = Block(dim=dims[i], dim_out=dims[i+1], act_num=act_num, stride=strides[i], deploy=deploy, ada_pool=ada_pool[i])
            self.stages.append(stage)
        self.depth = len(strides)

        self.apply(self._init_weights)
        # Shape probe: the dummy input must follow in_chans (a hard-coded 3
        # breaks any other setting) and needs no autograd graph.
        with torch.no_grad():
            probe = torch.randn(1, in_chans, 640, 640)
            self.channel = [feat.size(1) for feat in self.forward(probe)]

    def _init_weights(self, m):
        """Truncated-normal init for conv/linear weights, zeros for their biases."""
        if isinstance(m, (nn.Conv2d, nn.Linear)):
            weight_init.trunc_normal_(m.weight, std=.02)
            if m.bias is not None:  # bias-free layers have nothing to reset
                nn.init.constant_(m.bias, 0)

    def change_act(self, m):
        """Set the leaky-ReLU slope ``act_learn`` on the stem and on every stage."""
        for i in range(self.depth):
            self.stages[i].act_learn = m
        self.act_learn = m

    def forward(self, x):
        input_size = x.size(2)
        scale = [4, 8, 16, 32]
        features = [None, None, None, None]
        if self.deploy:
            x = self.stem(x)
        else:
            x = self.stem1(x)
            x = torch.nn.functional.leaky_relu(x,self.act_learn)
            x = self.stem2(x)
        # Later layers at the same scale overwrite earlier ones, so each slot
        # ends up holding the deepest feature map of its stride.
        if input_size // x.size(2) in scale:
            features[scale.index(input_size // x.size(2))] = x
        for i in range(self.depth):
            x = self.stages[i](x)
            if input_size // x.size(2) in scale:
                features[scale.index(input_size // x.size(2))] = x
        return features

    def _fuse_bn_tensor(self, conv, bn):
        """Return ``(weight, bias)`` of ``conv`` with the following ``bn`` folded in."""
        kernel = conv.weight
        bias = conv.bias
        running_mean = bn.running_mean
        running_var = bn.running_var
        gamma = bn.weight
        beta = bn.bias
        eps = bn.eps
        std = (running_var + eps).sqrt()
        t = (gamma / std).reshape(-1, 1, 1, 1)
        return kernel * t, beta + (bias - running_mean) * gamma / std

    def switch_to_deploy(self):
        """Fuse the stem and every stage into the inference structure (idempotent)."""
        if self.deploy:
            return
        self.stem2[2].switch_to_deploy()
        # Fold BN into the 4x4 stem conv.
        kernel, bias = self._fuse_bn_tensor(self.stem1[0], self.stem1[1])
        self.stem1[0].weight.data = kernel
        self.stem1[0].bias.data = bias
        # Fold BN into the 1x1 conv, then compose it with the 4x4 conv. The
        # leaky_relu between them is the identity only when act_learn == 1, so
        # the merge is exact only in that case.
        kernel, bias = self._fuse_bn_tensor(self.stem2[0], self.stem2[1])
        self.stem1[0].weight.data = torch.einsum('oi,icjk->ocjk', kernel.squeeze(3).squeeze(2), self.stem1[0].weight.data)
        self.stem1[0].bias.data = bias + (self.stem1[0].bias.data.view(1,-1,1,1)*kernel).sum(3).sum(2).sum(1)
        self.stem = torch.nn.Sequential(*[self.stem1[0], self.stem2[2]])
        del self.stem1
        del self.stem2

        for i in range(self.depth):
            self.stages[i].switch_to_deploy()

        self.deploy = True
def vanillanet_5(pretrained='', in_22k=False, **kwargs):
    """Build the 3-stage VanillaNet-5 and optionally load ``model_ema`` weights.

    ``pretrained`` is a checkpoint path; only entries whose name and shape
    match the model are loaded. ``in_22k`` is accepted but not used.
    """
    widths = [base * 4 for base in (128, 256, 512, 1024)]
    net = VanillaNet(dims=widths, strides=[2, 2, 2], **kwargs)
    if not pretrained:
        return net
    ema_state = torch.load(pretrained)['model_ema']
    net.load_state_dict(update_weight(net.state_dict(), ema_state))
    return net
def vanillanet_12(pretrained='', in_22k=False, **kwargs):
    """Build the 10-stage VanillaNet-12 and optionally load ``model_ema`` weights.

    ``pretrained`` is a checkpoint path; only entries whose name and shape
    match the model are loaded. ``in_22k`` is accepted but not used.
    """
    # 11 widths for 10 stages: 2 x 512, 1 x 1024, 6 x 2048, 2 x 4096.
    widths = [128*4] * 2 + [256*4] + [512*4] * 6 + [1024*4] * 2
    pool_strides = [1, 2, 2] + [1] * 5 + [2, 1]
    net = VanillaNet(dims=widths, strides=pool_strides, **kwargs)
    if not pretrained:
        return net
    ema_state = torch.load(pretrained)['model_ema']
    net.load_state_dict(update_weight(net.state_dict(), ema_state))
    return net
yolo-improve/yolov5-backbone/fasternet/faster_cfg/fasternet_l.yaml ================================================ mlp_ratio: 2 embed_dim: 192 depths: [3, 4, 18, 3] feature_dim: 1280 patch_size: 4 patch_stride: 4 patch_size2: 2 patch_stride2: 2 layer_scale_init_value: 0 # no layer scale drop_path_rate: 0.3 norm_layer: BN act_layer: RELU n_div: 4 ================================================ FILE: yolo-improve/yolov5-backbone/fasternet/faster_cfg/fasternet_m.yaml ================================================ mlp_ratio: 2 embed_dim: 144 depths: [3, 4, 18, 3] feature_dim: 1280 patch_size: 4 patch_stride: 4 patch_size2: 2 patch_stride2: 2 layer_scale_init_value: 0 # no layer scale drop_path_rate: 0.2 norm_layer: BN act_layer: RELU n_div: 4 ================================================ FILE: yolo-improve/yolov5-backbone/fasternet/faster_cfg/fasternet_s.yaml ================================================ mlp_ratio: 2 embed_dim: 128 depths: [1, 2, 13, 2] feature_dim: 1280 patch_size: 4 patch_stride: 4 patch_size2: 2 patch_stride2: 2 layer_scale_init_value: 0 # no layer scale drop_path_rate: 0.1 norm_layer: BN act_layer: RELU n_div: 4 ================================================ FILE: yolo-improve/yolov5-backbone/fasternet/faster_cfg/fasternet_t0.yaml ================================================ mlp_ratio: 2 embed_dim: 40 depths: [1, 2, 8, 2] feature_dim: 1280 patch_size: 4 patch_stride: 4 patch_size2: 2 patch_stride2: 2 layer_scale_init_value: 0 # no layer scale drop_path_rate: 0. 
class Partial_conv3(nn.Module):
    """3x3 conv applied to the first ``dim // n_div`` channels only.

    The remaining channels pass through unchanged. ``forward`` selects the
    implementation: ``'slicing'`` or ``'split_cat'``; anything else raises
    ``NotImplementedError``.
    """

    def __init__(self, dim, n_div, forward):
        super().__init__()
        self.dim_conv3 = dim // n_div
        self.dim_untouched = dim - self.dim_conv3
        self.partial_conv3 = nn.Conv2d(self.dim_conv3, self.dim_conv3, 3, 1, 1, bias=False)

        # Bind the requested implementation as this instance's forward.
        implementations = (
            ('slicing', self.forward_slicing),
            ('split_cat', self.forward_split_cat),
        )
        for mode, impl in implementations:
            if forward == mode:
                self.forward = impl
                break
        else:
            raise NotImplementedError

    def forward_slicing(self, x: Tensor) -> Tensor:
        # only for inference
        # Work on a copy so the caller's tensor stays intact for the later
        # residual connection.
        out = x.clone()
        head = out[:, :self.dim_conv3, :, :]
        out[:, :self.dim_conv3, :, :] = self.partial_conv3(head)
        return out

    def forward_split_cat(self, x: Tensor) -> Tensor:
        # for training/inference
        conv_part, passthrough = torch.split(x, [self.dim_conv3, self.dim_untouched], dim=1)
        return torch.cat((self.partial_conv3(conv_part), passthrough), 1)
class FasterNet(nn.Module):
    """FasterNet backbone that returns one normalised feature map per stage.

    The model is a ``PatchEmbed`` stem followed by ``len(depths)``
    ``BasicStage`` groups, with a ``PatchMerging`` layer between consecutive
    stages.  ``forward`` returns the outputs of the layers listed in
    ``self.out_indices``, each passed through its own norm layer, and
    ``self.channel`` stores the channel count of every returned map.

    Args:
        in_chans: number of input image channels.
        embed_dim: channel width of the first stage; doubled at every merge.
        depths: number of ``MLPBlock``s in each stage.
        mlp_ratio, n_div, layer_scale_init_value, pconv_fw_type: forwarded to
            every ``BasicStage``.
        patch_size, patch_stride: kernel/stride of the stem convolution.
        patch_size2, patch_stride2: kernel/stride of the merging convolutions.
        patch_norm: whether the stem is followed by a norm layer.
        drop_path_rate: maximum stochastic-depth rate (linearly increased
            from 0 over all blocks).
        norm_layer: only ``'BN'`` is supported.
        act_layer: ``'GELU'`` or ``'RELU'``.
        num_classes, feature_dim, init_cfg, pretrained, **kwargs: accepted so
            that full config files can be splatted in; not used by this class.

    Raises:
        NotImplementedError: for an unsupported ``norm_layer``/``act_layer``,
            or when the ``FORK_LAST3`` environment variable is set.
    """

    def __init__(self,
                 in_chans=3,
                 num_classes=1000,
                 embed_dim=96,
                 depths=(1, 2, 8, 2),
                 mlp_ratio=2.,
                 n_div=4,
                 patch_size=4,
                 patch_stride=4,
                 patch_size2=2,  # for subsequent layers
                 patch_stride2=2,
                 patch_norm=True,
                 feature_dim=1280,
                 drop_path_rate=0.1,
                 layer_scale_init_value=0,
                 norm_layer='BN',
                 act_layer='RELU',
                 init_cfg=None,
                 pretrained=None,
                 pconv_fw_type='split_cat',
                 **kwargs):
        super().__init__()

        if norm_layer == 'BN':
            norm_layer = nn.BatchNorm2d
        else:
            raise NotImplementedError(f'unsupported norm_layer: {norm_layer!r}')

        if act_layer == 'GELU':
            act_layer = nn.GELU
        elif act_layer == 'RELU':
            act_layer = partial(nn.ReLU, inplace=True)
        else:
            raise NotImplementedError(f'unsupported act_layer: {act_layer!r}')

        self.num_stages = len(depths)
        self.embed_dim = embed_dim
        self.patch_norm = patch_norm
        self.num_features = int(embed_dim * 2 ** (self.num_stages - 1))
        self.mlp_ratio = mlp_ratio
        self.depths = depths

        # split image into non-overlapping patches
        self.patch_embed = PatchEmbed(
            patch_size=patch_size,
            patch_stride=patch_stride,
            in_chans=in_chans,
            embed_dim=embed_dim,
            norm_layer=norm_layer if self.patch_norm else None
        )

        # stochastic depth decay rule
        dpr = [x.item() for x in torch.linspace(0, drop_path_rate, sum(depths))]

        # build layers
        stages_list = []
        for i_stage in range(self.num_stages):
            stage_dim = int(embed_dim * 2 ** i_stage)
            first_block = sum(depths[:i_stage])
            stages_list.append(
                BasicStage(dim=stage_dim,
                           n_div=n_div,
                           depth=depths[i_stage],
                           mlp_ratio=self.mlp_ratio,
                           drop_path=dpr[first_block:first_block + depths[i_stage]],
                           layer_scale_init_value=layer_scale_init_value,
                           norm_layer=norm_layer,
                           act_layer=act_layer,
                           pconv_fw_type=pconv_fw_type)
            )

            # patch merging layer
            if i_stage < self.num_stages - 1:
                stages_list.append(
                    PatchMerging(patch_size2=patch_size2,
                                 patch_stride2=patch_stride2,
                                 dim=stage_dim,
                                 norm_layer=norm_layer)
                )

        self.stages = nn.Sequential(*stages_list)

        # add a norm layer for each output
        self.out_indices = [0, 2, 4, 6]
        for i_emb, i_layer in enumerate(self.out_indices):
            if i_emb == 0 and os.environ.get('FORK_LAST3', None):
                raise NotImplementedError
            layer = norm_layer(int(embed_dim * 2 ** i_emb))
            self.add_module(f'norm{i_layer}', layer)

        # Probe the output widths with a dummy image.  The dummy must have
        # ``in_chans`` channels (a hard-coded 3 breaks non-RGB models), and the
        # probe runs in eval mode without autograd so the random input neither
        # builds a graph nor leaks into the BatchNorm running statistics.
        was_training = self.training
        self.eval()
        with torch.no_grad():
            probe = self.forward(torch.randn(1, in_chans, 640, 640))
        self.train(was_training)
        self.channel = [feat.size(1) for feat in probe]

    def forward(self, x: Tensor) -> List[Tensor]:
        """Return the normalised feature maps of the layers in ``out_indices``."""
        # output the features of four stages for dense prediction
        x = self.patch_embed(x)
        outs = []
        for idx, stage in enumerate(self.stages):
            x = stage(x)
            if idx in self.out_indices:
                outs.append(getattr(self, f'norm{idx}')(x))
        return outs
def fasternet_t0(weights=None, cfg='models/faster_cfg/fasternet_t0.yaml'):
    """Build FasterNet-T0 from a YAML config, optionally loading a checkpoint.

    Args:
        weights: path of a checkpoint readable by ``torch.load``; ``None``
            leaves the model at its initial weights.
        cfg: path of the YAML file whose keys are passed to ``FasterNet``.

    Returns:
        The constructed ``FasterNet``.  When ``weights`` is given, only the
        checkpoint entries accepted by ``update_weight`` are loaded.
    """
    with open(cfg) as cfg_file:
        hyper_params = yaml.load(cfg_file, Loader=yaml.SafeLoader)
    model = FasterNet(**hyper_params)
    if weights is None:
        return model
    checkpoint = torch.load(weights, map_location='cpu')
    merged_state = update_weight(model.state_dict(), checkpoint)
    model.load_state_dict(merged_state)
    return model
class InceptionDWConv2d(nn.Module):
    """ Inception depthwise convolution

    The input channels are split into four groups: an identity group and
    three groups of ``int(in_channels * branch_ratio)`` channels that go
    through a square, a horizontal-band and a vertical-band depthwise
    convolution respectively.  The four results are concatenated back in the
    same order, so the output has the same shape as the input.
    """
    def __init__(self, in_channels, square_kernel_size=3, band_kernel_size=11, branch_ratio=0.125):
        super().__init__()

        branch_ch = int(in_channels * branch_ratio)  # channel numbers of a convolution branch
        square_pad = square_kernel_size // 2
        band_pad = band_kernel_size // 2
        self.dwconv_hw = nn.Conv2d(branch_ch, branch_ch, square_kernel_size,
                                   padding=square_pad, groups=branch_ch)
        self.dwconv_w = nn.Conv2d(branch_ch, branch_ch, kernel_size=(1, band_kernel_size),
                                  padding=(0, band_pad), groups=branch_ch)
        self.dwconv_h = nn.Conv2d(branch_ch, branch_ch, kernel_size=(band_kernel_size, 1),
                                  padding=(band_pad, 0), groups=branch_ch)
        # Sizes used by torch.split: identity part first, then the three branches.
        self.split_indexes = (in_channels - 3 * branch_ch, branch_ch, branch_ch, branch_ch)

    def forward(self, x):
        identity, square, band_w, band_h = torch.split(x, self.split_indexes, dim=1)
        mixed = (
            identity,
            self.dwconv_hw(square),
            self.dwconv_w(band_w),
            self.dwconv_h(band_h),
        )
        return torch.cat(mixed, dim=1)
class MlpHead(nn.Module):
    """ MLP classification head

    Global-average-pools the spatial dimensions, then applies
    ``fc1 -> act -> norm -> dropout -> fc2``.  The hidden width is
    ``int(mlp_ratio * dim)`` and the output width is ``num_classes``.
    """
    def __init__(self, dim, num_classes=1000, mlp_ratio=3, act_layer=nn.GELU,
                 norm_layer=partial(nn.LayerNorm, eps=1e-6), drop=0., bias=True):
        super().__init__()
        hidden_width = int(mlp_ratio * dim)
        self.fc1 = nn.Linear(dim, hidden_width, bias=bias)
        self.act = act_layer()
        self.norm = norm_layer(hidden_width)
        self.fc2 = nn.Linear(hidden_width, num_classes, bias=bias)
        self.drop = nn.Dropout(drop)

    def forward(self, x):
        pooled = x.mean((2, 3))  # global average pooling
        hidden = self.norm(self.act(self.fc1(pooled)))
        return self.fc2(self.drop(hidden))
class MetaNeXt(nn.Module):
    r""" MetaNeXt
        A PyTorch impl of : `InceptionNeXt: When Inception Meets ConvNeXt`  - https://arxiv.org/abs/2303.16900

    Backbone-only variant: it builds the stem and the stages and ``forward``
    returns a list of four feature maps (strides 4, 8, 16, 32 relative to the
    input).  ``self.channel`` stores the channel count of each returned map.

    Args:
        in_chans (int): Number of input image channels. Default: 3
        num_classes (int): Stored on the instance; no classifier is built here. Default: 1000
        depths (tuple(int)): Number of blocks at each stage. Default: (3, 3, 9, 3)
        dims (tuple(int)): Feature dimension at each stage. Default: (96, 192, 384, 768)
        token_mixers: Token mixer function. Default: nn.Identity
        norm_layer: Normalization layer. Default: nn.BatchNorm2d
        act_layer: Activation function for MLP. Default: nn.GELU
        mlp_ratios (int or tuple(int)): MLP ratios. Default: (4, 4, 4, 3)
        head_fn: classifier head (accepted for interface compatibility, not used here)
        drop_rate (float): Head dropout rate (stored only)
        drop_path_rate (float): Stochastic depth rate. Default: 0.
        ls_init_value (float): Init value for Layer Scale. Default: 1e-6.
    """

    def __init__(
            self,
            in_chans=3,
            num_classes=1000,
            depths=(3, 3, 9, 3),
            dims=(96, 192, 384, 768),
            token_mixers=nn.Identity,
            norm_layer=nn.BatchNorm2d,
            act_layer=nn.GELU,
            mlp_ratios=(4, 4, 4, 3),
            head_fn=MlpHead,
            drop_rate=0.,
            drop_path_rate=0.,
            ls_init_value=1e-6,
            **kwargs,
    ):
        super().__init__()

        num_stage = len(depths)
        # NOTE(review): ``token_mixers`` is accepted but not forwarded, because
        # MetaNeXtStage takes no mixer argument; every block therefore uses
        # MetaNeXtBlock's own default mixer.
        if not isinstance(mlp_ratios, (list, tuple)):
            mlp_ratios = [mlp_ratios] * num_stage

        self.num_classes = num_classes
        self.drop_rate = drop_rate
        self.stem = nn.Sequential(
            nn.Conv2d(in_chans, dims[0], kernel_size=4, stride=4),
            norm_layer(dims[0])
        )

        # One list of per-block drop-path rates for every stage.
        dp_rates = [x.tolist() for x in torch.linspace(0, drop_path_rate, sum(depths)).split(depths)]
        stages = []
        prev_chs = dims[0]
        # feature resolution stages, each consisting of multiple residual blocks
        for i in range(num_stage):
            out_chs = dims[i]
            stages.append(MetaNeXtStage(
                prev_chs,
                out_chs,
                ds_stride=2 if i > 0 else 1,  # the stem already downsamples stage 0
                depth=depths[i],
                drop_path_rates=dp_rates[i],
                ls_init_value=ls_init_value,
                act_layer=act_layer,
                norm_layer=norm_layer,
                mlp_ratio=mlp_ratios[i],
            ))
            prev_chs = out_chs
        self.stages = nn.Sequential(*stages)
        self.num_features = prev_chs
        self.apply(self._init_weights)

        # Probe the output widths with a dummy image.  The dummy must have
        # ``in_chans`` channels (a hard-coded 3 breaks non-RGB models), and the
        # probe runs in eval mode without autograd so the random input neither
        # builds a graph nor leaks into the BatchNorm running statistics.
        was_training = self.training
        self.eval()
        with torch.no_grad():
            probe = self.forward(torch.randn(1, in_chans, 640, 640))
        self.train(was_training)
        self.channel = [feat.size(1) for feat in probe]

    @torch.jit.ignore
    def set_grad_checkpointing(self, enable=True):
        for s in self.stages:
            s.grad_checkpointing = enable

    @torch.jit.ignore
    def no_weight_decay(self):
        return {'norm'}

    def forward(self, x):
        """Return ``[P2, P3, P4, P5]``-style features indexed by their stride."""
        input_size = x.size(2)
        scale = [4, 8, 16, 32]
        features = [None, None, None, None]
        x = self.stem(x)
        features[scale.index(input_size // x.size(2))] = x
        for layer in self.stages:
            x = layer(x)
            stride = input_size // x.size(2)
            # A later layer with the same stride overwrites the earlier entry,
            # so each slot ends up holding the deepest map at that stride.
            if stride in scale:
                features[scale.index(stride)] = x
        return features

    def _init_weights(self, m):
        if isinstance(m, (nn.Conv2d, nn.Linear)):
            trunc_normal_(m.weight, std=.02)
            if m.bias is not None:
                nn.init.constant_(m.bias, 0)
def _load_pretrained_backbone(model):
    """Download the checkpoint named in ``model.default_cfg`` and load what fits.

    The model built in this file has no classifier (``_cfg`` names it
    ``head.fc``), so a strict ``load_state_dict`` of a full classification
    checkpoint would fail on the extra keys.  ``update_weight`` keeps only the
    entries whose name and shape match the model, and reports how many matched.
    """
    state_dict = torch.hub.load_state_dict_from_url(
        url=model.default_cfg['url'], map_location="cpu", check_hash=True)
    model.load_state_dict(update_weight(model.state_dict(), state_dict))


def inceptionnext_tiny(pretrained=False, **kwargs):
    """Build InceptionNeXt-T; ``pretrained=True`` loads the matching released weights."""
    model = MetaNeXt(depths=(3, 3, 9, 3), dims=(96, 192, 384, 768),
                     token_mixers=InceptionDWConv2d,
                     **kwargs
                     )
    model.default_cfg = default_cfgs['inceptionnext_tiny']
    if pretrained:
        _load_pretrained_backbone(model)
    return model


def inceptionnext_small(pretrained=False, **kwargs):
    """Build InceptionNeXt-S; ``pretrained=True`` loads the matching released weights."""
    model = MetaNeXt(depths=(3, 3, 27, 3), dims=(96, 192, 384, 768),
                     token_mixers=InceptionDWConv2d,
                     **kwargs
                     )
    model.default_cfg = default_cfgs['inceptionnext_small']
    if pretrained:
        _load_pretrained_backbone(model)
    return model


def inceptionnext_base(pretrained=False, **kwargs):
    """Build InceptionNeXt-B; ``pretrained=True`` loads the matching released weights."""
    model = MetaNeXt(depths=(3, 3, 27, 3), dims=(128, 256, 512, 1024),
                     token_mixers=InceptionDWConv2d,
                     **kwargs
                     )
    model.default_cfg = default_cfgs['inceptionnext_base']
    if pretrained:
        _load_pretrained_backbone(model)
    return model


def inceptionnext_base_384(pretrained=False, **kwargs):
    """Build InceptionNeXt-B (384 config); ``pretrained=True`` loads the matching released weights."""
    model = MetaNeXt(depths=[3, 3, 27, 3], dims=[128, 256, 512, 1024],
                     mlp_ratios=[4, 4, 4, 3],
                     token_mixers=InceptionDWConv2d,
                     **kwargs
                     )
    model.default_cfg = default_cfgs['inceptionnext_base_384']
    if pretrained:
        _load_pretrained_backbone(model)
    return model
Conv.default_act = nn.SiLU() LOGGER.info(f"{colorstr('activation:')} {act}") # print na = (len(anchors[0]) // 2) if isinstance(anchors, list) else anchors # number of anchors no = na * (nc + 5) # number of outputs = anchors * (classes + 5) is_backbone = False layers, save, c2 = [], [], ch[-1] # layers, savelist, ch out for i, (f, n, m, args) in enumerate(d['backbone'] + d['head']): # from, number, module, args try: t = m m = eval(m) if isinstance(m, str) else m # eval strings except: pass for j, a in enumerate(args): with contextlib.suppress(NameError): try: args[j] = eval(a) if isinstance(a, str) else a # eval strings except: args[j] = a n = n_ = max(round(n * gd), 1) if n > 1 else n # depth gain if m in { Conv, GhostConv, Bottleneck, GhostBottleneck, SPP, SPPF, DWConv, MixConv2d, Focus, CrossConv, BottleneckCSP, C3, C3TR, C3SPP, C3Ghost, nn.ConvTranspose2d, DWConvTranspose2d, C3x}: c1, c2 = ch[f], args[0] if c2 != no: # if not output c2 = make_divisible(c2 * gw, 8) args = [c1, c2, *args[1:]] if m in {BottleneckCSP, C3, C3TR, C3Ghost, C3x}: args.insert(2, n) # number of repeats n = 1 elif m is nn.BatchNorm2d: args = [ch[f]] elif m is Concat: c2 = sum(ch[x] for x in f) # TODO: channel, gw, gd elif m in {Detect, Segment}: args.append([ch[x] for x in f]) if isinstance(args[1], int): # number of anchors args[1] = [list(range(args[1] * 2))] * len(f) if m is Segment: args[3] = make_divisible(args[3] * gw, 8) elif m is Contract: c2 = ch[f] * args[0] ** 2 elif m is Expand: c2 = ch[f] // args[0] ** 2 elif isinstance(m, str): t = m m = timm.create_model(m, pretrained=args[0], features_only=True) c2 = m.feature_info.channels() # elif m in {}: # m = m(*args) # c2 = m.channel else: c2 = ch[f] if isinstance(c2, list): is_backbone = True m_ = m m_.backbone = True else: m_ = nn.Sequential(*(m(*args) for _ in range(n))) if n > 1 else m(*args) # module t = str(m)[8:-2].replace('__main__.', '') # module type np = sum(x.numel() for x in m_.parameters()) # number params m_.i, m_.f, 
def _forward_once(self, x, profile=False, visualize=False):
    """Run ``x`` through ``self.model`` layer by layer and return the last output.

    Each layer reads its input from the layer(s) named by its ``f`` attribute
    (``-1`` means the previous output).  Outputs whose index is in
    ``self.save`` are kept for later layers; the others are stored as ``None``.
    A layer carrying a ``backbone`` attribute returns a list of feature maps:
    the list is left-padded with ``None`` to five entries, each entry takes one
    slot of the saved outputs, and the last map is passed on.
    """
    saved_outputs, timings = [], []
    for layer in self.model:
        source = layer.f
        if source != -1:  # input comes from earlier layers
            if isinstance(source, int):
                x = saved_outputs[source]
            else:
                x = [x if j == -1 else saved_outputs[j] for j in source]
        if profile:
            self._profile_one_layer(layer, x, timings)
        if not hasattr(layer, 'backbone'):
            x = layer(x)  # run
            saved_outputs.append(x if layer.i in self.save else None)  # save output
        else:
            feats = layer(x)
            # Pad on the left so that the slot index matches the pyramid level.
            while len(feats) < 5:
                feats.insert(0, None)
            saved_outputs.extend(
                feat if slot in self.save else None for slot, feat in enumerate(feats)
            )
            x = feats[-1]
        if visualize:
            feature_visualization(x, layer.type, layer.i, save_dir=visualize)
    return x
def _make_divisible(v, divisor, min_value=None):
    """Round ``v`` to the nearest multiple of ``divisor``.

    The result is never smaller than ``min_value`` (``divisor`` when
    ``min_value`` is ``None``), and it is bumped up by one ``divisor`` when
    rounding would otherwise lose more than 10% of ``v``.
    """
    lower_bound = divisor if min_value is None else min_value
    nearest_multiple = int(v + divisor / 2) // divisor * divisor
    result = max(lower_bound, nearest_multiple)
    # Make sure that round down does not go down by more than 10%.
    if result < 0.9 * v:
        result += divisor
    return result
class DyDCNv2(nn.Module):
    """ModulatedDeformConv2d with normalization layer used in DyHead.

    This module cannot be configured with `conv_cfg=dict(type='DCNv2')`
    because DyHead calculates offset and mask from middle-level feature.

    Args:
        in_channels (int): Number of input channels.
        out_channels (int): Number of output channels.
        stride (int | tuple[int], optional): Stride of the convolution.
            Default: 1.
        norm_cfg (dict, optional): Config dict for normalization layer.
            Default: dict(type='GN', num_groups=16, requires_grad=True).
            ``None`` disables the norm layer and enables the conv bias.
    """

    def __init__(self,
                 in_channels,
                 out_channels,
                 stride=1,
                 norm_cfg=dict(type='GN', num_groups=16, requires_grad=True)):
        super().__init__()
        use_norm = norm_cfg is not None
        self.with_norm = use_norm
        # The conv only carries a bias when no norm layer follows it.
        self.conv = ModulatedDeformConv2d(
            in_channels, out_channels, 3, stride=stride, padding=1, bias=not use_norm)
        if use_norm:
            norm_entry = build_norm_layer(norm_cfg, out_channels)
            self.norm = norm_entry[1]

    def forward(self, x, offset, mask):
        """Forward function."""
        out = self.conv(x.contiguous(), offset, mask)
        return self.norm(out) if self.with_norm else out
https://github.com/microsoft/DynamicHead/blob/master/dyhead/dyrelu.py """ def __init__(self, in_channels, norm_type='GN', zero_init_offset=True, act_cfg=dict(type='HSigmoid', bias=3.0, divisor=6.0)): super().__init__() self.zero_init_offset = zero_init_offset # (offset_x, offset_y, mask) * kernel_size_y * kernel_size_x self.offset_and_mask_dim = 3 * 3 * 3 self.offset_dim = 2 * 3 * 3 if norm_type == 'GN': norm_dict = dict(type='GN', num_groups=16, requires_grad=True) elif norm_type == 'BN': norm_dict = dict(type='BN', requires_grad=True) self.spatial_conv_high = DyDCNv2(in_channels, in_channels, norm_cfg=norm_dict) self.spatial_conv_mid = DyDCNv2(in_channels, in_channels) self.spatial_conv_low = DyDCNv2(in_channels, in_channels, stride=2) self.spatial_conv_offset = nn.Conv2d( in_channels, self.offset_and_mask_dim, 3, padding=1) self.scale_attn_module = nn.Sequential( nn.AdaptiveAvgPool2d(1), nn.Conv2d(in_channels, 1, 1), nn.ReLU(inplace=True), build_activation_layer(act_cfg)) self.task_attn_module = DyReLU(in_channels) self._init_weights() def _init_weights(self): for m in self.modules(): if isinstance(m, nn.Conv2d): normal_init(m, 0, 0.01) if self.zero_init_offset: constant_init(self.spatial_conv_offset, 0) def forward(self, x): """Forward function.""" outs = [] for level in range(len(x)): # calculate offset and mask of DCNv2 from middle-level feature offset_and_mask = self.spatial_conv_offset(x[level]) offset = offset_and_mask[:, :self.offset_dim, :, :] mask = offset_and_mask[:, self.offset_dim:, :, :].sigmoid() mid_feat = self.spatial_conv_mid(x[level], offset, mask) sum_feat = mid_feat * self.scale_attn_module(mid_feat) summed_levels = 1 if level > 0: low_feat = self.spatial_conv_low(x[level - 1], offset, mask) sum_feat += low_feat * self.scale_attn_module(low_feat) summed_levels += 1 if level < len(x) - 1: # this upsample order is weird, but faster than natural order # https://github.com/microsoft/DynamicHead/issues/25 high_feat = F.interpolate( 
class Bottle2neck(nn.Module):
    """Res2Net-style bottleneck built from the project's ``Conv`` block.

    A 1x1 conv expands the input to ``width * scale`` channels, which are split
    into ``scale`` groups of ``width`` channels.  The first ``nums`` groups go
    through 3x3 convs in a cascade (each group is added to the previous conv
    output before its own conv); when ``scale != 1`` the last group is passed
    through untouched.  The groups are concatenated, projected by a final 1x1
    conv, optionally added to the input, and activated with SiLU.
    """
    expansion = 1

    def __init__(self, inplanes, planes, shortcut, baseWidth=26, scale = 4):
        """ Constructor
        Args:
            inplanes: input channel dimensionality
            planes: output channel dimensionality
            shortcut: add the block input to the output when truthy
            baseWidth: basic width of conv3x3
            scale: number of scale.
        """
        super(Bottle2neck, self).__init__()

        width = int(math.floor(planes * (baseWidth/64.0)))
        self.conv1 = Conv(inplanes, width*scale, k=1)

        # With a single scale there is no pass-through group, so one conv handles it.
        self.nums = 1 if scale == 1 else scale - 1
        self.convs = nn.ModuleList(Conv(width, width, k=3) for _ in range(self.nums))

        self.conv3 = Conv(width*scale, planes * self.expansion, k=1, act=False)

        self.silu = nn.SiLU(inplace=True)
        self.scale = scale
        self.width = width
        self.shortcut = shortcut

    def forward(self, x):
        # (a stray debug ``print(1)`` that fired on every forward pass was removed)
        out = self.conv1(x)
        spx = torch.split(out, self.width, 1)
        pieces = []
        sp = None
        for i in range(self.nums):
            # Cascade: every group after the first also sees the previous conv output.
            sp = spx[i] if i == 0 else sp + spx[i]
            sp = self.convs[i](sp)
            pieces.append(sp)
        if self.scale != 1:
            pieces.append(spx[self.nums])  # last group is passed through unchanged
        out = pieces[0] if len(pieces) == 1 else torch.cat(pieces, 1)

        out = self.conv3(out)
        if self.shortcut:
            out += x
        out = self.silu(out)
        return out
def box_iou_for_nms(box1, box2, GIoU=False, DIoU=False, CIoU=False, SIoU=False, EIou=False, eps=1e-7):
    """Return the IoU (or a selected IoU variant) of box1(1,4) to box2(n,4).

    Boxes are ``x1, y1, x2, y2`` along the last dimension.  At most one variant
    is applied, with the precedence CIoU, EIou, DIoU, GIoU, SIoU; with no flag
    set the plain IoU is returned.  ``eps`` guards the divisions.
    """
    b1_x1, b1_y1, b1_x2, b1_y2 = box1.chunk(4, -1)
    b2_x1, b2_y1, b2_x2, b2_y2 = box2.chunk(4, -1)
    w1, h1 = b1_x2 - b1_x1, (b1_y2 - b1_y1).clamp(eps)
    w2, h2 = b2_x2 - b2_x1, (b2_y2 - b2_y1).clamp(eps)

    # Intersection area
    inter = (b1_x2.minimum(b2_x2) - b1_x1.maximum(b2_x1)).clamp(0) * \
            (b1_y2.minimum(b2_y2) - b1_y1.maximum(b2_y1)).clamp(0)

    # Union Area
    union = w1 * h1 + w2 * h2 - inter + eps

    # IoU
    iou = inter / union
    if not (CIoU or DIoU or GIoU or EIou or SIoU):
        return iou  # IoU

    # Every variant needs the smallest enclosing box.  It used to be computed
    # only for the C/D/G/E variants, so SIoU failed on undefined cw/ch.
    cw = b1_x2.maximum(b2_x2) - b1_x1.minimum(b2_x1)  # convex (smallest enclosing box) width
    ch = b1_y2.maximum(b2_y2) - b1_y1.minimum(b2_y1)  # convex height

    if CIoU or DIoU or EIou:  # Distance or Complete IoU https://arxiv.org/abs/1911.08287v1
        c2 = cw ** 2 + ch ** 2 + eps  # convex diagonal squared
        rho2 = ((b2_x1 + b2_x2 - b1_x1 - b1_x2) ** 2 + (b2_y1 + b2_y2 - b1_y1 - b1_y2) ** 2) / 4  # center dist ** 2
        if CIoU:  # https://github.com/Zzh-tju/DIoU-SSD-pytorch/blob/master/utils/box/box_utils.py#L47
            v = (4 / math.pi ** 2) * (torch.atan(w2 / h2) - torch.atan(w1 / h1)).pow(2)
            with torch.no_grad():
                alpha = v / (v - iou + (1 + eps))
            return iou - (rho2 / c2 + v * alpha)  # CIoU
        if EIou:
            rho_w2 = ((b2_x2 - b2_x1) - (b1_x2 - b1_x1)) ** 2
            rho_h2 = ((b2_y2 - b2_y1) - (b1_y2 - b1_y1)) ** 2
            cw2 = cw ** 2 + eps
            ch2 = ch ** 2 + eps
            return iou - (rho2 / c2 + rho_w2 / cw2 + rho_h2 / ch2)
        return iou - rho2 / c2  # DIoU

    if GIoU:
        c_area = cw * ch + eps  # convex area
        return iou - (c_area - union) / c_area  # GIoU https://arxiv.org/pdf/1902.09630.pdf

    # SIoU Loss https://arxiv.org/pdf/2205.12740.pdf
    s_cw = (b2_x1 + b2_x2 - b1_x1 - b1_x2) * 0.5 + eps
    s_ch = (b2_y1 + b2_y2 - b1_y1 - b1_y2) * 0.5 + eps
    sigma = torch.pow(s_cw ** 2 + s_ch ** 2, 0.5)
    sin_alpha_1 = torch.abs(s_cw) / sigma
    sin_alpha_2 = torch.abs(s_ch) / sigma
    threshold = pow(2, 0.5) / 2
    sin_alpha = torch.where(sin_alpha_1 > threshold, sin_alpha_2, sin_alpha_1)
    angle_cost = torch.cos(torch.arcsin(sin_alpha) * 2 - math.pi / 2)
    rho_x = (s_cw / cw) ** 2
    rho_y = (s_ch / ch) ** 2
    gamma = angle_cost - 2
    distance_cost = 2 - torch.exp(gamma * rho_x) - torch.exp(gamma * rho_y)
    omiga_w = torch.abs(w1 - w2) / torch.max(w1, w2)
    omiga_h = torch.abs(h1 - h2) / torch.max(h1, h2)
    shape_cost = torch.pow(1 - torch.exp(-1 * omiga_w), 4) + torch.pow(1 - torch.exp(-1 * omiga_h), 4)
    return iou - 0.5 * (distance_cost + shape_cost)


def soft_nms(bboxes, scores, iou_thresh=0.5, sigma=0.5, score_threshold=0.25):
    """Gaussian Soft-NMS; return the indices of the kept boxes as a LongTensor.

    Boxes are visited starting from index 0 (the caller is expected to pass
    them sorted by score — TODO confirm against the call site).  For each kept
    box, the scores of the remaining boxes whose IoU exceeds ``iou_thresh``
    are multiplied by ``exp(-iou**2 / sigma)``; boxes whose score is no longer
    above ``score_threshold`` are dropped and the best remaining box is
    visited next.

    Note: ``scores`` is decayed **in place**, as before.

    Fixes over the previous version: the loop used to stop as soon as a single
    candidate was left, so the last surviving box (and a lone detection) was
    never returned, and ``squeeze()`` to 0-dim skipped the decay of the final
    pair.
    """
    order = torch.arange(0, scores.size(0)).to(bboxes.device)
    keep = []

    while order.numel() > 0:
        current = order[0]
        keep.append(int(current))
        rest = order[1:]
        if rest.numel() == 0:
            break

        # reshape(-1) keeps a 1-D tensor even when a single candidate is left.
        iou = box_iou_for_nms(bboxes[current], bboxes[rest]).reshape(-1)
        overlapped = iou > iou_thresh
        if overlapped.any():
            decay = torch.exp(-torch.pow(iou[overlapped], 2) / sigma)
            scores[rest[overlapped]] *= decay

        rest = rest[scores[rest] > score_threshold]
        if rest.numel() == 0:
            break

        # Move the best remaining candidate to the front so it is kept next.
        best = int(torch.argmax(scores[rest]))
        if best != 0:
            rest[[0, best]] = rest[[best, 0]]
        order = rest

    return torch.LongTensor(keep)
剪枝可以很好地衡量模型轻量化程度与精度的关系,是替换轻量化结构完全没办法比的,比如我模型剪枝可以压缩百分之30的计算量,精度只下降了百分之1,但是你通过换模块来达到压缩百分之30的计算量,一般时间就会变长,因为大部分轻量化模块都是由时间换空间,而且精度还会下降得比较多,但是剪枝可以很好地避免这个问题. ### 目前剪枝项目包含: 1. yolov5-PAGCP 2. yolov7-PAGCP 3. yolov7-prune 4. yolov5-prune ### 其中prune中的剪枝方法包含: 1. L1 2. Random 3. Slim 4. GroupSlim 5. GroupNorm 6. LAMP 7. GroupSL 8. GroupReg 9. GroupHessian 10. GroupTaylor ### 其中prune系列还有一些细节: 1. 支持稀疏训练时候可视化BN稀疏程度和数值。 2. 稀疏训练的稀疏系数会进行线性调整,让稀疏训练后期精度更容易回升,更稳定。 3. 支持设定加速比例,模型会进行自动压缩,压缩到指定比例或者达到最大压缩次数后会自动进入finetune。 ### 剪枝的一些顾虑 大家关心最多的一个问题就是,我的结构能不能剪之类的,目前剪枝都是基于Torch_Pruning库进行剪枝,其中PAGCP是版本比较旧的Torch_Pruning库,prune系列的都是最新Torch_Pruning库,所以PAGCP剪枝上兼容性会比prune系列的低,prune系列的可以跳过一些不能剪枝的层(某些复杂的结构可能在构建动态图的时候失败,这些就只能换结构),这个项目会有比较多的示例和视频教程教大家如何去剪自己的结构,注意点在哪里等等。这个剪枝项目是没办法保证所有的结构都能剪,有一定的风险,是否入手请自行考虑! ### 目前蒸馏方法包含: 1. Logical 1. L1 2. L2 3. AlignSoftTarget(自研,部分参考[Bridging Cross-task Protocol Inconsistency for Distillation in Dense Object Detection,ICCV 2023](https://link.zhihu.com/?target=https%3A//arxiv.org//pdf/2308.14286)) 2. Feature 1. [Mimic](https://openaccess.thecvf.com/content_cvpr_2017/papers/Li_Mimicking_Very_Efficient_CVPR_2017_paper.pdf) 2. [Masked Generative Distillation](https://link.zhihu.com/?target=https%3A//arxiv.org/pdf/2205.01529.pdf) (ECCV 2022) 3. [Channel-wise Distillation](https://arxiv.org/pdf/2011.13256.pdf) (ICCV 2021) 4. [ChSimLoss Distillation](https://openaccess.thecvf.com/content/ICCV2021/html/Liu_Exploring_Inter-Channel_Correlation_for_Diversity-Preserved_Knowledge_Distillation_ICCV_2021_paper.html) (ICCV2021) 5. [SPKDLoss Distillation](https://arxiv.org/pdf/1907.09682.pdf) (ICCV2019) ### 知识蒸馏的一些细节(具体项目会提供视频讲解) 1. Feature蒸馏可以自定义选择层进行蒸馏. 2. 蒸馏损失支持常数,线性,余弦进行动态调整. 3. 支持Logical和Feature一起使用. 4. 过程中会输出Logical和Feature的损失,让用户可以及时调整对应的损失系数. 5. 支持正常训练模型时候进行蒸馏和剪枝后finetune蒸馏. # 实验示例结果.(以下示例实验相关命令,视频教程,实验数据都在项目里面) ### Sparse:代表需要进行稀疏训练. ### 2.0x 代表的是设定为两倍加速(4.0x同理),当模型压缩达到设定的倍速时会自动进入finetune阶段.
### Yolov7 相关实验 #### Mode:Prune Dataset:CrowdHuman 20% Model:Yolov7-Tiny using OTA | model | Parameters | GFLOPs | Model Size | mAP50 | mAP50-95 | Inference Time(bs:32) | | :----: | :----: | :----: | :----: | :----: | :----: | :----: | | BaseLine | 6,010,302 | 13.0 | 12.0m | 0.76 | 0.429 | 0.6ms | | PAGCP-EXP1 | 3,239,782(53.9%) | 7.5(57.6%) | 6.4m(53.3%) | 0.747(-0.013) | 0.409(-0.02) | 0.5ms | | PAGCP-EXP2 | 2,035,468(33.8%) | 5.0(38.4%) | 4.1m(34.2%) | 0.731(-0.029) | 0.393(-0.026) | 0.5ms | | Slim(Sparse) 2.0x | 920,155(15.3%) | 6.2(47.7%) | 2.0m(16.7%) | 0.773(+0.013) | 0.429(0.0) | 0.6ms | | Slim(Sparse) 4.0x | 375,449(6.2%) | 3.2(24.6%) | 1.0m(8.3%) | 0.73(-0.03) | 0.376(-0.053) | 0.4ms | | GroupSlim (Sparse) 2.0x | 915,589(15.2%) | 6.4(49.2%) | 2.0m(16.7%) | 0.772(+0.012) | 0.43(+0.001) | 0.6ms | | GroupSlim (Sparse) 4.0x | 375,298(6.3%) | 3.2(24.6%) | 1.0m(8.3%) | 0.727(-0.033) | 0.372(-0.057) | 0.5ms | | LAMP 2.0x | 1,310,893(21.81%) | 6.5(50.0%) | 2.9m(24.1%) | 0.766(+0.006) | 0.423(-0.006) | 0.6ms | | GroupNorm 2.0x | 2,580,758(42.9%) | 6.5(50.0%) | 5.4m(41.5%) | 0.74(-0.02) | 0.398(-0.021) | 0.6ms | | Random 2.0x | 2,950,989(49.1%) | 6.5(50.0%) | 6.1m(46.9%) | 0.742(-0.018) | 0.399(-0.02) | 0.6ms | | L1 2.0x | 3,226,567(53.7%) | 6.4(49.2%) | 6.4m(56.3%) | 0.72(-0.04) | 0.387(0.042) | 0.6ms | #### Mode:Prune Dataset:CrowdHuman 20% Model:Yolov7-Tiny+MobileNetV3_Small+LSKBlock+TSOCDE+RepConv | model | Parameters | GFLOPs | Model Size | mAP50 | mAP50-95 | Inference Time(bs:32) | | :----: | :----: | :----: | :----: | :----: | :----: | :----: | | BaseLine | 24,665,523 | 33.0 | 48.0m | 0.68 | 0.36 | 1.5ms | | LAMP 2.0x | 8,963,220(36.3%) | 16.4(49.7%) | 18.0m(37.5%) | 0.676(-0.004) | 0.354(-0.006) | 1.3ms | | GroupSlim (Sparse) 2.0x | 10,686,041(43.3%) | 16.2(49.1%) | 22.0m(45.8%) | 0.641(-0.039) | 0.319(-0.041) | 1.4ms | | Slim (Sparse) 2.0x |9,211,532(37.3%) | 16.3(49.4%) | 19.0m(39.6%) | 0.669(-0.011) | 0.342(-0.018) | 1.4ms | | L1 1.5x | 21,384,927(86.7%) 
| 21.8(66.1%) | 42.0m(87.5%) | 0.45(-0.23) | 0.185(-0.175) | 1.4ms | #### Mode:Prune Dataset:CrowdHuman 20% Model:Yolov7-Tiny+DCN+AFPN | model | Parameters | GFLOPs | Model Size | mAP50 | mAP50-95 | Inference Time(bs:32) | | :----: | :----: | :----: | :----: | :----: | :----: | :----: | | BaseLine | 4,564,641 | 11.7 | 9.1m | 0.716 | 0.388 | 0.8ms | | LAMP 2.0x | 2,323,337(50.9%) | 5.8(49.6%) | 4.8m(52.7%) | 0.7(-0.016) | 0.372(-0.016) | 0.7ms | | L1 2.0x | 3,469,961(76.0%) | 5.8(49.6%) | 7.0m(76.9%) | 0.54(-0.176) | 0.268(-0.12) | 0.7ms | | Slim (Sparse) 2.0x | 2,385,252(52.2%) | 5.8(49.6%) | 5.8m(64.8%) | 0.641(-0.075) | 0.327(-0.061) | 0.7ms | #### Mode:Prune Dataset:CrowdHuman 20% Model:Yolov7-Tiny+FasterNet+DiverseBranchBlock | model | Parameters | GFLOPs | Model Size | mAP50 | mAP50-95 | Inference Time(bs:32) | | :----: | :----: | :----: | :----: | :----: | :----: | :----: | | BaseLine | 4,092,258 | 8.5 | 9.8m | 0.69 | 0.358 | 0.6ms | | LAMP 2.0x | 1,392,932(34.0%) | 3.6(42.3%) | 4.4m(44.9%) | 0.67(-0.02) | 0.339(-0.019) | 0.5ms | | Slim (Sparse) 2.0x | 1,541,346(37.7%) | 3.6(42.3%) | 4.7m(48.0%) | 0.669(-0.176) | 0.337(-0.021) | 0.5ms | | GroupSlim (Sparse) 2.0x | 1,545,707(37.8%) | 3.6(42.3%) | 4.7m(48.0%) | 0.674(-0.016) | 0.342(-0.016) | 0.5ms | | GroupNorm 2.0x | 2,141,255(52.3%) | 3.7(43.5%) | 5.8m(59.2%) | 0.214(-0.476) | 0.0535(-0.305) | 0.5ms | #### Mode:Prune Dataset:CrowdHuman 20% Model:Yolov7-Tiny+ReXNet(CVPR2021)+VoVGSCSP+DyHead+DecoupledHead | model | Parameters | GFLOPs | Model Size | mAP50 | mAP50-95 | Inference Time(bs:32) | | :----: | :----: | :----: | :----: | :----: | :----: | :----: | | BaseLine | 6,858,519 | 14.8 | 13.6m | 0.731 | 0.405 | 0.14s | | LAMP 1.5x | 3,840,822(56.0%) | 9.9(66.9%) | 7.8m(57.3%) | 0.7(-0.031) | 0.379(-0.019) | 0.09s | | LAMP 2.0x | 2,821,109(41.1%) | 7.4(50.0%) | 5.8m(42.6%) | 0.681(-0.06) | 0.359(-0.046) | 0.08s | #### Mode:Prune Dataset:CrowdHuman 20% Model:Yolov7-Tiny+ReXNet(CVPR2021)+VoVGSCSP+DecoupledHead | 
model | Parameters | GFLOPs | Model Size | mAP50 | mAP50-95 | Inference Time(bs:32) | | :----: | :----: | :----: | :----: | :----: | :----: | :----: | | BaseLine | 6,512,095 | 11.3 | 12.9m | 0.715 | 0.383 | 0.091s | | LAMP 2.0x | 2,930,100(45.0%) | 5.6(49.6%) | 6.0m(46.5%) | 0.627(-0.088) | 0.32(-0.063) | 0.039s | | Slim (Sparse) 2.0x | 2,821,109(43.3%) | 5.6(49.6%) | 6.3m(48.8%) | 0.728(+0.013) | 0.373(+0.01) | 0.052s | | GroupSlim (Sparse) 2.0x | 3,304,167(50.7%) | 5.7(50.4%) | 6.8m(52.7%) | 0.724(+0.009) | 0.369(-0.014) | 0.053s | | GroupSl (Sparse) 2.0x Exp1 | 2,178,723(33.5%) | 5.7(50.4%) | 4.6m(35.7%) | 0.669(-0.046) | 0.341(-0.042) | 0.055s | | GroupSl (Sparse) 2.0x Exp2 | 2,060,599(31.6%) | 5.6(49.6%) | 4.4m(34.1%) | 0.761(+0.046) | 0.407(+0.024) | 0.056s | | GroupSl (Sparse) 3.0x Exp2 | 1,283,982(19.7%) | 3.7(32.7%) | 2.9m(22.5%) | 0.679(-0.036) | 0.342(-0.041) | 0.041s | #### Mode:Distill+Prune Dataset:VisDrone(训练集只用了百分之20的数据,验证集和测试集用了全量的数据) Teacher:Yolov7-Tiny | model | Parameters | GFLOPs | Model Size | mAP50 | mAP50-95 | Inference Time(bs:32) | | :----: | :----: | :----: | :----: | :----: | :----: | :----: | | BaseLine(Yolov7-Tiny) | 6,031,950 | 13.1 | 11.7m | 0.189 | 0.0948 | 0.00121s | | LAMP 2.0x | 1,309,098 | 6.5 | 2.7m | 0.186(-0.003) | 0.0903(-0.0045) | 0.00089s | | LAMP 3.0x | 615,877 | 4.3 | 1.4m | 0.151(-0.038) | 0.0691(-0.0257) | 0.00070s | | LAMP 3.0x + CWD exp1 | 615,877 | 4.3 | 1.4m | 0.158(-0.031) | 0.0715(-0.0233) | 0.00070s | | LAMP 3.0x + CWD exp2 | 615,877 | 4.3 | 1.4m | 0.155(-0.034) | 0.0686(-0.0262) | 0.00070s | ### Yolov5 相关实验 #### Mode:Prune Dataset:CrowdHuman 20% Model:Yolov5n | model | Parameters | GFLOPs | Model Size | mAP50 | mAP50-95 | Inference Time(bs:32) | | :----: | :----: | :----: | :----: | :----: | :----: | :----: | | BaseLine | 1,761,871 | 4.1 | 3.7m | 0.715 | 0.399 | 0.02s | | LAMP 2.0x | 296,498(16.8%) | 2.0(48.8%) | 0.9m(24.3%) | 0.694(-0.021) | 0.368(-0.031) | 0.0164s | | Slim (Sparse) 2.0x | 398,607(22.6%) | 
2.0(48.8%) | 1.1m(29.7%) | 0.707(-0.008) | 0.38(-0.019) | 0.0166s | | GroupSlim (Sparse) 2.0x | 366,230(20.8%) | 2.0(48.8%) | 1.0m(27.0%) | 0.704(-0.011) | 0.381(-0.018) | 0.0165s | | GroupNorm 2.0x | 1,016,400(57.7%) | 2.1(51.2%) | 2.3m(62.2%) | 0.617(-0.098) | 0.312(-0.087) | 0.0134s | | GroupSl (Sparse) 2.0x | 474,024(26.9%) | 2.0(48.8%) | 1.3m(35.1%) | 0.711(-0.004) | 0.387(-0.012) | 0.0167s | #### Mode:Prune Dataset:CrowdHuman 20% Model:Yolov5n+C3-Faster+RepConv | model | Parameters | GFLOPs | Model Size | mAP50 | mAP50-95 | Inference Time(bs:32) | | :----: | :----: | :----: | :----: | :----: | :----: | :----: | | BaseLine | 1,614,495 | 3.7 | 3.4m | 0.711 | 0.388 | 0.021s | | LAMP 2.0x | 285,554(17.7%) | 1.8(48.6%) | 0.9m(26.5%) | 0.687(-0.024) | 0.359(-0.029) | 0.017s | | Slim (Sparse) 2.0x | 418,550(25.9%) | 1.8(48.6%) | 1.2m(35.3%) | 0.695(-0.026) | 0.365(-0.023) | 0.168s | | GroupSlim (Sparse) 2.0x | 434,440(26.9%) | 1.8(48.6%) | 1.2m(35.3%) | 0.698(-0.013) | 0.369(-0.019) | 0.017s | | GroupSl (Sparse) 2.0x | 447,587(27.7%) | 1.8(48.6%) | 1.2m(35.3%) | 0.704(-0.007) | 0.376(-0.012) | 0.016s | | GroupNorm 2.0x | 935,451(57.9%) | 1.8(48.6%) | 2.1m(61.8%) | 0.652(-0.059) | 0.335(-0.053) | 0.015s | #### Mode:Distill Dataset:VisDrone(训练集只用了百分之20的数据,验证集和测试集用了全量的数据) Teacher:Yolov5s+OTA Student:Yolov5n #### Epoch:300 BatchSize:64 Device:RTX3090 | model | GFLOPs | mAP50(test set) | mAP50-95(test set) | | :----: | :----: | :----: | :----: | | yolov5n | 4.2 | 0.171 | 0.0834 | | yolov5s | 15.8 | 0.263 | 0.136 | | yolov5n cwd exp1 | 4.2 | 0.181(+0.01) | 0.0898(+0.0064) | | yolov5n cwd exp2 | 4.2 | 0.188(+0.017) | 0.0931(+0.0097) | | yolov5n cwd exp3 | 4.2 | 0.176(+0.005) | 0.0845(+0.0011) | | yolov5n cwd exp4 | 4.2 | 0.175(+0.004) | 0.0852(+0.0018) | | yolov5n mgd exp1 | 4.2 | 0.181(+0.01) | 0.0883(+0.0049) | | yolov5n mgd exp2 | 4.2 | 0.166(-0.005) | 0.0795(-0.0039) | | yolov5n mimic exp1 | 4.2 | 0.178(+0.007) | 0.0865(+0.0031) | | yolov5n mimic exp1 | 4.2 | 
0.172(+0.001) | 0.0833(-0.0001) | | yolov5n l2 exp1 | 4.2 | 0.178(+0.007) | 0.0844(+0.001) | | yolov5n l2 exp2 | 4.2 | 0.179(+0.008) | 0.0834(0.0) | | yolov5n l2 exp3 | 4.2 | 0.176(+0.005) | 0.0795(-0.0039) | | yolov5n ast exp1 | 4.2 | 0.185(+0.014) | 0.0899(+0.0065) | | yolov5n ast exp2 | 4.2 | 0.189(+0.018) | 0.0908(+0.0074) | | yolov5n mgd+ast exp1 | 4.2 | 0.182(+0.011) | 0.0867(+0.0033) | | yolov5n mgd+ast exp2 | 4.2 | 0.185(+0.014) | 0.0902(+0.0068) | | yolov5n mgd+ast exp3 | 4.2 | 0.183(+0.012) | 0.0886(+0.0052) | #### Mode:Distill+Prune Dataset:VisDrone(训练集只用了百分之20的数据,验证集和测试集用了全量的数据) Teacher:Yolov5s+OTA | model | Parameters | GFLOPs | Model Size | mAP50 | mAP50-95 | Inference Time(bs:32) | | :----: | :----: | :----: | :----: | :----: | :----: | :----: | | BaseLine(Yolov5n) | 1,772,695 | 4.2 | 3.7m | 0.171 | 0.0834 | 0.020s | | LAMP 2.0x | 301,033(16.98%) | 2.1(50%) | 0.8m(21.62%) | 0.149(-0.022) | 0.0676(-0.0158) | 0.016s | | LAMP 2.0x + cwd exp1 | 301,033(16.98%) | 2.1(50%) | 0.8m(21.62%) | 0.163(+0.014) | 0.0745(+0.0069) | 0.016s | | LAMP 2.0x + cwd exp2 | 301,033(16.98%) | 2.1(50%) | 0.8m(21.62%) | 0.158(+0.009) | 0.0728(+0.0052) | 0.016s | | LAMP 2.0x + cwd exp3 | 301,033(16.98%) | 2.1(50%) | 0.8m(21.62%) | 0.164(+0.015) | 0.0742(+0.0066) | 0.016s | | LAMP 2.0x + mgd exp1 | 301,033(16.98%) | 2.1(50%) | 0.8m(21.62%) | 0.148(-0.001) | 0.066(-0.0016) | 0.016s | | LAMP 2.0x + mgd exp2 | 301,033(16.98%) | 2.1(50%) | 0.8m(21.62%) | 0.148(-0.001) | 0.0673(-0.0003) | 0.016s | | LAMP 2.0x + mgd exp3 | 301,033(16.98%) | 2.1(50%) | 0.8m(21.62%) | 0.152(+0.003) | 0.0687(+0.0011) | 0.016s | | LAMP 2.0x + l2 exp1 | 301,033(16.98%) | 2.1(50%) | 0.8m(21.62%) | 0.137(-0.012) | 0.0542(-0.0134) | 0.016s | | LAMP 2.0x + l2 exp2 | 301,033(16.98%) | 2.1(50%) | 0.8m(21.62%) | 0.149(+0.000) | 0.0638(+0.0011) | 0.016s | | LAMP 2.0x + ast exp1 | 301,033(16.98%) | 2.1(50%) | 0.8m(21.62%) | 0.154(+0.005) | 0.0679(+0.0003) | 0.016s | | LAMP 2.0x + ast exp2 | 301,033(16.98%) | 
2.1(50%) | 0.8m(21.62%) | 0.152(+0.003) | 0.0693(+0.0017) | 0.016s | | LAMP 2.0x + ast exp3 | 301,033(16.98%) | 2.1(50%) | 0.8m(21.62%) | 0.154(+0.005) | 0.0652(-0.0024) | 0.016s | | LAMP 2.0x + ast exp4 | 301,033(16.98%) | 2.1(50%) | 0.8m(21.62%) | 0.125(-0.024) | 0.0547(-0.0129) | 0.016s | | LAMP 2.0x + ast exp5 | 301,033(16.98%) | 2.1(50%) | 0.8m(21.62%) | 0.141(-0.008) | 0.0635(-0.0041) | 0.016s | | model | Parameters | GFLOPs | Model Size | mAP50 | mAP50-95 | Inference Time(bs:32) | | :----: | :----: | :----: | :----: | :----: | :----: | :----: | | BaseLine(Yolov5n) | 1,772,695 | 4.2 | 3.7m | 0.171 | 0.0834 | 0.020s | | GroupSl (Sparse) 2.0x | 330,322(18.63%) | 2.1(50%) | 0.8m(21.62%) | 0.162(-0.009) | 0.0754(-0.008) | 0.017s | | GroupSl (Sparse) 2.0x + cwd exp1 | 330,322(18.63%) | 2.1(50%) | 0.8m(21.62%) | 0.174(+0.012) | 0.0817(+0.0063) | 0.017s | | GroupSl (Sparse) 2.0x + cwd exp2 | 330,322(18.63%) | 2.1(50%) | 0.8m(21.62%) | 0.177(+0.015) | 0.0815(+0.0061) | 0.017s | | GroupSl (Sparse) 2.0x + cwd exp3 | 330,322(18.63%) | 2.1(50%) | 0.8m(21.62%) | 0.177(+0.015) | 0.08(+0.0046) | 0.017s | | GroupSl (Sparse) 2.0x + cwd exp4 | 330,322(18.63%) | 2.1(50%) | 0.8m(21.62%) | 0.174(+0.012) | 0.0813(+0.0059) | 0.017s | | GroupSl (Sparse) 2.0x + cwd exp5 | 330,322(18.63%) | 2.1(50%) | 0.8m(21.62%) | 0.173(+0.011) | 0.0808(+0.0054) | 0.017s | | GroupSl (Sparse) 2.0x + mgd exp1 | 330,322(18.63%) | 2.1(50%) | 0.8m(21.62%) | 0.151(-0.011) | 0.0662(-0.0092) | 0.017s | | GroupSl (Sparse) 2.0x + mgd exp2 | 330,322(18.63%) | 2.1(50%) | 0.8m(21.62%) | 0.164(+0.002) | 0.0771(+0.0017) | 0.017s | | GroupSl (Sparse) 2.0x + mgd exp3 | 330,322(18.63%) | 2.1(50%) | 0.8m(21.62%) | 0.154(-0.08) | 0.0691(-0.0063) | 0.017s | | GroupSl (Sparse) 2.0x + mgd exp4 | 330,322(18.63%) | 2.1(50%) | 0.8m(21.62%) | 0.166(+0.004) | 0.0774(+0.002) | 0.017s | | GroupSl (Sparse) 2.0x + ast exp1 | 330,322(18.63%) | 2.1(50%) | 0.8m(21.62%) | 0.172(+0.01) | 0.0776(+0.0022) | 0.017s | | GroupSl (Sparse) 
2.0x + ast exp2 | 330,322(18.63%) | 2.1(50%) | 0.8m(21.62%) | 0.167(+0.005) | 0.0763(+0.0009) | 0.017s | | GroupSl (Sparse) 2.0x + ast exp3 | 330,322(18.63%) | 2.1(50%) | 0.8m(21.62%) | 0.17(+0.008) | 0.0754(+0.0) | 0.017s | | GroupSl (Sparse) 2.0x + cwd + ast exp1 | 330,322(18.63%) | 2.1(50%) | 0.8m(21.62%) | 0.169(+0.007) | 0.0746(-0.008) | 0.017s | | GroupSl (Sparse) 2.0x + cwd + ast exp2 | 330,322(18.63%) | 2.1(50%) | 0.8m(21.62%) | 0.172(+0.01) | 0.078(+0.0026) | 0.017s | | GroupSl (Sparse) 2.0x + cwd + ast exp3 | 330,322(18.63%) | 2.1(50%) | 0.8m(21.62%) | 0.172(+0.01) | 0.0786(+0.0032) | 0.017s | #### Mode:Prune Dataset:CrowdHuman 20%train Model:Yolov5n+RepViT+C2f | model | Parameters | GFLOPs | Model Size | mAP50 | mAP50-95 | Inference Time(bs:32) | | :----: | :----: | :----: | :----: | :----: | :----: | :----: | | BaseLine(Yolov5n) | 1,761,871 | 4.1 | 3.7M | 0.692 | 0.37 | 0.00062s | | Yolov5n+RepVit+C2f | 6,001,647(340.6%) | 16.2(395.1%) | 12.1M(327.0%) | 0.711(+0.019) | 0.386(+0.016) | 0.00262s | | Yolov5n+RepVit+C2f Lamp 2.0x | 2,318,239(131.5%) | 8.2(200%) | 5.0M(135.1%) | 0.721(+0.029) | 0.398(+0.028) | 0.00218s | | Yolov5n+RepVit+C2f Lamp 3.0x | 1,446,593(82.1%) | 5.6(136.6%) | 3.3M(89.2%) | 0.712(+0.02) | 0.388(+0.018) | 0.00197s | | Yolov5n+RepVit+C2f Lamp 3.5x | 1,231,668(69.9%) | 4.8(117.1%) | 2.9M(78.4%) | 0.71(+0.018) | 0.383(+0.013) | 0.00189s | | Yolov5n+RepVit+C2f Lamp 4.0x | 1,082,684(61.5%) | 4.3(104.9%) | 2.7M(73.0%) | 0.705(+0.013) | 0.378(+0.008) | 0.00185s | | Yolov5n+RepVit+C2f Lamp 5.0x | 897,472(50.9%) | 3.4(82.9%) | 2.3M(62.2%) | 0.69(-0.002) | 0.364(-0.006) | 0.00178s | | Yolov5n+RepVit+C2f GroupSl (Sparse) 2.0x | 1,695,853(96.3%) | 8.2(200%) | 3.8M(102.7%) | 0.694(+0.002) | 0.364(-0.006) | 0.022s | | Yolov5n+RepVit+C2f Slim (Sparse) 2.0x | 3,006,781(170.7%) | 8.1(197.6%) | 6.3M(170.3%) | 0.707(+0.015) | 0.376(+0.006) | 0.00206s | | Yolov5n+RepVit+C2f Slim (Sparse) 3.0x | 1,945,689(110.4%) | 5.6(136.6%) | 4.3M(116.2%) | 
0.683(-0.009) | 0.348(-0.022) | 0.00189s | | Yolov5n+RepVit+C2f Slim (Sparse) 4.0x | 1,411,170(80.1%) | 4.2(102.4%) | 3.3M(89.2%) | 0.662(-0.03) | 0.331(-0.039) | 0.0018s | #### Mode:Prune Dataset:CrowdHuman 20%train Model:Yolov5n+Fasternet+GoldYOLO+ASF+OTA | model | Parameters | GFLOPs | Model Size | mAP50 | mAP50-95 | Inference Time(bs:32) | | :----: | :----: | :----: | :----: | :----: | :----: | :----: | | BaseLine(Yolov5n) | 1,761,871 | 4.1 | 3.7M | 0.688 | 0.365 | 0.00062s | | Improve(Yolov5n+Fasternet+GoldYOLO+ASF+OTA) | 6,442,926(365.7%) | 10.5(256.1%) | 12.8M(345.9%) | 0.739(+0.051) | 0.395(+0.03) | 0.00221s(356.4%) | | Improve Lamp 2.0x | 3,753,930(213.1%) | 5.2(126.8%) | 7.6M(205.4%) | 0.732(+0.044) | 0.391(+0.026) | 0.00117s(188.7%) | | Improve Lamp 2.5x | 3,414,584(193.8%) | 4.2(102.4%) | 7.0M(189.2%) | 0.721(+0.033) | 0.377(+0.012) | 0.00097s(156.5%) | | Improve Lamp 3.0x | 3,198,691(181.6%) | 3.5(85.3%) | 6.6M(178.4%) | 0.7(+0.012) | 0.357(-0.08) | 0.00083s(133.9%) | ================================================ FILE: yolo-improve/yolov7-CoordConv.py ================================================ class AddCoords(nn.Module): def __init__(self, with_r=False): super().__init__() self.with_r = with_r def forward(self, input_tensor): """ Args: input_tensor: shape(batch, channel, x_dim, y_dim) """ batch_size, _, x_dim, y_dim = input_tensor.size() xx_channel = torch.arange(x_dim).repeat(1, y_dim, 1) yy_channel = torch.arange(y_dim).repeat(1, x_dim, 1).transpose(1, 2) xx_channel = xx_channel.float() / (x_dim - 1) yy_channel = yy_channel.float() / (y_dim - 1) xx_channel = xx_channel * 2 - 1 yy_channel = yy_channel * 2 - 1 xx_channel = xx_channel.repeat(batch_size, 1, 1, 1).transpose(2, 3) yy_channel = yy_channel.repeat(batch_size, 1, 1, 1).transpose(2, 3) ret = torch.cat([ input_tensor, xx_channel.type_as(input_tensor), yy_channel.type_as(input_tensor)], dim=1) if self.with_r: rr = torch.sqrt(torch.pow(xx_channel.type_as(input_tensor) - 0.5, 2) + 
class AddCoords(nn.Module):
    """Append normalised coordinate channels to a feature map (CoordConv).

    Two channels holding the position along dim 2 and dim 3, scaled to
    ``[-1, 1]``, are concatenated to the input.  With ``with_r=True`` a third
    channel ``sqrt((xx - 0.5)**2 + (yy - 0.5)**2)`` is appended as well.
    """

    def __init__(self, with_r=False):
        super().__init__()
        self.with_r = with_r

    @staticmethod
    def _axis(length, device):
        """Return ``length`` positions in [-1, 1]; a single position maps to -1."""
        # max(..., 1) avoids the 0/0 = NaN the plain formula gives for length 1.
        return torch.arange(length, device=device).float() / max(length - 1, 1) * 2 - 1

    def forward(self, input_tensor):
        """
        Args:
            input_tensor: shape(batch, channel, x_dim, y_dim)

        Returns:
            Tensor of shape (batch, channel + 2 (+1 if with_r), x_dim, y_dim).
        """
        batch_size, _, x_dim, y_dim = input_tensor.size()
        device = input_tensor.device  # build grids where the data lives

        # xx varies along dim 2, yy along dim 3.
        xx_channel = self._axis(x_dim, device).view(1, 1, x_dim, 1)
        yy_channel = self._axis(y_dim, device).view(1, 1, 1, y_dim)
        xx_channel = xx_channel.expand(batch_size, 1, x_dim, y_dim).type_as(input_tensor)
        yy_channel = yy_channel.expand(batch_size, 1, x_dim, y_dim).type_as(input_tensor)

        ret = torch.cat([input_tensor, xx_channel, yy_channel], dim=1)

        if self.with_r:
            rr = torch.sqrt(torch.pow(xx_channel - 0.5, 2) + torch.pow(yy_channel - 0.5, 2))
            ret = torch.cat([ret, rr], dim=1)

        return ret


class CoordConv(nn.Module):
    """``AddCoords`` followed by the project's ``Conv`` block.

    Args:
        in_channels: channels of the incoming feature map (before coordinates
            are appended).
        out_channels: channels produced by the convolution.
        kernel_size: passed to ``Conv`` as ``k``.
        stride: passed to ``Conv`` as ``s``.
        with_r: also append the radius channel.
    """

    def __init__(self, in_channels, out_channels, kernel_size=1, stride=1, with_r=False):
        super().__init__()
        self.addcoords = AddCoords(with_r=with_r)
        # Two coordinate channels, plus one radius channel when requested.
        in_channels += 3 if with_r else 2
        self.conv = Conv(in_channels, out_channels, k=kernel_size, s=stride)

    def forward(self, x):
        return self.conv(self.addcoords(x))
import torch.nn.functional as F


def transI_fusebn(kernel, bn):
    """Fold a BatchNorm (running statistics) into a conv kernel; return (kernel, bias)."""
    gamma = bn.weight
    std = (bn.running_var + bn.eps).sqrt()
    return kernel * ((gamma / std).reshape(-1, 1, 1, 1)), bn.bias - bn.running_mean * gamma / std


def transII_addbranch(kernels, biases):
    """Merge parallel branches by summing their kernels and biases."""
    return sum(kernels), sum(biases)


def transIII_1x1_kxk(k1, b1, k2, b2, groups):
    """Merge a 1x1 conv (k1, b1) followed by a KxK conv (k2, b2) into one KxK conv."""
    if groups == 1:
        k = F.conv2d(k2, k1.permute(1, 0, 2, 3))
        b_hat = (k2 * b1.reshape(1, -1, 1, 1)).sum((1, 2, 3))
    else:
        k_slices = []
        b_slices = []
        k1_T = k1.permute(1, 0, 2, 3)
        k1_group_width = k1.size(0) // groups
        k2_group_width = k2.size(0) // groups
        for g in range(groups):
            k1_T_slice = k1_T[:, g * k1_group_width:(g + 1) * k1_group_width, :, :]
            k2_slice = k2[g * k2_group_width:(g + 1) * k2_group_width, :, :, :]
            k_slices.append(F.conv2d(k2_slice, k1_T_slice))
            b_slices.append(
                (k2_slice * b1[g * k1_group_width:(g + 1) * k1_group_width].reshape(1, -1, 1, 1)).sum((1, 2, 3)))
        k, b_hat = transIV_depthconcat(k_slices, b_slices)
    return k, b_hat + b2


def transIV_depthconcat(kernels, biases):
    """Concatenate kernels and biases along the output-channel axis."""
    return torch.cat(kernels, dim=0), torch.cat(biases)


def transV_avg(channels, kernel_size, groups):
    """Return the conv kernel equivalent to a ``kernel_size`` average pooling."""
    input_dim = channels // groups
    k = torch.zeros((channels, input_dim, kernel_size, kernel_size))
    k[np.arange(channels), np.tile(np.arange(input_dim), groups), :, :] = 1.0 / kernel_size ** 2
    return k


# This has not been tested with non-square kernels (kernel.size(2) != kernel.size(3)) nor even-size kernels
def transVI_multiscale(kernel, target_kernel_size):
    """Zero-pad a smaller kernel to ``target_kernel_size`` (centred)."""
    H_pixels_to_pad = (target_kernel_size - kernel.size(2)) // 2
    W_pixels_to_pad = (target_kernel_size - kernel.size(3)) // 2
    # F.pad takes pads for the LAST dimension first: (left, right, top, bottom).
    return F.pad(kernel, [W_pixels_to_pad, W_pixels_to_pad, H_pixels_to_pad, H_pixels_to_pad])


def conv_bn(in_channels, out_channels, kernel_size, stride=1, padding=0, dilation=1, groups=1,
            padding_mode='zeros'):
    """Return ``nn.Sequential(conv=<bias-free Conv2d>, bn=<BatchNorm2d>)``."""
    conv_layer = nn.Conv2d(in_channels=in_channels, out_channels=out_channels, kernel_size=kernel_size,
                           stride=stride, padding=padding, dilation=dilation, groups=groups,
                           bias=False, padding_mode=padding_mode)
    bn_layer = nn.BatchNorm2d(num_features=out_channels, affine=True)
    se = nn.Sequential()
    se.add_module('conv', conv_layer)
    se.add_module('bn', bn_layer)
    return se


class IdentityBasedConv1x1(nn.Conv2d):
    """1x1 conv whose effective kernel is ``weight + identity`` (weight starts at zero)."""

    def __init__(self, channels, groups=1):
        super(IdentityBasedConv1x1, self).__init__(in_channels=channels, out_channels=channels, kernel_size=1,
                                                   stride=1, padding=0, groups=groups, bias=False)

        assert channels % groups == 0
        input_dim = channels // groups
        id_value = np.zeros((channels, input_dim, 1, 1))
        for i in range(channels):
            id_value[i, i % input_dim, 0, 0] = 1
        self.id_tensor = torch.from_numpy(id_value).type_as(self.weight)
        nn.init.zeros_(self.weight)

    def forward(self, input):
        kernel = self.weight + self.id_tensor.to(self.weight.device).type_as(self.weight)
        result = F.conv2d(input, kernel, None, stride=1, padding=0, dilation=self.dilation, groups=self.groups)
        return result

    def get_actual_kernel(self):
        """Return the effective kernel (learned weight plus identity)."""
        return self.weight + self.id_tensor.to(self.weight.device)


class BNAndPadLayer(nn.Module):
    """BatchNorm2d followed by padding with the value BN produces for a zero input.

    The border is filled from the running statistics, which is what makes the
    1x1 -> BN -> KxK branch foldable into a single padded KxK conv.
    """

    def __init__(self,
                 pad_pixels,
                 num_features,
                 eps=1e-5,
                 momentum=0.1,
                 affine=True,
                 track_running_stats=True):
        super(BNAndPadLayer, self).__init__()
        self.bn = nn.BatchNorm2d(num_features, eps, momentum, affine, track_running_stats)
        self.pad_pixels = pad_pixels

    def forward(self, input):
        output = self.bn(input)
        if self.pad_pixels > 0:
            if self.bn.affine:
                pad_values = self.bn.bias.detach() - self.bn.running_mean * self.bn.weight.detach() / torch.sqrt(
                    self.bn.running_var + self.bn.eps)
            else:
                pad_values = - self.bn.running_mean / torch.sqrt(self.bn.running_var + self.bn.eps)
            output = F.pad(output, [self.pad_pixels] * 4)
            pad_values = pad_values.view(1, -1, 1, 1)
            output[:, :, 0:self.pad_pixels, :] = pad_values
            output[:, :, -self.pad_pixels:, :] = pad_values
            output[:, :, :, 0:self.pad_pixels] = pad_values
            output[:, :, :, -self.pad_pixels:] = pad_values
        return output

    @property
    def weight(self):
        return self.bn.weight

    @property
    def bias(self):
        return self.bn.bias

    @property
    def running_mean(self):
        return self.bn.running_mean

    @property
    def running_var(self):
        return self.bn.running_var

    @property
    def eps(self):
        return self.bn.eps


class DiverseBranchBlock(nn.Module):
    """Diverse Branch Block: a multi-branch conv re-parameterisable to one Conv2d.

    Training-time branches: KxK conv+BN, 1x1 conv+BN, 1x1 -> BN -> KxK, and
    1x1 -> BN -> average pooling.  ``switch_to_deploy`` folds them into the
    single convolution ``dbb_reparam``.

    Args:
        in_channels, out_channels: channel counts.
        k: (square, odd) kernel size.
        s: stride.
        p: padding; must equal ``k // 2`` (computed with ``autopad`` when None).
        g: groups.
        act: activation applied to the output.  ``None``/``False`` means
            identity, ``True`` means ``nn.SiLU()``, any other value is used as
            the activation callable itself.
        internal_channels_1x1_3x3: width of the 1x1 -> KxK branch.
        deploy: build only the fused convolution.
        single_init: initialise every branch BN weight to 0 except the KxK one.
    """

    def __init__(self, in_channels, out_channels, k,
                 s=1, p=None, g=1, act=None,
                 internal_channels_1x1_3x3=None,
                 deploy=False, single_init=False):
        super(DiverseBranchBlock, self).__init__()
        self.deploy = deploy

        # ``forward`` always calls self.nonlinear, so it must be callable even
        # for the default ``act=None``.
        if act is None or act is False:
            self.nonlinear = nn.Identity()
        elif act is True:
            self.nonlinear = nn.SiLU()
        else:
            self.nonlinear = act

        self.kernel_size = k
        self.out_channels = out_channels
        self.groups = g

        if p is None:
            p = autopad(k, p)
        assert p == k // 2

        if deploy:
            self.dbb_reparam = nn.Conv2d(in_channels=in_channels, out_channels=out_channels, kernel_size=k,
                                         stride=s, padding=p, groups=g, bias=True)
            return

        self.dbb_origin = conv_bn(in_channels=in_channels, out_channels=out_channels, kernel_size=k,
                                  stride=s, padding=p, groups=g)

        self.dbb_avg = nn.Sequential()
        if g < out_channels:
            self.dbb_avg.add_module('conv',
                                    nn.Conv2d(in_channels=in_channels, out_channels=out_channels, kernel_size=1,
                                              stride=1, padding=0, groups=g, bias=False))
            self.dbb_avg.add_module('bn', BNAndPadLayer(pad_pixels=p, num_features=out_channels))
            self.dbb_avg.add_module('avg', nn.AvgPool2d(kernel_size=k, stride=s, padding=0))
            self.dbb_1x1 = conv_bn(in_channels=in_channels, out_channels=out_channels, kernel_size=1,
                                   stride=s, padding=0, groups=g)
        else:
            self.dbb_avg.add_module('avg', nn.AvgPool2d(kernel_size=k, stride=s, padding=p))

        self.dbb_avg.add_module('avgbn', nn.BatchNorm2d(out_channels))

        if internal_channels_1x1_3x3 is None:
            # For mobilenet, it is better to have 2X internal channels
            internal_channels_1x1_3x3 = in_channels if g < out_channels else 2 * in_channels

        self.dbb_1x1_kxk = nn.Sequential()
        if internal_channels_1x1_3x3 == in_channels:
            self.dbb_1x1_kxk.add_module('idconv1', IdentityBasedConv1x1(channels=in_channels, groups=g))
        else:
            self.dbb_1x1_kxk.add_module('conv1',
                                        nn.Conv2d(in_channels=in_channels, out_channels=internal_channels_1x1_3x3,
                                                  kernel_size=1, stride=1, padding=0, groups=g, bias=False))
        self.dbb_1x1_kxk.add_module('bn1', BNAndPadLayer(pad_pixels=p, num_features=internal_channels_1x1_3x3,
                                                         affine=True))
        self.dbb_1x1_kxk.add_module('conv2',
                                    nn.Conv2d(in_channels=internal_channels_1x1_3x3, out_channels=out_channels,
                                              kernel_size=k, stride=s, padding=0, groups=g, bias=False))
        self.dbb_1x1_kxk.add_module('bn2', nn.BatchNorm2d(out_channels))

        # The experiments reported in the paper used the default initialization of bn.weight (all as 1).
        # But changing the initialization may be useful in some cases.
        if single_init:
            # Initialize the bn.weight of dbb_origin as 1 and others as 0. This is not the default setting.
            self.single_init()

    def get_equivalent_kernel_bias(self):
        """Return the (kernel, bias) of the single conv equivalent to all branches."""
        k_origin, b_origin = transI_fusebn(self.dbb_origin.conv.weight, self.dbb_origin.bn)

        if hasattr(self, 'dbb_1x1'):
            k_1x1, b_1x1 = transI_fusebn(self.dbb_1x1.conv.weight, self.dbb_1x1.bn)
            k_1x1 = transVI_multiscale(k_1x1, self.kernel_size)
        else:
            k_1x1, b_1x1 = 0, 0

        if hasattr(self.dbb_1x1_kxk, 'idconv1'):
            k_1x1_kxk_first = self.dbb_1x1_kxk.idconv1.get_actual_kernel()
        else:
            k_1x1_kxk_first = self.dbb_1x1_kxk.conv1.weight
        k_1x1_kxk_first, b_1x1_kxk_first = transI_fusebn(k_1x1_kxk_first, self.dbb_1x1_kxk.bn1)
        k_1x1_kxk_second, b_1x1_kxk_second = transI_fusebn(self.dbb_1x1_kxk.conv2.weight, self.dbb_1x1_kxk.bn2)
        k_1x1_kxk_merged, b_1x1_kxk_merged = transIII_1x1_kxk(k_1x1_kxk_first, b_1x1_kxk_first,
                                                              k_1x1_kxk_second, b_1x1_kxk_second,
                                                              groups=self.groups)

        k_avg = transV_avg(self.out_channels, self.kernel_size, self.groups)
        k_1x1_avg_second, b_1x1_avg_second = transI_fusebn(k_avg.to(self.dbb_avg.avgbn.weight.device),
                                                           self.dbb_avg.avgbn)
        if hasattr(self.dbb_avg, 'conv'):
            k_1x1_avg_first, b_1x1_avg_first = transI_fusebn(self.dbb_avg.conv.weight, self.dbb_avg.bn)
            k_1x1_avg_merged, b_1x1_avg_merged = transIII_1x1_kxk(k_1x1_avg_first, b_1x1_avg_first,
                                                                  k_1x1_avg_second, b_1x1_avg_second,
                                                                  groups=self.groups)
        else:
            k_1x1_avg_merged, b_1x1_avg_merged = k_1x1_avg_second, b_1x1_avg_second

        return transII_addbranch((k_origin, k_1x1, k_1x1_kxk_merged, k_1x1_avg_merged),
                                 (b_origin, b_1x1, b_1x1_kxk_merged, b_1x1_avg_merged))

    def switch_to_deploy(self):
        """Fold all branches into ``dbb_reparam`` and delete them (no-op if already done)."""
        if hasattr(self, 'dbb_reparam'):
            return
        kernel, bias = self.get_equivalent_kernel_bias()
        self.dbb_reparam = nn.Conv2d(in_channels=self.dbb_origin.conv.in_channels,
                                     out_channels=self.dbb_origin.conv.out_channels,
                                     kernel_size=self.dbb_origin.conv.kernel_size,
                                     stride=self.dbb_origin.conv.stride,
                                     padding=self.dbb_origin.conv.padding,
                                     dilation=self.dbb_origin.conv.dilation,
                                     groups=self.dbb_origin.conv.groups, bias=True)
        self.dbb_reparam.weight.data = kernel
        self.dbb_reparam.bias.data = bias
        for para in self.parameters():
            para.detach_()
        self.__delattr__('dbb_origin')
        self.__delattr__('dbb_avg')
        if hasattr(self, 'dbb_1x1'):
            self.__delattr__('dbb_1x1')
        self.__delattr__('dbb_1x1_kxk')

    def forward(self, inputs):
        if hasattr(self, 'dbb_reparam'):
            return self.nonlinear(self.dbb_reparam(inputs))

        out = self.dbb_origin(inputs)
        if hasattr(self, 'dbb_1x1'):
            out += self.dbb_1x1(inputs)
        out += self.dbb_avg(inputs)
        out += self.dbb_1x1_kxk(inputs)
        return self.nonlinear(out)

    def init_gamma(self, gamma_value):
        """Set the final BN weight of every existing branch to ``gamma_value``."""
        if hasattr(self, "dbb_origin"):
            torch.nn.init.constant_(self.dbb_origin.bn.weight, gamma_value)
        if hasattr(self, "dbb_1x1"):
            torch.nn.init.constant_(self.dbb_1x1.bn.weight, gamma_value)
        if hasattr(self, "dbb_avg"):
            torch.nn.init.constant_(self.dbb_avg.avgbn.weight, gamma_value)
        if hasattr(self, "dbb_1x1_kxk"):
            torch.nn.init.constant_(self.dbb_1x1_kxk.bn2.weight, gamma_value)

    def single_init(self):
        """Zero every branch's final BN weight except the KxK branch (set to 1)."""
        self.init_gamma(0.0)
        if hasattr(self, "dbb_origin"):
            torch.nn.init.constant_(self.dbb_origin.bn.weight, 1.0)
class DCNv2(nn.Module):
    """Modulated deformable convolution (DCNv2) followed by BatchNorm and an activation.

    A regular convolution predicts, for every output location, the sampling
    offsets and modulation masks that are then fed to
    ``torch.ops.torchvision.deform_conv2d``.

    Args:
        in_channels, out_channels: channel counts.
        kernel_size: square kernel size.
        stride: stride of both the offset conv and the deformable conv.
        padding: forwarded to ``autopad(kernel_size, padding)``.
        groups: weight groups of the deformable conv.
        act: ``True`` means ``nn.SiLU()``, an ``nn.Module`` is used as is, and
            anything else means identity.
        dilation: dilation of both convolutions.
        deformable_groups: number of offset groups.
    """

    def __init__(self, in_channels, out_channels, kernel_size, stride=1,
                 padding=1, groups=1, act=True, dilation=1, deformable_groups=1):
        super(DCNv2, self).__init__()

        self.in_channels = in_channels
        self.out_channels = out_channels
        self.kernel_size = (kernel_size, kernel_size)
        self.stride = (stride, stride)
        self.padding = (autopad(kernel_size, padding), autopad(kernel_size, padding))
        self.dilation = (dilation, dilation)
        self.groups = groups
        self.deformable_groups = deformable_groups

        # Grouped conv weights hold in_channels // groups input channels per
        # filter (unchanged for the default groups=1).
        self.weight = nn.Parameter(
            torch.empty(out_channels, in_channels // groups, *self.kernel_size)
        )
        self.bias = nn.Parameter(torch.empty(out_channels))

        # Per kernel tap: 2 offset values (x, y) + 1 mask value.
        out_channels_offset_mask = (self.deformable_groups * 3 *
                                    self.kernel_size[0] * self.kernel_size[1])
        # Same stride/padding/dilation as the deformable conv so the offset
        # map has exactly the output's spatial size.
        self.conv_offset_mask = nn.Conv2d(
            self.in_channels,
            out_channels_offset_mask,
            kernel_size=self.kernel_size,
            stride=self.stride,
            padding=self.padding,
            dilation=self.dilation,
            bias=True,
        )
        self.bn = nn.BatchNorm2d(out_channels)
        self.act = nn.SiLU() if act is True else (act if isinstance(act, nn.Module) else nn.Identity())
        self.reset_parameters()

    def forward(self, x):
        offset_mask = self.conv_offset_mask(x)
        o1, o2, mask = torch.chunk(offset_mask, 3, dim=1)
        offset = torch.cat((o1, o2), dim=1)
        mask = torch.sigmoid(mask)
        # NOTE(review): the op is only registered once torchvision has been
        # imported somewhere in the process - confirm the host file does so.
        x = torch.ops.torchvision.deform_conv2d(
            x,
            self.weight,
            offset,
            mask,
            self.bias,
            self.stride[0], self.stride[1],
            self.padding[0], self.padding[1],
            self.dilation[0], self.dilation[1],
            self.groups,
            self.deformable_groups,
            True
        )
        x = self.bn(x)
        x = self.act(x)
        return x

    def reset_parameters(self):
        """Uniform init for the main weight; zero bias and zero offset/mask conv."""
        n = self.in_channels
        for k in self.kernel_size:
            n *= k
        std = 1. / math.sqrt(n)
        self.weight.data.uniform_(-std, std)
        self.bias.data.zero_()
        # Zero offsets and constant masks: the layer starts out sampling on
        # the regular grid.
        self.conv_offset_mask.weight.data.zero_()
        self.conv_offset_mask.bias.data.zero_()


from models.ops_dcnv3.modules import DCNv3


class DCNV3_YoLo(nn.Module):
    """1x1 ``Conv`` -> ``DCNv3`` -> BatchNorm -> activation.

    ``DCNv3`` is called on a channels-last tensor, so the feature map is
    permuted to (N, H, W, C) before it and back to (N, C, H, W) after it.
    The ``p`` argument is accepted for signature compatibility but unused.
    """

    def __init__(self, inc, ouc, k=1, s=1, p=None, g=1, act=True):
        super().__init__()

        self.conv = Conv(inc, ouc, k=1)
        self.dcnv3 = DCNv3(ouc, kernel_size=k, stride=s, group=g)
        self.bn = nn.BatchNorm2d(ouc)
        self.act = nn.SiLU() if act is True else (act if isinstance(act, nn.Module) else nn.Identity())

    def forward(self, x):
        x = self.conv(x)
        x = x.permute(0, 2, 3, 1)  # NCHW -> NHWC
        x = self.dcnv3(x)
        x = x.permute(0, 3, 1, 2)  # NHWC -> NCHW
        x = self.act(self.bn(x))
        return x


# The fragments below use ``self``/``m``/``ch`` and are presumably meant to
# replace the stride-computation branches of the model constructor in the
# project's yolo.py (TODO confirm); they run the dummy forward pass on CUDA.
if isinstance(m, Detect):
    s = 256  # 2x min stride
    self.model.to(torch.device('cuda'))
    m.stride = torch.tensor([s / x.shape[-2] for x in self.forward(torch.zeros(1, ch, s, s).to(torch.device('cuda')))]).cpu()  # forward
    self.model.cpu()
    check_anchor_order(m)
    m.anchors /= m.stride.view(-1, 1, 1)
    self.stride = m.stride
    self._initialize_biases()  # only run once
    # print('Strides: %s' % m.stride.tolist())
if isinstance(m, IDetect):
    s = 256  # 2x min stride
    self.model.to(torch.device('cuda'))
    m.stride = torch.tensor([s / x.shape[-2] for x in self.forward(torch.zeros(1, ch, s, s).to(torch.device('cuda')))]).cpu()  # forward
    self.model.cpu()
    check_anchor_order(m)
    m.anchors /= m.stride.view(-1, 1, 1)
    self.stride = m.stride
    self._initialize_biases()  # only run once
    # print('Strides: %s' % m.stride.tolist())
if isinstance(m, IAuxDetect):
    s = 256  # 2x min stride
    self.model.to(torch.device('cuda'))
    m.stride = torch.tensor([s / x.shape[-2] for x in self.forward(torch.zeros(1, ch, s, s).to(torch.device('cuda')))[:4]]).cpu()  # forward
    self.model.cpu()
    # print(m.stride)
    check_anchor_order(m)
    m.anchors /= m.stride.view(-1, 1, 1)
    self.stride = m.stride
    self._initialize_aux_biases()  # only run once
class DSConv(_ConvNd):
    """2-D convolution that also carries the tensors of a distribution-shifting conv.

    Besides the usual ``weight``/``bias`` it stores ``intweight`` and a
    per-block scale ``alpha`` (plus ``KDSb`` when ``KDSBias`` is set and
    ``CDSw``/``CDSb`` when ``CDS`` is set).  These are created uninitialised
    and kept as plain attributes.

    NOTE: ``forward`` runs a standard convolution with ``self.weight``;
    ``get_weight_res`` is available but not used by ``forward``.
    """

    def __init__(self, in_channels, out_channels, kernel_size, stride=1,
                 padding=None, dilation=1, groups=1, padding_mode='zeros', bias=False, block_size=32, KDSBias=False, CDS=False):
        padding = _pair(autopad(kernel_size, padding))
        kernel_size = _pair(kernel_size)
        stride = _pair(stride)
        dilation = _pair(dilation)

        # Number of channel blocks that share one alpha value.
        blck_numb = math.ceil(in_channels / (block_size * groups))
        super(DSConv, self).__init__(
            in_channels, out_channels, kernel_size, stride, padding, dilation,
            False, _pair(0), groups, bias, padding_mode)

        # KDS weight From Paper
        self.intweight = torch.Tensor(out_channels, in_channels, *kernel_size)
        self.alpha = torch.Tensor(out_channels, blck_numb, *kernel_size)

        # KDS bias From Paper
        self.KDSBias = KDSBias
        self.CDS = CDS

        if KDSBias:
            self.KDSb = torch.Tensor(out_channels, blck_numb, *kernel_size)
        if CDS:
            self.CDSw = torch.Tensor(out_channels)
            self.CDSb = torch.Tensor(out_channels)

        self.reset_parameters()

    def get_weight_res(self):
        """Return ``alpha`` (expanded block-wise over input channels) times ``weight``.

        Input channel ``c`` uses block ``c // bs`` where
        ``bs = weight.shape[1] // alpha.shape[1]``; the last block also takes
        the remaining channels.  ``KDSb`` is expanded the same way and added
        when ``KDSBias`` is set.  The CDS tensors are not applied here.
        """
        weight = self.weight
        nmb_blocks = self.alpha.shape[1]
        total_depth = weight.shape[1]
        bs = total_depth // nmb_blocks

        # Channel -> block lookup; gathering with it replaces the per-block copy loop.
        block_of_channel = torch.tensor(
            [min(channel // bs, nmb_blocks - 1) for channel in range(total_depth)],
            dtype=torch.long, device=weight.device)

        # alpha / KDSb are plain attributes and do not follow module.to(...),
        # so move them next to the weight before combining.
        alpha_res = self.alpha.to(weight.device).index_select(1, block_of_channel)

        # Element-wise multiplication of alpha and weight
        weight_res = torch.mul(alpha_res, weight)
        if self.KDSBias:
            KDSBias_res = self.KDSb.to(weight.device).index_select(1, block_of_channel)
            weight_res = torch.add(weight_res, KDSBias_res)
        return weight_res

    def forward(self, input):
        """Standard 2-D convolution with the unmodified ``weight``."""
        return F.conv2d(input, self.weight, self.bias,
                        self.stride, self.padding, self.dilation,
                        self.groups)
def forward(self, x):
    """Decoupled detection head with implicit layers (un-fused path).

    ``x`` is a list with one feature map per detection layer and is
    overwritten in place with the raw head outputs of shape
    ``(bs, na, ny, nx, 4 + 1 + nc)`` (box, objectness, classes).

    Returns ``x`` in training mode, otherwise ``(decoded, x)`` where
    ``decoded`` concatenates the sigmoid-decoded predictions of all layers.
    """
    # x = x.copy()  # for profiling
    z = []  # inference output
    self.training |= self.export
    for i in range(self.nl):
        x[i] = self.m_stem[i](x[i])  # conv

        bs, _, ny, nx = x[i].shape
        x_cls = self.im_cls[i](self.m_cls[i](self.ia_cls[i](x[i])))
        x_cls = x_cls.view(bs, self.na, self.nc, ny, nx).permute(0, 1, 3, 4, 2).contiguous()

        x_reg_conf = self.m_reg_conf[i](x[i])
        x_reg = self.im_reg[i](self.m_reg[i](self.ia_reg[i](x_reg_conf)))
        x_reg = x_reg.view(bs, self.na, 4, ny, nx).permute(0, 1, 3, 4, 2).contiguous()
        x_conf = self.im_conf[i](self.m_conf[i](self.ia_conf[i](x_reg_conf)))
        x_conf = x_conf.view(bs, self.na, 1, ny, nx).permute(0, 1, 3, 4, 2).contiguous()
        x[i] = torch.cat([x_reg, x_conf, x_cls], dim=4)

        if not self.training:  # inference
            if self.grid[i].shape[2:4] != x[i].shape[2:4]:
                self.grid[i] = self._make_grid(nx, ny).to(x[i].device)

            y = x[i].sigmoid()
            y[..., 0:2] = (y[..., 0:2] * 2. - 0.5 + self.grid[i]) * self.stride[i]  # xy
            y[..., 2:4] = (y[..., 2:4] * 2) ** 2 * self.anchor_grid[i]  # wh
            z.append(y.view(bs, -1, self.no))

    return x if self.training else (torch.cat(z, 1), x)


def fuseforward(self, x):
    """Head forward pass to be used after :meth:`fuse`.

    The implicit layers that ``fuse`` folds into the 1x1 convs are skipped.
    ``ia_cls`` is still applied: it acts on the input of the 3x3 conv of the
    class branch, which is followed by further layers before the final 1x1
    conv, so it cannot be expressed as a bias of that conv.

    The return value depends on ``training``, ``end2end``, ``include_nms``
    and ``concat`` (checked in that order); by default ``(decoded, x)``.
    """
    # x = x.copy()  # for profiling
    z = []  # inference output
    self.training |= self.export
    for i in range(self.nl):
        x[i] = self.m_stem[i](x[i])  # conv

        bs, _, ny, nx = x[i].shape
        x_cls = self.m_cls[i](self.ia_cls[i](x[i]))
        x_cls = x_cls.view(bs, self.na, self.nc, ny, nx).permute(0, 1, 3, 4, 2).contiguous()

        x_reg_conf = self.m_reg_conf[i](x[i])
        x_reg = self.m_reg[i](x_reg_conf).view(bs, self.na, 4, ny, nx).permute(0, 1, 3, 4, 2).contiguous()
        x_conf = self.m_conf[i](x_reg_conf).view(bs, self.na, 1, ny, nx).permute(0, 1, 3, 4, 2).contiguous()
        x[i] = torch.cat([x_reg, x_conf, x_cls], dim=4)

        if not self.training:  # inference
            if self.grid[i].shape[2:4] != x[i].shape[2:4]:
                self.grid[i] = self._make_grid(nx, ny).to(x[i].device)

            y = x[i].sigmoid()
            if not torch.onnx.is_in_onnx_export():
                y[..., 0:2] = (y[..., 0:2] * 2. - 0.5 + self.grid[i]) * self.stride[i]  # xy
                y[..., 2:4] = (y[..., 2:4] * 2) ** 2 * self.anchor_grid[i]  # wh
            else:
                # Same decoding written without in-place slice assignment.
                xy, wh, conf = y.split((2, 2, self.nc + 1), 4)  # y.tensor_split((2, 4, 5), 4)  # torch 1.8.0
                xy = xy * (2. * self.stride[i]) + (self.stride[i] * (self.grid[i] - 0.5))  # new xy
                wh = wh ** 2 * (4 * self.anchor_grid[i].data)  # new wh
                y = torch.cat((xy, wh, conf), 4)
            z.append(y.view(bs, -1, self.no))

    if self.training:
        out = x
    elif self.end2end:
        out = torch.cat(z, 1)
    elif self.include_nms:
        z = self.convert(z)
        out = (z, )
    elif self.concat:
        out = torch.cat(z, 1)
    else:
        out = (torch.cat(z, 1), x)

    return out


def fuse(self):
    """Fold the implicit layers into the 1x1 output convs, in place.

    * ``ia_reg`` / ``ia_conf`` (added to the conv input) become a bias term.
    * ``im_cls`` / ``im_reg`` / ``im_conf`` (multiplied onto the conv output)
      scale both bias and weight.
    * ``ia_cls`` is left alone, see :meth:`fuseforward`.

    Runs under ``torch.no_grad()`` so it also works on parameters that
    still require gradients.
    """
    print("IDetect.fuse")

    def fold_implicit_add(conv, implicit_add):
        c1, c2, _, _ = conv.weight.shape
        c1_, c2_, _, _ = implicit_add.implicit.shape
        conv.bias += torch.matmul(conv.weight.reshape(c1, c2),
                                  implicit_add.implicit.reshape(c2_, c1_)).squeeze(1)

    def fold_implicit_mul(conv, implicit_mul):
        c1, c2, _, _ = implicit_mul.implicit.shape
        conv.bias *= implicit_mul.implicit.reshape(c2)
        conv.weight *= implicit_mul.implicit.transpose(0, 1)

    with torch.no_grad():
        # fuse ImplicitA and Convolution
        for i in range(len(self.m_reg)):
            fold_implicit_add(self.m_reg[i], self.ia_reg[i])
        for i in range(len(self.m_conf)):
            fold_implicit_add(self.m_conf[i], self.ia_conf[i])

        # fuse ImplicitM and Convolution
        for i in range(len(self.m_cls)):
            fold_implicit_mul(self.m_cls[i][-1], self.im_cls[i])
        for i in range(len(self.m_reg)):
            fold_implicit_mul(self.m_reg[i], self.im_reg[i])
        for i in range(len(self.m_conf)):
            fold_implicit_mul(self.m_conf[i], self.im_conf[i])
def convert(self, z):
    """Concatenate per-layer predictions and return ``(boxes, scores)``.

    Boxes are converted from ``(cx, cy, w, h)`` to corner format by a constant
    matrix; class scores are multiplied by the objectness column.
    """
    merged = torch.cat(z, 1)
    xywh = merged[:, :, :4]
    objectness = merged[:, :, 4:5]
    class_scores = merged[:, :, 5:]
    # Columns produce x1 = cx - w/2, y1 = cy - h/2, x2 = cx + w/2, y2 = cy + h/2.
    to_corners = torch.tensor([[1, 0, 1, 0],
                               [0, 1, 0, 1],
                               [-0.5, 0, 0.5, 0],
                               [0, -0.5, 0, 0.5]],
                              dtype=torch.float32,
                              device=merged.device)
    return (xywh @ to_corners, class_scores * objectness)


def _initialize_biases(self, cf=None):
    """Initialise the detection-head biases; ``cf`` is an optional class-frequency tensor.

    https://arxiv.org/abs/1708.02002 section 3.3
    Handles ``IDetect`` (single conv per layer) and ``IDetect_Decoupled``
    (separate objectness and class convs); other heads are left untouched.
    """
    head = self.model[-1]  # Detect() module

    def objectness_prior(stride):
        return math.log(8 / (640 / stride) ** 2)  # obj (8 objects per 640 image)

    def class_prior():
        return math.log(0.6 / (head.nc - 0.99)) if cf is None else torch.log(cf / cf.sum())  # cls

    if isinstance(head, IDetect):
        for conv, stride in zip(head.m, head.stride):
            bias = conv.bias.view(head.na, -1)  # one row per anchor
            bias.data[:, 4] += objectness_prior(stride)
            bias.data[:, 5:] += class_prior()
            conv.bias = torch.nn.Parameter(bias.view(-1), requires_grad=True)
    elif isinstance(head, IDetect_Decoupled):
        for conv, stride in zip(head.m_conf, head.stride):
            bias = conv.bias.view(head.na, -1)
            bias.data += objectness_prior(stride)
            conv.bias = torch.nn.Parameter(bias.view(-1), requires_grad=True)
        for branch, _ in zip(head.m_cls, head.stride):
            last_conv = branch[-1]
            bias = last_conv.bias.view(head.na, -1)
            bias.data += class_prior()
            last_conv.bias = torch.nn.Parameter(bias.view(-1), requires_grad=True)
class DySnakeConv(nn.Module):
    """Concatenate a standard conv with x- and y-axis snake convs and fuse them with a 1x1 conv."""

    def __init__(self, inc, ouc, k=3, act=True) -> None:
        super().__init__()

        self.conv_0 = Conv(inc, ouc, k, act=act)
        self.conv_x = DSConv(inc, ouc, 0, k, act=True)
        self.conv_y = DSConv(inc, ouc, 1, k, act=True)
        self.conv_1x1 = Conv(ouc * 3, ouc, 1, act=act)

    def forward(self, x):
        return self.conv_1x1(torch.cat([self.conv_0(x), self.conv_x(x), self.conv_y(x)], dim=1))


class DSConv(nn.Module):
    def __init__(self, in_ch, out_ch, morph, kernel_size=3, if_offset=True, extend_scope=1, act=True):
        """
        The Dynamic Snake Convolution
        :param in_ch: input channel
        :param out_ch: output channel
        :param kernel_size: the size of kernel
        :param extend_scope: the range to expand (default 1 for this method)
        :param morph: the morphology of the convolution kernel is mainly divided into two types
                        along the x-axis (0) and the y-axis (1) (see the paper for details)
        :param if_offset: whether deformation is required, if it is False, it is the standard convolution kernel
        :param act: True for SiLU, an nn.Module to use it as is, anything else for no activation
        """
        super(DSConv, self).__init__()
        # conv + BN that learn the deformable offsets (2 * kernel_size channels)
        self.offset_conv = nn.Conv2d(in_ch, 2 * kernel_size, 3, padding=1)
        self.bn = nn.BatchNorm2d(2 * kernel_size)
        self.kernel_size = kernel_size

        # two types of the DSConv (along x-axis and y-axis); the stride equals
        # the kernel size because the sampled map holds kernel_size taps per pixel
        self.dsc_conv_x = nn.Conv2d(
            in_ch,
            out_ch,
            kernel_size=(kernel_size, 1),
            stride=(kernel_size, 1),
            padding=0,
        )
        self.dsc_conv_y = nn.Conv2d(
            in_ch,
            out_ch,
            kernel_size=(1, kernel_size),
            stride=(1, kernel_size),
            padding=0,
        )

        # At least one group, so that out_ch < 4 does not fail.
        self.gn = nn.GroupNorm(max(out_ch // 4, 1), out_ch)
        self.act = nn.SiLU() if act is True else (act if isinstance(act, nn.Module) else nn.Identity())

        self.extend_scope = extend_scope
        self.morph = morph
        self.if_offset = if_offset

    def forward(self, f):
        # We need a range of deformation between -1 and 1 to mimic the snake's swing
        offset = torch.tanh(self.bn(self.offset_conv(f)))
        dsc = DSC(f.shape, self.kernel_size, self.extend_scope, self.morph)
        deformed_feature = dsc.deform_conv(f, offset, self.if_offset).type(f.dtype)
        conv = self.dsc_conv_x if self.morph == 0 else self.dsc_conv_y
        return self.act(self.gn(conv(deformed_feature)))


# Core code, for ease of understanding, we mark the dimensions of input and output next to the code
class DSC(object):
    """Builds the snake-shaped sampling grid and samples a feature map on it.

    Naming follows the original code: for an input of shape ``[B, C, W, H]``
    dimension 2 is called "width" (``y`` coordinates) and dimension 3 "height"
    (``x`` coordinates).  ``K`` is the kernel size.
    """

    def __init__(self, input_shape, kernel_size, extend_scope, morph):
        self.num_points = kernel_size
        self.width = input_shape[2]
        self.height = input_shape[3]
        self.morph = morph
        self.extend_scope = extend_scope  # offset (-1 ~ 1) * extend_scope

        # define feature map shape
        self.num_batch = input_shape[0]
        self.num_channels = input_shape[1]

    def _accumulate_offsets(self, offset):
        """Turn per-tap offsets ``[B, K, W, H]`` into cumulative offsets.

        The centre tap stays at 0; walking outwards, every tap adds its own
        offset to the accumulated offset of its inner neighbour.  All taps are
        covered, including the outermost ones, and the result is built without
        detaching, so every non-centre offset receives gradients.
        """
        center = int(self.num_points // 2)
        taps = offset.unbind(dim=1)
        accumulated = [None] * self.num_points
        accumulated[center] = torch.zeros_like(taps[center])
        for idx in range(center + 1, self.num_points):
            accumulated[idx] = accumulated[idx - 1] + taps[idx]
        for idx in range(center - 1, -1, -1):
            accumulated[idx] = accumulated[idx + 1] + taps[idx]
        return torch.stack(accumulated, dim=1)

    def _coordinate_map_3D(self, offset, if_offset):
        """Return the ``(y, x)`` sampling coordinates.

        input: offset [B, 2*K, W, H]; the first K channels are y offsets, the
               last K channels x offsets.
        output: two maps of shape [B, K*W, H] for ``morph == 0`` and
                [B, W, K*H] otherwise; the K taps of a pixel are adjacent.
        """
        device = offset.device
        num_batch, num_points = self.num_batch, self.num_points
        width, height = self.width, self.height
        y_offset, x_offset = torch.split(offset, num_points, dim=1)

        # Integer pixel position of the kernel centre for every tap.
        full_shape = (num_batch, num_points, width, height)
        y_center = torch.arange(width, device=device).float().view(1, 1, width, 1).expand(full_shape)
        x_center = torch.arange(height, device=device).float().view(1, 1, 1, height).expand(full_shape)

        # Straight (undeformed) kernel: -K//2 ... K//2 along the kernel axis.
        half = int(num_points // 2)
        spread = torch.linspace(-half, half, int(num_points), device=device).view(1, num_points, 1, 1)

        if self.morph == 0:
            # kernel laid out along x, swinging in y
            y_new = y_center
            x_new = x_center + spread
            if if_offset:
                y_new = y_new + self._accumulate_offsets(y_offset).mul(self.extend_scope)
            y_new = y_new.permute(0, 2, 1, 3).reshape(num_batch, num_points * width, height)
            x_new = x_new.permute(0, 2, 1, 3).reshape(num_batch, num_points * width, height)
        else:
            # kernel laid out along y, swinging in x
            y_new = y_center + spread
            x_new = x_center
            if if_offset:
                x_new = x_new + self._accumulate_offsets(x_offset).mul(self.extend_scope)
            y_new = y_new.permute(0, 2, 3, 1).reshape(num_batch, width, num_points * height)
            x_new = x_new.permute(0, 2, 3, 1).reshape(num_batch, width, num_points * height)
        return y_new, x_new

    def _bilinear_interpolate_3D(self, input_feature, y, x):
        """Bilinearly sample ``input_feature`` [B, C, W, H] at coordinates ``(y, x)``.

        output: [B, C, K*W, H] for ``morph == 0`` and [B, C, W, K*H] otherwise.
        """
        y = y.reshape([-1]).float()
        x = x.reshape([-1]).float()

        max_y = self.width - 1
        max_x = self.height - 1

        y_floor = torch.floor(y).long()
        x_floor = torch.floor(x).long()

        # Corner indices used for gathering, clipped to the feature map.
        y0 = torch.clamp(y_floor, 0, max_y)
        y1 = torch.clamp(y_floor + 1, 0, max_y)
        x0 = torch.clamp(x_floor, 0, max_x)
        x1 = torch.clamp(x_floor + 1, 0, max_x)

        feature_flat = input_feature.reshape(
            self.num_batch, self.num_channels, self.width, self.height)
        feature_flat = feature_flat.permute(0, 2, 3, 1).reshape(-1, self.num_channels)

        # Row offset of every sample's own image inside the flattened batch.
        samples_per_image = self.num_points * self.width * self.height
        base = torch.arange(self.num_batch, device=y.device).repeat_interleave(samples_per_image)
        base = base * (self.height * self.width)

        row0 = base + y0 * self.height
        row1 = base + y1 * self.height

        # get the 4 neighbouring values
        value_a0 = feature_flat[row0 + x0]
        value_c0 = feature_flat[row0 + x1]
        value_a1 = feature_flat[row1 + x0]
        value_c1 = feature_flat[row1 + x1]

        # Corner coordinates for the weights are clipped one past the border.
        y0_float = torch.clamp(y_floor, 0, max_y + 1).float()
        y1_float = torch.clamp(y_floor + 1, 0, max_y + 1).float()
        x0_float = torch.clamp(x_floor, 0, max_x + 1).float()
        x1_float = torch.clamp(x_floor + 1, 0, max_x + 1).float()

        vol_a0 = ((y1_float - y) * (x1_float - x)).unsqueeze(-1)
        vol_c0 = ((y1_float - y) * (x - x0_float)).unsqueeze(-1)
        vol_a1 = ((y - y0_float) * (x1_float - x)).unsqueeze(-1)
        vol_c1 = ((y - y0_float) * (x - x0_float)).unsqueeze(-1)

        outputs = (value_a0 * vol_a0 + value_c0 * vol_c0 + value_a1 * vol_a1 +
                   value_c1 * vol_c1)

        if self.morph == 0:
            outputs = outputs.reshape([
                self.num_batch,
                self.num_points * self.width,
                1 * self.height,
                self.num_channels,
            ])
        else:
            outputs = outputs.reshape([
                self.num_batch,
                1 * self.width,
                self.num_points * self.height,
                self.num_channels,
            ])
        return outputs.permute(0, 3, 1, 2)

    def deform_conv(self, input, offset, if_offset):
        """Sample ``input`` on the (optionally deformed) snake grid."""
        y, x = self._coordinate_map_3D(offset, if_offset)
        deformed_feature = self._bilinear_interpolate_3D(input, y, x)
        return deformed_feature
1.0 # layer channel multiple # anchors anchors: - [10,13, 16,30, 33,23] # P3/8 - [30,61, 62,45, 59,119] # P4/16 - [116,90, 156,198, 373,326] # P5/32 # yolov7-tiny backbone backbone: # [from, number, module, args] c2, k=1, s=1, p=None, g=1, act=True [[-1, 1, Conv, [32, 3, 2, None, 1, nn.LeakyReLU(0.1)]], # 0-P1/2 [-1, 1, Conv, [64, 3, 2, None, 1, nn.LeakyReLU(0.1)]], # 1-P2/4 [-1, 1, Yolov7_Tiny_E_ELAN, [64, 32, nn.LeakyReLU(0.1)]], # 2 [-1, 1, MP, []], # 3-P3/8 [-1, 1, Yolov7_Tiny_E_ELAN, [128, 64, nn.LeakyReLU(0.1)]], # 4 [-1, 1, MP, []], # 5-P4/16 [-1, 1, Yolov7_Tiny_E_ELAN, [256, 128, nn.LeakyReLU(0.1)]], # 6 [-1, 1, MP, []], # 7-P5/32 [-1, 1, Yolov7_Tiny_E_ELAN, [512, 256, nn.LeakyReLU(0.1)]], # 8 ] # yolov7-tiny head head: [[-1, 1, Yolov7_Tiny_SPP, [256, nn.LeakyReLU(0.1)]], # 9-Yolov7-tiny-spp [-1, 1, Conv, [128, 1, 1, None, 1, nn.LeakyReLU(0.1)]], [-1, 1, nn.Upsample, [None, 2, 'nearest']], [6, 1, Conv, [128, 1, 1, None, 1, nn.LeakyReLU(0.1)]], # route backbone P4 [-1, 1, EVCBlock, []], [[-1, -3], 1, Concat, [1]], [-1, 1, Yolov7_Tiny_E_ELAN, [128, 64, nn.LeakyReLU(0.1)]], # 15 [-1, 1, Conv, [64, 1, 1, None, 1, nn.LeakyReLU(0.1)]], [-1, 1, nn.Upsample, [None, 2, 'nearest']], [4, 1, Conv, [64, 1, 1, None, 1, nn.LeakyReLU(0.1)]], # route backbone P3 [[-1, -2], 1, Concat, [1]], [-1, 1, Yolov7_Tiny_E_ELAN, [64, 32, nn.LeakyReLU(0.1)]], # 20 [-1, 1, Conv, [128, 3, 2, None, 1, nn.LeakyReLU(0.1)]], [[-1, 15], 1, Concat, [1]], [-1, 1, Yolov7_Tiny_E_ELAN, [128, 64, nn.LeakyReLU(0.1)]], # 23 [-1, 1, Conv, [256, 3, 2, None, 1, nn.LeakyReLU(0.1)]], [[-1, 9], 1, Concat, [1]], [-1, 1, Yolov7_Tiny_E_ELAN, [256, 128, nn.LeakyReLU(0.1)]], # 26 [20, 1, Conv, [128, 3, 1, None, 1, nn.LeakyReLU(0.1)]], # 27-P3 [23, 1, Conv, [256, 3, 1, None, 1, nn.LeakyReLU(0.1)]], # 28-P4 [26, 1, Conv, [512, 3, 1, None, 1, nn.LeakyReLU(0.1)]], # 29-P5 [[27, 28, 29], 1, IDetect, [nc, anchors]], # Detect(P3, P4, P5) ] ================================================ FILE: 
def bbox_mpdiou(box1, box2, x1y1x2y2=True, mpdiou_hw=None, grid=None, eps=1e-7):
    """Return the MPDIoU of ``box1`` (4xn) to ``box2`` (nx4).

    MPDIoU is the IoU minus the squared distances between the two top-left
    corners and between the two bottom-right corners, each divided by
    ``mpdiou_hw``.

    Args:
        box1: tensor whose first dimension holds the 4 box coordinates.
        box2: tensor whose last dimension holds the 4 box coordinates.
        x1y1x2y2: ``True`` if boxes are corner format, ``False`` for
            ``(cx, cy, w, h)``.
        mpdiou_hw: normaliser of the corner distances (required).
        grid: optional offset added to the first two coordinates of both
            boxes.
        eps: small constant that guards the divisions.

    Raises:
        ValueError: if ``mpdiou_hw`` is not given.

    The inputs are never modified: the grid shift is applied to copies
    instead of in place on the callers' tensors.
    """
    if mpdiou_hw is None:
        raise ValueError("bbox_mpdiou requires mpdiou_hw (normaliser of the corner distances)")

    box2 = box2.T
    if grid is not None:
        box1 = torch.cat((box1[:2] + grid, box1[2:]), 0)
        box2 = torch.cat((box2[:2] + grid, box2[2:]), 0)

    # Get the coordinates of bounding boxes
    if x1y1x2y2:  # x1, y1, x2, y2 = box1
        b1_x1, b1_y1, b1_x2, b1_y2 = box1[0], box1[1], box1[2], box1[3]
        b2_x1, b2_y1, b2_x2, b2_y2 = box2[0], box2[1], box2[2], box2[3]
    else:  # transform from xywh to xyxy
        b1_x1, b1_x2 = box1[0] - box1[2] / 2, box1[0] + box1[2] / 2
        b1_y1, b1_y2 = box1[1] - box1[3] / 2, box1[1] + box1[3] / 2
        b2_x1, b2_x2 = box2[0] - box2[2] / 2, box2[0] + box2[2] / 2
        b2_y1, b2_y2 = box2[1] - box2[3] / 2, box2[1] + box2[3] / 2

    # Intersection area
    inter = (torch.min(b1_x2, b2_x2) - torch.max(b1_x1, b2_x1)).clamp(0) * \
            (torch.min(b1_y2, b2_y2) - torch.max(b1_y1, b2_y1)).clamp(0)

    # Union Area
    w1, h1 = b1_x2 - b1_x1, b1_y2 - b1_y1 + eps
    w2, h2 = b2_x2 - b2_x1, b2_y2 - b2_y1 + eps
    union = w1 * h1 + w2 * h2 - inter + eps

    iou = inter / union
    d1 = (b2_x1 - b1_x1) ** 2 + (b2_y1 - b1_y1) ** 2  # top-left corners
    d2 = (b2_x2 - b1_x2) ** 2 + (b2_y2 - b1_y2) ** 2  # bottom-right corners
    return iou - d1 / mpdiou_hw - d2 / mpdiou_hw  # MPDIoU
def wasserstein_loss(pred, target, eps=1e-7, constant=12.8):
    r"""Exponentially normalised Wasserstein similarity between box pairs.

    Computes ``exp(-sqrt(d) / constant)`` where ``d`` is the squared centre
    distance (plus ``eps``) plus a quarter of the squared width/height
    differences.  Identical boxes give a value close to 1; callers turn it
    into a loss with ``1 - value``.

    Args:
        pred (Tensor): Predicted bboxes of format (x_center, y_center, w, h),
            shape (n, 4).
        target (Tensor): Corresponding gt bboxes, shape (n, 4).
        eps (float): Small constant added to the centre distance and to every
            width/height.
        constant (float): Normalising constant of the exponent.

    Return:
        Tensor: Similarity per box pair, shape (n,).
    """
    center_delta = pred[:, :2] - target[:, :2]
    dx, dy = center_delta[:, 0], center_delta[:, 1]
    center_distance = dx * dx + dy * dy + eps

    pred_w, pred_h = pred[:, 2] + eps, pred[:, 3] + eps
    target_w, target_h = target[:, 2] + eps, target[:, 3] + eps
    wh_distance = ((pred_w - target_w) ** 2 + (pred_h - target_h) ** 2) / 4

    return torch.exp(-torch.sqrt(center_distance + wh_distance) / constant)


class PConv(nn.Module):
    """Partial convolution: a 3x3 conv on the first ``dim // n_div`` channels, then a 1x1 ``Conv``.

    ``forward`` is bound in the constructor to ``forward_slicing`` or
    ``forward_split_cat`` according to the ``forward`` argument; any other
    value raises ``NotImplementedError``.
    """

    def __init__(self, dim, ouc, n_div=4, forward='split_cat'):
        super().__init__()
        self.dim_conv3 = dim // n_div
        self.dim_untouched = dim - self.dim_conv3
        self.partial_conv3 = nn.Conv2d(self.dim_conv3, self.dim_conv3, 3, 1, 1, bias=False)
        self.conv = Conv(dim, ouc, k=1)

        if forward not in ('slicing', 'split_cat'):
            raise NotImplementedError
        self.forward = self.forward_slicing if forward == 'slicing' else self.forward_split_cat

    def forward_slicing(self, x):
        # only for inference
        out = x.clone()  # keep the caller's tensor intact
        head = slice(None, self.dim_conv3)
        out[:, head, :, :] = self.partial_conv3(out[:, head, :, :])
        return self.conv(out)

    def forward_split_cat(self, x):
        # for training/inference
        convolved, untouched = torch.split(x, [self.dim_conv3, self.dim_untouched], dim=1)
        merged = torch.cat((self.partial_conv3(convolved), untouched), 1)
        return self.conv(merged)
yolov7-PConv.yaml # parameters nc: 80 # number of classes depth_multiple: 1.0 # model depth multiple width_multiple: 1.0 # layer channel multiple # anchors anchors: - [12,16, 19,36, 40,28] # P3/8 - [36,75, 76,55, 72,146] # P4/16 - [142,110, 192,243, 459,401] # P5/32 # yolov7 backbone backbone: # [from, number, module, args] [[-1, 1, Conv, [32, 3, 1]], # 0 [-1, 1, Conv, [64, 3, 2]], # 1-P1/2 [-1, 1, Conv, [64, 3, 1]], [-1, 1, Conv, [128, 3, 2]], # 3-P2/4 [-1, 1, Conv, [64, 1, 1]], [-2, 1, Conv, [64, 1, 1]], [-1, 1, PConv, [64]], [-1, 1, PConv, [64]], [-1, 1, PConv, [64]], [-1, 1, PConv, [64]], [[-1, -3, -5, -6], 1, Concat, [1]], [-1, 1, Conv, [256, 1, 1]], # 11 [-1, 1, MP, []], [-1, 1, Conv, [128, 1, 1]], [-3, 1, Conv, [128, 1, 1]], [-1, 1, Conv, [128, 3, 2]], [[-1, -3], 1, Concat, [1]], # 16-P3/8 [-1, 1, Conv, [128, 1, 1]], [-2, 1, Conv, [128, 1, 1]], [-1, 1, PConv, [128]], [-1, 1, PConv, [128]], [-1, 1, PConv, [128]], [-1, 1, PConv, [128]], [[-1, -3, -5, -6], 1, Concat, [1]], [-1, 1, Conv, [512, 1, 1]], # 24 [-1, 1, MP, []], [-1, 1, Conv, [256, 1, 1]], [-3, 1, Conv, [256, 1, 1]], [-1, 1, Conv, [256, 3, 2]], [[-1, -3], 1, Concat, [1]], # 29-P4/16 [-1, 1, Conv, [256, 1, 1]], [-2, 1, Conv, [256, 1, 1]], [-1, 1, PConv, [256]], [-1, 1, PConv, [256]], [-1, 1, PConv, [256]], [-1, 1, PConv, [256]], [[-1, -3, -5, -6], 1, Concat, [1]], [-1, 1, Conv, [1024, 1, 1]], # 37 [-1, 1, MP, []], [-1, 1, Conv, [512, 1, 1]], [-3, 1, Conv, [512, 1, 1]], [-1, 1, Conv, [512, 3, 2]], [[-1, -3], 1, Concat, [1]], # 42-P5/32 [-1, 1, Conv, [256, 1, 1]], [-2, 1, Conv, [256, 1, 1]], [-1, 1, PConv, [256]], [-1, 1, PConv, [256]], [-1, 1, PConv, [256]], [-1, 1, PConv, [256]], [[-1, -3, -5, -6], 1, Concat, [1]], [-1, 1, Conv, [1024, 1, 1]], # 50 ] # yolov7 head head: [[-1, 1, SPPCSPC, [512]], # 51 [-1, 1, Conv, [256, 1, 1]], [-1, 1, nn.Upsample, [None, 2, 'nearest']], [37, 1, Conv, [256, 1, 1]], # route backbone P4 [[-1, -2], 1, Concat, [1]], [-1, 1, Conv, [256, 1, 1]], [-2, 1, Conv, [256, 1, 1]], 
class TridentBlock(nn.Module):
    """Residual block that runs one shared pair of conv kernels at three dilation rates.

    A 1x1 kernel (``share_weightconv1``) and a 3x3 kernel (``share_weightconv2``)
    are shared by three branches that differ only in the padding/dilation used
    for the 3x3 convolution. Each branch adds its own input back (residual), so
    the input must have the same shape as the branch output.

    Args:
        c1: input channels.
        c2: output channels.
        stride: stride of the 3x3 convolution.
        c: when not ``False`` the block receives a single tensor and feeds it to
            all three branches; when ``False`` it receives a sequence of three
            tensors, one per branch.
        e: hidden-channel ratio (``c_ = int(c2 * e)``).
        padding: per-branch padding of the 3x3 convolution.
        dilate: per-branch dilation of the 3x3 convolution.
        bias: create a learnable bias of size ``c2``.
    """

    def __init__(self, c1, c2, stride=1, c=False, e=0.5, padding=(1, 2, 3), dilate=(1, 2, 3), bias=False):
        super(TridentBlock, self).__init__()
        self.stride = stride
        self.c = c
        c_ = int(c2 * e)
        # Immutable defaults: the original list defaults were shared by every instance.
        self.padding = padding
        self.dilate = dilate

        self.share_weightconv1 = nn.Parameter(torch.empty(c_, c1, 1, 1))
        self.share_weightconv2 = nn.Parameter(torch.empty(c2, c_, 3, 3))

        self.bn1 = nn.BatchNorm2d(c_)
        self.bn2 = nn.BatchNorm2d(c2)
        self.act = nn.SiLU()

        nn.init.kaiming_uniform_(self.share_weightconv1, nonlinearity="relu")
        nn.init.kaiming_uniform_(self.share_weightconv2, nonlinearity="relu")

        if bias:
            self.bias = nn.Parameter(torch.empty(c2))
            nn.init.constant_(self.bias, 0)
        else:
            self.bias = None

    def _forward_branch(self, x, idx):
        """Run branch ``idx`` (0 = small, 1 = middle, 2 = big receptive field)."""
        # The bias has c2 entries while the 1x1 conv emits c_ channels. Passing it
        # unconditionally (as the original did) crashes whenever c_ != c2, so it is
        # only applied to the 1x1 conv when the sizes agree.
        bias1 = self.bias
        if bias1 is not None and bias1.numel() != self.share_weightconv1.shape[0]:
            bias1 = None

        out = nn.functional.conv2d(x, self.share_weightconv1, bias=bias1)
        out = self.act(self.bn1(out))
        out = nn.functional.conv2d(out, self.share_weightconv2, bias=self.bias, stride=self.stride,
                                   padding=self.padding[idx], dilation=self.dilate[idx])
        out = self.bn2(out)
        out += x  # residual connection
        return self.act(out)

    def forward_for_small(self, x):
        """Branch using ``padding[0]`` / ``dilate[0]``."""
        return self._forward_branch(x, 0)

    def forward_for_middle(self, x):
        """Branch using ``padding[1]`` / ``dilate[1]``."""
        return self._forward_branch(x, 1)

    def forward_for_big(self, x):
        """Branch using ``padding[2]`` / ``dilate[2]``."""
        return self._forward_branch(x, 2)

    def forward(self, x):
        """Return the three branch outputs as a list ``[small, middle, big]``."""
        if self.c is not False:
            inputs = (x, x, x)
        else:
            inputs = (x[0], x[1], x[2])
        return [self._forward_branch(inp, idx) for idx, inp in enumerate(inputs)]


class RFEM(nn.Module):
    """Stack of ``n`` TridentBlocks whose three branch outputs are summed with the input.

    Args:
        c1: input channels.
        c2: output channels.
        n: number of stacked TridentBlocks.
        e: hidden-channel ratio of the first TridentBlock.
        stride: stride of the first TridentBlock.
    """

    def __init__(self, c1, c2, n=1, e=0.5, stride=1):
        super(RFEM, self).__init__()
        # Only the first block takes a single tensor (c=True); the following blocks
        # consume the three-element list produced by their predecessor.
        layers = [TridentBlock(c1, c2, stride=stride, c=True, e=e)]
        layers.extend(TridentBlock(c2, c2) for _ in range(1, n))
        self.layer = nn.Sequential(*layers)
        self.bn = nn.BatchNorm2d(c2)
        self.act = nn.SiLU()

    def forward(self, x):
        """Sum the three branch outputs and the input, then apply BN and SiLU."""
        out = self.layer(x)
        out = out[0] + out[1] + out[2] + x
        return self.act(self.bn(out))
class RepConvN(nn.Module):
    """Re-parameterizable conv block with parallel 3x3 and 1x1 branches.

    ``forward`` computes ``act(conv3x3(x) + conv1x1(x))``. ``fuse_convs`` folds
    both branches, together with their BatchNorm statistics, into one
    ``nn.Conv2d`` stored as ``self.conv``, which ``forward_fuse`` then uses.
    This code is based on https://github.com/DingXiaoH/RepVGG/blob/main/repvgg.py
    """
    default_act = nn.SiLU()  # default activation

    def __init__(self, c1, c2, k=3, s=1, p=1, g=1, d=1, act=True, bn=False, deploy=False):
        super().__init__()
        assert k == 3 and p == 1
        self.g = g
        self.c1 = c1
        self.c2 = c2
        if act is True:
            self.act = self.default_act
        elif isinstance(act, nn.Module):
            self.act = act
        else:
            self.act = nn.Identity()

        # The BN-only identity branch is never created here.
        self.bn = None
        self.conv1 = Conv(c1, c2, k, s, p=p, g=g, act=False)
        self.conv2 = Conv(c1, c2, 1, s, p=(p - k // 2), g=g, act=False)

    def forward_fuse(self, x):
        """Forward pass through the fused convolution (valid after ``fuse_convs``)."""
        return self.act(self.conv(x))

    def forward(self, x):
        """Forward pass through the un-fused 3x3 and 1x1 branches."""
        identity = self.bn(x) if self.bn is not None else 0
        return self.act(self.conv1(x) + self.conv2(x) + identity)

    def get_equivalent_kernel_bias(self):
        """Return the single 3x3 kernel and bias equivalent to the sum of all branches."""
        k3, b3 = self._fuse_bn_tensor(self.conv1)
        k1, b1 = self._fuse_bn_tensor(self.conv2)
        k_id, b_id = self._fuse_bn_tensor(self.bn)
        kernel = k3 + self._pad_1x1_to_3x3_tensor(k1) + k_id
        bias = b3 + b1 + b_id
        return kernel, bias

    def _avg_to_3x3_tensor(self, avgp):
        """Build the conv kernel equivalent to the average-pooling layer ``avgp``."""
        channels, groups = self.c1, self.g
        kernel_size = avgp.kernel_size
        input_dim = channels // groups
        k = torch.zeros((channels, input_dim, kernel_size, kernel_size))
        k[np.arange(channels), np.tile(np.arange(input_dim), groups), :, :] = 1.0 / kernel_size ** 2
        return k

    def _pad_1x1_to_3x3_tensor(self, kernel1x1):
        """Zero-pad a 1x1 kernel to 3x3 (returns 0 when there is no kernel)."""
        if kernel1x1 is None:
            return 0
        return torch.nn.functional.pad(kernel1x1, [1, 1, 1, 1])

    def _fuse_bn_tensor(self, branch):
        """Fold a branch's BatchNorm into a ``(kernel, bias)`` pair; ``(0, 0)`` for ``None``."""
        if branch is None:
            return 0, 0

        if isinstance(branch, Conv):
            kernel = branch.conv.weight
            norm = branch.bn
        elif isinstance(branch, nn.BatchNorm2d):
            if not hasattr(self, 'id_tensor'):
                # Identity expressed as a 3x3 kernel: a single 1 at the centre tap.
                input_dim = self.c1 // self.g
                kernel_value = np.zeros((self.c1, input_dim, 3, 3), dtype=np.float32)
                rows = np.arange(self.c1)
                kernel_value[rows, rows % input_dim, 1, 1] = 1
                self.id_tensor = torch.from_numpy(kernel_value).to(branch.weight.device)
            kernel = self.id_tensor
            norm = branch

        running_mean = norm.running_mean
        running_var = norm.running_var
        gamma = norm.weight
        beta = norm.bias
        eps = norm.eps

        std = (running_var + eps).sqrt()
        t = (gamma / std).reshape(-1, 1, 1, 1)
        return kernel * t, beta - running_mean * gamma / std

    def fuse_convs(self):
        """Replace the training-time branches with one fused ``nn.Conv2d`` (idempotent)."""
        if hasattr(self, 'conv'):
            return

        kernel, bias = self.get_equivalent_kernel_bias()
        src = self.conv1.conv
        self.conv = nn.Conv2d(in_channels=src.in_channels,
                              out_channels=src.out_channels,
                              kernel_size=src.kernel_size,
                              stride=src.stride,
                              padding=src.padding,
                              dilation=src.dilation,
                              groups=src.groups,
                              bias=True).requires_grad_(False)
        self.conv.weight.data = kernel
        self.conv.bias.data = bias
        for para in self.parameters():
            para.detach_()

        # Drop everything that only the un-fused forward needed.
        self.__delattr__('conv1')
        self.__delattr__('conv2')
        for name in ('nm', 'bn', 'id_tensor'):
            if hasattr(self, name):
                self.__delattr__(name)


class RepNBottleneck(nn.Module):
    """Bottleneck made of a RepConvN followed by a Conv, with an optional shortcut."""

    def __init__(self, c1, c2, shortcut=True, g=1, k=(3, 3), e=0.5, act=True):
        # ch_in, ch_out, shortcut, groups, kernels, expand
        super().__init__()
        c_ = int(c2 * e)  # hidden channels
        self.cv1 = RepConvN(c1, c_, k[0], 1, act=act)
        self.cv2 = Conv(c_, c2, k[1], 1, g=g, act=act)
        # The shortcut is only used when input and output channel counts match.
        self.add = shortcut and c1 == c2

    def forward(self, x):
        y = self.cv2(self.cv1(x))
        return x + y if self.add else y


class RepNCSP(nn.Module):
    """CSP block with three convolutions and ``n`` RepNBottlenecks on the main path."""

    def __init__(self, c1, c2, n=1, shortcut=True, g=1, e=0.5, act=True):
        # ch_in, ch_out, number, shortcut, groups, expansion
        super().__init__()
        c_ = int(c2 * e)  # hidden channels
        self.cv1 = Conv(c1, c_, 1, 1, act=act)
        self.cv2 = Conv(c1, c_, 1, 1, act=act)
        self.cv3 = Conv(2 * c_, c2, 1, act=act)  # optional act=FReLU(c2)
        blocks = (RepNBottleneck(c_, c_, shortcut, g, e=1.0, act=act) for _ in range(n))
        self.m = nn.Sequential(*blocks)

    def forward(self, x):
        main = self.m(self.cv1(x))
        bypass = self.cv2(x)
        return self.cv3(torch.cat((main, bypass), 1))


class RepNCSPELAN4(nn.Module):
    """CSP-ELAN block: split, two chained RepNCSP stages, then concat and fuse."""

    def __init__(self, c1, c2, c3, c4, c5=1, act=True):
        # c1: ch_in, c2: ch_out, c3: channels after cv1 (split in two halves),
        # c4: channels of each RepNCSP stage, c5: bottlenecks per RepNCSP
        super().__init__()
        self.c = c3 // 2
        self.cv1 = Conv(c1, c3, 1, 1, act=act)
        self.cv2 = nn.Sequential(RepNCSP(c3 // 2, c4, c5, act=act), Conv(c4, c4, 3, 1, act=act))
        self.cv3 = nn.Sequential(RepNCSP(c4, c4, c5, act=act), Conv(c4, c4, 3, 1, act=act))
        self.cv4 = Conv(c3 + (2 * c4), c2, 1, 1, act=act)

    def forward(self, x):
        y = list(self.cv1(x).chunk(2, 1))
        # Each stage consumes the most recent feature map, so cv3 sees cv2's output.
        for stage in (self.cv2, self.cv3):
            y.append(stage(y[-1]))
        return self.cv4(torch.cat(y, 1))

    def forward_split(self, x):
        y = list(self.cv1(x).split((self.c, self.c), 1))
        for stage in (self.cv2, self.cv3):
            y.append(stage(y[-1]))
        return self.cv4(torch.cat(y, 1))
nn.LeakyReLU(0.1)]], # 8 ] # yolov7-tiny head head: [[-1, 1, Yolov7_Tiny_SPP, [256, nn.LeakyReLU(0.1)]], # 9-Yolov7-tiny-spp [-1, 1, Conv, [128, 1, 1, None, 1, nn.LeakyReLU(0.1)]], [-1, 1, nn.Upsample, [None, 2, 'nearest']], [6, 1, Conv, [128, 1, 1, None, 1, nn.LeakyReLU(0.1)]], # route backbone P4 [[-1, -2], 1, Concat, [1]], [-1, 1, RepNCSPELAN4, [128, 64, 32, 1, nn.LeakyReLU(0.1)]], # 14 [-1, 1, Conv, [64, 1, 1, None, 1, nn.LeakyReLU(0.1)]], [-1, 1, nn.Upsample, [None, 2, 'nearest']], [4, 1, Conv, [64, 1, 1, None, 1, nn.LeakyReLU(0.1)]], # route backbone P3 [[-1, -2], 1, Concat, [1]], [-1, 1, RepNCSPELAN4,[64, 32, 32, 1, nn.LeakyReLU(0.1)]], # 19 [-1, 1, Conv, [128, 3, 2, None, 1, nn.LeakyReLU(0.1)]], [[-1, 14], 1, Concat, [1]], [-1, 1, RepNCSPELAN4, [128, 64, 32, 1, nn.LeakyReLU(0.1)]], # 22 [-1, 1, Conv, [256, 3, 2, None, 1, nn.LeakyReLU(0.1)]], [[-1, 9], 1, Concat, [1]], [-1, 1, RepNCSPELAN4, [256, 128, 64, 1, nn.LeakyReLU(0.1)]], # 25 [19, 1, Conv, [128, 3, 1, None, 1, nn.LeakyReLU(0.1)]], # 26-P3 [22, 1, Conv, [256, 3, 1, None, 1, nn.LeakyReLU(0.1)]], # 27-P4 [25, 1, Conv, [512, 3, 1, None, 1, nn.LeakyReLU(0.1)]], # 28-P5 [[26, 27, 28], 1, IDetect, [nc, anchors]], # Detect(P3, P4, P5) ] # -----------------------------yolov7-------------------------------- # parameters nc: 80 # number of classes depth_multiple: 1.0 # model depth multiple width_multiple: 1.0 # layer channel multiple # anchors anchors: - [12,16, 19,36, 40,28] # P3/8 - [36,75, 76,55, 72,146] # P4/16 - [142,110, 192,243, 459,401] # P5/32 # yolov7 backbone backbone: # [from, number, module, args] [[-1, 1, Conv, [32, 3, 1]], # 0 [-1, 1, Conv, [64, 3, 2]], # 1-P1/2 [-1, 1, Conv, [64, 3, 1]], [-1, 1, Conv, [128, 3, 2]], # 3-P2/4 [-1, 1, RepNCSPELAN4, [256, 128, 64, 1]], # 4 [-1, 1, V7DownSampling, [128]], # 5-P3/8 [-1, 1, RepNCSPELAN4, [512, 256, 128, 1]], # 6 [-1, 1, V7DownSampling, [256]], # 7-P4/16 [-1, 1, RepNCSPELAN4, [1024, 512, 256, 1]], # 8 [-1, 1, V7DownSampling, [512]], # 9-P5/32 [-1, 1, 
class ConvAWS2d(nn.Conv2d):
    """Conv2d whose weight is standardized per output channel before every forward.

    The weight is centred and divided by its per-filter standard deviation, then
    rescaled with the ``weight_gamma`` / ``weight_beta`` buffers. The layer's own
    bias parameter is not applied in ``forward``.
    """

    def __init__(self, in_channels, out_channels, kernel_size, stride=1, padding=0, dilation=1, groups=1,
                 bias=True):
        super().__init__(
            in_channels,
            out_channels,
            kernel_size,
            stride=stride,
            padding=padding,
            dilation=dilation,
            groups=groups,
            bias=bias)
        self.register_buffer('weight_gamma', torch.ones(self.out_channels, 1, 1, 1))
        self.register_buffer('weight_beta', torch.zeros(self.out_channels, 1, 1, 1))

    def _get_weight(self, weight):
        """Return ``weight`` standardized per output channel and affinely rescaled."""
        weight_mean = weight.mean(dim=1, keepdim=True).mean(dim=2, keepdim=True).mean(dim=3, keepdim=True)
        weight = weight - weight_mean
        std = torch.sqrt(weight.view(weight.size(0), -1).var(dim=1) + 1e-5).view(-1, 1, 1, 1)
        weight = weight / std
        return self.weight_gamma * weight + self.weight_beta

    def forward(self, x):
        weight = self._get_weight(self.weight)
        return super()._conv_forward(x, weight, None)

    def _load_from_state_dict(self, state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys,
                              error_msgs):
        """Load weights; derive gamma/beta from the weight if the checkpoint lacks them."""
        # Sentinel: gamma stays negative only if the checkpoint did not provide it.
        self.weight_gamma.data.fill_(-1)
        super()._load_from_state_dict(state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys,
                                      error_msgs)
        if self.weight_gamma.data.mean() > 0:
            return
        # Initialise gamma/beta so that the standardized weight reproduces the loaded one.
        weight = self.weight.data
        weight_mean = weight.data.mean(dim=1, keepdim=True).mean(dim=2, keepdim=True).mean(dim=3, keepdim=True)
        self.weight_beta.data.copy_(weight_mean)
        std = torch.sqrt(weight.view(weight.size(0), -1).var(dim=1) + 1e-5).view(-1, 1, 1, 1)
        self.weight_gamma.data.copy_(std)


class SAConv2d(ConvAWS2d):
    """Switchable atrous convolution followed by BatchNorm and an activation.

    The same (standardized) weight is applied once with the configured
    padding/dilation and once with both tripled (plus a learned ``weight_diff``);
    a per-pixel ``switch`` map blends the two results. Global-context 1x1 convs
    are added before and after.

    Args:
        in_channels, out_channels, kernel_size: as for ``nn.Conv2d``.
        s: stride. p: padding (``autopad`` is used to resolve it). g: groups. d: dilation.
        act: ``True`` for SiLU, an ``nn.Module`` to use it directly, anything else for identity.
        bias: forwarded to the underlying ``nn.Conv2d``.
    """

    def __init__(self, in_channels, out_channels, kernel_size, s=1, p=None, g=1, d=1, act=True, bias=True):
        super().__init__(
            in_channels,
            out_channels,
            kernel_size,
            stride=s,
            padding=autopad(kernel_size, p),
            dilation=d,
            groups=g,
            bias=bias)
        # Switch starts at constant 1, i.e. the output initially equals the small-dilation branch.
        self.switch = torch.nn.Conv2d(self.in_channels, 1, kernel_size=1, stride=s, bias=True)
        self.switch.weight.data.fill_(0)
        self.switch.bias.data.fill_(1)
        self.weight_diff = torch.nn.Parameter(torch.zeros(self.weight.size()))
        # Context convs start as zero so they are initially no-ops.
        self.pre_context = torch.nn.Conv2d(self.in_channels, self.in_channels, kernel_size=1, bias=True)
        self.pre_context.weight.data.fill_(0)
        self.pre_context.bias.data.fill_(0)
        self.post_context = torch.nn.Conv2d(self.out_channels, self.out_channels, kernel_size=1, bias=True)
        self.post_context.weight.data.fill_(0)
        self.post_context.bias.data.fill_(0)

        self.bn = nn.BatchNorm2d(out_channels)
        self.act = nn.SiLU() if act is True else (act if isinstance(act, nn.Module) else nn.Identity())

    def forward(self, x):
        # pre-context: add a globally pooled, 1x1-projected descriptor to every position
        avg_x = torch.nn.functional.adaptive_avg_pool2d(x, output_size=1)
        avg_x = self.pre_context(avg_x)
        x = x + avg_x.expand_as(x)

        # switch: 5x5 average (reflect padded) followed by a 1x1 conv
        avg_x = torch.nn.functional.pad(x, pad=(2, 2, 2, 2), mode="reflect")
        avg_x = torch.nn.functional.avg_pool2d(avg_x, kernel_size=5, stride=1, padding=0)
        switch = self.switch(avg_x)

        # sac: small-dilation branch with the configured padding/dilation
        weight = self._get_weight(self.weight)
        out_s = super()._conv_forward(x, weight, None)

        # Large-dilation branch. The original temporarily overwrote self.padding and
        # self.dilation and restored them afterwards, which leaves the module in a
        # corrupted state if the convolution raises and makes forward non-reentrant.
        # Calling conv2d with explicit arguments avoids touching module state.
        large_padding = tuple(3 * p for p in self.padding)
        large_dilation = tuple(3 * d for d in self.dilation)
        weight = weight + self.weight_diff
        out_l = torch.nn.functional.conv2d(x, weight, None, self.stride, large_padding, large_dilation,
                                           self.groups)
        out = switch * out_s + (1 - switch) * out_l

        # post-context
        avg_x = torch.nn.functional.adaptive_avg_pool2d(out, output_size=1)
        avg_x = self.post_context(avg_x)
        out = out + avg_x.expand_as(out)
        return self.act(self.bn(out))


# ---- yolo-improve/yolov7-asf.py ----
import torch.nn.functional as F


class Zoom_cat(nn.Module):
    """Resize three feature maps to the middle one's size and concatenate them."""

    def __init__(self):
        super().__init__()

    def forward(self, x):
        """l, m, s are the large, medium and small scales; all are merged onto the m scale."""
        l, m, s = x[0], x[1], x[2]
        tgt_size = m.shape[2:]
        # Downsample the large map with max + average pooling, upsample the small one.
        l = F.adaptive_max_pool2d(l, tgt_size) + F.adaptive_avg_pool2d(l, tgt_size)
        s = F.interpolate(s, tgt_size, mode='nearest')
        return torch.cat([l, m, s], dim=1)


class ScalSeq(nn.Module):
    """Fuse three feature levels by stacking them on a new axis and pooling over it.

    Args:
        inc: channel counts of the three inputs.
        channel: channel count every input is projected to (and of the output).
    """

    def __init__(self, inc, channel):
        super(ScalSeq, self).__init__()
        self.conv0 = Conv(inc[0], channel, 1)
        self.conv1 = Conv(inc[1], channel, 1)
        self.conv2 = Conv(inc[2], channel, 1)
        self.conv3d = nn.Conv3d(channel, channel, kernel_size=(1, 1, 1))
        self.bn = nn.BatchNorm3d(channel)
        self.act = nn.LeakyReLU(0.1)
        self.pool_3d = nn.MaxPool3d(kernel_size=(3, 1, 1))

    def forward(self, x):
        p3, p4, p5 = x[0], x[1], x[2]
        p3 = self.conv0(p3)
        # Bring the two other levels to p3's spatial size.
        p4_2 = F.interpolate(self.conv1(p4), p3.size()[2:], mode='nearest')
        p5_2 = F.interpolate(self.conv2(p5), p3.size()[2:], mode='nearest')
        # Insert a new axis in front of H, W and stack the three levels along it.
        p3_3d = torch.unsqueeze(p3, -3)
        p4_3d = torch.unsqueeze(p4_2, -3)
        p5_3d = torch.unsqueeze(p5_2, -3)
        combine = torch.cat([p3_3d, p4_3d, p5_3d], dim=2)
        out = self.act(self.bn(self.conv3d(combine)))
        # Max over the three stacked levels, then drop the singleton axis.
        out = self.pool_3d(out)
        return torch.squeeze(out, 2)


class Add(nn.Module):
    """Element-wise sum of the first two tensors in the input list."""

    def __init__(self):
        super().__init__()

    def forward(self, x):
        input1, input2 = x[0], x[1]
        return input1 + input2
class channel_att(nn.Module):
    """Channel attention using a 1-D convolution over the pooled channel descriptor.

    The kernel size is derived from the channel count as
    ``int(abs((log2(channel) + b) / gamma))``, rounded up to the next odd number.

    Args:
        channel: number of input channels.
        b, gamma: constants of the kernel-size formula.
    """

    def __init__(self, channel, b=1, gamma=2):
        super(channel_att, self).__init__()
        # math.log2 is exact for powers of two; math.log(channel, 2) is a quotient of
        # two rounded logarithms and can land just below an integer before truncation.
        kernel_size = int(abs((math.log2(channel) + b) / gamma))
        kernel_size = kernel_size if kernel_size % 2 else kernel_size + 1

        self.avg_pool = nn.AdaptiveAvgPool2d(1)
        self.conv = nn.Conv1d(1, 1, kernel_size=kernel_size, padding=(kernel_size - 1) // 2, bias=False)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        y = self.avg_pool(x)
        # Drop the last singleton axis and move channels to the last axis for Conv1d.
        y = y.squeeze(-1).transpose(-1, -2)
        y = self.conv(y).transpose(-1, -2).unsqueeze(-1)
        y = self.sigmoid(y)
        return x * y.expand_as(x)


class local_att(nn.Module):
    """Attention that gates the input with separate per-row and per-column weights.

    Args:
        channel: number of input channels.
        reduction: channel reduction factor of the shared 1x1 bottleneck.
    """

    def __init__(self, channel, reduction=16):
        super(local_att, self).__init__()
        # Keep at least one hidden channel: channel // reduction is 0 for
        # channel < reduction, which would create empty conv/BN layers.
        hidden = max(1, channel // reduction)

        self.conv_1x1 = nn.Conv2d(in_channels=channel, out_channels=hidden, kernel_size=1, stride=1,
                                  bias=False)
        self.relu = nn.ReLU()
        self.bn = nn.BatchNorm2d(hidden)

        self.F_h = nn.Conv2d(in_channels=hidden, out_channels=channel, kernel_size=1, stride=1, bias=False)
        self.F_w = nn.Conv2d(in_channels=hidden, out_channels=channel, kernel_size=1, stride=1, bias=False)

        self.sigmoid_h = nn.Sigmoid()
        self.sigmoid_w = nn.Sigmoid()

    def forward(self, x):
        _, _, h, w = x.size()

        # Average over width / height and lay both descriptors out along the last axis.
        x_h = torch.mean(x, dim=3, keepdim=True).permute(0, 1, 3, 2)
        x_w = torch.mean(x, dim=2, keepdim=True)

        x_cat_conv_relu = self.relu(self.bn(self.conv_1x1(torch.cat((x_h, x_w), 3))))
        x_cat_conv_split_h, x_cat_conv_split_w = x_cat_conv_relu.split([h, w], 3)

        s_h = self.sigmoid_h(self.F_h(x_cat_conv_split_h.permute(0, 1, 3, 2)))
        s_w = self.sigmoid_w(self.F_w(x_cat_conv_split_w))

        return x * s_h.expand_as(x) * s_w.expand_as(x)


class attention_model(nn.Module):
    """Apply channel attention to the first input, add the second, then apply local attention.

    Args:
        ch: channel count of both inputs.
    """

    def __init__(self, ch=256):
        super().__init__()
        self.channel_att = channel_att(ch)
        self.local_att = local_att(ch)

    def forward(self, x):
        input1, input2 = x[0], x[1]
        input1 = self.channel_att(input1)
        return self.local_att(input1 + input2)
# 23 [[4, 6, 8], 1, ScalSeq, [64]], #24 args[inchane] [[17, -1], 1, attention_model, []], #25 [25, 1, Conv, [128, 3, 1, None, 1, nn.LeakyReLU(0.1)]], # 26-P3 [23, 1, Conv, [256, 3, 1, None, 1, nn.LeakyReLU(0.1)]], # 27-P4 [20, 1, Conv, [512, 3, 1, None, 1, nn.LeakyReLU(0.1)]], # 28-P5 [[26,27,28], 1, IDetect, [nc, anchors]], # Detect(P3, P4, P5) ] ##################################################### YOLOV7 ##################################################### # parameters nc: 80 # number of classes depth_multiple: 1.0 # model depth multiple width_multiple: 1.0 # layer channel multiple # anchors anchors: - [12,16, 19,36, 40,28] # P3/8 - [36,75, 76,55, 72,146] # P4/16 - [142,110, 192,243, 459,401] # P5/32 # yolov7 backbone backbone: # [from, number, module, args] [[-1, 1, Conv, [32, 3, 1]], # 0 [-1, 1, Conv, [64, 3, 2]], # 1-P1/2 [-1, 1, Conv, [64, 3, 1]], [-1, 1, Conv, [128, 3, 2]], # 3-P2/4 [-1, 1, Yolov7_E_ELAN, [256, 64]], # 4 [-1, 1, V7DownSampling, [128]], # 5-P3/8 [-1, 1, Yolov7_E_ELAN, [512, 128]], # 6 [-1, 1, V7DownSampling, [256]], # 7-P4/16 [-1, 1, Yolov7_E_ELAN, [1024, 256]], # 8 [-1, 1, V7DownSampling, [512]], # 9-P5/32 [-1, 1, Yolov7_E_ELAN, [1024, 256]], # 10 ] # yolov7 head head: [[-1, 1, SPPCSPC, [512]], # 11 [-1, 1, Conv, [1024, 1, 1, None, 1, nn.LeakyReLU(0.1)]], [6, 1, Conv, [1024, 1, 1, None, 1, nn.LeakyReLU(0.1)]], [[-1, 8, -2], 1, Zoom_cat, []], # route backbone P4 [-1, 1, Yolov7_E_ELAN_NECK, [256, 128]], # 15 [-1, 1, Conv, [512, 1, 1, None, 1, nn.LeakyReLU(0.1)]], [4, 1, Conv, [512, 1, 1, None, 1, nn.LeakyReLU(0.1)]], # 17 [[-1, 6, -2], 1, Zoom_cat, []], # 18 [-1, 1, Yolov7_E_ELAN_NECK, [128, 64]], # 19 [[-1, 15], 1, V7DownSampling_Neck, [128]], # 20 [-1, 1, Yolov7_E_ELAN_NECK, [256, 128]], # 21 [[-1, 11], 1, V7DownSampling_Neck, [256]], # 22 [-1, 1, Yolov7_E_ELAN_NECK, [512, 256]], # 23 [[6, 8, 10], 1, ScalSeq, [128]], #24 args[inchane] [[19, -1], 1, attention_model, []], #25 [25, 1, RepConv, [256, 3, 1]], # 26-P3 [21, 1, RepConv, [512, 3, 
1]], # 27-P4 [23, 1, RepConv, [1024, 3, 1]], # 28-P5 [[26, 27, 28], 1, IDetect, [nc, anchors]], # Detect(P3, P4, P5) ] ================================================ FILE: yolo-improve/yolov7-head/yolov7-tiny-5-heads.yaml ================================================ # parameters nc: 80 # number of classes depth_multiple: 1.0 # model depth multiple width_multiple: 1.0 # layer channel multiple # anchors anchors: 3 # yolov7-tiny backbone backbone: # [from, number, module, args] c2, k=1, s=1, p=None, g=1, act=True [[-1, 1, Conv, [32, 3, 2, None, 1, nn.LeakyReLU(0.1)]], # 0-P1/2 [-1, 1, Conv, [64, 3, 2, None, 1, nn.LeakyReLU(0.1)]], # 1-P2/4 [-1, 1, Yolov7_Tiny_E_ELAN, [64, 32, nn.LeakyReLU(0.1)]], # 2 [-1, 1, MP, []], # 3-P3/8 [-1, 1, Yolov7_Tiny_E_ELAN, [128, 64, nn.LeakyReLU(0.1)]], # 4 [-1, 1, MP, []], # 5-P4/16 [-1, 1, Yolov7_Tiny_E_ELAN, [256, 128, nn.LeakyReLU(0.1)]], # 6 [-1, 1, MP, []], # 7-P5/32 [-1, 1, Yolov7_Tiny_E_ELAN, [512, 256, nn.LeakyReLU(0.1)]], # 8 ] # yolov7-tiny head head: [[-1, 1, Yolov7_Tiny_SPP, [256, nn.LeakyReLU(0.1)]], # 9-Yolov7-tiny-spp [-1, 1, Conv, [128, 1, 1, None, 1, nn.LeakyReLU(0.1)]], [-1, 1, nn.Upsample, [None, 2, 'nearest']], [6, 1, Conv, [128, 1, 1, None, 1, nn.LeakyReLU(0.1)]], # route backbone P4 [[-1, -2], 1, Concat, [1]], [-1, 1, Yolov7_Tiny_E_ELAN, [128, 64, nn.LeakyReLU(0.1)]], # 14 [-1, 1, Conv, [64, 1, 1, None, 1, nn.LeakyReLU(0.1)]], [-1, 1, nn.Upsample, [None, 2, 'nearest']], [4, 1, Conv, [64, 1, 1, None, 1, nn.LeakyReLU(0.1)]], # route backbone P3 [[-1, -2], 1, Concat, [1]], [-1, 1, Yolov7_Tiny_E_ELAN, [64, 32, nn.LeakyReLU(0.1)]], # 19 [-1, 1, Conv, [32, 1, 1, None, 1, nn.LeakyReLU(0.1)]], [-1, 1, nn.Upsample, [None, 2, 'nearest']], [2, 1, Conv, [32, 1, 1, None, 1, nn.LeakyReLU(0.1)]], # route backbone P2 [[-1, -2], 1, Concat, [1]], [-1, 1, Yolov7_Tiny_E_ELAN, [64, 32, nn.LeakyReLU(0.1)]], # 24 [-1, 1, Conv, [64, 3, 2, None, 1, nn.LeakyReLU(0.1)]], [[-1, 19], 1, Concat, [1]], [-1, 1, Yolov7_Tiny_E_ELAN, [128, 64, 
nn.LeakyReLU(0.1)]], # 27 [-1, 1, Conv, [128, 3, 2, None, 1, nn.LeakyReLU(0.1)]], [[-1, 14], 1, Concat, [1]], [-1, 1, Yolov7_Tiny_E_ELAN, [128, 64, nn.LeakyReLU(0.1)]], # 30 [-1, 1, Conv, [256, 3, 2, None, 1, nn.LeakyReLU(0.1)]], [[-1, 9], 1, Concat, [1]], [-1, 1, Yolov7_Tiny_E_ELAN, [256, 128, nn.LeakyReLU(0.1)]], # 33 [24, 1, Conv, [64, 3, 1, None, 1, nn.LeakyReLU(0.1)]], # 34-P2 [27, 1, Conv, [128, 3, 1, None, 1, nn.LeakyReLU(0.1)]], # 35-P3 [30, 1, Conv, [256, 3, 1, None, 1, nn.LeakyReLU(0.1)]], # 36-P4 [33, 1, Conv, [512, 3, 1, None, 1, nn.LeakyReLU(0.1)]], # 37-P5 [33, 1, MP, []], # 38-P5/32 [-1, 1, Yolov7_Tiny_E_ELAN, [512, 256, nn.LeakyReLU(0.1)]], # 39 [[34, 35, 36, 37, 39], 1, IDetect, [nc, anchors]], # Detect(P2, P3, P4, P5, P6) ] ================================================ FILE: yolo-improve/yolov7-head/yolov7-tiny-P2.yaml ================================================ # parameters nc: 80 # number of classes depth_multiple: 1.0 # model depth multiple width_multiple: 1.0 # layer channel multiple # anchors anchors: 3 # yolov7-tiny backbone backbone: # [from, number, module, args] c2, k=1, s=1, p=None, g=1, act=True [[-1, 1, Conv, [32, 3, 2, None, 1, nn.LeakyReLU(0.1)]], # 0-P1/2 [-1, 1, Conv, [64, 3, 2, None, 1, nn.LeakyReLU(0.1)]], # 1-P2/4 [-1, 1, Yolov7_Tiny_E_ELAN, [64, 32, nn.LeakyReLU(0.1)]], # 2 [-1, 1, MP, []], # 3-P3/8 [-1, 1, Yolov7_Tiny_E_ELAN, [128, 64, nn.LeakyReLU(0.1)]], # 4 [-1, 1, MP, []], # 5-P4/16 [-1, 1, Yolov7_Tiny_E_ELAN, [256, 128, nn.LeakyReLU(0.1)]], # 6 [-1, 1, MP, []], # 7-P5/32 [-1, 1, Yolov7_Tiny_E_ELAN, [512, 256, nn.LeakyReLU(0.1)]], # 8 ] # yolov7-tiny head head: [[-1, 1, Yolov7_Tiny_SPP, [256, nn.LeakyReLU(0.1)]], # 9-Yolov7-tiny-spp [-1, 1, Conv, [128, 1, 1, None, 1, nn.LeakyReLU(0.1)]], [-1, 1, nn.Upsample, [None, 2, 'nearest']], [6, 1, Conv, [128, 1, 1, None, 1, nn.LeakyReLU(0.1)]], # route backbone P4 [[-1, -2], 1, Concat, [1]], [-1, 1, Yolov7_Tiny_E_ELAN, [128, 64, nn.LeakyReLU(0.1)]], # 14 [-1, 1, Conv, [64, 
1, 1, None, 1, nn.LeakyReLU(0.1)]], [-1, 1, nn.Upsample, [None, 2, 'nearest']], [4, 1, Conv, [64, 1, 1, None, 1, nn.LeakyReLU(0.1)]], # route backbone P3 [[-1, -2], 1, Concat, [1]], [-1, 1, Yolov7_Tiny_E_ELAN, [64, 32, nn.LeakyReLU(0.1)]], # 19 [-1, 1, Conv, [32, 1, 1, None, 1, nn.LeakyReLU(0.1)]], [-1, 1, nn.Upsample, [None, 2, 'nearest']], [2, 1, Conv, [32, 1, 1, None, 1, nn.LeakyReLU(0.1)]], # route backbone P2 [[-1, -2], 1, Concat, [1]], [-1, 1, Yolov7_Tiny_E_ELAN, [64, 32, nn.LeakyReLU(0.1)]], # 24 [-1, 1, Conv, [64, 3, 2, None, 1, nn.LeakyReLU(0.1)]], [[-1, 19], 1, Concat, [1]], [-1, 1, Yolov7_Tiny_E_ELAN, [128, 64, nn.LeakyReLU(0.1)]], # 27 [-1, 1, Conv, [128, 3, 2, None, 1, nn.LeakyReLU(0.1)]], [[-1, 14], 1, Concat, [1]], [-1, 1, Yolov7_Tiny_E_ELAN, [128, 64, nn.LeakyReLU(0.1)]], # 30 [-1, 1, Conv, [256, 3, 2, None, 1, nn.LeakyReLU(0.1)]], [[-1, 9], 1, Concat, [1]], [-1, 1, Yolov7_Tiny_E_ELAN, [256, 128, nn.LeakyReLU(0.1)]], # 33 [24, 1, Conv, [64, 3, 1, None, 1, nn.LeakyReLU(0.1)]], # 34-P2 [27, 1, Conv, [128, 3, 1, None, 1, nn.LeakyReLU(0.1)]], # 35-P3 [30, 1, Conv, [256, 3, 1, None, 1, nn.LeakyReLU(0.1)]], # 36-P4 [33, 1, Conv, [512, 3, 1, None, 1, nn.LeakyReLU(0.1)]], # 37-P5 [[34, 35, 36, 37], 1, IDetect, [nc, anchors]], # Detect(P2, P3, P4, P5) ] ================================================ FILE: yolo-improve/yolov7-head/yolov7-tiny-P6.yaml ================================================ # parameters nc: 80 # number of classes depth_multiple: 1.0 # model depth multiple width_multiple: 1.0 # layer channel multiple # anchors anchors: 3 # yolov7-tiny backbone backbone: # [from, number, module, args] c2, k=1, s=1, p=None, g=1, act=True [[-1, 1, Conv, [32, 3, 2, None, 1, nn.LeakyReLU(0.1)]], # 0-P1/2 [-1, 1, Conv, [64, 3, 2, None, 1, nn.LeakyReLU(0.1)]], # 1-P2/4 [-1, 1, Yolov7_Tiny_E_ELAN, [64, 32, nn.LeakyReLU(0.1)]], # 2 [-1, 1, MP, []], # 3-P3/8 [-1, 1, Yolov7_Tiny_E_ELAN, [128, 64, nn.LeakyReLU(0.1)]], # 4 [-1, 1, MP, []], # 5-P4/16 [-1, 1, 
Yolov7_Tiny_E_ELAN, [256, 128, nn.LeakyReLU(0.1)]], # 6 [-1, 1, MP, []], # 7-P5/32 [-1, 1, Yolov7_Tiny_E_ELAN, [512, 256, nn.LeakyReLU(0.1)]], # 8 ] # yolov7-tiny head head: [[-1, 1, Yolov7_Tiny_SPP, [256, nn.LeakyReLU(0.1)]], # 9-Yolov7-tiny-spp [-1, 1, Conv, [128, 1, 1, None, 1, nn.LeakyReLU(0.1)]], [-1, 1, nn.Upsample, [None, 2, 'nearest']], [6, 1, Conv, [128, 1, 1, None, 1, nn.LeakyReLU(0.1)]], # route backbone P4 [[-1, -2], 1, Concat, [1]], [-1, 1, Yolov7_Tiny_E_ELAN, [128, 64, nn.LeakyReLU(0.1)]], # 14 [-1, 1, Conv, [64, 1, 1, None, 1, nn.LeakyReLU(0.1)]], [-1, 1, nn.Upsample, [None, 2, 'nearest']], [4, 1, Conv, [64, 1, 1, None, 1, nn.LeakyReLU(0.1)]], # route backbone P3 [[-1, -2], 1, Concat, [1]], [-1, 1, Yolov7_Tiny_E_ELAN, [64, 32, nn.LeakyReLU(0.1)]], # 19 [-1, 1, Conv, [128, 3, 2, None, 1, nn.LeakyReLU(0.1)]], [[-1, 14], 1, Concat, [1]], [-1, 1, Yolov7_Tiny_E_ELAN, [128, 64, nn.LeakyReLU(0.1)]], # 22 [-1, 1, Conv, [256, 3, 2, None, 1, nn.LeakyReLU(0.1)]], [[-1, 9], 1, Concat, [1]], [-1, 1, Yolov7_Tiny_E_ELAN, [256, 128, nn.LeakyReLU(0.1)]], # 25 [19, 1, Conv, [128, 3, 1, None, 1, nn.LeakyReLU(0.1)]], # 26-P3 [22, 1, Conv, [256, 3, 1, None, 1, nn.LeakyReLU(0.1)]], # 27-P4 [25, 1, Conv, [512, 3, 1, None, 1, nn.LeakyReLU(0.1)]], # 28-P5 [25, 1, MP, []], # 29-P6/64 [-1, 1, Yolov7_Tiny_E_ELAN, [512, 256, nn.LeakyReLU(0.1)]], # 30 [[26, 27, 28, 30], 1, IDetect, [nc, anchors]], # Detect(P3, P4, P5, P6) ] ================================================ FILE: yolo-improve/yolov7-iou.py ================================================ import numpy as np import torch, math class WIoU_Scale: ''' monotonous: { None: origin v1 True: monotonic FM v2 False: non-monotonic FM v3 } momentum: The momentum of running mean''' iou_mean = 1. 
def bbox_iou(box1, box2, x1y1x2y2=True, GIoU=False, DIoU=False, CIoU=False, SIoU=False, EIoU=False, WIoU=False,
             Focal=False, alpha=1, gamma=0.5, scale=False, eps=1e-7):
    """Return the IoU, or one selected IoU variant, of box1 against box2.

    ``box1`` is indexed along its first axis for its four coordinates, while
    ``box2`` is transposed first, so it is expected as (n, 4).

    Args:
        box1, box2: box tensors, see above.
        x1y1x2y2: True if boxes are corner encoded, False for (cx, cy, w, h).
        GIoU, DIoU, CIoU, SIoU, EIoU, WIoU: variant switches. When several are
            set the precedence is CIoU > EIoU > SIoU > WIoU > DIoU > GIoU.
        Focal: also return the focal weight ``IoU ** gamma``.
        alpha: exponent of the alpha-IoU family (1 gives the ordinary terms).
        gamma: exponent of the focal weight.
        scale: build a ``WIoU_Scale`` from ``1 - IoU`` (this updates its running
            mean) and, for WIoU, return the scaled three-element form.
        eps: small constant guarding divisions.

    Returns:
        * a tensor for the non-focal variants,
        * ``(metric, IoU ** gamma)`` when ``Focal`` is set,
        * ``(iou, exp(rho2 / c2))`` for WIoU without ``scale``,
        * ``(scaled_loss, (1 - iou) * exp(rho2 / c2), iou)`` for WIoU with ``scale``.

    Raises:
        RuntimeError: if ``WIoU`` and ``Focal`` are both requested.
    """
    box2 = box2.T

    # Corner coordinates of both boxes
    if x1y1x2y2:
        b1_x1, b1_y1, b1_x2, b1_y2 = box1[0], box1[1], box1[2], box1[3]
        b2_x1, b2_y1, b2_x2, b2_y2 = box2[0], box2[1], box2[2], box2[3]
    else:  # convert (cx, cy, w, h) to corners
        b1_x1, b1_x2 = box1[0] - box1[2] / 2, box1[0] + box1[2] / 2
        b1_y1, b1_y2 = box1[1] - box1[3] / 2, box1[1] + box1[3] / 2
        b2_x1, b2_x2 = box2[0] - box2[2] / 2, box2[0] + box2[2] / 2
        b2_y1, b2_y2 = box2[1] - box2[3] / 2, box2[1] + box2[3] / 2

    # Intersection area
    inter = (torch.min(b1_x2, b2_x2) - torch.max(b1_x1, b2_x1)).clamp(0) * \
            (torch.min(b1_y2, b2_y2) - torch.max(b1_y1, b2_y1)).clamp(0)

    # Union area
    w1, h1 = b1_x2 - b1_x1, b1_y2 - b1_y1 + eps
    w2, h2 = b2_x2 - b2_x1, b2_y2 - b2_y1 + eps
    union = w1 * h1 + w2 * h2 - inter + eps
    if scale:
        # Constructed for every variant when scale is set: it updates the running IoU mean.
        wiou_state = WIoU_Scale(1 - (inter / union))

    iou = torch.pow(inter / (union + eps), alpha)  # alpha-IoU

    def _result(metric):
        # Focal variants pair the metric with IoU ** gamma, where gamma is the
        # function argument (never a branch-local value).
        if Focal:
            return metric, torch.pow(inter / (union + eps), gamma)
        return metric

    if not (CIoU or DIoU or GIoU or EIoU or SIoU or WIoU):
        return _result(iou)  # IoU / Focal_IoU

    cw = b1_x2.maximum(b2_x2) - b1_x1.minimum(b2_x1)  # convex (smallest enclosing box) width
    ch = b1_y2.maximum(b2_y2) - b1_y1.minimum(b2_y1)  # convex height

    if not (CIoU or DIoU or EIoU or SIoU or WIoU):
        # GIoU https://arxiv.org/pdf/1902.09630.pdf
        c_area = cw * ch + eps  # convex area
        return _result(iou - torch.pow((c_area - union) / c_area + eps, alpha))

    # Distance or Complete IoU https://arxiv.org/abs/1911.08287v1
    c2 = (cw ** 2 + ch ** 2) ** alpha + eps  # convex diagonal squared
    rho2 = (((b2_x1 + b2_x2 - b1_x1 - b1_x2) ** 2 +
             (b2_y1 + b2_y2 - b1_y1 - b1_y2) ** 2) / 4) ** alpha  # center distance squared

    if CIoU:  # https://github.com/Zzh-tju/DIoU-SSD-pytorch/blob/master/utils/box/box_utils.py#L47
        v = (4 / math.pi ** 2) * (torch.atan(w2 / h2) - torch.atan(w1 / h1)).pow(2)
        with torch.no_grad():
            alpha_ciou = v / (v - iou + (1 + eps))
        return _result(iou - (rho2 / c2 + torch.pow(v * alpha_ciou + eps, alpha)))

    if EIoU:
        rho_w2 = ((b2_x2 - b2_x1) - (b1_x2 - b1_x1)) ** 2
        rho_h2 = ((b2_y2 - b2_y1) - (b1_y2 - b1_y1)) ** 2
        cw2 = torch.pow(cw ** 2 + eps, alpha)
        ch2 = torch.pow(ch ** 2 + eps, alpha)
        return _result(iou - (rho2 / c2 + rho_w2 / cw2 + rho_h2 / ch2))

    if SIoU:  # SIoU Loss https://arxiv.org/pdf/2205.12740.pdf
        s_cw = (b2_x1 + b2_x2 - b1_x1 - b1_x2) * 0.5 + eps
        s_ch = (b2_y1 + b2_y2 - b1_y1 - b1_y2) * 0.5 + eps
        sigma = torch.pow(s_cw ** 2 + s_ch ** 2, 0.5)
        sin_alpha_1 = torch.abs(s_cw) / sigma
        sin_alpha_2 = torch.abs(s_ch) / sigma
        threshold = pow(2, 0.5) / 2
        sin_alpha = torch.where(sin_alpha_1 > threshold, sin_alpha_2, sin_alpha_1)
        angle_cost = torch.cos(torch.arcsin(sin_alpha) * 2 - math.pi / 2)
        rho_x = (s_cw / cw) ** 2
        rho_y = (s_ch / ch) ** 2
        # Kept under its own name so it cannot overwrite the focal exponent `gamma`.
        angle_gamma = angle_cost - 2
        distance_cost = 2 - torch.exp(angle_gamma * rho_x) - torch.exp(angle_gamma * rho_y)
        omiga_w = torch.abs(w1 - w2) / torch.max(w1, w2)
        omiga_h = torch.abs(h1 - h2) / torch.max(h1, h2)
        shape_cost = torch.pow(1 - torch.exp(-1 * omiga_w), 4) + torch.pow(1 - torch.exp(-1 * omiga_h), 4)
        return _result(iou - torch.pow(0.5 * (distance_cost + shape_cost) + eps, alpha))

    if WIoU:  # WIoU https://arxiv.org/abs/2301.10051
        if Focal:
            raise RuntimeError("WIoU do not support Focal.")
        if scale:
            return WIoU_Scale._scaled_loss(wiou_state), (1 - iou) * torch.exp((rho2 / c2)), iou
        return iou, torch.exp((rho2 / c2))  # WIoU v1

    return _result(iou - rho2 / c2)  # DIoU / Focal_DIoU
class ODConv2d(nn.Module):
    """Omni-dimensional dynamic convolution followed by BatchNorm and an activation.

    The layer owns ``kernel_num`` candidate kernels in ``self.weight`` and an
    ``Attention`` module whose forward returns four attentions (channel,
    filter, spatial, kernel) that modulate the input, the kernels and the
    output.
    """

    def __init__(self, in_planes, out_planes, k, s=1, p=None, g=1, act=True, d=1,
                 reduction=0.0625, kernel_num=1):
        super().__init__()
        self.in_planes, self.out_planes = in_planes, out_planes
        self.kernel_size, self.stride = k, s
        self.padding = autopad(k, p)
        self.dilation, self.groups = d, g
        self.kernel_num = kernel_num

        self.attention = Attention(in_planes, out_planes, k, groups=g,
                                   reduction=reduction, kernel_num=kernel_num)
        self.weight = nn.Parameter(torch.randn(kernel_num, out_planes, in_planes // g, k, k),
                                   requires_grad=True)
        self._initialize_weights()
        self.bn = nn.BatchNorm2d(out_planes)

        # act=True -> SiLU, a module instance is used as given, anything else disables it.
        if act is True:
            self.act = nn.SiLU()
        elif isinstance(act, nn.Module):
            self.act = act
        else:
            self.act = nn.Identity()

        # A single 1x1 kernel needs no per-sample kernel aggregation.
        single_pointwise = self.kernel_size == 1 and self.kernel_num == 1
        self._forward_impl = self._forward_impl_pw1x if single_pointwise else self._forward_impl_common

    def _initialize_weights(self):
        """Kaiming-initialise each candidate kernel separately."""
        for idx in range(self.kernel_num):
            kernel = self.weight[idx]
            nn.init.kaiming_normal_(kernel, mode='fan_out', nonlinearity='relu')

    def update_temperature(self, temperature):
        """Forward the new temperature to the attention module."""
        self.attention.update_temperature(temperature)

    def _forward_impl_common(self, x):
        # Applying channel/filter attention to the feature maps instead of the
        # weights is equivalent; the original authors note it is faster and
        # uses less GPU memory.
        ch_att, filt_att, sp_att, k_att = self.attention(x)
        bs, _, h, w = x.size()

        # Fold the batch into the channel axis so one grouped conv can apply a
        # different aggregated kernel to every sample.
        folded = (x * ch_att).reshape(1, -1, h, w)

        kernels = sp_att * k_att * self.weight.unsqueeze(dim=0)
        kernels = torch.sum(kernels, dim=1).view(
            [-1, self.in_planes // self.groups, self.kernel_size, self.kernel_size])

        out = F.conv2d(folded, weight=kernels, bias=None, stride=self.stride, padding=self.padding,
                       dilation=self.dilation, groups=self.groups * bs)
        out = out.view(bs, self.out_planes, out.size(-2), out.size(-1))
        return out * filt_att

    def _forward_impl_pw1x(self, x):
        # Only channel and filter attention are used on this path.
        ch_att, filt_att, _sp_att, _k_att = self.attention(x)
        out = F.conv2d(x * ch_att, weight=self.weight.squeeze(dim=0), bias=None, stride=self.stride,
                       padding=self.padding, dilation=self.dilation, groups=self.groups)
        return out * filt_att

    def forward(self, x):
        y = self._forward_impl(x)
        y = self.bn(y)
        return self.act(y)
class VoVGSCSP(nn.Module):
    """VoVGSCSP block: a GSBottleneck branch and a 1x1 conv branch, concatenated and fused."""

    def __init__(self, c1, c2, n=1, shortcut=True, g=1, e=0.5):
        super().__init__()
        hidden = int(c2 * e)  # hidden channels
        self.cv1 = Conv(c1, hidden, 1, 1)
        self.cv2 = Conv(c1, hidden, 1, 1)
        blocks = [GSBottleneck(hidden, hidden, e=1.0) for _ in range(n)]
        self.gsb = nn.Sequential(*blocks)
        # Not used in forward(); left registered as in the original definition.
        self.res = Conv(hidden, hidden, 3, 1, act=False)
        self.cv3 = Conv(2 * hidden, c2, 1)

    def forward(self, x):
        bottleneck_branch = self.gsb(self.cv1(x))
        direct_branch = self.cv2(x)
        merged = torch.cat((direct_branch, bottleneck_branch), dim=1)
        return self.cv3(merged)
def box_iou_for_nms(box1, box2, GIoU=False, DIoU=False, CIoU=False, SIoU=False, EIou=False, eps=1e-7):
    """Return the IoU (or a selected variant) of box1 against box2.

    Both inputs are split into four coordinates along their last axis and
    interpreted as (x1, y1, x2, y2); the result broadcasts box1 against box2.

    Args:
        box1, box2: corner-encoded box tensors whose last axis has size 4.
        GIoU, DIoU, CIoU, SIoU, EIou: variant switches. When several are set
            the precedence is CIoU > EIou > DIoU > GIoU > SIoU.
        eps: small constant guarding divisions.

    Returns:
        Tensor with the requested overlap measure.
    """
    b1_x1, b1_y1, b1_x2, b1_y2 = box1.chunk(4, -1)
    b2_x1, b2_y1, b2_x2, b2_y2 = box2.chunk(4, -1)
    w1, h1 = b1_x2 - b1_x1, (b1_y2 - b1_y1).clamp(eps)
    w2, h2 = b2_x2 - b2_x1, (b2_y2 - b2_y1).clamp(eps)

    # Intersection area
    inter = (b1_x2.minimum(b2_x2) - b1_x1.maximum(b2_x1)).clamp(0) * \
            (b1_y2.minimum(b2_y2) - b1_y1.maximum(b2_y1)).clamp(0)

    # Union area
    union = w1 * h1 + w2 * h2 - inter + eps

    iou = inter / union
    if not (CIoU or DIoU or GIoU or EIou or SIoU):
        return iou  # IoU

    # The enclosing box is needed by every variant, SIoU included. The original
    # only computed it for the other variants, so SIoU=True raised NameError.
    cw = b1_x2.maximum(b2_x2) - b1_x1.minimum(b2_x1)  # convex (smallest enclosing box) width
    ch = b1_y2.maximum(b2_y2) - b1_y1.minimum(b2_y1)  # convex height

    if CIoU or DIoU or EIou:  # Distance or Complete IoU https://arxiv.org/abs/1911.08287v1
        c2 = cw ** 2 + ch ** 2 + eps  # convex diagonal squared
        rho2 = ((b2_x1 + b2_x2 - b1_x1 - b1_x2) ** 2 + (b2_y1 + b2_y2 - b1_y1 - b1_y2) ** 2) / 4  # center dist ** 2
        if CIoU:  # https://github.com/Zzh-tju/DIoU-SSD-pytorch/blob/master/utils/box/box_utils.py#L47
            v = (4 / math.pi ** 2) * (torch.atan(w2 / h2) - torch.atan(w1 / h1)).pow(2)
            with torch.no_grad():
                alpha = v / (v - iou + (1 + eps))
            return iou - (rho2 / c2 + v * alpha)  # CIoU
        if EIou:
            rho_w2 = ((b2_x2 - b2_x1) - (b1_x2 - b1_x1)) ** 2
            rho_h2 = ((b2_y2 - b2_y1) - (b1_y2 - b1_y1)) ** 2
            cw2 = cw ** 2 + eps
            ch2 = ch ** 2 + eps
            return iou - (rho2 / c2 + rho_w2 / cw2 + rho_h2 / ch2)
        return iou - rho2 / c2  # DIoU

    if GIoU:  # https://arxiv.org/pdf/1902.09630.pdf
        c_area = cw * ch + eps  # convex area
        return iou - (c_area - union) / c_area

    # SIoU Loss https://arxiv.org/pdf/2205.12740.pdf
    s_cw = (b2_x1 + b2_x2 - b1_x1 - b1_x2) * 0.5 + eps
    s_ch = (b2_y1 + b2_y2 - b1_y1 - b1_y2) * 0.5 + eps
    sigma = torch.pow(s_cw ** 2 + s_ch ** 2, 0.5)
    sin_alpha_1 = torch.abs(s_cw) / sigma
    sin_alpha_2 = torch.abs(s_ch) / sigma
    threshold = pow(2, 0.5) / 2
    sin_alpha = torch.where(sin_alpha_1 > threshold, sin_alpha_2, sin_alpha_1)
    angle_cost = torch.cos(torch.arcsin(sin_alpha) * 2 - math.pi / 2)
    rho_x = (s_cw / cw) ** 2
    rho_y = (s_ch / ch) ** 2
    gamma = angle_cost - 2
    distance_cost = 2 - torch.exp(gamma * rho_x) - torch.exp(gamma * rho_y)
    omiga_w = torch.abs(w1 - w2) / torch.max(w1, w2)
    omiga_h = torch.abs(h1 - h2) / torch.max(h1, h2)
    shape_cost = torch.pow(1 - torch.exp(-1 * omiga_w), 4) + torch.pow(1 - torch.exp(-1 * omiga_h), 4)
    return iou - 0.5 * (distance_cost + shape_cost)


def soft_nms(bboxes, scores, iou_thresh=0.5, sigma=0.5, score_threshold=0.25):
    """Gaussian Soft-NMS.

    Repeatedly keeps the highest scoring remaining box, multiplies the score
    of every other box whose IoU with it exceeds ``iou_thresh`` by
    ``exp(-iou**2 / sigma)``, and discards boxes whose score is no longer
    above ``score_threshold``.

    Args:
        bboxes: (n, 4) corner-encoded boxes.
        scores: (n,) scores. NOTE: decayed IN PLACE, as in the original.
        iou_thresh: overlap above which a score is decayed.
        sigma: width of the Gaussian decay.
        score_threshold: boxes at or below this score are dropped.

    Returns:
        CPU ``int64`` tensor with the indices of the kept boxes, in keep order.
    """
    order = scores.argsort(descending=True).to(bboxes.device)
    keep = []
    # The original looped `while order.numel() > 1`, which made its own
    # `numel() == 1` branch unreachable and silently dropped the last
    # surviving box (a single detection produced an empty result).
    while order.numel() > 0:
        top = order[0]
        keep.append(int(top))
        if order.numel() == 1:
            break

        rest = order[1:]
        # reshape(-1) keeps a 1-D tensor even for one remaining box; the
        # original squeeze() produced 0-dim tensors that skipped the decay.
        iou = box_iou_for_nms(bboxes[top], bboxes[rest]).reshape(-1)
        overlapping = iou > iou_thresh
        if overlapping.any():
            scores[rest[overlapping]] *= torch.exp(-torch.pow(iou[overlapping], 2) / sigma)

        survivors = rest[scores[rest] > score_threshold]
        if survivors.numel() == 0:
            break

        # Only the best survivor has to be first; swap it to the front.
        best = int(torch.argmax(scores[survivors]))
        if best != 0:
            survivors[[0, best]] = survivors[[best, 0]]
        order = survivors
    return torch.tensor(keep, dtype=torch.long)
class Bottleneck_DCN(nn.Module):
    """Standard bottleneck whose 3x3 convolutions are replaced by DCNv2."""

    def __init__(self, c1, c2, shortcut=True, g=1, k=(3, 3), e=0.5):
        # ch_in, ch_out, shortcut, groups, kernels, expand
        super().__init__()
        hidden = int(c2 * e)  # hidden channels
        k_first, k_second = k[0], k[1]
        # A kernel size of 3 selects the deformable conv; any other size uses Conv.
        self.cv1 = DCNv2(c1, hidden, k_first, 1) if k_first == 3 else Conv(c1, hidden, k_first, 1)
        self.cv2 = (DCNv2(hidden, c2, k_second, 1, groups=g) if k_second == 3
                    else Conv(hidden, c2, k_second, 1, g=g))
        # The residual connection needs matching input and output channels.
        self.add = shortcut and c1 == c2

    def forward(self, x):
        if self.add:
            return x + self.cv2(self.cv1(x))
        return self.cv2(self.cv1(x))
支持设定加速比例,模型会进行自动压缩,压缩到指定比例或者达到最大压缩次数后会自动进入finetune。 ### 剪枝的一些顾虑 大家关心最多的一个问题就是,我的结构能不能剪之类的,剪枝对模型复杂度的要求比较高,目前剪枝都是基于Torch_Pruning库进行剪枝,prune系列的可以跳过一些不能剪枝的层(某些复杂的结构可能在构建动态图的时候失败,这些就只能换结构),这个项目会有比较多的示例和视频教程教大家如何去剪自己的结构,注意点在哪里等等。这个剪枝项目是没办法保证所有的结构都能剪,有一定的风险,是否入手请自行考虑! [yolov5v7剪枝](https://github.com/z1069614715/objectdetection_script/blob/master/yolo-improve/yolov5v7-light.md)这里面的结构都经过实验是可剪的. ### 那些人群建议入手剪枝 1. 原始的算法精度很高,没办法再提升精度,只能走轻量化路线,这种建议配合一些轻量化模块+剪枝来增加你的工作量和创新度. 2. 需要部署到嵌入式或者手机端等低算力设备,这类本身模型就不能太复杂,而且以轻量化为主,剪枝是非常适合的. 3. 以后需从事深度学习方面的工作,模型轻量化(蒸馏、量化、剪枝)基本是必须要会的技能. ### Yolov8 相关实验 GPU-Device:RTX3090 #### Dataset:VisDrone 30%TrainingData Model:Yolov8n | model | Parameters | GFLOPs | Model Size | mAP50 | mAP50-95 | Inference Time(bs:32) | | :----: | :----: | :----: | :----: | :----: | :----: | :----: | | BaseLine | 3,007,598 | 8.1 | 5.9m | 0.225 | 0.124 | 0.00099s | | Lamp Exp1 2.0X | 1,513,245(50.3%) | 4.0(50%) | 3.1m(52.5%) | 0.197(-0.018) | 0.106(-0.018) | 0.00075s(75.8%) | | Lamp Exp2 2.0X | 679,484(22.6%) | 4.0(50%) | 1.5m(25.4%) | 0.231(+0.006) | 0.126(+0.002) | 0.00073s(73.7%) | | Lamp Exp3 2.5X | 503,959(16.8%) | 3.2(39.5%) | 1.2m(20.3%) | 0.225(0.0) | 0.123(-0.001) | 0.00068s(68.7%) | | Group-Taylor Exp1 2.0X | 1,093,305(36.4%) | 4.0(50%) | 2.3m(39%) | 0.203(-0.022) | 0.11(-0.014) | 0.00074s(74.8%) | | Group-Taylor Exp2 2.0X | 1,513,245(50.3%) | 4.0(50%) | 3.1m(52.5%) | 0.196(-0.029) | 0.105(-0.019) | 0.00075s(75.8%) | | Group-Hessian Exp1 2.0X | 1,436,390(47.8%) | 4.0(50%) | 3.0m(50.8%) | 0.168(-0.057) | 0.0883(-0.041) | 0.00071s(71.7%) | | Group-Sl Exp1 2.0X | 1,556,422(51.7%) | 4.0(50%) | 3.1m(52.5%) | 0.173(-0.052) | 0.0901(-0.0339) | 0.00066s(66.7%) | | Group-Slim Exp1 2.0X | 1,113,000(37%) | 4.0(50%) | 2.3m(39%) | 0.201(-0.024) | 0.108(-0.016) | 0.00075s(75.8%) | | Slim Exp1 2.0X | 932,902(31%) | 4.0(50%) | 2.0m(33.9%) | 0.21(-0.015) | 0.114(-0.01) | 0.00075s(75.8%) | #### Dataset:VisDrone 30%TrainingData Model:yolov8-Faster-GFPN-P2-EfficientHead | model | 
Parameters | GFLOPs | Model Size | mAP50 | mAP50-95 | Inference Time(bs:32) | | :----: | :----: | :----: | :----: | :----: | :----: | :----: | | BaseLine | 3,457,400 | 12.1 | 7.2M | 0.241 | 0.133 | 0.00188s | | Lamp Exp1 2.0X | 903,894(26.1%) | 5.9(48.6%) | 2.3M(32%) | 0.226(-0.015) | 0.127(-0.006) | 0.00150s(83.3%) | | GroupTaylor Exp1 2.0X | 1,699,046(49.1%) | 5.9(48.6%) | 3.9M(54.2%) | 0.212(-0.029) | 0.115(-0.028) | 0.00142s(75.5%) | | GroupTaylor Exp2 2.0X | 1,751,941(51%) | 6.0(49.6%) | 4.0M(55.6%) | 0.216(-0.025) | 0.119(-0.024) | 0.00147s(78.2%) | | GroupHessian Exp1 2.0X | 1,751,941(51%) | 6.0(49.6%) | 2.3M(32%) | 0.214(-0.023) | 0.118(-0.025) | 0.00147s(78.2%) | #### Dataset:Seaship BaseLine:Yolov8n Light:yolov8-BIFPN-EfficientRepHead.yaml(C2f-EMBC,BIFPN,EfficientRepHead) | model | Parameters | GFLOPs | Model Size | mAP50 | mAP50-95 | Inference Time(bs:32) | | :----: | :----: | :----: | :----: | :----: | :----: | :----: | | BaseLine | 3,006,818 | 8.1 | 5.9M | 0.986 | 0.813 | 0.00098s | | Light | 1,809,166(60.2%) | 5.6(69.1%) | 4.5M(76.3%) | 0.981(-0.005) | 0.787(-0.026) | 0.00109s(112.2%) | | Light Lamp Exp1 2.0X | 729,717(24.3%) | 2.4(30%) | 2.3M(39%) | 0.981(-0.005) | 0.777(-0.036) | 0.00080s(81.6%) | | Light Lamp Exp2 2.5X | 492,731(16.4%) | 1.6(19.8%) | 1.8M(31%) | 0.973(-0.013) | 0.746(-0.067) | 0.00062s(63.3%) | #### Dataset:VisDrone 100%TrainingData Model:yolov8-ASF-P2 | model | Parameters | GFLOPs | Model Size | mAP50 | mAP50-95 | Inference Time(bs:32) | | :----: | :----: | :----: | :----: | :----: | :----: | :----: | | BaseLine | 2,490,488 | 12.0 | 5.0M | 0.295 | 0.166 | 0.00199s | | Lamp Exp1 2.0X | 664,162(26.7%) | 5.9(49.2%) | 2.3M(46%) | 0.277(-0.018) | 0.154(-0.012) | 0.00153s(76.9%) | | Lamp Exp2 1.5X | 1,065,363(42.8%) | 7.9(65.8%) | 2.4M(48%) | 0.296(+0.001) | 0.165(-0.001) | 0.00168s(84.4%) | | Lamp Exp3 1.7X | 885,911(35.6%) | 7.0(58.3%) | 2.3M(46%) | 0.29(-0.005) | 0.161(-0.005) | 0.00162s(81.4%) | #### Dataset:VisDrone 30%TrainingData 
Model:yolov8-GHostHGNetV2-SlimNeck-ASF | model | Parameters | GFLOPs | Model Size | mAP50 | mAP50-95 | Inference Time(bs:32) | | :----: | :----: | :----: | :----: | :----: | :----: | :----: | | BaseLine | 2,236,610 | 6.8 | 4.6M | 0.206 | 0.111 | 0.00137s | | LAMP Exp1 2.0X | 951,571(42.5%) | 3.4(50%) | 2.1M(45.7%) | 0.207(+0.001) | 0.112(+0.001) | 0.00092s(67.2%) | #### Dataset:CrowdHuman 20%TrainingData Model:yolov8-convnextv2-goldyolo-ASF | model | Parameters | GFLOPs | Model Size | mAP50 | mAP50-95 | Inference Time(bs:32) | | :----: | :----: | :----: | :----: | :----: | :----: | :----: | | BaseLine | 8,712,945 | 16.7 | 17.0M | 0.747 | 0.431 | 0.00461s | | LAMP Exp1 2.0X | 4,493,135(51.6%) | 8.3(49.7%) | 9.0M(52.9%) | 0.747(0.0) | 0.434(+0.003) | 0.00261s(56.6%) | | LAMP Exp2 2.5X | 3,899,980(44.8%) | 6.6(39.5%) | 7.9M(46.5%) | 0.742(-0.005) | 0.431(0.0) | 0.00219s(47.5%) | #### Dataset:CrowdHuman 20%TrainingData Model:yolov8-DyHead | model | Parameters | GFLOPs | Model Size | mAP50 | mAP50-95 | Inference Time(bs:32) | | :----: | :----: | :----: | :----: | :----: | :----: | :----: | | BaseLine | 3,485,458 | 9.6 | 6.9M | 0.743 | 0.436 | 0.00173s | | LAMP Exp1 2.0X | 1,167,932(33.5%) | 4.8(50%) | 2.5M(65.8%) | 0.745(+0.002) | 0.439(+0.003) | 0.00124s(71.7%) | | LAMP Exp1 2.5X | 815,035(23.4%) | 3.8(39.6%) | 1.8M(26.1%) | 0.74(-0.003) | 0.432(-0.004) | 0.00106s(61.3%) | | LAMP Exp1 3.0X | 628,561(18%) | 3.2(33.3%) | 1.5M(21.7%) | 0.733(-0.01) | 0.426(-0.01) | 0.00098s(56.6%) | #### Dataset:CrowdHuman 20%TrainingData Model:yolov8-repvit(CVPR2024)-RepNCSPELAN | model | Parameters | GFLOPs | Model Size | mAP50 | mAP50-95 | Inference Time(bs:32) | | :----: | :----: | :----: | :----: | :----: | :----: | :----: | | BaseLine | 6,288,382 | 17.6 | 12.7M | 0.74 | 0.431 | 0.00220s | | LAMP Exp1 2.0X | 2,300,482(36.6%) | 8.7(49.4%) | 5.0M(39.4%) | 0.747(+0.007) | 0.438(+0.007) | 0.00167s(76%) | | LAMP Exp2 3.0X | 1,536,813(24.4%) | 5.7(32.4%) | 3.6M(28.3%) | 0.732(-0.008) | 
0.424(-0.007) | 0.00143s(65%) | | LAMP Exp3 3.5X | 1,328,534(21.1%) | 4.8(27.3%) | 3.2M(25.2%) | 0.73(-0.01) | 0.421(-0.01) | 0.00137s(63%) | | LAMP Exp4 4.0X | 1,179,757(18.8%) | 4.2(24.1%) | 2.9M(22.8%) | 0.738(-0.02) | 0.425(-0.006) | 0.00132s(61%) | | GROUP-TAYLOR Exp1 2.0X | 3,235,020(51.4%) | 8.7(49.4%) | 6.8M(53.5%) | 0.704(-0.036) | 0.396(-0.035) | 0.00154s(70%) | | GROUP-TAYLOR Exp2 2.0X | 3,197,034(50.8%) | 8.7(49.4%) | 6.7M(52.7%) | 0.707(-0.033) | 0.405(-0.026) | 0.00158s(72%) | #### Dataset:WIDER-FACE Model:yolov8n-pose (因此数据集的验证集没有pose标注,所以pose指标都为0) | model | Parameters | GFLOPs | Model Size | mAP50 | mAP50-95 | Inference Time(bs:32) | | :----: | :----: | :----: | :----: | :----: | :----: | :----: | | BaseLine | 3,078,128 | 8.3 | 6.1M | 0.639 | 0.334 | 0.00102s | | LAMP Exp1 2.0X | 731,605(23.8%) | 4.1(49.3%) | 1.6M(26.2%) | 0.636(-0.003) | 0.333(-0.001) | 0.00080s(78.4%) | #### Dataset:Seaship Model:yolov8-starnet-C2f-Star-LSCD.yaml | model | Parameters | GFLOPs | Model Size | mAP50 | mAP50-95 | Inference Time(bs:32) | | :----: | :----: | :----: | :----: | :----: | :----: | :----: | | BaseLine | 1,369,689 | 4.5 | 2.8M | 0.992 | 0.815 | 0.00079s | | LAMP Exp1 2.0X | 232,498(17%) | 2.2(49%) | 0.6M(21.4%) | 0.98(-0.012) | 0.791(-0.024) | 0.00047s(59.5%) | | LAMP Exp2 2.5X | 136,375(10%) | 1.8(40%) | 0.5M(17.9%) | 0.965(-0.027) | 0.736(-0.079) | 0.00035s(44.3%) | | LAMP Exp3 3.0X | 98,051(7.2%) | 1.5(33.3%) | 0.4M(14.3%) | 0.912(-0.08) | 0.629(-0.186) | 0.00024s(30.4%) | ### Yolov10 相关实验 GPU-Device:RTX3090 #### Dataset:Visdrone2019 Model:yolov10n.yaml | model | Parameters | GFLOPs | Model Size | mAP50 | mAP50-95 | Inference Time(bs:32) | | :----: | :----: | :----: | :----: | :----: | :----: | :----: | | BaseLine | 2,267,118 | 6.5 | 5.5M | 0.271 | 0.151 | 0.00107s | | LAMP Exp1 2.0X | 788,635(34.8%) | 3.5(53.8%) | 2.1M(38.2%) | 0.271(0.0) | 0.148(-0.003) | 0.00084s(78.5%) | | LAMP Exp2 2.5X | 614,698(27.1%) | 2.8(43.1%) | 1.7M(30.9%) | 0.258(-0.013) | 
0.14(-0.011) | 0.00077s(72%) | ================================================ FILE: yolo-improve/yolov8-distill.md ================================================ # YOLOV8V10V11蒸馏项目介绍 ## 对于群里的蒸馏相关问题,我基本都会回复,对于一些蒸馏问题,我都会给出建议。 ### 首先蒸馏是什么? 模型蒸馏(Model Distillation)是一种用于在计算机视觉中提高模型性能和效率的技术。在模型蒸馏中,通常存在两个模型,即“教师模型”和“学生模型”。 ### 为什么需要蒸馏? 1. 在不增加模型计算量和参数量的情况下提升精度,也即是可以无损提高精度。 2. 配合剪枝一起使用,可以尽量达到无损降低模型参数量、计算量,提高FPS的情况下,还能保持模型精度没有下降甚至上升,这是改进网络结构无法达到的高度。 3. 论文中的保底手段,因为剪枝和蒸馏的特殊性,其都不会增加参数量和计算量,可以在最后一个点上大幅度增加实验和工作量,因为本身蒸馏也需要做大量实验。 ### 目前蒸馏方法包含: 1. Logical 1. L1 2. L2 3. [BCKD](https://link.zhihu.com/?target=https%3A//arxiv.org//pdf/2308.14286)(Bridging Cross-task Protocol Inconsistency for Distillation in Dense Object Detection,ICCV 2023) 4. Double distillation strategy.(针对yolov10的结构开发) 2. Feature 1. [Mimic](https://openaccess.thecvf.com/content_cvpr_2017/papers/Li_Mimicking_Very_Efficient_CVPR_2017_paper.pdf) 2. [Masked Generative Distillation](https://link.zhihu.com/?target=https%3A//arxiv.org/pdf/2205.01529.pdf) (ECCV 2022) 3. [Channel-wise Distillation](https://arxiv.org/pdf/2011.13256.pdf) (ICCV 2021) 4. [ChSimLoss Distillation](https://openaccess.thecvf.com/content/ICCV2021/html/Liu_Exploring_Inter-Channel_Correlation_for_Diversity-Preserved_Knowledge_Distillation_ICCV_2021_paper.html) (ICCV2021) 5. [SPKDLoss Distillation](https://arxiv.org/pdf/1907.09682.pdf) (ICCV2019) ### 知识蒸馏的一些细节(具体项目会提供视频讲解) 1. Feature蒸馏可以自定义选择层进行蒸馏. 2. 蒸馏损失支持常数,线性,余弦进行动调整. 3. 支持Logical和Feature一起使用. 4. 过程中会输出Logical和Feature的损失,让用户可以及时调整对应的损失系数. 5. 支持正常训练模型时候进行蒸馏和剪枝后finetune蒸馏. 6. 支持自蒸馏. 
# 实验示例结果.(以下示例实验相关命令,视频教程,实验数据都在项目里面) #### Dataset:VisDrone(训练集只用了百分之30的数据,验证集和测试集用了全量的数据) Teacher:yolov8s Student:yolov8n (no pretrained weight) | model | GFLOPs | mAP50(test set) | mAP50-95(test set) | | :----: | :----: | :----: | :----: | | yolov8n | 8.1 | 0.202 | 0.108 | | yolov8s | 28.5 | 0.234 | 0.128 | | yolov8n CWD Exp1 | 8.1 | 0.211(+0.009) | 0.114(+0.006) | | yolov8n CWD Exp2 | 8.1 | 0.208(+0.006) | 0.112(+0.004) | | yolov8n CWD Exp3 | 8.1 | 0.21(+0.008) | 0.112(+0.004) | | yolov8n Mimic Exp1 | 8.1 | 0.203(+0.001) | 0.108(+0.0) | | yolov8n Mimic Exp2 | 8.1 | 0.204(+0.002) | 0.107(-0.001) | | yolov8n l2 Exp1 | 8.1 | 0.196(-0.006) | 0.106(-0.002) | | yolov8n BCKD Exp1 | 8.1 | 0.208(+0.006) | 0.112(+0.004) | | yolov8n BCKD Exp2 | 8.1 | 0.206(+0.004) | 0.106(-0.002) | | yolov8n BCKD Exp3 | 8.1 | 0.209(+0.007) | 0.113(+0.005) | | yolov8n BCKD Exp4 | 8.1 | 0.204(+0.002) | 0.11(+0.002) | | yolov8n BCKD+CWD Exp1 | 8.1 | 0.204(+0.002) | 0.109(+0.001) | | yolov8n BCKD+CWD Exp2 | 8.1 | 0.214(+0.012) | 0.115(+0.007) | | yolov8n BCKD+CWD Exp3 | 8.1 | 0.21(+0.008) | 0.114(+0.006) | | yolov8n BCKD+CWD Exp4 | 8.1 | 0.208(+0.006) | 0.113(+0.005) | #### Dataset:VisDrone(训练集只用了百分之30的数据,验证集和测试集用了全量的数据) Teacher:yolov8s Student:yolov8n-lamp (use pretrained weight) | model | GFLOPs | mAP50(test set) | mAP50-95(test set) | | :----: | :----: | :----: | :----: | | yolov8n | 8.1 | 0.225 | 0.124 | | yolov8n-lamp | 3.2 | 0.225 | 0.123(-0.001) | | yolov8s | 28.5 | 0.259 | 0.146 | | yolov8n-lamp cwd exp1 | 3.2 | 0.23(+0.005) | 0.124(0.0) | #### Dataset:VisDrone(训练集只用了百分之30的数据,验证集和测试集用了全量的数据) Teacher:yolov8s-asf-p2 Student:yolov8s-asf-p2 | model | GFLOPs | mAP50(test set) | mAP50-95(test set) | | :----: | :----: | :----: | :----: | | yolov8n-asf-p2 | 12.0 | 0.237 | 0.127 | | yolov8s-asf-p2 | 35.8 | 0.282 | 0.155 | | yolov8n-asf-p2 cwd exp1 | 12.0 | 0.24(+0.003) | 0.129(+0.002) | | yolov8n-asf-p2 cwd exp2 | 12.0 | 0.239(+0.002) | 0.128(+0.001) | | yolov8n-asf-p2 cwd exp3 | 12.0 | 
def get_activation(feat, backbone_idx=-1):
    """Build a forward hook that records a layer's output in ``feat``.

    With ``backbone_idx == -1`` the raw output is appended. Otherwise the
    output is treated as a list: it is left-padded IN PLACE with ``None`` up
    to five entries and the entry at ``backbone_idx`` is appended.
    """
    def hook(model, inputs, outputs):
        if backbone_idx == -1:
            feat.append(outputs)
            return
        # Front-pad so the index refers to the same position in a 5-entry list.
        while len(outputs) < 5:
            outputs.insert(0, None)
        feat.append(outputs[backbone_idx])
    return hook
def get_rectangle(data, thresh):
    """Find the smallest centred square holding more than ``thresh`` of the mass.

    Squares of odd side length ``2 * i + 1`` centred on ``(h // 2, w // 2)`` are
    grown one step at a time until the sum inside exceeds ``thresh`` times the
    sum of the whole map.

    Args:
        data: 2-D array of contribution scores.
        thresh: Fraction of the total sum the square has to exceed.

    Returns:
        ``(side_length, area_ratio)`` for the first square that qualifies, where
        ``area_ratio`` is the square's area divided by the area of ``data``.
        ``(None, None)`` when no candidate square qualifies or the map sums to
        zero, so callers that unpack two values keep working.
    """
    h, w = data.shape
    all_sum = np.sum(data)
    if all_sum == 0:
        # Nothing to measure; avoids a division by zero below.
        return None, None
    # Bound by the shorter side so the slice start never goes negative
    # (a negative start would wrap around instead of clipping).
    for i in range(1, min(h, w) // 2):
        side = i * 2 + 1
        selected_area = data[h // 2 - i:h // 2 + 1 + i, w // 2 - i:w // 2 + 1 + i]
        if np.sum(selected_area) / all_sum > thresh:
            return side, side / h * side / w
    return None, None
def get_input_grad(self, samples):
    """Return the positive input gradient of the hooked layer's centre activation.

    Runs ``self.model`` on ``samples`` so the registered forward hook fills
    ``self.feature``, takes the most recent captured tensor, and differentiates
    the ReLU-ed activations at its spatial centre with respect to ``samples``.

    Args:
        samples: Input tensor with ``requires_grad`` enabled.

    Returns:
        NumPy array holding the ReLU-ed gradient summed over dimensions 0 and 1.
    """
    relu = torch.nn.functional.relu

    # The forward pass is run only for its side effect on self.feature.
    self.model(samples)
    feature_map = self.feature[-1]
    self.feature.clear()

    shape = feature_map.size()
    center_row = shape[2] // 2
    center_col = shape[3] // 2
    center_activation = relu(feature_map[:, :, center_row, center_col]).sum()

    input_grad = torch.autograd.grad(center_activation, samples)[0]
    return relu(input_grad).sum((0, 1)).cpu().numpy()
def get_video_cfg(path):
    """Read the writer settings needed to re-encode the video at ``path``.

    Args:
        path: Path of the source video.

    Returns:
        ``(fourcc, (width, height), fps)`` where ``fourcc`` is the XVID codec
        code and ``fps`` is the source frame rate truncated to an int.
    """
    video = cv2.VideoCapture(path)
    try:
        size = (int(video.get(cv2.CAP_PROP_FRAME_WIDTH)), int(video.get(cv2.CAP_PROP_FRAME_HEIGHT)))
        fps = int(video.get(cv2.CAP_PROP_FPS))
    finally:
        # Only metadata is needed; close the handle right away instead of
        # leaving it to garbage collection.
        video.release()
    return cv2.VideoWriter_fourcc(*'XVID'), size, fps
def transform_mot(result):
    """Convert one detection result into the array handed to the tracker.

    Args:
        result: Detection result whose ``boxes`` attribute exposes ``xyxy``
            (N x 4), ``conf`` (N) and ``cls`` (N) tensors.

    Returns:
        ``float64`` array of shape ``(N, 6)`` with the four ``xyxy`` values
        followed by confidence and class. With no detections the shape is
        ``(0, 6)`` rather than a 1-D empty array.
        NOTE(review): boxmot trackers presumably require a 2-D six-column
        input even for empty frames; confirm against the installed version.
    """
    boxes = result.boxes
    xyxy = boxes.xyxy.detach().cpu().numpy().reshape(-1, 4)
    conf = boxes.conf.detach().cpu().numpy().reshape(-1, 1)
    cls = boxes.cls.detach().cpu().numpy().reshape(-1, 1)
    return np.concatenate([xyxy, conf, cls], axis=1).astype(np.float64)
tracker = BoTSORT( # model_weights=Path('osnet_x1_0_msmt17_256x128_amsgrad_ep150_stp60_lr0.0015_b64_fb10_softmax_labelsmooth_flip.pt'), # which ReID model to use # device='cuda:0', # fp16=False, # ) # tracker = StrongSORT( # model_weights=Path('osnet_x1_0_msmt17_256x128_amsgrad_ep150_stp60_lr0.0015_b64_fb10_softmax_labelsmooth_flip.pt'), # which ReID model to use # device='cuda:0', # fp16=False, # ) # tracker = HybridSORT( # reid_weights=Path('osnet_x1_0_msmt17_256x128_amsgrad_ep150_stp60_lr0.0015_b64_fb10_softmax_labelsmooth_flip.pt'), # which ReID model to use # device='cuda:0', # half=False, # det_thresh=0.3, # ) # tracker = BYTETracker() # tracker = OCSORT() fourcc, size, fps = get_video_cfg(f'{video_base_path}/{video_path}') video_output = cv2.VideoWriter(f'{output_dir}/{video_path}', fourcc, fps, size) for result in model.predict(source=f'{video_base_path}/{video_path}', stream=True, imgsz=640, save=False, # conf=0.2, classes=1 ): image_plot = result.orig_img mot_input = transform_mot(result) try: tracker.update(mot_input, image_plot) tracker.plot_results(image_plot, show_trajectories=True) except: continue counting(image_plot, result) video_output.write(image_plot) video_output.release() ================================================ FILE: yolo-improve/yolov8.py ================================================ from ultralytics import YOLO # 安装命令 # python setup.py develop # 数据集示例百度云链接 # 链接:https://pan.baidu.com/s/19FM7XnKEFC83vpiRdtNA8A?pwd=n93i # 提取码:n93i if __name__ == '__main__': # 直接使用预训练模型创建模型. model = YOLO('yolov8n.pt') model.train(**{'cfg':'ultralytics/cfg/exp1.yaml', 'data':'dataset/data.yaml'}) # 使用yaml配置文件来创建模型,并导入预训练权重. 
model = YOLO('ultralytics/cfg/models/v8/yolov8.yaml') model.load('yolov8n.pt') model.train(**{'cfg':'ultralytics/cfg/exp1.yaml', 'data':'dataset/data.yaml'}) # 模型验证 model = YOLO('runs/detect/yolov8n_exp/weights/best.pt') model.val(**{'data':'dataset/data.yaml'}) # 模型推理 model = YOLO('runs/detect/yolov8n_exp/weights/best.pt') model.predict(source='dataset/images/test', **{'save':True}) ================================================ FILE: yolo-improve/yolov8v10-project.md ================================================ # [基于Ultralytics的YOLOV8V10改进项目.(69.9¥)](https://github.com/z1069614715/objectdetection_script) # 目前自带的一些改进方案(目前拥有合计300+个改进点!持续更新!) # 为了感谢各位对本项目的支持,本项目的赠品是yolov5-PAGCP通道剪枝算法.[具体使用教程](https://www.bilibili.com/video/BV1yh4y1Z7vz/) # 专栏改进汇总 ## YOLOV8系列 ### 二次创新系列 1. ultralytics/cfg/models/v8/yolov8-RevCol.yaml 使用(ICLR2023)Reversible Column Networks对yolov8主干进行重设计,里面的支持更换不同的C2f-Block. 2. EMASlideLoss 使用EMA思想与SlideLoss进行相结合. 3. ultralytics/cfg/models/v8/yolov8-dyhead-DCNV3.yaml 使用[DCNV3](https://github.com/OpenGVLab/InternImage)替换DyHead中的DCNV2. 4. ultralytics/cfg/models/v8/yolov8-C2f-EMBC.yaml 使用[Efficientnet](https://blog.csdn.net/weixin_43334693/article/details/131114618?spm=1001.2014.3001.5501)中的MBConv与EffectiveSE改进C2f. 5. ultralytics/cfg/models/v8/yolov8-GhostHGNetV2.yaml 使用Ghost_HGNetV2作为YOLOV8的backbone. 6. ultralytics/cfg/models/v8/yolov8-RepHGNetV2.yaml 使用Rep_HGNetV2作为YOLOV8的backbone. 7. ultralytics/cfg/models/v8/yolov8-C2f-DWR-DRB.yaml 使用[UniRepLKNet](https://github.com/AILab-CVC/UniRepLKNet/tree/main)中的DilatedReparamBlock对[DWRSeg](https://arxiv.org/abs/2212.01173)中的Dilation-wise Residual(DWR)的模块进行二次创新后改进C2f. 8. ultralytics/cfg/models/v8/yolov8-ASF-P2.yaml 在ultralytics/cfg/models/v8/yolov8-ASF.yaml的基础上进行二次创新,引入P2检测层并对网络结构进行优化. 9. ultralytics/cfg/models/v8/yolov8-CSP-EDLAN.yaml 使用[DualConv](https://github.com/ChipsGuardian/DualConv)打造CSP Efficient Dual Layer Aggregation Networks改进yolov8. 10. 
ultralytics/cfg/models/v8/yolov8-bifpn-SDI.yaml 使用[U-NetV2](https://github.com/yaoppeng/U-Net_v2)中的 Semantics and Detail Infusion Module对BIFPN进行二次创新. 11. ultralytics/cfg/models/v8/yolov8-goldyolo-asf.yaml 利用华为2023最新GOLD-YOLO中的Gatherand-Distribute与[ASF-YOLO](https://github.com/mkang315/ASF-YOLO)中的Attentional Scale Sequence Fusion进行二次创新改进yolov8的neck. 12. ultralytics/cfg/models/v8/yolov8-dyhead-DCNV4.yaml 使用[DCNV4](https://github.com/OpenGVLab/DCNv4)对DyHead进行二次创新.(请关闭AMP进行训练,使用教程请看20240116版本更新说明) 13. ultralytics/cfg/models/v8/yolov8-HSPAN.yaml 对[MFDS-DETR](https://github.com/JustlfC03/MFDS-DETR)中的HS-FPN进行二次创新后得到HSPAN改进yolov8的neck. 14. ultralytics/cfg/models/v8/yolov8-GDFPN.yaml 使用[DAMO-YOLO](https://github.com/tinyvision/DAMO-YOLO)中的RepGFPN与[ICCV2023 DySample](https://arxiv.org/abs/2308.15085)进行二次创新改进Neck. 15. ultralytics/cfg/models/v8/yolov8-HSPAN-DySample.yaml 对[MFDS-DETR](https://github.com/JustlfC03/MFDS-DETR)中的HS-FPN进行二次创新后得到HSPAN再进行创新,使用[ICCV2023 DySample](https://arxiv.org/abs/2308.15085)改进其上采样模块. 16. ultralytics/cfg/models/v8/yolov8-ASF-DySample.yaml 使用[ASF-YOLO](https://github.com/mkang315/ASF-YOLO)中的Attentional Scale Sequence Fusion与[ICCV2023 DySample](https://arxiv.org/abs/2308.15085)组合得到Dynamic Sample Attentional Scale Sequence Fusion. 17. ultralytics/cfg/models/v8/yolov8-C2f-DCNV2-Dynamic.yaml 利用自研注意力机制MPCA强化DCNV2中的offset和mask. 18. ultralytics/cfg/models/v8/yolov8-C2f-iRMB-Cascaded.yaml 使用[EfficientViT CVPR2023](https://github.com/microsoft/Cream/tree/main/EfficientViT)中的CascadedGroupAttention对[EMO ICCV2023](https://github.com/zhangzjn/EMO)中的iRMB进行二次创新来改进C2f. 19. ultralytics/cfg/models/v8/yolov8-C2f-iRMB-DRB.yaml 使用[UniRepLKNet](https://github.com/AILab-CVC/UniRepLKNet/tree/main)中的DilatedReparamBlock对[EMO ICCV2023](https://github.com/zhangzjn/EMO)中的iRMB进行二次创新来改进C2f. 20. ultralytics/cfg/models/v8/yolov8-C2f-iRMB-SWC.yaml 使用[shift-wise conv](https://arxiv.org/abs/2401.12736)对[EMO ICCV2023](https://github.com/zhangzjn/EMO)中的iRMB进行二次创新来改进C2f. 21. 
ultralytics/cfg/models/v8/yolov8-DBBNCSPELAN.yaml 使用[Diverse Branch Block CVPR2021](https://arxiv.org/abs/2103.13425)对[YOLOV9](https://github.com/WongKinYiu/yolov9)中的RepNCSPELAN进行二次创新后改进yolov8. 22. ultralytics/cfg/models/v8/yolov8-OREPANCSPELAN.yaml 使用[Online Convolutional Re-parameterization (CVPR2022)](https://github.com/JUGGHM/OREPA_CVPR2022/tree/main)对[YOLOV9](https://github.com/WongKinYiu/yolov9)中的RepNCSPELAN进行二次创新后改进yolov8. 23. ultralytics/cfg/models/v8/yolov8-DRBNCSPELAN.yaml 使用[UniRepLKNet](https://github.com/AILab-CVC/UniRepLKNet/tree/main)中的DilatedReparamBlock对[YOLOV9](https://github.com/WongKinYiu/yolov9)中的RepNCSPELAN进行二次创新后改进yolov8. 24. ultralytics/cfg/models/v8/yolov8-DynamicHGNetV2.yaml 使用[CVPR2024 parameternet](https://arxiv.org/pdf/2306.14525v2.pdf)中的DynamicConv对[CVPR2024 RTDETR](https://arxiv.org/abs/2304.08069)中的HGBlock进行二次创新. 25. ultralytics/cfg/models/v8/yolov8-C2f-RVB-EMA.yaml 使用[CVPR2024 RepViT](https://github.com/THU-MIG/RepViT/tree/main)中的RepViTBlock和EMA注意力机制改进C2f. 26. ultralytics/cfg/models/v8/yolov8-ELA-HSFPN.yaml 使用[Efficient Local Attention](https://arxiv.org/abs/2403.01123)改进HSFPN. 27. ultralytics/cfg/models/v8/yolov8-CA-HSFPN.yaml 使用[Coordinate Attention CVPR2021](https://github.com/houqb/CoordAttention)改进HSFPN. 28. ultralytics/cfg/models/v8/yolov8-CAA-HSFPN.yaml 使用[CVPR2024 PKINet](https://github.com/PKINet/PKINet)中的CAA模块改进HSFPN. 29. ultralytics/cfg/models/v8/yolov8-CSMHSA.yaml 对Multi-Head Self-Attention进行创新得到Cross-Scale Multi-Head Self-Attention. 1. 由于高维通常包含更高级别的语义信息,而低维包含更多细节信息,因此高维信息作为query,而低维信息作为key和Value,将两者结合起来可以利用高维的特征帮助低维的特征进行精细过滤,可以实现更全面和丰富的特征表达。 2. 通过使用高维的上采样信息进行Query操作,可以更好地捕捉到目标的全局信息,从而有助于增强模型对目标的识别和定位能力。 30. ultralytics/cfg/models/v8/yolov8-CAFMFusion.yaml 利用[HCANet](https://github.com/summitgao/HCANet)中的CAFM(其具有获取全局和局部信息的注意力机制)对content-guided attention fusion进行二次改进. 31. 
ultralytics/cfg/models/v8/yolov8-C2f-Faster-CGLU.yaml 使用[TransNeXt CVPR2024](https://github.com/DaiShiResearch/TransNeXt)中的Convolutional GLU对CVPR2023中的FasterNet进行二次创新. 32. ultralytics/cfg/models/v8/yolov8-C2f-Star-CAA.yaml 使用[StarNet CVPR2024](https://github.com/ma-xu/Rewrite-the-Stars/tree/main)中的StarBlock和[CVPR2024 PKINet](https://github.com/PKINet/PKINet)中的CAA改进C2f. 33. ultralytics/cfg/models/v8/yolov8-bifpn-GLSA.yaml 使用[GLSA](https://github.com/Barrett-python/DuAT)模块对bifpn进行二次创新. 34. ultralytics/cfg/models/v8/yolov8-BIMAFPN.yaml 利用BIFPN的思想对[MAF-YOLO](https://arxiv.org/pdf/2407.04381)的MAFPN进行二次改进得到BIMAFPN. 35. ultralytics/cfg/models/v8/yolov8-C2f-AdditiveBlock-CGLU.yaml 使用[CAS-ViT](https://github.com/Tianfang-Zhang/CAS-ViT)中的AdditiveBlock和[TransNeXt CVPR2024](https://github.com/DaiShiResearch/TransNeXt)中的Convolutional GLU改进c2f. 36. ultralytics/cfg/models/v8/yolov8-C2f-MSMHSA-CGLU.yaml 使用[CMTFNet](https://github.com/DrWuHonglin/CMTFNet/tree/main)中的M2SA和[TransNeXt CVPR2024](https://github.com/DaiShiResearch/TransNeXt)中的Convolutional GLU改进c2f. 37. ultralytics/cfg/models/v8/yolov8-C2f-IdentityFormer-CGLU.yaml 使用[Metaformer TPAMI2024](https://github.com/sail-sg/metaformer)中的IdentityFormer和[TransNeXt CVPR2024](https://github.com/DaiShiResearch/TransNeXt)中的CGLU改进c2f. 38. ultralytics/cfg/models/v8/yolov8-C2f-RandomMixing-CGLU.yaml 使用[Metaformer TPAMI2024](https://github.com/sail-sg/metaformer)中的RandomMixing和[TransNeXt CVPR2024](https://github.com/DaiShiResearch/TransNeXt)中的CGLU改进c2f. 39. ultralytics/cfg/models/v8/yolov8-C2f-PoolingFormer-CGLU.yaml 使用[Metaformer TPAMI2024](https://github.com/sail-sg/metaformer)中的PoolingFormer和[TransNeXt CVPR2024](https://github.com/DaiShiResearch/TransNeXt)中的CGLU改进c2f. 40. ultralytics/cfg/models/v8/yolov8-C2f-ConvFormer-CGLU.yaml 使用[Metaformer TPAMI2024](https://github.com/sail-sg/metaformer)中的ConvFormer和[TransNeXt CVPR2024](https://github.com/DaiShiResearch/TransNeXt)中的CGLU改进c2f. 41. 
ultralytics/cfg/models/v8/yolov8-C2f-CaFormer-CGLU.yaml 使用[Metaformer TPAMI2024](https://github.com/sail-sg/metaformer)中的CaFormer和[TransNeXt CVPR2024](https://github.com/DaiShiResearch/TransNeXt)中的CGLU改进c2f. 42. ultralytics/cfg/models/v8/yolov8-MAN-Faster.yaml 使用[Hyper-YOLO](https://www.arxiv.org/pdf/2408.04804)中的 Mixed Aggregation Network和[FasterNet CVPR2023](https://github.com/JierunChen/FasterNet)中的Faster-Block进行二次创新改进yolov8. 43. ultralytics/cfg/models/v8/yolov8-MAN-FasterCGLU.yaml 使用[Hyper-YOLO](https://www.arxiv.org/pdf/2408.04804)中的 Mixed Aggregation Network和[FasterNet CVPR2023](https://github.com/JierunChen/FasterNet)中的Faster-Block和[TransNeXt CVPR2024](https://github.com/DaiShiResearch/TransNeXt)中的Convolutional GLU进行二次创新改进yolov8. 44. ultralytics/cfg/models/v8/yolov8-MAN-Star.yaml 使用[Hyper-YOLO](https://www.arxiv.org/pdf/2408.04804)中的 Mixed Aggregation Network和[StarNet CVPR2024](https://github.com/ma-xu/Rewrite-the-Stars/tree/main)中的StarBlock进行二次创新改进yolov8. 45. ultralytics/cfg/models/v8/yolov8-MutilBackbone-MSGA.yaml 使用[MSA^2 Net](https://github.com/xmindflow/MSA-2Net)中的Multi-Scale Adaptive Spatial Attention Gate对自研系列MutilBackbone再次创新. 46. ultralytics/cfg/models/v8/yolov8-slimneck-WFU.yaml 使用[ACMMM2024 WFEN](https://github.com/PRIS-CV/WFEN)中的Wavelet Feature Upgrade对slimneck二次创新. 47. ultralytics/cfg/models/v8/yolov8-MAN-FasterCGLU-WFU.yaml 使用[ACMMM2024 WFEN](https://github.com/PRIS-CV/WFEN)中的Wavelet Feature Upgrade和[Hyper-YOLO](https://www.arxiv.org/pdf/2408.04804)中的 Mixed Aggregation Network和[FasterNet CVPR2023](https://github.com/JierunChen/FasterNet)中的Faster-Block和[TransNeXt CVPR2024](https://github.com/DaiShiResearch/TransNeXt)中的Convolutional GLU进行二次创新改进yolov8. 48. ultralytics/cfg/models/v8/yolov8-CDFA.yaml 使用[ACMMM2024 WFEN](https://github.com/PRIS-CV/WFEN)中的WaveletConv与[AAAI2025 ConDSeg](https://github.com/Mengqi-Lei/ConDSeg)的ContrastDrivenFeatureAggregation结合改进yolov8. 49. 
ultralytics/cfg/models/v8/yolov8-C2f-StripCGLU.yaml 使用[Strip R-CNN](https://arxiv.org/pdf/2501.03775)中的StripBlock和[TransNeXt CVPR2024](https://github.com/DaiShiResearch/TransNeXt)中的Convolutional GLU改进C2f. 50. ultralytics/cfg/models/v8/yolov8-C2f-Faster-KAN.yaml 使用[ICLR2025 Kolmogorov-Arnold Transformer](https://github.com/Adamdad/kat)中的KAN对(CVPR2023)fasternet中的FastetBlock进行二次创新. 51. ultralytics/cfg/models/v8/yolov8-C2f-DIMB-KAN.yaml 在yolov8-C2f-DIMB.yaml的基础上把mlp模块换成[ICLR2025 Kolmogorov-Arnold Transformer](https://github.com/Adamdad/kat)中的KAN. 52. Localization Quality Estimation - Lightweight Shared Convolutional Detection Head Localization Quality Estimation模块出自[GFocalV2](https://arxiv.org/abs/2011.12885). detect:ultralytics/cfg/models/v8/yolov8-LSCD-LQE.yaml seg:ultralytics/cfg/models/v8/yolov8-seg-LSCD-LQE.yaml pose:ultralytics/cfg/models/v8/yolov8-pose-LSCD-LQE.yaml obb:ultralytics/cfg/models/v8/yolov8-obb-LSCD-LQE.yaml 53. ultralytics/cfg/models/v8/yolov8-C2f-EfficientVIM-CGLU.yaml 使用[CVPR2025 EfficientViM](https://github.com/mlvlab/EfficientViM)中的EfficientViMBlock和[TransNeXt CVPR2024](https://github.com/DaiShiResearch/TransNeXt)中的Convolutional GLU改进C2f. 54. ultralytics/cfg/models/v8/yolov8-EUCB-SC.yaml 使用[CVPR2024 EMCAD](https://github.com/SLDGroup/EMCAD)中的EUCB和[CVPR2025 BHViT](https://github.com/IMRL/BHViT)中的ShiftChannelMix改进yolov8的上采样. 55. ultralytics/cfg/models/v8/yolov8-EMBSFPN-SC.yaml 在ultralytics/cfg/models/v8/yolov8-EMBSFPN.yaml方案上引入[CVPR2025 BHViT](https://github.com/IMRL/BHViT)中的ShiftChannelMix. 56. ultralytics/cfg/models/v8/yolov8-MFMMAFPN.yaml 使用[CVPR2024 DCMPNet](https://github.com/zhoushen1/DCMPNet)中的MFM对[MAF-YOLO](https://arxiv.org/pdf/2407.04381)的MAFPN进行二次创新. 57. ultralytics/cfg/models/v8/yolov8-MBSMFFPN.yaml 使用[CVPR2024 DCMPNet](https://github.com/zhoushen1/DCMPNet)中的MFM对yolov8-EMBSFPN.yaml再次创新 Multi-Branch&Scale Modulation-Fusion FPN. 58. 
ultralytics/cfg/models/v8/yolov8-C2f-mambaout-LSConv.yaml 使用[CVPR2025 LSNet](https://github.com/THU-MIG/lsnet)的LSConv与[CVPR2025 MambaOut](https://github.com/yuweihao/MambaOut)中的MambaOutBlock二次创新后改进C2f. 59. ultralytics/cfg/models/v8/yolov8-SOEP-RFPN-MFM.yaml 使用[ECCV2024 rethinking-fpn](https://github.com/AlanLi1997/rethinking-fpn)的SNI和GSConvE和[CVPR2024 DCMPNet](https://github.com/zhoushen1/DCMPNet)中的MFM对原创改进SOEP再次创新. 60. ultralytics/cfg/models/v8/yolov8-SOEP-PST.yaml 使用[Pyramid Sparse Transformer](https://arxiv.org/abs/2505.12772)中的Pyramid Sparse Transformer对SOEP进行二次创新. 61. ultralytics/cfg/models/v8/yolov8-MAN-GCConv.yaml 使用[CVPR2025 Golden Cudgel Network](https://github.com/gyyang23/GCNet)中的GCConv改进[Hyper-YOLO TPAMI2025](https://www.arxiv.org/pdf/2408.04804)中的Mixed Aggregation Network. ### 自研系列 1. ultralytics/cfg/models/v8/yolov8-LAWDS.yaml Light Adaptive-weight downsampling.自研模块,具体讲解请看百度云链接中的视频. 2. ultralytics/cfg/models/v8/yolov8-C2f-EMSC.yaml Efficient Multi-Scale Conv.自研模块,具体讲解请看百度云链接中的视频. 3. ultralytics/cfg/models/v8/yolov8-C2f-EMSCP.yaml Efficient Multi-Scale Conv Plus.自研模块,具体讲解请看百度云链接中的视频. 4. Lightweight Shared Convolutional Detection Head 自研轻量化检测头. detect:ultralytics/cfg/models/v8/yolov8-LSCD.yaml seg:ultralytics/cfg/models/v8/yolov8-seg-LSCD.yaml pose:ultralytics/cfg/models/v8/yolov8-pose-LSCD.yaml obb:ultralytics/cfg/models/v8/yolov8-obb-LSCD.yaml 1. GroupNorm在FCOS论文中已经证实可以提升检测头定位和分类的性能. 2. 通过使用共享卷积,可以大幅减少参数数量,这使得模型更轻便,特别是在资源受限的设备上. 3. 在使用共享卷积的同时,为了应对每个检测头所检测的目标尺度不一致的问题,使用Scale层对特征进行缩放. 综合以上,我们可以让检测头做到参数量更少、计算量更少的情况下,尽可能减少精度的损失. 5. Task Align Dynamic Detection Head 自研任务对齐动态检测头. detect:ultralytics/cfg/models/v8/yolov8-TADDH.yaml seg:ultralytics/cfg/models/v8/yolov8-seg-TADDH.yaml pose:ultralytics/cfg/models/v8/yolov8-pose-TADDH.yaml obb:ultralytics/cfg/models/v8/yolov8-obb-TADDH.yaml 1. GroupNorm在FCOS论文中已经证实可以提升检测头定位和分类的性能. 2. 通过使用共享卷积,可以大幅减少参数数量,这使得模型更轻便,特别是在资源受限的设备上.并且在使用共享卷积的同时,为了应对每个检测头所检测的目标尺度不一致的问题,使用Scale层对特征进行缩放. 3. 
参照TOOD的思想,除了标签分配策略上的任务对齐,我们也在检测头上进行定制任务对齐的结构,现有的目标检测器头部通常使用独立的分类和定位分支,这会导致两个任务之间缺乏交互,TADDH通过特征提取器从多个卷积层中学习任务交互特征,得到联合特征,定位分支使用DCNV2和交互特征生成DCNV2的offset和mask,分类分支使用交互特征进行动态特征选择. 6. ultralytics/cfg/models/v8/yolov8-FDPN.yaml 自研特征聚焦扩散金字塔网络(Focusing Diffusion Pyramid Network) 1. 通过定制的特征聚焦模块与特征扩散机制,能让每个尺度的特征都具有详细的上下文信息,更有利于后续目标的检测与分类。 2. 定制的特征聚焦模块可以接受三个尺度的输入,其内部包含一个Inception-Style的模块,其利用一组并行深度卷积来捕获丰富的跨多个尺度的信息。 3. 通过扩散机制使具有丰富的上下文信息的特征进行扩散到各个检测尺度. 7. ultralytics/cfg/models/v8/yolov8-FDPN-DASI.yaml 使用[HCFNet](https://github.com/zhengshuchen/HCFNet)中的Dimension-Aware Selective Integration Module对自研的Focusing Diffusion Pyramid Network再次创新. 8. ultralytics/cfg/models/v8/yolov8-RGCSPELAN.yaml 自研RepGhostCSPELAN. 1. 参考GhostNet中的思想(主流CNN计算的中间特征映射存在广泛的冗余),采用廉价的操作生成一部分冗余特征图,以此来降低计算量和参数量。 2. 舍弃yolov5与yolov8中常用的BottleNeck,为了弥补舍弃残差块所带来的性能损失,在梯度流通分支上使用RepConv,以此来增强特征提取和梯度流通的能力,并且RepConv可以在推理的时候进行融合,一举两得。 3. 可以通过缩放因子控制RGCSPELAN的大小,使其可以兼顾小模型和大模型。 9. Lightweight Shared Convolutional Separamter BN Detection Head 基于自研轻量化检测头上,参考NASFPN的设计思路把GN换成BN,并且BN层参数不共享. detect:ultralytics/cfg/models/v8/yolov8-LSCSBD.yaml seg:ultralytics/cfg/models/v8/yolov8-seg-LSCSBD.yaml pose:ultralytics/cfg/models/v8/yolov8-pose-LSCSBD.yaml obb:ultralytics/cfg/models/v8/yolov8-obb-LSCSBD.yaml 1. 由于不同层级之间特征的统计量仍存在差异,Normalization layer依然是必须的,由于直接在共享参数的检测头中引入BN会导致其滑动平均值产生误差,而引入 GN 又会增加推理时的开销,因此我们参考NASFPN的做法,让检测头共享卷积层,而BN则分别独立计算。 10. ultralytics/cfg/models/v8/yolov8-EIEStem.yaml 1. 通过SobelConv分支,可以提取图像的边缘信息。由于Sobel滤波器可以检测图像中强度的突然变化,因此可以很好地捕捉图像的边缘特征。这些边缘特征在许多计算机视觉任务中都非常重要,例如图像分割和物体检测。 2. EIEStem模块还结合空间信息,除了边缘信息,EIEStem还通过池化分支提取空间信息,保留重要的空间信息。结合边缘信息和空间信息,可以帮助模型更好地理解图像内容。 3. 通过3D组卷积高效实现Sobel算子。 11. ultralytics/cfg/models/v8/yolov8-C2f-EIEM.yaml 提出了一种新的EIEStem模块,旨在作为图像识别任务中的高效前端模块。该模块结合了提取边缘信息的SobelConv分支和提取空间信息的卷积分支,能够学习到更加丰富的图像特征表示。 1. 边缘信息学习: 卷积神经网络 (CNN)通常擅长学习空间信息,但是对于提取图像中的边缘信息可能稍显不足。EIEStem 模块通过SobelConv分支,显式地提取图像的边缘特征。Sobel滤波器是一种经典的边缘检测滤波器,可以有效地捕捉图像中强度的突然变化,从而获得重要的边缘信息。 2. 
空间信息保留: 除了边缘信息,图像中的空间信息也同样重要。EIEStem模块通过一个额外的卷积分支 (conv_branch) 来提取空间信息。与SobelCon 分支不同,conv_branch提取的是原始图像的特征,可以保留丰富的空间细节。 3. 特征融合: EIEStem模块将来自SobelConv分支和conv_branch提取的特征进行融合 (concatenate)。 这种融合操作使得学习到的特征表示既包含了丰富的边缘信息,又包含了空间信息,能够更加全面地刻画图像内容。 12. ultralytics/cfg/models/v8/yolov8-ContextGuideFPN.yaml Context Guide Fusion Module(CGFM)是一个创新的特征融合模块,旨在改进YOLOv8中的特征金字塔网络(FPN)。该模块的设计考虑了多尺度特征融合过程中上下文信息的引导和自适应调整。 1. 上下文信息的有效融合:通过SE注意力机制,模块能够在特征融合过程中捕捉并利用重要的上下文信息,从而增强特征表示的有效性,并有效引导模型学习检测目标的信息,从而提高模型的检测精度。 2. 特征增强:通过权重化的特征重组操作,模块能够增强重要特征,同时抑制不重要特征,提升特征图的判别能力。 3. 简单高效:模块结构相对简单,不会引入过多的计算开销,适合在实时目标检测任务中应用。 这期视频讲解在B站:https://www.bilibili.com/video/BV1Vx4y1n7hZ/ 13. ultralytics/cfg/models/v8/yolov8-LSDECD.yaml 基于自研轻量化检测头上(LSCD),使用detail-enhanced convolution进一步改进,提高检测头的细节捕获能力,进一步改善检测精度. detect:ultralytics/cfg/models/v8/yolov8-LSDECD.yaml segment:ultralytics/cfg/models/v8/yolov8-seg-LSDECD.yaml pose:ultralytics/cfg/models/v8/yolov8-pose-LSDECD.yaml obb:ultralytics/cfg/models/v8/yolov8-obb-LSDECD.yaml 1. DEA-Net中设计了一个细节增强卷积(DEConv),具体来说DEConv将先验信息整合到普通卷积层,以增强表征和泛化能力。然后,通过使用重参数化技术,DEConv等效地转换为普通卷积,不需要额外的参数和计算成本。 14. ultralytics/cfg/models/v8/yolov8-C2f-SMPCGLU.yaml Self-moving Point Convolutional GLU模型改进C2f. SMP来源于[CVPR2023-SMPConv](https://github.com/sangnekim/SMPConv),Convolutional GLU来源于[TransNeXt CVPR2024](https://github.com/DaiShiResearch/TransNeXt). 1. 普通的卷积在面对数据中的多样性和复杂性时,可能无法捕捉到有效的特征,因此我们采用了SMPConv,其具备最新的自适应点移动机制,从而更好地捕捉局部特征,提高特征提取的灵活性和准确性。 2. 在SMPConv后添加CGLU,Convolutional GLU 结合了卷积和门控机制,能够选择性地通过信息通道,提高了特征提取的有效性和灵活性。 15. Re-CalibrationFPN 为了加强浅层和深层特征的相互交互能力,推出重校准特征金字塔网络(Re-CalibrationFPN). P2345:ultralytics/cfg/models/v8/yolov8-ReCalibrationFPN-P2345.yaml(带有小目标检测头的ReCalibrationFPN) P345:ultralytics/cfg/models/v8/yolov8-ReCalibrationFPN-P345.yaml P3456:ultralytics/cfg/models/v8/yolov8-ReCalibrationFPN-P3456.yaml(带有大目标检测头的ReCalibrationFPN) 1. 浅层语义较少,但细节丰富,有更明显的边界和减少失真。此外,深层蕴藏着丰富的物质语义信息。因此,直接融合低级具有高级特性的特性可能导致冗余和不一致。为了解决这个问题,我们提出了SBA模块,它有选择地聚合边界信息和语义信息来描绘更细粒度的物体轮廓和重新校准物体的位置。 2. 
相比传统的FPN结构,SBA模块引入了高分辨率和低分辨率特征之间的双向融合机制,使得特征之间的信息传递更加充分,进一步提升了多尺度特征融合的效果。 3. SBA模块通过自适应的注意力机制,根据特征图的不同分辨率和内容,自适应地调整特征的权重,从而更好地捕捉目标的多尺度特征。 16. ultralytics/cfg/models/v8/yolov8-CSP-PTB.yaml Cross Stage Partial - Partially Transformer Block 在计算机视觉任务中,Transformer结构因其强大的全局特征提取能力而受到广泛关注。然而,由于Transformer结构的计算复杂度较高,直接将其应用于所有通道会导致显著的计算开销。为了在保证高效特征提取的同时降低计算成本,我们设计了一种混合结构,将输入特征图分为两部分,分别由CNN和Transformer处理,结合了卷积神经网络(CNN)和Transformer机制的模块,旨在增强特征提取的能力。 我们提出了一种名为CSP_PTB(Cross Stage Partial - Partially Transformer Block)的模块,旨在结合CNN和Transformer的优势,通过对输入通道进行部分分配来优化计算效率和特征提取能力。 1. 融合局部和全局特征:多项研究表明,CNN的感受野大小较小,导致其只能提取局部特征,但Transformer的MHSA能够提取全局特征,能够同时利用两者的优势。 2. 保证高效特征提取的同时降低计算成本:为了能引入Transformer结构来提取全局特征又不想大幅度增加计算复杂度,因此提出Partially Transformer Block,只对部分通道使用TransformerBlock。 3. MHSA_CGLU包含Multi-Head-Self-Attention和[ConvolutionalGLU(TransNext CVPR2024)](https://github.com/DaiShiResearch/TransNeXt),其中Multi-Head-Self-Attention负责提取全局特征,ConvolutionalGLU用于增强非线性特征表达能力,ConvolutionalGLU相比于传统的FFN,具有更强的性能。 4. 可以根据不同的模型大小和具体的运行情况调节用于Transformer的通道数。 17. ultralytics/cfg/models/v8/yolov8-SOEP.yaml 小目标在正常的P3、P4、P5检测层上略显吃力,比较传统的做法是加上P2检测层来提升小目标的检测能力,但是同时也会带来一系列的问题,例如加上P2检测层后计算量过大、后处理更加耗时等问题,日益激发需要开发新的针对小目标有效的特征金字塔,我们基于原本的PAFPN上进行改进,提出SmallObjectEnhancePyramid,相对于传统的添加P2检测层,我们使用P2特征层经过SPDConv得到富含小目标信息的特征给到P3进行融合,然后使用CSP思想和基于[AAAI2024的OmniKernel](https://ojs.aaai.org/index.php/AAAI/article/view/27907)进行改进得到CSP-OmniKernel进行特征整合,OmniKernel模块由三个分支组成,即全局分支、大分支和局部分支,以有效地学习从全局到局部的特征表征,最终提高小目标的检测性能。(该模块需要在train.py中关闭amp、且在ultralytics/engine/validator.py 115行附近的self.args.half设置为False、跑其余改进记得修改回去!) 出现这个报错的:RuntimeError: cuFFT error: CUFFT_INTERNAL_ERROR,如果你是40系显卡,需要更新torch大于2.0,并且cuda大于12.0. 18. ultralytics/cfg/models/v8/yolov8-CGRFPN.yaml Context-Guided Spatial Feature Reconstruction Feature Pyramid Network. 1. 借鉴[ECCV2024-CGRSeg](https://github.com/nizhenliang/CGRSeg)中的Rectangular Self-Calibration Module经过精心设计,用于空间特征重建和金字塔上下文提取,它在水平和垂直方向上捕获全局上下文,并获得轴向全局上下文来显式地建模矩形关键区域. 2. 
PyramidContextExtraction Module使用金字塔上下文提取模块(PyramidContextExtraction),有效整合不同层级的特征信息,提升模型的上下文感知能力。 3. FuseBlockMulti 和 DynamicInterpolationFusion 这些模块用于多尺度特征的融合,通过动态插值和多特征融合,进一步提高了模型的多尺度特征表示能力和提升模型对复杂背景下目标的识别能力。 19. ultralytics/cfg/models/v8/yolov8-FeaturePyramidSharedConv.yaml 1. 多尺度特征提取 通过使用不同膨胀率的卷积层,模块能够提取不同尺度的特征。这对捕捉图像中不同大小和不同上下文的信息非常有利。 低膨胀率捕捉局部细节,高膨胀率捕捉全局上下文。 2. 参数共享 使用共享的卷积层 self.share_conv,大大减少了需要训练的参数数量。相比于每个膨胀率使用独立的卷积层,共享卷积层能够减少冗余,提升模型效率。 减少了模型的存储和计算开销,提升了计算效率。 3. 高效的通道变换 通过1x1卷积层 self.cv1 和 self.cv2,模块能够高效地调整通道数,并进行特征融合。1x1卷积层在减少参数量的同时还能保留重要的特征信息。 4. 更细粒度的特征提取 FeaturePyramidSharedConv 使用卷积操作进行特征提取,能够捕捉更加细粒度的特征。相比之下,SPPF 的池化操作可能会丢失一些细节信息。 卷积操作在特征提取时具有更高的灵活性和表达能力,可以更好地捕捉图像中的细节和复杂模式。 20. APT(Adaptive Power Transformation)-TAL. 为了使不同gt预测对的匹配质量和损失权重更具鉴别性,我们通过自定义的PowerTransformer显著增强高质量预测框的权重,抑制低质量预测框的影响,并使模型在学习的过程可以更关注质量高的预测框。 21. ultralytics/cfg/models/v8/yolov8-EMBSFPN.yaml 基于BIFPN、[MAF-YOLO](https://arxiv.org/pdf/2407.04381)、[CVPR2024 EMCAD](https://github.com/SLDGroup/EMCAD)提出全新的Efficient Multi-Branch&Scale FPN. Efficient Multi-Branch&Scale FPN拥有<轻量化>、<多尺度特征加权融合>、<多尺度高效卷积模块>、<高效上采样模块>、<全局异构核选择机制>。 1. 具有多尺度高效卷积模块和全局异构核选择机制,Trident网络的研究表明,具有较大感受野的网络更适合检测较大的物体,反之,较小尺度的目标则从较小的感受野中受益,因此我们在FPN阶段,对于不同尺度的特征层选择不同的多尺度卷积核以适应并逐步获得多尺度感知场信息。 2. 借鉴BIFPN中的多尺度特征加权融合,能把Concat换成Add来减少参数量和计算量的情况下,还能通过不同尺度特征的重要性进行自适用选择加权融合。 3. 高效上采样模块来源于CVPR2024-EMCAD中的EUCB,能够在保证一定效果的同时保持高效性。 22. ultralytics/cfg/models/v8/yolov8-CSP-PMSFA.yaml 自研模块:CSP-Partial Multi-Scale Feature Aggregation. 1. 部分多尺度特征提取:参考CVPR2020-GhostNet、CVPR2024-FasterNet的思想,采用高效的PartialConv,该模块能够从输入中提取多种尺度的特征信息,但它并不是在所有通道上进行这种操作,而是部分(Partial)地进行,从而提高了计算效率。 2. 增强的特征融合: 最后的 1x1 卷积层通过将不同尺度的特征融合在一起,同时使用残差连接将输入特征与处理后的特征相加,有效保留了原始信息并引入了新的多尺度信息,从而提高模型的表达能力。 23. ultralytics/cfg/models/v8/yolov8-MutilBackbone-DAF.yaml 自研MutilBackbone-DynamicAlignFusion. 1. 为了避免在浅层特征图上消耗过多计算资源,设计的MutilBackbone共享一个stem的信息,这个设计有利于避免计算量过大,推理时间过大的问题。 2. 
为了避免不同Backbone信息融合出现不同来源特征之间的空间差异,我们为此设计了DynamicAlignFusion,其先通过融合来自两个不同模块学习到的特征,然后生成一个名为DynamicAlignWeight的权重去调整各自的特征,最后使用一个可学习的通道权重,其可以根据输入特征动态调整两条路径的权重,从而增强模型对不同特征的适应能力。 24. Rep Shared Convolutional Detection Head 自研重参数轻量化检测头. detect:ultralytics/cfg/models/v8/yolov8-RSCD.yaml seg:ultralytics/cfg/models/v8/yolov8-seg-RSCD.yaml pose:ultralytics/cfg/models/v8/yolov8-pose-RSCD.yaml obb:ultralytics/cfg/models/v8/yolov8-obb-RSCD.yaml 1. 通过使用共享卷积,可以大幅减少参数数量,这使得模型更轻便,特别是在资源受限的设备上.但由于共享参数可能限制模型的表达能力,因为不同特征可能需要不同的卷积核来捕捉复杂的模式。共享参数可能无法充分捕捉这些差异。为了尽量弥补实现轻量化所采取的共享卷积带来的负面影响,我们使用可重参数化卷积,通过引入更多的可学习参数,网络可以更有效地从数据中提取特征,进而弥补轻量化模型后可能带来的精度丢失问题,并且重参数化卷积可以大大提升参数利用率,并且在推理阶段与普通卷积无差,为模型带来无损的优化方案。 2. 在使用共享卷积的同时,为了应对每个检测头所检测的目标尺度不一致的问题,使用Scale层对特征进行缩放. 25. ultralytics/cfg/models/v8/yolov8-CSP-FreqSpatial.yaml FreqSpatial 是一个融合时域和频域特征的卷积神经网络(CNN)模块。该模块通过在时域和频域中提取特征,旨在捕捉不同层次的空间和频率信息,以增强模型在处理图像数据时的鲁棒性和表示能力。模块的主要特点是将 Scharr 算子(用于边缘检测)与 时域卷积 和 频域卷积 结合,通过多种视角捕获图像的结构特征。 1. 时域特征提取:从原始图像中提取出基于空间结构的特征,主要捕捉图像的细节、边缘信息等。 2. 频域特征提取:从频率域中提取出频率相关的模式,捕捉到图像的低频和高频成分,能够帮助模型在全局和局部的尺度上提取信息。 3. 特征融合:将时域和频域的特征进行加权相加,得到最终的输出特征图。这种加权融合允许模型同时考虑空间结构信息和频率信息,从而增强模型在多种场景下的表现能力。 26. ultralytics/cfg/models/v8/yolov8-C2f-MutilScaleEdgeInformationSelect.yaml 基于自研CSP-MutilScaleEdgeInformationEnhance再次创新. 我们提出了一个 多尺度边缘信息选择模块(MutilScaleEdgeInformationSelect),其目的是从多尺度边缘信息中高效选择与目标任务高度相关的关键特征。为了实现这一目标,我们引入了一个具有聚焦更重要区域能力的注意力机制[ICCV2023 DualDomainSelectionMechanism, DSM](https://github.com/c-yn/FocalNet)。该机制通过聚焦图像中更重要的区域(如复杂边缘和高频信号区域),在多尺度特征中自适应地筛选具有更高任务相关性的特征,从而显著提升了特征选择的精准度和整体模型性能。 27. GlobalEdgeInformationTransfer 实现版本1:ultralytics/cfg/models/v8/yolov8-GlobalEdgeInformationTransfer1.yaml 实现版本2:ultralytics/cfg/models/v8/yolov8-GlobalEdgeInformationTransfer2.yaml 实现版本3:ultralytics/cfg/models/v8/yolov8-GlobalEdgeInformationTransfer3.yaml 众所周知,物体框的定位非常之依赖物体的边缘信息,但是对于常规的目标检测网络来说,没有任何组件能提高网络对物体边缘信息的关注度,我们需要开发一个能让边缘信息融合到各个尺度所提取的特征中的组件,因此我们提出一个名为GlobalEdgeInformationTransfer(GEIT)的模块,其可以帮助我们把浅层特征中提取到的边缘信息传递到整个backbone上,并与不同尺度的特征进行融合。 1. 
由于原始图像中含有大量背景信息,因此从原始图像上直接提取边缘信息传递到整个backbone上会给网络的学习带来噪声,而且浅层的卷积层会帮助我们过滤不必要的背景信息,因此我们选择在网络的浅层开发一个名为MutilScaleEdgeInfoGenetator的模块,其会利用网络的浅层特征层去生成多个尺度的边缘信息特征图并投放到主干的各个尺度中进行融合。 2. 对于下采样方面的选择,我们需要较为谨慎,我们的目标是保留并增强边缘信息,同时进行下采样,选择MaxPool 会更合适。它能够保留局部区域的最强特征,更好地体现边缘信息。因为 AvgPool 更适用于需要平滑或均匀化特征的场景,但在保留细节和边缘信息方面的表现不如 MaxPool。 3. 对于融合部分,ConvEdgeFusion巧妙地结合边缘信息和普通卷积特征,提出了一种新的跨通道特征融合方式。首先,使用conv_channel_fusion进行边缘信息与普通卷积特征的跨通道融合,帮助模型更好地整合不同来源的特征。然后采用conv_3x3_feature_extract进一步提取融合后的特征,以增强模型对局部细节的捕捉能力。最后通过conv_1x1调整输出特征维度。 28. ultralytics/cfg/models/v8/yolov8-C2f-DIMB.yaml 自研模块DynamicInceptionDWConv2d.(详细请看项目内配置文件.md) 29. ultralytics/cfg/models/v8/yolov8-HAFB-1.yaml 自研Hierarchical Attention Fusion Block.(详细请看项目内配置文件.md) 30. ultralytics/cfg/models/v8/yolov8-HAFB-2.yaml HAFB另外一种使用方法. 31. ultralytics/cfg/models/v8/yolov8-MutilBackbone-HAFB.yaml yolov8-MutilBackbone-DAF.yaml基础上用上HAFB. ### BackBone系列 1. ultralytics/cfg/models/v8/yolov8-efficientViT.yaml (CVPR2023)efficientViT替换yolov8主干. 2. ultralytics/cfg/models/v8/yolov8-fasternet.yaml (CVPR2023)fasternet替换yolov8主干. 3. ultralytics/cfg/models/v8/yolov8-timm.yaml 使用timm支持的主干网络替换yolov8主干. 4. ultralytics/cfg/models/v8/yolov8-convnextv2.yaml 使用convnextv2网络替换yolov8主干. 5. ultralytics/cfg/models/v8/yolov8-EfficientFormerV2.yaml 使用EfficientFormerV2网络替换yolov8主干.(需要看[常见错误和解决方案的第五点](#a)) 6. ultralytics/cfg/models/v8/yolov8-vanillanet.yaml vanillanet替换yolov8主干. 7. ultralytics/cfg/models/v8/yolov8-LSKNet.yaml LSKNet(2023旋转目标检测SOTA的主干)替换yolov8主干. 8. ultralytics/cfg/models/v8/yolov8-swintransformer.yaml SwinTransformer-Tiny替换yolov8主干. 9. ultralytics/cfg/models/v8/yolov8-repvit.yaml [RepViT](https://github.com/THU-MIG/RepViT/tree/main)替换yolov8主干. 10. ultralytics/cfg/models/v8/yolov8-CSwinTransformer.yaml 使用[CSWin-Transformer(CVPR2022)](https://github.com/microsoft/CSWin-Transformer/tree/main)替换yolov8主干.(需要看[常见错误和解决方案的第五点](#a)) 11. ultralytics/cfg/models/v8/yolov8-HGNetV2.yaml 使用HGNetV2作为YOLOV8的backbone. 12. 
ultralytics/cfg/models/v8/yolov8-unireplknet.yaml 使用[UniRepLKNet](https://github.com/AILab-CVC/UniRepLKNet/tree/main)替换yolov8主干. 13. ultralytics/cfg/models/v8/yolov8-TransNeXt.yaml 使用[TransNeXt](https://github.com/DaiShiResearch/TransNeXt)改进yolov8的backbone.(需要看[常见错误和解决方案的第五点](#a)) 14. ultralytics/cfg/models/rt-detr/yolov8-rmt.yaml 使用[CVPR2024 RMT](https://arxiv.org/abs/2309.11523)改进rtdetr的主干. 15. ultralytics/cfg/models/v8/yolov8-pkinet.yaml 使用[CVPR2024 PKINet](https://github.com/PKINet/PKINet)改进backbone.(需要安装mmcv和mmengine) 16. ultralytics/cfg/models/v8/yolov8-mobilenetv4.yaml 使用[MobileNetV4](https://github.com/jaiwei98/MobileNetV4-pytorch/tree/main)改进yolov8-backbone. 17. ultralytics/cfg/models/v8/yolov8-starnet.yaml 使用[StarNet CVPR2024](https://github.com/ma-xu/Rewrite-the-Stars/tree/main)改进yolov8-backbone. 18. ultralytics/cfg/models/v8/yolov8-mambaout.yaml 使用[CVPR2025 MambaOut](https://github.com/yuweihao/MambaOut)中的MambaOut替换BackBone. 19. ultralytics/cfg/models/v8/yolov8-lsnet.yaml 使用[CVPR2025 LSNet](https://github.com/THU-MIG/lsnet)中的lsnet替换yolov8的backbone. 20. ultralytics/cfg/models/v8/yolov8-overlock.yaml 使用[CVPR2025 OverLock](https://arxiv.org/pdf/2502.20087)中的overlock-backbone替换backbone. ### SPPF系列 1. ultralytics/cfg/models/v8/yolov8-FocalModulation.yaml 使用[Focal Modulation](https://github.com/microsoft/FocalNet)替换SPPF. 2. ultralytics/cfg/models/v8/yolov8-SPPF-LSKA.yaml 使用[LSKA](https://github.com/StevenLauHKHK/Large-Separable-Kernel-Attention)注意力机制改进SPPF,增强多尺度特征提取能力. 3. ultralytics/cfg/models/v8/yolov8-AIFI.yaml 使用[RT-DETR](https://arxiv.org/pdf/2304.08069.pdf)中的Attention-based Intrascale Feature Interaction(AIFI)改进yolov8. 4. ultralytics/cfg/models/v8/yolov8-AIFIRepBN.yaml 使用[ICML-2024 SLAB](https://github.com/xinghaochen/SLAB)中的RepBN改进AIFI. 5. ultralytics/cfg/models/v8/yolov8-ASSR.yaml 使用[CVPR2025 MambaIR](https://github.com/csguoh/MambaIR)中的Attentive State Space Group改进yolov8. ### Neck系列 1. ultralytics/cfg/models/v8/yolov8-bifpn.yaml 添加BIFPN到yolov8中. 
其中BIFPN中有三个可选参数: 1. Fusion 其中BIFPN中的Fusion模块支持五种: weight, adaptive, concat, bifpn(default), SDI 其中weight, adaptive, concat出自[paper链接-Figure 3](https://openreview.net/pdf?id=q2ZaVU6bEsT), SDI出自[U-NetV2](https://github.com/yaoppeng/U-Net_v2) 2. node_mode 支持大部分C2f-XXX结构. 3. head_channel BIFPN中的通道数,默认设置为256. 2. ultralytics/cfg/models/v8/yolov8-slimneck.yaml 使用VoVGSCSP\VoVGSCSPC和GSConv替换yolov8 neck中的C2f和Conv. 3. Asymptotic Feature Pyramid Network[reference](https://github.com/gyyang23/AFPN/tree/master) a. ultralytics/cfg/models/v8/yolov8-AFPN-P345.yaml b. ultralytics/cfg/models/v8/yolov8-AFPN-P345-Custom.yaml c. ultralytics/cfg/models/v8/yolov8-AFPN-P2345.yaml d. ultralytics/cfg/models/v8/yolov8-AFPN-P2345-Custom.yaml 其中Custom中的block支持大部分C2f-XXX结构. 4. ultralytics/cfg/models/v8/yolov8-RCSOSA.yaml 使用[RCS-YOLO](https://github.com/mkang315/RCS-YOLO/tree/main)中的RCSOSA替换C2f. 5. ultralytics/cfg/models/v8/yolov8-goldyolo.yaml 利用华为2023最新GOLD-YOLO中的Gatherand-Distribute进行改进特征融合模块 6. ultralytics/cfg/models/v8/yolov8-GFPN.yaml 使用[DAMO-YOLO](https://github.com/tinyvision/DAMO-YOLO)中的RepGFPN改进Neck. 7. ultralytics/cfg/models/v8/yolov8-EfficientRepBiPAN.yaml 使用[YOLOV6](https://github.com/meituan/YOLOv6/tree/main)中的EfficientRepBiPAN改进Neck. 8. ultralytics/cfg/models/v8/yolov8-ASF.yaml 使用[ASF-YOLO](https://github.com/mkang315/ASF-YOLO)中的Attentional Scale Sequence Fusion改进yolov8. 9. ultralytics/cfg/models/v8/yolov8-SDI.yaml 使用[U-NetV2](https://github.com/yaoppeng/U-Net_v2)中的 Semantics and Detail Infusion Module对yolov8中的feature fusion部分进行重设计. 10. ultralytics/cfg/models/v8/yolov8-HSFPN.yaml 使用[MFDS-DETR](https://github.com/JustlfC03/MFDS-DETR)中的HS-FPN改进yolov8的neck. 11. ultralytics/cfg/models/v8/yolov8-CSFCN.yaml 使用[Context and Spatial Feature Calibration for Real-Time Semantic Segmentation](https://github.com/kaigelee/CSFCN/tree/main)中的Context and Spatial Feature Calibration模块改进yolov8. 12. 
ultralytics/cfg/models/v8/yolov8-CGAFusion.yaml 使用[DEA-Net](https://github.com/cecret3350/DEA-Net)中的content-guided attention fusion改进yolov8-neck. 13. ultralytics/cfg/models/v8/yolov8-SDFM.yaml 使用[PSFusion](https://github.com/Linfeng-Tang/PSFusion)中的superficial detail fusion module改进yolov8-neck. 14. ultralytics/cfg/models/v8/yolov8-PSFM.yaml 使用[PSFusion](https://github.com/Linfeng-Tang/PSFusion)中的profound semantic fusion module改进yolov8-neck. 15. ultralytics/cfg/models/v8/yolov8-GLSA.yaml 使用[GLSA](https://github.com/Barrett-python/DuAT)模块改进yolov8的neck. 16. ultralytics/cfg/models/v8/yolov8-CTrans.yaml 使用[[AAAI2022] UCTransNet](https://github.com/McGregorWwww/UCTransNet/tree/main)中的ChannelTransformer改进yolov8-neck.(需要看[常见错误和解决方案的第五点](#a)) 17. ultralytics/cfg/models/v8/yolov8-p6-CTrans.yaml 使用[[AAAI2022] UCTransNet](https://github.com/McGregorWwww/UCTransNet/tree/main)中的ChannelTransformer改进yolov8-neck.(带有p6版本)(需要看[常见错误和解决方案的第五点](#a)) 18. ultralytics/cfg/models/v8/yolov8-MAFPN.yaml 使用[MAF-YOLO](https://arxiv.org/pdf/2407.04381)的MAFPN改进Neck. 19. Cross-Layer Feature Pyramid Transformer. P345:ultralytics/cfg/models/v8/yolov8-CFPT.yaml P2345:ultralytics/cfg/models/v8/yolov8-CFPT-P2345.yaml P3456:ultralytics/cfg/models/v8/yolov8-CFPT-P3456.yaml P23456:ultralytics/cfg/models/v8/yolov8-CFPT-P23456.yaml 使用[CFPT](https://github.com/duzw9311/CFPT/tree/main)改进neck. 20. ultralytics/cfg/models/v8/yolov8-hyper.yaml 使用[Hyper-YOLO TPAMI2025](https://www.arxiv.org/pdf/2408.04804)中的Hypergraph Computation in Semantic Space改进yolov8. 21. ultralytics/cfg/models/v8/yolov8-msga.yaml 使用[MSA^2 Net](https://github.com/xmindflow/MSA-2Net)中的Multi-Scale Adaptive Spatial Attention Gate改进yolov8-neck. 22. ultralytics/cfg/models/v8/yolov8-WFU.yaml 使用[ACMMM2024 WFEN](https://github.com/PRIS-CV/WFEN)中的Wavelet Feature Upgrade改进yolov8-neck. 23. 
ultralytics/cfg/models/v8/yolov8-mscafsa.yaml 使用[BIBM2024 Spatial-Frequency Dual Domain Attention Network For Medical Image Segmentation](https://github.com/nkicsl/SF-UNet)的Frequency-Spatial Attention和Multi-scale Progressive Channel Attention改进yolov8-neck. 24. ultralytics/cfg/models/v8/yolov8-fsa.yaml 使用[BIBM2024 Spatial-Frequency Dual Domain Attention Network For Medical Image Segmentation](https://github.com/nkicsl/SF-UNet)的Frequency-Spatial Attention改进yolov8. 25. ultralytics/cfg/models/v8/yolov8-MFM.yaml 使用[CVPR2024 DCMPNet](https://github.com/zhoushen1/DCMPNet)中的MFM改进neck. 26. ultralytics/cfg/models/v8/yolov8-GDSAFusion.yaml 使用[CVPR2025 OverLock](https://arxiv.org/pdf/2502.20087)中的GDSAFusion改进neck. 27. ultralytics/cfg/models/v8/yolov8-RFPN.yaml 使用[ECCV2024 rethinking-fpn](https://github.com/AlanLi1997/rethinking-fpn)的SNI和GSConvE改进YOLOV8-neck. 28. ultralytics/cfg/models/v8/yolov8-PST.yaml 使用[Pyramid Sparse Transformer](https://arxiv.org/abs/2505.12772)中的Pyramid Sparse Transformer改进neck. 29. ultralytics/cfg/models/v8/yolov8-HS-FPN.yaml 使用[AAAI2025 HS-FPN](https://github.com/ShiZican/HS-FPN/tree/main)中的HFP和SDP改进yolo-neck. 30. ultralytics/cfg/models/v8/yolov8-LCA.yaml 使用[CVPR2025 HVI](https://arxiv.org/pdf/2502.20272)中的LCA改进yolov8-neck. 31. ultralytics/cfg/models/v8/yolov8-HFFE.yaml 使用[TGRS2025 HAFNet](https://ieeexplore.ieee.org/document/11154006)中的HFFE改进yolov8-neck. ### Head系列 1. ultralytics/cfg/models/v8/yolov8-dyhead.yaml 添加基于注意力机制的目标检测头到yolov8中. 2. ultralytics/cfg/models/v8/yolov8-EfficientHead.yaml 对检测头进行重设计,支持10种轻量化检测头.详细请看ultralytics/nn/extra_modules/head.py中的Detect_Efficient class. 3. ultralytics/cfg/models/v8/yolov8-aux.yaml 参考YOLOV7-Aux对YOLOV8添加额外辅助训练头,在训练阶段参与训练,在最终推理阶段去掉. 其中辅助训练头的损失权重系数可在ultralytics/utils/loss.py中的class v8DetectionLoss中的__init__函数中的self.aux_loss_ratio设定,默认值参考yolov7为0.25. 4. ultralytics/cfg/models/v8/yolov8-seg-EfficientHead.yaml(实例分割) 对检测头进行重设计,支持10种轻量化检测头.详细请看ultralytics/nn/extra_modules/head.py中的Detect_Efficient class. 5. 
ultralytics/cfg/models/v8/yolov8-SEAMHead.yaml 使用[YOLO-Face V2](https://arxiv.org/pdf/2208.02019v2.pdf)中的遮挡感知注意力改进Head,使其有效地处理遮挡场景. 6. ultralytics/cfg/models/v8/yolov8-MultiSEAMHead.yaml 使用[YOLO-Face V2](https://arxiv.org/pdf/2208.02019v2.pdf)中的遮挡感知注意力改进Head,使其有效地处理遮挡场景. 7. ultralytics/cfg/models/v8/yolov8-PGI.yaml 使用[YOLOV9](https://github.com/WongKinYiu/yolov9)的programmable gradient information改进YOLOV8.(PGI模块可在训练结束后去掉) 8. Lightweight Asymmetric Detection Head detect:ultralytics/cfg/models/v8/yolov8-LADH.yaml segment:ultralytics/cfg/models/v8/yolov8-seg-LADH.yaml pose:ultralytics/cfg/models/v8/yolov8-pose-LADH.yaml obb:ultralytics/cfg/models/v8/yolov8-obb-LADH.yaml 使用[Faster and Lightweight: An Improved YOLOv5 Object Detector for Remote Sensing Images](https://www.mdpi.com/2072-4292/15/20/4974)中的Lightweight Asymmetric Detection Head改进yolov8-head. 9. Localization Quality Estimation Head 此模块出自[GFocalV2](https://arxiv.org/abs/2011.12885). detect:ultralytics/cfg/models/v8/yolov8-LQEHead.yaml segment:ultralytics/cfg/models/v8/yolov8-seg-LQE.yaml pose:ultralytics/cfg/models/v8/yolov8-pose-LQE.yaml obb:ultralytics/cfg/models/v8/yolov8-obb-LQE.yaml ### Label Assign系列 1. Adaptive Training Sample Selection匹配策略. 在ultralytics/utils/loss.py中的class v8DetectionLoss中自行选择对应的self.assigner即可. ### PostProcess系列 1. soft-nms(IoU,GIoU,DIoU,CIoU,EIoU,SIoU,ShapeIoU) soft-nms替换nms.(建议:仅在val.py时候使用,具体替换请看20240122版本更新说明) 2. ultralytics/cfg/models/v8/yolov8-nmsfree.yaml 仿照yolov10的思想采用双重标签分配和一致匹配度量进行训练,后处理不需要NMS! ### 上下采样算子 1. ultralytics/cfg/models/v8/yolov8-ContextGuidedDown.yaml 使用[CGNet](https://github.com/wutianyiRosun/CGNet/tree/master)中的Light-weight Context Guided DownSample进行下采样. 2. ultralytics/cfg/models/v8/yolov8-SPDConv.yaml 使用[SPDConv](https://github.com/LabSAINT/SPD-Conv/tree/main)进行下采样. 3. ultralytics/cfg/models/v8/yolov8-dysample.yaml 使用[ICCV2023 DySample](https://arxiv.org/abs/2308.15085)改进yolov8-neck中的上采样. 4.
ultralytics/cfg/models/v8/yolov8-CARAFE.yaml 使用[ICCV2019 CARAFE](https://arxiv.org/abs/1905.02188)改进yolov8-neck中的上采样. 5. ultralytics/cfg/models/v8/yolov8-HWD.yaml 使用[Haar wavelet downsampling](https://www.sciencedirect.com/science/article/abs/pii/S0031320323005174)改进yolov8的下采样.(请关闭AMP情况下使用) 6. ultralytics/cfg/models/v8/yolov8-v7DS.yaml 使用[YOLOV7 CVPR2023](https://arxiv.org/abs/2207.02696)的下采样结构改进YOLOV8中的下采样. 7. ultralytics/cfg/models/v8/yolov8-ADown.yaml 使用[YOLOV9](https://github.com/WongKinYiu/yolov9)的下采样结构改进YOLOV8中的下采样. 8. ultralytics/cfg/models/v8/yolov8-SRFD.yaml 使用[A Robust Feature Downsampling Module for Remote Sensing Visual Tasks](https://ieeexplore.ieee.org/document/10142024)改进yolov8的下采样. 9. ultralytics/cfg/models/v8/yolov8-WaveletPool.yaml 使用[Wavelet Pooling](https://openreview.net/forum?id=rkhlb8lCZ)改进YOLOV8的上采样和下采样。 10. ultralytics/cfg/models/v8/yolov8-LDConv.yaml 使用[LDConv](https://github.com/CV-ZhangXin/LDConv/tree/main)改进下采样. 11. ultralytics/cfg/models/v8/yolov8-PSConv.yaml 使用[AAAI2025 Pinwheel-shaped Convolution and Scale-based Dynamic Loss for Infrared Small Target Detection](https://github.com/JN-Yang/PConv-SDloss-Data)中的Pinwheel-shaped Convolution改进yolov8. 12. ultralytics/cfg/models/v8/yolov8-EUCB.yaml 使用[CVPR2024 EMCAD](https://github.com/SLDGroup/EMCAD)中的EUCB改进yolov8的上采样. 13. ultralytics/cfg/models/v8/yolov8-LoGStem.yaml 使用[LEGNet](https://github.com/lwCVer/LEGNet)中的LoGStem改进Stem(第一第二层卷积). 14. ultralytics/cfg/models/v8/yolov8-FourierConv.yaml 使用[MIA2025 Fourier Convolution Block with global receptive field for MRI reconstruction](https://www.sciencedirect.com/science/article/abs/pii/S1361841524002743)中的FourierConv改进Conv. 15. ultralytics/cfg/models/v8/yolov8-GCConv.yaml 使用[CVPR2025 Golden Cudgel Network](https://github.com/gyyang23/GCNet)中的GCConv改进下采样. 16. ultralytics/cfg/models/v8/yolov8-RepStem.yaml 使用[ICCV2023 FastVit](https://arxiv.org/pdf/2303.14189)中的RepStem改进yolov8下采样. ### YOLOV8-C2f系列 1. 
ultralytics/cfg/models/v8/yolov8-C2f-Faster.yaml 使用C2f-Faster替换C2f.(使用FasterNet中的FasterBlock替换C2f中的Bottleneck) 2. ultralytics/cfg/models/v8/yolov8-C2f-ODConv.yaml 使用C2f-ODConv替换C2f.(使用ODConv替换C2f中的Bottleneck中的Conv) 3. ultralytics/cfg/models/v8/yolov8-C2f-ODConv.yaml 使用C2f-ODConv替换C2f.(使用ODConv替换C2f中的Bottleneck中的Conv) 4. ultralytics/cfg/models/v8/yolov8-C2f-Faster-EMA.yaml 使用C2f-Faster-EMA替换C2f.(C2f-Faster-EMA推荐可以放在主干上,Neck和head部分可以选择C2f-Faster) 5. ultralytics/cfg/models/v8/yolov8-C2f-DBB.yaml 使用C2f-DBB替换C2f.(使用DiverseBranchBlock替换C2f中的Bottleneck中的Conv) 6. ultralytics/cfg/models/v8/yolov8-C2f-CloAtt.yaml 使用C2f-CloAtt替换C2f.(使用CloFormer中的具有全局和局部特征的注意力机制添加到C2f中的Bottleneck中)(需要看[常见错误和解决方案的第五点](#a)) 7. ultralytics/cfg/models/v8/yolov8-C2f-SCConv.yaml SCConv(CVPR2020 http://mftp.mmcheng.net/Papers/20cvprSCNet.pdf)与C2f融合. 8. ultralytics/cfg/models/v8/yolov8-C2f-SCcConv.yaml ScConv(CVPR2023 https://openaccess.thecvf.com/content/CVPR2023/papers/Li_SCConv_Spatial_and_Channel_Reconstruction_Convolution_for_Feature_Redundancy_CVPR_2023_paper.pdf)与C2f融合. (取名为SCcConv的原因是在windows下命名是不区分大小写的) 9. ultralytics/cfg/models/v8/yolov8-KernelWarehouse.yaml 使用[Towards Parameter-Efficient Dynamic Convolution](https://github.com/OSVAI/KernelWarehouse)添加到yolov8中. 使用此模块需要注意,在epoch0-20的时候精度会非常低,过了20epoch会正常. 10. ultralytics/cfg/models/v8/yolov8-C2f-DySnakeConv.yaml [DySnakeConv](https://github.com/YaoleiQi/DSCNet)与C2f融合. 11. ultralytics/cfg/models/v8/yolov8-C2f-DCNV2.yaml 使用C2f-DCNV2替换C2f.(DCNV2为可变形卷积V2) 12. ultralytics/cfg/models/v8/yolov8-C2f-DCNV3.yaml 使用C2f-DCNV3替换C2f.([DCNV3](https://github.com/OpenGVLab/InternImage)为可变形卷积V3(CVPR2023,众多排行榜的SOTA)) 官方中包含了一些指定版本的DCNV3 whl包,下载后直接pip install xxx即可.具体和安装DCNV3可看百度云链接中的视频. 13. ultralytics/cfg/models/v8/yolov8-C2f-OREPA.yaml 使用C2f-OREPA替换C2f.[Online Convolutional Re-parameterization (CVPR2022)](https://github.com/JUGGHM/OREPA_CVPR2022/tree/main) 14. 
ultralytics/cfg/models/v8/yolov8-C2f-REPVGGOREPA.yaml 使用C2f-REPVGGOREPA替换C2f.[Online Convolutional Re-parameterization (CVPR2022)](https://github.com/JUGGHM/OREPA_CVPR2022/tree/main) 15. ultralytics/cfg/models/v8/yolov8-C2f-DCNV4.yaml 使用[DCNV4](https://github.com/OpenGVLab/DCNv4)改进C2f.(请关闭AMP进行训练,使用教程请看20240116版本更新说明) 16. ultralytics/cfg/models/v8/yolov8-C2f-ContextGuided.yaml 使用[CGNet](https://github.com/wutianyiRosun/CGNet/tree/master)中的Light-weight Context Guided改进C2f. 17. ultralytics/cfg/models/v8/yolov8-C2f-MSBlock.yaml 使用[YOLO-MS](https://github.com/FishAndWasabi/YOLO-MS/tree/main)中的MSBlock改进C2f. 18. ultralytics/cfg/models/v8/yolov8-C2f-DLKA.yaml 使用[deformableLKA](https://github.com/xmindflow/deformableLKA)改进C2f. 19. ultralytics/cfg/models/v8/yolov8-C2f-DAttention.yaml 使用[Vision Transformer with Deformable Attention(CVPR2022)](https://github.com/LeapLabTHU/DAT)改进C2f.(需要看[常见错误和解决方案的第五点](#a)) 使用注意点请看百度云视频.(DAttention(Vision Transformer with Deformable Attention CVPR2022)使用注意说明.) 20. 使用[ParC-Net](https://github.com/hkzhang-git/ParC-Net/tree/main)中的ParC_Operator改进C2f.(需要看[常见错误和解决方案的第五点](#a)) 使用注意点请看百度云视频.(20231031更新说明) 21. ultralytics/cfg/models/v8/yolov8-C2f-DWR.yaml 使用[DWRSeg](https://arxiv.org/abs/2212.01173)中的Dilation-wise Residual(DWR)模块,加强从网络高层的可扩展感受野中提取特征. 22. ultralytics/cfg/models/v8/yolov8-C2f-RFAConv.yaml 使用[RFAConv](https://github.com/Liuchen1997/RFAConv/tree/main)中的RFAConv改进yolov8. 23. ultralytics/cfg/models/v8/yolov8-C2f-RFCBAMConv.yaml 使用[RFAConv](https://github.com/Liuchen1997/RFAConv/tree/main)中的RFCBAMConv改进yolov8. 24. ultralytics/cfg/models/v8/yolov8-C2f-RFCAConv.yaml 使用[RFAConv](https://github.com/Liuchen1997/RFAConv/tree/main)中的RFCAConv改进yolov8. 25. ultralytics/cfg/models/v8/yolov8-C2f-FocusedLinearAttention.yaml 使用[FLatten Transformer(ICCV2023)](https://github.com/LeapLabTHU/FLatten-Transformer)中的FocusedLinearAttention改进C2f.(需要看[常见错误和解决方案的第五点](#a)) 使用注意点请看百度云视频.(20231114版本更新说明.) 26. 
ultralytics/cfg/models/v8/yolov8-C2f-MLCA.yaml 使用[Mixed Local Channel Attention 2023](https://github.com/wandahangFY/MLCA/tree/master)改进C2f.(用法请看百度云视频-20231129版本更新说明) 27. ultralytics/cfg/models/v8/yolov8-C2f-AKConv.yaml 使用[AKConv 2023](https://github.com/CV-ZhangXin/AKConv)改进C2f.(用法请看百度云视频-20231129版本更新说明) 28. ultralytics/cfg/models/v8/yolov8-C2f-UniRepLKNetBlock.yaml 使用[UniRepLKNet](https://github.com/AILab-CVC/UniRepLKNet/tree/main)中的UniRepLKNetBlock改进C2f. 29. ultralytics/cfg/models/v8/yolov8-C2f-DRB.yaml 使用[UniRepLKNet](https://github.com/AILab-CVC/UniRepLKNet/tree/main)中的DilatedReparamBlock改进C2f. 30. ultralytics/cfg/models/v8/yolov8-C2f-AggregatedAtt.yaml 使用[TransNeXt](https://github.com/DaiShiResearch/TransNeXt)中的聚合感知注意力改进C2f.(需要看[常见错误和解决方案的第五点](#a)) 31. ultralytics/cfg/models/v8/yolov8-C2f-SWC.yaml 使用[shift-wise conv](https://arxiv.org/abs/2401.12736)改进yolov8中的C2f. 32. ultralytics/cfg/models/v8/yolov8-C2f-iRMB.yaml 使用[EMO ICCV2023](https://github.com/zhangzjn/EMO)中的iRMB改进C2f. 33. ultralytics/cfg/models/v8/yolov8-C2f-VSS.yaml 使用最新的Mamba架构[Mamba-UNet中的VSS](https://github.com/ziyangwang007/Mamba-UNet)对C2f中的BottleNeck进行改进,使其能更有效地捕获图像中的复杂细节和更广泛的语义上下文. 34. ultralytics/cfg/models/v8/yolov8-C2f-LVMB.yaml 使用最新的Mamba架构[Mamba-UNet中的VSS](https://github.com/ziyangwang007/Mamba-UNet)与Cross Stage Partial进行结合,使其能更有效地捕获图像中的复杂细节和更广泛的语义上下文. 35. ultralytics/cfg/models/v8/yolov8-RepNCSPELAN.yaml 使用[YOLOV9](https://github.com/WongKinYiu/yolov9)中的RepNCSPELAN进行改进yolov8. 36. ultralytics/cfg/models/v8/yolov8-C2f-DynamicConv.yaml 使用[CVPR2024 parameternet](https://arxiv.org/pdf/2306.14525v2.pdf)中的DynamicConv改进C2f. 37. ultralytics/cfg/models/v8/yolov8-C2f-GhostDynamicConv.yaml 使用[CVPR2024 parameternet](https://arxiv.org/pdf/2306.14525v2.pdf)中的GhostModule改进C2f. 38. ultralytics/cfg/models/v8/yolov8-C2f-RVB.yaml 使用[CVPR2024 RepViT](https://github.com/THU-MIG/RepViT/tree/main)中的RepViTBlock改进C2f. 39. 
ultralytics/cfg/models/v8/yolov8-DGCST.yaml 使用[Lightweight Object Detection](https://arxiv.org/abs/2403.01736)中的Dynamic Group Convolution Shuffle Transformer改进yolov8. 40. ultralytics/cfg/models/v8/yolov8-C2f-RetBlock.yaml 使用[CVPR2024 RMT](https://arxiv.org/abs/2309.11523)中的RetBlock改进C2f. 41. ultralytics/cfg/models/v8/yolov8-C2f-PKI.yaml 使用[CVPR2024 PKINet](https://github.com/PKINet/PKINet)中的PKIModule和CAA模块改进C2f. 42. ultralytics/cfg/models/v8/yolov8-RepNCSPELAN_CAA.yaml 使用[CVPR2024 PKINet](https://github.com/PKINet/PKINet)中的CAA模块改进RepNCSPELAN. 43. ultralytics/cfg/models/v8/yolov8-C2f-fadc.yaml 使用[CVPR2024 Frequency-Adaptive Dilated Convolution](https://github.com/Linwei-Chen/FADC)改进C2f. 44. ultralytics/cfg/models/v8/yolov8-C2f-PPA.yaml 使用[HCFNet](https://github.com/zhengshuchen/HCFNet)中的Parallelized Patch-Aware Attention Module改进C2f. 45. ultralytics/cfg/models/v8/yolov8-C2f-Star.yaml 使用[StarNet CVPR2024](https://github.com/ma-xu/Rewrite-the-Stars/tree/main)中的StarBlock改进C2f. 46. ultralytics/cfg/models/v8/yolov8-C2f-KAN.yaml KAN In! Mamba Out! Kolmogorov-Arnold Networks. 目前支持: 1. FastKANConv2DLayer 2. KANConv2DLayer 3. KALNConv2DLayer 4. KACNConv2DLayer 5. KAGNConv2DLayer 47. ultralytics/cfg/models/v8/yolov8-C2f-DEConv.yaml 使用[DEA-Net](https://github.com/cecret3350/DEA-Net)中的detail-enhanced convolution改进C2f. 48. ultralytics/cfg/models/v8/yolov8-C2f-Heat.yaml 使用[vHeat](https://github.com/MzeroMiko/vHeat/tree/main)中的HeatBlock改进C2f. 49. ultralytics/cfg/models/v8/yolov8-C2f-WTConv.yaml 使用[ECCV2024 Wavelet Convolutions for Large Receptive Fields](https://github.com/BGU-CS-VIL/WTConv)中的WTConv改进C2f-BottleNeck. 50. ultralytics/cfg/models/v8/yolov8-C2f-FMB.yaml 使用[ECCV2024 SMFANet](https://github.com/Zheng-MJ/SMFANet/tree/main)的Feature Modulation block改进C2f. 51. ultralytics/cfg/models/v8/yolov8-C2f-gConv.yaml 使用[Rethinking Performance Gains in Image Dehazing Networks](https://arxiv.org/abs/2209.11448)的gConvblock改进C2f. 52. 
ultralytics/cfg/models/v8/yolov8-C2f-WDBB.yaml 使用[YOLO-MIF](https://github.com/wandahangFY/YOLO-MIF)中的WDBB改进c2f. 53. ultralytics/cfg/models/v8/yolov8-C2f-DeepDBB.yaml 使用[YOLO-MIF](https://github.com/wandahangFY/YOLO-MIF)中的DeepDBB改进c2f. 54. ultralytics/cfg/models/v8/yolov8-C2f-AdditiveBlock.yaml 使用[CAS-ViT](https://github.com/Tianfang-Zhang/CAS-ViT)中的AdditiveBlock改进c2f. 55. ultralytics/cfg/models/v8/yolov8-C2f-MogaBlock.yaml 使用[MogaNet ICLR2024](https://github.com/Westlake-AI/MogaNet)中的MogaBlock改进C2f. 56. ultralytics/cfg/models/v8/yolov8-C2f-IdentityFormer.yaml 使用[Metaformer TPAMI2024](https://github.com/sail-sg/metaformer)中的IdentityFormer改进c2f. 57. ultralytics/cfg/models/v8/yolov8-C2f-RandomMixing.yaml 使用[Metaformer TPAMI2024](https://github.com/sail-sg/metaformer)中的RandomMixingFormer改进c2f.(需要看[常见错误和解决方案的第五点](#a)) 58. ultralytics/cfg/models/v8/yolov8-C2f-PoolingFormer.yaml 使用[Metaformer TPAMI2024](https://github.com/sail-sg/metaformer)中的PoolingFormer改进c2f. 59. ultralytics/cfg/models/v8/yolov8-C2f-ConvFormer.yaml 使用[Metaformer TPAMI2024](https://github.com/sail-sg/metaformer)中的ConvFormer改进c2f. 60. ultralytics/cfg/models/v8/yolov8-C2f-CaFormer.yaml 使用[Metaformer TPAMI2024](https://github.com/sail-sg/metaformer)中的CaFormer改进c2f. 61. ultralytics/cfg/models/v8/yolov8-C2f-SFHF.yaml 使用[SFHformer ECCV2024](https://github.com/deng-ai-lab/SFHformer)中的block改进C2f. 62. ultralytics/cfg/models/v8/yolov8-C2f-MSM.yaml 使用[Revitalizing Convolutional Network for Image Restoration TPAMI2024](https://zhuanlan.zhihu.com/p/720777160)中的MSM改进C2f. 63. ultralytics/cfg/models/v8/yolov8-C2f-RAB.yaml 使用[Pattern Recognition 2024|DRANet](https://github.com/WenCongWu/DRANet)中的RAB(residual attention block)改进C2f. 64. ultralytics/cfg/models/v8/yolov8-C2f-HDRAB.yaml 使用[Pattern Recognition 2024|DRANet](https://github.com/WenCongWu/DRANet)中的HDRAB(hybrid dilated residual attention block)改进C2f. 65.
ultralytics/cfg/models/v8/yolov8n-C2f-LFE.yaml 使用[Efficient Long-Range Attention Network for Image Super-resolution ECCV2022](https://github.com/xindongzhang/ELAN)中的Local feature extraction改进C2f. 66. ultralytics/cfg/models/v8/yolov8-C2f-SFA.yaml 使用[FreqFormer](https://github.com/JPWang-CS/FreqFormer)的Frequency-aware Cascade Attention-SFA改进C2f. 67. ultralytics/cfg/models/v8/yolov8-C2f-CTA.yaml 使用[FreqFormer](https://github.com/JPWang-CS/FreqFormer)的Frequency-aware Cascade Attention-CTA改进C2f. 68. ultralytics/cfg/models/v8/yolov8-C2f-CAMixer.yaml 使用[CAMixerSR CVPR2024](https://github.com/icandle/CAMixerSR)中的CAMixer改进C2f. 69. ultralytics/cfg/models/v8/yolov8-MAN.yaml 使用[Hyper-YOLO TPAMI2025](https://www.arxiv.org/pdf/2408.04804)中的Mixed Aggregation Network改进yolov8. 70. ultralytics/cfg/models/v8/yolov8-C2f-HFERB.yaml 使用[ICCV2023 CRAFT-SR](https://github.com/AVC2-UESTC/CRAFT-SR)中的high-frequency enhancement residual block改进C2f. 71. ultralytics/cfg/models/v8/yolov8-C2f-DTAB.yaml 使用[AAAI2025 TBSN](https://github.com/nagejacob/TBSN)中的DTAB改进C2f. 72. ultralytics/cfg/models/v8/yolov8-C2f-JDPM.yaml 使用[ECCV2024 FSEL](https://github.com/CSYSI/FSEL)中的joint domain perception module改进C2f. 73. ultralytics/cfg/models/v8/yolov8-C2f-ETB.yaml 使用[ECCV2024 FSEL](https://github.com/CSYSI/FSEL)中的entanglement transformer block改进C2f. 74. ultralytics/cfg/models/v8/yolov8-C2f-AP.yaml 使用[AAAI2025 Pinwheel-shaped Convolution and Scale-based Dynamic Loss for Infrared Small Target Detection](https://github.com/JN-Yang/PConv-SDloss-Data)中的Asymmetric Padding bottleneck改进C2f. 75. ultralytics/cfg/models/v8/yolov8-C2f-Strip.yaml 使用[Strip R-CNN](https://arxiv.org/pdf/2501.03775)中的StripBlock改进C2f. 76. ultralytics/cfg/models/v8/yolov8-C2f-Kat.yaml 使用[ICLR2025 Kolmogorov-Arnold Transformer](https://github.com/Adamdad/kat)中的KAT改进C2f. 77. 
ultralytics/cfg/models/v8/yolov8-C2f-GlobalFilter.yaml 使用[T-PAMI Global Filter Networks for Image Classification](https://github.com/raoyongming/GFNet)中的GlobalFilterBlock和[TransNeXt CVPR2024](https://github.com/DaiShiResearch/TransNeXt)中的Convolutional GLU改进C2f. 78. ultralytics/cfg/models/v8/yolov8-C2f-DynamicFilter.yaml 使用[AAAI2024 FFT-Based Dynamic Token Mixer for Vision](https://github.com/okojoalg/dfformer)中的DynamicFilter改进C2f. 79. ultralytics/cfg/models/v8/yolov8-RepHMS.yaml 使用[MHAF-YOLO](https://github.com/yang-0201/MHAF-YOLO)中的RepHMS改进yolov8. 80. ultralytics/cfg/models/v8/yolov8-C2f-SAVSS.yaml 使用[CVPR2025 SCSegamba](https://github.com/Karl1109/SCSegamba)中的Structure-Aware Scanning Strategy改进C2f. 81. ultralytics/cfg/models/v8/yolov8-C2f-mambaout.yaml 使用[CVPR2025 MambaOut](https://github.com/yuweihao/MambaOut)中的MambaOutBlock改进C2f. 82. ultralytics/cfg/models/v8/yolov8-C2f-EfficientVIM.yaml 使用[CVPR2025 EfficientViM](https://github.com/mlvlab/EfficientViM)中的EfficientViMBlock改进C2f. 83. ultralytics/cfg/models/v8/yolov8-C2f-LEGM.yaml 使用[CVPR2024 DCMPNet](https://github.com/zhoushen1/DCMPNet)中的LEGM改进C2f. 84. ultralytics/cfg/models/v8/yolov8-C2f-LSBlock.yaml 使用[CVPR2025 LSNet](https://github.com/THU-MIG/lsnet)中的LSBlock改进C2f. 85. ultralytics/cfg/models/v8/yolov8-C2f-LFEM.yaml 使用[LEGNet](https://github.com/lwCVer/LEGNet)中的LFEModule改进C2f. 86. ultralytics/cfg/models/v8/yolov8-C2f-RCB.yaml 使用[CVPR2025 OverLock](https://arxiv.org/pdf/2502.20087)中的RepConvBlock改进C2f. 87. ultralytics/cfg/models/v8/yolov8-C2f-TransMamba.yaml 使用[TransMamba](https://github.com/sunshangquan/TransMamba)的TransMamba改进C2f 88. ultralytics/cfg/models/v8/yolov8-C2f-EVS.yaml 使用[CVPR2025 EVSSM](https://github.com/kkkls/EVSSM)中的EVS改进C2f 89. ultralytics/cfg/models/v8/yolov8-C2f-EBlock.yaml 使用[CVPR2025 DarkIR](https://github.com/cidautai/DarkIR)中的EBlock改进C2f. 90. ultralytics/cfg/models/v8/yolov8-C2f-DBlock.yaml 使用[CVPR2025 DarkIR](https://github.com/cidautai/DarkIR)中的DBlock改进C2f. 91. 
ultralytics/cfg/models/v8/yolov8-C2f-SFSConv.yaml 使用[CVPR2024 SFSConv](https://github.com/like413/SFS-Conv)的SFSConv改进C2f. 92. ultralytics/cfg/models/v8/yolov8-FCM.yaml 使用[AAAI2025 FBRT-YOLO](https://github.com/galaxy-oss/FCM)的模块改进yolov8. 93. ultralytics/cfg/models/v8/yolov8-C2f-GroupMamba.yaml 使用[CVPR2025 GroupMamba](https://github.com/Amshaker/GroupMamba)中的GroupMambaBlock改进C2f. 94. ultralytics/cfg/models/v8/yolov8-C2f-MambaVision.yaml 使用[CVPR2025 MambaVision](https://github.com/NVlabs/MambaVision)中的MambaVision改进C2f. 95. ultralytics/cfg/models/v8/yolov8-C2f-FourierConv.yaml 使用[MIA2025 Fourier Convolution Block with global receptive field for MRI reconstruction](https://www.sciencedirect.com/science/article/abs/pii/S1361841524002743)中的FourierConv改进C2f. 96. ultralytics/cfg/models/v8/yolov8-C2f-GLVSS.yaml 使用[TGRS2025 UMFormer](https://github.com/takeyoutime/UMFormer)中的GLVSS改进C2f. 97. ultralytics/cfg/models/v8/yolov8-C2f-ESC.yaml 使用[ICCV2025 ESC: Emulating Self-attention with Convolution for Efficient Image Super-Resolution](https://github.com/dslisleedh/ESC)中的ESC改进C2f. 98. ultralytics/cfg/models/v8/yolov8-C2f-ConvAttn.yaml 使用[ICCV2025 ESC: Emulating Self-attention with Convolution for Efficient Image Super-Resolution](https://github.com/dslisleedh/ESC)中的ConvAttn改进C2f. 99. ultralytics/cfg/models/v8/yolov8-C2f-UniConv.yaml 使用[ICCV2025 UniConvBlock](https://github.com/ai-paperwithcode/UniConvNet)中的UniConvBlock改进C2f. 100. ultralytics/cfg/models/v8/yolov8-C2f-GCConv.yaml 使用[CVPR2025 Golden Cudgel Network](https://github.com/gyyang23/GCNet)中的GCConv改进C2f. 101. ultralytics/cfg/models/v8/yolov8-C2f-CFBlock.yaml 使用[AAAI2024 SCTNet](https://arxiv.org/pdf/2312.17071)中的CFBlock改进C2f. 102. ultralytics/cfg/models/v8/yolov8-C2f-CSSC.yaml 使用[TGRS2025 ASCNet](https://ieeexplore.ieee.org/document/10855453)中的CSSC改进C2f. 103. ultralytics/cfg/models/v8/yolov8-C2f-CNCM.yaml 使用[TGRS2025 ASCNet](https://ieeexplore.ieee.org/document/10855453)中的CNCM改进C2f. 104. 
ultralytics/cfg/models/v8/yolov8-C2f-HFRB.yaml 使用[ICCV2025 HFRB](https://arxiv.org/pdf/2507.10689)中的HFRB改进C2f. 105. ultralytics/cfg/models/v8/yolov8-C2f-EVA.yaml 使用[ICIP2025 BEVANET](https://arxiv.org/pdf/2508.07300)中的EVA改进C2f. 106. ultralytics/cfg/models/v8/yolov8-C2f-RMBC.yaml 使用[PlainUSR](https://arxiv.org/pdf/2409.13435)中的RepMBConv改进C2f. 107. ultralytics/cfg/models/v8/yolov8-C2f-RMBC-LA.yaml 使用[PlainUSR](https://arxiv.org/pdf/2409.13435)中的RepMBConv和Local Importance-based Attention改进C2f. 108. ultralytics/cfg/models/v8/yolov8-C2f-IEL.yaml 使用[CVPR2025 HVI](https://arxiv.org/pdf/2502.20272)中的IEL改进C2f. ### 组合系列 1. ultralytics/cfg/models/v8/yolov8-fasternet-bifpn.yaml fasternet与bifpn的结合. 其中BIFPN中有三个可选参数: 1. Fusion 其中BIFPN中的Fusion模块支持五种: weight, adaptive, concat, bifpn(default), SDI 其中weight, adaptive, concat出自[paper链接-Figure 3](https://openreview.net/pdf?id=q2ZaVU6bEsT), SDI出自[U-NetV2](https://github.com/yaoppeng/U-Net_v2) 2. node_mode 其中目前(后续会更新喔)支持这些[结构](#b) 3. head_channel BIFPN中的通道数,默认设置为256. 2. ultralytics/cfg/models/v8/yolov8-ELA-HSFPN-TADDH.yaml 使用[Efficient Local Attention](https://arxiv.org/abs/2403.01123)改进HSFPN,使用自研任务对齐动态检测头(TADDH)改进Head. 3. ultralytics/cfg/models/v8/yolov8-FDPN-TADDH.yaml 自研结构的融合. 1. 自研特征聚焦扩散金字塔网络(Focusing Diffusion Pyramid Network) 2. 自研任务对齐动态检测头(Task Align Dynamic Detection Head) 4. ultralytics/cfg/models/v8/yolov8-starnet-C2f-Star-LSCD.yaml 轻量化模型组合. 1. CVPR2024-StarNet Backbone. 2. C2f-Star. 3. Lightweight Shared Convolutional Detection Head. ## YOLOV10系列 #### 以下配置文件都基于v10n,如果需要使用其他大小的模型(s,m,b,l,x)可以看项目视频百度云链接-YOLOV10模型大小切换教程. ### 二次创新系列 1. SlideLoss and EMASlideLoss.[Yolo-Face V2](https://github.com/Krasjet-Yu/YOLO-FaceV2/blob/master/utils/loss.py) 在ultralytics/utils/loss.py中的class v8DetectionLoss进行设定. 2. ultralytics/cfg/models/v10/yolov10n-RevCol.yaml 使用[(ICLR2023)Reversible Column Networks](https://github.com/megvii-research/RevCol)对yolov10主干进行重设计,里面的支持更换不同的C2f-Block. 3.
ultralytics/cfg/models/v10/yolov10n-BIMAFPN.yaml 利用BIFPN的思想对[MAF-YOLO](https://arxiv.org/pdf/2407.04381)的MAFPN进行二次改进得到BIMAFPN. 4. ultralytics/cfg/models/v10/yolov10n-C2f-AdditiveBlock-CGLU.yaml 使用[CAS-ViT](https://github.com/Tianfang-Zhang/CAS-ViT)中的AdditiveBlock和[TransNeXt CVPR2024](https://github.com/DaiShiResearch/TransNeXt)中的Convolutional GLU改进c2f. 5. ultralytics/cfg/models/v10/yolov10n-ASF-P2.yaml 在ultralytics/cfg/models/v8/yolov8-ASF.yaml的基础上进行二次创新,引入P2检测层并对网络结构进行优化. 6. ultralytics/cfg/models/v10/yolov10n-ASF-DySample.yaml 使用[ASF-YOLO](https://github.com/mkang315/ASF-YOLO)中的Attentional Scale Sequence Fusion与[ICCV2023 DySample](https://arxiv.org/abs/2308.15085)组合得到Dynamic Sample Attentional Scale Sequence Fusion. 7. ultralytics/cfg/models/v10/yolov10n-goldyolo-asf.yaml 利用华为2023最新GOLD-YOLO中的Gatherand-Distribute与[ASF-YOLO](https://github.com/mkang315/ASF-YOLO)中的Attentional Scale Sequence Fusion进行二次创新改进yolov10的neck. 8. ultralytics/cfg/models/v10/yolov10n-C2f-MSMHSA-CGLU.yaml 使用[CMTFNet](https://github.com/DrWuHonglin/CMTFNet/tree/main)中的M2SA和[TransNeXt CVPR2024](https://github.com/DaiShiResearch/TransNeXt)中的Convolutional GLU改进c2f. 9. ultralytics/cfg/models/v10/yolov10n-C2f-IdentityFormer-CGLU.yaml 使用[Metaformer TPAMI2024](https://github.com/sail-sg/metaformer)中的IdentityFormer和[TransNeXt CVPR2024](https://github.com/DaiShiResearch/TransNeXt)中的CGLU改进c2f. 10. ultralytics/cfg/models/v10/yolov10n-C2f-RandomMixing-CGLU.yaml 使用[Metaformer TPAMI2024](https://github.com/sail-sg/metaformer)中的RandomMixing和[TransNeXt CVPR2024](https://github.com/DaiShiResearch/TransNeXt)中的CGLU改进c2f. 11. ultralytics/cfg/models/v10/yolov10n-C2f-PoolingFormer-CGLU.yaml 使用[Metaformer TPAMI2024](https://github.com/sail-sg/metaformer)中的PoolingFormer和[TransNeXt CVPR2024](https://github.com/DaiShiResearch/TransNeXt)中的CGLU改进c2f. 12. 
ultralytics/cfg/models/v10/yolov10n-C2f-ConvFormer-CGLU.yaml 使用[Metaformer TPAMI2024](https://github.com/sail-sg/metaformer)中的ConvFormer和[TransNeXt CVPR2024](https://github.com/DaiShiResearch/TransNeXt)中的CGLU改进c2f. 13. ultralytics/cfg/models/v10/yolov10n-C2f-CaFormer-CGLU.yaml 使用[Metaformer TPAMI2024](https://github.com/sail-sg/metaformer)中的CaFormer和[TransNeXt CVPR2024](https://github.com/DaiShiResearch/TransNeXt)中的CGLU改进c2f. 14. ultralytics/cfg/models/v10/yolov10n-dyhead-DCNV3.yaml 使用[DCNV3](https://github.com/OpenGVLab/InternImage)替换DyHead中的DCNV2. 15. ultralytics/cfg/models/v10/yolov10n-dyhead-DCNV4.yaml 使用[DCNV4](https://github.com/OpenGVLab/DCNv4)对DyHead进行二次创新. 16. ultralytics/cfg/models/v10/yolov10n-C2f-iRMB-Cascaded.yaml 使用[EfficientViT CVPR2023](https://github.com/microsoft/Cream/tree/main/EfficientViT)中的CascadedGroupAttention对[EMO ICCV2023](https://github.com/zhangzjn/EMO)中的iRMB进行二次创新来改进C2f. 17. ultralytics/cfg/models/v10/yolov10n-C2f-iRMB-DRB.yaml 使用[UniRepLKNet](https://github.com/AILab-CVC/UniRepLKNet/tree/main)中的DilatedReparamBlock对[EMO ICCV2023](https://github.com/zhangzjn/EMO)中的iRMB进行二次创新来改进C2f. 18. ultralytics/cfg/models/v10/yolov10n-C2f-iRMB-SWC.yaml 使用[shift-wise conv](https://arxiv.org/abs/2401.12736)对[EMO ICCV2023](https://github.com/zhangzjn/EMO)中的iRMB进行二次创新来改进C2f. 19. ultralytics/cfg/models/v10/yolov10n-ELA-HSFPN.yaml 使用[Efficient Local Attention](https://arxiv.org/abs/2403.01123)改进HSFPN. 20. ultralytics/cfg/models/v10/yolov10n-CA-HSFPN.yaml 使用[Coordinate Attention CVPR2021](https://github.com/houqb/CoordAttention)改进HSFPN. 21. ultralytics/cfg/models/v10/yolov10n-CAA-HSFPN.yaml 使用[CVPR2024 PKINet](https://github.com/PKINet/PKINet)中的CAA模块改进HSFPN. 22. ultralytics/cfg/models/v10/yolov10n-MAN-Faster.yaml 使用[Hyper-YOLO](https://www.arxiv.org/pdf/2408.04804)中的 Mixed Aggregation Network和[FasterNet CVPR2023](https://github.com/JierunChen/FasterNet)中的Faster-Block进行二次创新改进yolov10. 23. 
ultralytics/cfg/models/v10/yolov10n-MAN-FasterCGLU.yaml 使用[Hyper-YOLO](https://www.arxiv.org/pdf/2408.04804)中的 Mixed Aggregation Network和[FasterNet CVPR2023](https://github.com/JierunChen/FasterNet)中的Faster-Block和[TransNeXt CVPR2024](https://github.com/DaiShiResearch/TransNeXt)中的Convolutional GLU进行二次创新改进yolov10. 24. ultralytics/cfg/models/v10/yolov10n-MAN-Star.yaml 使用[Hyper-YOLO](https://www.arxiv.org/pdf/2408.04804)中的 Mixed Aggregation Network和[StarNet CVPR2024](https://github.com/ma-xu/Rewrite-the-Stars/tree/main)中的StarBlock进行二次创新改进yolov10. 25. ultralytics/cfg/models/v10/yolov10n-MutilBackbone-MSGA.yaml 使用[MSA^2 Net](https://github.com/xmindflow/MSA-2Net)中的Multi-Scale Adaptive Spatial Attention Gate对自研系列MutilBackbone再次创新. 26. ultralytics/cfg/models/v10/yolov10n-slimneck-WFU.yaml 使用[ACMMM2024 WFEN](https://github.com/PRIS-CV/WFEN)中的Wavelet Feature Upgrade对slimneck二次创新. 27. ultralytics/cfg/models/v10/yolov10n-MAN-FasterCGLU-WFU.yaml 使用[ACMMM2024 WFEN](https://github.com/PRIS-CV/WFEN)中的Wavelet Feature Upgrade和[Hyper-YOLO](https://www.arxiv.org/pdf/2408.04804)中的 Mixed Aggregation Network和[FasterNet CVPR2023](https://github.com/JierunChen/FasterNet)中的Faster-Block和[TransNeXt CVPR2024](https://github.com/DaiShiResearch/TransNeXt)中的Convolutional GLU进行二次创新改进yolov10. 28. ultralytics/cfg/models/v10/yolov10n-CDFA.yaml 使用[ACMMM2024 WFEN](https://github.com/PRIS-CV/WFEN)中的WaveletConv与[AAAI2025 ConDSeg](https://github.com/Mengqi-Lei/ConDSeg)的ContrastDrivenFeatureAggregation结合改进yolov10. 29. ultralytics/cfg/models/v10/yolov10n-C2f-StripCGLU.yaml 使用[Strip R-CNN](https://arxiv.org/pdf/2501.03775)中的StripBlock和[TransNeXt CVPR2024](https://github.com/DaiShiResearch/TransNeXt)中的Convolutional GLU改进C2f. 30. ultralytics/cfg/models/v10/yolov10n-C2f-Faster-KAN.yaml 使用[ICLR2025 Kolmogorov-Arnold Transformer](https://github.com/Adamdad/kat)中的KAN对(CVPR2023)fasternet中的FastetBlock进行二次创新. 31. 
ultralytics/cfg/models/v10/yolov10n-C2f-DIMB-KAN.yaml 在yolov10n-C2f-DIMB.yaml的基础上把mlp模块换成[ICLR2025 Kolmogorov-Arnold Transformer](https://github.com/Adamdad/kat)中的KAN. 32. ultralytics/cfg/models/v10/yolov10n-C2f-EfficientVIM-CGLU.yaml 使用[CVPR2025 EfficientViM](https://github.com/mlvlab/EfficientViM)中的EfficientViMBlock和[TransNeXt CVPR2024](https://github.com/DaiShiResearch/TransNeXt)中的Convolutional GLU改进C2f. 33. ultralytics/cfg/models/v10/yolov10n-LSCD-LQE.yaml Localization Quality Estimation Head-LSCD-NMSFree,Localization Quality Estimation此模块出自[GFocalV2](https://arxiv.org/abs/2011.12885). 34. ultralytics/cfg/models/v10/yolov10n-EUCB-SC.yaml 使用[CVPR2024 EMCAD](https://github.com/SLDGroup/EMCAD)中的EUCB和[CVPR2025 BHViT](https://github.com/IMRL/BHViT)中的ShiftChannelMix改进yolov10的上采样. 35. ultralytics/cfg/models/v10/yolov10n-EMBSFPN-SC.yaml 在ultralytics/cfg/models/v10/yolov10n-EMBSFPN.yaml方案上引入[CVPR2025 BHViT](https://github.com/IMRL/BHViT)中的ShiftChannelMix. 36. ultralytics/cfg/models/v10/yolov10n-MFMMAFPN.yaml 使用[CVPR2024 DCMPNet](https://github.com/zhoushen1/DCMPNet)中的MFM对[MAF-YOLO](https://arxiv.org/pdf/2407.04381)的MAFPN进行二次创新. 37. ultralytics/cfg/models/v10/yolov10n-MBSMFFPN.yaml 使用[CVPR2024 DCMPNet](https://github.com/zhoushen1/DCMPNet)中的MFM对yolov10n-EMBSFPN.yaml再次创新 Multi-Branch&Scale Modulation-Fusion FPN. 38. ultralytics/cfg/models/v10/yolov10n-C2f-mambaout-LSConv.yaml 使用[CVPR2025 LSNet](https://github.com/THU-MIG/lsnet)的LSConv与[CVPR2025 MambaOut](https://github.com/yuweihao/MambaOut)中的MambaOutBlock二次创新后改进C2f. 39. ultralytics/cfg/models/v10/yolov10n-SOEP-RFPN-MFM.yaml 使用[ECCV2024 rethinking-fpn](https://github.com/AlanLi1997/rethinking-fpn)的SNI和GSConvE和[CVPR2024 DCMPNet](https://github.com/zhoushen1/DCMPNet)中的MFM对原创改进SOEP再次创新. 40. ultralytics/cfg/models/v10/yolov10n-SOEP-PST.yaml 使用[Pyramid Sparse Transformer](https://arxiv.org/abs/2505.12772)中的Pyramid Sparse Transformer改进SOEP. 41. 
ultralytics/cfg/models/v10/yolov10n-MAN-GCConv.yaml 使用[CVPR2025 Golden Cudgel Network](https://github.com/gyyang23/GCNet)中的GCConv改进[Hyper-YOLO TPAMI2025](https://www.arxiv.org/pdf/2408.04804)中的Mixed Aggregation Network. ### 自研系列 1. ultralytics/cfg/models/v10/yolov10n-C2f-EMSC.yaml Efficient Multi-Scale Conv.自研模块,具体讲解请看百度云链接中的视频. 2. ultralytics/cfg/models/v10/yolov10n-C2f-EMSCP.yaml Efficient Multi-Scale Conv Plus.自研模块,具体讲解请看百度云链接中的视频. 3. ultralytics/cfg/models/v10/yolov10n-LAWDS.yaml Light Adaptive-weight downsampling.自研模块,具体讲解请看百度云链接中的视频. 4. ultralytics/cfg/models/v10/yolov10n-LSCD.yaml 自研轻量化检测头.(Lightweight Shared Convolutional Detection Head) 1. GroupNorm在FCOS论文中已经证实可以提升检测头定位和分类的性能. 2. 通过使用共享卷积,可以大幅减少参数数量,这使得模型更轻便,特别是在资源受限的设备上. 3. 在使用共享卷积的同时,为了应对每个检测头所检测的目标尺度不一致的问题,使用Scale层对特征进行缩放. 综合以上,我们可以让检测头做到参数量更少、计算量更少的情况下,尽可能减少精度的损失. 5. ultralytics/cfg/models/v10/yolov10n-CGRFPN.yaml Context-Guided Spatial Feature Reconstruction Feature Pyramid Network. 1. 借鉴[ECCV2024-CGRSeg](https://github.com/nizhenliang/CGRSeg)中的Rectangular Self-Calibration Module经过精心设计,用于空间特征重建和金字塔上下文提取,它在水平和垂直方向上捕获全局上下文,并获得轴向全局上下文来显式地建模矩形关键区域. 2. PyramidContextExtraction Module使用金字塔上下文提取模块(PyramidContextExtraction),有效整合不同层级的特征信息,提升模型的上下文感知能力。 3. FuseBlockMulti 和 DynamicInterpolationFusion 这些模块用于多尺度特征的融合,通过动态插值和多特征融合,进一步提高了模型的多尺度特征表示能力和提升模型对复杂背景下目标的识别能力。 6. ultralytics/cfg/models/v10/yolov10n-FeaturePyramidSharedConv.yaml 1. 多尺度特征提取 通过使用不同膨胀率的卷积层,模块能够提取不同尺度的特征。这对捕捉图像中不同大小和不同上下文的信息非常有利。 低膨胀率捕捉局部细节,高膨胀率捕捉全局上下文。 2. 参数共享 使用共享的卷积层 self.share_conv,大大减少了需要训练的参数数量。相比于每个膨胀率使用独立的卷积层,共享卷积层能够减少冗余,提升模型效率。 减少了模型的存储和计算开销,提升了计算效率。 3. 高效的通道变换 通过1x1卷积层 self.cv1 和 self.cv2,模块能够高效地调整通道数,并进行特征融合。1x1卷积层在减少参数量的同时还能保留重要的特征信息。 4. 更细粒度的特征提取 FeaturePyramidSharedConv 使用卷积操作进行特征提取,能够捕捉更加细粒度的特征。相比之下,SPPF 的池化操作可能会丢失一些细节信息。 卷积操作在特征提取时具有更高的灵活性和表达能力,可以更好地捕捉图像中的细节和复杂模式。 7. APT(Adaptive Power Transformation)-TAL. 为了使不同gt预测对的匹配质量和损失权重更具鉴别性,我们通过自定义的PowerTransformer显著增强高质量预测框的权重,抑制低质量预测框的影响,并使模型在学习的过程可以更关注质量高的预测框。 8. 
ultralytics/cfg/models/v10/yolov10n-SOEP.yaml 小目标在正常的P3、P4、P5检测层上略显吃力,比较传统的做法是加上P2检测层来提升小目标的检测能力,但是同时也会带来一系列的问题,例如加上P2检测层后计算量过大、后处理更加耗时等问题,因此迫切需要开发新的针对小目标有效的特征金字塔,我们基于原本的PAFPN上进行改进,提出SmallObjectEnhancePyramid,相对于传统的添加P2检测层,我们使用P2特征层经过SPDConv得到富含小目标信息的特征给到P3进行融合,然后使用CSP思想和基于[AAAI2024的OmniKernel](https://ojs.aaai.org/index.php/AAAI/article/view/27907)进行改进得到CSP-OmniKernel进行特征整合,OmniKernel模块由三个分支组成,即全局分支、大分支和局部分支,以有效地学习从全局到局部的特征表征,最终提高小目标的检测性能。 9. ultralytics/cfg/models/v10/yolov10n-EMBSFPN.yaml 基于BIFPN、[MAF-YOLO](https://arxiv.org/pdf/2407.04381)、[CVPR2024 EMCAD](https://github.com/SLDGroup/EMCAD)提出全新的Efficient Multi-Branch&Scale FPN. Efficient Multi-Branch&Scale FPN拥有<轻量化>、<多尺度特征加权融合>、<多尺度高效卷积模块>、<高效上采样模块>、<全局异构核选择机制>。 1. 具有多尺度高效卷积模块和全局异构核选择机制,Trident网络的研究表明,具有较大感受野的网络更适合检测较大的物体,反之,较小尺度的目标则从较小的感受野中受益,因此我们在FPN阶段,对于不同尺度的特征层选择不同的多尺度卷积核以适应并逐步获得多尺度感知场信息。 2. 借鉴BIFPN中的多尺度特征加权融合,能把Concat换成Add来减少参数量和计算量的情况下,还能通过不同尺度特征的重要性进行自适应选择加权融合。 3. 高效上采样模块来源于CVPR2024-EMCAD中的EUCB,能够在保证一定效果的同时保持高效性。 10. ultralytics/cfg/models/v10/yolov10n-CSP-PMSFA.yaml 自研模块:CSP-Partial Multi-Scale Feature Aggregation. 1. 部分多尺度特征提取:参考CVPR2020-GhostNet、CVPR2024-FasterNet的思想,采用高效的PartialConv,该模块能够从输入中提取多种尺度的特征信息,但它并不是在所有通道上进行这种操作,而是部分(Partial)地进行,从而提高了计算效率。 2. 增强的特征融合: 最后的 1x1 卷积层通过将不同尺度的特征融合在一起,同时使用残差连接将输入特征与处理后的特征相加,有效保留了原始信息并引入了新的多尺度信息,从而提高模型的表达能力。 11. ultralytics/cfg/models/v10/yolov10n-MutilBackbone-DAF.yaml 自研MutilBackbone-DynamicAlignFusion. 1. 为了避免在浅层特征图上消耗过多计算资源,设计的MutilBackbone共享一个stem的信息,这个设计有利于避免计算量过大,推理时间过大的问题。 2. 为了避免不同Backbone信息融合出现不同来源特征之间的空间差异,我们为此设计了DynamicAlignFusion,其先通过融合来自两个不同模块学习到的特征,然后生成一个名为DynamicAlignWeight去调整各自的特征,最后使用一个可学习的通道权重,其可以根据输入特征动态调整两条路径的权重,从而增强模型对不同特征的适应能力。 12. ultralytics/cfg/models/v10/yolov10n-TADDH.yaml 自研任务对齐动态检测头 1. GroupNorm在FCOS论文中已经证实可以提升检测头定位和分类的性能. 2. 通过使用共享卷积,可以大幅减少参数数量,这使得模型更轻便,特别是在资源受限的设备上.并且在使用共享卷积的同时,为了应对每个检测头所检测的目标尺度不一致的问题,使用Scale层对特征进行缩放. 3. 
参照TOOD的思想,除了标签分配策略上的任务对齐,我们也在检测头上进行定制任务对齐的结构,现有的目标检测器头部通常使用独立的分类和定位分支,这会导致两个任务之间缺乏交互,TADDH通过特征提取器从多个卷积层中学习任务交互特征,得到联合特征,定位分支使用DCNV2和交互特征生成DCNV2的offset和mask,分类分支使用交互特征进行动态特征选择. 13. ultralytics/cfg/models/v10/yolov10n-C2f-MutilScaleEdgeInformationEnhance.yaml 自研CSP-MutilScaleEdgeInformationEnhance. MutilScaleEdgeInformationEnhance模块结合了多尺度特征提取、边缘信息增强和卷积操作。它的主要目的是从不同尺度上提取特征,突出边缘信息,并将这些多尺度特征整合到一起,最后通过卷积层输出增强的特征。这个模块在特征提取和边缘增强的基础上有很好的表征能力. 1. 多尺度特征提取:通过 nn.AdaptiveAvgPool2d 进行多尺度的池化,提取不同大小的局部信息,有助于捕捉图像的多层次特征。 2. 边缘增强:EdgeEnhancer 模块专门用于提取边缘信息,使得网络对边缘的敏感度增强,这对许多视觉任务(如目标检测、语义分割等)有重要作用。 3. 特征融合:将不同尺度下提取的特征通过插值操作对齐到同一尺度,然后将它们拼接在一起,最后经过卷积层融合成统一的特征表示,能够提高模型对多尺度特征的感知。 14. ultralytics/cfg/models/v10/yolov10n-RSCD.yaml 自研重参数轻量化检测头.(Rep Shared Convolutional Detection Head) 1. 通过使用共享卷积,可以大幅减少参数数量,这使得模型更轻便,特别是在资源受限的设备上.但由于共享参数可能限制模型的表达能力,因为不同特征可能需要不同的卷积核来捕捉复杂的模式。共享参数可能无法充分捕捉这些差异。为了尽量弥补实现轻量化所采取的共享卷积带来的负面影响,我们使用可重参数化卷积,通过引入更多的可学习参数,网络可以更有效地从数据中提取特征,进而弥补轻量化模型后可能带来的精度丢失问题,并且重参数化卷积可以大大提升参数利用率,并且在推理阶段与普通卷积无差,为模型带来无损的优化方案。 2. 在使用共享卷积的同时,为了应对每个检测头所检测的目标尺度不一致的问题,使用Scale层对特征进行缩放. 15. ultralytics/cfg/models/v10/yolov10n-CSP-FreqSpatial.yaml FreqSpatial 是一个融合时域和频域特征的卷积神经网络(CNN)模块。该模块通过在时域和频域中提取特征,旨在捕捉不同层次的空间和频率信息,以增强模型在处理图像数据时的鲁棒性和表示能力。模块的主要特点是将 Scharr 算子(用于边缘检测)与 时域卷积 和 频域卷积 结合,通过多种视角捕获图像的结构特征。 1. 时域特征提取:从原始图像中提取出基于空间结构的特征,主要捕捉图像的细节、边缘信息等。 2. 频域特征提取:从频率域中提取出频率相关的模式,捕捉到图像的低频和高频成分,能够帮助模型在全局和局部的尺度上提取信息。 3. 特征融合:将时域和频域的特征进行加权相加,得到最终的输出特征图。这种加权融合允许模型同时考虑空间结构信息和频率信息,从而增强模型在多种场景下的表现能力。 16. ultralytics/cfg/models/v10/yolov10n-C2f-MutilScaleEdgeInformationSelect.yaml 基于自研CSP-MutilScaleEdgeInformationEnhance再次创新. 我们提出了一个 多尺度边缘信息选择模块(MutilScaleEdgeInformationSelect),其目的是从多尺度边缘信息中高效选择与目标任务高度相关的关键特征。为了实现这一目标,我们引入了一个具有通过聚焦更重要的区域能力的注意力机制[ICCV2023 DualDomainSelectionMechanism, DSM](https://github.com/c-yn/FocalNet)。该机制通过聚焦图像中更重要的区域(如复杂边缘和高频信号区域),在多尺度特征中自适应地筛选具有更高任务相关性的特征,从而显著提升了特征选择的精准度和整体模型性能。 17. 
ultralytics/cfg/models/v10/yolov10n-LSDECD.yaml 基于自研轻量化检测头上(LSCD),使用detail-enhanced convolution进一步改进,提高检测头的细节捕获能力,进一步改善检测精度. 关于DEConv在运行的时候重参数化后比重参数化前的计算量还要大的问题:是因为重参数化前thop库其计算不准的问题,看重参数化后的参数即可. 1. DEA-Net中设计了一个细节增强卷积(DEConv),具体来说DEConv将先验信息整合到普通卷积层,以增强表征和泛化能力。然后,通过使用重参数化技术,DEConv等效地转换为普通卷积,不需要额外的参数和计算成本。 18. ultralytics/cfg/models/v10/yolov10n-ContextGuideFPN.yaml Context Guide Fusion Module(CGFM)是一个创新的特征融合模块,旨在改进YOLOv8中的特征金字塔网络(FPN)。该模块的设计考虑了多尺度特征融合过程中上下文信息的引导和自适应调整。 1. 上下文信息的有效融合:通过SE注意力机制,模块能够在特征融合过程中捕捉并利用重要的上下文信息,从而增强特征表示的有效性,并有效引导模型学习检测目标的信息,从而提高模型的检测精度。 2. 特征增强:通过权重化的特征重组操作,模块能够增强重要特征,同时抑制不重要特征,提升特征图的判别能力。 3. 简单高效:模块结构相对简单,不会引入过多的计算开销,适合在实时目标检测任务中应用。 19. Re-CalibrationFPN 为了加强浅层和深层特征的相互交互能力,推出重校准特征金字塔网络(Re-CalibrationFPN). P2345:ultralytics/cfg/models/v10/yolov10n-ReCalibrationFPN-P2345.yaml(带有小目标检测头的ReCalibrationFPN) P345:ultralytics/cfg/models/v10/yolov10n-ReCalibrationFPN-P345.yaml P3456:ultralytics/cfg/models/v10/yolov10n-ReCalibrationFPN-P3456.yaml(带有大目标检测头的ReCalibrationFPN) 1. 浅层语义较少,但细节丰富,有更明显的边界和减少失真。此外,深层蕴藏着丰富的语义信息。因此,直接融合低级特征与高级特征可能导致冗余和不一致。为了解决这个问题,我们提出了[SBA](https://github.com/Barrett-python/DuAT)模块,它有选择地聚合边界信息和语义信息来描绘更细粒度的物体轮廓和重新校准物体的位置。 2. 相比传统的FPN结构,[SBA](https://github.com/Barrett-python/DuAT)模块引入了高分辨率和低分辨率特征之间的双向融合机制,使得特征之间的信息传递更加充分,进一步提升了多尺度特征融合的效果。 3. [SBA](https://github.com/Barrett-python/DuAT)模块通过自适应的注意力机制,根据特征图的不同分辨率和内容,自适应地调整特征的权重,从而更好地捕捉目标的多尺度特征。 20. ultralytics/cfg/models/v10/yolov10n-CSP-PTB.yaml Cross Stage Partial - Partially Transformer Block 在计算机视觉任务中,Transformer结构因其强大的全局特征提取能力而受到广泛关注。然而,由于Transformer结构的计算复杂度较高,直接将其应用于所有通道会导致显著的计算开销。为了在保证高效特征提取的同时降低计算成本,我们设计了一种混合结构,将输入特征图分为两部分,分别由CNN和Transformer处理,结合了卷积神经网络(CNN)和Transformer机制的模块,旨在增强特征提取的能力。 我们提出了一种名为CSP_PTB(Cross Stage Partial - Partially Transformer Block)的模块,旨在结合CNN和Transformer的优势,通过对输入通道进行部分分配来优化计算效率和特征提取能力。 1. 融合局部和全局特征:多项研究表明,CNN的感受野大小较小,导致其只能提取局部特征,但Transformer的MHSA能够提取全局特征,能够同时利用两者的优势。 2. 
保证高效特征提取的同时降低计算成本:为了能引入Transformer结构来提取全局特征又不想大幅度增加计算复杂度,因此提出Partially Transformer Block,只对部分通道使用TransformerBlock。 3. MHSA_CGLU包含Mutil-Head-Self-Attention和[ConvolutionalGLU(TransNext CVPR2024)](https://github.com/DaiShiResearch/TransNeXt),其中Mutil-Head-Self-Attention负责提取全局特征,ConvolutionalGLU用于增强非线性特征表达能力,ConvolutionalGLU相比于传统的FFN,具有更强的性能。 4. 可以根据不同的模型大小和具体的运行情况调节用于Transformer的通道数。 21. GlobalEdgeInformationTransfer 实现版本1:ultralytics/cfg/models/v10/yolov10n-GlobalEdgeInformationTransfer1.yaml 实现版本3:ultralytics/cfg/models/v10/yolov10n-GlobalEdgeInformationTransfer3.yaml 实现版本2:ultralytics/cfg/models/v10/yolov10n-GlobalEdgeInformationTransfer2.yaml 众所周知,物体框的定位非常之依赖物体的边缘信息,但是对于常规的目标检测网络来说,没有任何组件能提高网络对物体边缘信息的关注度,我们需要开发一个能让边缘信息融合到各个尺度所提取的特征中的组件,因此我们提出一个名为GlobalEdgeInformationTransfer(GEIT)的模块,其可以帮助我们把浅层特征中提取到的边缘信息传递到整个backbone上,并与不同尺度的特征进行融合。 1. 由于原始图像中含有大量背景信息,因此从原始图像上直接提取边缘信息传递到整个backbone上会给网络的学习带来噪声,而且浅层的卷积层会帮助我们过滤不必要的背景信息,因此我们选择在网络的浅层开发一个名为MutilScaleEdgeInfoGenetator的模块,其会利用网络的浅层特征层去生成多个尺度的边缘信息特征图并投放到主干的各个尺度中进行融合。 2. 对于下采样方面的选择,我们需要较为谨慎,我们的目标是保留并增强边缘信息,同时进行下采样,选择MaxPool 会更合适。它能够保留局部区域的最强特征,更好地体现边缘信息。因为 AvgPool 更适用于需要平滑或均匀化特征的场景,但在保留细节和边缘信息方面的表现不如 MaxPool。 3. 对于融合部分,ConvEdgeFusion巧妙地结合边缘信息和普通卷积特征,提出了一种新的跨通道特征融合方式。首先,使用conv_channel_fusion进行边缘信息与普通卷积特征的跨通道融合,帮助模型更好地整合不同来源的特征。然后采用conv_3x3_feature_extract进一步提取融合后的特征,以增强模型对局部细节的捕捉能力。最后通过conv_1x1调整输出特征维度。 22. ultralytics/cfg/models/v10/yolov10n-C2f-DIMB.yaml 自研模块DynamicInceptionDWConv2d.(详细请看项目内配置文件.md) 23. ultralytics/cfg/models/v10/yolov10n-HAFB-1.yaml 自研Hierarchical Attention Fusion Block.(详细请看项目内配置文件.md) 24. ultralytics/cfg/models/v10/yolov10n-HAFB-2.yaml HAFB另外一种使用方法. 25. ultralytics/cfg/models/v10/yolov10n-MutilBackbone-HAFB.yaml yolov10n-MutilBackbone-DAF.yaml基础上用上HAFB. ### BackBone系列 1. ultralytics/cfg/models/v10/yolov10n-efficientViT.yaml (CVPR2023)efficientViT替换yolov10主干. 2. ultralytics/cfg/models/v10/yolov10n-fasternet.yaml (CVPR2023)fasternet替换yolov10主干. 3. 
ultralytics/cfg/models/v10/yolov10n-timm.yaml 使用timm支持的主干网络替换yolov10主干. 4. ultralytics/cfg/models/v10/yolov10n-convnextv2.yaml 使用convnextv2网络替换yolov10主干. 5. ultralytics/cfg/models/v10/yolov10n-EfficientFormerV2.yaml 使用EfficientFormerV2网络替换yolov10主干.(需要看[常见错误和解决方案的第五点](#a)) 6. ultralytics/cfg/models/v10/yolov10n-vanillanet.yaml vanillanet替换yolov10主干. 7. ultralytics/cfg/models/v10/yolov10n-LSKNet.yaml LSKNet(2023旋转目标检测SOTA的主干)替换yolov10主干. 8. ultralytics/cfg/models/v10/yolov10n-swintransformer.yaml SwinTransformer-Tiny替换yolov10主干. 9. ultralytics/cfg/models/v10/yolov10n-repvit.yaml [CVPR2024 RepViT](https://github.com/THU-MIG/RepViT/tree/main)替换yolov10主干. 10. ultralytics/cfg/models/v10/yolov10n-CSwinTransformer.yaml 使用[CSWin-Transformer(CVPR2022)](https://github.com/microsoft/CSWin-Transformer/tree/main)替换yolov10主干.(需要看[常见错误和解决方案的第五点](#a)) 11. ultralytics/cfg/models/v10/yolov10n-HGNetV2.yaml 使用HGNetV2作为YOLOV10的backbone. 12. ultralytics/cfg/models/v10/yolov10n-unireplknet.yaml 使用[UniRepLKNet](https://github.com/AILab-CVC/UniRepLKNet/tree/main)替换yolov10主干. 13. ultralytics/cfg/models/v10/yolov10n-TransNeXt.yaml 使用[TransNeXt](https://github.com/DaiShiResearch/TransNeXt)改进yolov10的backbone.(需要看[常见错误和解决方案的第五点](#a)) 14. ultralytics/cfg/models/v10/yolov10n-rmt.yaml 使用[CVPR2024 RMT](https://arxiv.org/abs/2309.11523)改进yolov10的主干. 15. ultralytics/cfg/models/v10/yolov10n-pkinet.yaml 使用[CVPR2024 PKINet](https://github.com/PKINet/PKINet)改进backbone.(需要安装mmcv和mmengine) 16. ultralytics/cfg/models/v10/yolov10n-mobilenetv4.yaml 使用[MobileNetV4](https://github.com/jaiwei98/MobileNetV4-pytorch/tree/main)改进yolov10的backbone. 17. ultralytics/cfg/models/v10/yolov10n-starnet.yaml 使用[StarNet CVPR2024](https://github.com/ma-xu/Rewrite-the-Stars/tree/main)改进yolov10-backbone. 18. ultralytics/cfg/models/v10/yolov10n-mambaout.yaml 使用[CVPR2025 MambaOut](https://github.com/yuweihao/MambaOut)中的MambaOut替换BackBone. 19. 
ultralytics/cfg/models/v10/yolov10n-lsnet.yaml 使用[CVPR2025 LSNet](https://github.com/THU-MIG/lsnet)中的lsnet替换yolov10的backbone. 20. ultralytics/cfg/models/v10/yolov10n-overlock.yaml 使用[CVPR2025 OverLock](https://arxiv.org/pdf/2502.20087)中的overlock-backbone替换backbone. ### SPPF系列 1. ultralytics/cfg/models/v10/yolov10n-FocalModulation.yaml 使用[Focal Modulation](https://github.com/microsoft/FocalNet)替换SPPF. 2. ultralytics/cfg/models/v10/yolov10n-SPPF-LSKA.yaml 使用[LSKA](https://github.com/StevenLauHKHK/Large-Separable-Kernel-Attention)注意力机制改进SPPF,增强多尺度特征提取能力. 3. ultralytics/cfg/models/v10/yolov10n-AIFIRep.yaml 使用[ICML-2024 SLAB](https://github.com/xinghaochen/SLAB)与AIFI改进yolov10. ### Neck系列 1. ultralytics/cfg/models/v10/yolov10n-bifpn.yaml 添加BIFPN到yolov10中. 其中BIFPN中有三个可选参数: 1. Fusion 其中BIFPN中的Fusion模块支持五种: weight, adaptive, concat, bifpn(default), SDI 其中weight, adaptive, concat出自[paper链接-Figure 3](https://openreview.net/pdf?id=q2ZaVU6bEsT), SDI出自[U-NetV2](https://github.com/yaoppeng/U-Net_v2) 2. node_mode 其中支持这些[结构](#b) 3. head_channel BIFPN中的通道数,默认设置为256. 2. ultralytics/cfg/models/v10/yolov10n-slimneck.yaml 使用[VoVGSCSP\VoVGSCSPC和GSConv](https://github.com/AlanLi1997/slim-neck-by-gsconv)替换yolov10 neck中的C2f和Conv. 3. ultralytics/cfg/models/v10/yolov10n-goldyolo.yaml 利用华为2023最新GOLD-YOLO中的Gatherand-Distribute进行改进特征融合模块. 4. ultralytics/cfg/models/v10/yolov10n-MAFPN.yaml 使用[MAF-YOLO](https://arxiv.org/pdf/2407.04381)的MAFPN改进Neck. 5. ultralytics/cfg/models/v10/yolov10n-ASF.yaml 使用[ASF-YOLO](https://github.com/mkang315/ASF-YOLO)中的Attentional Scale Sequence Fusion改进yolov10. 6. Cross-Layer Feature Pyramid Transformer. P345:ultralytics/cfg/models/v10/yolov10n-CFPT.yaml P2345:ultralytics/cfg/models/v10/yolov10n-CFPT-P2345.yaml P3456:ultralytics/cfg/models/v10/yolov10n-CFPT-P3456.yaml P23456:ultralytics/cfg/models/v10/yolov10n-CFPT-P23456.yaml 使用[CFPT](https://github.com/duzw9311/CFPT/tree/main)改进neck. 7. 
ultralytics/cfg/models/v10/yolov10n-RCSOSA.yaml 使用[RCS-YOLO](https://github.com/mkang315/RCS-YOLO/tree/main)中的RCSOSA替换C2f. 8. ultralytics/cfg/models/v10/yolov10n-GFPN.yaml 使用[DAMO-YOLO](https://github.com/tinyvision/DAMO-YOLO)中的RepGFPN改进Neck. 9. ultralytics/cfg/models/v10/yolov10n-EfficientRepBiPAN.yaml 使用[YOLOV6](https://github.com/meituan/YOLOv6/tree/main)中的EfficientRepBiPAN改进Neck. 10. ultralytics/cfg/models/v10/yolov10n-HSFPN.yaml 使用[MFDS-DETR](https://github.com/JustlfC03/MFDS-DETR)中的HS-FPN改进yolov10的neck. 11. ultralytics/cfg/models/v10/yolov10n-hyper.yaml 使用[Hyper-YOLO](https://www.arxiv.org/pdf/2408.04804)中的Hypergraph Computation in Semantic Space改进yolov10. 12. ultralytics/cfg/models/v10/yolov10n-msga.yaml 使用[MSA^2 Net](https://github.com/xmindflow/MSA-2Net)中的Multi-Scale Adaptive Spatial Attention Gate改进yolov10-neck. 13. ultralytics/cfg/models/v10/yolov10n-CGAFusion.yaml 使用[DEA-Net](https://github.com/cecret3350/DEA-Net)中的content-guided attention fusion改进yolov10-neck. 14. ultralytics/cfg/models/v10/yolov10n-WFU.yaml 使用[ACMMM2024 WFEN](https://github.com/PRIS-CV/WFEN)中的Wavelet Feature Upgrade改进yolov10-neck. 15. ultralytics/cfg/models/v10/yolov10n-fsa.yaml 使用[BIBM2024 Spatial-Frequency Dual Domain Attention Network For Medical Image Segmentation](https://github.com/nkicsl/SF-UNet)的Frequency-Spatial Attention改进yolov10. 16. ultralytics/cfg/models/v10/yolov10n-mscafsa.yaml 使用[BIBM2024 Spatial-Frequency Dual Domain Attention Network For Medical Image Segmentation](https://github.com/nkicsl/SF-UNet)的Frequency-Spatial Attention和Multi-scale Progressive Channel Attention改进yolov10-neck. 17. ultralytics/cfg/models/v10/yolov10n-MFM.yaml 使用[CVPR2024 DCMPNet](https://github.com/zhoushen1/DCMPNet)中的MFM改进neck. 18. ultralytics/cfg/models/v10/yolov10n-GDSAFusion.yaml 使用[CVPR2025 OverLock](https://arxiv.org/pdf/2502.20087)中的GDSAFusion改进neck. 19. 
ultralytics/cfg/models/v10/yolov10n-RFPN.yaml 使用[ECCV2024 rethinking-fpn](https://github.com/AlanLi1997/rethinking-fpn)的SNI和GSConvE改进YOLOV10n-neck. 20. ultralytics/cfg/models/v10/yolov10n-PST.yaml 使用[Pyramid Sparse Transformer](https://arxiv.org/abs/2505.12772)中的Pyramid Sparse Transformer改进neck. 21. ultralytics/cfg/models/v10/yolov10n-HS-FPN.yaml 使用[AAAI2025 HS-FPN](https://github.com/ShiZican/HS-FPN/tree/main)中的HFP和SDP改进yolo-neck. 22. ultralytics/cfg/models/v10/yolov10n-LCA.yaml 使用[CVPR2025 HVI](https://arxiv.org/pdf/2502.20272)中的LCA改进yolov10-neck. 23. ultralytics/cfg/models/v10/yolov10n-HFFE.yaml 使用[TGRS2025 HAFNet](https://ieeexplore.ieee.org/document/11154006)中的HFFE改进yolov10-neck. ### Head系列 1. ultralytics/cfg/models/v10/yolov10n-dyhead.yaml 添加基于注意力机制的目标检测头到yolov10中. 2. ultralytics/cfg/models/v10/yolov10n-LQE.yaml Localization Quality Estimation Head-NMSFree,Localization Quality Estimation此模块出自[GFocalV2](https://arxiv.org/abs/2011.12885). ### Label Assign系列 ### PostProcess系列 ### 上下采样算子 1. ultralytics/cfg/models/v10/yolov10n-ContextGuidedDown.yaml 使用[CGNet](https://github.com/wutianyiRosun/CGNet/tree/master)中的Light-weight Context Guided DownSample进行下采样. 2. ultralytics/cfg/models/v10/yolov10n-SPDConv.yaml 使用[SPDConv](https://github.com/LabSAINT/SPD-Conv/tree/main)进行下采样. 3. ultralytics/cfg/models/v10/yolov10n-dysample.yaml 使用[ICCV2023 DySample](https://arxiv.org/abs/2308.15085)改进yolov10-neck中的上采样. 4. ultralytics/cfg/models/v10/yolov10n-CARAFE.yaml 使用[ICCV2019 CARAFE](https://arxiv.org/abs/1905.02188)改进yolov10-neck中的上采样. 5. ultralytics/cfg/models/v10/yolov10n-HWD.yaml 使用[Haar wavelet downsampling](https://www.sciencedirect.com/science/article/abs/pii/S0031320323005174)改进yolov10的下采样.(请关闭AMP情况下使用) 6. ultralytics/cfg/models/v10/yolov10n-v7DS.yaml 使用[YOLOV7 CVPR2023](https://arxiv.org/abs/2207.02696)的下采样结构改进YOLOV10中的下采样. 7. ultralytics/cfg/models/v10/yolov10n-ADown.yaml 使用[YOLOV9](https://github.com/WongKinYiu/yolov9)的下采样结构改进YOLOV10中的下采样. 8. 
ultralytics/cfg/models/v10/yolov10n-SRFD.yaml 使用[A Robust Feature Downsampling Module for Remote Sensing Visual Tasks](https://ieeexplore.ieee.org/document/10142024)改进yolov10的下采样. 9. ultralytics/cfg/models/v10/yolov10n-WaveletPool.yaml 使用[Wavelet Pooling](https://openreview.net/forum?id=rkhlb8lCZ)改进YOLOV10的上采样和下采样。 10. ultralytics/cfg/models/v10/yolov10n-LDConv.yaml 使用[LDConv](https://github.com/CV-ZhangXin/LDConv/tree/main)改进下采样. 11. ultralytics/cfg/models/v10/yolov10n-PSConv.yaml 使用[AAAI2025 Pinwheel-shaped Convolution and Scale-based Dynamic Loss for Infrared Small Target Detection](https://github.com/JN-Yang/PConv-SDloss-Data)中的Pinwheel-shaped Convolution改进yolov10. 12. ultralytics/cfg/models/v10/yolov10n-EUCB.yaml 使用[CVPR2024 EMCAD](https://github.com/SLDGroup/EMCAD)中的EUCB改进yolov10的上采样. 13. ultralytics/cfg/models/v10/yolov10n-LoGStem.yaml 使用[LEGNet](https://github.com/lwCVer/LEGNet)中的LoGStem改进Stem(第一第二层卷积). 14. ultralytics/cfg/models/v10/yolov10n-FourierConv.yaml 使用[MIA2025 Fourier Convolution Block with global receptive field for MRI reconstruction](https://www.sciencedirect.com/science/article/abs/pii/S1361841524002743)中的FourierConv改进Conv. 15. ultralytics/cfg/models/v10/yolov10n-RepStem.yaml 使用[ICCV2023 FastVit](https://arxiv.org/pdf/2303.14189)中的RepStem改进yolov10下采样. 16. ultralytics/cfg/models/v10/yolov10n-C2f-GCConv.yaml 使用[CVPR2025 Golden Cudgel Network](https://github.com/gyyang23/GCNet)中的GCConv改进C2f. ### C2f系列 1. ultralytics/cfg/models/v10/yolov10n-C2f-WTConv.yaml 使用[ECCV2024 Wavelet Convolutions for Large Receptive Fields](https://github.com/BGU-CS-VIL/WTConv)中的WTConv改进C2f-BottleNeck. 2. ultralytics/cfg/models/v10/yolov10n-attention.yaml 可以看项目视频-如何在yaml配置文件中添加注意力层 多种注意力机制在yolov10中的使用. [多种注意力机制github地址](https://github.com/z1069614715/objectdetection_script/tree/master/cv-attention) 目前内部整合的注意力可看[链接](#c) 3. ultralytics/cfg/models/v10/yolov10n-C2f-FMB.yaml 使用[ECCV2024 SMFANet](https://github.com/Zheng-MJ/SMFANet/tree/main)的Feature Modulation block改进C2f. 4. 
ultralytics/cfg/models/v10/yolov10n-C2f-Faster.yaml 使用C2f-Faster替换C2f.(使用FasterNet中的FasterBlock替换C2f中的Bottleneck) 5. ultralytics/cfg/models/v10/yolov10n-C2f-ODConv.yaml 使用C2f-ODConv替换C2f.(使用ODConv替换C2f中的Bottleneck中的Conv) 6. ultralytics/cfg/models/v10/yolov10n-C2f-Faster-EMA.yaml 使用C2f-Faster-EMA替换C2f.(C2f-Faster-EMA推荐可以放在主干上,Neck和head部分可以选择C2f-Faster) 7. ultralytics/cfg/models/v10/yolov10n-C2f-DBB.yaml 使用C2f-DBB替换C2f.(使用DiverseBranchBlock替换C2f中的Bottleneck中的Conv) 8. ultralytics/cfg/models/v10/yolov10n-C2f-CloAtt.yaml 使用C2f-CloAtt替换C2f.(使用CloFormer中的具有全局和局部特征的注意力机制添加到C2f中的Bottleneck中)(需要看[常见错误和解决方案的第五点](#a)) 9. ultralytics/cfg/models/v10/yolov10n-C2f-gConv.yaml 使用[Rethinking Performance Gains in Image Dehazing Networks](https://arxiv.org/abs/2209.11448)的gConvblock改进C2f. 10. ultralytics/cfg/models/v10/yolov10n-C2f-SCConv.yaml SCConv(CVPR2020 http://mftp.mmcheng.net/Papers/20cvprSCNet.pdf)与C2f融合. 11. ultralytics/cfg/models/v10/yolov10n-C2f-SCcConv.yaml ScConv(CVPR2023 https://openaccess.thecvf.com/content/CVPR2023/papers/Li_SCConv_Spatial_and_Channel_Reconstruction_Convolution_for_Feature_Redundancy_CVPR_2023_paper.pdf)与C2f融合. (取名为SCcConv的原因是在windows下命名是不区分大小写的) 12. ultralytics/cfg/models/v10/yolov10n-KernelWarehouse.yaml 使用[Towards Parameter-Efficient Dynamic Convolution](https://github.com/OSVAI/KernelWarehouse)添加到yolov10中. 使用此模块需要注意,在epoch0-20的时候精度会非常低,过了20epoch会正常. 13. ultralytics/cfg/models/v10/yolov10n-C2f-DySnakeConv.yaml [DySnakeConv](https://github.com/YaoleiQi/DSCNet)与C2f融合. 14. ultralytics/cfg/models/v10/yolov10n-C2f-WDBB.yaml 使用[YOLO-MIF](https://github.com/wandahangFY/YOLO-MIF)中的WDBB改进c2f. 15. ultralytics/cfg/models/v10/yolov10n-C2f-DeepDBB.yaml 使用[YOLO-MIF](https://github.com/wandahangFY/YOLO-MIF)中的DeepDBB改进c2f. 16. ultralytics/cfg/models/v10/yolov10n-C2f-AdditiveBlock.yaml 使用[CAS-ViT](https://github.com/Tianfang-Zhang/CAS-ViT)中的AdditiveBlock改进c2f. 17. 
ultralytics/cfg/models/v10/yolov10n-C2f-MogaBlock.yaml 使用[MogaNet ICLR2024](https://github.com/Westlake-AI/MogaNet)中的MogaBlock改进C2f. 18. ultralytics/cfg/models/v10/yolov10n-C2f-IdentityFormer.yaml 使用[Metaformer TPAMI2024](https://github.com/sail-sg/metaformer)中的IdentityFormer改进c2f. 19. ultralytics/cfg/models/v10/yolov10n-C2f-RandomMixing.yaml 使用[Metaformer TPAMI2024](https://github.com/sail-sg/metaformer)中的RandomMixingFormer改进c2f.(需要看[常见错误和解决方案的第五点](#a)) 20. ultralytics/cfg/models/v10/yolov10n-C2f-PoolingFormer.yaml 使用[Metaformer TPAMI2024](https://github.com/sail-sg/metaformer)中的PoolingFormer改进c2f. 21. ultralytics/cfg/models/v10/yolov10n-C2f-ConvFormer.yaml 使用[Metaformer TPAMI2024](https://github.com/sail-sg/metaformer)中的ConvFormer改进c2f. 22. ultralytics/cfg/models/v10/yolov10n-C2f-CaFormer.yaml 使用[Metaformer TPAMI2024](https://github.com/sail-sg/metaformer)中的CaFormer改进c2f. 23. ultralytics/cfg/models/v10/yolov10n-C2f-FFCM.yaml 使用[Efficient Frequency-Domain Image Deraining with Contrastive Regularization ECCV2024](https://github.com/deng-ai-lab/FADformer)中的Fused_Fourier_Conv_Mixer改进C2f. 25. ultralytics/cfg/models/v10/yolov10n-C2f-SFHF.yaml 使用[SFHformer ECCV2024](https://github.com/deng-ai-lab/SFHformer)中的block改进C2f. 26. ultralytics/cfg/models/v10/yolov10n-C2f-MSM.yaml 使用[Revitalizing Convolutional Network for Image Restoration TPAMI2024](https://zhuanlan.zhihu.com/p/720777160)中的MSM改进C2f. 27. ultralytics/cfg/models/v10/yolov10n-C2f-iRMB.yaml 使用[EMO ICCV2023](https://github.com/zhangzjn/EMO)中的iRMB改进C2f. 30. ultralytics/cfg/models/v10/yolov10n-C2f-RAB.yaml 使用[Pattern Recognition 2024|DRANet](https://github.com/WenCongWu/DRANet)中的RAB(residual attention block)改进C2f. 31. ultralytics/cfg/models/v10/yolov10n-C2f-HDRAB.yaml 使用[Pattern Recognition 2024|DRANet](https://github.com/WenCongWu/DRANet)中的HDRAB(hybrid dilated residual attention block)改进C2f. 32. 
ultralytics/cfg/models/v10/yolov10n-C2f-LFE.yaml 使用[Efficient Long-Range Attention Network for Image Super-resolution ECCV2022](https://github.com/xindongzhang/ELAN)中的Local feature extraction改进C2f. 32. ultralytics/cfg/models/v10/yolov10n-C2f-SFA.yaml 使用[FreqFormer](https://github.com/JPWang-CS/FreqFormer)的Frequency-aware Cascade Attention-SFA改进C2f. 33. ultralytics/cfg/models/v10/yolov10n-C2f-CTA.yaml 使用[FreqFormer](https://github.com/JPWang-CS/FreqFormer)的Frequency-aware Cascade Attention-CTA改进C2f. 34. ultralytics/cfg/models/v10/yolov10n-C2f-CAMixer.yaml 使用[CAMixerSR CVPR2024](https://github.com/icandle/CAMixerSR)中的CAMixer改进C2f. 35. ultralytics/cfg/models/v10/yolov10n-MAN.yaml 使用[Hyper-YOLO TPAMI2025](https://www.arxiv.org/pdf/2408.04804)中的Mixed Aggregation Network改进yolov10. 36. ultralytics/cfg/models/v10/yolov10n-C2f-HFERB.yaml 使用[ICCV2023 CRAFT-SR](https://github.com/AVC2-UESTC/CRAFT-SR)中的high-frequency enhancement residual block改进C2f. 37. ultralytics/cfg/models/v10/yolov10n-C2f-DTAB.yaml 使用[AAAI2025 TBSN](https://github.com/nagejacob/TBSN)中的DTAB改进C2f. 38. ultralytics/cfg/models/v10/yolov10n-C2f-JDPM.yaml 使用[ECCV2024 FSEL](https://github.com/CSYSI/FSEL)中的joint domain perception module改进C2f. 39. ultralytics/cfg/models/v10/yolov10n-C2f-ETB.yaml 使用[ECCV2024 FSEL](https://github.com/CSYSI/FSEL)中的entanglement transformer block改进C2f. 40. ultralytics/cfg/models/v10/yolov10n-C2f-AP.yaml 使用[AAAI2025 Pinwheel-shaped Convolution and Scale-based Dynamic Loss for Infrared Small Target Detection](https://github.com/JN-Yang/PConv-SDloss-Data)中的Asymmetric Padding bottleneck改进C2f. 41. ultralytics/cfg/models/v10/yolov10n-C2f-Kat.yaml 使用[ICLR2025 Kolmogorov-Arnold Transformer](https://github.com/Adamdad/kat)中的KAT改进C2f. 42. 
ultralytics/cfg/models/v10/yolov10n-C2f-GlobalFilter.yaml 使用[T-PAMI Global Filter Networks for Image Classification](https://github.com/raoyongming/GFNet)中的GlobalFilterBlock和[TransNeXt CVPR2024](https://github.com/DaiShiResearch/TransNeXt)中的Convolutional GLU改进C2f. 43. ultralytics/cfg/models/v10/yolov10n-C2f-DynamicFilter.yaml 使用[AAAI2024 FFT-Based Dynamic Token Mixer for Vision](https://github.com/okojoalg/dfformer)中的DynamicFilter改进C2f. 44. ultralytics/cfg/models/v10/yolov10n-RepHMS.yaml 使用[MHAF-YOLO](https://github.com/yang-0201/MHAF-YOLO)中的RepHMS改进yolov10. 45. ultralytics/cfg/models/v10/yolov10n-C2f-SAVSS.yaml 使用[CVPR2025 SCSegamba](https://github.com/Karl1109/SCSegamba)中的Structure-Aware Scanning Strategy改进C2f. 46. ultralytics/cfg/models/v10/yolov10n-C2f-mambaout.yaml 使用[CVPR2025 MambaOut](https://github.com/yuweihao/MambaOut)中的MambaOutBlock改进C2f. 47. ultralytics/cfg/models/v10/yolov10n-C2f-EfficientVIM.yaml 使用[CVPR2025 EfficientViM](https://github.com/mlvlab/EfficientViM)中的EfficientViMBlock改进C2f. 48. ultralytics/cfg/models/v10/yolov10n-C2f-LEGM.yaml 使用[CVPR2024 DCMPNet](https://github.com/zhoushen1/DCMPNet)中的LEGM改进C2f. 49. ultralytics/cfg/models/v10/yolov10n-C2f-RCB.yaml 使用[CVPR2025 OverLock](https://arxiv.org/pdf/2502.20087)中的RepConvBlock改进C2f. 50. ultralytics/cfg/models/v10/yolov10n-C2f-LFEM.yaml 使用[LEGNet](https://github.com/lwCVer/LEGNet)中的LFEModule改进C2f. 51. ultralytics/cfg/models/v10/yolov10n-C2f-LSBlock.yaml 使用[CVPR2025 LSNet](https://github.com/THU-MIG/lsnet)中的LSBlock改进C2f. 52. ultralytics/cfg/models/v10/yolov10n-C2f-TransMamba.yaml 使用[TransMamba](https://github.com/sunshangquan/TransMamba)的TransMamba改进C2f 53. ultralytics/cfg/models/v10/yolov10n-C2f-EVS.yaml 使用[CVPR2025 EVSSM](https://github.com/kkkls/EVSSM)中的EVS改进C2f.(编译教程请看:20240219版本更新说明) 54. ultralytics/cfg/models/v10/yolov10n-C2f-EBlock.yaml 使用[CVPR2025 DarkIR](https://github.com/cidautai/DarkIR)中的EBlock改进C2f. 55. 
ultralytics/cfg/models/v10/yolov10n-C2f-DBlock.yaml 使用[CVPR2025 DarkIR](https://github.com/cidautai/DarkIR)中的DBlock改进C2f. 56. ultralytics/cfg/models/v10/yolov10n-C2f-SFSConv.yaml 使用[CVPR2024 SFSConv](https://github.com/like413/SFS-Conv)的SFSConv改进C2f. 57. ultralytics/cfg/models/v10/yolov10n-FCM.yaml 使用[AAAI2025 FBRT-YOLO](https://github.com/galaxy-oss/FCM)的模块改进yolov10. 58. ultralytics/cfg/models/v10/yolov10n-C2f-GroupMamba.yaml 使用[CVPR2025 GroupMamba](https://github.com/Amshaker/GroupMamba)中的GroupMambaBlock改进C2f. 59. ultralytics/cfg/models/v10/yolov10n-C2f-MambaVision.yaml 使用[CVPR2025 MambaVision](https://github.com/NVlabs/MambaVision)中的MambaVision改进C2f. 60. ultralytics/cfg/models/v10/yolov10n-C2f-FourierConv.yaml 使用[MIA2025 Fourier Convolution Block with global receptive field for MRI reconstruction](https://www.sciencedirect.com/science/article/abs/pii/S1361841524002743)中的FourierConv改进C2f. 61. ultralytics/cfg/models/v10/yolov10n-C2f-GLVSS.yaml 使用[TGRS2025 UMFormer](https://github.com/takeyoutime/UMFormer)中的GLVSS改进C2f. 62. ultralytics/cfg/models/v10/yolov10n-C2f-ESC.yaml 使用[ICCV2025 ESC: Emulating Self-attention with Convolution for Efficient Image Super-Resolution](https://github.com/dslisleedh/ESC)中的ESC改进C2f. 63. ultralytics/cfg/models/v10/yolov10n-C2f-ConvAttn.yaml 使用[ICCV2025 ESC: Emulating Self-attention with Convolution for Efficient Image Super-Resolution](https://github.com/dslisleedh/ESC)中的ConvAttn改进C2f. 64. ultralytics/cfg/models/v10/yolov10n-C2f-UniConv.yaml 使用[ICCV2025 UniConvBlock](https://github.com/ai-paperwithcode/UniConvNet)中的UniConvBlock改进C2f. 65. ultralytics/cfg/models/v10/yolov10n-C2f-GCConv.yaml 使用[CVPR2025 Golden Cudgel Network](https://github.com/gyyang23/GCNet)中的GCConv改进C2f. 66. ultralytics/cfg/models/v10/yolov10n-C2f-CFBlock.yaml 使用[AAAI2024 SCTNet](https://arxiv.org/pdf/2312.17071)中的CFBlock改进C2f. 67. ultralytics/cfg/models/v10/yolov10n-C2f-CSSC.yaml 使用[TGRS2025 ASCNet](https://ieeexplore.ieee.org/document/10855453)中的CSSC改进C2f. 68. 
ultralytics/cfg/models/v10/yolov10n-C2f-CNCM.yaml 使用[TGRS2025 ASCNet](https://ieeexplore.ieee.org/document/10855453)中的CNCM改进C2f. 69. ultralytics/cfg/models/v10/yolov10n-C2f-HFRB.yaml 使用[ICCV2025 HFRB](https://arxiv.org/pdf/2507.10689)中的HFRB改进C2f. 70. ultralytics/cfg/models/v10/yolov10n-C2f-EVA.yaml 使用[ICIP2025 BEVANET](https://arxiv.org/pdf/2508.07300)中的EVA改进C2f. 71. ultralytics/cfg/models/v10/yolov10n-C2f-RMBC.yaml 使用[PlainUSR](https://arxiv.org/pdf/2409.13435)中的RepMBConv改进C2f. 72. ultralytics/cfg/models/v10/yolov10n-C2f-RMBC-LA.yaml 使用[PlainUSR](https://arxiv.org/pdf/2409.13435)中的RepMBConv和Local Importance-based Attention改进C2f. 73. ultralytics/cfg/models/v10/yolov10n-C2f-IEL.yaml 使用[CVPR2025 HVI](https://arxiv.org/pdf/2502.20272)中的IEL改进C2f. ### PSA系列 1. ultralytics/cfg/models/v10/yolov10n-PTSSA.yaml 使用[Token Statistics Transformer](https://github.com/RobinWu218/ToST)中的Token Statistics Self-Attention改进PSA. 2. ultralytics/cfg/models/v10/yolov10n-ASSR.yaml 使用[CVPR2025 MambaIR](https://github.com/csguoh/MambaIR)中的Attentive State Space Group改进yolov10. ### 组合系列 1. ultralytics/cfg/models/v10/yolov10n-starnet-bifpn.yaml 使用[StarNet CVPR2024](https://github.com/ma-xu/Rewrite-the-Stars/tree/main)和bifpn改进yolov10. 2. ultralytics/cfg/models/v10/yolov10n-ELA-HSFPN-TADDH.yaml 使用[Efficient Local Attention](https://arxiv.org/abs/2403.01123)改进HSFPN,使用自研任务对齐动态检测头改进Head. # Mamba-YOLO 1. [Mamba-YOLO](https://github.com/HZAI-ZJNU/Mamba-YOLO) 集成Mamba-YOLO.(需要编译请看百度云视频-20240619版本更新说明) ultralytics/cfg/models/mamba-yolo/Mamba-YOLO-T.yaml ultralytics/cfg/models/mamba-yolo/Mamba-YOLO-B.yaml ultralytics/cfg/models/mamba-yolo/Mamba-YOLO-L.yaml ultralytics/cfg/models/mamba-yolo/yolo-mamba-seg.yaml # Hyper-YOLO 1. ultralytics/cfg/models/hyper-yolo/hyper-yolo.yaml 2. ultralytics/cfg/models/hyper-yolo/hyper-yolot.yaml 3. ultralytics/cfg/models/hyper-yolo/hyper-yolo-seg.yaml # 注意力系列 1. EMA 2. SimAM 3. SpatialGroupEnhance 4. BiLevelRoutingAttention, BiLevelRoutingAttention_nchw 5.
TripletAttention 6. CoordAtt 7. CBAM 8. BAMBlock 9. EfficientAttention(CloFormer中的注意力) 10. LSKBlock 11. SEAttention 12. CPCA 13. deformable_LKA 14. EffectiveSEModule 15. LSKA 16. SegNext_Attention 17. DAttention(Vision Transformer with Deformable Attention CVPR2022) 18. FocusedLinearAttention(ICCV2023) 19. MLCA 20. TransNeXt_AggregatedAttention 21. LocalWindowAttention(EfficientViT中的CascadedGroupAttention注意力) 22. Efficient Local Attention[Efficient Local Attention](https://arxiv.org/abs/2403.01123) 23. CAA(CVPR2024 PKINet中的注意力) 24. CAFM 25. AFGCAttention[Neural Networks ECCV2024](https://www.sciencedirect.com/science/article/abs/pii/S0893608024002387) # Loss系列 1. SlideLoss,EMASlideLoss.(可动态调节正负样本的系数,让模型更加注重难分类,错误分类的样本上) 2. IoU,GIoU,DIoU,CIoU,EIoU,SIoU,MPDIoU,ShapeIoU. 3. Inner-IoU,Inner-GIoU,Inner-DIoU,Inner-CIoU,Inner-EIoU,Inner-SIoU,Inner-ShapeIoU. 4. Wise-IoU(v1,v2,v3)系列(IoU,WIoU,EIoU,GIoU,DIoU,CIoU,SIoU,MPDIoU,ShapeIoU). 5. Inner-Wise-IoU(v1,v2,v3)系列(IoU,WIoU,EIoU,GIoU,DIoU,CIoU,SIoU,MPDIoU,ShapeIoU). 6. FocalLoss,VarifocalLoss,QualityfocalLoss 7. Focaler-IoU系列(IoU,GIoU,DIoU,CIoU,EIoU,SIoU,WIoU,MPDIoU,ShapeIoU) 8. Powerful-IoU,Powerful-IoUV2,Inner-Powerful-IoU,Inner-Powerful-IoUV2,Focaler-Powerful-IoU,Focaler-Powerful-IoUV2,Wise-Powerful-IoU(v1,v2,v3),Wise-Powerful-IoUV2(v1,v2,v3)[论文链接](https://www.sciencedirect.com/science/article/abs/pii/S0893608023006640) 9. Normalized Gaussian Wasserstein Distance. 10. Gaussian Combined Distance. # 更新公告 - **20230620-yolov8-v1.1** 1. 增加EMA,C2f-Faster-EMA. 2. val.py增加batch选择. 3. train.py增加resume断点续训. - **20230625-yolov8-v1.2** 1. 使用说明和视频增加断点续训教程. 2. 增加 使用C2f-DBB替换C2f.(使用DiverseBranchBlock替换C2f中的Bottleneck中的Conv) C2f-DBB同样可以用在bifpn中的node. 3. 使用说明中增加常见错误以及解决方案. - **20230627-yolov8-v1.3** 1. 增加Adaptive Training Sample Selection匹配策略. 2. val.py增加save_txt参数. 3. 更新使用教程. - **20230701-yolov8-v1.4** 1. val.py中增加imgsz参数,可以自定义val时候的图片尺寸,默认为640. 2. 增加plot_result.py,用于绘制对比曲线图,详细请看使用说明13点. 3. 支持计算COCO评价指标.详细请看使用说明12点. 4. 
增加yolov8-slimneck.其中VoVGSCSP\VoVGSCSPC支持在bifpn中使用,支持GSConv的替换. - **20230703-yolov8-v1.5** 1. 修正计算gflops. 2. 增加YOLOV5-AnchorFree改进,详细可看使用教程.md 3. 增加yolov8-attention.yaml,并附带视频如何在yaml中添加注意力层 4. 更新train.py --info参数的功能,增加打印每一层的参数,增加模型融合前后的层数,参数量,计算量对比。 - **20230705-yolov8-v1.6** 1. yolov5和yolov8 支持 Asymptotic Feature Pyramid Network. - **20230714-yolov8-v1.7** 1. 把添加的所有模块全部转移到ultralytics/nn/extra_modules,以便后面进行同步代码。 2. 增加yolov5-bifpn。 3. 修正ultralytics/models/v8/yolov8-efficientViT.yaml,经粉丝反映,EfficientViT存在同名论文,本次更新的EfficientViT更适合目标检测,之前的efficientViT的原文是在语义分割上进行提出的。 4. 更新使用教程。 5. 更新import逻辑,现在不需要安装mmcv也可以进行使用,但是没有安装mmcv的使用dyhead会进行报错,降低上手难度。 - **20230717-yolov8-v1.8** 1. 修正vanillanet主干进行fuse后没法计算GFLOPs的bug. 2. 添加yolov8-C2f-CloAtt,yolov5-C3-CloAtt. 3. 添加yolov8-vanillanet.yaml. - **20230723-yolov8-v1.9** 1. 利用(ICLR2023)Reversible Column Networks对yolov5,yolov8的结构进行重设计. 2. 支持旋转目标检测2023SOTA的LSKNet主干. 3. 支持旋转目标检测2023SOTA的LSKNet主干中的LSKBlock注意力机制. 4. 更新使用教程中的常见错误. 5. 使用教程中增加常见疑问. - **20230730-yolov8-v1.10** 1. 增加yolov8-C2f-SCConv,yolov5-C3-SCConv.(CVPR 2020 http://mftp.mmcheng.net/Papers/20cvprSCNet.pdf) 2. 增加yolov8-C2f-ScConv,yolov5-C3-ScConv.(CVPR 2023 https://openaccess.thecvf.com/content/CVPR2023/papers/Li_SCConv_Spatial_and_Channel_Reconstruction_Convolution_for_Feature_Redundancy_CVPR_2023_paper.pdf) 3. 更新使用教程. 4. 更新视频百度云链接,增加SCConv和ScConv的使用教程. - **20230730-yolov8-v1.11** 1. yolov8-C2f-ScConv,yolov5-C3-ScConv分别更名为yolov8-C2f-SCcConv,yolov5-C3-SCcConv,因为在windows下命名不会区分大小写,导致解压的时候会出现覆盖请求. 2. 支持MPDiou,具体修改方法请看使用教程. - **20230802-yolov8-v1.11.1** 1. 去除dataloader中的drop_last(ultralytics/yolo/data/build.py, build_dataloader func). 2. 修正MPDiou. - **20230806-yolov8-v1.12** 1. 添加全新自研模块(Light Adaptive-weight downsampling),具体可看使用教程. - **20230808-yolov8-v1.13** 1. 添加全新自研模块(EMSC, EMSCP),具体可看使用教程. 2. 添加RSC-YOLO中的RCSOSA到yolov5和yolov8中. 3. 更新使用教程. - **20230824-yolov8-v1.14** 1. 支持SlideLoss和EMASlideLoss(利用Exponential Moving Average优化mean iou,可当自研创新模块),使用方式具体看使用教程. 2. 
支持KernelWarehouse:Towards Parameter-Efficient Dynamic Convolution(2023最新发布的动态卷积). 3. 支持最新可变形卷积-Dynamic Snake Convolution. 4. 支持Normalized Gaussian Wasserstein Distance(NWD). 5. 增加CPCANet中的CPCA注意力机制. 6. 更新使用教程. - **20230830-yolov8-v1.15** 1. 对检测头进行重设计,支持10种(参数量和计算量更低的)检测头,详细请看使用教程. - **20230904-yolov8-v1.16** 1. 支持DCNV2,DCNV3.详细请看项目百度云视频. 2. 使用DCNV3改进DyHead.(ultralytics/models/v5/yolov5-dyhead-DCNV3.yaml,ultralytics/models/v8/yolov8-dyhead-DCNV3.yaml) 3. 根据YOLOV7-AUX辅助训练头思想,改进YOLOV8,增加辅助训练头,训练时候参与训练,检测时候去掉.(ultralytics/models/v5/yolov5-AuxHead.yaml, ultralytics/models/v8/yolov8-AuxHead.yaml) 4. 增加C3-Faster(ultralytics/models/v5/yolov5-C3-Faster.yaml). 5. 增加C3-ODConv(ultralytics/models/v5/yolov5-C3-ODConv.yaml). 6. 增加C3-Faster-EMA(ultralytics/models/v5/yolov5-C3-Faster-EMA.yaml). 7. 更新使用教程. - **20230909-yolov8-v1.17** 1. 优化辅助训练头部分代码. 2. 修复多卡训练中的一些bug. 3. 更新使用教程.(百度云视频中增加关于C3-XXX和C2f-XXX移植到官方yolov5上的讲解) 4. 支持TAL标签分配策略中使用NWD(具体可看使用教程). - **20230915-yolov8-v1.18** 1. 新增Online Convolutional Re-parameterization (CVPR2022).(超越DBB和RepVGG) (C3-OREPA,C3-REPVGGOREPA,C2f-OREPA,C2f-REPVGGOREPA) 2. 新增FocalModulation. 3. 支持RepViT和SwinTransformer-Tiny主干. 4. 利用OREPA优化自研模块(EMSC,EMSCP). 5. 更新使用教程和百度云视频. - **20230916-yolov8-v1.19** 1. 去除OREPA_1x1,该结构会让模型无法收敛或者NAN. 2. 新增yolov8-fasternet-bifpn和yolov5-fasternet-bifpn. 3. 更新使用教程和百度云视频.(更新OREPA的视频和增加如何看懂代码结构-以C2f-Faster-EMA为例). - **20230919-yolov8-v1.19.1** 1. 修复C2f-ODConv在20epochs后精度异常问题. 2. 修复BAM注意力机制中的padding问题. 3. 修复EfficientAttention(CloFormer中的注意力)注意力机制不能在配置文件添加的问题. 4. 去除C2f-EMSP-OREPA,C2f-EMSCP-OREPA,C3-EMSP-OREPA,C3-EMSCP-OREPA,这部分不稳定,容易出现NAN. 5. 群公告中增加使用前必看的百度云视频链接. - **20230924-yolov8-v1.20** 1. 增加自研注意力机制MPCA(基于CVPR2021 CA注意力机制).详细可看百度云视频. 2. 使用自研注意力机制MPCA强化DCNV2中的offset和mask生成.详细可看百度云视频和使用教程. 3. 把timm配置文件的预训练权重参数改为False,也即是默认不下载和使用预训练权重. 4. 利用华为2023最新GOLD-YOLO中的Gatherand-Distribute进行改进特征融合模块. - **20230927-yolov8-v1.21** 1. 使用YOLO-MS中的MSBlock改进C2f和C3模块,具体请看使用教程. 2. 使用GCNet中的Light-weight Context Guided改进C2f和C3模块,具体请看使用教程. 3. 
使用GCNet中的Light-weight Context Guided Down替换YOLO中的下采样模块,具体请看使用教程. - **20231010-yolov8-v1.22** 1. RepViT同步官方源码. 2. 经实验发现网络全使用C2f-MSBlock和C3-MSBlock不稳定,因此在Neck部分还是使用C2f或C3,具体可参看对应的配置文件. 3. 支持deformableLKA注意力机制,并进行改进C2f和C3,提出C2f_DLKA,C3_DLKA. 4. 使用DAMO-YOLO中的RepGFPN改进yolov8中的Neck. 5. 使用YOLOV6中的EfficientRepBiPAN改进yolov8中的Neck. 6. 新增支持SPDConv进行下采样. 7. 使用Efficientnet中的MBConv与EffectiveSE改进C2f和C3. - **20231020-yolov8-v1.23** 1. 更新使用教程和百度云视频.(更新DAttention使用说明视频). 2. 增加LSKA, SegNext_Attention, DAttention(Vision Transformer with Deformable Attention CVPR2022). 3. 使用LSKA改进SPPF,增强多尺度特征提取能力. 4. 使用[Vision Transformer with Deformable Attention(CVPR2022)]改进C2f,C3. - **20231107-yolov8-v1.24** 1. 新增CVPR2022-CSwinTransformer主干. 2. 新增yolov5-AIFI.yaml,yolov8-AIFI.yaml. 3. 新增使用ParC-Net中的位置感知循环卷积改进C3,C2f. 4. 新增使用DWRSeg中的Dilation-wise Residual(DWR)模块,加强从网络高层的可扩展感受野中提取特征.(yolov5-C3-DWR.yaml,yolov8-C2f-DWR.yaml) 5. 把当前所有的改进同步到ultralytics-8.0.202版本上. 6. 更新新版百度云链接视频. 7. 新增热力图、FPS脚本. - **20231114-yolov8-v1.25** 1. 新增EIou,SIou. 2. 新增Inner-IoU,Inner-GIoU,Inner-DIoU,Inner-CIoU,Inner-EIoU,Inner-SIoU. 3. 使用今年最新的MPDIoU与Inner-IoU相结合得到Inner-MPDIoU. 4. 新增[FLatten Transformer(ICCV2023)](https://github.com/LeapLabTHU/FLatten-Transformer)中的FocusedLinearAttention改进C3,C2f. 5. 更新get_FPS脚本中的模型导入方式,避免一些device报错. 6. 更新百度云链接视频-20231114版本更新说明. - **20231114-yolov8-v1.26** 1. 修正MPDIOU中的mpdiou_hw参数. 2. 更新使用教程. - **20231129-yolov8-v1.27** 1. 新增Mixed Local Channel Attention改进C2f和C3. 2. 新增AKConv改进C2f和C3. 3. 更新使用教程. 4. 更新百度云链接视频-20231129版本更新说明. - **20231207-yolov8-v1.28** 1. 新增支持2023最新大卷积核CNN架构RepLKNet升级版-UniRepLKNet. 2. 新增UniRepLKNet中的[UniRepLKNetBlock, DilatedReparamBlock]改进C3和C2f. 3. 使用UniRepLKNet中的DilatedReparamBlock对DWRSeg中的Dilation-wise Residual(DWR)模块进行二次创新后改进C3和C2f. 4. 修复get_FPS.py测速前没有进行fuse的问题. 5. 更新使用教程. 6. 更新百度云链接视频-20231207版本更新说明. - **20231217-yolov8-v1.29** 1. 新增ASF-YOLO中的Attentional Scale Sequence Fusion,并在其基础上增加P2检测层并进行优化网络结构. 2. 新增使用DualConv打造CSP Efficient Dual Layer Aggregation Networks. 3. 更新使用教程. 4. 
更新百度云链接视频-20231217版本更新说明. - **20231227-yolov8-v1.30** 1. 新增支持TransNeXt主干和TransNeXt中的聚焦感知注意力机制. 2. 新增U-NetV2中的Semantics and Detail Infusion Module,分别对BIFPN和PAFPN中的feature fusion部分进行二次创新. 3. 更新使用教程. 4. 更新百度云链接视频-20231227版本更新说明. - **20240104-yolov8-v1.31** 1. 新增Shape-IoU,Inner-Shape-IoU. 2. 更新使用教程. 3. 更新百度云链接视频-20230104版本更新说明. - **20240111-yolov8-v1.32** 1. 支持FocalLoss,VarifocalLoss,QualityfocalLoss. 2. 支持Wise-IoU(v1,v2,v3)系列(IoU,WIoU,EIoU,GIoU,DIoU,CIoU,SIoU,MPDIoU,ShapeIoU). 3. 支持Inner-Wise-IoU(v1,v2,v3)系列(IoU,WIoU,EIoU,GIoU,DIoU,CIoU,SIoU,MPDIoU,ShapeIoU). 4. 更新使用教程. 5. 更新百度云链接视频-20230111版本更新说明. - **20240116-yolov8-v1.33** 1. 使用ASF-YOLO中Attentional Scale Sequence Fusion与GOLD-YOLO中的Gatherand-Distribute进行二次创新结合. 2. 支持最新的DCNV4,C2f-DCNV4,C3-DCNV4,并使用DCNV4对DyHead进行二次创新(DyHead_DCNV4). 3. 修复不使用wise的情况下断点续训的bug. 4. 更新使用教程. 5. 更新百度云链接视频-20230116版本更新说明. - **20240122-yolov8-v1.34** 1. 使用[MFDS-DETR](https://github.com/JustlfC03/MFDS-DETR)中的HS-FPN改进YOLOV5、YOLOV8中的Neck. 2. 对[MFDS-DETR](https://github.com/JustlfC03/MFDS-DETR)中的HS-FPN进行二次创新后得到HSPAN改进YOLOV5、YOLOV8中的Neck. 3. 增加CARAFE轻量化上采样算子. 4. 增加DySample(ICCV2023)动态上采样算子. 5. 增加Haar wavelet downsampling下采样算子. 6. 支持soft-nms.(IoU,GIoU,DIoU,CIoU,EIoU,SIoU,ShapeIoU) 7. 更新使用教程. 8. 更新百度云链接视频-20230122版本更新说明. - **20240203-yolov8-v1.35** 1. 增加Focaler-IoU(IoU,GIoU,DIoU,CIoU,EIoU,SIoU,WIoU,MPDIoU,ShapeIoU). 2. 增加RepGFPN与DySample的二次创新组合. 3. 增加ASF-YOLO中的ASSF与DySample的二次创新组合. 4. 增加HS-PAN与DySample的二次创新组合. 5. 使用遮挡感知注意力SEAM,MultiSEAM改进Head,得到具有遮挡感知识别的SEAMHead,MultiSEAMHead. 6. 优化plot_result.py,使用线性插值来填充inf或者nan的数据,降低出现乱码问题的概率. 7. 更新使用教程. 8. 更新百度云链接视频-20230203版本更新说明. - **20240208-yolov8-v1.36** 1. 将所有改进代码同步到8.1.9上. - **20240216-yolov8-v1.37** 1. 增加EMO模型中的iRMB模块,并使用(EfficientViT-CVPR2023)中的CascadedAttention对其二次创新得到iRMB_Cascaded. 2. 新增Shift-ConvNets相关改进内容.(rtdetr-SWC.yaml,rtdetr-R50-SWC.yaml,yolov8-detr-C2f-SWC.yaml,yolov5-detr-C3-SWC.yaml) 3. 使用UniRepLKNet中的DilatedReparamBlock对EMO中的iRMB进行二次创新. 4. 使用Shift-ConvNets中的具有移位操作的卷积对EMO中的iRMB进行二次创新. 5. 
修复一些已知问题. 6. 更新使用教程. 7. 百度云视频增加20240216更新说明. - **20240219-yolov8-v1.38** 1. 使用最新的Mamba架构(号称超越Transformer的新架构)改进C2f(提供两种改进方式). 2. 新增Powerful-IoU,Powerful-IoUV2,Inner-Powerful-IoU,Inner-Powerful-IoUV2,Focaler-Powerful-IoU,Focaler-Powerful-IoUV2,Wise-Powerful-IoU(v1,v2,v3),Wise-Powerful-IoUV2(v1,v2,v3)系列. 3. 修复一些已知问题. 4. 更新使用教程. 5. 百度云视频增加20240219更新说明. - **20240222-yolov8-v1.39** 1. 新增YOLOV9中的RepNCSPELAN模块. 2. 使用DBB,OREPA,DilatedReparamBlock对YOLOV9中的RepNCSPELAN模块进行二次创新. 3. 更新使用教程. 4. 百度云视频增加20240222更新说明. - **20240229-yolov8-v1.40** 1. 新增YOLOV9中的ADown下采样模块. 2. 新增YOLOV7中的下采样模块. 3. 新增YOLOV9中的programmable gradient information,并且PGI模块可以在训练后去除. 4. 更新使用教程. 5. 百度云视频增加20240229更新说明. - **20240303-yolov8-v1.41** 1. 新增CVPR2024-parameternet中的GhostModule与DynamicConv. 2. 使用CVPR2024-parameternet中的DynamicConv对CVPR2024-RTDETR中的HGBlock进行二次创新. 3. 更新使用教程. 4. 百度云视频增加20240303更新说明. - **20240309-yolov8-v1.42** 1. 新增拆分CVPR2024 RepVIT里面的block,提出C2f-RVB、C2f-RVB-EMA. 2. 新增Lightweight Object Detection论文中的Dynamic Group Convolution Shuffle Transformer. 3. 新增自研Lightweight Shared Convolutional Detection Head,支持Detect、Seg、Pose、Obb. 4. 更新使用教程. 5. 百度云视频增加20240309更新说明. - **20240314-yolov8-v1.43** 1. 新增自研Task Align Dynamic Detection Head,支持Detect、Seg、Pose、Obb. 2. 更新使用教程,新增几个常见疑问回答. 3. 修复shapeiou调用不生效的问题. 4. 百度云视频增加20240314更新说明. - **20240323-yolov8-v1.44** 1. 新增CVPR2024-RMT主干,并支持RetBlock改进C3、C2f. 2. 新增2024年新出的Efficient Local Attention,并用其对HSFPN进行二次创新,并加入自研检测头TADDH. 3. 使用CVPR2021-CoordAttention对HSFPN进行二次创新. 4. 更新使用教程,增加多个常见疑问解答. 5. 百度云视频增加20240323更新说明. - **20240330-yolov8-v1.45** 1. 新增CVPR2024 PKINet主干. 2. 新增CVPR2024 PKINet中的PKIModule和CAA模块,提出C2f-PKI. 3. 使用CVPR2024 PKINet中的Context Anchor Attention改进RepNCSPELAN、HSFPN. 4. 更新使用教程. 5. 百度云视频增加20240330更新说明. - **20240406-yolov8-v1.46** 1. 新增CVPR2024 Frequency-Adaptive Dilated Convolution. 2. 新增自研Focusing Diffusion Pyramid Network. 3. 更新使用教程. 4. 百度云视频增加20240406更新说明. - **20240408-yolov8-v1.47** 1. 修复自研Focusing Diffusion Pyramid Network的一个小bug. 2.
新增使用自研特征聚焦扩散金字塔网络和自研任务对齐动态检测头相结合的配置文件yolov8-FDPN-TADDH.yaml 3. 新增HCFNet针对小目标分割的Parallelized Patch-Aware Attention Module改进C2f. 4. 新增HCFNet针对小目标分割的Dimension-Aware Selective Integration Module对自研Focusing Diffusion Pyramid Network再次进行创新. 5. 更新使用教程. 6. 百度云视频增加20240408更新说明. - **20240414-yolov8-v1.48** 1. 新增Cross-Scale Mutil-Head Self-Attention,对Mutil-Head Self-Attention进行二次创新. 2. 更新使用教程. 3. 百度云视频增加20240414更新说明. - **20240420-yolov8-v1.49** 1. 新增A Robust Feature Downsampling Module for Remote Sensing Visual Tasks中的下采样. 2. 新增Context and Spatial Feature Calibration for Real-Time Semantic Segmentation中的Context and Spatial Feature Calibration. 3. 更新使用教程. 4. 百度云视频增加20240420更新说明. - **20240428-yolov8-v1.50** 1. 修复20240420更新中的Context and Spatial Feature Calibration序号错误问题. 2. 新增支持mobilenetv4-backbone. 3. 新增支持content-guided attention fusion改进yolov8-neck. 4. 新增支持使用CAFM对CGAFusion进行二次改进,得到CAFMFusion改进yolov8-neck. 5. 更新使用教程. 6. 百度云视频增加20240428更新说明. - **20240501-yolov8-v1.51** 1. get_FPS.py脚本新增可以通过yaml测试推理速度. 2. 新增自研RGCSPELAN,其比C3、ELAN、C2f、RepNCSPELAN更低参数量和计算量更快推理速度. 3. 更新使用教程. 4. 百度云视频增加20240501更新说明. - **20240505-yolov8-v1.52** 1. 新增LADH.(Lightweight Asymmetric Detection Head). 2. 使用CVPR2024-TransNext中的Convolutional GLU对CVPR2023-FasterBlock进行二次创新. 3. 更新使用教程. 4. 百度云视频增加20240505更新说明. - **20240512-yolov8-v1.53** 1. 基于LSCD自研轻量化检测头再次进行改进得到LSCSBD. 2. 新增PSFusion中的superficial detail fusion module、profound semantic fusion module改进yolov8-neck. 3. 更新使用教程. 4. 百度云视频增加20240512更新说明. - **20240513-yolov8-v1.54** 1. 支持CVPR2024-StarNet,新一代SOTA轻量化模型. 2. 使用CVPR2024-StarNet对C2f进行创新得到C2f-Star. 3. 使用CVPR2024-StarNet与CVPR2024-PKINet进行组合创新得到C2f-Star-CAA. 4. 增加轻量化模型组合配置文件,融合StarNet、C2f-Star、LSCD. 5. 更新使用教程. 6. 百度云视频增加20240513更新说明. - **20240523-yolov8-v1.55** 1. KAN In! Mamba Out!,集成pytorch-kan-conv,支持多种KAN变种! 2. 同步DCNV4-CVPR2024最新代码. 3. 修复AIFI在某些组合会报错的问题. 4. 更新使用教程. 5. 百度云视频增加20240523更新说明. - **20240526-yolov8-v1.56** 1. 支持YOLOV8-NMSFree,仿照yolov10的思想采用双重标签分配和一致匹配度量进行训练,后处理不需要NMS! 2. 
新增边缘信息增强模块自研模块,EIEStem、EIEM。 3. 更新使用教程. 4. 百度云视频增加20240526更新说明. - **20240601-yolov8-v1.57** 1. 新增自研ContextGuideFPN. 2. 新增detail-enhanced convolution改进c2f. 3. 新增自研LSDECD,在LSCD的基础上引入可重参数化的detail-enhanced convolution. 4. 新增自研SMPCGLU,里面的模块分别来自CVPR2023和CVPR2024. 5. 更新使用教程. 6. 百度云视频增加20240601更新说明. - **20240609-yolov8-v1.58** 1. 新增支持物理传热启发的视觉表征模型vHeat中的vHeatBlock. 2. 新增自研重校准特征金字塔网络(Re-CalibrationFPN),推出多个版本(P2345,P345,P3456). 3. 更新使用教程. 4. 百度云视频增加20240609更新说明. - **20240613-yolov8-v1.59** 1. 新增WaveletPool改进上采样和下采样. 2. 新增自研Cross Stage Partial - Partially Transformer Block模块. 3. 更新使用教程. 4. 百度云视频增加20240613更新说明. - **20240619-yolov8-v1.60** 1. 集成mamba-yolo. 2. 新增GLSA改进yolov8-neck. 3. 新增GLSA对BIFPN进行二次创新. 4. 更新使用教程. 5. 百度云视频增加20240619更新说明. - **20240627-yolov8-v1.61** 1. 新增UCTransNet中的ChannelTransformer改进yolov8-neck. 2. 新增自研SmallObjectEnhancePyramid. 3. 更新使用教程. 4. 百度云视频增加20240627更新说明. - **20240707-yolov8-v1.62** 1. 更新使用教程,增加常见疑问. - **20240713-ultralytics-v1.63** 1. ultralytics版本已更新至8.2.50,后续会更新YOLOv8、YOLOv10的改进方案. 2. 新增YOLOV10改进、后续会一步一步更新V10的配置文件.(目前更新了backbone系列,一些自研系列的改进到v10中) 3. 更新使用教程. 4. 百度云视频增加20240713更新说明. 5. 百度云视频更新(断点续训教程、计算COCO指标教程、plot_result.py使用教程、项目使用教程必看系列、YOLOV10版本切换教程一) 6. 补充了EMSC和EMSCP的结构图. - **20240720-ultralytics-v1.64** 1. 修复一些已知问题. 2. 新增自研Context-Guided Spatial Feature Reconstruction Feature Pyramid Network. 3. 新增Wavelet Convolutions for Large Receptive Fields中的WTConv改进C2f. 4. 新增UBRFC-Net中的Adaptive Fine-Grained Channel Attention. 5. 更新使用教程. 6. 百度云视频增加20240720更新说明. 7. 增加v10多个改进、主要是上下采样系列. - **20240729-ultralytics-v1.65** 1. 新增自研FeaturePyramidSharedConv. 2. 新增ECCV2024-SMFANet中的Feature Modulation block. 3. 增加v10多个改进. 4. 更新使用教程. 5. 百度云视频增加20240729更新说明. - **20240803-ultralytics-v1.66** 1. 新增LDConv. 2. 新增Rethinking Performance Gains in Image Dehazing Networks中的gConv. 3. 新增MAF-YOLO中的MAFPN,并利用BIFPN的思想对MAFPN进行二次创新得到BIMAFPN. 4. 更新使用教程. 5. 百度云视频增加20240803更新说明. - **20240813-ultralytics-v1.67** 1. 新增APT-TAL标签分配策略. 2. 新增YOLO-MIF中的WDBB、DeepDBB的重参数化模块. 3. 
新增SLAB中的RepBN改进AIFI. 4. 更新使用教程. 5. 百度云视频增加20240813更新说明. - **20240822-ultralytics-v1.68** 1. 新增CAS-ViT的AdditiveBlock. 2. 新增TransNeXt的Convolutional GLU对CAS-ViT的AdditiveBlock进行二次创新. 3. 新增自研Efficient Multi-Branch&Scale FPN. 4. 新增v10多个改进. 5. 更新使用教程. 6. 百度云视频增加20240822更新说明. - **20240831-ultralytics-v1.69** 1. 新增CMTFUnet和TransNext的二次创新模块. 2. 新增自研CSP-Partial Multi-Scale Feature Aggregation. 3. 更新使用教程. 4. 百度云视频增加20240831更新说明. - **20240908-ultralytics-v1.70** 1. 新增Cross-Layer Feature Pyramid Transformer for Small Object Detection in Aerial Images中的CFPT. 2. 新增ICLR2024中的MogaBlock. 3. 新增v10多个改进. 4. 更新使用教程. 5. 百度云视频增加20240908更新说明. - **20240920-ultralytics-v1.71** 1. 新增CVPR2024-SHViT中的SHSABlock和其的二次创新. 2. 新增BIBM2024-SMAFormer中的SMAFormerBlock和其的二次创新. 3. 新增TPAMI2024-FreqFusion中的FreqFusion改进Neck. 4. 新增v10多个改进. 5. 更新使用教程. 6. 百度云视频增加20240920更新说明. - **20241007-ultralytics-v1.72** 1. 新增自研MutilBackBone-DynamicAlignFusion. 2. 新增Metaformer TPAMI2024的IdentityFormer、RandomMixingFormer、PoolingFormer、ConvFormer、CaFormer改进C2f. 3. 新增Metaformer TPAMI2024的IdentityFormer、RandomMixingFormer、PoolingFormer、ConvFormer、CaFormer与CVPR2024-TranXNet的二次创新模块改进C2f. 4. 更新使用教程. 5. 百度云视频增加20241007更新说明. - **20241024-ultralytics-v1.73** 1. 增加v10多个改进. 2. 新增自研CSP-MutilScaleEdgeInformationEnhance. 3. 新增Efficient Frequency-Domain Image Deraining with Contrastive Regularization中的Fused_Fourier_Conv_Mixer. 4. 更新使用教程. 5. 百度云视频增加20241024更新说明. - **20241031-ultralytics-v1.74** 1. 新增v8、v10自研Rep Shared Convolutional Detection Head. 2. 更新使用教程. 3. 百度云视频增加20241031更新说明. - **20241109-ultralytics-v1.75** 1. 新增自研CSP-FreqSpatial. 2. 新增SFHformer ECCV2024中的block改进C2f. 3. 新增Revitalizing Convolutional Network for Image Restoration TPAMI2024中的MSM改进C2f. 4. 增加v10多个改进. 5. 更新使用教程. 6. 百度云视频增加20241109更新说明. - **20241122-ultralytics-v1.76** 1. 基于自研CSP-MutilScaleEdgeInformationEnhance再次创新得到CSP-MutilScaleEdgeInformationSelect. 2. 新增Pattern Recognition 2024|DRANet中的HDRAB和RAB模块改进C2f. 3. 新增ECCV2022-ELAN中的Local feature extraction改进C2f. 4. 增加v10多个改进. 5. 
更新使用教程. 6. 百度云视频增加20241122更新说明. - **20241204-ultralytics-v1.77** 1. 新增自研GlobalEdgeInformationTransfer. 2. 新增FreqFormer的Frequency-aware Cascade Attention改进C2f. 3. 更新使用教程. 4. 百度云视频增加20241204更新说明. - **20241219-ultralytics-v1.78** 1. 新增CAMixerSR中的CAMixer改进C2f. 2. 新增支持Hyper-YOLO,并可以利用项目自带的改进改进Hyper-YOLO. 3. 新增Hyper-YOLO中的Hypergraph Computation in Semantic Space和Mixed Aggregation Network的改进. 4. 更新使用教程. 5. 百度云视频增加20241219更新说明. - **20250101-ultralytics-v1.79** 1. 新增基于Hyper-YOLO中的Mixed Aggregation Network三个二次改进系列. 2. 新增使用MSA^2 Net中的Multi-Scale Adaptive Spatial Attention Gate改进yolo11-neck. 3. 新增使用MSA^2 Net中的Multi-Scale Adaptive Spatial Attention Gate改进自研系列的MutilBackbone. 4. 更新使用教程. 5. 百度云视频增加20250101更新说明. - **20250119-ultralytics-v1.80** 1. 新增CRAFT-SR中的high-frequency enhancement residual block. 2. 新增AAAI2025-TBSN中的DTAB. 3. 新增ECCV2024-FSEL中的多个模块. 4. 新增ACMMM2024-WFEN中的小波变换特征融合. 5. 新增AAAI2025 Pinwheel-shaped Convolution and Scale-based Dynamic Loss for Infrared Small Target Detection中的Pinwheel-shaped Convolution类型改进. 6. 新增AAAI2025 ConDSeg中的ContrastDrivenFeatureAggregation与ACMMM2024 WFEN中的小波变换进行创新. 7. 更新使用教程. 8. 百度云视频增加20250119更新说明. - **20250207-ultralytics-v1.81** 1. 新增遥感目标检测Strip R-CNN中的StripBlock及其二次创新. 2. 新增BIBM2024 Spatial-Frequency Dual Domain Attention Network For Medical Image Segmentation中的Frequency-Spatial Attention和Multi-scale Progressive Channel Attention. 3. 新增ICLR2025 Kolmogorov-Arnold Transformer中的KAT及其配合FasterBlock的二次创新.<此模块需要编译> 4. 更新使用教程. 5. 百度云视频增加20250207更新说明. - **20250220-ultralytics-v1.82** 1. 新增自研模块DynamicInceptionDWConv2d. 2. 新增GlobalFilter和DynamicFilter. 3. 更新使用教程. 4. 百度云视频增加20250220更新说明. - **20250308-ultralytics-v1.83** 1. 新增自研模块Hierarchical Attention Fusion并提供多种使用方式. 2. 新增ICLR2025-Token Statistics Transformer改进PSA. 3. 新增MHAF-YOLO中的RepHMS.<这个是YOLO群内的一个博士新作品> 4. 更新使用教程. 5. 百度云视频增加20250308更新说明. - **20250323-ultralytics-v1.84** 1. 新增CVPR2025-MambaIR的模块. 2. 新增CVPR2025-SCSegamba中的模块. 3. 新增CVPR2025-MambaOut中的模块. 4. 更新使用教程. 5. 百度云视频增加20250323更新说明. 
- **20250406-ultralytics-v1.85** 1. 新增CVPR2025-DEIM中的Localization Quality Estimation改进YOLOHead使其分类头同时具备分类score和预测框质量score. 2. 新增Localization Quality Estimation - Lightweight Shared Convolutional Detection Head. 3. 新增CVPR2025-EfficientViM和其与CVPR2024-TransNeXt的二次创新后的模块. 4. 更新使用教程. 5. 百度云视频增加20250406更新说明. - **20250426-ultralytics-v1.86** 1. 新增CVPR2024-EMCAD中的EUCB上采样. 2. 新增CVPR2024-EMCAD与CVPR2025-BHViT的二次创新模块. 3. 新增CVPR2024-DCMPNet的多个模块和二次创新的模块. 4. 新增统计配置文件的计算量和参数量并排序的脚本. 5. 更新使用教程. 6. 百度云视频增加20250426更新说明. - **20250514-ultralytics-v1.87** 1. 新增LEGNet的LoGStem和LFEModule. 2. 新增新一代轻量化SOTA的CVPR2025-LSNet的LSNet和LSConv的多个改进和二次创新改进. 3. 新增CVPR2025-OverLock中的多个模块. 4. 修改保存权重的逻辑,训练结束(注意是正常训练结束后,手动停止的没有)后统一会保存4个模型,分别是best.pt、last.pt、best_fp32.pt、last_fp32.pt,其中不带fp32后缀的是fp16格式保存的,但由于有些模块对fp16非常敏感,会出现后续使用val.py的时候精度为0的情况,这种情况下可以用后缀带fp32去测试。 5. 更新使用教程. 6. 百度云视频增加20250514更新说明. - **20250601-ultralytics-v1.88** 1. 新增TransMamba的改进. 2. 新增CVPR2025-DarkIR的改进. 3. 新增CVPR2025-EVSSM的改进. 4. 更新使用教程. 5. 百度云视频增加20250601更新说明. - **20250629-ultralytics-v1.89** 1. 新增ECCV2024-rethinkingfpn中的模块,并对原创改进SOEP再次创新。 2. 新增CVPR2024-SFSConv的模块. 3. 新增CVPR2025-GroupMamba中的模块. 4. 新增CVPR2025-MambaVision中的模块. 5. 新增AAAI2025-FBRTYOLO中的模块. 6. 更新使用教程. 7. 百度云视频增加20250629更新说明. 8. 修复在torch2.6.0以及以上的版本会出现模型读取失败的问题. - **20250727-ultralytics-v1.90** 1. 新增Pyramid Sparse Transformer改进yolo11-neck. 2. 新增Pyramid Sparse Transformer对SOEP再创新. 3. 新增MIA2025-FourierConv. 4. 新增AAAI2025的HS-FPN. 5. 新增TGRS2025-UMFormer中的模块. 6. 更新使用教程. 7. 百度云视频增加20250727更新说明. - **20250822-ultralytics-v1.91** 1. 新增ICCV2025-ESC中的多个改进。 2. 新增ICCV2025-UniConvBlock中的改进。 3. 更新使用教程. 4. 百度云视频增加20250822更新说明. - **20250919-ultralytics-v1.92** 1. 新增CVPR2025-GCConv模块. 2. 新增AAAI2024-CFBlock模块. 3. 新增ICCV2023-FastViT中的RepStem模块. 4. 更新使用教程. 5. 百度云视频增加20250919更新说明. - **20251028-ultralytics-v1.93** 1. 新增TGRS2025-ASCNet中的模块. 2. 新增ICCV2025-HFRB模块. 3. 新增ICIP2025-BEVANET中的模块. 4. 更新使用教程. 5. 百度云视频增加20251028更新说明. - **20251129-ultralytics-v1.94** 1. 
def _forward_once(self, x, profile=False, visualize=False):
    """Run one forward pass through ``self.model`` layer by layer.

    Written as a method: it reads ``self.model`` (iterable of layers carrying
    ``f``/``i``/``type`` attributes), ``self.save`` (indices whose outputs
    later layers need) and calls ``self._profile_one_layer``.

    Layers that carry a ``backbone`` attribute are treated as multi-output
    backbones: their result is a list of feature maps that is front-padded
    with ``None`` to five entries, every entry gets its own slot in the
    output cache, and the last entry is passed on to the next layer.

    Args:
        x: Network input.
        profile: When truthy, profile every layer via ``_profile_one_layer``.
        visualize: When truthy, passed as ``save_dir`` to
            ``feature_visualization`` after every layer.

    Returns:
        The output of the last layer.
    """
    y, dt = [], []  # cached outputs, profiling times
    for m in self.model:
        if m.f != -1:  # input does not come from the previous layer
            x = y[m.f] if isinstance(m.f, int) else [x if j == -1 else y[j] for j in m.f]  # from earlier layers
        if profile:
            self._profile_one_layer(m, x, dt)
        if hasattr(m, 'backbone'):
            x = m(x)
            for _ in range(5 - len(x)):
                x.insert(0, None)  # front-pad so the list always has five slots
            # If exactly one layer ran before the backbone, the feature at
            # position k lands at cache index k + 1, so shift the save lookup.
            offset = 1 if len(y) == 1 else 0
            for level, feat in enumerate(x):
                y.append(feat if level + offset in self.save else None)
            x = x[-1]  # the last feature map feeds the next layer
        else:
            x = m(x)  # run
            y.append(x if m.i in self.save else None)  # save output
        if visualize:
            feature_visualization(x, m.type, m.i, save_dir=visualize)
    return x


def parse_model(d, ch):  # model_dict, input_channels(3)
    """Build the layer stack described by a YOLO model.yaml dictionary.

    Module names that cannot be resolved to a Python object are kept as
    strings and created with ``timm.create_model(..., features_only=True)``.
    Such a backbone reports a list of channel counts; the list is front-padded
    with zeros to five entries and, from that layer on, every layer index is
    shifted by 4 to account for the extra slots the backbone occupies.

    Args:
        d: Model dictionary. Reads the keys ``anchors``, ``nc``,
            ``depth_multiple``, ``width_multiple``, ``backbone``, ``head`` and
            the optional ``activation``.
        ch: List of input channel counts; ``ch[-1]`` seeds the running
            output-channel value.

    Returns:
        Tuple ``(nn.Sequential of all layers, sorted list of layer indices
        whose outputs must be kept)``.

    Note:
        Module names, arguments and the activation string are passed to
        ``eval``. Only load model configs from trusted sources.
    """
    LOGGER.info(f"\n{'':>3}{'from':>18}{'n':>3}{'params':>10} {'module':<40}{'arguments':<30}")
    anchors, nc, gd, gw, act = d['anchors'], d['nc'], d['depth_multiple'], d['width_multiple'], d.get('activation')
    if act:
        Conv.default_act = eval(act)  # redefine default activation, i.e. Conv.default_act = nn.SiLU()
        RepConvN.default_act = eval(act)  # evaluated separately so each class gets its own instance
        LOGGER.info(f"{colorstr('activation:')} {act}")  # print
    na = (len(anchors[0]) // 2) if isinstance(anchors, list) else anchors  # number of anchors
    no = na * (nc + 5)  # number of outputs = anchors * (classes + 5)

    is_backbone = False  # becomes True (and stays True) once a multi-output backbone was parsed
    layers, save, c2 = [], [], ch[-1]  # layers, savelist, ch out
    for i, (f, n, m, args) in enumerate(d['backbone'] + d['head']):  # from, number, module, args
        t = m
        try:
            m = eval(m) if isinstance(m, str) else m  # eval strings
        except Exception:
            # Deliberate best effort: the name is not a Python object in scope,
            # so it stays a string and is resolved by the timm branch below.
            # (Was a bare ``except``, which also swallowed KeyboardInterrupt.)
            pass
        for j, a in enumerate(args):
            with contextlib.suppress(NameError):
                args[j] = eval(a) if isinstance(a, str) else a  # eval strings

        n = n_ = max(round(n * gd), 1) if n > 1 else n  # depth gain
        if m in {
                Conv, AConv, ConvTranspose, Bottleneck, SPP, SPPF, DWConv, BottleneckCSP, nn.ConvTranspose2d,
                DWConvTranspose2d, SPPCSPC, ADown, RepNCSPELAN4, SPPELAN}:
            c1, c2 = ch[f], args[0]
            if c2 != no:  # if not output
                c2 = make_divisible(c2 * gw, 8)

            args = [c1, c2, *args[1:]]
            if m in {BottleneckCSP, SPPCSPC}:
                args.insert(2, n)  # number of repeats
                n = 1
        elif m is nn.BatchNorm2d:
            args = [ch[f]]
        elif m is Concat:
            c2 = sum(ch[x] for x in f)
        elif m is Shortcut:
            c2 = ch[f[0]]
        elif m is ReOrg:
            c2 = ch[f] * 4
        elif m is CBLinear:
            c2 = args[0]
            c1 = ch[f]
            args = [c1, c2, *args[1:]]
        elif m is CBFuse:
            c2 = ch[f[-1]]
        # TODO: channel, gw, gd
        elif m in {Detect, DualDetect, TripleDetect, DDetect, DualDDetect, TripleDDetect, Segment, DSegment,
                   DualDSegment, Panoptic}:
            args.append([ch[x] for x in f])
            if m in {Segment, DSegment, DualDSegment, Panoptic}:
                args[2] = make_divisible(args[2] * gw, 8)
        elif m is Contract:
            c2 = ch[f] * args[0] ** 2
        elif m is Expand:
            c2 = ch[f] // args[0] ** 2
        elif isinstance(m, str):
            # Unresolved name: build it as a timm feature extractor.
            t = m
            m = timm.create_model(m, pretrained=args[0], features_only=True)
            c2 = m.feature_info.channels()
        else:
            c2 = ch[f]

        # A list of channels marks a multi-output backbone; CBLinear also
        # yields a list but is an ordinary single-slot layer.
        multi_out = isinstance(c2, list) and m not in {CBLinear}
        if multi_out:
            is_backbone = True
            m_ = m
            m_.backbone = True
        else:
            m_ = nn.Sequential(*(m(*args) for _ in range(n))) if n > 1 else m(*args)  # module
            t = str(m)[8:-2].replace('__main__.', '')  # module type

        n_params = sum(x.numel() for x in m_.parameters())  # number params
        layer_idx = i + 4 if is_backbone else i  # shifted once a backbone occupies five slots
        m_.i, m_.f, m_.type, m_.np = layer_idx, f, t, n_params  # attach index, 'from' index, type, number params
        LOGGER.info(f'{i:>3}{str(f):>18}{n_:>3}{n_params:10.0f} {t:<40}{str(args):<30}')  # print
        save.extend(x % layer_idx for x in ([f] if isinstance(f, int) else f) if x != -1)  # append to savelist
        layers.append(m_)
        if i == 0:
            ch = []
        if multi_out:
            for _ in range(5 - len(c2)):
                c2.insert(0, 0)  # front-pad the channel list to five entries
            ch.extend(c2)
        else:
            ch.append(c2)
    return nn.Sequential(*layers), sorted(save)
17 # elan-2 block [-1, 1, RepNCSPELAN4, [512, 512, 256, 1]], # 18 (P5/32-large) # multi-level reversible auxiliary branch # routing [3, 1, CBLinear, [[256]]], # 19 [4, 1, CBLinear, [[256, 512]]], # 20 [5, 1, CBLinear, [[256, 512, 512]]], # 21 # conv down [0, 1, Conv, [64, 3, 2]], # 22-P1/2 # conv down [-1, 1, Conv, [128, 3, 2]], # 23-P2/4 # elan-1 block [-1, 1, RepNCSPELAN4, [256, 128, 64, 1]], # 24 # avg-conv down fuse [-1, 1, ADown, [256]], # 25-P3/8 [[19, 20, 21, -1], 1, CBFuse, [[0, 0, 0]]], # 26 # elan-2 block [-1, 1, RepNCSPELAN4, [512, 256, 128, 1]], # 27 # avg-conv down fuse [-1, 1, ADown, [512]], # 28-P4/16 [[20, 21, -1], 1, CBFuse, [[1, 1]]], # 29 # elan-2 block [-1, 1, RepNCSPELAN4, [512, 512, 256, 1]], # 30 # avg-conv down fuse [-1, 1, ADown, [512]], # 31-P5/32 [[21, -1], 1, CBFuse, [[2]]], # 32 # elan-2 block [-1, 1, RepNCSPELAN4, [512, 512, 256, 1]], # 33 # detect [[27, 30, 33, 12, 15, 18], 1, DualDDetect, [nc]], # DualDDetect(A3, A4, A5, P3, P4, P5) ]