Repository: wang-xinyu/tensorrtx Branch: master Commit: 2990f34a8502 Files: 744 Total size: 5.6 MB Directory structure: gitextract_0y61g4fh/ ├── .clang-format ├── .cmake-format.yaml ├── .github/ │ ├── ISSUE_TEMPLATE/ │ │ └── tensorrtx-issue-template.md │ ├── stale.yml │ └── workflows/ │ └── pre-commit.yml ├── .gitignore ├── .pre-commit-config.yaml ├── LICENSE ├── README.md ├── alexnet/ │ ├── CMakeLists.txt │ ├── FindTensorRT.cmake │ ├── README.md │ ├── alexnet.cc │ ├── alexnet.py │ ├── gen_wts.py │ ├── logging.h │ ├── macros.h │ └── utils.h ├── arcface/ │ ├── CMakeLists.txt │ ├── README.md │ ├── arcface-mobilefacenet.cpp │ ├── arcface-r100.cpp │ ├── arcface-r50.cpp │ ├── gen_wts.py │ ├── logging.h │ ├── macros.h │ ├── prelu.cu │ └── prelu.h ├── assets/ │ └── 6.pgm ├── centernet/ │ ├── README.md │ ├── centernet.py │ ├── dcnv2Plugin/ │ │ ├── CMakeLists.txt │ │ ├── dcn_v2_im2col_cuda.cu │ │ ├── dcn_v2_im2col_cuda.h │ │ ├── dcnv2Plugin.cpp │ │ └── dcnv2Plugin.h │ └── sample/ │ ├── common.py │ └── test.py ├── contributing.md ├── convnextv2/ │ ├── CMakeLists.txt │ ├── README.md │ ├── config.yaml │ ├── gen_wts.py │ ├── inference.py │ └── src/ │ ├── LayerNormPlugin.cu │ ├── LayerNormPlugin.h │ ├── convnextv2.cpp │ ├── inference_cpp.cpp │ └── logging.h ├── crnn/ │ ├── CMakeLists.txt │ ├── README.md │ ├── crnn.cpp │ ├── genwts.py │ └── logging.h ├── csrnet/ │ ├── CMakeLists.txt │ ├── README.md │ ├── config.h │ ├── csrnet.cpp │ ├── gen_wts.py │ ├── logging.h │ └── macros.h ├── dbnet/ │ ├── CMakeLists.txt │ ├── README.md │ ├── clipper/ │ │ ├── CMakeLists.txt │ │ ├── clipper.cpp │ │ └── clipper.hpp │ ├── common.hpp │ ├── dbnet.cpp │ ├── logging.h │ └── utils.h ├── densenet/ │ ├── CMakeLists.txt │ ├── README.md │ ├── densenet121.cpp │ ├── densenet121.py │ └── logging.h ├── detr/ │ ├── CMakeLists.txt │ ├── README.md │ ├── backbone.hpp │ ├── calibrator.hpp │ ├── common.hpp │ ├── detr.cpp │ ├── gen_wts.py │ ├── logging.h │ └── macros.h ├── docker/ │ ├── README.md │ ├── 
tensorrtx-docker-compose.yml │ └── x86_64.dockerfile ├── efficient_ad/ │ ├── CMakeLists.txt │ ├── README.md │ ├── efficientAD_det.cpp │ └── src/ │ ├── config.h │ ├── cuda_utils.h │ ├── logging.h │ ├── macros.h │ ├── model.cpp │ ├── model.h │ ├── postprocess.h │ └── utils.h ├── efficientnet/ │ ├── CMakeLists.txt │ ├── README.md │ ├── efficientnet.cpp │ ├── gen_wts.py │ ├── logging.h │ └── utils.hpp ├── ghostnet/ │ ├── README.md │ ├── ghostnetv1/ │ │ ├── CMakeLists.txt │ │ ├── gen_wts.py │ │ ├── ghostnetv1.cpp │ │ └── logging.h │ └── ghostnetv2/ │ ├── CMakeLists.txt │ ├── gen_wts.py │ ├── ghostnetv2.cpp │ └── logging.h ├── googlenet/ │ ├── CMakeLists.txt │ ├── FindTensorRT.cmake │ ├── README.md │ ├── gen_wts.py │ ├── googlenet.cpp │ ├── logging.h │ ├── macros.h │ └── utils.h ├── hrnet/ │ ├── hrnet-image-classification/ │ │ ├── CMakeLists.txt │ │ ├── README.md │ │ ├── common.hpp │ │ ├── demo.py │ │ ├── hrnet.cpp │ │ └── logging.h │ └── hrnet-semantic-segmentation/ │ ├── CMakeLists.txt │ ├── README.md │ ├── common.hpp │ ├── gen_wts.py │ ├── hrnet.cpp │ ├── hrnet_ocr.cpp │ ├── hrnet_trt.py │ └── logging.h ├── ibnnet/ │ ├── CMakeLists.txt │ ├── InferenceEngine.cpp │ ├── InferenceEngine.h │ ├── README.md │ ├── gen_wts.py │ ├── holder.h │ ├── ibnnet.cpp │ ├── ibnnet.h │ ├── layers.cpp │ ├── layers.h │ ├── logging.h │ ├── main.cpp │ ├── utils.cpp │ └── utils.h ├── inception/ │ ├── inceptionv3/ │ │ ├── CMakeLists.txt │ │ ├── README.md │ │ ├── inception_v3.cpp │ │ └── logging.h │ └── inceptionv4/ │ ├── CMakeLists.txt │ ├── README.md │ ├── inception_v4.cpp │ ├── inception_v4.h │ ├── layers_api.cpp │ ├── layers_api.h │ ├── logging.h │ ├── main.cpp │ ├── utils.cpp │ └── utils.h ├── lenet/ │ ├── CMakeLists.txt │ ├── FindTensorRT.cmake │ ├── README.md │ ├── gen_wts.py │ ├── lenet.cpp │ ├── lenet.py │ ├── lenet_tripy.py │ ├── logging.h │ ├── macros.h │ └── utils.h ├── lprnet/ │ ├── CMakeLists.txt │ ├── FindTensorRT.cmake │ ├── README.md │ ├── gen_wts.py │ ├── logging.h │ ├── 
lprnet.cpp │ ├── macros.h │ └── utils.h ├── mlp/ │ ├── CMakeLists.txt │ ├── FindTensorRT.cmake │ ├── README.md │ ├── logging.h │ ├── macros.h │ ├── mlp.cpp │ ├── mlp.py │ └── utils.h ├── mnasnet/ │ ├── CMakeLists.txt │ ├── FindTensorRT.cmake │ ├── README.md │ ├── gen_wts.py │ ├── logging.h │ ├── macros.h │ ├── mnasnet.cpp │ └── utils.h ├── mobilenet/ │ ├── mobilenetv2/ │ │ ├── CMakeLists.txt │ │ ├── README.md │ │ ├── logging.h │ │ ├── mobilenet_v2.cpp │ │ └── mobilenet_v2.py │ └── mobilenetv3/ │ ├── CMakeLists.txt │ ├── README.md │ ├── logging.h │ ├── mobilenet_v3.cpp │ └── mobilenet_v3.py ├── psenet/ │ ├── CMakeLists.txt │ ├── README.md │ ├── gen_tf_wts.py │ ├── layers.cpp │ ├── layers.h │ ├── main.cpp │ ├── psenet.cpp │ ├── psenet.h │ ├── utils.cpp │ └── utils.h ├── rcnn/ │ ├── BatchedNms.cu │ ├── BatchedNmsPlugin.h │ ├── CMakeLists.txt │ ├── MaskRcnnInference.cu │ ├── MaskRcnnInferencePlugin.h │ ├── PredictorDecode.cu │ ├── PredictorDecodePlugin.h │ ├── README.md │ ├── RoiAlign.cu │ ├── RoiAlignPlugin.h │ ├── RpnDecode.cu │ ├── RpnDecodePlugin.h │ ├── RpnNms.cu │ ├── RpnNmsPlugin.h │ ├── backbone.hpp │ ├── calibrator.hpp │ ├── common.hpp │ ├── cuda_utils.h │ ├── gen_wts.py │ ├── logging.h │ ├── macros.h │ └── rcnn.cpp ├── real-esrgan/ │ ├── general-x4v3/ │ │ ├── CMakeLists.txt │ │ ├── README.md │ │ ├── cmake/ │ │ │ └── FindTensorRT.cmake │ │ ├── gen_wts.py │ │ ├── main.cpp │ │ └── src/ │ │ ├── include/ │ │ │ ├── config/ │ │ │ │ └── config.hpp │ │ │ ├── cuda_utils.h │ │ │ ├── logging/ │ │ │ │ └── logging.h │ │ │ ├── pixel_shuffle/ │ │ │ │ └── pixel_shuffle.hpp │ │ │ └── preprocess/ │ │ │ └── preprocess.hpp │ │ └── pixel_shuffle/ │ │ ├── pixel_shuffle.cpp │ │ └── pixel_shuffle.cu │ └── x4plus/ │ ├── CMakeLists.txt │ ├── README.md │ ├── common.hpp │ ├── cuda_utils.h │ ├── gen_wts.py │ ├── logging.h │ ├── macros.h │ ├── postprocess.cu │ ├── postprocess.hpp │ ├── preprocess.cu │ ├── preprocess.hpp │ ├── real-esrgan.cpp │ └── utils.h ├── refinedet/ │ ├── 
CMakeLists.txt │ ├── README.md │ ├── calibrator.cpp │ ├── calibrator.h │ ├── configure.h │ ├── gen_wts_refinedet.py │ ├── logging.h │ ├── refinedet.cpp │ └── utils.h ├── repvgg/ │ ├── CMakeLists.txt │ ├── README.md │ ├── gen_wts.py │ ├── logging.h │ └── repvgg.cpp ├── resnet/ │ ├── CMakeLists.txt │ ├── README.md │ ├── logging.h │ ├── resnet18.cpp │ ├── resnet34.cpp │ ├── resnet50.cpp │ ├── resnet50.py │ ├── resnext50_32x4d.cpp │ ├── wide_resnet50.py │ └── wideresnet50.cpp ├── retinaface/ │ ├── CMakeLists.txt │ ├── README.md │ ├── calibrator.cpp │ ├── calibrator.h │ ├── common.hpp │ ├── decode.cu │ ├── decode.h │ ├── logging.h │ ├── macros.h │ ├── retina_mnet.cpp │ ├── retina_r50.cpp │ └── retinaface_trt.py ├── retinafaceAntiCov/ │ ├── CMakeLists.txt │ ├── README.md │ ├── decode.cu │ ├── decode.h │ ├── gen_wts.py │ ├── logging.h │ ├── macros.h │ └── retinafaceAntiCov.cpp ├── scaled-yolov4/ │ ├── CMakeLists.txt │ ├── README.md │ ├── common.hpp │ ├── gen_wts.py │ ├── logging.h │ ├── mish.cu │ ├── mish.h │ ├── utils.h │ ├── yololayer.cu │ ├── yololayer.h │ └── yolov4_csp.cpp ├── senet/ │ ├── CMakeLists.txt │ ├── README.md │ ├── logging.h │ └── se_resnet50.cpp ├── shufflenetv2/ │ ├── CMakeLists.txt │ ├── FindTensorRT.cmake │ ├── README.md │ ├── gen_wts.py │ ├── logging.h │ ├── macros.h │ ├── shufflenetv2.cpp │ └── utils.h ├── squeezenet/ │ ├── CMakeLists.txt │ ├── FindTensorRT.cmake │ ├── README.md │ ├── gen_wts.py │ ├── logging.h │ ├── macros.h │ ├── squeezenet.cpp │ └── utils.h ├── superpoint/ │ ├── CMakeLists.txt │ ├── README.md │ ├── gen_wts.py │ ├── logging.h │ ├── supernet.cpp │ ├── utils.cpp │ └── utils.h ├── swin-transformer/ │ └── semantic-segmentation/ │ ├── CMakeLists.txt │ ├── README.md │ ├── UpsampleKernel.cu │ ├── UpsamplePlugin.cpp │ ├── UpsamplePlugin.h │ ├── UpsmapleKernel.h │ ├── common.hpp │ ├── fillmask.cu │ ├── fillmask.h │ ├── gelu.cu │ ├── gelu.h │ ├── gen_wts.py │ ├── include/ │ │ └── dirent.h │ ├── layerNorm.cu │ ├── layerNorm.h │ ├── logging.h 
│ ├── main.cpp │ ├── myhpp.h │ ├── trainsform.cpp │ └── utilsn.h ├── tsm/ │ ├── CMakeLists.txt │ ├── README.md │ ├── demo.sh │ ├── gen_wts.py │ ├── logging.h │ ├── mmaction2_tsm_r50_config.py │ ├── test_shift.py │ ├── tsm_r50.cpp │ └── tsm_r50.py ├── tutorials/ │ ├── check_fp16_int8_support.md │ ├── faq.md │ ├── from_pytorch_to_trt_stepbystep_hrnet.md │ ├── getting_started.md │ ├── install.md │ ├── measure_performance.md │ ├── migration_guide.md │ ├── multi_GPU_processing.md │ └── run_on_windows.md ├── ufld/ │ ├── CMakeLists.txt │ ├── README.md │ ├── common.hpp │ ├── gen_wts.py │ ├── lane_det.cpp │ ├── logging.h │ ├── macros.h │ └── pth2onnx.py ├── unet/ │ ├── CMakeLists.txt │ ├── README.md │ ├── common.hpp │ ├── gen_wts.py │ ├── logging.h │ ├── macros.h │ └── unet.cpp ├── vgg/ │ ├── CMakeLists.txt │ ├── README.md │ ├── logging.h │ └── vgg11.cpp ├── vit/ │ ├── CMakeLists.txt │ ├── FindTensorRT.cmake │ ├── README.md │ ├── cuda_allocator.cc │ ├── cuda_allocator.h │ ├── gen_wts.py │ ├── logging.h │ ├── macros.h │ ├── profiler.cc │ ├── profiler.h │ ├── utils.h │ └── vit.cc ├── yolo11/ │ ├── CMakeLists.txt │ ├── gen_wts.py │ ├── include/ │ │ ├── block.h │ │ ├── calibrator.h │ │ ├── config.h │ │ ├── cuda_utils.h │ │ ├── logging.h │ │ ├── macros.h │ │ ├── model.h │ │ ├── postprocess.h │ │ ├── preprocess.h │ │ ├── types.h │ │ └── utils.h │ ├── plugin/ │ │ ├── yololayer.cu │ │ └── yololayer.h │ ├── readme.md │ ├── src/ │ │ ├── block.cpp │ │ ├── calibrator.cpp │ │ ├── model.cpp │ │ ├── postprocess.cpp │ │ ├── postprocess.cu │ │ └── preprocess.cu │ ├── yolo11_cls.cpp │ ├── yolo11_cls_trt.py │ ├── yolo11_det.cpp │ ├── yolo11_det_trt.py │ ├── yolo11_obb.cpp │ ├── yolo11_obb_trt.py │ ├── yolo11_pose.cpp │ ├── yolo11_pose_trt.py │ ├── yolo11_seg.cpp │ └── yolo11_seg_trt.py ├── yolo11_tripy/ │ ├── .gitignore │ ├── README.md │ ├── classify.py │ ├── compile_classifier.py │ ├── constants.py │ ├── model/ │ │ ├── block.py │ │ └── model.py │ └── requirements.txt ├── yolo26/ │ ├── 
.clang-format │ ├── .gitignore │ ├── CMakeLists.txt │ ├── README.md │ ├── gen_wts.py │ ├── include/ │ │ ├── block.h │ │ ├── config.h │ │ ├── cuda_utils.h │ │ ├── logging.h │ │ ├── macros.h │ │ ├── model.h │ │ ├── postprocess.h │ │ ├── preprocess.h │ │ ├── types.h │ │ └── utils.h │ ├── plugin/ │ │ ├── yololayer.cu │ │ └── yololayer.h │ ├── src/ │ │ ├── block.cpp │ │ ├── model.cpp │ │ ├── postprocess.cpp │ │ └── preprocess.cu │ ├── yolo26_cls.cpp │ ├── yolo26_det.cpp │ └── yolo26_obb.cpp ├── yolop/ │ ├── CMakeLists.txt │ ├── README.md │ ├── common.hpp │ ├── cuda_utils.h │ ├── gen_wts.py │ ├── logging.h │ ├── macros.h │ ├── utils.h │ ├── yololayer.cu │ ├── yololayer.h │ ├── yolop.cpp │ ├── yolop.hpp │ └── yolop_trt.py ├── yolov10/ │ ├── CMakeLists.txt │ ├── README.md │ ├── gen_wts.py │ ├── include/ │ │ ├── block.h │ │ ├── calibrator.h │ │ ├── config.h │ │ ├── cuda_utils.h │ │ ├── logging.h │ │ ├── macros.h │ │ ├── model.h │ │ ├── postprocess.h │ │ ├── preprocess.h │ │ ├── types.h │ │ └── utils.h │ ├── plugin/ │ │ ├── yololayer.cu │ │ └── yololayer.h │ ├── src/ │ │ ├── block.cpp │ │ ├── calibrator.cpp │ │ ├── model.cpp │ │ ├── postprocess.cpp │ │ └── preprocess.cu │ ├── yolov10_det.cpp │ └── yolov10_det_trt.py ├── yolov12/ │ ├── CMakeLists.txt │ ├── gen_wts.py │ ├── include/ │ │ ├── block.h │ │ ├── config.h │ │ ├── cuda_utils.h │ │ ├── logging.h │ │ ├── macros.h │ │ ├── model.h │ │ ├── postprocess.h │ │ ├── preprocess.h │ │ ├── types.h │ │ └── utils.h │ ├── plugin/ │ │ ├── yololayer.cu │ │ └── yololayer.h │ ├── readme.md │ ├── src/ │ │ ├── block.cpp │ │ ├── model.cpp │ │ ├── postprocess.cpp │ │ ├── postprocess.cu │ │ └── preprocess.cu │ └── yolo12_det.cpp ├── yolov12-tubro/ │ ├── CMakeLists.txt │ ├── gen_wts.py │ ├── include/ │ │ ├── block.h │ │ ├── calibrator.h │ │ ├── config.h │ │ ├── cuda_utils.h │ │ ├── logging.h │ │ ├── macros.h │ │ ├── model.h │ │ ├── postprocess.h │ │ ├── preprocess.h │ │ ├── types.h │ │ └── utils.h │ ├── plugin/ │ │ ├── yololayer.cu │ │ └── 
yololayer.h │ ├── readme.md │ ├── src/ │ │ ├── block.cpp │ │ ├── calibrator.cpp │ │ ├── model.cpp │ │ ├── postprocess.cpp │ │ ├── postprocess.cu │ │ └── preprocess.cu │ ├── yolov12_cls.cpp │ ├── yolov12_cls_trt.py │ ├── yolov12_det.cpp │ ├── yolov12_det_trt.py │ ├── yolov12_seg.cpp │ └── yolov12_seg_trt.py ├── yolov13/ │ ├── CMakeLists.txt │ ├── gen_wts.py │ ├── include/ │ │ ├── block.h │ │ ├── calibrator.h │ │ ├── config.h │ │ ├── cuda_utils.h │ │ ├── logging.h │ │ ├── macros.h │ │ ├── model.h │ │ ├── postprocess.h │ │ ├── preprocess.h │ │ ├── types.h │ │ └── utils.h │ ├── plugin/ │ │ ├── geluKernel.cu │ │ ├── yololayer.cu │ │ └── yololayer.h │ ├── readme.md │ ├── src/ │ │ ├── block.cpp │ │ ├── calibrator.cpp │ │ ├── model.cpp │ │ ├── postprocess.cpp │ │ ├── postprocess.cu │ │ └── preprocess.cu │ ├── yolov13_det.cpp │ └── yolov13_det_trt.py ├── yolov3/ │ ├── CMakeLists.txt │ ├── README.md │ ├── calibrator.cpp │ ├── calibrator.h │ ├── gen_wts.py │ ├── logging.h │ ├── macros.h │ ├── utils.h │ ├── yololayer.cu │ ├── yololayer.h │ ├── yolov3.cpp │ └── yolov3_trt.py ├── yolov3-spp/ │ ├── CMakeLists.txt │ ├── README.md │ ├── Utils.h │ ├── gen_wts.py │ ├── logging.h │ ├── yololayer.cu │ ├── yololayer.h │ └── yolov3-spp.cpp ├── yolov3-tiny/ │ ├── CMakeLists.txt │ ├── README.md │ ├── gen_wts.py │ ├── logging.h │ ├── macros.h │ ├── utils.h │ ├── yololayer.cu │ ├── yololayer.h │ └── yolov3-tiny.cpp ├── yolov4/ │ ├── CMakeLists.txt │ ├── README.md │ ├── gen_wts.py │ ├── logging.h │ ├── mish.cu │ ├── mish.h │ ├── utils.h │ ├── yololayer.cu │ ├── yololayer.h │ └── yolov4.cpp ├── yolov5/ │ ├── CMakeLists.txt │ ├── README.md │ ├── gen_wts.py │ ├── plugin/ │ │ ├── yololayer.cu │ │ └── yololayer.h │ ├── src/ │ │ ├── calibrator.cpp │ │ ├── calibrator.h │ │ ├── config.h │ │ ├── cuda_utils.h │ │ ├── logging.h │ │ ├── macros.h │ │ ├── model.cpp │ │ ├── model.h │ │ ├── postprocess.cpp │ │ ├── postprocess.h │ │ ├── preprocess.cu │ │ ├── preprocess.h │ │ ├── types.h │ │ └── utils.h │ ├── 
yolov5_cls.cpp │ ├── yolov5_cls_trt.py │ ├── yolov5_det.cpp │ ├── yolov5_det_cuda_python.py │ ├── yolov5_det_trt.py │ ├── yolov5_seg.cpp │ └── yolov5_seg_trt.py ├── yolov5-lite/ │ ├── CMakeLists.txt │ ├── README.md │ ├── calibrator.cpp │ ├── common.hpp │ ├── gen_wts.py │ ├── v5lite.cpp │ ├── yololayer.cu │ └── yolov5-lite-trt.py ├── yolov7/ │ ├── CMakeLists.txt │ ├── README.md │ ├── gen_wts.py │ ├── include/ │ │ ├── block.h │ │ ├── calibrator.h │ │ ├── config.h │ │ ├── cuda_utils.h │ │ ├── logging.h │ │ ├── macros.h │ │ ├── model.h │ │ ├── postprocess.h │ │ ├── preprocess.h │ │ ├── types.h │ │ └── utils.h │ ├── main.cpp │ ├── plugin/ │ │ ├── yololayer.cu │ │ └── yololayer.h │ ├── src/ │ │ ├── block.cpp │ │ ├── calibrator.cpp │ │ ├── model.cpp │ │ ├── postprocess.cpp │ │ └── preprocess.cu │ └── yolov7_trt.py ├── yolov8/ │ ├── CMakeLists.txt │ ├── README.md │ ├── gen_wts.py │ ├── include/ │ │ ├── block.h │ │ ├── calibrator.h │ │ ├── config.h │ │ ├── cuda_utils.h │ │ ├── logging.h │ │ ├── macros.h │ │ ├── model.h │ │ ├── postprocess.h │ │ ├── preprocess.h │ │ ├── types.h │ │ └── utils.h │ ├── plugin/ │ │ ├── yololayer.cu │ │ └── yololayer.h │ ├── src/ │ │ ├── block.cpp │ │ ├── calibrator.cpp │ │ ├── model.cpp │ │ ├── postprocess.cpp │ │ ├── postprocess.cu │ │ └── preprocess.cu │ ├── yolov8_5u_det.cpp │ ├── yolov8_5u_det_trt.py │ ├── yolov8_cls.cpp │ ├── yolov8_cls_trt.py │ ├── yolov8_det.cpp │ ├── yolov8_det_trt.py │ ├── yolov8_obb.cpp │ ├── yolov8_obb_trt.py │ ├── yolov8_pose.cpp │ ├── yolov8_pose_trt.py │ ├── yolov8_seg.cpp │ └── yolov8_seg_trt.py └── yolov9/ ├── CMakeLists.txt ├── README.md ├── demo.cpp ├── gen_wts.py ├── include/ │ ├── block.h │ ├── calibrator.h │ ├── config.h │ ├── cuda_utils.h │ ├── logging.h │ ├── macros.h │ ├── model.h │ ├── postprocess.h │ ├── preprocess.h │ ├── types.h │ └── utils.h ├── plugin/ │ ├── yololayer.cu │ └── yololayer.h ├── src/ │ ├── block.cpp │ ├── calibrator.cpp │ ├── model.cpp │ ├── postprocess.cpp │ ├── postprocess.cu │ └── 
preprocess.cu ├── windows/ │ └── dirent.h └── yolov9_trt.py ================================================ FILE CONTENTS ================================================ ================================================ FILE: .clang-format ================================================ # Google C/C++ Code Style settings (with 4-space) # Referred to https://github.com/kehanXue/google-style-clang-format/blob/master/.clang-format Language: Cpp BasedOnStyle: Google AccessModifierOffset: -1 AlignAfterOpenBracket: Align AlignConsecutiveAssignments: None AlignOperands: Align AllowAllArgumentsOnNextLine: true AllowAllConstructorInitializersOnNextLine: true AllowAllParametersOfDeclarationOnNextLine: false AllowShortBlocksOnASingleLine: Empty AllowShortCaseLabelsOnASingleLine: false AllowShortFunctionsOnASingleLine: Inline AllowShortIfStatementsOnASingleLine: Never # To avoid conflict, set this "Never" and each "if statement" should include brace when coding AllowShortLambdasOnASingleLine: Inline AllowShortLoopsOnASingleLine: false AlwaysBreakAfterReturnType: None AlwaysBreakTemplateDeclarations: Yes BinPackArguments: true BreakBeforeBraces: Custom BraceWrapping: AfterCaseLabel: false AfterClass: false AfterStruct: false AfterControlStatement: Never AfterEnum: false AfterFunction: false AfterNamespace: false AfterUnion: false AfterExternBlock: false BeforeCatch: false BeforeElse: false BeforeLambdaBody: false IndentBraces: false SplitEmptyFunction: false SplitEmptyRecord: false SplitEmptyNamespace: false BreakBeforeBinaryOperators: None BreakBeforeTernaryOperators: true BreakConstructorInitializers: BeforeColon BreakInheritanceList: BeforeColon ColumnLimit: 120 CompactNamespaces: false ContinuationIndentWidth: 8 Cpp11BracedListStyle: true DerivePointerAlignment: false # Make sure the * or & align on the left EmptyLineBeforeAccessModifier: LogicalBlock FixNamespaceComments: true IncludeBlocks: Preserve IndentCaseLabels: true IndentPPDirectives: None IndentWidth: 4 
KeepEmptyLinesAtTheStartOfBlocks: true MaxEmptyLinesToKeep: 1 NamespaceIndentation: None ObjCSpaceAfterProperty: false ObjCSpaceBeforeProtocolList: true PointerAlignment: Left ReflowComments: false # SeparateDefinitionBlocks: Always # Only support since clang-format 14 SpaceAfterCStyleCast: false SpaceAfterLogicalNot: false SpaceAfterTemplateKeyword: true SpaceBeforeAssignmentOperators: true SpaceBeforeCpp11BracedList: false SpaceBeforeCtorInitializerColon: true SpaceBeforeInheritanceColon: true SpaceBeforeParens: ControlStatements SpaceBeforeRangeBasedForLoopColon: true SpaceBeforeSquareBrackets: false SpaceInEmptyParentheses: false SpacesBeforeTrailingComments: 2 SpacesInAngles: false SpacesInCStyleCastParentheses: false SpacesInContainerLiterals: false SpacesInParentheses: false SpacesInSquareBrackets: false Standard: c++11 TabWidth: 8 UseTab: Never ================================================ FILE: .cmake-format.yaml ================================================ _help_parse: Options affecting listfile parsing parse: _help_additional_commands: - Specify structure for custom cmake functions additional_commands: foo: flags: - BAR - BAZ kwargs: HEADERS: '*' SOURCES: '*' DEPENDS: '*' _help_override_spec: - Override configurations per-command where available override_spec: {} _help_vartags: - Specify variable tags. vartags: [] _help_proptags: - Specify property tags. proptags: [] _help_format: Options affecting formatting. format: _help_disable: - Disable formatting entirely, making cmake-format a no-op disable: false _help_line_width: - How wide to allow formatted cmake files line_width: 80 _help_tab_size: - How many spaces to tab for indent tab_size: 2 _help_use_tabchars: - If true, lines are indented using tab characters (utf-8 - 0x09) instead of space characters (utf-8 0x20). 
- In cases where the layout would require a fractional tab - character, the behavior of the fractional indentation is - governed by `fractional_tab_policy` use_tabchars: false _help_fractional_tab_policy: - If `use_tabchars` is True, then the value of this variable - indicates how fractional indentions are handled during - whitespace replacement. If set to 'use-space', fractional - indentation is left as spaces (utf-8 0x20). If set to - '`round-up` fractional indentation is replaced with a single' - tab character (utf-8 0x09) effectively shifting the column - to the next tabstop fractional_tab_policy: use-space _help_max_subgroups_hwrap: - If an argument group contains more than this many sub-groups - (parg or kwarg groups) then force it to a vertical layout. max_subgroups_hwrap: 2 _help_max_pargs_hwrap: - If a positional argument group contains more than this many - arguments, then force it to a vertical layout. max_pargs_hwrap: 6 _help_max_rows_cmdline: - If a cmdline positional group consumes more than this many - lines without nesting, then invalidate the layout (and nest) max_rows_cmdline: 2 _help_separate_ctrl_name_with_space: - If true, separate flow control names from their parentheses - with a space separate_ctrl_name_with_space: false _help_separate_fn_name_with_space: - If true, separate function names from parentheses with a - space separate_fn_name_with_space: false _help_dangle_parens: - If a statement is wrapped to more than one line, then dangle - the closing parenthesis on its own line. dangle_parens: false _help_dangle_align: - If the trailing parenthesis must be 'dangled' on its own - 'line, then align it to this reference: `prefix`: the start' - 'of the statement, `prefix-indent`: the start of the' - 'statement, plus one indentation level, `child`: align to' - the column of the arguments dangle_align: prefix _help_min_prefix_chars: - If the statement spelling length (including space and - parenthesis) is smaller than this amount, then force reject - nested layouts. 
min_prefix_chars: 4 _help_max_prefix_chars: - If the statement spelling length (including space and - parenthesis) is larger than the tab width by more than this - amount, then force reject un-nested layouts. max_prefix_chars: 10 _help_max_lines_hwrap: - If a candidate layout is wrapped horizontally but it exceeds - this many lines, then reject the layout. max_lines_hwrap: 2 _help_line_ending: - What style line endings to use in the output. line_ending: unix _help_command_case: - Format command names consistently as 'lower' or 'upper' case command_case: canonical _help_keyword_case: - Format keywords consistently as 'lower' or 'upper' case keyword_case: unchanged _help_always_wrap: - A list of command names which should always be wrapped always_wrap: [] _help_enable_sort: - If true, the argument lists which are known to be sortable - will be sorted lexicographically enable_sort: true _help_autosort: - If true, the parsers may infer whether or not an argument - list is sortable (without annotation). autosort: false _help_require_valid_layout: - By default, if cmake-format cannot successfully fit - everything into the desired linewidth it will apply the - last, most aggressive attempt that it made. If this flag is - True, however, cmake-format will print error, exit with non- - zero status code, and write-out nothing require_valid_layout: false _help_layout_passes: - A dictionary mapping layout nodes to a list of wrap - decisions. See the documentation for more information. layout_passes: {} _help_markup: Options affecting comment reflow and formatting. markup: _help_bullet_char: - What character to use for bulleted lists bullet_char: '*' _help_enum_char: - What character to use as punctuation after numerals in an - enumerated list enum_char: . _help_first_comment_is_literal: - If comment markup is enabled, don't reflow the first comment - block in each listfile. Use this to preserve formatting of - your copyright/license statements. 
first_comment_is_literal: false _help_literal_comment_pattern: - If comment markup is enabled, don't reflow any comment block - which matches this (regex) pattern. Default is `None` - (disabled). literal_comment_pattern: null _help_fence_pattern: - Regular expression to match preformat fences in comments - default= ``r'^\s*([`~]{3}[`~]*)(.*)$'`` fence_pattern: ^\s*([`~]{3}[`~]*)(.*)$ _help_ruler_pattern: - Regular expression to match rulers in comments default= - '``r''^\s*[^\w\s]{3}.*[^\w\s]{3}$''``' ruler_pattern: ^\s*[^\w\s]{3}.*[^\w\s]{3}$ _help_explicit_trailing_pattern: - If a comment line starts with this pattern then it - is explicitly a trailing comment for the preceding - argument. Default is '#<' explicit_trailing_pattern: '#<' _help_hashruler_min_length: - If a comment line starts with at least this many consecutive - hash characters, then don't lstrip() them off. This allows - for lazy hash rulers where the first hash char is not - separated by space hashruler_min_length: 10 _help_canonicalize_hashrulers: - If true, then insert a space between the first hash char and - remaining hash chars in a hash ruler, and normalize its - length to fill the column canonicalize_hashrulers: true _help_enable_markup: - enable comment markup parsing and reflow enable_markup: true _help_lint: Options affecting the linter lint: _help_disabled_codes: - a list of lint codes to disable disabled_codes: [] _help_function_pattern: - regular expression pattern describing valid function names function_pattern: '[0-9a-z_]+' _help_macro_pattern: - regular expression pattern describing valid macro names macro_pattern: '[0-9A-Z_]+' _help_global_var_pattern: - regular expression pattern describing valid names for - variables with global (cache) scope global_var_pattern: '[A-Z][0-9A-Z_]+' _help_internal_var_pattern: - regular expression pattern describing valid names for - variables with global scope (but internal semantic) internal_var_pattern: _[A-Z][0-9A-Z_]+ 
_help_local_var_pattern: - regular expression pattern describing valid names for - variables with local scope local_var_pattern: '[a-z][a-z0-9_]+' _help_private_var_pattern: - regular expression pattern describing valid names for - private directory variables private_var_pattern: _[0-9a-z_]+ _help_public_var_pattern: - regular expression pattern describing valid names for public - directory variables public_var_pattern: '[A-Z][0-9A-Z_]+' _help_argument_var_pattern: - regular expression pattern describing valid names for - function/macro arguments and loop variables. argument_var_pattern: '[a-z][a-z0-9_]+' _help_keyword_pattern: - regular expression pattern describing valid names for - keywords used in functions or macros keyword_pattern: '[A-Z][0-9A-Z_]+' _help_max_conditionals_custom_parser: - In the heuristic for C0201, how many conditionals to match - within a loop before considering the loop a parser. max_conditionals_custom_parser: 2 _help_min_statement_spacing: - Require at least this many newlines between statements min_statement_spacing: 1 _help_max_statement_spacing: - Require no more than this many newlines between statements max_statement_spacing: 2 max_returns: 6 max_branches: 12 max_arguments: 5 max_localvars: 15 max_statements: 50 _help_encode: Options affecting file encoding encode: _help_emit_byteorder_mark: - If true, emit the unicode byte-order mark (BOM) at the start - of the file emit_byteorder_mark: false _help_input_encoding: - Specify the encoding of the input file. Defaults to utf-8 input_encoding: utf-8 _help_output_encoding: - Specify the encoding of the output file. Defaults to utf-8. - Note that cmake only claims to support utf-8 so be careful - when using anything else output_encoding: utf-8 _help_misc: Miscellaneous configurations options. misc: _help_per_command: - A dictionary containing any per-command configuration - overrides. Currently only `command_case` is supported. 
per_command: {} ================================================ FILE: .github/ISSUE_TEMPLATE/tensorrtx-issue-template.md ================================================ --- name: tensorrtx issue template about: To understand your issue better title: '' labels: '' assignees: '' --- ## Env - GPU, e.g. V100, RTX2080, TX2, Xavier NX, Nano, etc. - OS, e.g. Ubuntu16.04, Win10, etc. - Cuda version - TensorRT version ## About this repo - which branch/tag/commit are you using? - which model? yolov5, retinaface? ## Your problem - what is your command? e.g. `sudo ./yolov5 -s` - what's your output? - what output do you expect? ================================================ FILE: .github/stale.yml ================================================ # Number of days of inactivity before an issue becomes stale daysUntilStale: 60 # Number of days of inactivity before a stale issue is closed daysUntilClose: 7 # Issues with these labels will never be considered stale exemptLabels: - pinned - security # Label to use when marking an issue as stale staleLabel: wontfix # Comment to post when marking an issue as stale. Set to `false` to disable markComment: > This issue has been automatically marked as stale because it has not had recent activity. It will be closed if no further activity occurs. Thank you for your contributions. # Comment to post when closing a stale issue. 
Set to `false` to disable closeComment: false ================================================ FILE: .github/workflows/pre-commit.yml ================================================ name: pre-commit on: pull_request: branches: - master - trt10 push: branches: - master - trt10 jobs: pre-commit: runs-on: ubuntu-latest steps: - uses: actions/checkout@v5 with: # grab the history of the PR fetch-depth: 0 - name: Fetch commits run: | git fetch origin ${{ github.event.before }} || true git fetch origin ${{ github.sha }} - uses: actions/setup-python@v4 - uses: pre-commit/action@v3.0.1 if: github.event_name == 'push' with: extra_args: > --from-ref ${{ github.event.before }} --to-ref ${{ github.sha }} --show-diff-on-failure --color=always - uses: pre-commit/action@v3.0.1 if: github.event_name == 'pull_request' with: extra_args: > --from-ref ${{ github.event.pull_request.base.sha }} --to-ref ${{ github.event.pull_request.head.sha }} --show-diff-on-failure --color=always ================================================ FILE: .gitignore ================================================ models build *.wts *.engine *.tpymodel */*.ppm *idea* .vscode/* !.vscode/settings.json !.vscode/tasks.json !.vscode/launch.json !.vscode/extensions.json !.vscode/*.code-snippets # Local History for Visual Studio Code .history/ # Built Visual Studio Code Extensions *.vsix .vscode/* !.vscode/settings.json !.vscode/tasks.json !.vscode/launch.json !.vscode/extensions.json !.vscode/*.code-snippets # Local History for Visual Studio Code .history/ # Built Visual Studio Code Extensions *.vsix # Prerequisites *.d # Compiled Object files *.slo *.lo *.o *.obj # Precompiled Headers *.gch *.pch # Compiled Dynamic libraries *.so *.dylib *.dll # Fortran module files *.mod *.smod # Compiled Static libraries *.lai *.la *.a *.lib # Executables *.exe *.out *.app CMakeLists.txt.user CMakeCache.txt CMakeFiles CMakeScripts Testing cmake_install.cmake install_manifest.txt compile_commands.json CTestTestfile.cmake _deps 
CMakeUserPresets.json ================================================ FILE: .pre-commit-config.yaml ================================================ repos: - repo: https://github.com/pre-commit/pre-commit-hooks rev: v4.5.0 hooks: - id: check-merge-conflict - id: check-symlinks - id: end-of-file-fixer types: [python] - id: trailing-whitespace types: [python] - id: check-added-large-files - repo: https://github.com/pre-commit/mirrors-clang-format rev: v18.1.3 hooks: - id: clang-format types_or: [c++, c, cuda] args: [-style=file] - repo: https://github.com/PyCQA/flake8 rev: 7.0.0 hooks: - id: flake8 args: [--max-line-length=120] - repo: https://github.com/cheshirekow/cmake-format-precommit rev: v0.6.13 hooks: - id: cmake-format additional_dependencies: [pyyaml] args: [--in-place, -c, .cmake-format.yaml] types: [file] files: (\.cmake|CMakeLists.txt)(.in)?$ ================================================ FILE: LICENSE ================================================ MIT License Copyright (c) 2019-2020 Wang Xinyu Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. ================================================ FILE: README.md ================================================ # TensorRTx TensorRTx aims to implement popular deep learning networks with TensorRT network definition API. Why don't we use a parser (ONNX parser, UFF parser, caffe parser, etc), but use complex APIs to build a network from scratch? I have summarized the advantages in the following aspects. - **Flexible**, easy to modify the network, add/delete a layer or input/output tensor, replace a layer, merge layers, integrate preprocessing and postprocessing into network, etc. - **Debuggable**, construct the entire network in an incremental development manner, easy to get middle layer results. - **Educational**, learn about the network structure during this development, rather than treating everything as a black box. The basic workflow of TensorRTx is: 1. Get the trained models from pytorch, mxnet or tensorflow, etc. Some pytorch models can be found in my repo [pytorchx](https://github.com/wang-xinyu/pytorchx), the remaining are from popular open-source repos. 2. Export the weights to a plain text file -- [.wts file](./tutorials/getting_started.md#the-wts-content-format). 3. Load weights in TensorRT, define the network, build a TensorRT engine. 4. Load the TensorRT engine and run inference. ## News - `3 Mar 2026`. [zgjja](https://github.com/zgjja) Add Vision Transformer - `2 Feb 2026`. [fazligorkembal](https://github.com/fazligorkembal) Yolo26-Det, Yolo26-Obb, Yolo26-Cls - `15 Jan 2026`. [zgjja](https://github.com/zgjja) Refactor multiple old CV models to support TensorRT SDK through 7~10. - `8 Jan 2026`. [ydk61](https://github.com/ydk61): YOLOv13 - `10 May 2025`. 
[pranavm-nvidia](https://github.com/pranavm-nvidia): [YOLO11](./yolo11_tripy) written in [Tripy](https://github.com/NVIDIA/TensorRT-Incubator/tree/main/tripy). - `2 May 2025`. [fazligorkembal](https://github.com/fazligorkembal): YOLO12 - `12 Apr 2025`. [pranavm-nvidia](https://github.com/pranavm-nvidia): First [Lenet](https://github.com/wang-xinyu/tensorrtx/tree/master/lenet#tripy-new-tensorrt-python-programming-model) example written in [Tripy](https://github.com/NVIDIA/TensorRT-Incubator/tree/main/tripy). - `11 Apr 2025`. [mpj1234](https://github.com/mpj1234): [YOLO11-obb](https://github.com/wang-xinyu/tensorrtx/tree/master/yolo11) - `22 Oct 2024`. [lindsayshuo](https://github.com/lindsayshuo): YOLOv8-obb - `18 Oct 2024`. [zgjja](https://github.com/zgjja): Refactor docker image. - `11 Oct 2024`. [mpj1234](https://github.com/mpj1234): YOLO11 - `9 Oct 2024`. [Phoenix8215](https://github.com/Phoenix8215): GhostNet V1 and V2. - `21 Aug 2024`. [Lemonononon](https://github.com/Lemonononon): real-esrgan-general-x4v3 - `29 Jul 2024`. [mpj1234](https://github.com/mpj1234): Check the YOLOv5, YOLOv8 & YOLOv10 in TensorRT 10.x API, branch → [trt10](https://github.com/wang-xinyu/tensorrtx/tree/trt10) - `29 Jul 2024`. [mpj1234](https://github.com/mpj1234): YOLOv10 - `21 Jun 2024`. [WuxinrongY](https://github.com/WuxinrongY): YOLOv9-T, YOLOv9-S, YOLOv9-M - `28 Apr 2024`. [lindsayshuo](https://github.com/lindsayshuo): YOLOv8-pose - `22 Apr 2024`. [B1SH0PP](https://github.com/B1SH0PP): EfficientAd: Accurate Visual Anomaly Detection at Millisecond-Level Latencies. - `18 Apr 2024`. 
[lindsayshuo](https://github.com/lindsayshuo): YOLOv8-p2 ## Tutorials - [How to make contribution](./tutorials/contribution.md) - [Install the dependencies.](./tutorials/install.md) - [A guide for quickly getting started, taking lenet5 as a demo.](./tutorials/getting_started.md) - [The .wts file content format](./tutorials/getting_started.md#the-wts-content-format) - [Frequently Asked Questions (FAQ)](./tutorials/faq.md) - [Migration Guide](./tutorials/migration_guide.md) - [How to implement multi-GPU processing, taking YOLOv4 as example](./tutorials/multi_GPU_processing.md) - [Check if Your GPU support FP16/INT8](./tutorials/check_fp16_int8_support.md) - [How to Compile and Run on Windows](./tutorials/run_on_windows.md) - [Deploy YOLOv4 with Triton Inference Server](https://github.com/isarsoft/yolov4-triton-tensorrt) - [From pytorch to trt step by step, hrnet as example(Chinese)](./tutorials/from_pytorch_to_trt_stepbystep_hrnet.md) ## Test Environment 1. (**NOT recommended**) TensorRT 7.x 2. (**Recommended**) TensorRT 8.x 3. (**NOT recommended**) TensorRT 10.x ### Note 1. For historical reasons, some of the models are limited to a specific TensorRT version; please check the README.md or code for the model you want to use. 2. Currently, TensorRT 8.x has better compatibility and supports the most features. ## How to run **Note**: this project supports building each network with the `CMakeLists.txt` in its subfolder, or you can build them together with the `CMakeLists.txt` on top of this project. - General procedures before building and running: ```bash # 1. generate xxx.wts from https://github.com/wang-xinyu/pytorchx/tree/master/lenet # ... # 2. put xxx.wts on top of this folder # ... ``` - (_Option 1_) To build a single subproject in this project, do: ```bash ## enter the subfolder cd tensorrtx/xxx ## configure & build cmake -S . 
-B build make -C build ``` - (_Option 2_) To build many subprojects, firstly, in the top `CMakeLists.txt`, **comment out** the projects you don't want to build or that are not supported by your TensorRT version, e.g., you cannot build subprojects in `${TensorRT_8_Targets}` if your TensorRT is `7.x`. Then: ```bash ## enter the top of this project cd tensorrtx ## configure & build # you may use "Ninja" rather than "make" to significantly boost the build speed cmake -G Ninja -S . -B build ninja -C build ``` **WARNING**: This part is still under development, most subprojects are not adapted yet. - run the generated executable, e.g.: ```bash # serialize model to plan file i.e. 'xxx.engine' build/xxx -s # deserialize plan file and run inference build/xxx -d # (Optional) check if the output is same as pytorchx/lenet # ... # (Optional) customize the project # ... ``` For more details, each subfolder may contain a `README.md` inside, which explains more. ## Models The following models are implemented. | Name | Description | | ---------------------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | | [mlp](./mlp) | the very basic model for starters, properly documented | | [lenet](./lenet) | the simplest, as a "hello world" of this project | | [alexnet](./alexnet) | easy to implement, all layers are supported in tensorrt | | [googlenet](./googlenet) | GoogLeNet (Inception v1) | | [inception](./inception) | Inception v3, v4 | | [mnasnet](./mnasnet) | MNASNet with depth multiplier of 0.5 from the paper | | [mobilenet](./mobilenet) | MobileNet v2, v3-small, v3-large | | [resnet](./resnet) | resnet-18, resnet-50 and resnext50-32x4d are implemented | | [senet](./senet) | se-resnet50 | | [shufflenet](./shufflenetv2) | ShuffleNet v2 with 0.5x output channels | | 
[squeezenet](./squeezenet) | SqueezeNet 1.1 model | | [vgg](./vgg) | VGG 11-layer model | | [ViT](./vit) | vision transformer, using weight and model from huggingface | | [yolov3-tiny](./yolov3-tiny) | weights and pytorch implementation from [ultralytics/yolov3](https://github.com/ultralytics/yolov3) | | [yolov3](./yolov3) | darknet-53, weights and pytorch implementation from [ultralytics/yolov3](https://github.com/ultralytics/yolov3) | | [yolov3-spp](./yolov3-spp) | darknet-53, weights and pytorch implementation from [ultralytics/yolov3](https://github.com/ultralytics/yolov3) | | [yolov4](./yolov4) | CSPDarknet53, weights from [AlexeyAB/darknet](https://github.com/AlexeyAB/darknet#pre-trained-models), pytorch implementation from [ultralytics/yolov3](https://github.com/ultralytics/yolov3) | | [yolov5](./yolov5) | yolov5 v1.0-v7.0 of [ultralytics/yolov5](https://github.com/ultralytics/yolov5), detection, classification and instance segmentation | | [yolov7](./yolov7) | yolov7 v0.1, pytorch implementation from [WongKinYiu/yolov7](https://github.com/WongKinYiu/yolov7) | | [yolov8](./yolov8) | yolov8, pytorch implementation from [ultralytics](https://github.com/ultralytics/ultralytics) | | [yolov9](./yolov9) | The Pytorch implementation is [WongKinYiu/yolov9](https://github.com/WongKinYiu/yolov9). | | [yolov10](./yolov10) | The Pytorch implementation is [THU-MIG/yolov10](https://github.com/THU-MIG/yolov10). | | [yolo11](./yolo11) | The Pytorch implementation is [ultralytics](https://github.com/ultralytics/ultralytics). | | [yolo12](./yolov12) | The Pytorch implementation is [ultralytics](https://github.com/ultralytics/ultralytics). 
| | [yolop](./yolop) | yolop, pytorch implementation from [hustvl/YOLOP](https://github.com/hustvl/YOLOP) | | [retinaface](./retinaface) | resnet50 and mobilenet0.25, weights from [biubug6/Pytorch_Retinaface](https://github.com/biubug6/Pytorch_Retinaface) | | [arcface](./arcface) | LResNet50E-IR, LResNet100E-IR and MobileFaceNet, weights from [deepinsight/insightface](https://github.com/deepinsight/insightface) | | [retinafaceAntiCov](./retinafaceAntiCov) | mobilenet0.25, weights from [deepinsight/insightface](https://github.com/deepinsight/insightface), retinaface anti-COVID-19, detect face and mask attribute | | [dbnet](./dbnet) | Scene Text Detection, weights from [BaofengZan/DBNet.pytorch](https://github.com/BaofengZan/DBNet.pytorch) | | [crnn](./crnn) | pytorch implementation from [meijieru/crnn.pytorch](https://github.com/meijieru/crnn.pytorch) | | [ufld](./ufld) | pytorch implementation from [Ultra-Fast-Lane-Detection](https://github.com/cfzd/Ultra-Fast-Lane-Detection), ECCV2020 | | [hrnet](./hrnet) | hrnet-image-classification and hrnet-semantic-segmentation, pytorch implementation from [HRNet-Image-Classification](https://github.com/HRNet/HRNet-Image-Classification) and [HRNet-Semantic-Segmentation](https://github.com/HRNet/HRNet-Semantic-Segmentation) | | [psenet](./psenet) | PSENet Text Detection, tensorflow implementation from [liuheng92/tensorflow_PSENet](https://github.com/liuheng92/tensorflow_PSENet) | | [ibnnet](./ibnnet) | IBN-Net, pytorch implementation from [XingangPan/IBN-Net](https://github.com/XingangPan/IBN-Net), ECCV2018 | | [unet](./unet) | U-Net, pytorch implementation from [milesial/Pytorch-UNet](https://github.com/milesial/Pytorch-UNet) | | [repvgg](./repvgg) | RepVGG, pytorch implementation from [DingXiaoH/RepVGG](https://github.com/DingXiaoH/RepVGG) | | [lprnet](./lprnet) | LPRNet, pytorch implementation from [xuexingyu24/License_Plate_Detection_Pytorch](https://github.com/xuexingyu24/License_Plate_Detection_Pytorch) | | 
[refinedet](./refinedet) | RefineDet, pytorch implementation from [luuuyi/RefineDet.PyTorch](https://github.com/luuuyi/RefineDet.PyTorch) | | [densenet](./densenet) | DenseNet-121, from torchvision.models | | [rcnn](./rcnn) | FasterRCNN and MaskRCNN, model from [detectron2](https://github.com/facebookresearch/detectron2) | | [tsm](./tsm) | TSM: Temporal Shift Module for Efficient Video Understanding, ICCV2019 | | [scaled-yolov4](./scaled-yolov4) | yolov4-csp, pytorch from [WongKinYiu/ScaledYOLOv4](https://github.com/WongKinYiu/ScaledYOLOv4) | | [centernet](./centernet) | CenterNet DLA-34, pytorch from [xingyizhou/CenterNet](https://github.com/xingyizhou/CenterNet) | | [efficientnet](./efficientnet) | EfficientNet b0-b8 and l2, pytorch from [lukemelas/EfficientNet-PyTorch](https://github.com/lukemelas/EfficientNet-PyTorch) | | [detr](./detr) | DE⫶TR, pytorch from [facebookresearch/detr](https://github.com/facebookresearch/detr) | | [swin-transformer](./swin-transformer) | Swin Transformer - Semantic Segmentation, only support Swin-T. The Pytorch implementation is [microsoft/Swin-Transformer](https://github.com/microsoft/Swin-Transformer.git) | | [real-esrgan](./real-esrgan) | Real-ESRGAN. The Pytorch implementation is [real-esrgan](https://github.com/xinntao/Real-ESRGAN) | | [superpoint](./superpoint) | SuperPoint. The Pytorch model is from [magicleap/SuperPointPretrainedNetwork](https://github.com/magicleap/SuperPointPretrainedNetwork) | | [csrnet](./csrnet) | CSRNet. The Pytorch implementation is [leeyeehoo/CSRNet-pytorch](https://github.com/leeyeehoo/CSRNet-pytorch) | | [EfficientAd](./efficient_ad) | EfficientAd: Accurate Visual Anomaly Detection at Millisecond-Level Latencies. From [anomalib](https://github.com/openvinotoolkit/anomalib) | ## Model Zoo The .wts files can be downloaded from model zoo for quick evaluation. But it is recommended to convert .wts from pytorch/mxnet/tensorflow model, so that you can retrain your own model. 
[GoogleDrive](https://drive.google.com/drive/folders/1Ri0IDa5OChtcA3zjqRTW57uG6TnfN4Do?usp=sharing) | [BaiduPan](https://pan.baidu.com/s/19s6hO8esU7-TtZEXN7G3OA) pwd: uvv2 ## Tricky Operations Some tricky operations encountered in these models, already solved, but might have better solutions. | Name | Description | | ------------------------- | ----------------------------------------------------------------------------------------------------- | | BatchNorm | Implement by a scale layer, used in resnet, googlenet, mobilenet, etc. | | MaxPool2d(ceil_mode=True) | use a padding layer before maxpool to solve ceil_mode=True, see googlenet. | | average pool with padding | use setAverageCountExcludesPadding() when necessary, see inception. | | relu6 | use `Relu6(x) = Relu(x) - Relu(x-6)`, see mobilenet. | | torch.chunk() | implement the 'chunk(2, dim=C)' by tensorrt plugin, see shufflenet. | | channel shuffle | use two shuffle layers to implement `channel_shuffle`, see shufflenet. | | adaptive pool | use fixed input dimension, and use regular average pooling, see shufflenet. | | leaky relu | I wrote a leaky relu plugin, but PRelu in `NvInferPlugin.h` can be used, see yolov3 in branch `trt4`. | | yolo layer v1 | yolo layer is implemented as a plugin, see yolov3 in branch `trt4`. | | yolo layer v2 | three yolo layers implemented in one plugin, see yolov3-spp. | | upsample | replaced by a deconvolution layer, see yolov3. | | hsigmoid | hard sigmoid is implemented as a plugin, hsigmoid and hswish are used in mobilenetv3 | | retinaface output decode | implement a plugin to decode bbox, confidence and landmarks, see retinaface. 
| | mish | mish activation is implemented as a plugin, mish is used in yolov4 | | prelu | mxnet's prelu activation with trainable gamma is implemented as a plugin, used in arcface | | HardSwish | hard_swish = x \* hard_sigmoid, used in yolov5 v3.0 | | LSTM | Implemented pytorch nn.LSTM() with tensorrt api | ## Speed Benchmark | Models | Device | BatchSize | Mode | Input Shape(HxW) | FPS | | ------------------------- | -------------------- | :-------: | :--: | :--------------: | :--: | | YOLOv3-tiny | Xeon E5-2620/GTX1080 | 1 | FP32 | 608x608 | 333 | | YOLOv3(darknet53) | Xeon E5-2620/GTX1080 | 1 | FP32 | 608x608 | 39.2 | | YOLOv3(darknet53) | Xeon E5-2620/GTX1080 | 1 | INT8 | 608x608 | 71.4 | | YOLOv3-spp(darknet53) | Xeon E5-2620/GTX1080 | 1 | FP32 | 608x608 | 38.5 | | YOLOv4(CSPDarknet53) | Xeon E5-2620/GTX1080 | 1 | FP32 | 608x608 | 35.7 | | YOLOv4(CSPDarknet53) | Xeon E5-2620/GTX1080 | 4 | FP32 | 608x608 | 40.9 | | YOLOv4(CSPDarknet53) | Xeon E5-2620/GTX1080 | 8 | FP32 | 608x608 | 41.3 | | YOLOv5-s v3.0 | Xeon E5-2620/GTX1080 | 1 | FP32 | 608x608 | 142 | | YOLOv5-s v3.0 | Xeon E5-2620/GTX1080 | 4 | FP32 | 608x608 | 173 | | YOLOv5-s v3.0 | Xeon E5-2620/GTX1080 | 8 | FP32 | 608x608 | 190 | | YOLOv5-m v3.0 | Xeon E5-2620/GTX1080 | 1 | FP32 | 608x608 | 71 | | YOLOv5-l v3.0 | Xeon E5-2620/GTX1080 | 1 | FP32 | 608x608 | 43 | | YOLOv5-x v3.0 | Xeon E5-2620/GTX1080 | 1 | FP32 | 608x608 | 29 | | YOLOv5-s v4.0 | Xeon E5-2620/GTX1080 | 1 | FP32 | 608x608 | 142 | | YOLOv5-m v4.0 | Xeon E5-2620/GTX1080 | 1 | FP32 | 608x608 | 71 | | YOLOv5-l v4.0 | Xeon E5-2620/GTX1080 | 1 | FP32 | 608x608 | 40 | | YOLOv5-x v4.0 | Xeon E5-2620/GTX1080 | 1 | FP32 | 608x608 | 27 | | RetinaFace(resnet50) | Xeon E5-2620/GTX1080 | 1 | FP32 | 480x640 | 90 | | RetinaFace(resnet50) | Xeon E5-2620/GTX1080 | 1 | INT8 | 480x640 | 204 | | RetinaFace(mobilenet0.25) | Xeon E5-2620/GTX1080 | 1 | FP32 | 480x640 | 417 | | ArcFace(LResNet50E-IR) | Xeon E5-2620/GTX1080 | 1 | FP32 | 112x112 | 333 | | CRNN | 
Xeon E5-2620/GTX1080 | 1 | FP32 | 32x100 | 1000 | Help wanted, if you got speed results, please add an issue or PR. ## Acknowledgments & Contact Any contributions, questions and discussions are welcomed, contact me by following info. E-mail: wangxinyu_es@163.com WeChat ID: wangxinyu0375 (可加我微信进 tensorrtx 交流群,**备注:tensorrtx**) ================================================ FILE: alexnet/CMakeLists.txt ================================================ cmake_minimum_required(VERSION 3.14) project( alexnet VERSION 0.1 LANGUAGES C CXX CUDA) if(NOT DEFINED CMAKE_CUDA_ARCHITECTURES) set(CMAKE_CUDA_ARCHITECTURES 75 80 86 89 90 100 120) endif() set(CMAKE_CXX_STANDARD 17) set(CMAKE_CXX_STANDARD_REQUIRED ON) set(CMAKE_CUDA_STANDARD 17) set(CMAKE_CUDA_STANDARD_REQUIRED ON) set(CMAKE_EXPORT_COMPILE_COMMANDS ON) set(CMAKE_INCLUDE_CURRENT_DIR TRUE) option(CUDA_USE_STATIC_CUDA_RUNTIME "Use static cudaruntime library" OFF) find_package(Threads REQUIRED) find_package(CUDAToolkit REQUIRED) find_package(OpenCV REQUIRED) if(NOT TARGET TensorRT::TensorRT) include(FindTensorRT.cmake) else() message("TensorRT has been found, skipping for ${PROJECT_NAME}") endif() add_executable(${PROJECT_NAME} alexnet.cc) target_include_directories(${PROJECT_NAME} PRIVATE ${CMAKE_CURRENT_LIST_DIR} ${OpenCV_INCLUDE_DIRS}) target_link_libraries( ${PROJECT_NAME} PRIVATE Threads::Threads TensorRT::TensorRT CUDA::cudart ${OpenCV_LIBS}) ================================================ FILE: alexnet/FindTensorRT.cmake ================================================ cmake_minimum_required(VERSION 3.17.0) function(_guess_path var_name required_files) set(_result "") foreach(path_entry IN LISTS ARGN) if(NOT EXISTS "${path_entry}") message(DEBUG "skip non-existing path '${path_entry}'") continue() endif() set(_ok TRUE) foreach(required_file IN LISTS required_files) if(NOT EXISTS "${path_entry}/${required_file}") set(_ok FALSE) message(DEBUG "'${path_entry}' missing '${required_file}'") break() endif() endforeach() 
if(_ok)
  list(APPEND _result "${path_entry}")
  message(DEBUG "accept '${path_entry}'")
else()
  message(DEBUG "reject '${path_entry}'")
endif()
endforeach()

# Fail loudly when none of the candidates contains every required file.
if(_result STREQUAL "")
  message(
    FATAL_ERROR
      "_guess_path(${var_name}) failed: no valid path found. required_files='${required_files}' candidates='${ARGN}'"
  )
endif()

# Hand the list of accepted paths back to the caller.
set(${var_name} "${_result}" PARENT_SCOPE)
endfunction()

# add library
add_library(TensorRT IMPORTED INTERFACE)
add_library(TensorRT::TensorRT ALIAS TensorRT)

# TRT_VERSION may come from the cache (-DTRT_VERSION=...) or from the
# environment; the environment variable wins when both are present.
set(TRT_VERSION
    ""
    CACHE
      STRING
      "TensorRT version, e.g. \"8.6.1.6\" or \"8.6.1.6+cuda12.0.1.011\", \"8.6.1.6.Windows10.x86_64.cuda-12.0\" etc"
)

# Always quote the expansions: an unset/empty value would otherwise vanish
# from the argument list and leave a malformed if()/string() call.
if(NOT "${TRT_VERSION}" STREQUAL "" AND NOT "$ENV{TRT_VERSION}" STREQUAL "")
  message(
    WARNING
      "TRT_VERSION defined by cmake and environment variable both, using the later one"
  )
endif()

if(NOT "$ENV{TRT_VERSION}" STREQUAL "")
  set(TRT_VERSION "$ENV{TRT_VERSION}")
endif()

# Everything below needs the major version, so stop early with a readable
# message instead of failing inside string()/if() with an argument error.
if("${TRT_VERSION}" STREQUAL "")
  message(
    FATAL_ERROR
      "TRT_VERSION is not set; pass -DTRT_VERSION=<version> or export TRT_VERSION in the environment"
  )
endif()

# The first run of digits is taken as the major version.
string(REGEX MATCH "([0-9]+)" _match "${TRT_VERSION}")
if("${_match}" STREQUAL "")
  message(
    FATAL_ERROR
      "Cannot parse a major version from TRT_VERSION='${TRT_VERSION}'")
endif()
set(TRT_MAJOR_VERSION "${_match}")
unset(_match)

if(WIN32)
  set(TensorRT_DIR "C:/Program Files/TensorRT-${TRT_VERSION}")
  if(NOT EXISTS "${TensorRT_DIR}")
    message(
      FATAL_ERROR
        "TensorRT_DIR=${TensorRT_DIR} does not exist!"
    )
  endif()

  # Library names carry a "_10" suffix for major version 10 and above.
  if(TRT_MAJOR_VERSION GREATER_EQUAL 10)
    set(_modules nvinfer_10 nvinfer_plugin_10 nvinfer_vc_plugin_10
                 nvinfer_dispatch_10 nvinfer_lean_10)
    message(DEBUG "Using ${_modules}")
  else()
    set(_modules nvinfer nvinfer_plugin nvinfer_vc_plugin nvinfer_dispatch
                 nvinfer_lean)
  endif()

  set(TensorRT_LIBRARY_DIR "${TensorRT_DIR}/lib")
  set(TensorRT_INCLUDE_DIR "${TensorRT_DIR}/include")
elseif(UNIX)
  string(TOLOWER "${CMAKE_SYSTEM_PROCESSOR}" _trt_arch)
  set(_trt_include_candidates)
  if(_trt_arch MATCHES "^(aarch64|arm64|arch64)$")
    set(_trt_include_candidates
        "/usr/include/aarch64-linux-gnu" "/usr/include"
        "/usr/local/cuda/targets/aarch64-linux/include")
    set(_trt_library_candidates
        "/usr/local/tensorrt/targets/aarch64-linux-gnu/lib"
        "/usr/lib/aarch64-linux-gnu" "/usr/lib/aarch64-linux-gnu/tegra"
        "/usr/lib")
  elseif(_trt_arch MATCHES "^(x86_64|amd64)$")
    set(_trt_include_candidates
        "/usr/local/tensorrt/targets/x86_64-linux-gnu/include"
        "/usr/include/x86_64-linux-gnu" "/usr/include")
    set(_trt_library_candidates
        "/usr/local/tensorrt/targets/x86_64-linux-gnu/lib"
        "/usr/lib/x86_64-linux-gnu" "/usr/lib")
  else()
    message(FATAL_ERROR "Unknown architecture")
  endif()

  set(_modules nvinfer nvinfer_plugin)
  if(TRT_MAJOR_VERSION GREATER_EQUAL 8)
    list(APPEND _modules nvinfer_vc_plugin nvinfer_dispatch nvinfer_lean)
  endif()

  _guess_path(TensorRT_LIBRARY_DIR "libnvinfer.so;libnvinfer_plugin.so"
              ${_trt_library_candidates})
  message(STATUS "TensorRT libraries: ${TensorRT_LIBRARY_DIR}")
  _guess_path(TensorRT_INCLUDE_DIR "NvInfer.h" ${_trt_include_candidates})
  message(STATUS "TensorRT includes: ${TensorRT_INCLUDE_DIR}")
endif()

# Resolve every module to a full library path and attach it to the target.
foreach(lib IN LISTS _modules)
  find_library(
    TensorRT_${lib}_LIBRARY
    NAMES ${lib}
    HINTS ${TensorRT_LIBRARY_DIR})
  list(APPEND TensorRT_LIBRARIES ${TensorRT_${lib}_LIBRARY})
endforeach()

target_link_libraries(TensorRT INTERFACE ${TensorRT_LIBRARIES})
message(STATUS "Found TensorRT libs: ${TensorRT_LIBRARIES}")

set_target_properties(
  TensorRT
  PROPERTIES C_STANDARD 17
CXX_STANDARD 17 POSITION_INDEPENDENT_CODE ON SKIP_BUILD_RPATH TRUE BUILD_WITH_INSTALL_RPATH TRUE INSTALL_RPATH "$ORIGIN" INTERFACE_INCLUDE_DIRECTORIES "${TensorRT_INCLUDE_DIR}") unset(TRT_MAJOR_VERSION) unset(_modules) unset(_trt_include_candidates) unset(_trt_library_candidates) unset(_trt_arch) ================================================ FILE: alexnet/README.md ================================================ # alexnet ## Introduction AlexNet model architecture comes from this paper: [One weird trick for parallelizing convolutional neural networks](https://arxiv.org/abs/1404.5997). To generate `.wts` file, you can refer to [pytorchx/alexnet](https://github.com/wang-xinyu/pytorchx/tree/master/alexnet). To check the pytorch implementation of AlexNet, refer to [HERE](https://github.com/pytorch/vision/blob/main/torchvision/models/alexnet.py#L17) AlexNet consists of 3 major parts: features, adaptive average pooling, and classifier: - features: just several stacked `CRP`(conv-relu-pool) and `CR` layers - adaptive average pooling: pytorch can decide its inner parameters, but we need to calculate it ourselves in TensorRT API - classifier: just several `fc-relu` layers. All layers can be implemented by tensorrt api, including `addConvolution`, `addActivation`, `addPooling`, `addMatrixMultiply`, `addElementWise` etc. 
## Use AlexNet from PyTorch We can use torchvision to load the pretrained alexnet model: ```python alexnet = torchvision.models.alexnet(pretrained=True) ``` The model structure is: ```bash AlexNet( (features): Sequential( (0): Conv2d(3, 64, kernel_size=(11, 11), stride=(4, 4), padding=(2, 2)) (1): ReLU(inplace=True) (2): MaxPool2d(kernel_size=3, stride=2, padding=0, dilation=1, ceil_mode=False) (3): Conv2d(64, 192, kernel_size=(5, 5), stride=(1, 1), padding=(2, 2)) (4): ReLU(inplace=True) (5): MaxPool2d(kernel_size=3, stride=2, padding=0, dilation=1, ceil_mode=False) (6): Conv2d(192, 384, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) (7): ReLU(inplace=True) (8): Conv2d(384, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) (9): ReLU(inplace=True) (10): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) (11): ReLU(inplace=True) (12): MaxPool2d(kernel_size=3, stride=2, padding=0, dilation=1, ceil_mode=False) ) (avgpool): AdaptiveAvgPool2d(output_size=(6, 6)) (classifier): Sequential( (0): Dropout(p=0.5, inplace=False) (1): Linear(in_features=9216, out_features=4096, bias=True) (2): ReLU(inplace=True) (3): Dropout(p=0.5, inplace=False) (4): Linear(in_features=4096, out_features=4096, bias=True) (5): ReLU(inplace=True) (6): Linear(in_features=4096, out_features=1000, bias=True) ) ) ``` ## Usage 1. use `gen_wts.py` to generate wts file. ```bash python3 gen_wts.py ``` 2. build C++ code ```bash pushd tensorrtx/alexnet cmake -S . -B build -G Ninja --fresh cmake --build build ``` 3. serialize wts model to engine file. ```bash ./build/alexnet -s ``` 4. run inference ```bash ./build/alexnet -d ``` output looks like: ```txt ... ==== Execution time: 1ms 0.1234, -0.5678, ... ==== prediction result: Top: 0 idx: 285, logits: 9.9, label: Egyptian cat Top: 1 idx: 281, logits: 8.304, label: tabby, tabby cat Top: 2 idx: 282, logits: 6.859, label: tiger cat ``` ## FAQ ### How to align the output with Pytorch? 
If your output is different from pytorch, you have to check which TensorRT API or your code causes this. A simple solution would be to check the `.engine` output part by part, e.g., you can set the early layer of alexnet as output: ```c++ fc3_1->getOutput(0)->setName(OUTPUT_NAME); network->markOutput(*pool3->getOutput(0)); // original is: "*fc3_1->getOutput(0)" ``` For this line of code, I use the output from "feature" part of alexnet, ignoring the rest of the model, then, don't forget to change the `OUTPUT_SIZE` macro on top of the file, lastly, build the `.engine` file to apply the changes. You can sum up all output from C++ code, and compare it with Pytorch output, for Pytorch, you can do this by: `torch.sum(x)` at debug phase. The ideal value deviation between 2 values would be $[10^{-1}, 10^{-2}]$, for this example, since the output elements for "feature" is $256 * 6 * 6$ (batch = 1), the final error would roughly be $10^{-4}$. Note: This is a quick check, for a more accurate check, you have to save the output tensor into a file to compare them value by value, but this situation is rare. ================================================ FILE: alexnet/alexnet.cc ================================================ #include #include #include #include #include #include "logging.h" #include "utils.h" // stuff we know about alexnet constexpr const int32_t N = 1; constexpr const int32_t INPUT_H = 224; constexpr const int32_t INPUT_W = 224; constexpr const std::array SIZES = {3ll * INPUT_H * INPUT_W, 1000}; constexpr const std::array NAMES = {"data", "prob"}; constexpr const char* ENGINE_PATH = "../models/alexnet.engine"; constexpr const char* WTS_PATH = "../models/alexnet.wts"; constexpr const char* LABELS_PATH = "../assets/imagenet1000_clsidx_to_labels.txt"; static constexpr const bool TRT_PREPROCESS = TRT_VERSION >= 8510 ? 
true : false; static constexpr const std::array mean = {0.485f, 0.456f, 0.406f}; static constexpr const std::array stdv = {0.229f, 0.224f, 0.225f}; using WeightMap = std::map; using M = nvinfer1::MatrixOperation; using E = nvinfer1::ElementWiseOperation; using NDCF = nvinfer1::NetworkDefinitionCreationFlag; static Logger gLogger; /** * @brief Create the engine using TensorRT API and without any parser. * * @param N max batch size * @param builder * @param config * @param dt * @return ICudaEngine* */ ICudaEngine* createEngine(int32_t N, IRuntime* runtime, IBuilder* builder, IBuilderConfig* config, DataType dt) { WeightMap weightMap = loadWeights(WTS_PATH); #if TRT_VERSION >= 11200 auto flag = 1U << static_cast(NDCF::kSTRONGLY_TYPED); #elif TRT_VERSION >= 10000 auto flag = 0U; #else auto flag = 1U << static_cast(NDCF::kEXPLICIT_BATCH); #endif auto* network = builder->createNetworkV2(flag); // Create input tensor ITensor* input{nullptr}; if constexpr (TRT_PREPROCESS) { dt = DataType::kUINT8; input = network->addInput(NAMES[0], dt, Dims4{N, INPUT_H, INPUT_W, 3}); auto* trans = addTransformLayer(network, *input, true, mean, stdv); input = trans->getOutput(0); } else { input = network->addInput(NAMES[0], dt, Dims4{N, 3, INPUT_H, INPUT_W}); } assert(input); // CRP (Conv-Relu-Pool) auto* conv1 = network->addConvolutionNd(*input, 64, DimsHW{11, 11}, weightMap["features.0.weight"], weightMap["features.0.bias"]); auto* relu1 = network->addActivation(*conv1->getOutput(0), ActivationType::kRELU); auto* pool1 = network->addPoolingNd(*relu1->getOutput(0), PoolingType::kMAX, DimsHW{3, 3}); assert(conv1 && relu1 && pool1); conv1->setStrideNd(DimsHW{4, 4}); conv1->setPaddingNd(DimsHW{2, 2}); pool1->setStrideNd(DimsHW{2, 2}); // CRP auto* conv2 = network->addConvolutionNd(*pool1->getOutput(0), 192, DimsHW{5, 5}, weightMap["features.3.weight"], weightMap["features.3.bias"]); auto* relu2 = network->addActivation(*conv2->getOutput(0), ActivationType::kRELU); auto* pool2 = 
network->addPoolingNd(*relu2->getOutput(0), PoolingType::kMAX, DimsHW{3, 3}); assert(conv2 && pool2 && relu2); conv2->setPaddingNd(DimsHW{2, 2}); pool2->setStrideNd(DimsHW{2, 2}); // CR auto* conv3 = network->addConvolutionNd(*pool2->getOutput(0), 384, DimsHW{3, 3}, weightMap["features.6.weight"], weightMap["features.6.bias"]); auto* relu3 = network->addActivation(*conv3->getOutput(0), ActivationType::kRELU); assert(conv3 && relu3); conv3->setPaddingNd(DimsHW{1, 1}); // CR auto* conv4 = network->addConvolutionNd(*relu3->getOutput(0), 256, DimsHW{3, 3}, weightMap["features.8.weight"], weightMap["features.8.bias"]); auto* relu4 = network->addActivation(*conv4->getOutput(0), ActivationType::kRELU); assert(conv4 && relu4); conv4->setPaddingNd(DimsHW{1, 1}); // CRP auto* conv5 = network->addConvolutionNd(*relu4->getOutput(0), 256, DimsHW{3, 3}, weightMap["features.10.weight"], weightMap["features.10.bias"]); auto* relu5 = network->addActivation(*conv5->getOutput(0), ActivationType::kRELU); assert(conv5); auto* pool3 = network->addPoolingNd(*relu5->getOutput(0), PoolingType::kMAX, DimsHW{3, 3}); assert(conv5 && relu5 && pool3); conv5->setPaddingNd(DimsHW{1, 1}); pool3->setStrideNd(DimsHW{2, 2}); // adaptive avgerage pooling auto* adaptive_pool = network->addPoolingNd(*pool3->getOutput(0), PoolingType::kAVERAGE, DimsHW{1, 1}); assert(adaptive_pool); IShuffleLayer* shuffle = network->addShuffle(*adaptive_pool->getOutput(0)); assert(shuffle); shuffle->setReshapeDimensions(Dims2{N, -1}); // "-1" means "256 * 6 * 6" // all classifier tensors int64_t in_feat = 256ll * 6 * 6; auto* fc1w = network->addConstant(DimsHW{4096, in_feat}, weightMap["classifier.1.weight"])->getOutput(0); auto* fc1b = network->addConstant(DimsHW{1, 4096}, weightMap["classifier.1.bias"])->getOutput(0); auto* fc2w = network->addConstant(DimsHW{4096, 4096}, weightMap["classifier.4.weight"])->getOutput(0); auto* fc2b = network->addConstant(DimsHW{1, 4096}, weightMap["classifier.4.bias"])->getOutput(0); 
auto* fc3w = network->addConstant(DimsHW{1000, 4096}, weightMap["classifier.6.weight"])->getOutput(0); auto* fc3b = network->addConstant(DimsHW{1, 1000}, weightMap["classifier.6.bias"])->getOutput(0); assert(fc1w && fc1b && fc2w && fc2b && fc3w && fc3b); // all layers in classifier auto* fc1_0 = network->addMatrixMultiply(*shuffle->getOutput(0), M::kNONE, *fc1w, M::kTRANSPOSE); auto* fc1_1 = network->addElementWise(*fc1_0->getOutput(0), *fc1b, E::kSUM); auto* relu6 = network->addActivation(*fc1_1->getOutput(0), ActivationType::kRELU); assert(fc1_0 && fc1_1 && relu6); fc1_0->setName("fc1_0"); // set name here, only for debug purpose auto* fc2_0 = network->addMatrixMultiply(*relu6->getOutput(0), M::kNONE, *fc2w, M::kTRANSPOSE); auto* fc2_1 = network->addElementWise(*fc2_0->getOutput(0), *fc2b, E::kSUM); auto* relu7 = network->addActivation(*fc2_1->getOutput(0), ActivationType::kRELU); assert(fc2_0 && fc2_1 && relu7); fc2_0->setName("fc2_0"); auto* fc3_0 = network->addMatrixMultiply(*relu7->getOutput(0), M::kNONE, *fc3w, M::kTRANSPOSE); auto* fc3_1 = network->addElementWise(*fc3_0->getOutput(0), *fc3b, E::kSUM); assert(fc3_0 && fc3_1); fc3_0->setName("fc3_0"); fc3_1->getOutput(0)->setName(NAMES[1]); network->markOutput(*fc3_1->getOutput(0)); // Build engine #if TRT_VERSION >= 8000 config->setMemoryPoolLimit(MemoryPoolType::kWORKSPACE, WORKSPACE_SIZE); auto* host_mem = builder->buildSerializedNetwork(*network, *config); auto* engine = runtime->deserializeCudaEngine(host_mem->data(), host_mem->size()); delete network; #else builder->setMaxBatchSize(N); config->setMaxWorkspaceSize(WORKSPACE_SIZE); auto* engine = builder->buildEngineWithConfig(*network, *config); network->destroy(); #endif std::cout << "build finished\n"; for (auto& mem : weightMap) { free((void*)(mem.second.values)); } return engine; } void APIToModel(int32_t N, IRuntime* runtime, IHostMemory** modelStream) { // Create builder IBuilder* builder = createInferBuilder(gLogger); IBuilderConfig* config = 
builder->createBuilderConfig(); // Create model to populate the network, then set the outputs and create an engine ICudaEngine* engine = createEngine(N, runtime, builder, config, DataType::kFLOAT); assert(engine != nullptr); // Serialize the engine (*modelStream) = engine->serialize(); // Close everything down #if TRT_VERSION >= 8000 delete engine; delete config; delete builder; #else engine->destroy(); config->destroy(); builder->destroy(); #endif } std::vector> doInference(IExecutionContext& context, const std::string& img_path, std::size_t batchSize) { static std::vector flat_img; auto img = cv::imread(img_path, cv::IMREAD_COLOR); void* input = nullptr; // use preprocess from gpu(TensorRT) or cpu(OpenCV) if constexpr (TRT_PREPROCESS) { // for simplicity, resize image on cpu side cv::resize(img, img, cv::Size(INPUT_W, INPUT_H), 0, 0, cv::INTER_LINEAR); input = static_cast(img.data); } else { flat_img = preprocess_img(img, true, mean, stdv, N, INPUT_H, INPUT_W); input = flat_img.data(); } assert(input); const ICudaEngine& engine = context.getEngine(); cudaStream_t stream; CHECK(cudaStreamCreate(&stream)); std::vector buffers; #if TRT_VERSION >= 8000 const int32_t nIO = engine.getNbIOTensors(); #else const int32_t nIO = engine.getNbBindings(); #endif buffers.resize(nIO); for (auto i = 0; i < nIO; ++i) { #if TRT_VERSION >= 8000 auto* tensor_name = engine.getIOTensorName(i); auto s = getSize(engine.getTensorDataType(tensor_name)); std::size_t size = s * batchSize * SIZES[i]; CHECK(cudaMalloc(&buffers[i], size)); context.setTensorAddress(tensor_name, buffers[i]); #else const int32_t idx = engine.getBindingIndex(NAMES[i]); auto s = getSize(engine.getBindingDataType(idx)); assert(idx == i); std::size_t size = s * batchSize * SIZES[i]; CHECK(cudaMalloc(&buffers[i], size)); #endif if (i == 0) { CHECK(cudaMemcpyAsync(buffers[i], input, size, cudaMemcpyHostToDevice, stream)); } } #if TRT_VERSION >= 8000 assert(context.enqueueV3(stream)); #else 
assert(context.enqueueV2(buffers.data(), stream, nullptr)); #endif std::vector> prob; for (int i = 1; i < nIO; ++i) { std::vector tmp(batchSize * SIZES[i], std::nanf("")); std::size_t size = batchSize * SIZES[i] * sizeof(float); CHECK(cudaMemcpyAsync(tmp.data(), buffers[i], size, cudaMemcpyDeviceToHost, stream)); prob.emplace_back(tmp); } CHECK(cudaStreamSynchronize(stream)); cudaStreamDestroy(stream); for (auto i = 0; i < nIO; ++i) { CHECK(cudaFree(buffers[i])); } return prob; } int main(int argc, char** argv) { checkTrtEnv(); if (argc != 2) { std::cerr << "arguments not right!\n"; std::cerr << "./alexnet -s // serialize model to plan file\n"; std::cerr << "./alexnet -d // deserialize plan file and run inference\n"; return -1; } IRuntime* runtime = createInferRuntime(gLogger); assert(runtime != nullptr); // create a model using the API directly and serialize it to a stream char* trtModelStream{nullptr}; std::streamsize size{0}; if (std::string(argv[1]) == "-s") { IHostMemory* modelStream{nullptr}; APIToModel(N, runtime, &modelStream); assert(modelStream != nullptr); std::ofstream p(ENGINE_PATH, std::ios::binary | std::ios::trunc); if (!p) { std::cerr << "could not open plan output file\n"; return -1; } if (modelStream->size() > static_cast(std::numeric_limits::max())) { std::cerr << "this model is too large to serialize\n"; return -1; } const auto* data_ptr = reinterpret_cast(modelStream->data()); auto data_size = static_cast(modelStream->size()); p.write(data_ptr, data_size); #if TRT_VERSION >= 8000 delete modelStream; #else modelStream->destroy(); #endif return 0; } else if (std::string(argv[1]) == "-d") { std::ifstream file(ENGINE_PATH, std::ios::binary); if (file.good()) { file.seekg(0, file.end); size = file.tellg(); file.seekg(0, file.beg); trtModelStream = new char[size]; assert(trtModelStream); file.read(trtModelStream, size); file.close(); } } else { return -1; } #if TRT_VERSION >= 8000 ICudaEngine* engine = runtime->deserializeCudaEngine(trtModelStream, 
def load_weights(file):
    """Load a ``.wts`` text weight file into a dict of float32 arrays.

    File layout (as written by ``gen_wts.py``): the first line holds the
    number of blobs; every following line is
    ``<name> <count> <hex> <hex> ...`` where each ``<hex>`` is one
    big-endian IEEE-754 float32 encoded as 8 hex digits.

    Args:
        file: Path of the weight file.

    Returns:
        dict mapping blob name to a flat (1-D) ``np.float32`` array.

    Raises:
        FileNotFoundError: If ``file`` does not exist.
        ValueError: If the header count or a per-blob count does not match
            the file contents.
    """
    print(f"Loading weights: {file}")
    # Explicit raises instead of ``assert``: asserts vanish under ``python -O``.
    if not os.path.exists(file):
        raise FileNotFoundError(f"Unable to load weight file: {file}")

    with open(file, "r") as f:
        # Ignore blank lines so a trailing newline does not break the count check.
        lines = [stripped for stripped in (line.strip() for line in f) if stripped]
    if not lines:
        raise ValueError(f"Weight file is empty: {file}")

    count = int(lines[0])
    if count != len(lines) - 1:
        raise ValueError(
            f"Weight file declares {count} blobs but contains {len(lines) - 1}"
        )

    unpack_be_float = struct.Struct(">f").unpack
    weight_map = {}
    for line in lines[1:]:
        name, declared, *hex_values = line.split(" ")
        if int(declared) != len(hex_values):
            raise ValueError(
                f"Blob '{name}' declares {declared} values but has {len(hex_values)}"
            )
        # unpack() returns a 1-tuple; take element 0 so the array is flat
        # instead of shape (n, 1).
        values = [unpack_be_float(bytes.fromhex(h))[0] for h in hex_values]
        weight_map[name] = np.array(values, dtype=np.float32)
    return weight_map
class HostDeviceMem(object):
    """Bundle a host-side buffer with the device allocation that mirrors it."""

    def __init__(self, host_mem, device_mem):
        # Stored as-is; no copies are made.
        self.host, self.device = host_mem, device_mem

    def __str__(self):
        """Return both buffers rendered with ``str()``, one section each."""
        return f"Host:\n{self.host!s}\nDevice:\n{self.device!s}"

    def __repr__(self):
        """Use the same rendering as ``__str__``."""
        return self.__str__()
def read_imagenet_labels(
    path: str = "../assets/imagenet1000_clsidx_to_labels.txt",
) -> dict[int, str]:
    """
    read ImageNet 1000 labels

    Each useful line looks like ``<class id>: '<label>',``. Surrounding
    braces, the trailing comma and the quotes around the label are removed.
    Lines without a ``": "`` separator (e.g. blank lines) are skipped, and
    the first occurrence of a class id wins.

    Args:
        path (str): label file to read; defaults to the repository asset.

    Returns:
        dict[int, str]: labels dict
    """
    clsid2label: dict[int, str] = {}
    with open(path, "r") as f:
        for line in f:
            # Split on the first separator only, so a label that itself
            # contains ": " does not break the parse.
            key, sep, value = line.partition(": ")
            if not sep:
                continue
            # Strip by content rather than by fixed offsets: the last line of
            # the file has no trailing ",\n" to cut off.
            label = value.strip().rstrip(",}").rstrip()
            if len(label) >= 2 and label[0] == label[-1] and label[0] in "'\"":
                label = label[1:-1]
            clsid2label.setdefault(int(key.strip().lstrip("{")), label)
    return clsid2label
img.transpose(2, 0, 1)[None, ...] return torch.from_numpy(img) if __name__ == "__main__": img = cv2.imread("../assets/cats.jpg", cv2.IMREAD_COLOR) img = preprocess(img) model = alexnet(pretrained=True) model.eval() output = model(img) labels = read_imagenet_labels() for batch in torch.topk(output, k=3).indices: for i, j in enumerate(batch, 1): print(f"top: {i:<2}, confidence: {float(output[0, j]):.4f}, label: {labels[int(j)]}") print("writing alexnet wts") with open("../models/alexnet.wts", "w") as f: f.write("{}\n".format(len(model.state_dict().keys()))) for k, v in model.state_dict().items(): print(f"key: {k}\tvalue: {v.shape}") vr = v.reshape(-1).cpu().numpy() f.write("{} {}".format(k, len(vr))) for vv in vr: f.write(" ") f.write(struct.pack(">f", float(vv)).hex()) f.write("\n") ================================================ FILE: alexnet/logging.h ================================================ /* * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
//!
//! \class LogStreamConsumerBuffer
//! \brief String buffer that, when synced or destroyed, forwards its contents to a target stream
//!        as "[MM/DD/YYYY-HH:MM:SS] <prefix><message>" - but only if logging is enabled.
//!
class LogStreamConsumerBuffer : public std::stringbuf {
   public:
    //! \param stream    stream that receives the formatted output
    //! \param prefix    text inserted between the timestamp and the message (e.g. "[E] ")
    //! \param shouldLog when false, putOutput() emits nothing
    LogStreamConsumerBuffer(std::ostream& stream, std::string prefix, bool shouldLog)
        : mOutput(stream), mPrefix(std::move(prefix)), mShouldLog(shouldLog) {}

    LogStreamConsumerBuffer(LogStreamConsumerBuffer&& other) noexcept
        : mOutput(other.mOutput), mPrefix(std::move(other.mPrefix)), mShouldLog(other.mShouldLog) {}

    ~LogStreamConsumerBuffer() override {
        // std::streambuf::pbase() gives a pointer to the beginning of the buffered part of the output sequence
        // std::streambuf::pptr() gives a pointer to the current position of the output sequence
        // if they differ there is pending text that was never synced, so log it now
        if (pbase() != pptr()) {
            putOutput();
        }
    }

    //! Synchronizes the stream buffer (writes the pending text to the target stream, resets the
    //! buffer and flushes the target) and returns 0 on success.
    int sync() override {
        putOutput();
        return 0;
    }

    //! Emit "[timestamp] prefix + buffered text" to the target stream and clear the buffer.
    //! Does nothing (and keeps the buffered text) when logging is disabled.
    void putOutput() {
        if (!mShouldLog) {
            return;
        }
        // Prepend the timestamp on the SAME stream as the message. Writing it to std::cout
        // split error lines between stdout (timestamp) and stderr (message), and the
        // setfill('0') manipulators permanently changed std::cout's fill character.
        std::time_t timestamp = std::time(nullptr);
        // NOTE: std::localtime uses shared static storage and is not thread-safe.
        const std::tm* tm_local = std::localtime(&timestamp);
        if (tm_local != nullptr) {
            mOutput << "[" << std::put_time(tm_local, "%m/%d/%Y-%H:%M:%S") << "] ";
        }
        // std::stringbuf::str() gets the string contents of the buffer
        mOutput << mPrefix << str();
        // set the buffer to empty
        str("");
        // flush the stream
        mOutput.flush();
    }

    void setShouldLog(bool shouldLog) { mShouldLog = shouldLog; }

   private:
    std::ostream& mOutput;  //!< destination stream
    std::string mPrefix;    //!< severity prefix placed before every message
    bool mShouldLog;        //!< whether putOutput() emits anything
};
LogStreamConsumer(Severity reportableSeverity, Severity severity) : LogStreamConsumerBase(severityOstream(severity), severityPrefix(severity), severity <= reportableSeverity), std::ostream(&mBuffer) // links the stream buffer with the stream , mShouldLog(severity <= reportableSeverity), mSeverity(severity) {} LogStreamConsumer(LogStreamConsumer&& other) noexcept : LogStreamConsumerBase(severityOstream(other.mSeverity), severityPrefix(other.mSeverity), other.mShouldLog), std::ostream(&mBuffer) // links the stream buffer with the stream , mShouldLog(other.mShouldLog), mSeverity(other.mSeverity) {} void setReportableSeverity(Severity reportableSeverity) { mShouldLog = mSeverity <= reportableSeverity; mBuffer.setShouldLog(mShouldLog); } private: static std::ostream& severityOstream(Severity severity) { return severity >= Severity::kINFO ? std::cout : std::cerr; } static std::string severityPrefix(Severity severity) { switch (severity) { case Severity::kINTERNAL_ERROR: return "[F] "; case Severity::kERROR: return "[E] "; case Severity::kWARNING: return "[W] "; case Severity::kINFO: return "[I] "; case Severity::kVERBOSE: return "[V] "; default: assert(0); return ""; } } bool mShouldLog; Severity mSeverity; }; //! \class Logger //! //! \brief Class which manages logging of TensorRT tools and samples //! //! \details This class provides a common interface for TensorRT tools and samples to log information to the console, //! and supports logging two types of messages: //! //! - Debugging messages with an associated severity (info, warning, error, or internal error/fatal) //! - Test pass/fail messages //! //! The advantage of having all samples use this class for logging as opposed to emitting directly to stdout/stderr is //! that the logic for controlling the verbosity and formatting of sample output is centralized in one location. //! //! In the future, this class could be extended to support dumping test results to a file in some standard format //! 
(for example, JUnit XML), and providing additional metadata (e.g. timing the duration of a test run). //! //! TODO: For backwards compatibility with existing samples, this class inherits directly from the nvinfer1::ILogger //! interface, which is problematic since there isn't a clean separation between messages coming from the TensorRT //! library and messages coming from the sample. //! //! In the future (once all samples are updated to use Logger::getTRTLogger() to access the ILogger) we can refactor the //! class to eliminate the inheritance and instead make the nvinfer1::ILogger implementation a member of the Logger //! object. class Logger : public nvinfer1::ILogger { private: struct TestInfo; public: Logger(Severity severity = Severity::kWARNING) : mReportableSeverity(severity) {} //! //! \enum TestResult //! \brief Represents the state of a given test //! enum class TestResult : std::uint8_t { kRUNNING, //!< The test is running kPASSED, //!< The test passed kFAILED, //!< The test failed kWAIVED //!< The test was waived }; //! //! \brief Forward-compatible method for retrieving the nvinfer::ILogger associated with this Logger //! \return The nvinfer1::ILogger associated with this Logger //! //! TODO Once all samples are updated to use this method to register the logger with TensorRT, //! we can eliminate the inheritance of Logger from ILogger //! nvinfer1::ILogger& getTRTLogger() { return *this; } //! //! \brief Implementation of the nvinfer1::ILogger::log() virtual method //! //! Note samples should not be calling this function directly; it will eventually go away once we eliminate the //! inheritance from nvinfer1::ILogger //! void log(Severity severity, const char* msg) TRT_NOEXCEPT override { LogStreamConsumer(mReportableSeverity, severity) << "[TRT] " << std::string(msg) << '\n'; } //! //! \brief Method for controlling the verbosity of logging output //! //! \param severity The logger will only emit messages that have severity of this level or higher. 
//! void setReportableSeverity(Severity severity) { mReportableSeverity = severity; } //! //! \brief Opaque handle that holds logging information for a particular test //! //! This object is an opaque handle to information used by the Logger to print test results. //! The sample must call Logger::defineTest() in order to obtain a TestAtom that can be used //! with Logger::reportTest{Start,End}(). //! class TestAtom { public: TestAtom(TestAtom&&) = default; private: friend class Logger; TestAtom(bool started, TestInfo info) : mStarted(started), mName(std::move(info.name)), mCmdline(std::move(info.cmdline)) {} bool mStarted; std::string mName; std::string mCmdline; }; //! //! \brief Define a test for logging //! //! \param[in] name The name of the test. This should be a string starting with //! "TensorRT" and containing dot-separated strings containing //! the characters [A-Za-z0-9_]. //! For example, "TensorRT.sample_googlenet" //! \param[in] cmdline The command line used to reproduce the test // //! \return a TestAtom that can be used in Logger::reportTest{Start,End}(). //! static TestAtom defineTest(const std::string& name, const std::string& cmdline) { return TestAtom{false, TestInfo{name, cmdline}}; } //! //! \brief A convenience overloaded version of defineTest() that accepts an array of command-line arguments //! as input //! //! \param[in] name The name of the test //! \param[in] argc The number of command-line arguments //! \param[in] argv The array of command-line arguments (given as C strings) //! //! \return a TestAtom that can be used in Logger::reportTest{Start,End}(). static TestAtom defineTest(const std::string& name, int argc, char const* const* argv) { auto cmdline = genCmdlineString(argc, argv); return defineTest(name, cmdline); } //! //! \brief Report that a test has started. //! //! \pre reportTestStart() has not been called yet for the given testAtom //! //! \param[in] testAtom The handle to the test that has started //! 
static void reportTestStart(TestAtom& testAtom) { reportTestResult(testAtom, TestResult::kRUNNING); assert(!testAtom.mStarted); testAtom.mStarted = true; } //! //! \brief Report that a test has ended. //! //! \pre reportTestStart() has been called for the given testAtom //! //! \param[in] testAtom The handle to the test that has ended //! \param[in] result The result of the test. Should be one of TestResult::kPASSED, //! TestResult::kFAILED, TestResult::kWAIVED //! static void reportTestEnd(const TestAtom& testAtom, TestResult result) { assert(result != TestResult::kRUNNING); assert(testAtom.mStarted); reportTestResult(testAtom, result); } static int reportPass(const TestAtom& testAtom) { reportTestEnd(testAtom, TestResult::kPASSED); return EXIT_SUCCESS; } static int reportFail(const TestAtom& testAtom) { reportTestEnd(testAtom, TestResult::kFAILED); return EXIT_FAILURE; } static int reportWaive(const TestAtom& testAtom) { reportTestEnd(testAtom, TestResult::kWAIVED); return EXIT_SUCCESS; } static int reportTest(const TestAtom& testAtom, bool pass) { return pass ? reportPass(testAtom) : reportFail(testAtom); } [[nodiscard]] Severity getReportableSeverity() const { return mReportableSeverity; } private: struct TestInfo { std::string name; std::string cmdline; }; //! //! \brief returns an appropriate string for prefixing a log message with the given severity //! static const char* severityPrefix(Severity severity) { switch (severity) { case Severity::kINTERNAL_ERROR: return "[F] "; case Severity::kERROR: return "[E] "; case Severity::kWARNING: return "[W] "; case Severity::kINFO: return "[I] "; case Severity::kVERBOSE: return "[V] "; default: assert(0); return ""; } } //! //! \brief returns an appropriate string for prefixing a test result message with the given result //! 
static const char* testResultString(TestResult result) { switch (result) { case TestResult::kRUNNING: return "RUNNING"; case TestResult::kPASSED: return "PASSED"; case TestResult::kFAILED: return "FAILED"; case TestResult::kWAIVED: return "WAIVED"; default: assert(0); return ""; } } //! //! \brief returns an appropriate output stream (cout or cerr) to use with the given severity //! static std::ostream& severityOstream(Severity severity) { return severity >= Severity::kINFO ? std::cout : std::cerr; } //! //! \brief method that implements logging test results //! static void reportTestResult(const TestAtom& testAtom, TestResult result) { severityOstream(Severity::kINFO) << "&&&& " << testResultString(result) << " " << testAtom.mName << " # " << testAtom.mCmdline << '\n'; } //! //! \brief generate a command line string from the given (argc, argv) values //! static std::string genCmdlineString(int argc, char const* const* argv) { std::stringstream ss; for (int i = 0; i < argc; i++) { if (i > 0) ss << " "; ss << argv[i]; } return ss.str(); } Severity mReportableSeverity; }; namespace { //! //! \brief produces a LogStreamConsumer object that can be used to log messages of severity kVERBOSE //! //! Example usage: //! //! LOG_VERBOSE(logger) << "hello world" << std::endl; //! inline LogStreamConsumer LOG_VERBOSE(const Logger& logger) { return LogStreamConsumer{logger.getReportableSeverity(), Severity::kVERBOSE}; } //! //! \brief produces a LogStreamConsumer object that can be used to log messages of severity kINFO //! //! Example usage: //! //! LOG_INFO(logger) << "hello world" << std::endl; //! inline LogStreamConsumer LOG_INFO(const Logger& logger) { return LogStreamConsumer{logger.getReportableSeverity(), Severity::kINFO}; } //! //! \brief produces a LogStreamConsumer object that can be used to log messages of severity kWARNING //! //! Example usage: //! //! LOG_WARN(logger) << "hello world" << std::endl; //! 
inline LogStreamConsumer LOG_WARN(const Logger& logger) { return LogStreamConsumer{logger.getReportableSeverity(), Severity::kWARNING}; } //! //! \brief produces a LogStreamConsumer object that can be used to log messages of severity kERROR //! //! Example usage: //! //! LOG_ERROR(logger) << "hello world" << std::endl; //! inline LogStreamConsumer LOG_ERROR(const Logger& logger) { return LogStreamConsumer{logger.getReportableSeverity(), Severity::kERROR}; } //! //! \brief produces a LogStreamConsumer object that can be used to log messages of severity kINTERNAL_ERROR // ("fatal" severity) //! //! Example usage: //! //! LOG_FATAL(logger) << "hello world" << std::endl; //! inline LogStreamConsumer LOG_FATAL(const Logger& logger) { return LogStreamConsumer{logger.getReportableSeverity(), Severity::kINTERNAL_ERROR}; } } // anonymous namespace #endif // TENSORRT_LOGGING_H ================================================ FILE: alexnet/macros.h ================================================ #pragma once #include #ifdef API_EXPORTS #if defined(_MSC_VER) #define API __declspec(dllexport) #else #define API __attribute__((visibility("default"))) #endif #else #if defined(_MSC_VER) #define API __declspec(dllimport) #else #define API #endif #endif // API_EXPORTS #define TRT_VERSION \ ((NV_TENSORRT_MAJOR * 1000) + (NV_TENSORRT_MINOR * 100) + (NV_TENSORRT_PATCH * 10) + NV_TENSORRT_BUILD) #if TRT_VERSION >= 8000 #define TRT_NOEXCEPT noexcept #define TRT_CONST_ENQUEUE const #else #define TRT_NOEXCEPT #define TRT_CONST_ENQUEUE #endif ================================================ FILE: alexnet/utils.h ================================================ #pragma once #include #include #include #include #include #include #include #include #include #include #include #include #include "macros.h" using namespace nvinfer1; constexpr const std::size_t WORKSPACE_SIZE = 16 << 20; #define CHECK(status) \ do { \ auto ret = (status); \ if (ret != cudaSuccess) { \ std::cerr << "Cuda failure: " << 
ret << "\n"; \ std::abort(); \ } \ } while (0) static void checkTrtEnv(int device = 0) { #if TRT_VERSION < 8000 CHECK(cudaGetDevice(&device)); cudaDeviceProp prop{}; CHECK(cudaGetDeviceProperties(&prop, device)); const int sm = prop.major * 10 + prop.minor; if (sm > 86) { std::cerr << "TensorRT < 8 does not support SM > 86 on this GPU."; std::abort(); } #endif } /** * @brief TensorRT weight files have a simple space delimited format: * [type] [size] * * @param file input weight file path * @return std::map */ static auto loadWeights(const std::string& file) { std::cout << "Loading weights: " << file << "\n"; std::map weightMap; // Open weights file std::ifstream input(file); assert(input.is_open() && "Unable to load weight file."); // Read number of weight blobs int32_t count; input >> count; assert(count > 0 && "Invalid weight map file."); while (count--) { nvinfer1::Weights wt{nvinfer1::DataType::kFLOAT, nullptr, 0}; // Read name and type of blob std::string name; input >> name >> std::dec >> wt.count; // Load blob auto* val = new uint32_t[wt.count]; input >> std::hex; for (auto x = 0ll; x < wt.count; ++x) { input >> val[x]; } wt.values = val; weightMap[name] = wt; } return weightMap; } /** * @brief a preprocess function aligning with ImageNet preprocess in torchvision, only support 3-channel image * * @param img opencv image with BGR layout * @param bgr2rgb whether to convert BGR to RGB * @param mean subtract mean * @param std divide std * @param n batch size * @param h resize height * @param w resize width * @return std::vector contiguous flatten image data in float32 type */ static std::vector preprocess_img(cv::Mat& img, bool bgr2rgb, const std::array& mean, const std::array& std, int n, int h, int w) { const auto c = img.channels(); const auto size = c * h * w; if (c != 3) { std::cerr << "this demo only supports 3 channel input image.\n"; std::abort(); } if (bgr2rgb) { cv::cvtColor(img, img, cv::COLOR_BGR2RGB); } cv::resize(img, img, cv::Size(w, h), 0, 0, 
/**
 * @brief Return the `k` largest entries of `v`, best first.
 *
 * @param v scores (e.g. logits)
 * @param k number of entries wanted; clamped to `v.size()`, and `k <= 0` yields an empty result
 * @return vector of (index into v, value) pairs ordered by descending value
 */
static auto topk(const std::vector<float>& v, int k) -> std::vector<std::pair<int, float>> {
    if (k <= 0)
        return {};

    // Clamp first and use the clamped count everywhere: passing `idx.begin() + k` to
    // partial_sort with k > v.size() forms an iterator past the end (undefined behavior).
    const int count = std::min(k, static_cast<int>(v.size()));

    std::vector<int> idx(v.size());
    std::iota(idx.begin(), idx.end(), 0);
    std::partial_sort(idx.begin(), idx.begin() + count, idx.end(), [&](int a, int b) { return v[a] > v[b]; });

    std::vector<std::pair<int, float>> out;
    out.reserve(count);
    for (int i = 0; i < count; ++i)
        out.emplace_back(idx[i], v[idx[i]]);
    return out;
}
static const Weights empty{DataType::kFLOAT, nullptr, 0ll}; const Weights shift{DataType::kFLOAT, params->shift.data(), 3ll}; const Weights scale{DataType::kFLOAT, params->scale.data(), 3ll}; gScaleParams.emplace_back(std::move(params)); ITensor* in = &input; if (input.getType() != DataType::kFLOAT) { #if TRT_VERSION >= 8000 auto* cast = network->addCast(input, DataType::kFLOAT); assert(cast); cast->setName("Cast to FP32"); in = cast->getOutput(0); #else auto* identity = network->addIdentity(input); assert(identity); identity->setName("Convert to FP32"); identity->setOutputType(0, DataType::kFLOAT); in = identity->getOutput(0); #endif } // Convert from NHWC to NCHW auto* perm = network->addShuffle(*in); assert(perm); perm->setName("NHWC -> NCHW"); perm->setFirstTranspose(Permutation{0, 3, 1, 2}); // Convert from BGR to RGB (optional) ITensor* data{nullptr}; if (bgr2rgb) { auto add_slice = [&](int c, const char* name) -> ITensor* { auto dims = perm->getOutput(0)->getDimensions(); Dims4 start = {0, c, 0, 0}, stride = {1, 1, 1, 1}; Dims4 size = {dims.d[0], 1, dims.d[2], dims.d[3]}; auto* _slice = network->addSlice(*perm->getOutput(0), start, size, stride); _slice->setName(name); assert(_slice && _slice->getNbOutputs() == 1); return _slice->getOutput(0); }; std::array channels = {add_slice(2, "R"), add_slice(1, "G"), add_slice(0, "B")}; auto* cat = network->addConcatenation(channels.data(), 3); assert(cat); cat->setName("RGB"); cat->setAxis(1); data = cat->getOutput(0); } else { data = perm->getOutput(0); } // Normalize auto* trans = network->addScale(*data, ScaleMode::kCHANNEL, shift, scale, empty); assert(trans); trans->setName("mean & std"); #if TRT_VERSION >= 8000 trans->setChannelAxis(1); #endif return trans; } static size_t getSize(DataType dt) { switch (dt) { #if TRT_VERSION >= 8510 case DataType::kUINT8: #endif case DataType::kINT8: return sizeof(int8_t); case DataType::kFLOAT: return sizeof(float); case DataType::kHALF: return sizeof(int16_t); case 
DataType::kINT32: return sizeof(int32_t); default: { std::cerr << "Unsupported data type\n"; std::abort(); } } } ================================================ FILE: arcface/CMakeLists.txt ================================================ cmake_minimum_required(VERSION 2.6) project(arcface) add_definitions(-std=c++11) option(CUDA_USE_STATIC_CUDA_RUNTIME OFF) set(CMAKE_CXX_STANDARD 11) set(CMAKE_BUILD_TYPE Debug) find_package(CUDA REQUIRED) include_directories(${PROJECT_SOURCE_DIR}/include) if (CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64") message("embed_platform on") include_directories(/usr/local/cuda/targets/aarch64-linux/include) link_directories(/usr/local/cuda/targets/aarch64-linux/lib) else() message("embed_platform off") include_directories(/usr/local/cuda/include) link_directories(/usr/local/cuda/lib64) endif() set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11 -Wall -Ofast -Wfatal-errors -D_MWAITXINTRIN_H_INCLUDED") cuda_add_library(myplugins SHARED ${PROJECT_SOURCE_DIR}/prelu.cu) find_package(OpenCV) include_directories(${OpenCV_INCLUDE_DIRS}) add_executable(arcface-r50 ${PROJECT_SOURCE_DIR}/arcface-r50.cpp) target_link_libraries(arcface-r50 nvinfer) target_link_libraries(arcface-r50 cudart) target_link_libraries(arcface-r50 myplugins) target_link_libraries(arcface-r50 ${OpenCV_LIBS}) add_executable(arcface-mobilefacenet ${PROJECT_SOURCE_DIR}/arcface-mobilefacenet.cpp) target_link_libraries(arcface-mobilefacenet nvinfer) target_link_libraries(arcface-mobilefacenet cudart) target_link_libraries(arcface-mobilefacenet myplugins) target_link_libraries(arcface-mobilefacenet ${OpenCV_LIBS}) add_executable(arcface-r100 ${PROJECT_SOURCE_DIR}/arcface-r100.cpp) target_link_libraries(arcface-r100 nvinfer) target_link_libraries(arcface-r100 cudart) target_link_libraries(arcface-r100 myplugins) target_link_libraries(arcface-r100 ${OpenCV_LIBS}) add_definitions(-O2 -pthread) ================================================ FILE: arcface/README.md 
================================================
# arcface

### TensorRT 8

The mxnet implementation is from [deepinsight/insightface.](https://github.com/deepinsight/insightface)

**Updated Pretrained Weights:** ArcFace-R100 [Insight Face Google Drive](https://drive.google.com/file/d/1Hc5zUfBATaXUgcU2haUNa7dcaZSw95h2/view)

---

**Previous Pre-trained models:** The pretrained models are from [LResNet50E-IR,ArcFace@ms1m-refine-v1](https://github.com/deepinsight/insightface/wiki/Model-Zoo#32-lresnet50e-irarcfacems1m-refine-v1), [LResNet100E-IR,ArcFace@ms1m-refine-v2](https://github.com/deepinsight/insightface/wiki/Model-Zoo#31-lresnet100e-irarcfacems1m-refine-v2) and [MobileFaceNet,ArcFace@ms1m-refine-v1](https://github.com/deepinsight/insightface/wiki/Model-Zoo#34-mobilefacenetarcfacems1m-refine-v1)

---

The two input images used in this project are joey0.ppm and joey1.ppm; download them from [Google Drive](https://drive.google.com/drive/folders/1ctqpkRCRKyBZRCNwo9Uq4eUoMRLtFq1e). Each input image is 112x112 and is generated by `get_input()` in `insightface/deploy/face_model.py`, which produces a cropped and aligned face image.

## Config

- FP16/FP32 can be selected by the macro `USE_FP16` in arcface-r50/r100/mobilefacenet.cpp
- GPU id can be selected by the macro `DEVICE` in arcface-r50/r100/mobilefacenet.cpp

## Run

1. Generate the .wts file from the mxnet implementation of the pretrained model. The following example describes how to generate arcface-r100.wts from the mxnet implementation of LResNet100E-IR,ArcFace@ms1m-refine-v2.

```
git clone https://github.com/deepinsight/insightface
cd insightface
git checkout 3866cd77a6896c934b51ed39e9651b791d78bb57
cd deploy
// copy tensorrtx/arcface/gen_wts.py to here(insightface/deploy)
// download model-r100-ii.zip and unzip here(insightface/deploy)
python gen_wts.py
// a file 'arcface-r100.wts' will be generated.
// the master branch of insightface should work, if not, you can checkout 94ad870abb3203d6f31b049b70dd080dc8f33fca
// arcface-r50.wts/arcface-mobilefacenet.wts can be generated in a similar way from the mxnet implementation of the LResNet50E-IR,ArcFace@ms1m-refine-v1/MobileFaceNet,ArcFace@ms1m-refine-v1 pretrained model.
```

2. Put the .wts file into tensorrtx/arcface, then build and run

```
cd tensorrtx/arcface
// download joey0.ppm and joey1.ppm, and put here(tensorrtx/arcface)
mkdir build
cd build
cmake ..
make
sudo ./arcface-r100 -s    // serialize model to plan file i.e. 'arcface-r100.engine'
sudo ./arcface-r100 -d    // deserialize plan file and run inference

or

sudo ./arcface-r50 -s   // serialize model to plan file i.e. 'arcface-r50.engine'
sudo ./arcface-r50 -d   // deserialize plan file and run inference

or

sudo ./arcface-mobilefacenet -s   // serialize model to plan file i.e. 'arcface-mobilefacenet.engine'
sudo ./arcface-mobilefacenet -d   // deserialize plan file and run inference
```

3. Check the output log for latency and the similarity score.
## More Information See the readme in [home page.](https://github.com/wang-xinyu/tensorrtx) ================================================ FILE: arcface/arcface-mobilefacenet.cpp ================================================ #include #include #include #include #include #include #include #include #include "NvInfer.h" #include "cuda_runtime_api.h" #include "logging.h" #define CHECK(status) \ do\ {\ auto ret = (status);\ if (ret != 0)\ {\ std::cerr << "Cuda failure: " << ret << std::endl;\ abort();\ }\ } while (0) //#define USE_FP16 // comment out this if want to use FP32 #define DEVICE 0 // GPU id #define BATCH_SIZE 1 // currently, only support BATCH=1 using namespace nvinfer1; // stuff we know about the network and the input/output blobs static const int INPUT_H = 112; static const int INPUT_W = 112; static const int OUTPUT_SIZE = 128; const char* INPUT_BLOB_NAME = "data"; const char* OUTPUT_BLOB_NAME = "prob"; static Logger gLogger; // TensorRT weight files have a simple space delimited format: // [type] [size] std::map loadWeights(const std::string file) { std::cout << "Loading weights: " << file << std::endl; std::map weightMap; // Open weights file std::ifstream input(file); assert(input.is_open() && "Unable to load weight file."); // Read number of weight blobs int32_t count; input >> count; assert(count > 0 && "Invalid weight map file."); while (count--) { Weights wt{DataType::kFLOAT, nullptr, 0}; uint32_t size; // Read name and type of blob std::string name; input >> name >> std::dec >> size; wt.type = DataType::kFLOAT; // Load blob uint32_t* val = reinterpret_cast(malloc(sizeof(val) * size)); for (uint32_t x = 0, y = size; x < y; ++x) { input >> std::hex >> val[x]; } wt.values = val; wt.count = size; weightMap[name] = wt; } return weightMap; } IScaleLayer* addBatchNorm2d(INetworkDefinition *network, std::map& weightMap, ITensor& input, std::string lname, float eps) { float *gamma = (float*)weightMap[lname + "_gamma"].values; float *beta = 
(float*)weightMap[lname + "_beta"].values; float *mean = (float*)weightMap[lname + "_moving_mean"].values; float *var = (float*)weightMap[lname + "_moving_var"].values; int len = weightMap[lname + "_moving_var"].count; float *scval = reinterpret_cast(malloc(sizeof(float) * len)); for (int i = 0; i < len; i++) { scval[i] = gamma[i] / sqrt(var[i] + eps); } Weights scale{DataType::kFLOAT, scval, len}; float *shval = reinterpret_cast(malloc(sizeof(float) * len)); for (int i = 0; i < len; i++) { shval[i] = beta[i] - mean[i] * gamma[i] / sqrt(var[i] + eps); } Weights shift{DataType::kFLOAT, shval, len}; float *pval = reinterpret_cast(malloc(sizeof(float) * len)); for (int i = 0; i < len; i++) { pval[i] = 1.0; } Weights power{DataType::kFLOAT, pval, len}; weightMap[lname + ".scale"] = scale; weightMap[lname + ".shift"] = shift; weightMap[lname + ".power"] = power; IScaleLayer* scale_1 = network->addScale(input, ScaleMode::kCHANNEL, shift, scale, power); assert(scale_1); return scale_1; } ILayer* addPRelu(INetworkDefinition *network, std::map& weightMap, ITensor& input, std::string lname) { float *gamma = (float*)weightMap[lname + "_gamma"].values; int len = weightMap[lname + "_gamma"].count; float *scval_1 = reinterpret_cast(malloc(sizeof(float) * len)); float *scval_2 = reinterpret_cast(malloc(sizeof(float) * len)); for (int i = 0; i < len; i++) { scval_1[i] = -1.0; scval_2[i] = -gamma[i]; } Weights scale_1{ DataType::kFLOAT, scval_1, len }; Weights scale_2{ DataType::kFLOAT, scval_2, len }; float *shval = reinterpret_cast(malloc(sizeof(float) * len)); for (int i = 0; i < len; i++) { shval[i] = 0.0; } Weights shift{ DataType::kFLOAT, shval, len }; float *pval = reinterpret_cast(malloc(sizeof(float) * len)); for (int i = 0; i < len; i++) { pval[i] = 1.0; } Weights power{ DataType::kFLOAT, pval, len }; auto relu1 = network->addActivation(input, ActivationType::kRELU); assert(relu1); IScaleLayer* scale1 = network->addScale(input, ScaleMode::kCHANNEL, shift, scale_1, power); 
assert(scale1); auto relu2 = network->addActivation(*scale1->getOutput(0), ActivationType::kRELU); assert(relu2); IScaleLayer* scale2 = network->addScale(*relu2->getOutput(0), ScaleMode::kCHANNEL, shift, scale_2, power); assert(scale2); IElementWiseLayer* ew1 = network->addElementWise(*relu1->getOutput(0), *scale2->getOutput(0), ElementWiseOperation::kSUM); assert(ew1); return ew1; } ILayer* conv_bn_relu(INetworkDefinition *network, std::map& weightMap, ITensor& input, std::string lname, int oup, int k = 3, int p = 1, int s = 2, int groups=1) { Weights emptywts{DataType::kFLOAT, nullptr, 0}; IConvolutionLayer* conv1 = network->addConvolutionNd(input, oup, DimsHW{k, k}, weightMap[lname + "_conv2d_weight"], emptywts); assert(conv1); conv1->setStrideNd(DimsHW{s, s}); conv1->setPaddingNd(DimsHW{p, p}); conv1->setNbGroups(groups); auto bn1 = addBatchNorm2d(network, weightMap, *conv1->getOutput(0), lname + "_batchnorm", 1e-3); assert(bn1); auto act1 = addPRelu(network, weightMap, *bn1->getOutput(0), lname + "_relu"); assert(act1); return act1; } ILayer* conv_bn(INetworkDefinition *network, std::map& weightMap, ITensor& input, std::string lname, int oup, int k = 3, int p = 1, int s = 1, int groups=1) { Weights emptywts{DataType::kFLOAT, nullptr, 0}; IConvolutionLayer* conv1 = network->addConvolutionNd(input, oup, DimsHW{k, k}, weightMap[lname + "_conv2d_weight"], emptywts); assert(conv1); conv1->setStrideNd(DimsHW{s, s}); conv1->setPaddingNd(DimsHW{p, p}); conv1->setNbGroups(groups); auto bn1 = addBatchNorm2d(network, weightMap, *conv1->getOutput(0), lname + "_batchnorm", 1e-3); assert(bn1); return bn1; } ILayer* DepthWise(INetworkDefinition *network, std::map& weightMap, ITensor& input, std::string lname, int inp, int oup, int groups, int s) { Weights emptywts{DataType::kFLOAT, nullptr, 0}; IConvolutionLayer* conv1 = network->addConvolutionNd(input, groups, DimsHW{1, 1}, weightMap[lname + "_conv_sep_conv2d_weight"], emptywts); assert(conv1); conv1->setStrideNd(DimsHW{1, 
1}); conv1->setPaddingNd(DimsHW{0, 0}); conv1->setNbGroups(1); auto bn1 = addBatchNorm2d(network, weightMap, *conv1->getOutput(0), lname + "_conv_sep_batchnorm", 1e-3); assert(bn1); auto act1 = addPRelu(network, weightMap, *bn1->getOutput(0), lname + "_conv_sep_relu"); assert(act1); IConvolutionLayer* conv2 = network->addConvolutionNd(*act1->getOutput(0), groups, DimsHW{3, 3}, weightMap[lname + "_conv_dw_conv2d_weight"], emptywts); assert(conv2); conv2->setStrideNd(DimsHW{s, s}); conv2->setPaddingNd(DimsHW{1, 1}); conv2->setNbGroups(groups); auto bn2 = addBatchNorm2d(network, weightMap, *conv2->getOutput(0), lname + "_conv_dw_batchnorm", 1e-3); assert(bn2); auto act2 = addPRelu(network, weightMap, *bn2->getOutput(0), lname + "_conv_dw_relu"); assert(act2); IConvolutionLayer* conv3 = network->addConvolutionNd(*act2->getOutput(0), oup, DimsHW{1, 1}, weightMap[lname + "_conv_proj_conv2d_weight"], emptywts); assert(conv3); conv3->setStrideNd(DimsHW{1, 1}); conv3->setPaddingNd(DimsHW{0, 0}); conv3->setNbGroups(1); auto bn3 = addBatchNorm2d(network, weightMap, *conv3->getOutput(0), lname + "_conv_proj_batchnorm", 1e-3); assert(bn3); return bn3; } ILayer* DWResidual(INetworkDefinition *network, std::map& weightMap, ITensor& input, std::string lname, int inp, int oup, int groups, int s) { auto dw1 = DepthWise(network, weightMap, input, lname, inp, oup, groups, s); IElementWiseLayer* ew1; ew1 = network->addElementWise(input, *dw1->getOutput(0), ElementWiseOperation::kSUM); assert(ew1); return ew1; } // Creat the engine using only the API and not any parser. 
ICudaEngine* createEngine(unsigned int maxBatchSize, IBuilder* builder, IBuilderConfig* config, DataType dt) { INetworkDefinition* network = builder->createNetworkV2(0U); // Create input tensor of shape {3, INPUT_H, INPUT_W} with name INPUT_BLOB_NAME ITensor* data = network->addInput(INPUT_BLOB_NAME, dt, Dims3{3, INPUT_H, INPUT_W}); assert(data); std::map weightMap = loadWeights("../arcface-mobilefacenet.wts"); Weights emptywts{DataType::kFLOAT, nullptr, 0}; auto conv_1 = conv_bn_relu(network, weightMap, *data, "conv_1", 64, 3, 1, 2); auto conv_2_dw = conv_bn_relu(network, weightMap, *conv_1->getOutput(0), "conv_2_dw", 64, 3, 1, 1, 64); auto conv_23 = DepthWise(network, weightMap, *conv_2_dw->getOutput(0), "dconv_23", 64, 64, 128, 2); auto res_3_block0 = DWResidual(network, weightMap, *conv_23->getOutput(0), "res_3_block0", 64, 64, 128, 1); auto res_3_block1 = DWResidual(network, weightMap, *res_3_block0->getOutput(0), "res_3_block1", 64, 64, 128, 1); auto res_3_block2 = DWResidual(network, weightMap, *res_3_block1->getOutput(0), "res_3_block2", 64, 64, 128, 1); auto res_3_block3 = DWResidual(network, weightMap, *res_3_block2->getOutput(0), "res_3_block3", 64, 64, 128, 1); auto conv_34 = DepthWise(network, weightMap, *res_3_block3->getOutput(0), "dconv_34", 64, 128, 256, 2); auto res_4_block0 = DWResidual(network, weightMap, *conv_34->getOutput(0), "res_4_block0", 128, 128, 256, 1); auto res_4_block1 = DWResidual(network, weightMap, *res_4_block0->getOutput(0), "res_4_block1", 128, 128, 256, 1); auto res_4_block2 = DWResidual(network, weightMap, *res_4_block1->getOutput(0), "res_4_block2", 128, 128, 256, 1); auto res_4_block3 = DWResidual(network, weightMap, *res_4_block2->getOutput(0), "res_4_block3", 128, 128, 256, 1); auto res_4_block4 = DWResidual(network, weightMap, *res_4_block3->getOutput(0), "res_4_block4", 128, 128, 256, 1); auto res_4_block5 = DWResidual(network, weightMap, *res_4_block4->getOutput(0), "res_4_block5", 128, 128, 256, 1); auto conv_45 = 
DepthWise(network, weightMap, *res_4_block5->getOutput(0), "dconv_45", 128, 128, 512, 2); auto res_5_block0 = DWResidual(network, weightMap, *conv_45->getOutput(0), "res_5_block0", 128, 128, 256, 1); auto res_5_block1 = DWResidual(network, weightMap, *res_5_block0->getOutput(0), "res_5_block1", 128, 128, 256, 1); auto conv_6_sep = conv_bn_relu(network, weightMap, *res_5_block1->getOutput(0), "conv_6sep", 512, 1, 0, 1); auto conv_6dw7_7 = conv_bn(network, weightMap, *conv_6_sep->getOutput(0), "conv_6dw7_7", 512, 7, 0, 1, 512); IFullyConnectedLayer* fc1 = network->addFullyConnected(*conv_6dw7_7->getOutput(0), 128, weightMap["fc1_weight"], weightMap["pre_fc1_bias"]); assert(fc1); auto bn1 = addBatchNorm2d(network, weightMap, *fc1->getOutput(0), "fc1", 2e-5); assert(bn1); bn1->getOutput(0)->setName(OUTPUT_BLOB_NAME); network->markOutput(*bn1->getOutput(0)); // Build engine builder->setMaxBatchSize(maxBatchSize); config->setMaxWorkspaceSize(16 * (1 << 20)); // 16MB #ifdef USE_FP16 config->setFlag(BuilderFlag::kFP16); #endif std::cout << "Building engine, please wait for a while..." << std::endl; ICudaEngine* engine = builder->buildEngineWithConfig(*network, *config); std::cout << "Build engine successfully!" 
<< std::endl; // Don't need the network any more network->destroy(); // Release host memory for (auto& mem : weightMap) { free((void*) (mem.second.values)); } return engine; } void APIToModel(unsigned int maxBatchSize, IHostMemory** modelStream) { // Create builder IBuilder* builder = createInferBuilder(gLogger); IBuilderConfig* config = builder->createBuilderConfig(); // Create model to populate the network, then set the outputs and create an engine ICudaEngine* engine = createEngine(maxBatchSize, builder, config, DataType::kFLOAT); assert(engine != nullptr); // Serialize the engine (*modelStream) = engine->serialize(); // Close everything down engine->destroy(); builder->destroy(); } void doInference(IExecutionContext& context, float* input, float* output, int batchSize) { const ICudaEngine& engine = context.getEngine(); // Pointers to input and output device buffers to pass to engine. // Engine requires exactly IEngine::getNbBindings() number of buffers. assert(engine.getNbBindings() == 2); void* buffers[2]; // In order to bind the buffers, we need to know the names of the input and output tensors. 
// Note that indices are guaranteed to be less than IEngine::getNbBindings() const int inputIndex = engine.getBindingIndex(INPUT_BLOB_NAME); const int outputIndex = engine.getBindingIndex(OUTPUT_BLOB_NAME); // Create GPU buffers on device CHECK(cudaMalloc(&buffers[inputIndex], batchSize * 3 * INPUT_H * INPUT_W * sizeof(float))); CHECK(cudaMalloc(&buffers[outputIndex], batchSize * OUTPUT_SIZE * sizeof(float))); // Create stream cudaStream_t stream; CHECK(cudaStreamCreate(&stream)); // DMA input batch data to device, infer on the batch asynchronously, and DMA output back to host CHECK(cudaMemcpyAsync(buffers[inputIndex], input, batchSize * 3 * INPUT_H * INPUT_W * sizeof(float), cudaMemcpyHostToDevice, stream)); context.enqueue(batchSize, buffers, stream, nullptr); CHECK(cudaMemcpyAsync(output, buffers[outputIndex], batchSize * OUTPUT_SIZE * sizeof(float), cudaMemcpyDeviceToHost, stream)); cudaStreamSynchronize(stream); // Release stream and buffers cudaStreamDestroy(stream); CHECK(cudaFree(buffers[inputIndex])); CHECK(cudaFree(buffers[outputIndex])); } int read_files_in_dir(const char *p_dir_name, std::vector &file_names) { DIR *p_dir = opendir(p_dir_name); if (p_dir == nullptr) { return -1; } struct dirent* p_file = nullptr; while ((p_file = readdir(p_dir)) != nullptr) { if (strcmp(p_file->d_name, ".") != 0 && strcmp(p_file->d_name, "..") != 0) { //std::string cur_file_name(p_dir_name); //cur_file_name += "/"; //cur_file_name += p_file->d_name; std::string cur_file_name(p_file->d_name); file_names.push_back(cur_file_name); } } closedir(p_dir); return 0; } int main(int argc, char** argv) { cudaSetDevice(DEVICE); // create a model using the API directly and serialize it to a stream char *trtModelStream{nullptr}; size_t size{0}; if (argc == 2 && std::string(argv[1]) == "-s") { IHostMemory* modelStream{nullptr}; APIToModel(BATCH_SIZE, &modelStream); assert(modelStream != nullptr); std::ofstream p("arcface-mobilefacenet.engine", std::ios::binary); if (!p) { std::cerr << 
"could not open plan output file" << std::endl; return -1; } p.write(reinterpret_cast(modelStream->data()), modelStream->size()); modelStream->destroy(); return 0; } else if (argc == 2 && std::string(argv[1]) == "-d") { std::ifstream file("arcface-mobilefacenet.engine", std::ios::binary); if (file.good()) { file.seekg(0, file.end); size = file.tellg(); file.seekg(0, file.beg); trtModelStream = new char[size]; assert(trtModelStream); file.read(trtModelStream, size); file.close(); } } else { std::cerr << "arguments not right!" << std::endl; std::cerr << "./arcface-mobilefacenet -s // serialize model to plan file" << std::endl; std::cerr << "./arcface-mobilefacenet -d // deserialize plan file and run inference" << std::endl; return -1; } // prepare input data --------------------------- static float data[BATCH_SIZE * 3 * INPUT_H * INPUT_W]; //for (int i = 0; i < 3 * INPUT_H * INPUT_W; i++) // data[i] = 1.0; static float prob[BATCH_SIZE * OUTPUT_SIZE]; IRuntime* runtime = createInferRuntime(gLogger); assert(runtime != nullptr); ICudaEngine* engine = runtime->deserializeCudaEngine(trtModelStream, size); assert(engine != nullptr); IExecutionContext* context = engine->createExecutionContext(); assert(context != nullptr); delete[] trtModelStream; cv::Mat img = cv::imread("../joey0.ppm"); for (int i = 0; i < INPUT_H * INPUT_W; i++) { data[i] = ((float)img.at(i)[2] - 127.5) * 0.0078125; data[i + INPUT_H * INPUT_W] = ((float)img.at(i)[1] - 127.5) * 0.0078125; data[i + 2 * INPUT_H * INPUT_W] = ((float)img.at(i)[0] - 127.5) * 0.0078125; } // Run inference auto start = std::chrono::system_clock::now(); doInference(*context, data, prob, BATCH_SIZE); auto end = std::chrono::system_clock::now(); std::cout << std::chrono::duration_cast(end - start).count() << "ms" << std::endl; cv::Mat out(128, 1, CV_32FC1, prob); cv::Mat out_norm; cv::normalize(out, out_norm); img = cv::imread("../joey1.ppm"); for (int i = 0; i < INPUT_H * INPUT_W; i++) { data[i] = ((float)img.at(i)[2] - 127.5) * 
0.0078125; data[i + INPUT_H * INPUT_W] = ((float)img.at(i)[1] - 127.5) * 0.0078125; data[i + 2 * INPUT_H * INPUT_W] = ((float)img.at(i)[0] - 127.5) * 0.0078125; } // Run inference start = std::chrono::system_clock::now(); doInference(*context, data, prob, BATCH_SIZE); end = std::chrono::system_clock::now(); std::cout << std::chrono::duration_cast(end - start).count() << "ms" << std::endl; cv::Mat out1(1, 128, CV_32FC1, prob); cv::Mat out_norm1; cv::normalize(out1, out_norm1); cv::Mat res = out_norm1 * out_norm; std::cout << "similarity score: " << *(float*)res.data << std::endl; // Destroy the engine context->destroy(); engine->destroy(); runtime->destroy(); //Print histogram of the output distribution //std::cout << "\nOutput:\n\n"; //for (unsigned int i = 0; i < OUTPUT_SIZE; i++) //{ // std::cout << p_out_norm[i] << ", "; // if (i % 10 == 0) std::cout << i / 10 << std::endl; //} //std::cout << std::endl; return 0; } ================================================ FILE: arcface/arcface-r100.cpp ================================================ #include #include #include #include #include #include #include #include #include "NvInfer.h" #include "cuda_runtime_api.h" #include "logging.h" #define CHECK(status) \ do\ {\ auto ret = (status);\ if (ret != 0)\ {\ std::cerr << "Cuda failure: " << ret << std::endl;\ abort();\ }\ } while (0) //#define USE_FP16 // comment out this if want to use FP32 #define DEVICE 0 // GPU id #define BATCH_SIZE 1 // currently, only support BATCH=1 using namespace nvinfer1; // stuff we know about the network and the input/output blobs static const int INPUT_H = 112; static const int INPUT_W = 112; static const int OUTPUT_SIZE = 512; const char* INPUT_BLOB_NAME = "data"; const char* OUTPUT_BLOB_NAME = "prob"; static Logger gLogger; // TensorRT weight files have a simple space delimited format: // [type] [size] std::map loadWeights(const std::string file) { std::cout << "Loading weights: " << file << std::endl; std::map weightMap; // Open 
weights file std::ifstream input(file); assert(input.is_open() && "Unable to load weight file."); // Read number of weight blobs int32_t count; input >> count; assert(count > 0 && "Invalid weight map file."); while (count--) { Weights wt{DataType::kFLOAT, nullptr, 0}; uint32_t size; // Read name and type of blob std::string name; input >> name >> std::dec >> size; wt.type = DataType::kFLOAT; // Load blob uint32_t* val = reinterpret_cast(malloc(sizeof(val) * size)); for (uint32_t x = 0, y = size; x < y; ++x) { input >> std::hex >> val[x]; } wt.values = val; wt.count = size; weightMap[name] = wt; } return weightMap; } IScaleLayer* addBatchNorm2d(INetworkDefinition *network, std::map& weightMap, ITensor& input, std::string lname, float eps) { float *gamma = (float*)weightMap[lname + "_gamma"].values; float *beta = (float*)weightMap[lname + "_beta"].values; float *mean = (float*)weightMap[lname + "_moving_mean"].values; float *var = (float*)weightMap[lname + "_moving_var"].values; int len = weightMap[lname + "_moving_var"].count; float *scval = reinterpret_cast(malloc(sizeof(float) * len)); for (int i = 0; i < len; i++) { scval[i] = gamma[i] / sqrt(var[i] + eps); } Weights scale{DataType::kFLOAT, scval, len}; float *shval = reinterpret_cast(malloc(sizeof(float) * len)); for (int i = 0; i < len; i++) { shval[i] = beta[i] - mean[i] * gamma[i] / sqrt(var[i] + eps); } Weights shift{DataType::kFLOAT, shval, len}; float *pval = reinterpret_cast(malloc(sizeof(float) * len)); for (int i = 0; i < len; i++) { pval[i] = 1.0; } Weights power{DataType::kFLOAT, pval, len}; weightMap[lname + ".scale"] = scale; weightMap[lname + ".shift"] = shift; weightMap[lname + ".power"] = power; IScaleLayer* scale_1 = network->addScale(input, ScaleMode::kCHANNEL, shift, scale, power); assert(scale_1); return scale_1; } ILayer* addPRelu(INetworkDefinition *network, std::map& weightMap, ITensor& input, std::string lname) { float *gamma = (float*)weightMap[lname + "_gamma"].values; int len = 
weightMap[lname + "_gamma"].count; float *scval_1 = reinterpret_cast(malloc(sizeof(float) * len)); float *scval_2 = reinterpret_cast(malloc(sizeof(float) * len)); for (int i = 0; i < len; i++) { scval_1[i] = -1.0; scval_2[i] = -gamma[i]; } Weights scale_1{ DataType::kFLOAT, scval_1, len }; Weights scale_2{ DataType::kFLOAT, scval_2, len }; float *shval = reinterpret_cast(malloc(sizeof(float) * len)); for (int i = 0; i < len; i++) { shval[i] = 0.0; } Weights shift{ DataType::kFLOAT, shval, len }; float *pval = reinterpret_cast(malloc(sizeof(float) * len)); for (int i = 0; i < len; i++) { pval[i] = 1.0; } Weights power{ DataType::kFLOAT, pval, len }; auto relu1 = network->addActivation(input, ActivationType::kRELU); assert(relu1); IScaleLayer* scale1 = network->addScale(input, ScaleMode::kCHANNEL, shift, scale_1, power); assert(scale1); auto relu2 = network->addActivation(*scale1->getOutput(0), ActivationType::kRELU); assert(relu2); IScaleLayer* scale2 = network->addScale(*relu2->getOutput(0), ScaleMode::kCHANNEL, shift, scale_2, power); assert(scale2); IElementWiseLayer* ew1 = network->addElementWise(*relu1->getOutput(0), *scale2->getOutput(0), ElementWiseOperation::kSUM); assert(ew1); return ew1; } ILayer* resUnit(INetworkDefinition *network, std::map& weightMap, ITensor& input, int num_filters, int s, bool dim_match, std::string lname) { Weights emptywts{DataType::kFLOAT, nullptr, 0}; auto bn1 = addBatchNorm2d(network, weightMap, input, lname + "_bn1", 2e-5); IConvolutionLayer* conv1 = network->addConvolutionNd(*bn1->getOutput(0), num_filters, DimsHW{3, 3}, weightMap[lname + "_conv1_weight"], emptywts); assert(conv1); conv1->setPaddingNd(DimsHW{1, 1}); auto bn2 = addBatchNorm2d(network, weightMap, *conv1->getOutput(0), lname + "_bn2", 2e-5); auto act1 = addPRelu(network, weightMap, *bn2->getOutput(0), lname + "_relu1"); IConvolutionLayer* conv2 = network->addConvolutionNd(*act1->getOutput(0), num_filters, DimsHW{3, 3}, weightMap[lname + "_conv2_weight"], 
emptywts); assert(conv2); conv2->setStrideNd(DimsHW{s, s}); conv2->setPaddingNd(DimsHW{1, 1}); auto bn3 = addBatchNorm2d(network, weightMap, *conv2->getOutput(0), lname + "_bn3", 2e-5); IElementWiseLayer* ew1; if (dim_match) { ew1 = network->addElementWise(input, *bn3->getOutput(0), ElementWiseOperation::kSUM); } else { IConvolutionLayer* conv1sc = network->addConvolutionNd(input, num_filters, DimsHW{1, 1}, weightMap[lname + "_conv1sc_weight"], emptywts); assert(conv1sc); conv1sc->setStrideNd(DimsHW{s, s}); auto bn1sc = addBatchNorm2d(network, weightMap, *conv1sc->getOutput(0), lname + "_sc", 2e-5); ew1 = network->addElementWise(*bn1sc->getOutput(0), *bn3->getOutput(0), ElementWiseOperation::kSUM); } assert(ew1); return ew1; } // Creat the engine using only the API and not any parser. ICudaEngine* createEngine(unsigned int maxBatchSize, IBuilder* builder, IBuilderConfig* config, DataType dt) { INetworkDefinition* network = builder->createNetworkV2(0U); // Create input tensor of shape {3, INPUT_H, INPUT_W} with name INPUT_BLOB_NAME ITensor* data = network->addInput(INPUT_BLOB_NAME, dt, Dims3{3, INPUT_H, INPUT_W}); assert(data); std::map weightMap = loadWeights("../arcface-r100.wts"); Weights emptywts{DataType::kFLOAT, nullptr, 0}; IConvolutionLayer* conv0 = network->addConvolutionNd(*data, 64, DimsHW{3, 3}, weightMap["conv0_weight"], emptywts); assert(conv0); conv0->setPaddingNd(DimsHW{1, 1}); auto bn0 = addBatchNorm2d(network, weightMap, *conv0->getOutput(0), "bn0", 2e-5); auto relu0 = addPRelu(network, weightMap, *bn0->getOutput(0), "relu0"); auto s1u1 = resUnit(network, weightMap, *relu0->getOutput(0), 64, 2, false, "stage1_unit1"); auto s1u2 = resUnit(network, weightMap, *s1u1->getOutput(0), 64, 1, true, "stage1_unit2"); auto s1u3 = resUnit(network, weightMap, *s1u2->getOutput(0), 64, 1, true, "stage1_unit3"); auto s2u1 = resUnit(network, weightMap, *s1u3->getOutput(0), 128, 2, false, "stage2_unit1"); auto s2u2 = resUnit(network, weightMap, *s2u1->getOutput(0), 
128, 1, true, "stage2_unit2"); auto s2u3 = resUnit(network, weightMap, *s2u2->getOutput(0), 128, 1, true, "stage2_unit3"); auto s2u4 = resUnit(network, weightMap, *s2u3->getOutput(0), 128, 1, true, "stage2_unit4"); auto s2u5 = resUnit(network, weightMap, *s2u4->getOutput(0), 128, 1, true, "stage2_unit5"); auto s2u6 = resUnit(network, weightMap, *s2u5->getOutput(0), 128, 1, true, "stage2_unit6"); auto s2u7 = resUnit(network, weightMap, *s2u6->getOutput(0), 128, 1, true, "stage2_unit7"); auto s2u8 = resUnit(network, weightMap, *s2u7->getOutput(0), 128, 1, true, "stage2_unit8"); auto s2u9 = resUnit(network, weightMap, *s2u8->getOutput(0), 128, 1, true, "stage2_unit9"); auto s2u10 = resUnit(network, weightMap, *s2u9->getOutput(0), 128, 1, true, "stage2_unit10"); auto s2u11 = resUnit(network, weightMap, *s2u10->getOutput(0), 128, 1, true, "stage2_unit11"); auto s2u12 = resUnit(network, weightMap, *s2u11->getOutput(0), 128, 1, true, "stage2_unit12"); auto s2u13 = resUnit(network, weightMap, *s2u12->getOutput(0), 128, 1, true, "stage2_unit13"); auto s3u1 = resUnit(network, weightMap, *s2u13->getOutput(0), 256, 2, false, "stage3_unit1"); auto s3u2 = resUnit(network, weightMap, *s3u1->getOutput(0), 256, 1, true, "stage3_unit2"); auto s3u3 = resUnit(network, weightMap, *s3u2->getOutput(0), 256, 1, true, "stage3_unit3"); auto s3u4 = resUnit(network, weightMap, *s3u3->getOutput(0), 256, 1, true, "stage3_unit4"); auto s3u5 = resUnit(network, weightMap, *s3u4->getOutput(0), 256, 1, true, "stage3_unit5"); auto s3u6 = resUnit(network, weightMap, *s3u5->getOutput(0), 256, 1, true, "stage3_unit6"); auto s3u7 = resUnit(network, weightMap, *s3u6->getOutput(0), 256, 1, true, "stage3_unit7"); auto s3u8 = resUnit(network, weightMap, *s3u7->getOutput(0), 256, 1, true, "stage3_unit8"); auto s3u9 = resUnit(network, weightMap, *s3u8->getOutput(0), 256, 1, true, "stage3_unit9"); auto s3u10 = resUnit(network, weightMap, *s3u9->getOutput(0), 256, 1, true, "stage3_unit10"); auto s3u11 = 
resUnit(network, weightMap, *s3u10->getOutput(0), 256, 1, true, "stage3_unit11"); auto s3u12 = resUnit(network, weightMap, *s3u11->getOutput(0), 256, 1, true, "stage3_unit12"); auto s3u13 = resUnit(network, weightMap, *s3u12->getOutput(0), 256, 1, true, "stage3_unit13"); auto s3u14 = resUnit(network, weightMap, *s3u13->getOutput(0), 256, 1, true, "stage3_unit14"); auto s3u15 = resUnit(network, weightMap, *s3u14->getOutput(0), 256, 1, true, "stage3_unit15"); auto s3u16 = resUnit(network, weightMap, *s3u15->getOutput(0), 256, 1, true, "stage3_unit16"); auto s3u17 = resUnit(network, weightMap, *s3u16->getOutput(0), 256, 1, true, "stage3_unit17"); auto s3u18 = resUnit(network, weightMap, *s3u17->getOutput(0), 256, 1, true, "stage3_unit18"); auto s3u19 = resUnit(network, weightMap, *s3u18->getOutput(0), 256, 1, true, "stage3_unit19"); auto s3u20 = resUnit(network, weightMap, *s3u19->getOutput(0), 256, 1, true, "stage3_unit20"); auto s3u21 = resUnit(network, weightMap, *s3u20->getOutput(0), 256, 1, true, "stage3_unit21"); auto s3u22 = resUnit(network, weightMap, *s3u21->getOutput(0), 256, 1, true, "stage3_unit22"); auto s3u23 = resUnit(network, weightMap, *s3u22->getOutput(0), 256, 1, true, "stage3_unit23"); auto s3u24 = resUnit(network, weightMap, *s3u23->getOutput(0), 256, 1, true, "stage3_unit24"); auto s3u25 = resUnit(network, weightMap, *s3u24->getOutput(0), 256, 1, true, "stage3_unit25"); auto s3u26 = resUnit(network, weightMap, *s3u25->getOutput(0), 256, 1, true, "stage3_unit26"); auto s3u27 = resUnit(network, weightMap, *s3u26->getOutput(0), 256, 1, true, "stage3_unit27"); auto s3u28 = resUnit(network, weightMap, *s3u27->getOutput(0), 256, 1, true, "stage3_unit28"); auto s3u29 = resUnit(network, weightMap, *s3u28->getOutput(0), 256, 1, true, "stage3_unit29"); auto s3u30 = resUnit(network, weightMap, *s3u29->getOutput(0), 256, 1, true, "stage3_unit30"); auto s4u1 = resUnit(network, weightMap, *s3u30->getOutput(0), 512, 2, false, "stage4_unit1"); auto s4u2 = 
resUnit(network, weightMap, *s4u1->getOutput(0), 512, 1, true, "stage4_unit2"); auto s4u3 = resUnit(network, weightMap, *s4u2->getOutput(0), 512, 1, true, "stage4_unit3"); auto bn1 = addBatchNorm2d(network, weightMap, *s4u3->getOutput(0), "bn1", 2e-5); IFullyConnectedLayer* fc1 = network->addFullyConnected(*bn1->getOutput(0), 512, weightMap["pre_fc1_weight"], weightMap["pre_fc1_bias"]); assert(fc1); auto bn2 = addBatchNorm2d(network, weightMap, *fc1->getOutput(0), "fc1", 2e-5); bn2->getOutput(0)->setName(OUTPUT_BLOB_NAME); network->markOutput(*bn2->getOutput(0)); // Build engine builder->setMaxBatchSize(maxBatchSize); config->setMaxWorkspaceSize(16 * (1 << 20)); // 16MB #ifdef USE_FP16 config->setFlag(BuilderFlag::kFP16); #endif std::cout << "Building engine, please wait for a while..." << std::endl; ICudaEngine* engine = builder->buildEngineWithConfig(*network, *config); std::cout << "Build engine successfully!" << std::endl; // Don't need the network any more network->destroy(); // Release host memory for (auto& mem : weightMap) { free((void*) (mem.second.values)); } return engine; } void APIToModel(unsigned int maxBatchSize, IHostMemory** modelStream) { // Create builder IBuilder* builder = createInferBuilder(gLogger); IBuilderConfig* config = builder->createBuilderConfig(); // Create model to populate the network, then set the outputs and create an engine ICudaEngine* engine = createEngine(maxBatchSize, builder, config, DataType::kFLOAT); assert(engine != nullptr); // Serialize the engine (*modelStream) = engine->serialize(); // Close everything down engine->destroy(); builder->destroy(); } void doInference(IExecutionContext& context, float* input, float* output, int batchSize) { const ICudaEngine& engine = context.getEngine(); // Pointers to input and output device buffers to pass to engine. // Engine requires exactly IEngine::getNbBindings() number of buffers. 
assert(engine.getNbBindings() == 2); void* buffers[2]; // In order to bind the buffers, we need to know the names of the input and output tensors. // Note that indices are guaranteed to be less than IEngine::getNbBindings() const int inputIndex = engine.getBindingIndex(INPUT_BLOB_NAME); const int outputIndex = engine.getBindingIndex(OUTPUT_BLOB_NAME); // Create GPU buffers on device CHECK(cudaMalloc(&buffers[inputIndex], batchSize * 3 * INPUT_H * INPUT_W * sizeof(float))); CHECK(cudaMalloc(&buffers[outputIndex], batchSize * OUTPUT_SIZE * sizeof(float))); // Create stream cudaStream_t stream; CHECK(cudaStreamCreate(&stream)); // DMA input batch data to device, infer on the batch asynchronously, and DMA output back to host CHECK(cudaMemcpyAsync(buffers[inputIndex], input, batchSize * 3 * INPUT_H * INPUT_W * sizeof(float), cudaMemcpyHostToDevice, stream)); context.enqueue(batchSize, buffers, stream, nullptr); CHECK(cudaMemcpyAsync(output, buffers[outputIndex], batchSize * OUTPUT_SIZE * sizeof(float), cudaMemcpyDeviceToHost, stream)); cudaStreamSynchronize(stream); // Release stream and buffers cudaStreamDestroy(stream); CHECK(cudaFree(buffers[inputIndex])); CHECK(cudaFree(buffers[outputIndex])); } int read_files_in_dir(const char *p_dir_name, std::vector &file_names) { DIR *p_dir = opendir(p_dir_name); if (p_dir == nullptr) { return -1; } struct dirent* p_file = nullptr; while ((p_file = readdir(p_dir)) != nullptr) { if (strcmp(p_file->d_name, ".") != 0 && strcmp(p_file->d_name, "..") != 0) { //std::string cur_file_name(p_dir_name); //cur_file_name += "/"; //cur_file_name += p_file->d_name; std::string cur_file_name(p_file->d_name); file_names.push_back(cur_file_name); } } closedir(p_dir); return 0; } int main(int argc, char** argv) { cudaSetDevice(DEVICE); // create a model using the API directly and serialize it to a stream char *trtModelStream{nullptr}; size_t size{0}; if (argc == 2 && std::string(argv[1]) == "-s") { IHostMemory* modelStream{nullptr}; 
APIToModel(256, &modelStream); assert(modelStream != nullptr); std::ofstream p("arcface-r100.engine", std::ios::binary); if (!p) { std::cerr << "could not open plan output file" << std::endl; return -1; } p.write(reinterpret_cast(modelStream->data()), modelStream->size()); modelStream->destroy(); return 0; } else if (argc == 2 && std::string(argv[1]) == "-d") { std::ifstream file("arcface-r100.engine", std::ios::binary); if (file.good()) { file.seekg(0, file.end); size = file.tellg(); file.seekg(0, file.beg); trtModelStream = new char[size]; assert(trtModelStream); file.read(trtModelStream, size); file.close(); } } else { std::cerr << "arguments not right!" << std::endl; std::cerr << "./arcface-r100 -s // serialize model to plan file" << std::endl; std::cerr << "./arcface-r100 -d // deserialize plan file and run inference" << std::endl; return -1; } // prepare input data --------------------------- static float data[BATCH_SIZE * 3 * INPUT_H * INPUT_W]; //for (int i = 0; i < 3 * INPUT_H * INPUT_W; i++) // data[i] = 1.0; static float prob[BATCH_SIZE * OUTPUT_SIZE]; IRuntime* runtime = createInferRuntime(gLogger); assert(runtime != nullptr); ICudaEngine* engine = runtime->deserializeCudaEngine(trtModelStream, size); assert(engine != nullptr); IExecutionContext* context = engine->createExecutionContext(); assert(context != nullptr); delete[] trtModelStream; cv::Mat img = cv::imread("../joey0.ppm"); for (int i = 0; i < INPUT_H * INPUT_W; i++) { data[i] = ((float)img.at(i)[2] - 127.5) * 0.0078125; data[i + INPUT_H * INPUT_W] = ((float)img.at(i)[1] - 127.5) * 0.0078125; data[i + 2 * INPUT_H * INPUT_W] = ((float)img.at(i)[0] - 127.5) * 0.0078125; } // Run inference auto start = std::chrono::system_clock::now(); doInference(*context, data, prob, BATCH_SIZE); auto end = std::chrono::system_clock::now(); std::cout << std::chrono::duration_cast(end - start).count() << "ms" << std::endl; cv::Mat out(512, 1, CV_32FC1, prob); cv::Mat out_norm; cv::normalize(out, out_norm); img = 
cv::imread("../joey1.ppm"); for (int i = 0; i < INPUT_H * INPUT_W; i++) { data[i] = ((float)img.at(i)[2] - 127.5) * 0.0078125; data[i + INPUT_H * INPUT_W] = ((float)img.at(i)[1] - 127.5) * 0.0078125; data[i + 2 * INPUT_H * INPUT_W] = ((float)img.at(i)[0] - 127.5) * 0.0078125; } // Run inference start = std::chrono::system_clock::now(); doInference(*context, data, prob, BATCH_SIZE); end = std::chrono::system_clock::now(); std::cout << std::chrono::duration_cast(end - start).count() << "ms" << std::endl; cv::Mat out1(1, 512, CV_32FC1, prob); cv::Mat out_norm1; cv::normalize(out1, out_norm1); cv::Mat res = out_norm1 * out_norm; std::cout << "similarity score: " << *(float*)res.data << std::endl; // Destroy the engine context->destroy(); engine->destroy(); runtime->destroy(); //Print histogram of the output distribution //std::cout << "\nOutput:\n\n"; //for (unsigned int i = 0; i < OUTPUT_SIZE; i++) //{ // std::cout << p_out_norm[i] << ", "; // if (i % 10 == 0) std::cout << i / 10 << std::endl; //} //std::cout << std::endl; return 0; } ================================================ FILE: arcface/arcface-r50.cpp ================================================ #include #include #include #include #include #include #include #include #include "NvInfer.h" #include "cuda_runtime_api.h" #include "logging.h" #define CHECK(status) \ do\ {\ auto ret = (status);\ if (ret != 0)\ {\ std::cerr << "Cuda failure: " << ret << std::endl;\ abort();\ }\ } while (0) //#define USE_FP16 // comment out this if want to use FP32 #define DEVICE 0 // GPU id #define BATCH_SIZE 1 // currently, only support BATCH=1 using namespace nvinfer1; // stuff we know about the network and the input/output blobs static const int INPUT_H = 112; static const int INPUT_W = 112; static const int OUTPUT_SIZE = 512; const char* INPUT_BLOB_NAME = "data"; const char* OUTPUT_BLOB_NAME = "prob"; static Logger gLogger; // TensorRT weight files have a simple space delimited format: // [type] [size] std::map 
loadWeights(const std::string file) { std::cout << "Loading weights: " << file << std::endl; std::map weightMap; // Open weights file std::ifstream input(file); assert(input.is_open() && "Unable to load weight file."); // Read number of weight blobs int32_t count; input >> count; assert(count > 0 && "Invalid weight map file."); while (count--) { Weights wt{DataType::kFLOAT, nullptr, 0}; uint32_t size; // Read name and type of blob std::string name; input >> name >> std::dec >> size; wt.type = DataType::kFLOAT; // Load blob uint32_t* val = reinterpret_cast(malloc(sizeof(val) * size)); for (uint32_t x = 0, y = size; x < y; ++x) { input >> std::hex >> val[x]; } wt.values = val; wt.count = size; weightMap[name] = wt; } return weightMap; } IScaleLayer* addBatchNorm2d(INetworkDefinition *network, std::map& weightMap, ITensor& input, std::string lname, float eps) { float *gamma = (float*)weightMap[lname + "_gamma"].values; float *beta = (float*)weightMap[lname + "_beta"].values; float *mean = (float*)weightMap[lname + "_moving_mean"].values; float *var = (float*)weightMap[lname + "_moving_var"].values; int len = weightMap[lname + "_moving_var"].count; float *scval = reinterpret_cast(malloc(sizeof(float) * len)); for (int i = 0; i < len; i++) { scval[i] = gamma[i] / sqrt(var[i] + eps); } Weights scale{DataType::kFLOAT, scval, len}; float *shval = reinterpret_cast(malloc(sizeof(float) * len)); for (int i = 0; i < len; i++) { shval[i] = beta[i] - mean[i] * gamma[i] / sqrt(var[i] + eps); } Weights shift{DataType::kFLOAT, shval, len}; float *pval = reinterpret_cast(malloc(sizeof(float) * len)); for (int i = 0; i < len; i++) { pval[i] = 1.0; } Weights power{DataType::kFLOAT, pval, len}; weightMap[lname + ".scale"] = scale; weightMap[lname + ".shift"] = shift; weightMap[lname + ".power"] = power; IScaleLayer* scale_1 = network->addScale(input, ScaleMode::kCHANNEL, shift, scale, power); assert(scale_1); return scale_1; } ILayer* addPRelu(INetworkDefinition *network, std::map& 
// Builds one residual unit and returns its element-wise sum layer.
//
//   main path: "<lname>_bn1" -> conv1 (3x3, pad 1) -> "<lname>_bn2" -> PReLU
//              -> conv2 (3x3, stride s, pad 1) -> "<lname>_bn3"
//   shortcut:  the input itself when dim_match is true, otherwise
//              conv1sc (1x1, stride s) -> batch norm "<lname>_sc"
//
// All convolutions are created without bias and with num_filters outputs.
ILayer* resUnit(INetworkDefinition *network, std::map<std::string, Weights>& weightMap, ITensor& input, int num_filters, int s, bool dim_match, std::string lname) {
    Weights noBias{DataType::kFLOAT, nullptr, 0};

    // Main path
    auto normIn = addBatchNorm2d(network, weightMap, input, lname + "_bn1", 2e-5);

    IConvolutionLayer* convA = network->addConvolutionNd(*normIn->getOutput(0), num_filters, DimsHW{3, 3}, weightMap[lname + "_conv1_weight"], noBias);
    assert(convA);
    convA->setPaddingNd(DimsHW{1, 1});

    auto normA = addBatchNorm2d(network, weightMap, *convA->getOutput(0), lname + "_bn2", 2e-5);
    auto prelu = addPRelu(network, weightMap, *normA->getOutput(0), lname + "_relu1");

    IConvolutionLayer* convB = network->addConvolutionNd(*prelu->getOutput(0), num_filters, DimsHW{3, 3}, weightMap[lname + "_conv2_weight"], noBias);
    assert(convB);
    convB->setStrideNd(DimsHW{s, s});
    convB->setPaddingNd(DimsHW{1, 1});

    auto normB = addBatchNorm2d(network, weightMap, *convB->getOutput(0), lname + "_bn3", 2e-5);

    // Shortcut path: identity unless a projection is required.
    ITensor* shortcut = &input;
    if (!dim_match) {
        IConvolutionLayer* proj = network->addConvolutionNd(input, num_filters, DimsHW{1, 1}, weightMap[lname + "_conv1sc_weight"], noBias);
        assert(proj);
        proj->setStrideNd(DimsHW{s, s});
        auto projNorm = addBatchNorm2d(network, weightMap, *proj->getOutput(0), lname + "_sc", 2e-5);
        shortcut = projNorm->getOutput(0);
    }

    IElementWiseLayer* sum = network->addElementWise(*shortcut, *normB->getOutput(0), ElementWiseOperation::kSUM);
    assert(sum);
    return sum;
}
*s1u3->getOutput(0), 128, 2, false, "stage2_unit1"); auto s2u2 = resUnit(network, weightMap, *s2u1->getOutput(0), 128, 1, true, "stage2_unit2"); auto s2u3 = resUnit(network, weightMap, *s2u2->getOutput(0), 128, 1, true, "stage2_unit3"); auto s2u4 = resUnit(network, weightMap, *s2u3->getOutput(0), 128, 1, true, "stage2_unit4"); auto s3u1 = resUnit(network, weightMap, *s2u4->getOutput(0), 256, 2, false, "stage3_unit1"); auto s3u2 = resUnit(network, weightMap, *s3u1->getOutput(0), 256, 1, true, "stage3_unit2"); auto s3u3 = resUnit(network, weightMap, *s3u2->getOutput(0), 256, 1, true, "stage3_unit3"); auto s3u4 = resUnit(network, weightMap, *s3u3->getOutput(0), 256, 1, true, "stage3_unit4"); auto s3u5 = resUnit(network, weightMap, *s3u4->getOutput(0), 256, 1, true, "stage3_unit5"); auto s3u6 = resUnit(network, weightMap, *s3u5->getOutput(0), 256, 1, true, "stage3_unit6"); auto s3u7 = resUnit(network, weightMap, *s3u6->getOutput(0), 256, 1, true, "stage3_unit7"); auto s3u8 = resUnit(network, weightMap, *s3u7->getOutput(0), 256, 1, true, "stage3_unit8"); auto s3u9 = resUnit(network, weightMap, *s3u8->getOutput(0), 256, 1, true, "stage3_unit9"); auto s3u10 = resUnit(network, weightMap, *s3u9->getOutput(0), 256, 1, true, "stage3_unit10"); auto s3u11 = resUnit(network, weightMap, *s3u10->getOutput(0), 256, 1, true, "stage3_unit11"); auto s3u12 = resUnit(network, weightMap, *s3u11->getOutput(0), 256, 1, true, "stage3_unit12"); auto s3u13 = resUnit(network, weightMap, *s3u12->getOutput(0), 256, 1, true, "stage3_unit13"); auto s3u14 = resUnit(network, weightMap, *s3u13->getOutput(0), 256, 1, true, "stage3_unit14"); auto s4u1 = resUnit(network, weightMap, *s3u14->getOutput(0), 512, 2, false, "stage4_unit1"); auto s4u2 = resUnit(network, weightMap, *s4u1->getOutput(0), 512, 1, true, "stage4_unit2"); auto s4u3 = resUnit(network, weightMap, *s4u2->getOutput(0), 512, 1, true, "stage4_unit3"); auto bn1 = addBatchNorm2d(network, weightMap, *s4u3->getOutput(0), "bn1", 2e-5); 
IFullyConnectedLayer* fc1 = network->addFullyConnected(*bn1->getOutput(0), 512, weightMap["pre_fc1_weight"], weightMap["pre_fc1_bias"]); assert(fc1); auto bn2 = addBatchNorm2d(network, weightMap, *fc1->getOutput(0), "fc1", 2e-5); bn2->getOutput(0)->setName(OUTPUT_BLOB_NAME); network->markOutput(*bn2->getOutput(0)); // Build engine builder->setMaxBatchSize(maxBatchSize); config->setMaxWorkspaceSize(16 * (1 << 20)); // 16MB #ifdef USE_FP16 config->setFlag(BuilderFlag::kFP16); #endif std::cout << "Building engine, please wait for a while..." << std::endl; ICudaEngine* engine = builder->buildEngineWithConfig(*network, *config); std::cout << "Build engine successfully!" << std::endl; // Don't need the network any more network->destroy(); // Release host memory for (auto& mem : weightMap) { free((void*) (mem.second.values)); } return engine; } void APIToModel(unsigned int maxBatchSize, IHostMemory** modelStream) { // Create builder IBuilder* builder = createInferBuilder(gLogger); IBuilderConfig* config = builder->createBuilderConfig(); // Create model to populate the network, then set the outputs and create an engine ICudaEngine* engine = createEngine(maxBatchSize, builder, config, DataType::kFLOAT); assert(engine != nullptr); // Serialize the engine (*modelStream) = engine->serialize(); // Close everything down engine->destroy(); builder->destroy(); } void doInference(IExecutionContext& context, float* input, float* output, int batchSize) { const ICudaEngine& engine = context.getEngine(); // Pointers to input and output device buffers to pass to engine. // Engine requires exactly IEngine::getNbBindings() number of buffers. assert(engine.getNbBindings() == 2); void* buffers[2]; // In order to bind the buffers, we need to know the names of the input and output tensors. 
// Appends the name of every entry in directory p_dir_name, except "." and
// "..", to file_names. Only the bare entry name is stored; the directory is
// not prepended.
//
// Returns 0 on success, or -1 when the directory cannot be opened, in which
// case file_names is left untouched.
int read_files_in_dir(const char *p_dir_name, std::vector<std::string> &file_names) {
    DIR* dir = opendir(p_dir_name);
    if (dir == nullptr) {
        return -1;
    }

    for (struct dirent* entry = readdir(dir); entry != nullptr; entry = readdir(dir)) {
        const char* entryName = entry->d_name;
        // Skip the self and parent links.
        if (strcmp(entryName, ".") == 0 || strcmp(entryName, "..") == 0) {
            continue;
        }
        file_names.push_back(std::string(entryName));
    }

    closedir(dir);
    return 0;
}
open plan output file" << std::endl; return -1; } p.write(reinterpret_cast(modelStream->data()), modelStream->size()); modelStream->destroy(); return 0; } else if (argc == 2 && std::string(argv[1]) == "-d") { std::ifstream file("arcface-r50.engine", std::ios::binary); if (file.good()) { file.seekg(0, file.end); size = file.tellg(); file.seekg(0, file.beg); trtModelStream = new char[size]; assert(trtModelStream); file.read(trtModelStream, size); file.close(); } } else { std::cerr << "arguments not right!" << std::endl; std::cerr << "./arcface-r50 -s // serialize model to plan file" << std::endl; std::cerr << "./arcface-r50 -d // deserialize plan file and run inference" << std::endl; return -1; } // prepare input data --------------------------- static float data[BATCH_SIZE * 3 * INPUT_H * INPUT_W]; //for (int i = 0; i < 3 * INPUT_H * INPUT_W; i++) // data[i] = 1.0; static float prob[BATCH_SIZE * OUTPUT_SIZE]; IRuntime* runtime = createInferRuntime(gLogger); assert(runtime != nullptr); ICudaEngine* engine = runtime->deserializeCudaEngine(trtModelStream, size); assert(engine != nullptr); IExecutionContext* context = engine->createExecutionContext(); assert(context != nullptr); delete[] trtModelStream; cv::Mat img = cv::imread("../joey0.ppm"); for (int i = 0; i < INPUT_H * INPUT_W; i++) { data[i] = ((float)img.at(i)[2] - 127.5) * 0.0078125; data[i + INPUT_H * INPUT_W] = ((float)img.at(i)[1] - 127.5) * 0.0078125; data[i + 2 * INPUT_H * INPUT_W] = ((float)img.at(i)[0] - 127.5) * 0.0078125; } // Run inference auto start = std::chrono::system_clock::now(); doInference(*context, data, prob, BATCH_SIZE); auto end = std::chrono::system_clock::now(); std::cout << std::chrono::duration_cast(end - start).count() << "ms" << std::endl; cv::Mat out(512, 1, CV_32FC1, prob); cv::Mat out_norm; cv::normalize(out, out_norm); img = cv::imread("../joey1.ppm"); for (int i = 0; i < INPUT_H * INPUT_W; i++) { data[i] = ((float)img.at(i)[2] - 127.5) * 0.0078125; data[i + INPUT_H * INPUT_W] = 
"""Export the parameters of an ArcFace model to a text ``.wts`` file.

Output layout (``arcface-r100.wts``):

* first line: the total number of tensors;
* one line per tensor: ``<name> <count>`` followed by ``count`` float32
  values, each written as the hex of its big-endian bytes and preceded by a
  single space.
"""
import struct
import sys
import argparse
import face_model
import cv2
import numpy as np

parser = argparse.ArgumentParser(description='face model test')
# general
parser.add_argument('--image-size', default='112,112', help='')
parser.add_argument('--model', default='model-r100-ii/model,0', help='path to load model.')
parser.add_argument('--ga-model', default='', help='path to load model.')
parser.add_argument('--gpu', default=0, type=int, help='gpu id')
parser.add_argument('--det', default=0, type=int, help='mtcnn option, 1 means using R+O, 0 means detect from begining')
parser.add_argument('--flip', default=0, type=int, help='whether do lr flip aug')
parser.add_argument('--threshold', default=1.24, type=float, help='ver dist threshold')
args = parser.parse_args()

model = face_model.FaceModel(args)

# Fetch the parameters once; entries 0 and 1 are the two parameter dicts that
# are written out, in that order.
params = model.model.get_params()
param_groups = (params[0], params[1])

# The context manager guarantees the file is flushed and closed, including
# when an exception interrupts the export.
with open('arcface-r100.wts', 'w') as f:
    f.write('{}\n'.format(sum(len(group) for group in param_groups)))
    for group in param_groups:
        for k, v in group.items():
            vr = v.reshape(-1).asnumpy()
            f.write('{} {} '.format(k, len(vr)))
            for vv in vr:
                f.write(' ')
                f.write(struct.pack('>f', float(vv)).hex())
            f.write('\n')
//!
//! \class LogStreamConsumerBuffer
//! \brief String buffer that, on every sync, forwards its contents to an output stream, preceded by a
//!        "[MM/DD/YYYY-HH:MM:SS] " timestamp and a fixed prefix.
//!
//! When logging is disabled the buffered text is discarded instead of forwarded.
//!
class LogStreamConsumerBuffer : public std::stringbuf
{
public:
    //! \param stream    Stream that receives the formatted messages (held by reference)
    //! \param prefix    Text inserted between the timestamp and each message
    //! \param shouldLog Whether messages are forwarded to \p stream
    LogStreamConsumerBuffer(std::ostream& stream, const std::string& prefix, bool shouldLog)
        : mOutput(stream)
        , mPrefix(prefix)
        , mShouldLog(shouldLog)
    {
    }

    //! Copies the stream reference, prefix and logging flag from \p other. Previously only the stream was
    //! taken over, which left the prefix empty and the flag uninitialized. Text still pending in \p other
    //! is not transferred.
    LogStreamConsumerBuffer(LogStreamConsumerBuffer&& other)
        : mOutput(other.mOutput)
        , mPrefix(other.mPrefix)
        , mShouldLog(other.mShouldLog)
    {
    }

    ~LogStreamConsumerBuffer()
    {
        // std::streambuf::pbase() gives a pointer to the beginning of the buffered part of the output sequence
        // std::streambuf::pptr() gives a pointer to the current position of the output sequence
        // if the pointer to the beginning is not equal to the pointer to the current position,
        // call putOutput() to log the output to the stream
        if (pbase() != pptr())
        {
            putOutput();
        }
    }

    // synchronizes the stream buffer and returns 0 on success
    // synchronizing the stream buffer consists of inserting the buffer contents into the stream,
    // resetting the buffer and flushing the stream
    int sync() override
    {
        putOutput();
        return 0;
    }

    //! Emits the buffered text (when logging is enabled) and then empties the buffer.
    void putOutput()
    {
        if (mShouldLog)
        {
            // prepend timestamp; it is formatted into a local array so that no formatting state
            // (fill character, width) is left behind on the output stream, and it is written to the
            // same stream as the message rather than unconditionally to std::cout
            std::time_t timestamp = std::time(nullptr);
            std::tm* tm_local = std::localtime(&timestamp);
            char stamp[32];
            if (tm_local != nullptr && std::strftime(stamp, sizeof(stamp), "[%m/%d/%Y-%H:%M:%S] ", tm_local) > 0)
            {
                mOutput << stamp;
            }
            // std::stringbuf::str() gets the string contents of the buffer
            // insert the buffer contents pre-appended by the appropriate prefix into the stream
            mOutput << mPrefix << str();
            // flush the stream
            mOutput.flush();
        }
        // set the buffer to empty, also when nothing was logged, so that suppressed text neither
        // accumulates nor shows up later once logging is enabled
        str("");
    }

    void setShouldLog(bool shouldLog)
    {
        mShouldLog = shouldLog;
    }

private:
    std::ostream& mOutput; // destination stream
    std::string mPrefix;   // inserted between timestamp and message
    bool mShouldLog;       // false: buffered text is dropped
};
LogStreamConsumer(Severity reportableSeverity, Severity severity) : LogStreamConsumerBase(severityOstream(severity), severityPrefix(severity), severity <= reportableSeverity) , std::ostream(&mBuffer) // links the stream buffer with the stream , mShouldLog(severity <= reportableSeverity) , mSeverity(severity) { } LogStreamConsumer(LogStreamConsumer&& other) : LogStreamConsumerBase(severityOstream(other.mSeverity), severityPrefix(other.mSeverity), other.mShouldLog) , std::ostream(&mBuffer) // links the stream buffer with the stream , mShouldLog(other.mShouldLog) , mSeverity(other.mSeverity) { } void setReportableSeverity(Severity reportableSeverity) { mShouldLog = mSeverity <= reportableSeverity; mBuffer.setShouldLog(mShouldLog); } private: static std::ostream& severityOstream(Severity severity) { return severity >= Severity::kINFO ? std::cout : std::cerr; } static std::string severityPrefix(Severity severity) { switch (severity) { case Severity::kINTERNAL_ERROR: return "[F] "; case Severity::kERROR: return "[E] "; case Severity::kWARNING: return "[W] "; case Severity::kINFO: return "[I] "; case Severity::kVERBOSE: return "[V] "; default: assert(0); return ""; } } bool mShouldLog; Severity mSeverity; }; //! \class Logger //! //! \brief Class which manages logging of TensorRT tools and samples //! //! \details This class provides a common interface for TensorRT tools and samples to log information to the console, //! and supports logging two types of messages: //! //! - Debugging messages with an associated severity (info, warning, error, or internal error/fatal) //! - Test pass/fail messages //! //! The advantage of having all samples use this class for logging as opposed to emitting directly to stdout/stderr is //! that the logic for controlling the verbosity and formatting of sample output is centralized in one location. //! //! In the future, this class could be extended to support dumping test results to a file in some standard format //! 
(for example, JUnit XML), and providing additional metadata (e.g. timing the duration of a test run). //! //! TODO: For backwards compatibility with existing samples, this class inherits directly from the nvinfer1::ILogger //! interface, which is problematic since there isn't a clean separation between messages coming from the TensorRT //! library and messages coming from the sample. //! //! In the future (once all samples are updated to use Logger::getTRTLogger() to access the ILogger) we can refactor the //! class to eliminate the inheritance and instead make the nvinfer1::ILogger implementation a member of the Logger //! object. class Logger : public nvinfer1::ILogger { public: Logger(Severity severity = Severity::kWARNING) : mReportableSeverity(severity) { } //! //! \enum TestResult //! \brief Represents the state of a given test //! enum class TestResult { kRUNNING, //!< The test is running kPASSED, //!< The test passed kFAILED, //!< The test failed kWAIVED //!< The test was waived }; //! //! \brief Forward-compatible method for retrieving the nvinfer::ILogger associated with this Logger //! \return The nvinfer1::ILogger associated with this Logger //! //! TODO Once all samples are updated to use this method to register the logger with TensorRT, //! we can eliminate the inheritance of Logger from ILogger //! nvinfer1::ILogger& getTRTLogger() { return *this; } //! //! \brief Implementation of the nvinfer1::ILogger::log() virtual method //! //! Note samples should not be calling this function directly; it will eventually go away once we eliminate the //! inheritance from nvinfer1::ILogger //! void log(Severity severity, const char* msg) TRT_NOEXCEPT override { LogStreamConsumer(mReportableSeverity, severity) << "[TRT] " << std::string(msg) << std::endl; } //! //! \brief Method for controlling the verbosity of logging output //! //! \param severity The logger will only emit messages that have severity of this level or higher. //! 
void setReportableSeverity(Severity severity) { mReportableSeverity = severity; } //! //! \brief Opaque handle that holds logging information for a particular test //! //! This object is an opaque handle to information used by the Logger to print test results. //! The sample must call Logger::defineTest() in order to obtain a TestAtom that can be used //! with Logger::reportTest{Start,End}(). //! class TestAtom { public: TestAtom(TestAtom&&) = default; private: friend class Logger; TestAtom(bool started, const std::string& name, const std::string& cmdline) : mStarted(started) , mName(name) , mCmdline(cmdline) { } bool mStarted; std::string mName; std::string mCmdline; }; //! //! \brief Define a test for logging //! //! \param[in] name The name of the test. This should be a string starting with //! "TensorRT" and containing dot-separated strings containing //! the characters [A-Za-z0-9_]. //! For example, "TensorRT.sample_googlenet" //! \param[in] cmdline The command line used to reproduce the test // //! \return a TestAtom that can be used in Logger::reportTest{Start,End}(). //! static TestAtom defineTest(const std::string& name, const std::string& cmdline) { return TestAtom(false, name, cmdline); } //! //! \brief A convenience overloaded version of defineTest() that accepts an array of command-line arguments //! as input //! //! \param[in] name The name of the test //! \param[in] argc The number of command-line arguments //! \param[in] argv The array of command-line arguments (given as C strings) //! //! \return a TestAtom that can be used in Logger::reportTest{Start,End}(). static TestAtom defineTest(const std::string& name, int argc, char const* const* argv) { auto cmdline = genCmdlineString(argc, argv); return defineTest(name, cmdline); } //! //! \brief Report that a test has started. //! //! \pre reportTestStart() has not been called yet for the given testAtom //! //! \param[in] testAtom The handle to the test that has started //! 
static void reportTestStart(TestAtom& testAtom) { reportTestResult(testAtom, TestResult::kRUNNING); assert(!testAtom.mStarted); testAtom.mStarted = true; } //! //! \brief Report that a test has ended. //! //! \pre reportTestStart() has been called for the given testAtom //! //! \param[in] testAtom The handle to the test that has ended //! \param[in] result The result of the test. Should be one of TestResult::kPASSED, //! TestResult::kFAILED, TestResult::kWAIVED //! static void reportTestEnd(const TestAtom& testAtom, TestResult result) { assert(result != TestResult::kRUNNING); assert(testAtom.mStarted); reportTestResult(testAtom, result); } static int reportPass(const TestAtom& testAtom) { reportTestEnd(testAtom, TestResult::kPASSED); return EXIT_SUCCESS; } static int reportFail(const TestAtom& testAtom) { reportTestEnd(testAtom, TestResult::kFAILED); return EXIT_FAILURE; } static int reportWaive(const TestAtom& testAtom) { reportTestEnd(testAtom, TestResult::kWAIVED); return EXIT_SUCCESS; } static int reportTest(const TestAtom& testAtom, bool pass) { return pass ? reportPass(testAtom) : reportFail(testAtom); } Severity getReportableSeverity() const { return mReportableSeverity; } private: //! //! \brief returns an appropriate string for prefixing a log message with the given severity //! static const char* severityPrefix(Severity severity) { switch (severity) { case Severity::kINTERNAL_ERROR: return "[F] "; case Severity::kERROR: return "[E] "; case Severity::kWARNING: return "[W] "; case Severity::kINFO: return "[I] "; case Severity::kVERBOSE: return "[V] "; default: assert(0); return ""; } } //! //! \brief returns an appropriate string for prefixing a test result message with the given result //! 
static const char* testResultString(TestResult result) { switch (result) { case TestResult::kRUNNING: return "RUNNING"; case TestResult::kPASSED: return "PASSED"; case TestResult::kFAILED: return "FAILED"; case TestResult::kWAIVED: return "WAIVED"; default: assert(0); return ""; } } //! //! \brief returns an appropriate output stream (cout or cerr) to use with the given severity //! static std::ostream& severityOstream(Severity severity) { return severity >= Severity::kINFO ? std::cout : std::cerr; } //! //! \brief method that implements logging test results //! static void reportTestResult(const TestAtom& testAtom, TestResult result) { severityOstream(Severity::kINFO) << "&&&& " << testResultString(result) << " " << testAtom.mName << " # " << testAtom.mCmdline << std::endl; } //! //! \brief generate a command line string from the given (argc, argv) values //! static std::string genCmdlineString(int argc, char const* const* argv) { std::stringstream ss; for (int i = 0; i < argc; i++) { if (i > 0) ss << " "; ss << argv[i]; } return ss.str(); } Severity mReportableSeverity; }; namespace { //! //! \brief produces a LogStreamConsumer object that can be used to log messages of severity kVERBOSE //! //! Example usage: //! //! LOG_VERBOSE(logger) << "hello world" << std::endl; //! inline LogStreamConsumer LOG_VERBOSE(const Logger& logger) { return LogStreamConsumer(logger.getReportableSeverity(), Severity::kVERBOSE); } //! //! \brief produces a LogStreamConsumer object that can be used to log messages of severity kINFO //! //! Example usage: //! //! LOG_INFO(logger) << "hello world" << std::endl; //! inline LogStreamConsumer LOG_INFO(const Logger& logger) { return LogStreamConsumer(logger.getReportableSeverity(), Severity::kINFO); } //! //! \brief produces a LogStreamConsumer object that can be used to log messages of severity kWARNING //! //! Example usage: //! //! LOG_WARN(logger) << "hello world" << std::endl; //! 
inline LogStreamConsumer LOG_WARN(const Logger& logger) { return LogStreamConsumer(logger.getReportableSeverity(), Severity::kWARNING); } //! //! \brief produces a LogStreamConsumer object that can be used to log messages of severity kERROR //! //! Example usage: //! //! LOG_ERROR(logger) << "hello world" << std::endl; //! inline LogStreamConsumer LOG_ERROR(const Logger& logger) { return LogStreamConsumer(logger.getReportableSeverity(), Severity::kERROR); } //! //! \brief produces a LogStreamConsumer object that can be used to log messages of severity kINTERNAL_ERROR // ("fatal" severity) //! //! Example usage: //! //! LOG_FATAL(logger) << "hello world" << std::endl; //! inline LogStreamConsumer LOG_FATAL(const Logger& logger) { return LogStreamConsumer(logger.getReportableSeverity(), Severity::kINTERNAL_ERROR); } } // anonymous namespace #endif // TENSORRT_LOGGING_H ================================================ FILE: arcface/macros.h ================================================ #ifndef __MACROS_H #define __MACROS_H #if NV_TENSORRT_MAJOR >= 8 #define TRT_NOEXCEPT noexcept #define TRT_CONST_ENQUEUE const #else #define TRT_NOEXCEPT #define TRT_CONST_ENQUEUE #endif #endif // __MACROS_H ================================================ FILE: arcface/prelu.cu ================================================ #include #include #include #include #include "prelu.h" namespace nvinfer1 { PReluPlugin::PReluPlugin(const std::vector& gamma) : gamma_(gamma) { } PReluPlugin::~PReluPlugin() { } // create the plugin at runtime from a byte stream PReluPlugin::PReluPlugin(const void* data, size_t length) { char *p = (char*)data; input_size_ = reinterpret_cast(p)[0]; p += sizeof(int); gamma_.assign((float*)p, (float*)p + (length - sizeof(int)) / sizeof(float)); } void PReluPlugin::serialize(void* buffer) const TRT_NOEXCEPT { *reinterpret_cast(buffer) = input_size_; char *p = reinterpret_cast(buffer); p += sizeof(int); memcpy(p, gamma_.data(), gamma_.size() * sizeof(float)); } 
size_t PReluPlugin::getSerializationSize() const TRT_NOEXCEPT { return sizeof(input_size_) + gamma_.size() * sizeof(float); } int PReluPlugin::initialize() TRT_NOEXCEPT { return 0; } Dims PReluPlugin::getOutputDimensions(int index, const Dims* inputs, int nbInputDims) TRT_NOEXCEPT { assert(nbInputDims == 1); assert(index == 0); input_size_ = inputs[0].d[0] * inputs[0].d[1] * inputs[0].d[2]; // Output dimensions return Dims3(inputs[0].d[0], inputs[0].d[1], inputs[0].d[2]); } // Set plugin namespace void PReluPlugin::setPluginNamespace(const char* pluginNamespace) TRT_NOEXCEPT { mPluginNamespace = pluginNamespace; } const char* PReluPlugin::getPluginNamespace() const TRT_NOEXCEPT { return mPluginNamespace; } // Return the DataType of the plugin output at the requested index DataType PReluPlugin::getOutputDataType(int index, const nvinfer1::DataType* inputTypes, int nbInputs) const TRT_NOEXCEPT { return DataType::kFLOAT; } // Return true if output tensor is broadcast across a batch. bool PReluPlugin::isOutputBroadcastAcrossBatch(int outputIndex, const bool* inputIsBroadcasted, int nbInputs) const TRT_NOEXCEPT { return false; } // Return true if plugin can use input that is broadcast across batch without replication. bool PReluPlugin::canBroadcastInputAcrossBatch(int inputIndex) const TRT_NOEXCEPT { return false; } void PReluPlugin::configurePlugin(const PluginTensorDesc* in, int nbInput, const PluginTensorDesc* out, int nbOutput) TRT_NOEXCEPT { } // Attach the plugin object to an execution context and grant the plugin the access to some context resource. void PReluPlugin::attachToContext(cudnnContext* cudnnContext, cublasContext* cublasContext, IGpuAllocator* gpuAllocator) TRT_NOEXCEPT { } // Detach the plugin object from its execution context. 
void PReluPlugin::detachFromContext() TRT_NOEXCEPT {} const char* PReluPlugin::getPluginType() const TRT_NOEXCEPT { return "PRelu_TRT"; } const char* PReluPlugin::getPluginVersion() const TRT_NOEXCEPT { return "1"; } void PReluPlugin::destroy() TRT_NOEXCEPT { delete this; } // Clone the plugin IPluginV2IOExt* PReluPlugin::clone() const TRT_NOEXCEPT { PReluPlugin *p = new PReluPlugin(gamma_); p->input_size_ = input_size_; p->setPluginNamespace(mPluginNamespace); return p; } __global__ void prelu_kernel(const float *input, float *output, int num_elem, int input_size, int fm_size, const float* gamma) { int idx = threadIdx.x + blockDim.x * blockIdx.x; if (idx >= num_elem) return; if (input[idx] >= 0.0f) { output[idx] = input[idx]; return; } int c = (idx % input_size) / fm_size; output[idx] = input[idx] * gamma[c]; } void PReluPlugin::forwardGpu(const float *const * inputs, float* output, cudaStream_t stream, int batchSize) { int block_size = thread_count_; int grid_size = (input_size_ * batchSize + block_size - 1) / block_size; void *dev_gamma; assert(cudaMalloc(&dev_gamma, sizeof(float) * gamma_.size()) == cudaSuccess); assert(cudaMemcpy(dev_gamma, gamma_.data(), sizeof(float) * gamma_.size(), cudaMemcpyHostToDevice) == cudaSuccess); prelu_kernel<<>>(inputs[0], output, input_size_ * batchSize, input_size_, input_size_ / gamma_.size(), (const float*)dev_gamma); assert(cudaFree(dev_gamma) == cudaSuccess); } int PReluPlugin::enqueue(int batchSize, const void*const * inputs, void* TRT_CONST_ENQUEUE* outputs, void* workspace, cudaStream_t stream) TRT_NOEXCEPT { //assert(batchSize == 1); //GPU //CUDA_CHECK(cudaStreamSynchronize(stream)); forwardGpu((const float *const *)inputs, (float*)outputs[0], stream, batchSize); return 0; } PluginFieldCollection PReluPluginCreator::mFC{}; std::vector PReluPluginCreator::mPluginAttributes; PReluPluginCreator::PReluPluginCreator() { mPluginAttributes.emplace_back(PluginField("gamma", nullptr, PluginFieldType::kFLOAT32, 1)); mFC.nbFields 
= mPluginAttributes.size(); mFC.fields = mPluginAttributes.data(); } const char* PReluPluginCreator::getPluginName() const TRT_NOEXCEPT { return "PRelu_TRT"; } const char* PReluPluginCreator::getPluginVersion() const TRT_NOEXCEPT { return "1"; } const PluginFieldCollection* PReluPluginCreator::getFieldNames() TRT_NOEXCEPT { return &mFC; } IPluginV2IOExt* PReluPluginCreator::createPlugin(const char* name, const PluginFieldCollection* fc) TRT_NOEXCEPT { std::vector gamma; const PluginField* fields = fc->fields; for (int i = 0; i < fc->nbFields; ++i) { const char* attrName = fields[i].name; if (!strcmp(attrName, "gamma")) { assert(fields[i].type == PluginFieldType::kFLOAT32); int size = fields[i].length; gamma.reserve(size); const auto* w = static_cast(fields[i].data); for (int j = 0; j < size; j++) { gamma.push_back(*w); w++; } } } PReluPlugin* obj = new PReluPlugin(gamma); obj->setPluginNamespace(mNamespace.c_str()); return obj; } IPluginV2IOExt* PReluPluginCreator::deserializePlugin(const char* name, const void* serialData, size_t serialLength) TRT_NOEXCEPT { // This object will be deleted when the network is destroyed, which will // call PReluPlugin::destroy() PReluPlugin* obj = new PReluPlugin(serialData, serialLength); obj->setPluginNamespace(mNamespace.c_str()); return obj; } } ================================================ FILE: arcface/prelu.h ================================================ #ifndef _PRELU_PLUGIN_H #define _PRELU_PLUGIN_H #include #include #include "NvInfer.h" #include "macros.h" namespace nvinfer1 { class PReluPlugin: public IPluginV2IOExt { public: PReluPlugin(const std::vector& gamma); PReluPlugin(const void* data, size_t length); ~PReluPlugin(); int getNbOutputs() const TRT_NOEXCEPT override { return 1; } Dims getOutputDimensions(int index, const Dims* inputs, int nbInputDims) TRT_NOEXCEPT override; int initialize() TRT_NOEXCEPT override; virtual void terminate() TRT_NOEXCEPT override {}; virtual size_t getWorkspaceSize(int 
maxBatchSize) const TRT_NOEXCEPT override { return 0;} virtual int enqueue(int batchSize, const void*const * inputs, void*TRT_CONST_ENQUEUE* outputs, void* workspace, cudaStream_t stream) TRT_NOEXCEPT override; virtual size_t getSerializationSize() const TRT_NOEXCEPT override; virtual void serialize(void* buffer) const TRT_NOEXCEPT override; bool supportsFormatCombination(int pos, const PluginTensorDesc* inOut, int nbInputs, int nbOutputs) const TRT_NOEXCEPT override { return inOut[pos].format == TensorFormat::kLINEAR && inOut[pos].type == DataType::kFLOAT; } const char* getPluginType() const TRT_NOEXCEPT override; const char* getPluginVersion() const TRT_NOEXCEPT override; void destroy() TRT_NOEXCEPT override; IPluginV2IOExt* clone() const TRT_NOEXCEPT override; void setPluginNamespace(const char* pluginNamespace) TRT_NOEXCEPT override; const char* getPluginNamespace() const TRT_NOEXCEPT override; DataType getOutputDataType(int index, const nvinfer1::DataType* inputTypes, int nbInputs) const TRT_NOEXCEPT override; bool isOutputBroadcastAcrossBatch(int outputIndex, const bool* inputIsBroadcasted, int nbInputs) const TRT_NOEXCEPT override; bool canBroadcastInputAcrossBatch(int inputIndex) const TRT_NOEXCEPT override; void attachToContext( cudnnContext* cudnnContext, cublasContext* cublasContext, IGpuAllocator* gpuAllocator) TRT_NOEXCEPT override; void configurePlugin(const PluginTensorDesc* in, int nbInput, const PluginTensorDesc* out, int nbOutput) TRT_NOEXCEPT override; void detachFromContext() TRT_NOEXCEPT override; int input_size_; private: void forwardGpu(const float *const * inputs, float* output, cudaStream_t stream, int batchSize = 1); int thread_count_ = 256; std::vector gamma_; const char* mPluginNamespace; }; class PReluPluginCreator : public IPluginCreator { public: PReluPluginCreator(); ~PReluPluginCreator() override = default; const char* getPluginName() const TRT_NOEXCEPT override; const char* getPluginVersion() const TRT_NOEXCEPT override; const 
PluginFieldCollection* getFieldNames() TRT_NOEXCEPT override; IPluginV2IOExt* createPlugin(const char* name, const PluginFieldCollection* fc) TRT_NOEXCEPT override; IPluginV2IOExt* deserializePlugin(const char* name, const void* serialData, size_t serialLength) TRT_NOEXCEPT override; void setPluginNamespace(const char* libNamespace) TRT_NOEXCEPT override { mNamespace = libNamespace; } const char* getPluginNamespace() const TRT_NOEXCEPT override { return mNamespace.c_str(); } private: std::string mNamespace; static PluginFieldCollection mFC; static std::vector mPluginAttributes; }; }; #endif ================================================ FILE: centernet/README.md ================================================ # CenterNet This is the trt implementation of detection model [ctdet_coco_dla_2x](https://drive.google.com/open?id=1pl_-ael8wERdUREEnaIfqOV_VF2bEVRT) from [xingyizhou/CenterNet](https://github.com/xingyizhou/CenterNet) official work. ## How to Run 1. Follow [NVIDIA/TensorRT](https://github.com/NVIDIA/TensorRT) tutorial to build TensorRT7 2. Copy folder `dcnv2Plugin` to `TensorRT/plugin` and edit `InferPlugin.cpp` and `CMakeLists.txt` 3. Rebuild to install custom plugin 4. Use `tensorrt-7.2.3.4-cp36-none-linux_x86_64.whl` in TensorRT OSS to update your python-tensorrt 5. Run `python centernet.py -m ${PTH_PATH} -s` to create trt engine ## Sample ``` // Download ctdet_coco_dla_2x.pth and transfer it into trt engine first // Download the test img from https://raw.githubusercontent.com/tensorflow/models/master/research/deeplab/g3doc/img/image2.jpg or choose your own one cd sample python test.py ${ENGINE_PATH} ${IMG_PATH} ``` ![trt_out](https://user-images.githubusercontent.com/47047345/119128637-7a878900-ba68-11eb-91ff-5dcc10f01b77.jpg) ## TODO Integrate the post process with trt engine to make it more easier to use. 
================================================ FILE: centernet/centernet.py ================================================ import numpy as np import tensorrt as trt import torch from sample import common import argparse import time # You can set the logger severity higher to suppress messages (or lower to display more messages). TRT_LOGGER = trt.Logger(trt.Logger.WARNING) trt.init_libnvinfer_plugins(TRT_LOGGER, '') PLUGIN_CREATORS = trt.get_plugin_registry().plugin_creator_list for plugin_creator in PLUGIN_CREATORS: if plugin_creator.name == 'DCNv2_TRT': dcnCreator = plugin_creator class ModelData(object): INPUT_NAME = "data" INPUT_SHAPE = (3, 512, 512) OUTPUT_NAME = "prob" DTYPE = trt.float16 class Centernet_dla34(object): def __init__(self, weights) -> None: super().__init__() self.weights = weights self.levels = [1, 1, 1, 2, 2, 1] self.channels = [16, 32, 64, 128, 256, 512] self.down_ratio = 4 self.last_level = 5 self.engine = self.build_engine() def add_batchnorm_2d(self, input_tensor, parent): gamma = self.weights[parent + '.weight'].numpy() beta = self.weights[parent + '.bias'].numpy() mean = self.weights[parent + '.running_mean'].numpy() var = self.weights[parent + '.running_var'].numpy() eps = 1e-5 scale = gamma / np.sqrt(var + eps) shift = beta - mean * gamma / np.sqrt(var + eps) power = np.ones_like(scale) return self.network.add_scale(input=input_tensor.get_output(0), mode=trt.ScaleMode.CHANNEL, shift=shift, scale=scale, power=power) def add_basic_block(self, input_tensor, out_channels, residual=None, stride=1, dilation=1, parent=''): conv1_w = self.weights[parent + '.conv1.weight'].numpy() conv1 = self.network.add_convolution(input=input_tensor.get_output( 0), num_output_maps=out_channels, kernel_shape=(3, 3), kernel=conv1_w) conv1.stride = (stride, stride) conv1.padding = (dilation, dilation) conv1.dilation = (dilation, dilation) bn1 = self.add_batchnorm_2d(conv1, parent + '.bn1') ac1 = self.network.add_activation( input=bn1.get_output(0), 
type=trt.ActivationType.RELU) conv2_w = self.weights[parent + '.conv2.weight'].numpy() conv2 = self.network.add_convolution(input=ac1.get_output( 0), num_output_maps=out_channels, kernel_shape=(3, 3), kernel=conv2_w) conv2.padding = (dilation, dilation) conv2.dilation = (dilation, dilation) out = self.add_batchnorm_2d(conv2, parent + '.bn2') if residual is None: out = self.network.add_elementwise(input_tensor.get_output( 0), out.get_output(0), trt.ElementWiseOperation.SUM) else: out = self.network.add_elementwise(residual.get_output( 0), out.get_output(0), trt.ElementWiseOperation.SUM) return self.network.add_activation(input=out.get_output(0), type=trt.ActivationType.RELU) def add_level(self, input_tensor, out_channels, stride=1, dilation=1, parent=''): conv1_w = self.weights[parent + '.0.weight'].numpy() conv1 = self.network.add_convolution(input=input_tensor.get_output( 0), num_output_maps=out_channels, kernel_shape=(3, 3), kernel=conv1_w) conv1.stride = (stride, stride) conv1.padding = (dilation, dilation) conv1.dilation = (dilation, dilation) bn1 = self.add_batchnorm_2d(conv1, parent + '.1') ac1 = self.network.add_activation( input=bn1.get_output(0), type=trt.ActivationType.RELU) return ac1 def add_root(self, input_tensors: list, out_channels, kernel_size=1, residual=False, parent=''): ct = self.network.add_concatenation( [x.get_output(0) for x in input_tensors]) conv_w = self.weights[parent + '.conv.weight'].numpy() conv = self.network.add_convolution(input=ct.get_output( 0), num_output_maps=out_channels, kernel_shape=(1, 1), kernel=conv_w) conv.padding = ((kernel_size - 1) // 2, (kernel_size - 1) // 2) bn1 = self.add_batchnorm_2d(conv, parent + '.bn') out = self.network.add_activation( input=bn1.get_output(0), type=trt.ActivationType.RELU) if residual: out = self.network.add_elementwise(input_tensors[0].get_output( 0), out.get_output(0), trt.ElementWiseOperation.SUM) return self.network.add_activation(input=out.get_output(0), type=trt.ActivationType.RELU) 
def add_tree(self, input_tensor, level, out_channels, residual=None, children=None, stride=1, level_root=False, parent=''): children = [] if children is None else children if stride > 1: bottom = self.network.add_pooling(input_tensor.get_output( 0), trt.PoolingType.MAX, (stride, stride)) bottom.stride = (stride, stride) else: bottom = input_tensor if input_tensor.get_output(0).shape[0] != out_channels: project_conv1_w = self.weights[parent + '.project.0.weight'].numpy() project_conv1 = self.network.add_convolution(input=bottom.get_output( 0), num_output_maps=out_channels, kernel_shape=(1, 1), kernel=project_conv1_w) residual = self.add_batchnorm_2d( project_conv1, parent + '.project.1') else: residual = bottom if level_root: children.append(bottom) if level == 1: tree1 = self.add_basic_block( input_tensor, out_channels, residual, stride, parent=parent+'.tree1') tree2 = self.add_basic_block( tree1, out_channels, parent=parent+'.tree2') return self.add_root([tree2, tree1]+children, out_channels, parent=parent+'.root') else: tree1 = self.add_tree(input_tensor, level-1, out_channels, residual, stride=stride, parent=parent+'.tree1') children.append(tree1) return self.add_tree(tree1, level-1, out_channels, children=children, parent=parent+'.tree2') def add_base(self, input_tensor, parent): base_conv1_w = self.weights[parent+'.base_layer.0.weight'].numpy() base_conv1 = self.network.add_convolution( input=input_tensor, num_output_maps=self.channels[0], kernel_shape=(7, 7), kernel=base_conv1_w) base_conv1.padding = (3, 3) base_bn1 = self.add_batchnorm_2d(base_conv1, parent+'.base_layer.1') base_ac1 = self.network.add_activation( input=base_bn1.get_output(0), type=trt.ActivationType.RELU) level0 = self.add_level( base_ac1, self.channels[0], parent=parent+'.level0') level1 = self.add_level( level0, self.channels[1], 2, parent=parent+'.level1') level2 = self.add_tree( level1, self.levels[2], self.channels[2], stride=2, level_root=False, parent=parent+'.level2') level3 = 
self.add_tree( level2, self.levels[3], self.channels[3], stride=2, level_root=True, parent=parent+'.level3') level4 = self.add_tree( level3, self.levels[4], self.channels[4], stride=2, level_root=True, parent=parent+'.level4') level5 = self.add_tree( level4, self.levels[5], self.channels[5], stride=2, level_root=True, parent=parent+'.level5') return [level0, level1, level2, level3, level4, level5] def add_deform_conv(self, input_tensor, out_channels, kernel=3, stride=1, padding=1, dilation=1, deformable_group=1, parent=''): conv_offset_mask_w = self.weights[parent + '.conv.conv_offset_mask.weight'].numpy() conv_offset_mask_b = self.weights[parent + '.conv.conv_offset_mask.bias'].numpy() conv_offset_mask = self.network.add_convolution(input=input_tensor.get_output(0), num_output_maps=deformable_group*3*kernel*kernel, kernel_shape=( kernel, kernel), kernel=conv_offset_mask_w, bias=conv_offset_mask_b) conv_offset_mask.stride = (stride, stride) conv_offset_mask.padding = (padding, padding) out_channels = trt.PluginField("out_channels", np.array( [out_channels], dtype=np.int32), trt.PluginFieldType.INT32) kernel = trt.PluginField("kernel", np.array( [kernel], dtype=np.int32), trt.PluginFieldType.INT32) deformable_group = trt.PluginField("deformable_group", np.array( [deformable_group], dtype=np.int32), trt.PluginFieldType.INT32) dilation = trt.PluginField("dilation", np.array( [dilation], dtype=np.int32), trt.PluginFieldType.INT32) padding = trt.PluginField("padding", np.array( [padding], dtype=np.int32), trt.PluginFieldType.INT32) stride = trt.PluginField("stride", np.array( [stride], dtype=np.int32), trt.PluginFieldType.INT32) weight = trt.PluginField( "weight", self.weights[parent + '.conv.weight'].numpy(), trt.PluginFieldType.FLOAT32) bias = trt.PluginField( "bias", self.weights[parent + '.conv.bias'].numpy(), trt.PluginFieldType.FLOAT32) field_collection = trt.PluginFieldCollection( [out_channels, kernel, deformable_group, dilation, padding, stride, weight, bias]) 
DCN = dcnCreator.create_plugin( name='DCNv2_TRT', field_collection=field_collection) sigmoid_conv_offset_mask = self.network.add_activation( input=conv_offset_mask.get_output(0), type=trt.ActivationType.SIGMOID) dcn = self.network.add_plugin_v2(inputs=[input_tensor.get_output( 0), conv_offset_mask.get_output(0), sigmoid_conv_offset_mask.get_output(0)], plugin=DCN) bn = self.add_batchnorm_2d(dcn, parent+'.actf.0') return self.network.add_activation(input=bn.get_output(0), type=trt.ActivationType.RELU) def add_ida_up(self, input_tensors, out_channels, up_f, startp, parent): for i in range(startp + 1, len(input_tensors)): proj = self.add_deform_conv( input_tensors[i], out_channels, parent=parent+'.proj_%d' % (i-startp)) f = up_f[i-startp] up_w = self.weights[parent + '.up_%d.weight' % (i-startp)].numpy() up = self.network.add_deconvolution( proj.get_output(0), out_channels, (f*2, f*2), up_w) up.stride = (f, f) up.padding = (f//2, f//2) up.num_groups = out_channels node = self.network.add_elementwise( input_tensors[i-1].get_output(0), up.get_output(0), trt.ElementWiseOperation.SUM) input_tensors[i] = self.add_deform_conv( node, out_channels, parent=parent+'.node_%d' % (i-startp)) return input_tensors def add_dla_up(self, input_tensors, first_level, parent): channels = self.channels[first_level:] scales = [2 ** i for i in range(len(self.channels[first_level:]))] scales = np.array(scales, dtype=int) out = [input_tensors[-1]] for i in range(len(channels) - 1): j = -i - 2 input_tensors = self.add_ida_up( input_tensors, channels[j], scales[j:] // scales[j], len(input_tensors) - i - 2, parent+'.ida_%d' % i) out.insert(0, input_tensors[-1]) scales[j + 1:] = scales[j] channels[j + 1:] = [channels[j] for _ in channels[j + 1:]] return out def add_head(self, input_tensor, out_channels, head, head_conv=256, final_kernal=1): conv1_w = self.weights[head+'.0.weight'].numpy() conv1_b = self.weights[head+'.0.bias'].numpy() conv1 = self.network.add_convolution( 
input_tensor.get_output(0), head_conv, (3, 3), conv1_w, conv1_b) conv1.padding = (1, 1) ac1 = self.network.add_activation( input=conv1.get_output(0), type=trt.ActivationType.RELU) conv2_w = self.weights[head + '.2.weight'].numpy() conv2_b = self.weights[head+'.2.bias'].numpy() conv2 = self.network.add_convolution(ac1.get_output( 0), out_channels, (final_kernal, final_kernal), conv2_w, conv2_b) return conv2 def populate_network(self): # Configure the network layers based on the self.weights provided. input_tensor = self.network.add_input( name=ModelData.INPUT_NAME, dtype=ModelData.DTYPE, shape=ModelData.INPUT_SHAPE) y = self.add_base(input_tensor, 'module.base') first_level = int(np.log2(self.down_ratio)) last_level = self.last_level dla_up = self.add_dla_up(y, first_level, 'module.dla_up') ida_up = self.add_ida_up(dla_up[:last_level-first_level], self.channels[first_level], [ 2 ** i for i in range(last_level - first_level)], 0, 'module.ida_up') hm = self.add_head(ida_up[-1], 80, 'module.hm') wh = self.add_head(ida_up[-1], 2, 'module.wh') reg = self.add_head(ida_up[-1], 2, 'module.reg') hm.get_output(0).name = 'hm' wh.get_output(0).name = 'wh' reg.get_output(0).name = 'reg' self.network.mark_output(tensor=hm.get_output(0)) self.network.mark_output(tensor=wh.get_output(0)) self.network.mark_output(tensor=reg.get_output(0)) def build_engine(self): # For more information on TRT basics, refer to the introductory samples. with trt.Builder(TRT_LOGGER) as builder, builder.create_network() as network: self.network = network builder.max_workspace_size = common.GiB(1) builder.max_batch_size = 1 # Populate the network using self.weights from the PyTorch model. self.populate_network() # Build and return an engine. return builder.build_cuda_engine(self.network) def load_random_test_case(pagelocked_buffer): # Select an image at random to be the test case. 
img = np.random.randn(1, 3, 512, 512).astype(np.float32) # Copy to the pagelocked input buffer np.copyto(pagelocked_buffer, img.ravel()) return img def main(args): # Get the PyTorch weights weights = torch.load(args.model, map_location={ 'cuda:0': 'cpu'})['state_dict'] # Do inference with TensorRT. with Centernet_dla34(weights).engine as engine: if args.save_engine: with open('centernet.engine', "wb") as f: f.write(engine.serialize()) inputs, outputs, bindings, stream = common.allocate_buffers(engine) with engine.create_execution_context() as context: img = load_random_test_case(pagelocked_buffer=inputs[0].host) # For more information on performing inference, refer to the introductory samples. # The common.do_inference function will return a list of outputs - we only have one in this case. t = time.time() [hm, wh, reg] = common.do_inference( context, bindings=bindings, inputs=inputs, outputs=outputs, stream=stream, batch_size=1) t = time.time() - t print('output: hm:%f, wh:%f, reg:%f' % (hm.mean(), wh.mean(), reg.mean())) print(t) if __name__ == '__main__': parser = argparse.ArgumentParser(description='CenterNet dla34 ctdet') parser.add_argument('--model', '-m', type=str, default='./ctdet_coco_dla_2x.pth', help='path of pytorch .pth') parser.add_argument('--save_engine', '-s', action='store_true', help='if save trt engine') args = parser.parse_args() main(args) ================================================ FILE: centernet/dcnv2Plugin/CMakeLists.txt ================================================ # # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
# You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # file(GLOB SRCS *.cpp) set(PLUGIN_SOURCES ${PLUGIN_SOURCES} ${SRCS}) set(PLUGIN_SOURCES ${PLUGIN_SOURCES} PARENT_SCOPE) file(GLOB CU_SRCS *.cu) set(PLUGIN_CU_SOURCES ${PLUGIN_CU_SOURCES} ${CU_SRCS}) set(PLUGIN_CU_SOURCES ${PLUGIN_CU_SOURCES} PARENT_SCOPE) ================================================ FILE: centernet/dcnv2Plugin/dcn_v2_im2col_cuda.cu ================================================ #include "dcn_v2_im2col_cuda.h" #include #include #include #define CUDA_KERNEL_LOOP(i, n) \ for (int i = blockIdx.x * blockDim.x + threadIdx.x; \ i < (n); \ i += blockDim.x * gridDim.x) const int CUDA_NUM_THREADS = 512; //inline int GET_BLOCKS(const int N) //{ // return (N + CUDA_NUM_THREADS - 1) / CUDA_NUM_THREADS; //} dim3 GET_BLOCKS(uint n) { uint k = (n - 1) /CUDA_NUM_THREADS + 1; uint x = k ; uint y = 1 ; if (x > 65535 ) { x = ceil(sqrt(x)); y = (n - 1 )/(x*CUDA_NUM_THREADS) + 1; } dim3 d = {x,y,1} ; return d; } __device__ float dmcn_im2col_bilinear(const float *bottom_data, const int data_width, const int height, const int width, float h, float w) { int h_low = floor(h); int w_low = floor(w); int h_high = h_low + 1; int w_high = w_low + 1; float lh = h - h_low; float lw = w - w_low; float hh = 1 - lh, hw = 1 - lw; float v1 = 0; if (h_low >= 0 && w_low >= 0) v1 = bottom_data[h_low * data_width + w_low]; float v2 = 0; if (h_low >= 0 && w_high <= width - 1) v2 = bottom_data[h_low * data_width + w_high]; float v3 = 0; if (h_high <= height - 1 && w_low >= 0) v3 = bottom_data[h_high * data_width + w_low]; float v4 = 0; if (h_high <= height - 1 && w_high <= width - 1) 
// Bilinear-interpolation weight that integer pixel (h, w) receives from the
// fractional sampling location (argmax_h, argmax_w).
//
// Returns 0 when the sampling location lies outside (-1, height) x (-1, width)
// or when (h, w) is not one of the four integer neighbours of that location.
__device__ float dmcn_get_gradient_weight(float argmax_h, float argmax_w,
                                          const int h, const int w, const int height, const int width)
{
  const bool outside = argmax_h <= -1 || argmax_h >= height ||
                       argmax_w <= -1 || argmax_w >= width;
  if (outside)
  {
    return 0;
  }

  const int h_lo = floor(argmax_h);
  const int w_lo = floor(argmax_w);
  const int h_hi = h_lo + 1;
  const int w_hi = w_lo + 1;

  // (h, w) must coincide with one of the neighbouring rows and columns.
  const bool on_row = (h == h_lo) || (h == h_hi);
  const bool on_col = (w == w_lo) || (w == w_hi);
  if (!on_row || !on_col)
  {
    return 0;
  }

  // Low and high neighbours are distinct, so exactly one branch applies per axis.
  const float factor_h = (h == h_lo) ? (h + 1 - argmax_h) : (argmax_h + 1 - h);
  const float factor_w = (w == w_lo) ? (w + 1 - argmax_w) : (argmax_w + 1 - w);
  return factor_h * factor_w;
}
// Modulated deformable im2col: for every (channel, batch, output row, output
// column) work item, sample the input at kernel_h * kernel_w learned offset
// positions with bilinear interpolation, scale each sample by its modulation
// mask and write the result into the column buffer.
//
// Layouts, as implied by the index arithmetic below:
//   data_im     : [batch, num_channels, height, width]
//   data_offset : [batch, deformable_group, 2 * kernel_h * kernel_w, height_col, width_col]
//                 (channel 2k holds the h-offset, channel 2k + 1 the w-offset of kernel tap k)
//   data_mask   : [batch, deformable_group, kernel_h * kernel_w, height_col, width_col]
//   data_col    : [num_channels * kernel_h * kernel_w, batch, height_col, width_col]
//
// n is the number of work items (one per input channel, batch entry and output pixel).
__global__ void modulated_deformable_im2col_gpu_kernel(const int n,
                                                       const float *data_im, const float *data_offset, const float *data_mask,
                                                       const int height, const int width, const int kernel_h, const int kernel_w,
                                                       const int pad_h, const int pad_w,
                                                       const int stride_h, const int stride_w,
                                                       const int dilation_h, const int dilation_w,
                                                       const int channel_per_deformable_group,
                                                       const int batch_size, const int num_channels, const int deformable_group,
                                                       const int height_col, const int width_col,
                                                       float *data_col)
{
  CUDA_KERNEL_LOOP(index, n)
  {
    // Decompose the flat work-item index into (c_im, b_col, h_col, w_col).
    const int w_col = index % width_col;
    const int h_col = (index / width_col) % height_col;
    const int b_col = (index / width_col / height_col) % batch_size;
    const int c_im = (index / width_col / height_col) / batch_size;
    // First column-buffer channel belonging to this input channel.
    const int c_col = c_im * kernel_h * kernel_w;

    // Deformable group that owns this input channel.
    const int deformable_group_index = c_im / channel_per_deformable_group;

    // Top-left corner of the (undeformed) receptive field in the input.
    const int h_in = h_col * stride_h - pad_h;
    const int w_in = w_col * stride_w - pad_w;

    float *data_col_ptr = data_col + ((c_col * batch_size + b_col) * height_col + h_col) * width_col + w_col;
    const float *data_im_ptr = data_im + (b_col * num_channels + c_im) * height * width;
    const float *data_offset_ptr = data_offset + (b_col * deformable_group + deformable_group_index) * 2 * kernel_h * kernel_w * height_col * width_col;
    const float *data_mask_ptr = data_mask + (b_col * deformable_group + deformable_group_index) * kernel_h * kernel_w * height_col * width_col;

    for (int i = 0; i < kernel_h; ++i)
    {
      for (int j = 0; j < kernel_w; ++j)
      {
        const int data_offset_h_ptr = ((2 * (i * kernel_w + j)) * height_col + h_col) * width_col + w_col;
        const int data_offset_w_ptr = ((2 * (i * kernel_w + j) + 1) * height_col + h_col) * width_col + w_col;
        const int data_mask_hw_ptr = ((i * kernel_w + j) * height_col + h_col) * width_col + w_col;
        const float offset_h = data_offset_ptr[data_offset_h_ptr];
        const float offset_w = data_offset_ptr[data_offset_w_ptr];
        const float mask = data_mask_ptr[data_mask_hw_ptr];

        float val = static_cast<float>(0);
        const float h_im = h_in + i * dilation_h + offset_h;
        const float w_im = w_in + j * dilation_w + offset_w;
        // Positions in (-1, 0) are still sampled: their high neighbour is
        // inside the image. Anything further out contributes zero.
        if (h_im > -1 && w_im > -1 && h_im < height && w_im < width)
        {
          val = dmcn_im2col_bilinear(data_im_ptr, width, height, width, h_im, w_im);
        }
        *data_col_ptr = val * mask;
        // Advance to the next kernel tap's channel in the column buffer.
        data_col_ptr += batch_size * height_col * width_col;
      }
    }
  }
}
int deformable_group, const int height_col, const int width_col, float *grad_im) { CUDA_KERNEL_LOOP(index, n) { const int j = (index / width_col / height_col / batch_size) % kernel_w; const int i = (index / width_col / height_col / batch_size / kernel_w) % kernel_h; const int c = index / width_col / height_col / batch_size / kernel_w / kernel_h; // compute the start and end of the output const int deformable_group_index = c / channel_per_deformable_group; int w_out = index % width_col; int h_out = (index / width_col) % height_col; int b = (index / width_col / height_col) % batch_size; int w_in = w_out * stride_w - pad_w; int h_in = h_out * stride_h - pad_h; const float *data_offset_ptr = data_offset + (b * deformable_group + deformable_group_index) * 2 * kernel_h * kernel_w * height_col * width_col; const float *data_mask_ptr = data_mask + (b * deformable_group + deformable_group_index) * kernel_h * kernel_w * height_col * width_col; const int data_offset_h_ptr = ((2 * (i * kernel_w + j)) * height_col + h_out) * width_col + w_out; const int data_offset_w_ptr = ((2 * (i * kernel_w + j) + 1) * height_col + h_out) * width_col + w_out; const int data_mask_hw_ptr = ((i * kernel_w + j) * height_col + h_out) * width_col + w_out; const float offset_h = data_offset_ptr[data_offset_h_ptr]; const float offset_w = data_offset_ptr[data_offset_w_ptr]; const float mask = data_mask_ptr[data_mask_hw_ptr]; const float cur_inv_h_data = h_in + i * dilation_h + offset_h; const float cur_inv_w_data = w_in + j * dilation_w + offset_w; const float cur_top_grad = data_col[index] * mask; const int cur_h = (int)cur_inv_h_data; const int cur_w = (int)cur_inv_w_data; for (int dy = -2; dy <= 2; dy++) { for (int dx = -2; dx <= 2; dx++) { if (cur_h + dy >= 0 && cur_h + dy < height && cur_w + dx >= 0 && cur_w + dx < width && abs(cur_inv_h_data - (cur_h + dy)) < 1 && abs(cur_inv_w_data - (cur_w + dx)) < 1) { int cur_bottom_grad_pos = ((b * channels + c) * height + cur_h + dy) * width + cur_w + dx; 
// Backward pass with respect to the sampling offsets and the modulation mask.
//
// Each work item corresponds to one element of grad_offset, decomposed below
// as (b, c, h, w) with c ranging over offset_channels. For that element the
// loop walks the column-buffer channels of the owning deformable group that
// use the same kernel tap, accumulating
//   val  : sum of coordinate_weight * data_col * mask   -> grad_offset[index]
//   mval : sum of data_col * bilinear sample of data_im -> grad_mask[...]
// grad_mask is written only by the work items with an even offset channel,
// so each mask element is written once per (h-offset, w-offset) pair.
__global__ void modulated_deformable_col2im_coord_gpu_kernel(const int n,
                                                             const float *data_col, const float *data_im,
                                                             const float *data_offset, const float *data_mask,
                                                             const int channels, const int height, const int width,
                                                             const int kernel_h, const int kernel_w,
                                                             const int pad_h, const int pad_w,
                                                             const int stride_h, const int stride_w,
                                                             const int dilation_h, const int dilation_w,
                                                             const int channel_per_deformable_group,
                                                             const int batch_size, const int offset_channels, const int deformable_group,
                                                             const int height_col, const int width_col,
                                                             float *grad_offset, float *grad_mask)
{
  CUDA_KERNEL_LOOP(index, n)
  {
    float val = 0, mval = 0;
    // Decompose the flat index into (b, c, h, w) of the offset tensor.
    int w = index % width_col;
    int h = (index / width_col) % height_col;
    int c = (index / width_col / height_col) % offset_channels;
    int b = (index / width_col / height_col) / offset_channels;

    // Each deformable group owns 2 * kernel_h * kernel_w offset channels.
    const int deformable_group_index = c / (2 * kernel_h * kernel_w);
    const int col_step = kernel_h * kernel_w;
    // Counts loop iterations; used to step data_im_ptr one image plane at a time.
    int cnt = 0;
    // NOTE(review): here channel_per_deformable_group is used as a count of
    // column-buffer channels (it is divided by kernel_h * kernel_w to index
    // data_im) — confirm against the host-side caller.
    const float *data_col_ptr = data_col + deformable_group_index * channel_per_deformable_group * batch_size * width_col * height_col;
    const float *data_im_ptr = data_im + (b * deformable_group + deformable_group_index) * channel_per_deformable_group / kernel_h / kernel_w * height * width;
    const float *data_offset_ptr = data_offset + (b * deformable_group + deformable_group_index) * 2 * kernel_h * kernel_w * height_col * width_col;
    const float *data_mask_ptr = data_mask + (b * deformable_group + deformable_group_index) * kernel_h * kernel_w * height_col * width_col;

    // Offset channel relative to the start of this deformable group.
    const int offset_c = c - deformable_group_index * 2 * kernel_h * kernel_w;

    // offset_c / 2 is the kernel tap; stepping by col_step visits the same tap
    // in every subsequent group of kernel_h * kernel_w column channels.
    for (int col_c = (offset_c / 2); col_c < channel_per_deformable_group; col_c += col_step)
    {
      const int col_pos = (((col_c * batch_size + b) * height_col) + h) * width_col + w;
      // 0: gradient w.r.t. the h-offset, 1: gradient w.r.t. the w-offset.
      const int bp_dir = offset_c % 2;

      // Recover kernel tap (i, j) and output pixel from the column position.
      int j = (col_pos / width_col / height_col / batch_size) % kernel_w;
      int i = (col_pos / width_col / height_col / batch_size / kernel_w) % kernel_h;
      int w_out = col_pos % width_col;
      int h_out = (col_pos / width_col) % height_col;
      int w_in = w_out * stride_w - pad_w;
      int h_in = h_out * stride_h - pad_h;
      const int data_offset_h_ptr = (((2 * (i * kernel_w + j)) * height_col + h_out) * width_col + w_out);
      const int data_offset_w_ptr = (((2 * (i * kernel_w + j) + 1) * height_col + h_out) * width_col + w_out);
      const int data_mask_hw_ptr = (((i * kernel_w + j) * height_col + h_out) * width_col + w_out);
      const float offset_h = data_offset_ptr[data_offset_h_ptr];
      const float offset_w = data_offset_ptr[data_offset_w_ptr];
      const float mask = data_mask_ptr[data_mask_hw_ptr];
      // Fractional sampling position in the input image.
      float inv_h = h_in + i * dilation_h + offset_h;
      float inv_w = w_in + j * dilation_w + offset_w;
      if (inv_h <= -1 || inv_w <= -1 || inv_h >= height || inv_w >= width)
      {
        // Out of range: force a position for which
        // dmcn_get_coordinate_weight returns 0.
        inv_h = inv_w = -2;
      }
      else
      {
        mval += data_col_ptr[col_pos] * dmcn_im2col_bilinear(data_im_ptr + cnt * height * width, width, height, width, inv_h, inv_w);
      }
      const float weight = dmcn_get_coordinate_weight(
          inv_h, inv_w,
          height, width, data_im_ptr + cnt * height * width, width, bp_dir);
      val += weight * data_col_ptr[col_pos] * mask;
      cnt += 1;
    }
    // Plain assignment replaces the framework's KERNEL_ASSIGN(grad_offset[index], offset_req, val).
    grad_offset[index] = val;
    if (offset_c % 2 == 0)
      // Plain assignment replaces the framework's KERNEL_ASSIGN(grad_mask[...], mask_req, mval).
      grad_mask[(((b * deformable_group + deformable_group_index) * kernel_h * kernel_w + offset_c / 2) * height_col + h) * width_col + w] = mval;
  }
}
// Host launcher for modulated_deformable_col2im_gpu_kernel: scatters column
// buffer gradients (data_col) back onto the input image gradient (grad_im).
//
// One work item is launched per element of the column buffer, i.e.
// channels * kernel_h * kernel_w * batch_size * height_col * width_col.
// The kernel is enqueued on `stream`; a launch error is reported on stdout
// and otherwise ignored, as the function has no return value.
void modulated_deformable_col2im_cuda(cudaStream_t stream,
  const float* data_col, const float* data_offset, const float* data_mask,
  const int batch_size, const int channels, const int height_im, const int width_im,
  const int height_col, const int width_col, const int kernel_h, const int kernel_w,
  const int pad_h, const int pad_w, const int stride_h, const int stride_w,
  const int dilation_h, const int dilation_w,
  const int deformable_group, float* grad_im)
{
  const int channel_per_deformable_group = channels / deformable_group;
  const int num_kernels = channels * kernel_h * kernel_w * batch_size * height_col * width_col;

  // The horizontal padding must be pad_w; passing pad_h twice produced wrong
  // sampling positions whenever pad_h != pad_w.
  modulated_deformable_col2im_gpu_kernel
      <<<GET_BLOCKS(num_kernels), CUDA_NUM_THREADS, 0, stream>>>(
        num_kernels, data_col, data_offset, data_mask, channels, height_im, width_im,
        kernel_h, kernel_w, pad_h, pad_w, stride_h, stride_w,
        dilation_h, dilation_w, channel_per_deformable_group,
        batch_size, deformable_group, height_col, width_col, grad_im);

  cudaError_t err = cudaGetLastError();
  if (err != cudaSuccess)
  {
    printf("error in modulated_deformable_col2im_cuda: %s\n", cudaGetErrorString(err));
  }
}
float* data_im, const float* data_offset, const float* data_mask, const int batch_size, const int channels, const int height_im, const int width_im, const int height_col, const int width_col, const int kernel_h, const int kernel_w, const int pad_h, const int pad_w, const int stride_h, const int stride_w, const int dilation_h, const int dilation_w, const int deformable_group, float* grad_offset, float* grad_mask) { const int num_kernels = batch_size * height_col * width_col * 2 * kernel_h * kernel_w * deformable_group; const int channel_per_deformable_group = channels * kernel_h * kernel_w / deformable_group; modulated_deformable_col2im_coord_gpu_kernel <<>>( num_kernels, data_col, data_im, data_offset, data_mask, channels, height_im, width_im, kernel_h, kernel_w, pad_h, pad_w, stride_h, stride_w, dilation_h, dilation_w, channel_per_deformable_group, batch_size, 2 * kernel_h * kernel_w * deformable_group, deformable_group, height_col, width_col, grad_offset, grad_mask); cudaError_t err = cudaGetLastError(); if (err != cudaSuccess) { printf("error in modulated_deformable_col2im_coord_cuda: %s\n", cudaGetErrorString(err)); } } ================================================ FILE: centernet/dcnv2Plugin/dcn_v2_im2col_cuda.h ================================================ /*! ******************* BEGIN Caffe Copyright Notice and Disclaimer **************** * * COPYRIGHT * * All contributions by the University of California: * Copyright (c) 2014-2017 The Regents of the University of California (Regents) * All rights reserved. * * All other contributions: * Copyright (c) 2014-2017, the respective contributors * All rights reserved. * * Caffe uses a shared copyright model: each contributor holds copyright over * their contributions to Caffe. The project versioning records all such * contribution and copyright details. 
If a contributor wants to further mark * their specific copyright on a particular contribution, they should indicate * their copyright solely in the commit message of the change when it is * committed. * * LICENSE * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, this * list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * * CONTRIBUTION AGREEMENT * * By contributing to the BVLC/caffe repository through pull-request, comment, * or otherwise, the contributor releases their content to the * license and copyright terms herein. 
* ***************** END Caffe Copyright Notice and Disclaimer ******************** * * Copyright (c) 2018 Microsoft * Licensed under The MIT License [see LICENSE for details] * \file modulated_deformable_im2col.h * \brief Function definitions of converting an image to * column matrix based on kernel, padding, dilation, and offset. * These functions are mainly used in deformable convolution operators. * \ref: https://arxiv.org/abs/1811.11168 * \author Yuwen Xiong, Haozhi Qi, Jifeng Dai, Xizhou Zhu, Han Hu */ /***************** Adapted by Charles Shang *********************/ #ifndef DCN_V2_IM2COL_CUDA #define DCN_V2_IM2COL_CUDA // #ifdef __cplusplus // extern "C" // { // #endif void modulated_deformable_im2col_cuda(cudaStream_t stream, const float *data_im, const float *data_offset, const float *data_mask, const int batch_size, const int channels, const int height_im, const int width_im, const int height_col, const int width_col, const int kernel_h, const int kenerl_w, const int pad_h, const int pad_w, const int stride_h, const int stride_w, const int dilation_h, const int dilation_w, const int deformable_group, float *data_col); void modulated_deformable_col2im_cuda(cudaStream_t stream, const float *data_col, const float *data_offset, const float *data_mask, const int batch_size, const int channels, const int height_im, const int width_im, const int height_col, const int width_col, const int kernel_h, const int kenerl_w, const int pad_h, const int pad_w, const int stride_h, const int stride_w, const int dilation_h, const int dilation_w, const int deformable_group, float *grad_im); void modulated_deformable_col2im_coord_cuda(cudaStream_t stream, const float *data_col, const float *data_im, const float *data_offset, const float *data_mask, const int batch_size, const int channels, const int height_im, const int width_im, const int height_col, const int width_col, const int kernel_h, const int kenerl_w, const int pad_h, const int pad_w, const int stride_h, const int 
stride_w, const int dilation_h, const int dilation_w, const int deformable_group, float *grad_offset, float *grad_mask); // #ifdef __cplusplus // } // #endif #endif ================================================ FILE: centernet/dcnv2Plugin/dcnv2Plugin.cpp ================================================ #include "dcnv2Plugin.h" #include using namespace nvinfer1; using nvinfer1::plugin::DeformableConvolutionalLayer; using nvinfer1::plugin::DCNv2PluginCreator; namespace { const char* DCNv2_PLUGIN_VERSION{"1"}; const char* DCNv2_PLUGIN_NAME{"DCNv2_TRT"}; } // namespace #define CHECK_CUDA(call) \ do \ { \ cudaError_t status = call; \ if (status != cudaSuccess) \ { \ return status; \ } \ } while (0) PluginFieldCollection DCNv2PluginCreator::mFC{}; std::vector DCNv2PluginCreator::mPluginAttributes; // Parameterized constructor DeformableConvolutionalLayer::DeformableConvolutionalLayer( int out_channels, int kernel, int deformable_group, int dilation, int padding, int stride, const Weights* weight, const Weights* bias): out_channels(out_channels),kernel_size(kernel),deformable_group(deformable_group), dilation(dilation),padding(padding),stride(stride){ mWeight = copyToDevice(weight[0].values, weight[0].count); mBias = copyToDevice(bias[0].values, bias[0].count); } DeformableConvolutionalLayer::DeformableConvolutionalLayer(const void* buffer, size_t length) { const char* d = static_cast(buffer); const char* a = d; in_channels = read(d); height = read(d); width = read(d); height_out = read(d); width_out = read(d); out_channels = read(d); kernel_size = read(d); deformable_group = read(d); dilation = read(d); padding = read(d); stride = read(d); int count = read(d); mWeight = deserializeToDevice(d, count); count = read(d); mBias = deserializeToDevice(d, count); ASSERT(d == a + length); } int DeformableConvolutionalLayer::getNbOutputs() const { // Plugin layer has 2 outputs return 1; } int DeformableConvolutionalLayer::initialize() { size_t oneSize = height_out * width_out * 
sizeof(float); std::vector one_((int)oneSize, 1.0f); CHECK_CUDA(cudaMalloc((void**)&mOne, oneSize)); CHECK_CUDA(cudaMalloc((void**)&mColumn, in_channels * kernel_size * kernel_size * oneSize)); CHECK_CUDA(cudaMemcpy(mOne, one_.data(), oneSize, cudaMemcpyHostToDevice)); return STATUS_SUCCESS; } Dims DeformableConvolutionalLayer::getOutputDimensions(int index, const Dims* inputs, int nbInputs) { ASSERT(index == 0); ASSERT(nbInputs == 3); in_channels = inputs[0].d[0]; height = inputs[0].d[1]; width = inputs[0].d[2]; height_out = (inputs[0].d[1] + 2 * padding - (dilation * (kernel_size - 1) + 1)) / stride + 1; width_out = (inputs[0].d[2] + 2 * padding - (dilation * (kernel_size - 1) + 1)) / stride + 1; return Dims3(out_channels, height_out, width_out); } size_t DeformableConvolutionalLayer::getWorkspaceSize(int maxBatchSize) const { return 0; } int DeformableConvolutionalLayer::enqueue(int batchSize, const void* const* inputs, void** outputs, void* workspace, cudaStream_t stream) { const float* input = static_cast(inputs[0]); const float* offset = static_cast(inputs[1]); const float* offset_mask = static_cast(inputs[2]); const float* mask = offset_mask + deformable_group * 2 * kernel_size * kernel_size * height * width; float * output = static_cast(outputs[0]); float alpha{1}, beta{0}; // Do Bias first: // M,N,K are dims of matrix A and B // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm) // (N x 1) (1 x M) int m_ = out_channels; int n_ = height_out * width_out; int k_ = 1; cublasSgemm(mCublas, CUBLAS_OP_T, CUBLAS_OP_N, n_, m_, k_, &alpha, mOne, k_, static_cast(mBias.values), k_, &beta, output, n_); modulated_deformable_im2col_cuda(stream, input, offset, mask, 1, in_channels, height, width, height_out, width_out, kernel_size, kernel_size, padding, padding, stride, stride, dilation, dilation, deformable_group, mColumn); //(k * m) x (m * n) // Y = WC int m = out_channels; int n = height_out * width_out; int k = in_channels * kernel_size * kernel_size; 
cublasSgemm(mCublas, CUBLAS_OP_N, CUBLAS_OP_N, n, m, k, &alpha, mColumn, n, static_cast(mWeight.values), k, &alpha, output, n); return 0; } size_t DeformableConvolutionalLayer::getSerializationSize() const { return sizeof(int) * 13 + (mWeight.count + mBias.count) * sizeof(float); } void DeformableConvolutionalLayer::serialize(void* buffer) const { char *d = reinterpret_cast(buffer), *a = d; write(d, in_channels); write(d, height); write(d, width); write(d, height_out); write(d, width_out); write(d, out_channels); write(d, kernel_size); write(d, deformable_group); write(d, dilation); write(d, padding); write(d, stride); write(d, (int) mWeight.count); serializeFromDevice(d, mWeight); write(d, (int) mBias.count); serializeFromDevice(d, mBias); ASSERT(d == a + getSerializationSize()); } bool DeformableConvolutionalLayer::supportsFormat(DataType type, PluginFormat format) const { return (type == DataType::kFLOAT && format == PluginFormat::kNCHW); } Weights DeformableConvolutionalLayer::copyToDevice(const void* hostData, size_t count) { void* deviceData; CUASSERT(cudaMalloc(&deviceData, count * sizeof(float))); CUASSERT(cudaMemcpy(deviceData, hostData, count * sizeof(float), cudaMemcpyHostToDevice)); return Weights{DataType::kFLOAT, deviceData, int64_t(count)}; } void DeformableConvolutionalLayer::serializeFromDevice(char*& hostBuffer, Weights deviceWeights) const { CUASSERT(cudaMemcpy(hostBuffer, deviceWeights.values, deviceWeights.count * sizeof(float), cudaMemcpyDeviceToHost)); hostBuffer += deviceWeights.count * sizeof(float); } Weights DeformableConvolutionalLayer::deserializeToDevice(const char*& hostBuffer, size_t count) { Weights w = copyToDevice(hostBuffer, count); hostBuffer += count * sizeof(float); return w; } const char* DeformableConvolutionalLayer::getPluginType() const { return DCNv2_PLUGIN_NAME; } const char* DeformableConvolutionalLayer::getPluginVersion() const { return DCNv2_PLUGIN_VERSION; } void DeformableConvolutionalLayer::terminate() { if (mOne) 
// Create a new plugin object as a copy of this one and give it the same
// plugin namespace. The caller receives ownership of the returned object.
//
// NOTE(review): the copy is made with the class's copy constructor, which is
// not visible here. If it is the implicit member-wise copy, the clone shares
// the device pointers held by this object (weights, bias and the buffers
// allocated in initialize()) — confirm that ownership/free logic accounts for
// that.
IPluginV2Ext* DeformableConvolutionalLayer::clone() const
{
    IPluginV2Ext* plugin = new DeformableConvolutionalLayer(*this);
    plugin->setPluginNamespace(mPluginNamespace.c_str());
    return plugin;
}
// inutDims: input Dimensions for the plugin layer // nInputs : Number of inputs to the plugin layer // outputDims: output Dimensions from the plugin layer // nOutputs: number of outputs from the plugin layer // type: DataType configuration for the plugin layer // format: format NCHW, NHWC etc // maxbatchSize: maximum batch size for the plugin layer void DeformableConvolutionalLayer::configurePlugin(const Dims* inputDims, int nbInputs, const Dims* outputDims, int nbOutputs, const DataType* inputTypes, const DataType* outputTypes, const bool* inputIsBroadcast, const bool* outputIsBroadcast, PluginFormat floatFormat, int maxBatchSize) { ASSERT(*inputTypes == DataType::kFLOAT && floatFormat == PluginFormat::kNCHW); } // Attach the plugin object to an execution context and grant the plugin the access to some context resource. void DeformableConvolutionalLayer::attachToContext(cudnnContext* cudnnContext, cublasContext* cublasContext, IGpuAllocator* gpuAllocator) { mCublas = cublasContext; } // Detach the plugin object from its execution context. 
void DeformableConvolutionalLayer::detachFromContext() {} DCNv2PluginCreator::DCNv2PluginCreator() { mPluginAttributes.emplace_back(PluginField("out_channels", nullptr, PluginFieldType::kINT32, 1)); mPluginAttributes.emplace_back(PluginField("kernel", nullptr, PluginFieldType::kINT32, 1)); mPluginAttributes.emplace_back(PluginField("deformable_group", nullptr, PluginFieldType::kINT32, 1)); mPluginAttributes.emplace_back(PluginField("dilation", nullptr, PluginFieldType::kINT32, 1)); mPluginAttributes.emplace_back(PluginField("padding", nullptr, PluginFieldType::kINT32, 1)); mPluginAttributes.emplace_back(PluginField("stride", nullptr, PluginFieldType::kINT32, 1)); mPluginAttributes.emplace_back(PluginField("weight", nullptr, PluginFieldType::kFLOAT32, 1)); mPluginAttributes.emplace_back(PluginField("bias", nullptr, PluginFieldType::kFLOAT32, 1)); mFC.nbFields = mPluginAttributes.size(); mFC.fields = mPluginAttributes.data(); } const char* DCNv2PluginCreator::getPluginName() const { return DCNv2_PLUGIN_NAME; } const char* DCNv2PluginCreator::getPluginVersion() const { return DCNv2_PLUGIN_VERSION; } const PluginFieldCollection* DCNv2PluginCreator::getFieldNames() { return &mFC; } IPluginV2Ext* DCNv2PluginCreator::createPlugin(const char* name, const PluginFieldCollection* fc) { std::vector weight; std::vector bias; int out_channels, kernel, deformable_group, padding, stride, dilation; const PluginField* fields = fc->fields; for (int i = 0; i < fc->nbFields; ++i) { const char* attrName = fields[i].name; if (!strcmp(attrName, "out_channels")) { ASSERT(fields[i].type == PluginFieldType::kINT32); out_channels = *(static_cast(fields[i].data)); } else if (!strcmp(attrName, "kernel")) { ASSERT(fields[i].type == PluginFieldType::kINT32); kernel = *(static_cast(fields[i].data)); } else if (!strcmp(attrName, "deformable_group")) { ASSERT(fields[i].type == PluginFieldType::kINT32); deformable_group = *(static_cast(fields[i].data)); } else if (!strcmp(attrName, "dilation")) { 
ASSERT(fields[i].type == PluginFieldType::kINT32); dilation = *(static_cast(fields[i].data)); } else if (!strcmp(attrName, "stride")) { ASSERT(fields[i].type == PluginFieldType::kINT32); stride = *(static_cast(fields[i].data)); } else if (!strcmp(attrName, "padding")) { ASSERT(fields[i].type == PluginFieldType::kINT32); padding = *(static_cast(fields[i].data)); } else if (!strcmp(attrName, "weight")) { ASSERT(fields[i].type == PluginFieldType::kFLOAT32); int size = fields[i].length; weight.reserve(size); const auto* w = static_cast(fields[i].data); for (int j = 0; j < size; j++) { weight.push_back(*w); w++; } } else if (!strcmp(attrName, "bias")) { ASSERT(fields[i].type == PluginFieldType::kFLOAT32); int size = fields[i].length; bias.reserve(size); const auto* w = static_cast(fields[i].data); for (int j = 0; j < size; j++) { bias.push_back(*w); w++; } } } Weights mWeight{DataType::kFLOAT, weight.data(), (int64_t) weight.size()}; Weights mBias{DataType::kFLOAT, bias.data(), (int64_t) bias.size()}; DeformableConvolutionalLayer* obj = new DeformableConvolutionalLayer(out_channels, kernel, deformable_group, dilation, padding, stride, &mWeight, &mBias); obj->setPluginNamespace(mNamespace.c_str()); return obj; } IPluginV2Ext* DCNv2PluginCreator::deserializePlugin(const char* name, const void* serialData, size_t serialLength) { // This object will be deleted when the network is destroyed, which will // call Normalize::destroy() DeformableConvolutionalLayer* obj = new DeformableConvolutionalLayer(serialData, serialLength); obj->setPluginNamespace(mNamespace.c_str()); return obj; } ================================================ FILE: centernet/dcnv2Plugin/dcnv2Plugin.h ================================================ /* * Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
* You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
#ifndef TRT_DCNV2_PLUGIN_H
#define TRT_DCNV2_PLUGIN_H
#include "kernel.h"
#include "plugin.h"
#include "dcn_v2_im2col_cuda.h"
#include "serialize.hpp"
// NOTE(review): the targets of the six system includes below are missing from this copy of the
// header; restore them from the upstream file before building.
#include #include #include #include #include #include
using namespace nvinfer1::plugin;
namespace nvinfer1 {
namespace plugin {

// TensorRT IPluginV2Ext implementation of the DCNv2 deformable convolution layer.
class DeformableConvolutionalLayer : public IPluginV2Ext {
   public:
    // Build the layer from explicit hyper-parameters plus weight and bias blobs.
    DeformableConvolutionalLayer(int out_channels, int kernel, int deformable_group, int dilation, int padding,
                                 int stride, const Weights* weight, const Weights* bias);

    // Rebuild the layer from a serialized buffer of `length` bytes.
    DeformableConvolutionalLayer(const void* buffer, size_t length);

    ~DeformableConvolutionalLayer() override = default;

    int getNbOutputs() const override;

    Dims getOutputDimensions(int index, const Dims* inputs, int nbInputs) override;

    int initialize() override;

    void terminate() override;

    size_t getWorkspaceSize(int maxBatchSize) const override;

    int enqueue(int batchSize, const void* const* inputs, void** outputs, void* workspace,
                cudaStream_t stream) override;

    size_t getSerializationSize() const override;

    void serialize(void* buffer) const override;

    bool supportsFormat(DataType type, PluginFormat format) const override;

    const char* getPluginType() const override;

    const char* getPluginVersion() const override;

    void destroy() override;

    IPluginV2Ext* clone() const override;

    void setPluginNamespace(const char* pluginNamespace) override;

    const char* getPluginNamespace() const override;

    DataType getOutputDataType(int index, const nvinfer1::DataType* inputTypes, int nbInputs) const override;

    bool isOutputBroadcastAcrossBatch(int outputIndex, const bool* inputIsBroadcasted, int nbInputs) const override;

    bool canBroadcastInputAcrossBatch(int inputIndex) const override;

    void attachToContext(cudnnContext* cudnnContext, cublasContext* cublasContext,
                         IGpuAllocator* gpuAllocator) override;

    void configurePlugin(const Dims* inputDims, int nbInputs, const Dims* outputDims, int nbOutputs,
                         const DataType* inputTypes, const DataType* outputTypes, const bool* inputIsBroadcast,
                         const bool* outputIsBroadcast, PluginFormat floatFormat, int maxBatchSize) override;

    void detachFromContext() override;

   private:
    // Helpers for moving weight blobs between host buffers and the device (definitions not shown here).
    Weights copyToDevice(const void* hostData, size_t count);
    void serializeFromDevice(char*& hostBuffer, Weights deviceWeights) const;
    Weights deserializeToDevice(const char*& hostBuffer, size_t count);

    std::string mPluginNamespace;

    // Tensor geometry; value-initialised to zero here.
    int in_channels{};
    int height_out{};
    int width_out{};
    int height{};
    int width{};

    // Convolution hyper-parameters.
    int out_channels{};
    int kernel_size{};
    int deformable_group{};
    int dilation{};
    int padding{};
    int stride{};

    Weights mWeight{};
    Weights mBias{};

    // NOTE(review): mOne, mColumn and mCublas have no default initialiser, while the cleanup code in
    // the .cpp tests `if (mColumn)` before cudaFree — confirm every constructor sets them.
    float* mOne;
    float* mColumn;
    cublasHandle_t mCublas;  // assigned in attachToContext()
};

// Plugin creator that builds DeformableConvolutionalLayer objects from a field collection or from
// serialized data.
class DCNv2PluginCreator : public BaseCreator {
   public:
    DCNv2PluginCreator();

    ~DCNv2PluginCreator() override = default;

    const char* getPluginName() const override;

    const char* getPluginVersion() const override;

    const PluginFieldCollection* getFieldNames() override;

    IPluginV2Ext* createPlugin(const char* name, const PluginFieldCollection* fc) override;

    IPluginV2Ext* deserializePlugin(const char* name, const void* serialData, size_t serialLength) override;

   private:
    // Both members are static, i.e. shared by every creator instance.
    static PluginFieldCollection mFC;

    // Parameters for DeformableConvolutionalLayer
    static std::vector mPluginAttributes;
};
}  // namespace plugin
}  // namespace nvinfer1
#endif  // TRT_DCNv2_PLUGIN_H

================================================
FILE: centernet/sample/common.py
================================================
#
# Copyright 1993-2020 NVIDIA Corporation.  All rights reserved.
# # NOTICE TO LICENSEE: # # This source code and/or documentation ("Licensed Deliverables") are # subject to NVIDIA intellectual property rights under U.S. and # international Copyright laws. # # These Licensed Deliverables contained herein is PROPRIETARY and # CONFIDENTIAL to NVIDIA and is being provided under the terms and # conditions of a form of NVIDIA software license agreement by and # between NVIDIA and Licensee ("License Agreement") or electronically # accepted by Licensee. Notwithstanding any terms or conditions to # the contrary in the License Agreement, reproduction or disclosure # of the Licensed Deliverables to any third party without the express # written consent of NVIDIA is prohibited. # # NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE # LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE # SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS # PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND. # NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED # DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY, # NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. # NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE # LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY # SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY # DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, # WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS # ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE # OF THESE LICENSED DELIVERABLES. # # U.S. Government End Users. These Licensed Deliverables are a # "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT # 1995), consisting of "commercial computer software" and "commercial # computer software documentation" as such terms are used in 48 # C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government # only as a commercial end item. 
Consistent with 48 C.F.R.12.212 and # 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all # U.S. Government End Users acquire the Licensed Deliverables with # only those rights set forth herein. # # Any use of the Licensed Deliverables in individual and commercial # software must include, in the user documentation and internal # comments to the code, the above Disclaimer and U.S. Government End # Users Notice. # from itertools import chain import argparse import os import pycuda.driver as cuda import pycuda.autoinit import numpy as np import tensorrt as trt try: # Sometimes python2 does not understand FileNotFoundError FileNotFoundError except NameError: FileNotFoundError = IOError EXPLICIT_BATCH = 1 << (int)(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH) def GiB(val): return val * 1 << 30 def add_help(description): parser = argparse.ArgumentParser(description=description, formatter_class=argparse.ArgumentDefaultsHelpFormatter) args, _ = parser.parse_known_args() def find_sample_data(description="Runs a TensorRT Python sample", subfolder="", find_files=[], err_msg=""): ''' Parses sample arguments. Args: description (str): Description of the sample. subfolder (str): The subfolder containing data relevant to this sample find_files (str): A list of filenames to find. Each filename will be replaced with an absolute path. Returns: str: Path of data directory. ''' # Standard command-line arguments for all samples. kDEFAULT_DATA_ROOT = os.path.join(os.sep, "usr", "src", "tensorrt", "data") parser = argparse.ArgumentParser(description=description, formatter_class=argparse.ArgumentDefaultsHelpFormatter) parser.add_argument("-d", "--datadir", help="Location of the TensorRT sample data directory, and any additional data directories.", action="append", default=[kDEFAULT_DATA_ROOT]) args, _ = parser.parse_known_args() def get_data_path(data_dir): # If the subfolder exists, append it to the path, otherwise use the provided path as-is. 
data_path = os.path.join(data_dir, subfolder) if not os.path.exists(data_path): if data_dir != kDEFAULT_DATA_ROOT: print("WARNING: " + data_path + " does not exist. Trying " + data_dir + " instead.") data_path = data_dir # Make sure data directory exists. if not (os.path.exists(data_path)) and data_dir != kDEFAULT_DATA_ROOT: print("WARNING: {:} does not exist. Please provide the correct data path with the -d option.".format(data_path)) return data_path data_paths = [get_data_path(data_dir) for data_dir in args.datadir] return data_paths, locate_files(data_paths, find_files, err_msg) def locate_files(data_paths, filenames, err_msg=""): """ Locates the specified files in the specified data directories. If a file exists in multiple data directories, the first directory is used. Args: data_paths (List[str]): The data directories. filename (List[str]): The names of the files to find. Returns: List[str]: The absolute paths of the files. Raises: FileNotFoundError if a file could not be located. """ found_files = [None] * len(filenames) for data_path in data_paths: # Find all requested files. for index, (found, filename) in enumerate(zip(found_files, filenames)): if not found: file_path = os.path.abspath(os.path.join(data_path, filename)) if os.path.exists(file_path): found_files[index] = file_path # Check that all files were found for f, filename in zip(found_files, filenames): if not f or not os.path.exists(f): raise FileNotFoundError("Could not find {:}. Searched in data paths: {:}\n{:}".format(filename, data_paths, err_msg)) return found_files # Simple helper data class that's a little nicer to use than a 2-tuple. class HostDeviceMem(object): def __init__(self, host_mem, device_mem): self.host = host_mem self.device = device_mem def __str__(self): return "Host:\n" + str(self.host) + "\nDevice:\n" + str(self.device) def __repr__(self): return self.__str__() # Allocates all buffers required for an engine, i.e. host/device inputs/outputs. 
def allocate_buffers(engine): inputs = [] outputs = [] bindings = [] stream = cuda.Stream() for binding in engine: size = trt.volume(engine.get_binding_shape(binding)) * engine.max_batch_size dtype = trt.nptype(engine.get_binding_dtype(binding)) # Allocate host and device buffers host_mem = cuda.pagelocked_empty(size, dtype) device_mem = cuda.mem_alloc(host_mem.nbytes) # Append the device buffer to device bindings. bindings.append(int(device_mem)) # Append to the appropriate list. if engine.binding_is_input(binding): inputs.append(HostDeviceMem(host_mem, device_mem)) else: outputs.append(HostDeviceMem(host_mem, device_mem)) return inputs, outputs, bindings, stream # This function is generalized for multiple inputs/outputs. # inputs and outputs are expected to be lists of HostDeviceMem objects. def do_inference(context, bindings, inputs, outputs, stream, batch_size=1): # Transfer input data to the GPU. [cuda.memcpy_htod_async(inp.device, inp.host, stream) for inp in inputs] # Run inference. context.execute_async(batch_size=batch_size, bindings=bindings, stream_handle=stream.handle) # Transfer predictions back from the GPU. [cuda.memcpy_dtoh_async(out.host, out.device, stream) for out in outputs] # Synchronize the stream stream.synchronize() # Return only the host outputs. return [out.host for out in outputs] # This function is generalized for multiple inputs/outputs for full dimension networks. # inputs and outputs are expected to be lists of HostDeviceMem objects. def do_inference_v2(context, bindings, inputs, outputs, stream): # Transfer input data to the GPU. [cuda.memcpy_htod_async(inp.device, inp.host, stream) for inp in inputs] # Run inference. context.execute_async_v2(bindings=bindings, stream_handle=stream.handle) # Transfer predictions back from the GPU. [cuda.memcpy_dtoh_async(out.host, out.device, stream) for out in outputs] # Synchronize the stream stream.synchronize() # Return only the host outputs. 
return [out.host for out in outputs] # `retry_call` and `retry` are used to wrap the function we want to try multiple times def retry_call(func, args=[], kwargs={}, n_retries=3): """Wrap a function to retry it several times. Args: func: function to call args (List): args parsed to func kwargs (Dict): kwargs parsed to func n_retries (int): maximum times of tries """ for i_try in range(n_retries): try: func(*args, **kwargs) break except: if i_try == n_retries - 1: raise print("retry...") # Usage: @retry(n_retries) def retry(n_retries=3): """Wrap a function to retry it several times. Decorator version of `retry_call`. Args: n_retries (int): maximum times of tries Usage: @retry(n_retries) def func(...): pass """ def wrapper(func): def _wrapper(*args, **kwargs): retry_call(func, args, kwargs, n_retries) return _wrapper return wrapper ================================================ FILE: centernet/sample/test.py ================================================ import cv2 as cv import numpy as np import tensorrt as trt import common import torch import time from sys import argv # You can set the logger severity higher to suppress messages (or lower to display more messages). TRT_LOGGER = trt.Logger(trt.Logger.WARNING) trt.init_libnvinfer_plugins(TRT_LOGGER, '') def _gather_feat(feat, ind, mask=None): dim = feat.size(2) ind = ind.unsqueeze(2).expand(ind.size(0), ind.size(1), dim) feat = feat.gather(1, ind) if mask is not None: mask = mask.unsqueeze(2).expand_as(feat) feat = feat[mask] feat = feat.view(-1, dim) return feat def _transpose_and_gather_feat(feat, ind): feat = feat.permute(0, 2, 3, 1).contiguous() feat = feat.view(feat.size(0), -1, feat.size(3)) feat = _gather_feat(feat, ind) return feat def pre_process(image): long_size = max(image.shape) img = np.zeros((long_size, long_size, 3)) img[:image.shape[0], :img.shape[1], :] = image[:] img = cv.resize(img, (512,512)) inp_image = ((img / 255. 
- 0.5) / 0.5).astype(np.float32) images = inp_image.transpose(2, 0, 1) return images, long_size/512 def _nms(heat, kernel=3): pad = (kernel - 1) // 2 hmax = torch.nn.functional.max_pool2d( heat, (kernel, kernel), stride=1, padding=pad) keep = (hmax == heat).float() return heat * keep def _topk(scores, K=40): batch, cat, height, width = scores.size() topk_scores, topk_inds = torch.topk(scores.view(batch, cat, -1), K) topk_inds = topk_inds % (height * width) topk_ys = (topk_inds.true_divide(width)).int().float() topk_xs = (topk_inds % width).int().float() topk_score, topk_ind = torch.topk(topk_scores.view(batch, -1), K) topk_clses = (topk_ind.true_divide(K)).int() topk_inds = _gather_feat( topk_inds.view(batch, -1, 1), topk_ind).view(batch, K) topk_ys = _gather_feat(topk_ys.view(batch, -1, 1), topk_ind).view(batch, K) topk_xs = _gather_feat(topk_xs.view(batch, -1, 1), topk_ind).view(batch, K) return topk_score, topk_inds, topk_clses, topk_ys, topk_xs def ctdet_decode(heat, wh, reg=None, cat_spec_wh=False, K=100): batch, cat, height, width = heat.size() heat = torch.sigmoid(heat) # perform nms on heatmaps heat = _nms(heat) scores, inds, clses, ys, xs = _topk(heat, K=K) if reg is not None: reg = _transpose_and_gather_feat(reg, inds) reg = reg.view(batch, K, 2) xs = xs.view(batch, K, 1) + reg[:, :, 0:1] ys = ys.view(batch, K, 1) + reg[:, :, 1:2] else: xs = xs.view(batch, K, 1) + 0.5 ys = ys.view(batch, K, 1) + 0.5 wh = _transpose_and_gather_feat(wh, inds) if cat_spec_wh: wh = wh.view(batch, K, cat, 2) clses_ind = clses.view(batch, K, 1, 1).expand(batch, K, 1, 2).long() wh = wh.gather(2, clses_ind).view(batch, K, 2) else: wh = wh.view(batch, K, 2) clses = clses.view(batch, K, 1).float() scores = scores.view(batch, K, 1) bboxes = torch.cat([xs - wh[..., 0:1] / 2, ys - wh[..., 1:2] / 2, xs + wh[..., 0:1] / 2, ys + wh[..., 1:2] / 2], dim=2) detections = torch.cat([bboxes, scores, clses], dim=2) return detections if __name__ == '__main__': try: engine_path = argv[1] img_path 
= argv[2] except: print('engine path and image path are needed!') exit() with open(engine_path, "rb") as f, trt.Runtime(TRT_LOGGER) as runtime, runtime.deserialize_cuda_engine(f.read()) as engine: inputs, outputs, bindings, stream = common.allocate_buffers(engine) with engine.create_execution_context() as context: img = cv.imread('test.jpg') dis = img.copy() img, s = pre_process(img) # Copy to the pagelocked input buffer np.copyto(inputs[0].host, img.ravel()) [hm, wh, reg] = common.do_inference( context, bindings=bindings, inputs=inputs, outputs=outputs, stream=stream, batch_size=1) [dets] = ctdet_decode(torch.from_numpy(hm.reshape(1, 80, 128, 128)), torch.from_numpy( wh.reshape(1, 2, 128, 128)), torch.from_numpy(reg.reshape(1, 2, 128, 128))) for i in dets: if i[-2] > 0.5: i[:4] *= 4*s cv.rectangle(dis, (int(i[0]), int( i[1])), (int(i[2]), int(i[3])), 255, 1) cv.putText(dis, '%d' % int(i[-1]), (int(i[0]), int(i[1])), 1, 1, 255) cv.imwrite('trt_out.jpg', dis) ================================================ FILE: contributing.md ================================================ # How to Contribute 1. Fork this repo to your github account 2. Clone your fork 3. Create a feature branch 4. Make changes, including but not limited to create new model, bug fix, documentation, tutorials, etc. 5. Pre-commit check and push, we use clang-format to do coding style checking, and the coding style is following google c++ coding style with 4-space. ```bash pip install pre-commit clang-format cd tensorrtx pre-commit install git add [files-to-commit] pre-commit run # fix pre-commit errors, then git add files-to-commit again git add [files-to-commit] git commit -m "describe your commit" git push origin [feature-branch] ``` 6. Submit a pull-request on github web UI to master branch of wang-xinyu/tensorrtx. 
================================================ FILE: convnextv2/CMakeLists.txt ================================================ cmake_minimum_required(VERSION 3.10) project(convnextv2) find_package(CUDA REQUIRED) find_package(OpenCV REQUIRED) include_directories(${CUDA_INCLUDE_DIRS} /usr/local/cuda/include /usr/local/TensorRT-8.6.1.6/include) link_directories(${CUDA_TOOLKIT_ROOT_DIR}/lib64 /usr/local/cuda/lib64 /usr/local/TensorRT-8.6.1.6/targets/x86_64-linux-gnu/lib) # TRT find_library(NVINFER nvinfer PATHS /usr/local/TensorRT-8.6.1.6/targets/x86_64-linux-gnu/lib NO_DEFAULT_PATH) find_library(NVINFER_PLUGIN nvinfer_plugin PATHS /usr/local/TensorRT-8.6.1.6/targets/x86_64-linux-gnu/lib NO_DEFAULT_PATH) find_library(NVPARSERS nvparsers PATHS /usr/local/TensorRT-8.6.1.6/targets/x86_64-linux-gnu/lib NO_DEFAULT_PATH) set(CMAKE_CXX_STANDARD 14) cuda_add_executable(convnextv2 src/convnextv2.cpp src/LayerNormPlugin.cu) target_link_libraries(convnextv2 ${NVINFER} ${NVINFER_PLUGIN} ${CUDA_LIBRARIES} ${OpenCV_LIBS}) cuda_add_library(layernorm_plugin SHARED src/LayerNormPlugin.cu) target_link_libraries(layernorm_plugin ${NVINFER} ${NVINFER_PLUGIN} ${CUDA_LIBRARIES}) # Inference executable cuda_add_executable(inference_cpp src/inference_cpp.cpp src/LayerNormPlugin.cu) target_link_libraries(inference_cpp ${NVINFER} ${NVINFER_PLUGIN} ${CUDA_LIBRARIES} ${OpenCV_LIBS}) ================================================ FILE: convnextv2/README.md ================================================ # ConvNeXtV2 TensorRT ## Environment - ubuntu20.04 - cuda11.8 - cudnn8.9.7 - TensorRT8.6.1.6 - OpenCV4.13 ## Support [ConvNext-V2](https://github.com/facebookresearch/ConvNeXt-V2.git) provides official pre-trained models such as ImageNet-1K fine-tuned models, ImageNet-22K fine-tuned models, and custom dataset classification models trained using these pre-trained weights.
## Build and Run `````` # Downloda dependencies pip install torch tensorrt pycuda numpy opencv-python # Generate .wts cd path-to-tensorrtx/convnextv2 python path-to-gen_wts.py path-to-pt path-to-wts # Build convnextv2 cmake -B build make -C build # Update config.yaml to match your selected model # Generate .engine ./build/convnextv2 path-to-wts path-to-engine # Inference(python) python path-to-inference.py path-to-engine path-to-your-image path-to-your-labels.txt # Inference(cpp) ./build/inference_cpp path-to-engine path-to-your-image path-to-your-labels.txt `````` ## More Information An interesting fact is that the suffix of the engine file can be arbitrarily specified; it does not need to be “engine”, and you can even use your own name as the suffix. ================================================ FILE: convnextv2/config.yaml ================================================ # ConvNeXtV2 Configuration # Model variants reference: # Atto: depths: [2, 2, 6, 2], dims: [40, 80, 160, 320] # Femto: depths: [2, 2, 6, 2], dims: [48, 96, 192, 384] # Pico: depths: [2, 2, 6, 2], dims: [64, 128, 256, 512] # Nano: depths: [2, 2, 8, 2], dims: [80, 160, 320, 640] # Tiny: depths: [3, 3, 9, 3], dims: [96, 192, 384, 768] # Base: depths: [3, 3, 27, 3], dims: [128, 256, 512, 1024] # Large: depths: [3, 3, 27, 3], dims: [192, 384, 768, 1536] # Huge: depths: [3, 3, 27, 3], dims: [352, 704, 1408, 2816] depths: [2, 2, 8, 2] dims: [80, 160, 320, 640] input_h: 224 input_w: 224 ================================================ FILE: convnextv2/gen_wts.py ================================================ import torch import struct def gen_wts(model_path, wts_path): print(f"Loading {model_path}...") try: data = torch.load(model_path, map_location='cpu') except FileNotFoundError: print(f"Error: {model_path} not found.") return if isinstance(data, dict) and 'model' in data: state_dict = data['model'] else: state_dict = data print(f"Exporting to {wts_path}...") # Infer architecture dims = [] depths 
= [0, 0, 0, 0] # Check dimensions from downsample layers # downsample_layers.0.0 is stem: conv set output to dim[0] # downsample_layers.1.0 is conv: dim[0] -> dim[1] # ... if 'downsample_layers.0.0.weight' in state_dict: dims.append(state_dict['downsample_layers.0.0.weight'].shape[0]) if 'downsample_layers.1.0.weight' in state_dict: dims.append(state_dict['downsample_layers.1.0.weight'].shape[0]) if 'downsample_layers.2.0.weight' in state_dict: dims.append(state_dict['downsample_layers.2.0.weight'].shape[0]) if 'downsample_layers.3.0.weight' in state_dict: dims.append(state_dict['downsample_layers.3.0.weight'].shape[0]) # Count blocks per stage for k in state_dict.keys(): if k.startswith('stages.'): parts = k.split('.') if len(parts) >= 3: stage_idx = int(parts[1]) block_idx = int(parts[2]) if stage_idx < 4: depths[stage_idx] = max(depths[stage_idx], block_idx + 1) print("Inferred Architecture:") print(f" Dims: {dims}") print(f" Depths: {depths}") with open(wts_path, 'w') as f: f.write(f"{len(state_dict)}\n") for k, v in state_dict.items(): vr = v.reshape(-1).cpu().numpy() f.write(f"{k} {len(vr)}") for val in vr: f.write(" ") f.write(struct.pack('>f', float(val)).hex()) f.write("\n") print("Done.") if __name__ == "__main__": import sys if len(sys.argv) != 3: print(f"Usage: python {sys.argv[0]} ") print(f"Example: python {sys.argv[0]} models/test.pt convnextv2.wts") sys.exit(1) pt_path = sys.argv[1] wts_path = sys.argv[2] gen_wts(pt_path, wts_path) ================================================ FILE: convnextv2/inference.py ================================================ import tensorrt as trt import pycuda.driver as cuda import pycuda.autoinit # noqa: F401 import numpy as np import cv2 import ctypes import os import sys def load_imagenet_labels(label_file="imagenet_classes.txt"): """Load ImageNet class labels""" if not os.path.exists(label_file): return None with open(label_file, 'r') as f: labels = [line.strip() for line in f.readlines()] return labels def 
main(engine_path, img_path, label_file="imagenet_classes.txt"): # Load plugin library so_file = os.path.abspath("./build/liblayernorm_plugin.so") if not os.path.exists(so_file): print(f"Plugin library not found: {so_file}") return ctypes.CDLL(so_file) TRT_LOGGER = trt.Logger(trt.Logger.WARNING) runtime = trt.Runtime(TRT_LOGGER) if not os.path.exists(engine_path): print(f"Engine file not found: {engine_path}") return with open(engine_path, "rb") as f: serialized_engine = f.read() engine = runtime.deserialize_cuda_engine(serialized_engine) if not engine: print("Failed to deserialize engine.") return context = engine.create_execution_context() # Get Input Shape from Engine input_shape = (224, 224) # Default for i in range(engine.num_bindings): if engine.binding_is_input(i): shape = engine.get_binding_shape(i) # shape is usually (N, C, H, W) or (C, H, W) if len(shape) == 4: input_shape = (shape[2], shape[3]) elif len(shape) == 3: input_shape = (shape[1], shape[2]) break # Prepare input img = cv2.imread(img_path) if img is None: print(f"Failed to load image: {img_path}") return img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB) img = cv2.resize(img, (input_shape[1], input_shape[0])) # cv2.resize takes (W, H) img = img.astype(np.float32) / 255.0 # ImageNet Mean/Std mean = np.array([0.485, 0.456, 0.406], dtype=np.float32) std = np.array([0.229, 0.224, 0.225], dtype=np.float32) img = (img - mean) / std img = img.transpose(2, 0, 1) # HWC -> CHW img = np.expand_dims(img, axis=0) # CHW -> NCHW img = np.ascontiguousarray(img) inputs, outputs, bindings, stream = [], [], [], cuda.Stream() for i in range(engine.num_bindings): dtype = trt.nptype(engine.get_binding_dtype(i)) shape = engine.get_binding_shape(i) # Handle dynamic shape or fixed # Check if input or output is_input = engine.binding_is_input(i) # Since we use explicit batch, shape[0] might be -1 or 1 # If -1, we set context binding shape if shape[0] == -1: shape = (1,) + shape[1:] context.set_binding_shape(i, shape) size = 
trt.volume(shape) * np.dtype(dtype).itemsize # Host memory host_mem = cuda.pagelocked_empty(trt.volume(shape), dtype) # Device memory device_mem = cuda.mem_alloc(size) bindings.append(int(device_mem)) if is_input: inputs.append({'host': host_mem, 'device': device_mem, 'shape': shape}) # Copy input data to host buffer np.copyto(host_mem, img.ravel()) else: outputs.append({'host': host_mem, 'device': device_mem, 'shape': shape}) # Inference # Transfer input data to the GPU. for inp in inputs: cuda.memcpy_htod_async(inp['device'], inp['host'], stream) # Run inference. context.execute_async_v2(bindings=bindings, stream_handle=stream.handle) # Transfer predictions back from the GPU. for out in outputs: cuda.memcpy_dtoh_async(out['host'], out['device'], stream) # Synchronize the stream stream.synchronize() # Process output labels = load_imagenet_labels(label_file) for out in outputs: output_data = out['host'] max_idx = np.argmax(output_data) max_val = output_data[max_idx] if labels: print(f"Predicted Class: {max_idx} - {labels[max_idx]} (Score: {max_val})") else: print(f"Predicted Class: {max_idx} (Score: {max_val})") if __name__ == "__main__": if len(sys.argv) < 3 or len(sys.argv) > 4: print(f"Usage: python {sys.argv[0]} [label_file]") print(f"Example: python {sys.argv[0]} convnextv2.engine images/test.jpg") print(f" python {sys.argv[0]} convnextv2.engine images/test.jpg custom_labels.txt") sys.exit(1) engine_path = sys.argv[1] img_path = sys.argv[2] label_file = sys.argv[3] if len(sys.argv) == 4 else "imagenet_classes.txt" main(engine_path, img_path, label_file) ================================================ FILE: convnextv2/src/LayerNormPlugin.cu ================================================ #include #include #include #include #include #include "LayerNormPlugin.h" using namespace nvinfer1; static const char* PLUGIN_NAME = "LayerNorm"; static const char* PLUGIN_VERSION = "1"; PluginFieldCollection LayerNormPluginCreator::mFC{}; std::vector 
LayerNormPluginCreator::mPluginAttributes; // Helper to check CUDA errors #define CHECK(status) \ do { \ auto ret = (status); \ if (ret != 0) { \ std::cerr << "Cuda failure: " << ret << " at line " << __LINE__ << std::endl; \ abort(); \ } \ } while (0) template __device__ inline T epsilon(); template <> __device__ inline float epsilon() { return 1e-6f; } template <> __device__ inline half epsilon() { return (half)1e-6f; } // --- Kernel --- // Supports hidden_size up to 1024 with TPB=256, VPT=4 template __global__ void layerNormKernel(const T* __restrict__ input, const T* __restrict__ gamma, const T* __restrict__ beta, T* __restrict__ output, int hidden_size, float eps) { // blockIdx.x corresponds to one instance (one row of hidden_size elements) int row_offset = blockIdx.x * hidden_size; // Load data float vals[VPT]; #pragma unroll for (int i = 0; i < VPT; ++i) { int col = threadIdx.x * VPT + i; if (col < hidden_size) { vals[i] = (float)input[row_offset + col]; } else { vals[i] = 0.0f; } } // Compute mean float thread_sum = 0.0f; #pragma unroll for (int i = 0; i < VPT; ++i) { if (threadIdx.x * VPT + i < hidden_size) thread_sum += vals[i]; } using BlockReduce = cub::BlockReduce; __shared__ typename BlockReduce::TempStorage temp_storage; float sum = BlockReduce(temp_storage).Sum(thread_sum); __shared__ float mean; if (threadIdx.x == 0) mean = sum / hidden_size; __syncthreads(); // Compute variance float thread_sq_diff = 0.0f; #pragma unroll for (int i = 0; i < VPT; ++i) { if (threadIdx.x * VPT + i < hidden_size) { float diff = vals[i] - mean; thread_sq_diff += diff * diff; } } float sq_diff_sum = BlockReduce(temp_storage).Sum(thread_sq_diff); __shared__ float inv_std; if (threadIdx.x == 0) { inv_std = rsqrtf((sq_diff_sum / hidden_size) + eps); } __syncthreads(); // Normalize and scale #pragma unroll for (int i = 0; i < VPT; ++i) { int col = threadIdx.x * VPT + i; if (col < hidden_size) { float val = (vals[i] - mean) * inv_std; float g = (float)gamma[col]; float b = 
(float)beta[col]; output[row_offset + col] = (T)(val * g + b); } } } // --- Plugin Implementation --- LayerNormPlugin::LayerNormPlugin(const std::string& name, float epsilon, int hidden_size) : mName(name), mEpsilon(epsilon), mHiddenSize(hidden_size) {} LayerNormPlugin::LayerNormPlugin(const std::string& name, const void* data, size_t length) : mName(name) { const char* d = static_cast(data); const char* a = d; mEpsilon = *reinterpret_cast(d); d += sizeof(float); mHiddenSize = *reinterpret_cast(d); d += sizeof(int); assert(d == a + length); } LayerNormPlugin::~LayerNormPlugin() {} IPluginV2DynamicExt* LayerNormPlugin::clone() const noexcept { auto p = new LayerNormPlugin(mName, mEpsilon, mHiddenSize); p->setPluginNamespace(mNamespace.c_str()); return p; } int32_t LayerNormPlugin::getNbOutputs() const noexcept { return 1; } DataType LayerNormPlugin::getOutputDataType(int32_t index, const DataType* inputTypes, int32_t nbInputs) const noexcept { return inputTypes[0]; } DimsExprs LayerNormPlugin::getOutputDimensions(int32_t outputIndex, const DimsExprs* inputs, int32_t nbInputs, IExprBuilder& exprBuilder) noexcept { return inputs[0]; } bool LayerNormPlugin::supportsFormatCombination(int32_t pos, const PluginTensorDesc* inOut, int32_t nbInputs, int32_t nbOutputs) noexcept { if (pos == 0) { // Input return (inOut[0].type == DataType::kFLOAT || inOut[0].type == DataType::kHALF) && inOut[0].format == TensorFormat::kLINEAR; } if (pos == 1 || pos == 2) { // Gamma, Beta return inOut[pos].type == inOut[0].type && inOut[pos].format == TensorFormat::kLINEAR; } if (pos == 3) { // Output return inOut[pos].type == inOut[0].type && inOut[pos].format == TensorFormat::kLINEAR; } return false; } void LayerNormPlugin::configurePlugin(const DynamicPluginTensorDesc* in, int32_t nbInputs, const DynamicPluginTensorDesc* out, int32_t nbOutputs) noexcept { // Validate inputs mHiddenSize = in[0].desc.dims.d[in[0].desc.dims.nbDims - 1]; } size_t LayerNormPlugin::getWorkspaceSize(const 
PluginTensorDesc* inputs, int32_t nbInputs, const PluginTensorDesc* outputs, int32_t nbOutputs) const noexcept { return 0; } int32_t LayerNormPlugin::enqueue(const PluginTensorDesc* inputDesc, const PluginTensorDesc* outputDesc, const void* const* inputs, void* const* outputs, void* workspace, cudaStream_t stream) noexcept { int total = 1; for (int i = 0; i < inputDesc[0].dims.nbDims; ++i) total *= inputDesc[0].dims.d[i]; int rows = total / mHiddenSize; if (inputDesc[0].type == DataType::kFLOAT) { layerNormKernel<<>>((const float*)inputs[0], (const float*)inputs[1], (const float*)inputs[2], (float*)outputs[0], mHiddenSize, mEpsilon); } else { layerNormKernel<<>>((const half*)inputs[0], (const half*)inputs[1], (const half*)inputs[2], (half*)outputs[0], mHiddenSize, mEpsilon); } return 0; } const char* LayerNormPlugin::getPluginType() const noexcept { return PLUGIN_NAME; } const char* LayerNormPlugin::getPluginVersion() const noexcept { return PLUGIN_VERSION; } void LayerNormPlugin::destroy() noexcept { delete this; } int32_t LayerNormPlugin::initialize() noexcept { return 0; } void LayerNormPlugin::terminate() noexcept {} size_t LayerNormPlugin::getSerializationSize() const noexcept { return sizeof(float) + sizeof(int); } void LayerNormPlugin::serialize(void* buffer) const noexcept { char* d = static_cast(buffer); *reinterpret_cast(d) = mEpsilon; d += sizeof(float); *reinterpret_cast(d) = mHiddenSize; d += sizeof(int); } void LayerNormPlugin::setPluginNamespace(const char* libNamespace) noexcept { mNamespace = libNamespace; } const char* LayerNormPlugin::getPluginNamespace() const noexcept { return mNamespace.c_str(); } // --- Creator Implementation --- LayerNormPluginCreator::LayerNormPluginCreator() { mPluginAttributes.emplace_back(PluginField("epsilon", nullptr, PluginFieldType::kFLOAT32, 1)); mFC.nbFields = mPluginAttributes.size(); mFC.fields = mPluginAttributes.data(); } LayerNormPluginCreator::~LayerNormPluginCreator() {} const char* 
LayerNormPluginCreator::getPluginName() const noexcept { return PLUGIN_NAME; } const char* LayerNormPluginCreator::getPluginVersion() const noexcept { return PLUGIN_VERSION; } const PluginFieldCollection* LayerNormPluginCreator::getFieldNames() noexcept { return &mFC; } IPluginV2* LayerNormPluginCreator::createPlugin(const char* name, const PluginFieldCollection* fc) noexcept { float epsilon = 1e-6f; for (int i = 0; i < fc->nbFields; ++i) { if (strcmp(fc->fields[i].name, "epsilon") == 0) { epsilon = *static_cast(fc->fields[i].data); } } return new LayerNormPlugin(name, epsilon, 0); // hidden_size will be set in configure } IPluginV2* LayerNormPluginCreator::deserializePlugin(const char* name, const void* serialData, size_t serialLength) noexcept { return new LayerNormPlugin(name, serialData, serialLength); } void LayerNormPluginCreator::setPluginNamespace(const char* libNamespace) noexcept { mNamespace = libNamespace; } const char* LayerNormPluginCreator::getPluginNamespace() const noexcept { return mNamespace.c_str(); } REGISTER_TENSORRT_PLUGIN(LayerNormPluginCreator); ================================================ FILE: convnextv2/src/LayerNormPlugin.h ================================================ #ifndef LAYER_NORM_PLUGIN_H #define LAYER_NORM_PLUGIN_H #include #include #include using namespace nvinfer1; class LayerNormPlugin : public IPluginV2DynamicExt { public: LayerNormPlugin(const std::string& name, float epsilon, int hidden_size); LayerNormPlugin(const std::string& name, const void* data, size_t length); LayerNormPlugin() = delete; ~LayerNormPlugin() override; // IPluginV2DynamicExt Methods IPluginV2DynamicExt* clone() const noexcept override; int32_t getNbOutputs() const noexcept override; DataType getOutputDataType(int32_t index, const DataType* inputTypes, int32_t nbInputs) const noexcept override; DimsExprs getOutputDimensions(int32_t outputIndex, const DimsExprs* inputs, int32_t nbInputs, IExprBuilder& exprBuilder) noexcept override; bool 
supportsFormatCombination(int32_t pos, const PluginTensorDesc* inOut, int32_t nbInputs, int32_t nbOutputs) noexcept override; void configurePlugin(const DynamicPluginTensorDesc* in, int32_t nbInputs, const DynamicPluginTensorDesc* out, int32_t nbOutputs) noexcept override; size_t getWorkspaceSize(const PluginTensorDesc* inputs, int32_t nbInputs, const PluginTensorDesc* outputs, int32_t nbOutputs) const noexcept override; int32_t enqueue(const PluginTensorDesc* inputDesc, const PluginTensorDesc* outputDesc, const void* const* inputs, void* const* outputs, void* workspace, cudaStream_t stream) noexcept override; // IPluginV2 Methods const char* getPluginType() const noexcept override; const char* getPluginVersion() const noexcept override; void destroy() noexcept override; int32_t initialize() noexcept override; void terminate() noexcept override; size_t getSerializationSize() const noexcept override; void serialize(void* buffer) const noexcept override; void setPluginNamespace(const char* pluginNamespace) noexcept override; const char* getPluginNamespace() const noexcept override; private: std::string mName; std::string mNamespace; float mEpsilon; int mHiddenSize; // Number of channels }; class LayerNormPluginCreator : public IPluginCreator { public: LayerNormPluginCreator(); ~LayerNormPluginCreator() override; const char* getPluginName() const noexcept override; const char* getPluginVersion() const noexcept override; const PluginFieldCollection* getFieldNames() noexcept override; IPluginV2* createPlugin(const char* name, const PluginFieldCollection* fc) noexcept override; IPluginV2* deserializePlugin(const char* name, const void* serialData, size_t serialLength) noexcept override; void setPluginNamespace(const char* pluginNamespace) noexcept override; const char* getPluginNamespace() const noexcept override; private: static PluginFieldCollection mFC; static std::vector mPluginAttributes; std::string mNamespace; }; #endif // LAYER_NORM_PLUGIN_H 
================================================ FILE: convnextv2/src/convnextv2.cpp ================================================ #include #include #include #include #include #include #include #include #include #include "LayerNormPlugin.h" #include "NvInfer.h" #include "logging.h" static const char* INPUT_BLOB_NAME = "data"; static const char* OUTPUT_BLOB_NAME = "output"; struct ConvNextConfig { int depths[4]; int dims[4]; int input_h; int input_w; }; // Simple parser for YAML-like config (key: [v1, v2..] or key: value) ConvNextConfig loadConfig(const std::string& configPath) { ConvNextConfig cfg; // Default to Nano cfg.depths[0] = 2; cfg.depths[1] = 2; cfg.depths[2] = 8; cfg.depths[3] = 2; cfg.dims[0] = 80; cfg.dims[1] = 160; cfg.dims[2] = 320; cfg.dims[3] = 640; cfg.input_h = 224; cfg.input_w = 224; std::ifstream file(configPath); if (!file.is_open()) { std::cerr << "Warning: Could not open config file " << configPath << ". Using default Nano config." << std::endl; return cfg; } std::string line; while (std::getline(file, line)) { if (line.empty() || line[0] == '#') continue; std::stringstream ss(line); std::string key; std::getline(ss, key, ':'); // Trim key key.erase(0, key.find_first_not_of(" \t")); key.erase(key.find_last_not_of(" \t") + 1); if (key == "depths" || key == "dims") { // format: [v1, v2, v3, v4] std::string valStr; std::getline(ss, valStr); // Simple parse: remove [ ] and split by , size_t start = valStr.find('['); size_t end = valStr.find(']'); if (start != std::string::npos && end != std::string::npos) { std::string nums = valStr.substr(start + 1, end - start - 1); std::stringstream ssNums(nums); std::string segment; int idx = 0; while (std::getline(ssNums, segment, ',') && idx < 4) { if (key == "depths") cfg.depths[idx++] = std::stoi(segment); else cfg.dims[idx++] = std::stoi(segment); } } } else if (key == "input_h") { int val; ss >> val; cfg.input_h = val; } else if (key == "input_w") { int val; ss >> val; cfg.input_w = val; } } 
std::cout << "Loaded Config - Depths: [" << cfg.depths[0] << "," << cfg.depths[1] << "," << cfg.depths[2] << "," << cfg.depths[3] << "]" << " Dims: [" << cfg.dims[0] << "," << cfg.dims[1] << "," << cfg.dims[2] << "," << cfg.dims[3] << "]" << " Input: " << cfg.input_h << "x" << cfg.input_w << std::endl; return cfg; } // Global config static ConvNextConfig g_config; // Macros/Consts replaced by g_config members #define DEPTHS g_config.depths #define DIMS g_config.dims #define INPUT_H g_config.input_h #define INPUT_W g_config.input_w using namespace nvinfer1; static Logger gLogger; // Global variables for paths std::string g_wts_path = "convnextv2.wts"; std::string g_engine_path = "convnextv2.engine"; // Weights utils std::map loadWeights(const std::string& file) { std::cout << "Loading weights: " << file << std::endl; std::map weightMap; std::ifstream input(file); assert(input.is_open() && "Unable to load weight file."); int32_t count; input >> count; assert(count > 0 && "Invalid weight map file."); while (count--) { Weights wt{DataType::kFLOAT, nullptr, 0}; uint32_t size; std::string name; input >> name >> std::dec >> size; wt.type = DataType::kFLOAT; uint32_t* val = new uint32_t[size]; for (uint32_t x = 0, y = size; x < y; ++x) { input >> std::hex >> val[x]; } wt.values = val; wt.count = size; weightMap[name] = wt; } return weightMap; } IScaleLayer* addBatchNorm2d(INetworkDefinition* network, ITensor& input, std::map& weightMap, std::string name, float eps) { float* gamma = (float*)weightMap[name + ".weight"].values; float* beta = (float*)weightMap[name + ".bias"].values; float* mean = (float*)weightMap[name + ".running_mean"].values; float* var = (float*)weightMap[name + ".running_var"].values; int len = weightMap[name + ".running_var"].count; float* scval = new float[len]; float* shval = new float[len]; float* pval = new float[len]; for (int i = 0; i < len; i++) { scval[i] = gamma[i] / sqrt(var[i] + eps); shval[i] = beta[i] - mean[i] * gamma[i] / sqrt(var[i] + 
eps); pval[i] = 1.0; } Weights wsc{DataType::kFLOAT, scval, len}; Weights wsh{DataType::kFLOAT, shval, len}; Weights wpower{DataType::kFLOAT, pval, len}; IScaleLayer* scale = network->addScale(input, ScaleMode::kCHANNEL, wsh, wsc, wpower); assert(scale); return scale; } ITensor* convNextBlock(INetworkDefinition* network, ITensor* input, int dim, std::string name, std::map& weightMap) { // Input is NCHW // 1. DWConv 7x7 Weights empty{DataType::kFLOAT, nullptr, 0}; IConvolutionLayer* dwconv = network->addConvolutionNd(*input, dim, DimsHW{7, 7}, weightMap[name + ".dwconv.weight"], weightMap[name + ".dwconv.bias"]); assert(dwconv); dwconv->setStrideNd(DimsHW{1, 1}); dwconv->setPaddingNd(DimsHW{3, 3}); dwconv->setNbGroups(dim); ITensor* x = dwconv->getOutput(0); // 2. Permute NCHW -> NHWC for LayerNorm IShuffleLayer* p1 = network->addShuffle(*x); p1->setSecondTranspose({0, 2, 3, 1}); x = p1->getOutput(0); // 3. LayerNorm (Plugin) auto creator = getPluginRegistry()->getPluginCreator("LayerNorm", "1"); PluginFieldCollection pfc; float eps = 1e-6f; PluginField pf("epsilon", &eps, PluginFieldType::kFLOAT32, 1); pfc.nbFields = 1; pfc.fields = &pf; IPluginV2* plugin = creator->createPlugin(name.c_str(), &pfc); // Pass gamma/beta (1D of size C) as plugin inputs along with x (N,H,W,C) auto w_ln_w = weightMap[name + ".norm.weight"]; auto w_ln_b = weightMap[name + ".norm.bias"]; IConstantLayer* c_gamma = network->addConstant(Dims{1, {w_ln_w.count}}, w_ln_w); IConstantLayer* c_beta = network->addConstant(Dims{1, {w_ln_b.count}}, w_ln_b); ITensor* inputs[] = {x, c_gamma->getOutput(0), c_beta->getOutput(0)}; IPluginV2Layer* ln = network->addPluginV2(inputs, 3, *plugin); x = ln->getOutput(0); // 4. Permute NHWC -> NCHW IShuffleLayer* p2 = network->addShuffle(*x); p2->setSecondTranspose({0, 3, 1, 2}); x = p2->getOutput(0); // 5. 
PWConv1 (1x1) IConvolutionLayer* pw1 = network->addConvolutionNd(*x, 4 * dim, DimsHW{1, 1}, weightMap[name + ".pwconv1.weight"], weightMap[name + ".pwconv1.bias"]); x = pw1->getOutput(0); // 6. GELU // Manual GELU implementation: 0.5 * x * (1 + erf(x / sqrt(2))) float* sqrt2_inv = new float[1]; *sqrt2_inv = 1.0f / std::sqrt(2.0f); Weights w_sqrt2{DataType::kFLOAT, sqrt2_inv, 1}; IConstantLayer* c_sqrt2 = network->addConstant(Dims4{1, 1, 1, 1}, w_sqrt2); // Broadcast IElementWiseLayer* div = network->addElementWise(*x, *c_sqrt2->getOutput(0), ElementWiseOperation::kPROD); IUnaryLayer* erf = network->addUnary(*div->getOutput(0), UnaryOperation::kERF); float* one = new float[1]; *one = 1.0f; Weights w_one{DataType::kFLOAT, one, 1}; IConstantLayer* c_one = network->addConstant(Dims4{1, 1, 1, 1}, w_one); IElementWiseLayer* add_erf = network->addElementWise(*erf->getOutput(0), *c_one->getOutput(0), ElementWiseOperation::kSUM); float* half = new float[1]; *half = 0.5f; Weights w_half{DataType::kFLOAT, half, 1}; IConstantLayer* c_half = network->addConstant(Dims4{1, 1, 1, 1}, w_half); IElementWiseLayer* mul_half = network->addElementWise(*x, *c_half->getOutput(0), ElementWiseOperation::kPROD); IElementWiseLayer* gelu = network->addElementWise(*mul_half->getOutput(0), *add_erf->getOutput(0), ElementWiseOperation::kPROD); x = gelu->getOutput(0); // 7. GRN (implemented in NCHW). 
X shape: [N, 4*dim, H, W], gx -> [N, C, 1, 1] // x*x IElementWiseLayer* sq = network->addElementWise(*x, *x, ElementWiseOperation::kPROD); ITensor* x_sq = sq->getOutput(0); // Sum over H,W (axes 2, 3 = 4 | 8 = 12) IReduceLayer* red_sum = network->addReduce(*x_sq, ReduceOperation::kSUM, 12, true); ITensor* sum_x = red_sum->getOutput(0); // Sqrt IUnaryLayer* sqrt_layer = network->addUnary(*sum_x, UnaryOperation::kSQRT); ITensor* gx = sqrt_layer->getOutput(0); // [N, C, 1, 1] // Normalize GRN: nx = gx / (mean(gx, dim=1) + eps) // Mean over C (axis 1) IReduceLayer* red_mean = network->addReduce(*gx, ReduceOperation::kAVG, 2, true); // bit 1 set -> axis 1 ITensor* mean_gx = red_mean->getOutput(0); // [N, 1, 1, 1] // Add eps float eps_val = 1e-6f; Weights w_eps{DataType::kFLOAT, &eps_val, 1}; // Creating scalar constant [1,1,1,1] float* eps_ptr = new float[1]; eps_ptr[0] = 1e-6f; Weights eps_w{DataType::kFLOAT, eps_ptr, 1}; IConstantLayer* c_eps = network->addConstant(Dims4{1, 1, 1, 1}, eps_w); IElementWiseLayer* add_eps = network->addElementWise(*mean_gx, *c_eps->getOutput(0), ElementWiseOperation::kSUM); ITensor* denom = add_eps->getOutput(0); // Div IElementWiseLayer* div_grn = network->addElementWise(*gx, *denom, ElementWiseOperation::kDIV); ITensor* nx = div_grn->getOutput(0); // [N, C, 1, 1] // Scale X by nx IElementWiseLayer* scale_x = network->addElementWise(*x, *nx, ElementWiseOperation::kPROD); ITensor* x_norm = scale_x->getOutput(0); // Apply Gamma/Beta for GRN (channel-wise scale) then add residual from GELU input Weights w_grn_g = weightMap[name + ".grn.gamma"]; Weights w_grn_b = weightMap[name + ".grn.beta"]; Weights w_power{DataType::kFLOAT, nullptr, 0}; IScaleLayer* grn_scale = network->addScale(*x_norm, ScaleMode::kCHANNEL, w_grn_b, w_grn_g, w_power); x = grn_scale->getOutput(0); // Residual: x = grn_scaled + gelu_output ITensor* x_in = gelu->getOutput(0); IElementWiseLayer* add_grn = network->addElementWise(*x, *x_in, ElementWiseOperation::kSUM); x = 
add_grn->getOutput(0); // 8. PWConv2 (1x1) IConvolutionLayer* pw2 = network->addConvolutionNd(*x, dim, DimsHW{1, 1}, weightMap[name + ".pwconv2.weight"], weightMap[name + ".pwconv2.bias"]); x = pw2->getOutput(0); // 9. DropPath (Ignored in inference) // 10. Residual IElementWiseLayer* res = network->addElementWise(*input, *x, ElementWiseOperation::kSUM); return res->getOutput(0); } ICudaEngine* createEngine(unsigned int maxBatchSize, IBuilder* builder, IBuilderConfig* config, DataType dt) { const auto explicitBatch = 1U << static_cast(NetworkDefinitionCreationFlag::kEXPLICIT_BATCH); INetworkDefinition* network = builder->createNetworkV2(explicitBatch); // Create input ITensor* data = network->addInput(INPUT_BLOB_NAME, dt, Dims4{maxBatchSize, 3, INPUT_H, INPUT_W}); assert(data); // Load weights from the path provided via command line (g_wts_path) std::map weightMap = loadWeights(g_wts_path); // Initialize Stem // downsample_layers.0: Conv 4x4, s=4 -> LN // Conv IConvolutionLayer* conv0 = network->addConvolutionNd(*data, DIMS[0], DimsHW{4, 4}, weightMap["downsample_layers.0.0.weight"], weightMap["downsample_layers.0.0.bias"]); assert(conv0); conv0->setStrideNd(DimsHW{4, 4}); ITensor* x = conv0->getOutput(0); // LN // Transpose to NHWC IShuffleLayer* p0 = network->addShuffle(*x); p0->setSecondTranspose({0, 2, 3, 1}); x = p0->getOutput(0); // Plugin auto creator = getPluginRegistry()->getPluginCreator("LayerNorm", "1"); PluginFieldCollection pfc; float eps = 1e-6f; PluginField pf("epsilon", &eps, PluginFieldType::kFLOAT32, 1); pfc.nbFields = 1; pfc.fields = &pf; IPluginV2* plugin = creator->createPlugin("stem_ln", &pfc); auto w_ln0_w = weightMap["downsample_layers.0.1.weight"]; auto w_ln0_b = weightMap["downsample_layers.0.1.bias"]; IConstantLayer* c_g0 = network->addConstant(Dims{1, {w_ln0_w.count}}, w_ln0_w); IConstantLayer* c_b0 = network->addConstant(Dims{1, {w_ln0_b.count}}, w_ln0_b); ITensor* in0[] = {x, c_g0->getOutput(0), c_b0->getOutput(0)}; IPluginV2Layer* 
ln0 = network->addPluginV2(in0, 3, *plugin); x = ln0->getOutput(0); // Transpose back IShuffleLayer* p0_back = network->addShuffle(*x); p0_back->setSecondTranspose({0, 3, 1, 2}); x = p0_back->getOutput(0); // Stages for (int i = 0; i < 4; i++) { // Downsample layer (except first stage which is stem) if (i > 0) { std::string ds_name = "downsample_layers." + std::to_string(i); // LN -> Conv 2x2 s=2 // LN (NHWC) IShuffleLayer* p_ds = network->addShuffle(*x); p_ds->setSecondTranspose({0, 2, 3, 1}); x = p_ds->getOutput(0); auto creator = getPluginRegistry()->getPluginCreator("LayerNorm", "1"); PluginFieldCollection pfc_ds; float eps_ds = 1e-6f; PluginField pf_ds("epsilon", &eps_ds, PluginFieldType::kFLOAT32, 1); pfc_ds.nbFields = 1; pfc_ds.fields = &pf_ds; IPluginV2* plugin_ds = creator->createPlugin((ds_name + "_ln").c_str(), &pfc_ds); auto w_ds_w = weightMap[ds_name + ".0.weight"]; auto w_ds_b = weightMap[ds_name + ".0.bias"]; IConstantLayer* c_ds_g = network->addConstant(Dims{1, {w_ds_w.count}}, w_ds_w); IConstantLayer* c_ds_b = network->addConstant(Dims{1, {w_ds_b.count}}, w_ds_b); ITensor* in_ds[] = {x, c_ds_g->getOutput(0), c_ds_b->getOutput(0)}; IPluginV2Layer* ln_ds = network->addPluginV2(in_ds, 3, *plugin_ds); x = ln_ds->getOutput(0); IShuffleLayer* p_ds_back = network->addShuffle(*x); p_ds_back->setSecondTranspose({0, 3, 1, 2}); x = p_ds_back->getOutput(0); // Conv 2x2, s=2 IConvolutionLayer* conv_ds = network->addConvolutionNd( *x, DIMS[i], DimsHW{2, 2}, weightMap[ds_name + ".1.weight"], weightMap[ds_name + ".1.bias"]); conv_ds->setStrideNd(DimsHW{2, 2}); x = conv_ds->getOutput(0); } // Blocks for (int j = 0; j < DEPTHS[i]; j++) { std::string block_name = "stages." + std::to_string(i) + "." 
+ std::to_string(j); x = convNextBlock(network, x, DIMS[i], block_name, weightMap); } } // Final Norm (Global Avg Pooling -> LayerNorm -> Head) // Global Avg Pooling IReduceLayer* gap = network->addReduce(*x, ReduceOperation::kAVG, 12, true); // sum H,W (indices 2,3) x = gap->getOutput(0); // [N, C, 1, 1] // Reshape to [N,1,1,C] so LayerNorm plugin sees channels as last dimension IShuffleLayer* p_fin = network->addShuffle(*x); p_fin->setReshapeDimensions(Dims4{maxBatchSize, 1, 1, DIMS[3]}); x = p_fin->getOutput(0); auto creator_fin = getPluginRegistry()->getPluginCreator("LayerNorm", "1"); PluginFieldCollection pfc_fin; float eps_fin = 1e-6f; PluginField pf_fin("epsilon", &eps_fin, PluginFieldType::kFLOAT32, 1); pfc_fin.nbFields = 1; pfc_fin.fields = &pf_fin; IPluginV2* plugin_fin = creator_fin->createPlugin("final_norm", &pfc_fin); // norm.weight / norm.bias auto w_fn_w = weightMap["norm.weight"]; auto w_fn_b = weightMap["norm.bias"]; IConstantLayer* c_fn_g = network->addConstant(Dims{1, {w_fn_w.count}}, w_fn_w); IConstantLayer* c_fn_b = network->addConstant(Dims{1, {w_fn_b.count}}, w_fn_b); ITensor* in_fn[] = {x, c_fn_g->getOutput(0), c_fn_b->getOutput(0)}; IPluginV2Layer* ln_fn = network->addPluginV2(in_fn, 3, *plugin_fin); x = ln_fn->getOutput(0); // Reshape back to [N, C, 1, 1] for 1x1 conv. 
IShuffleLayer* p_fin_b = network->addShuffle(*x); p_fin_b->setReshapeDimensions(Dims4{maxBatchSize, DIMS[3], 1, 1}); x = p_fin_b->getOutput(0); Weights head_w = weightMap["head.weight"]; Weights head_b = weightMap["head.bias"]; // Check num classes int num_classes = head_w.count / DIMS[3]; IConvolutionLayer* head = network->addConvolutionNd(*x, num_classes, DimsHW{1, 1}, head_w, head_b); x = head->getOutput(0); x->setName(OUTPUT_BLOB_NAME); network->markOutput(*x); // Build engine builder->setMaxBatchSize(maxBatchSize); // Workspace size configured below depending on TRT version #if (NV_TENSORRT_MAJOR * 10 + NV_TENSORRT_MINOR) >= 86 // setMemoryPoolLimit config->setMemoryPoolLimit(MemoryPoolType::kWORKSPACE, 1U << 30); // 1GB #else config->setMaxWorkspaceSize(1 << 30); // 1GB #endif ICudaEngine* engine = builder->buildEngineWithConfig(*network, *config); delete network; return engine; } void APIToModel(unsigned int maxBatchSize, IHostMemory** modelStream) { IBuilder* builder = createInferBuilder(gLogger); IBuilderConfig* config = builder->createBuilderConfig(); ICudaEngine* engine = createEngine(maxBatchSize, builder, config, DataType::kFLOAT); assert(engine != nullptr); (*modelStream) = engine->serialize(); engine->destroy(); config->destroy(); builder->destroy(); } void inference(const std::string& engine_file, const std::string& image_file) { std::cout << "Running inference..." 
<< std::endl; std::ifstream file(engine_file, std::ios::binary); if (!file.good()) { std::cerr << "Error: Engine file not found" << std::endl; return; } file.seekg(0, file.end); size_t size = file.tellg(); file.seekg(0, file.beg); char* trtModelStream = new char[size]; assert(trtModelStream); file.read(trtModelStream, size); file.close(); IRuntime* runtime = createInferRuntime(gLogger); assert(runtime != nullptr); ICudaEngine* engine = runtime->deserializeCudaEngine(trtModelStream, size); assert(engine != nullptr); IExecutionContext* context = engine->createExecutionContext(); assert(context != nullptr); delete[] trtModelStream; // Load image cv::Mat img = cv::imread(image_file); if (img.empty()) { std::cerr << "Error: Image not found" << std::endl; return; } cv::resize(img, img, cv::Size(INPUT_W, INPUT_H)); img.convertTo(img, CV_32F); // Normalize (Mean [0.485, 0.456, 0.406], Std [0.229, 0.224, 0.225]) // OpenCV is BGR. Pytorch expects RGB. cv::cvtColor(img, img, cv::COLOR_BGR2RGB); img /= 255.0; float mean[] = {0.485, 0.456, 0.406}; float std[] = {0.229, 0.224, 0.225}; // HWC -> NCHW and Normalize float* hostData = new float[3 * INPUT_H * INPUT_W]; for (int h = 0; h < INPUT_H; ++h) { for (int w = 0; w < INPUT_W; ++w) { for (int c = 0; c < 3; ++c) { float val = img.at(h, w)[c]; hostData[c * INPUT_H * INPUT_W + h * INPUT_W + w] = (val - mean[c]) / std[c]; } } } void* deviceData; cudaMalloc(&deviceData, 3 * INPUT_H * INPUT_W * sizeof(float)); cudaMemcpy(deviceData, hostData, 3 * INPUT_H * INPUT_W * sizeof(float), cudaMemcpyHostToDevice); // Output buffer // Determine output size. 
int outputSize = 1000; // Default ImageNet // Check binding dimensions int outputIndex = engine->getBindingIndex(OUTPUT_BLOB_NAME); Dims outDims = engine->getBindingDimensions(outputIndex); // outputSize = outDims.d[1]; float* hostOutput = new float[outputSize]; void* deviceOutput; cudaMalloc(&deviceOutput, outputSize * sizeof(float)); void* bindings[] = {deviceData, deviceOutput}; // Execute context->executeV2(bindings); // Copy back cudaMemcpy(hostOutput, deviceOutput, outputSize * sizeof(float), cudaMemcpyDeviceToHost); // Softmax and Argmax float maxVal = -1e9; int maxIdx = -1; for (int i = 0; i < outputSize; ++i) { if (hostOutput[i] > maxVal) { maxVal = hostOutput[i]; maxIdx = i; } } std::cout << "Predicted Class: " << maxIdx << " (Score: " << maxVal << ")" << std::endl; cudaFree(deviceData); cudaFree(deviceOutput); delete[] hostData; delete[] hostOutput; delete context; delete engine; delete runtime; } int main(int argc, char** argv) { if (argc < 3) { std::cerr << "Usage: " << argv[0] << " [config_path]" << std::endl; std::cerr << "Example: " << argv[0] << " convnextv2.wts convnextv2.engine config.yaml" << std::endl; return -1; } g_wts_path = argv[1]; g_engine_path = argv[2]; std::string config_path = (argc >= 4) ? 
argv[3] : "config.yaml"; g_config = loadConfig(config_path); // Register Plugin manually if needed auto* lnCreator = new LayerNormPluginCreator(); getPluginRegistry()->registerCreator(*lnCreator, ""); // Generate engine IHostMemory* modelStream{nullptr}; APIToModel(1, &modelStream); assert(modelStream != nullptr); std::ofstream p(g_engine_path, std::ios::binary); if (!p) { std::cerr << "Could not open plan output file" << std::endl; return -1; } p.write(reinterpret_cast(modelStream->data()), modelStream->size()); modelStream->destroy(); std::cout << "Engine generated successfully: " << g_engine_path << std::endl; return 0; } ================================================ FILE: convnextv2/src/inference_cpp.cpp ================================================ #include #include #include #include #include #include "LayerNormPlugin.h" #include "NvInfer.h" #include "logging.h" using namespace nvinfer1; static Logger gLogger; std::vector load_imagenet_labels(const std::string& label_file = "imagenet_classes.txt") { std::vector labels; std::ifstream file(label_file); if (!file.is_open()) { return labels; } std::string line; while (std::getline(file, line)) { labels.push_back(line); } return labels; } static const char* INPUT_BLOB_NAME = "data"; static const char* OUTPUT_BLOB_NAME = "prob"; void inference(const std::string& engine_file, const std::string& image_file, const std::string& label_file = "imagenet_classes.txt") { std::cout << "Running inference..." 
<< std::endl; // Register LayerNorm plugin static LayerNormPluginCreator pluginCreator; getPluginRegistry()->registerCreator(pluginCreator, ""); std::ifstream file(engine_file, std::ios::binary); if (!file.good()) { std::cerr << "Error: Engine file not found: " << engine_file << std::endl; return; } file.seekg(0, file.end); size_t size = file.tellg(); file.seekg(0, file.beg); char* trtModelStream = new char[size]; assert(trtModelStream); file.read(trtModelStream, size); file.close(); IRuntime* runtime = createInferRuntime(gLogger); assert(runtime != nullptr); ICudaEngine* engine = runtime->deserializeCudaEngine(trtModelStream, size); assert(engine != nullptr); IExecutionContext* context = engine->createExecutionContext(); assert(context != nullptr); delete[] trtModelStream; // Determine dimensions from engine int inputIndex = -1; int outputIndex = -1; for (int i = 0; i < engine->getNbBindings(); ++i) { if (engine->bindingIsInput(i)) { inputIndex = i; } else { outputIndex = i; } } if (inputIndex == -1 || outputIndex == -1) { std::cerr << "Error: Could not find input or output bindings in engine." 
<< std::endl; return; } Dims inputDims = engine->getBindingDimensions(inputIndex); Dims outputDims = engine->getBindingDimensions(outputIndex); // Assuming NCHW format for input int input_h = inputDims.d[2]; int input_w = inputDims.d[3]; int input_c = inputDims.d[1]; // Usually 3 // Assuming N x NumClasses or just NumClasses int outputSize = 1; for (int i = 0; i < outputDims.nbDims; ++i) { // Skip batch dimension if it is dynamic (-1) or 1 if (i == 0 && (outputDims.d[i] == -1 || outputDims.d[i] == 1)) continue; outputSize *= outputDims.d[i]; } std::cout << "Input Dimensions: " << input_c << "x" << input_h << "x" << input_w << std::endl; std::cout << "Output Size: " << outputSize << std::endl; // Load image cv::Mat img = cv::imread(image_file); if (img.empty()) { std::cerr << "Error: Image not found: " << image_file << std::endl; return; } cv::resize(img, img, cv::Size(input_w, input_h)); img.convertTo(img, CV_32F); // Normalize (Mean [0.485, 0.456, 0.406], Std [0.229, 0.224, 0.225]) // OpenCV is BGR. Pytorch expects RGB. 
cv::cvtColor(img, img, cv::COLOR_BGR2RGB); img /= 255.0; float mean[] = {0.485, 0.456, 0.406}; float std[] = {0.229, 0.224, 0.225}; // HWC -> NCHW and Normalize float* hostData = new float[input_c * input_h * input_w]; for (int h = 0; h < input_h; ++h) { for (int w = 0; w < input_w; ++w) { for (int c = 0; c < input_c; ++c) { float val = img.at(h, w)[c]; hostData[c * input_h * input_w + h * input_w + w] = (val - mean[c]) / std[c]; } } } void* deviceData; cudaMalloc(&deviceData, input_c * input_h * input_w * sizeof(float)); cudaMemcpy(deviceData, hostData, input_c * input_h * input_w * sizeof(float), cudaMemcpyHostToDevice); // Output buffer float* hostOutput = new float[outputSize]; void* deviceOutput; cudaMalloc(&deviceOutput, outputSize * sizeof(float)); void* bindings[] = {deviceData, deviceOutput}; if (engine->getBindingIndex(INPUT_BLOB_NAME) != 0) { bindings[inputIndex] = deviceData; bindings[outputIndex] = deviceOutput; } // Execute context->executeV2(bindings); // Copy back cudaMemcpy(hostOutput, deviceOutput, outputSize * sizeof(float), cudaMemcpyDeviceToHost); // Argmax float maxVal = -1e9; int maxIdx = -1; for (int i = 0; i < outputSize; ++i) { if (hostOutput[i] > maxVal) { maxVal = hostOutput[i]; maxIdx = i; } } auto labels = load_imagenet_labels(label_file); if (!labels.empty() && maxIdx < static_cast(labels.size())) { std::cout << "Predicted Class: " << maxIdx << " - " << labels[maxIdx] << " (Score: " << maxVal << ")" << std::endl; } else { std::cout << "Predicted Class: " << maxIdx << " (Score: " << maxVal << ")" << std::endl; } cudaFree(deviceData); cudaFree(deviceOutput); delete[] hostData; delete[] hostOutput; delete context; delete engine; delete runtime; } int main(int argc, char** argv) { if (argc < 3 || argc > 4) { std::cerr << "Usage: " << argv[0] << " [label_file]" << std::endl; std::cerr << "Example: " << argv[0] << " convnextv2.engine images/test.jpg" << std::endl; std::cerr << " " << argv[0] << " convnextv2.engine images/test.jpg 
custom_labels.txt" << std::endl; return -1; } std::string engine_path = argv[1]; std::string image_path = argv[2]; std::string label_file = (argc == 4) ? argv[3] : "imagenet_classes.txt"; inference(engine_path, image_path, label_file); return 0; } ================================================ FILE: convnextv2/src/logging.h ================================================ #ifndef LOGGING_H #define LOGGING_H #include #include using namespace nvinfer1; class Logger : public ILogger { public: Logger(Severity severity = Severity::kINFO) : reportableSeverity(severity) {} void log(Severity severity, const char* msg) noexcept override { if (severity > reportableSeverity) return; switch (severity) { case Severity::kINTERNAL_ERROR: std::cerr << "INTERNAL_ERROR: "; break; case Severity::kERROR: std::cerr << "ERROR: "; break; case Severity::kWARNING: std::cerr << "WARNING: "; break; case Severity::kINFO: std::cout << "INFO: "; break; default: std::cout << "VERBOSE: "; break; } std::cout << msg << std::endl; } Severity reportableSeverity; }; #endif ================================================ FILE: crnn/CMakeLists.txt ================================================ cmake_minimum_required(VERSION 2.6) project(crnn) add_definitions(-std=c++11) option(CUDA_USE_STATIC_CUDA_RUNTIME OFF) set(CMAKE_CXX_STANDARD 11) set(CMAKE_BUILD_TYPE Debug) find_package(CUDA REQUIRED) include_directories(${PROJECT_SOURCE_DIR}/include) if (CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64") message("embed_platform on") include_directories(/usr/local/cuda/targets/aarch64-linux/include) link_directories(/usr/local/cuda/targets/aarch64-linux/lib) else() message("embed_platform off") include_directories(/usr/local/cuda/include) link_directories(/usr/local/cuda/lib64) endif() find_package(OpenCV) include_directories(${OpenCV_INCLUDE_DIRS}) add_executable(crnn ${PROJECT_SOURCE_DIR}/crnn.cpp) target_link_libraries(crnn nvinfer) target_link_libraries(crnn cudart) target_link_libraries(crnn ${OpenCV_LIBS}) 
add_definitions(-O2 -pthread) ================================================ FILE: crnn/README.md ================================================ # crnn The Pytorch implementation is [meijieru/crnn.pytorch](https://github.com/meijieru/crnn.pytorch). ## How to Run ``` 1. generate crnn.wts from pytorch git clone https://github.com/wang-xinyu/tensorrtx.git git clone https://github.com/meijieru/crnn.pytorch.git // download its weights 'crnn.pth' // copy tensorrtx/crnn/genwts.py into crnn.pytorch/ // go to crnn.pytorch/ python genwts.py // a file 'crnn.wts' will be generated. 2. build tensorrtx/crnn and run // put crnn.wts into tensorrtx/crnn // go to tensorrtx/crnn mkdir build cd build cmake .. make sudo ./crnn -s // serialize model to plan file i.e. 'crnn.engine' // copy crnn.pytorch/data/demo.png here sudo ./crnn -d // deserialize plan file and run inference 3. check the output as follows: raw: a-----v--a-i-l-a-bb-l-e--- sim: available ``` ## More Information See the readme in [home page.](https://github.com/wang-xinyu/tensorrtx) ## Acknowledgment Thanks for the donation for this crnn tensorrt implementation from @雍. 
================================================ FILE: crnn/crnn.cpp ================================================ #include #include #include #include #include "NvInfer.h" #include "cuda_runtime_api.h" #include "logging.h" #define CHECK(status) \ do\ {\ auto ret = (status);\ if (ret != 0)\ {\ std::cerr << "Cuda failure: " << ret << std::endl;\ abort();\ }\ } while (0) #define USE_FP16 // comment out this if want to use FP32 #define DEVICE 0 // GPU id #define BATCH_SIZE 1 // stuff we know about the network and the input/output blobs static const int INPUT_H = 32; static const int INPUT_W = 100; static const int OUTPUT_SIZE = 26 * 37; const char* INPUT_BLOB_NAME = "data"; const char* OUTPUT_BLOB_NAME = "prob"; static Logger gLogger; const int ks[] = {3, 3, 3, 3, 3, 3, 2}; const int ps[] = {1, 1, 1, 1, 1, 1, 0}; const int ss[] = {1, 1, 1, 1, 1, 1, 1}; const int nm[] = {64, 128, 256, 256, 512, 512, 512}; const std::string alphabet = "-0123456789abcdefghijklmnopqrstuvwxyz"; using namespace nvinfer1; std::string strDecode(std::vector& preds, bool raw) { std::string str; if (raw) { for (auto v: preds) { str.push_back(alphabet[v]); } } else { for (size_t i = 0; i < preds.size(); i++) { if (preds[i] == 0 || (i > 0 && preds[i - 1] == preds[i])) continue; str.push_back(alphabet[preds[i]]); } } return str; } // TensorRT weight files have a simple space delimited format: // [type] [size] std::map loadWeights(const std::string file) { std::cout << "Loading weights: " << file << std::endl; std::map weightMap; // Open weights file std::ifstream input(file); assert(input.is_open() && "Unable to load weight file. 
please check if the .wts file path is right!!!!!!"); // Read number of weight blobs int32_t count; input >> count; assert(count > 0 && "Invalid weight map file."); while (count--) { Weights wt{DataType::kFLOAT, nullptr, 0}; uint32_t size; // Read name and type of blob std::string name; input >> name >> std::dec >> size; wt.type = DataType::kFLOAT; // Load blob uint32_t* val = reinterpret_cast(malloc(sizeof(val) * size)); for (uint32_t x = 0, y = size; x < y; ++x) { input >> std::hex >> val[x]; } wt.values = val; wt.count = size; weightMap[name] = wt; } return weightMap; } IScaleLayer* addBatchNorm2d(INetworkDefinition *network, std::map& weightMap, ITensor& input, std::string lname, float eps) { float *gamma = (float*)weightMap[lname + ".weight"].values; float *beta = (float*)weightMap[lname + ".bias"].values; float *mean = (float*)weightMap[lname + ".running_mean"].values; float *var = (float*)weightMap[lname + ".running_var"].values; int len = weightMap[lname + ".running_var"].count; float *scval = reinterpret_cast(malloc(sizeof(float) * len)); for (int i = 0; i < len; i++) { scval[i] = gamma[i] / sqrt(var[i] + eps); } Weights scale{DataType::kFLOAT, scval, len}; float *shval = reinterpret_cast(malloc(sizeof(float) * len)); for (int i = 0; i < len; i++) { shval[i] = beta[i] - mean[i] * gamma[i] / sqrt(var[i] + eps); } Weights shift{DataType::kFLOAT, shval, len}; float *pval = reinterpret_cast(malloc(sizeof(float) * len)); for (int i = 0; i < len; i++) { pval[i] = 1.0; } Weights power{DataType::kFLOAT, pval, len}; weightMap[lname + ".scale"] = scale; weightMap[lname + ".shift"] = shift; weightMap[lname + ".power"] = power; IScaleLayer* scale_1 = network->addScale(input, ScaleMode::kCHANNEL, shift, scale, power); assert(scale_1); return scale_1; } ILayer* convRelu(INetworkDefinition *network, std::map& weightMap, ITensor& input, int i, bool use_bn = false) { int nOut = nm[i]; IConvolutionLayer* conv = network->addConvolutionNd(input, nOut, DimsHW{ks[i], ks[i]}, 
weightMap["cnn.conv" + std::to_string(i) + ".weight"], weightMap["cnn.conv" + std::to_string(i) + ".bias"]); assert(conv); conv->setStrideNd(DimsHW{ss[i], ss[i]}); conv->setPaddingNd(DimsHW{ps[i], ps[i]}); ILayer *tmp = conv; if (use_bn) { tmp = addBatchNorm2d(network, weightMap, *conv->getOutput(0), "cnn.batchnorm" + std::to_string(i), 1e-5); } auto relu = network->addActivation(*tmp->getOutput(0), ActivationType::kRELU); assert(relu); return relu; } void splitLstmWeights(std::map& weightMap, std::string lname) { int weight_size = weightMap[lname].count; for (int i = 0; i < 4; i++) { Weights wt{DataType::kFLOAT, nullptr, 0}; wt.count = weight_size / 4; float *val = reinterpret_cast(malloc(sizeof(float) * wt.count)); memcpy(val, (float*)weightMap[lname].values + wt.count * i, sizeof(float) * wt.count); wt.values = val; weightMap[lname + std::to_string(i)] = wt; } } ILayer* addLSTM(INetworkDefinition *network, std::map& weightMap, ITensor& input, int nHidden, std::string lname) { splitLstmWeights(weightMap, lname + ".weight_ih_l0"); splitLstmWeights(weightMap, lname + ".weight_hh_l0"); splitLstmWeights(weightMap, lname + ".bias_ih_l0"); splitLstmWeights(weightMap, lname + ".bias_hh_l0"); splitLstmWeights(weightMap, lname + ".weight_ih_l0_reverse"); splitLstmWeights(weightMap, lname + ".weight_hh_l0_reverse"); splitLstmWeights(weightMap, lname + ".bias_ih_l0_reverse"); splitLstmWeights(weightMap, lname + ".bias_hh_l0_reverse"); Dims dims = input.getDimensions(); std::cout << "lstm input shape: " << dims.nbDims << " [" << dims.d[0] << " " << dims.d[1] << " " << dims.d[2] << "]"<< std::endl; auto lstm = network->addRNNv2(input, 1, nHidden, dims.d[1], RNNOperation::kLSTM); lstm->setDirection(RNNDirection::kBIDIRECTION); lstm->setWeightsForGate(0, RNNGateType::kINPUT, true, weightMap[lname + ".weight_ih_l00"]); lstm->setWeightsForGate(0, RNNGateType::kFORGET, true, weightMap[lname + ".weight_ih_l01"]); lstm->setWeightsForGate(0, RNNGateType::kCELL, true, weightMap[lname 
+ ".weight_ih_l02"]); lstm->setWeightsForGate(0, RNNGateType::kOUTPUT, true, weightMap[lname + ".weight_ih_l03"]); lstm->setWeightsForGate(0, RNNGateType::kINPUT, false, weightMap[lname + ".weight_hh_l00"]); lstm->setWeightsForGate(0, RNNGateType::kFORGET, false, weightMap[lname + ".weight_hh_l01"]); lstm->setWeightsForGate(0, RNNGateType::kCELL, false, weightMap[lname + ".weight_hh_l02"]); lstm->setWeightsForGate(0, RNNGateType::kOUTPUT, false, weightMap[lname + ".weight_hh_l03"]); lstm->setBiasForGate(0, RNNGateType::kINPUT, true, weightMap[lname + ".bias_ih_l00"]); lstm->setBiasForGate(0, RNNGateType::kFORGET, true, weightMap[lname + ".bias_ih_l01"]); lstm->setBiasForGate(0, RNNGateType::kCELL, true, weightMap[lname + ".bias_ih_l02"]); lstm->setBiasForGate(0, RNNGateType::kOUTPUT, true, weightMap[lname + ".bias_ih_l03"]); lstm->setBiasForGate(0, RNNGateType::kINPUT, false, weightMap[lname + ".bias_hh_l00"]); lstm->setBiasForGate(0, RNNGateType::kFORGET, false, weightMap[lname + ".bias_hh_l01"]); lstm->setBiasForGate(0, RNNGateType::kCELL, false, weightMap[lname + ".bias_hh_l02"]); lstm->setBiasForGate(0, RNNGateType::kOUTPUT, false, weightMap[lname + ".bias_hh_l03"]); lstm->setWeightsForGate(1, RNNGateType::kINPUT, true, weightMap[lname + ".weight_ih_l0_reverse0"]); lstm->setWeightsForGate(1, RNNGateType::kFORGET, true, weightMap[lname + ".weight_ih_l0_reverse1"]); lstm->setWeightsForGate(1, RNNGateType::kCELL, true, weightMap[lname + ".weight_ih_l0_reverse2"]); lstm->setWeightsForGate(1, RNNGateType::kOUTPUT, true, weightMap[lname + ".weight_ih_l0_reverse3"]); lstm->setWeightsForGate(1, RNNGateType::kINPUT, false, weightMap[lname + ".weight_hh_l0_reverse0"]); lstm->setWeightsForGate(1, RNNGateType::kFORGET, false, weightMap[lname + ".weight_hh_l0_reverse1"]); lstm->setWeightsForGate(1, RNNGateType::kCELL, false, weightMap[lname + ".weight_hh_l0_reverse2"]); lstm->setWeightsForGate(1, RNNGateType::kOUTPUT, false, weightMap[lname + ".weight_hh_l0_reverse3"]); 
lstm->setBiasForGate(1, RNNGateType::kINPUT, true, weightMap[lname + ".bias_ih_l0_reverse0"]); lstm->setBiasForGate(1, RNNGateType::kFORGET, true, weightMap[lname + ".bias_ih_l0_reverse1"]); lstm->setBiasForGate(1, RNNGateType::kCELL, true, weightMap[lname + ".bias_ih_l0_reverse2"]); lstm->setBiasForGate(1, RNNGateType::kOUTPUT, true, weightMap[lname + ".bias_ih_l0_reverse3"]); lstm->setBiasForGate(1, RNNGateType::kINPUT, false, weightMap[lname + ".bias_hh_l0_reverse0"]); lstm->setBiasForGate(1, RNNGateType::kFORGET, false, weightMap[lname + ".bias_hh_l0_reverse1"]); lstm->setBiasForGate(1, RNNGateType::kCELL, false, weightMap[lname + ".bias_hh_l0_reverse2"]); lstm->setBiasForGate(1, RNNGateType::kOUTPUT, false, weightMap[lname + ".bias_hh_l0_reverse3"]); return lstm; } // Creat the engine using only the API and not any parser. ICudaEngine* createEngine(unsigned int maxBatchSize, IBuilder* builder, IBuilderConfig* config, DataType dt) { INetworkDefinition* network = builder->createNetworkV2(0U); // Create input tensor of shape {C, INPUT_H, INPUT_W} with name INPUT_BLOB_NAME ITensor* data = network->addInput(INPUT_BLOB_NAME, dt, Dims3{1, INPUT_H, INPUT_W}); assert(data); std::map weightMap = loadWeights("../crnn.wts"); // cnn auto x = convRelu(network, weightMap, *data, 0); auto p = network->addPoolingNd(*x->getOutput(0), PoolingType::kMAX, DimsHW{2, 2}); p->setStrideNd(DimsHW{2, 2}); x = convRelu(network, weightMap, *p->getOutput(0), 1); p = network->addPoolingNd(*x->getOutput(0), PoolingType::kMAX, DimsHW{2, 2}); p->setStrideNd(DimsHW{2, 2}); x = convRelu(network, weightMap, *p->getOutput(0), 2, true); x = convRelu(network, weightMap, *x->getOutput(0), 3); p = network->addPoolingNd(*x->getOutput(0), PoolingType::kMAX, DimsHW{2, 2}); p->setStrideNd(DimsHW{2, 1}); p->setPaddingNd(DimsHW{0, 1}); x = convRelu(network, weightMap, *p->getOutput(0), 4, true); x = convRelu(network, weightMap, *x->getOutput(0), 5); p = network->addPoolingNd(*x->getOutput(0), 
PoolingType::kMAX, DimsHW{2, 2}); p->setStrideNd(DimsHW{2, 1}); p->setPaddingNd(DimsHW{0, 1}); x = convRelu(network, weightMap, *p->getOutput(0), 6, true); auto sfl = network->addShuffle(*x->getOutput(0)); sfl->setFirstTranspose(Permutation{1, 2, 0}); // rnn auto lstm0 = addLSTM(network, weightMap, *sfl->getOutput(0), 256, "rnn.0.rnn"); auto sfl0 = network->addShuffle(*lstm0->getOutput(0)); sfl0->setReshapeDimensions(Dims4{26, 1, 1, 512}); auto fc0 = network->addFullyConnected(*sfl0->getOutput(0), 256, weightMap["rnn.0.embedding.weight"], weightMap["rnn.0.embedding.bias"]); sfl = network->addShuffle(*fc0->getOutput(0)); sfl->setFirstTranspose(Permutation{2, 3, 0, 1}); sfl->setReshapeDimensions(Dims3{1, 26, 256}); auto lstm1 = addLSTM(network, weightMap, *sfl->getOutput(0), 256, "rnn.1.rnn"); auto sfl1 = network->addShuffle(*lstm1->getOutput(0)); sfl1->setReshapeDimensions(Dims4{26, 1, 1, 512}); auto fc1 = network->addFullyConnected(*sfl1->getOutput(0), 37, weightMap["rnn.1.embedding.weight"], weightMap["rnn.1.embedding.bias"]); Dims dims = fc1->getOutput(0)->getDimensions(); std::cout << "fc1 shape " << dims.d[0] << " " << dims.d[1] << " " << dims.d[2] << std::endl; fc1->getOutput(0)->setName(OUTPUT_BLOB_NAME); network->markOutput(*fc1->getOutput(0)); // Build engine builder->setMaxBatchSize(maxBatchSize); config->setMaxWorkspaceSize(16 * (1 << 20)); // 16MB #ifdef USE_FP16 config->setFlag(BuilderFlag::kFP16); #endif std::cout << "Building engine, please wait for a while..." << std::endl; ICudaEngine* engine = builder->buildEngineWithConfig(*network, *config); std::cout << "Build engine successfully!" 
<< std::endl; // Don't need the network any more network->destroy(); // Release host memory for (auto& mem : weightMap) { free((void*) (mem.second.values)); } return engine; } void APIToModel(unsigned int maxBatchSize, IHostMemory** modelStream) { // Create builder IBuilder* builder = createInferBuilder(gLogger); IBuilderConfig* config = builder->createBuilderConfig(); // Create model to populate the network, then set the outputs and create an engine ICudaEngine* engine = createEngine(maxBatchSize, builder, config, DataType::kFLOAT); assert(engine != nullptr); // Serialize the engine (*modelStream) = engine->serialize(); // Close everything down engine->destroy(); builder->destroy(); } void doInference(IExecutionContext& context, cudaStream_t& stream, void **buffers, float* input, float* output, int batchSize) { // DMA input batch data to device, infer on the batch asynchronously, and DMA output back to host CHECK(cudaMemcpyAsync(buffers[0], input, batchSize * 1 * INPUT_H * INPUT_W * sizeof(float), cudaMemcpyHostToDevice, stream)); context.enqueue(batchSize, buffers, stream, nullptr); CHECK(cudaMemcpyAsync(output, buffers[1], batchSize * OUTPUT_SIZE * sizeof(float), cudaMemcpyDeviceToHost, stream)); cudaStreamSynchronize(stream); } int main(int argc, char** argv) { cudaSetDevice(DEVICE); // create a model using the API directly and serialize it to a stream char *trtModelStream{nullptr}; size_t size{0}; if (argc == 2 && std::string(argv[1]) == "-s") { IHostMemory* modelStream{nullptr}; APIToModel(BATCH_SIZE, &modelStream); assert(modelStream != nullptr); std::ofstream p("crnn.engine", std::ios::binary); if (!p) { std::cerr << "could not open plan output file" << std::endl; return -1; } p.write(reinterpret_cast(modelStream->data()), modelStream->size()); modelStream->destroy(); return 0; } else if (argc == 2 && std::string(argv[1]) == "-d") { std::ifstream file("crnn.engine", std::ios::binary); if (file.good()) { file.seekg(0, file.end); size = file.tellg(); 
file.seekg(0, file.beg); trtModelStream = new char[size]; assert(trtModelStream); file.read(trtModelStream, size); file.close(); } } else { std::cerr << "arguments not right!" << std::endl; std::cerr << "./crnn -s // serialize model to plan file" << std::endl; std::cerr << "./crnn -d ../samples // deserialize plan file and run inference" << std::endl; return -1; } // prepare input data --------------------------- static float data[BATCH_SIZE * 1 * INPUT_H * INPUT_W]; //for (int i = 0; i < 1 * INPUT_H * INPUT_W; i++) // data[i] = 1.0; static float prob[BATCH_SIZE * OUTPUT_SIZE]; IRuntime* runtime = createInferRuntime(gLogger); assert(runtime != nullptr); ICudaEngine* engine = runtime->deserializeCudaEngine(trtModelStream, size); assert(engine != nullptr); IExecutionContext* context = engine->createExecutionContext(); assert(context != nullptr); delete[] trtModelStream; assert(engine->getNbBindings() == 2); void* buffers[2]; // In order to bind the buffers, we need to know the names of the input and output tensors. // Note that indices are guaranteed to be less than IEngine::getNbBindings() const int inputIndex = engine->getBindingIndex(INPUT_BLOB_NAME); const int outputIndex = engine->getBindingIndex(OUTPUT_BLOB_NAME); assert(inputIndex == 0); assert(outputIndex == 1); // Create GPU buffers on device CHECK(cudaMalloc(&buffers[inputIndex], BATCH_SIZE * 1 * INPUT_H * INPUT_W * sizeof(float))); CHECK(cudaMalloc(&buffers[outputIndex], BATCH_SIZE * OUTPUT_SIZE * sizeof(float))); // Create stream cudaStream_t stream; CHECK(cudaStreamCreate(&stream)); cv::Mat img = cv::imread("demo.png"); if (img.empty()) { std::cerr << "demo.png not found !!!" 
# Export the pretrained CRNN weights to the plain-text 'crnn.wts' file that
# crnn.cpp reads back with loadWeights().
#
# Output format: the first line is the number of tensors; each following line
# is "<name> <element count> <hex> <hex> ...", where every value is the
# big-endian IEEE-754 float32 encoding of one element.
model_path = './data/crnn.pth'

use_cuda = torch.cuda.is_available()

model = crnn.CRNN(32, 1, 37, 256)
if use_cuda:
    model = model.cuda()
print('loading pretrained model from %s' % model_path)
# map_location='cpu' lets a checkpoint saved from a GPU be read on a CPU-only
# host; load_state_dict() then copies the tensors onto the model's device.
model.load_state_dict(torch.load(model_path, map_location='cpu'))

image = torch.ones(1, 1, 32, 100)
if use_cuda:
    image = image.cuda()

model.eval()
print(model)
print('image shape ', image.shape)
# Smoke-test forward pass only; no autograd graph is needed for an export.
with torch.no_grad():
    preds = model(image)

# Take one snapshot so the count line and the entries come from the same dict.
state = model.state_dict()

# The context manager guarantees the file is flushed and closed even if a
# tensor fails to serialize part-way through.
with open("crnn.wts", 'w') as f:
    f.write("{}\n".format(len(state)))
    for k, v in state.items():
        print('key: ', k)
        print('value: ', v.shape)
        vr = v.reshape(-1).cpu().numpy()
        f.write("{} {}".format(k, len(vr)))
        f.write("".join(" " + struct.pack(">f", float(vv)).hex() for vv in vr))
        f.write("\n")
//!
//! \class LogStreamConsumerBase
//! \brief Convenience object used to initialize LogStreamConsumerBuffer before std::ostream in LogStreamConsumer
//!
//! The class has no behaviour of its own: it only owns the stream buffer so that, when it is listed as the
//! first base class of LogStreamConsumer, the buffer is constructed before its address is handed to std::ostream.
//!
class LogStreamConsumerBase
{
public:
    //!
    //! \param stream    Destination stream, forwarded to the buffer.
    //! \param prefix    Prefix string, forwarded to the buffer.
    //! \param shouldLog Initial logging flag, forwarded to the buffer.
    //!
    LogStreamConsumerBase(std::ostream& stream, const std::string& prefix, bool shouldLog)
        : mBuffer(stream, prefix, shouldLog)
    {
    }

protected:
    // Protected so the derived LogStreamConsumer can pass &mBuffer to std::ostream and call setShouldLog() on it.
    LogStreamConsumerBuffer mBuffer;
};
//! class LogStreamConsumer : protected LogStreamConsumerBase, public std::ostream { public: //! \brief Creates a LogStreamConsumer which logs messages with level severity. //! Reportable severity determines if the messages are severe enough to be logged. LogStreamConsumer(Severity reportableSeverity, Severity severity) : LogStreamConsumerBase(severityOstream(severity), severityPrefix(severity), severity <= reportableSeverity) , std::ostream(&mBuffer) // links the stream buffer with the stream , mShouldLog(severity <= reportableSeverity) , mSeverity(severity) { } LogStreamConsumer(LogStreamConsumer&& other) : LogStreamConsumerBase(severityOstream(other.mSeverity), severityPrefix(other.mSeverity), other.mShouldLog) , std::ostream(&mBuffer) // links the stream buffer with the stream , mShouldLog(other.mShouldLog) , mSeverity(other.mSeverity) { } void setReportableSeverity(Severity reportableSeverity) { mShouldLog = mSeverity <= reportableSeverity; mBuffer.setShouldLog(mShouldLog); } private: static std::ostream& severityOstream(Severity severity) { return severity >= Severity::kINFO ? std::cout : std::cerr; } static std::string severityPrefix(Severity severity) { switch (severity) { case Severity::kINTERNAL_ERROR: return "[F] "; case Severity::kERROR: return "[E] "; case Severity::kWARNING: return "[W] "; case Severity::kINFO: return "[I] "; case Severity::kVERBOSE: return "[V] "; default: assert(0); return ""; } } bool mShouldLog; Severity mSeverity; }; //! \class Logger //! //! \brief Class which manages logging of TensorRT tools and samples //! //! \details This class provides a common interface for TensorRT tools and samples to log information to the console, //! and supports logging two types of messages: //! //! - Debugging messages with an associated severity (info, warning, error, or internal error/fatal) //! - Test pass/fail messages //! //! The advantage of having all samples use this class for logging as opposed to emitting directly to stdout/stderr is //! 
that the logic for controlling the verbosity and formatting of sample output is centralized in one location. //! //! In the future, this class could be extended to support dumping test results to a file in some standard format //! (for example, JUnit XML), and providing additional metadata (e.g. timing the duration of a test run). //! //! TODO: For backwards compatibility with existing samples, this class inherits directly from the nvinfer1::ILogger //! interface, which is problematic since there isn't a clean separation between messages coming from the TensorRT //! library and messages coming from the sample. //! //! In the future (once all samples are updated to use Logger::getTRTLogger() to access the ILogger) we can refactor the //! class to eliminate the inheritance and instead make the nvinfer1::ILogger implementation a member of the Logger //! object. class Logger : public nvinfer1::ILogger { public: Logger(Severity severity = Severity::kWARNING) : mReportableSeverity(severity) { } //! //! \enum TestResult //! \brief Represents the state of a given test //! enum class TestResult { kRUNNING, //!< The test is running kPASSED, //!< The test passed kFAILED, //!< The test failed kWAIVED //!< The test was waived }; //! //! \brief Forward-compatible method for retrieving the nvinfer::ILogger associated with this Logger //! \return The nvinfer1::ILogger associated with this Logger //! //! TODO Once all samples are updated to use this method to register the logger with TensorRT, //! we can eliminate the inheritance of Logger from ILogger //! nvinfer1::ILogger& getTRTLogger() { return *this; } //! //! \brief Implementation of the nvinfer1::ILogger::log() virtual method //! //! Note samples should not be calling this function directly; it will eventually go away once we eliminate the //! inheritance from nvinfer1::ILogger //! 
void log(Severity severity, const char* msg) override { LogStreamConsumer(mReportableSeverity, severity) << "[TRT] " << std::string(msg) << std::endl; } //! //! \brief Method for controlling the verbosity of logging output //! //! \param severity The logger will only emit messages that have severity of this level or higher. //! void setReportableSeverity(Severity severity) { mReportableSeverity = severity; } //! //! \brief Opaque handle that holds logging information for a particular test //! //! This object is an opaque handle to information used by the Logger to print test results. //! The sample must call Logger::defineTest() in order to obtain a TestAtom that can be used //! with Logger::reportTest{Start,End}(). //! class TestAtom { public: TestAtom(TestAtom&&) = default; private: friend class Logger; TestAtom(bool started, const std::string& name, const std::string& cmdline) : mStarted(started) , mName(name) , mCmdline(cmdline) { } bool mStarted; std::string mName; std::string mCmdline; }; //! //! \brief Define a test for logging //! //! \param[in] name The name of the test. This should be a string starting with //! "TensorRT" and containing dot-separated strings containing //! the characters [A-Za-z0-9_]. //! For example, "TensorRT.sample_googlenet" //! \param[in] cmdline The command line used to reproduce the test // //! \return a TestAtom that can be used in Logger::reportTest{Start,End}(). //! static TestAtom defineTest(const std::string& name, const std::string& cmdline) { return TestAtom(false, name, cmdline); } //! //! \brief A convenience overloaded version of defineTest() that accepts an array of command-line arguments //! as input //! //! \param[in] name The name of the test //! \param[in] argc The number of command-line arguments //! \param[in] argv The array of command-line arguments (given as C strings) //! //! \return a TestAtom that can be used in Logger::reportTest{Start,End}(). 
static TestAtom defineTest(const std::string& name, int argc, char const* const* argv) { auto cmdline = genCmdlineString(argc, argv); return defineTest(name, cmdline); } //! //! \brief Report that a test has started. //! //! \pre reportTestStart() has not been called yet for the given testAtom //! //! \param[in] testAtom The handle to the test that has started //! static void reportTestStart(TestAtom& testAtom) { reportTestResult(testAtom, TestResult::kRUNNING); assert(!testAtom.mStarted); testAtom.mStarted = true; } //! //! \brief Report that a test has ended. //! //! \pre reportTestStart() has been called for the given testAtom //! //! \param[in] testAtom The handle to the test that has ended //! \param[in] result The result of the test. Should be one of TestResult::kPASSED, //! TestResult::kFAILED, TestResult::kWAIVED //! static void reportTestEnd(const TestAtom& testAtom, TestResult result) { assert(result != TestResult::kRUNNING); assert(testAtom.mStarted); reportTestResult(testAtom, result); } static int reportPass(const TestAtom& testAtom) { reportTestEnd(testAtom, TestResult::kPASSED); return EXIT_SUCCESS; } static int reportFail(const TestAtom& testAtom) { reportTestEnd(testAtom, TestResult::kFAILED); return EXIT_FAILURE; } static int reportWaive(const TestAtom& testAtom) { reportTestEnd(testAtom, TestResult::kWAIVED); return EXIT_SUCCESS; } static int reportTest(const TestAtom& testAtom, bool pass) { return pass ? reportPass(testAtom) : reportFail(testAtom); } Severity getReportableSeverity() const { return mReportableSeverity; } private: //! //! \brief returns an appropriate string for prefixing a log message with the given severity //! static const char* severityPrefix(Severity severity) { switch (severity) { case Severity::kINTERNAL_ERROR: return "[F] "; case Severity::kERROR: return "[E] "; case Severity::kWARNING: return "[W] "; case Severity::kINFO: return "[I] "; case Severity::kVERBOSE: return "[V] "; default: assert(0); return ""; } } //! //! 
\brief returns an appropriate string for prefixing a test result message with the given result //! static const char* testResultString(TestResult result) { switch (result) { case TestResult::kRUNNING: return "RUNNING"; case TestResult::kPASSED: return "PASSED"; case TestResult::kFAILED: return "FAILED"; case TestResult::kWAIVED: return "WAIVED"; default: assert(0); return ""; } } //! //! \brief returns an appropriate output stream (cout or cerr) to use with the given severity //! static std::ostream& severityOstream(Severity severity) { return severity >= Severity::kINFO ? std::cout : std::cerr; } //! //! \brief method that implements logging test results //! static void reportTestResult(const TestAtom& testAtom, TestResult result) { severityOstream(Severity::kINFO) << "&&&& " << testResultString(result) << " " << testAtom.mName << " # " << testAtom.mCmdline << std::endl; } //! //! \brief generate a command line string from the given (argc, argv) values //! static std::string genCmdlineString(int argc, char const* const* argv) { std::stringstream ss; for (int i = 0; i < argc; i++) { if (i > 0) ss << " "; ss << argv[i]; } return ss.str(); } Severity mReportableSeverity; }; namespace { //! //! \brief produces a LogStreamConsumer object that can be used to log messages of severity kVERBOSE //! //! Example usage: //! //! LOG_VERBOSE(logger) << "hello world" << std::endl; //! inline LogStreamConsumer LOG_VERBOSE(const Logger& logger) { return LogStreamConsumer(logger.getReportableSeverity(), Severity::kVERBOSE); } //! //! \brief produces a LogStreamConsumer object that can be used to log messages of severity kINFO //! //! Example usage: //! //! LOG_INFO(logger) << "hello world" << std::endl; //! inline LogStreamConsumer LOG_INFO(const Logger& logger) { return LogStreamConsumer(logger.getReportableSeverity(), Severity::kINFO); } //! //! \brief produces a LogStreamConsumer object that can be used to log messages of severity kWARNING //! //! Example usage: //! //! 
LOG_WARN(logger) << "hello world" << std::endl; //! inline LogStreamConsumer LOG_WARN(const Logger& logger) { return LogStreamConsumer(logger.getReportableSeverity(), Severity::kWARNING); } //! //! \brief produces a LogStreamConsumer object that can be used to log messages of severity kERROR //! //! Example usage: //! //! LOG_ERROR(logger) << "hello world" << std::endl; //! inline LogStreamConsumer LOG_ERROR(const Logger& logger) { return LogStreamConsumer(logger.getReportableSeverity(), Severity::kERROR); } //! //! \brief produces a LogStreamConsumer object that can be used to log messages of severity kINTERNAL_ERROR // ("fatal" severity) //! //! Example usage: //! //! LOG_FATAL(logger) << "hello world" << std::endl; //! inline LogStreamConsumer LOG_FATAL(const Logger& logger) { return LogStreamConsumer(logger.getReportableSeverity(), Severity::kINTERNAL_ERROR); } } // anonymous namespace #endif // TENSORRT_LOGGING_H ================================================ FILE: csrnet/CMakeLists.txt ================================================ cmake_minimum_required(VERSION 3.10) project(csrnet) add_definitions(-std=c++11) add_definitions(-DAPI_EXPORTS) option(CUDA_USE_STATIC_CUDA_RUNTIME OFF) set(CMAKE_CXX_STANDARD 11) set(CMAKE_BUILD_TYPE Debug) # cuda include_directories(/usr/local/cuda/targets/x86_64-linux/include ) link_directories(/usr/local/cuda/targets/x86_64-linux/lib) # tensorrt include_directories(/usr/include/x86_64-linux-gnu/) link_directories(/usr/lib/x86_64-linux-gnu/) # opencv find_package(OpenCV) include_directories(${OpenCV_INCLUDE_DIRS}) include_directories(${PROJECT_SOURCE_DIR}/) add_executable(csrnet csrnet.cpp) target_link_libraries(csrnet nvinfer cudart ${OpenCV_LIBS}) ================================================ FILE: csrnet/README.md ================================================ # csrnet The Pytorch implementation is [leeyeehoo/CSRNet-pytorch](https://github.com/leeyeehoo/CSRNet-pytorch). 
This repo is a TensorRT implementation of CSRNet.

Paper: [CSRNet: Dilated Convolutional Neural Networks for Understanding the Highly Congested Scenes](https://arxiv.org/abs/1802.10062)

Dev environment:
- Ubuntu 22.04
- TensorRT 8.6
- OpenCV 4.5.4
- CMake 3.24
- GPU Driver 535.113.01
- CUDA 12.2
- RTX3080

# how to run

```bash
1. generate csrnet.wts
git clone https://github.com/leeyeehoo/CSRNet-pytorch.git
git clone https://github.com/wang-xinyu/tensorrtx.git
// copy gen_wts.py to CSRNet-pytorch
// generate wts file
python gen_wts.py
// csrnet.wts will be generated in CSRNet-pytorch/output/gen_wts

2. build csrnet.engine
// mv csrnet.wts to tensorrtx/csrnet
mv CSRNet-pytorch/output/gen_wts/csrnet.wts tensorrtx/csrnet
// build
cd tensorrtx/csrnet
mkdir build
cd build
cmake ..
make
sudo ./csrnet -s ../csrnet.wts
Loading weights: ../csrnet.wts
build engine successfully : ./csrnet.engine

// download images https://github.com/wang-xinyu/tensorrtx/assets/46584679/46bc4def-e573-44ae-996d-5d68927c78ff and copy to images
sudo ./csrnet -d ./images

// output e.g
// enqueueV2 time: 0.0323869s
// detect time:44ms
// people num :22.9101 write_path: ../images/data.jpg
```

# result

inference people num: 22.9101

================================================ FILE: csrnet/config.h ================================================ #pragma once const static char *kInputTensorName = "data"; const static char *kOutputTensorName = "prob"; const static char *kEngineFile = "./csrnet.engine"; const static int kBatchSize = 1; const static int MAX_INPUT_SIZE = 1440; // 32x const static int MIN_INPUT_SIZE = 608; const static int OPT_INPUT_W = 1152; const static int OPT_INPUT_H = 640; constexpr static int kMaxInputImageSize = MAX_INPUT_SIZE * MAX_INPUT_SIZE * 3; constexpr static int kMaxOutputProbSize = (MAX_INPUT_SIZE * MAX_INPUT_SIZE) >> 6; ================================================ FILE: csrnet/csrnet.cpp ================================================ #include "NvInfer.h" #include "cuda_runtime_api.h" #include #include #include #include #include #include #include #include #include #include #include using namespace nvinfer1; #define CHECK(status) \ do { \ auto ret = (status); \ if (ret != 0) { \ std::cerr << "Cuda failure: " << ret << std::endl; \ abort(); \ } \ } while (0) static Logger gLogger; static char *kWTSFile = ""; std::map loadWeights(const std::string file) { std::cout << "Loading weights: " << file << std::endl; std::map weightMap; // Open weights file std::ifstream input(file); assert(input.is_open() && "Unable to load weight file."); // Read number of weight blobs int32_t count; input >> count; assert(count > 0 && "Invalid weight map file."); while (count--) { Weights wt{DataType::kFLOAT, nullptr, 0}; uint32_t size; // Read name and type of blob std::string name; input >> name >> std::dec >> size; wt.type = DataType::kFLOAT; // Load blob uint32_t *val = reinterpret_cast(malloc(sizeof(val) * size)); for (uint32_t x = 0, y = size; x < y; ++x) { input >> std::hex >> val[x]; } wt.values = val; wt.count = size; weightMap[name] = wt; } return weightMap; } // clang-format off /* CSRNet( (frontend): Sequential( (0): Conv2d(3, 64, kernel_size=(3, 3), stride=(1, 1), 
padding=(1, 1)) (1): ReLU(inplace=True) (2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) (3): ReLU(inplace=True) (4): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False) (5): Conv2d(64, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) (6): ReLU(inplace=True) (7): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) (8): ReLU(inplace=True) (9): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False) (10): Conv2d(128, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) (11): ReLU(inplace=True) (12): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) (13): ReLU(inplace=True) (14): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) (15): ReLU(inplace=True) (16): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False) (17): Conv2d(256, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) (18): ReLU(inplace=True) (19): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) (20): ReLU(inplace=True) (21): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) (22): ReLU(inplace=True) ) (backend): Sequential( (0): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(2, 2), dilation=(2, 2)) (1): ReLU(inplace=True) (2): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(2, 2), dilation=(2, 2)) (3): ReLU(inplace=True) (4): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(2, 2), dilation=(2, 2)) (5): ReLU(inplace=True) (6): Conv2d(512, 256, kernel_size=(3, 3), stride=(1, 1), padding=(2, 2), dilation=(2, 2)) (7): ReLU(inplace=True) (8): Conv2d(256, 128, kernel_size=(3, 3), stride=(1, 1), padding=(2, 2), dilation=(2, 2)) (9): ReLU(inplace=True) (10): Conv2d(128, 64, kernel_size=(3, 3), stride=(1, 1), padding=(2, 2), dilation=(2, 2)) (11): ReLU(inplace=True) ) (output_layer): Conv2d(64, 1, kernel_size=(1, 1), stride=(1, 1)) ) */ // clang-format on void doInference(IExecutionContext 
&context, float *input, float *output, int input_h, int input_w) { const ICudaEngine &engine = context.getEngine(); uint64_t input_size = 3 * input_h * input_w * sizeof(float); uint64_t output_size = ((input_h * input_w) >> 6) * sizeof(float); // Pointers to input and output device buffers to pass to engine. // Engine requires exactly IEngine::getNbBindings() number of buffers. assert(engine.getNbBindings() == 2); void *buffers[2]; // In order to bind the buffers, we need to know the names of the input and // output tensors. Note that indices are guaranteed to be less than // IEngine::getNbBindings() const int inputIndex = engine.getBindingIndex(kInputTensorName); const int outputIndex = engine.getBindingIndex(kOutputTensorName); context.setBindingDimensions(inputIndex, Dims4(1, 3, input_h, input_w)); // Create GPU buffers on device CHECK(cudaMalloc(&buffers[inputIndex], input_size)); CHECK(cudaMalloc(&buffers[outputIndex], output_size)); // Create stream cudaStream_t stream; CHECK(cudaStreamCreate(&stream)); // DMA input batch data to device, infer on the batch asynchronously, and DMA // output back to host CHECK(cudaMemcpyAsync(buffers[inputIndex], input, input_size, cudaMemcpyHostToDevice, stream)); auto t1 = std::chrono::high_resolution_clock::now(); context.enqueueV2(buffers, stream, nullptr); std::cout << "enqueueV2 time: " << std::chrono::duration( std::chrono::high_resolution_clock::now() - t1) .count() << "s" << std::endl; CHECK(cudaMemcpyAsync(output, buffers[outputIndex], output_size, cudaMemcpyDeviceToHost, stream)); cudaStreamSynchronize(stream); // Release stream and buffers cudaStreamDestroy(stream); CHECK(cudaFree(buffers[inputIndex])); CHECK(cudaFree(buffers[outputIndex])); } ICudaEngine *createEngine(unsigned int maxBatchSize, IBuilder *builder, IBuilderConfig *config, DataType dt) { // INetworkDefinition *network = builder->createNetworkV2(0U); const auto explicitBatch = 1U << static_cast( NetworkDefinitionCreationFlag::kEXPLICIT_BATCH); 
INetworkDefinition *network = builder->createNetworkV2(explicitBatch); ITensor *data = network->addInput(kInputTensorName, dt, Dims4{1, 3, -1, -1}); assert(data); std::map weightMap = loadWeights(kWTSFile); IConvolutionLayer *conv1 = network->addConvolutionNd( *data, 64, DimsHW{3, 3}, weightMap["frontend.0.weight"], weightMap["frontend.0.bias"]); assert(conv1); conv1->setStrideNd(DimsHW{1, 1}); conv1->setPaddingNd(DimsHW{1, 1}); IActivationLayer *relu1 = network->addActivation(*conv1->getOutput(0), ActivationType::kRELU); assert(relu1); auto conv2 = network->addConvolutionNd(*relu1->getOutput(0), 64, DimsHW{3, 3}, weightMap["frontend.2.weight"], weightMap["frontend.2.bias"]); assert(conv2); conv2->setStrideNd(DimsHW{1, 1}); conv2->setPaddingNd(DimsHW{1, 1}); auto relu2 = network->addActivation(*conv2->getOutput(0), ActivationType::kRELU); assert(relu2); auto pool1 = network->addPoolingNd(*relu2->getOutput(0), PoolingType::kMAX, DimsHW{2, 2}); assert(pool1); pool1->setStrideNd(DimsHW{2, 2}); auto conv3 = network->addConvolutionNd( *pool1->getOutput(0), 128, DimsHW{3, 3}, weightMap["frontend.5.weight"], weightMap["frontend.5.bias"]); assert(conv3); conv3->setStrideNd(DimsHW{1, 1}); conv3->setPaddingNd(DimsHW{1, 1}); auto relu3 = network->addActivation(*conv3->getOutput(0), ActivationType::kRELU); assert(relu3); auto conv4 = network->addConvolutionNd( *relu3->getOutput(0), 128, DimsHW{3, 3}, weightMap["frontend.7.weight"], weightMap["frontend.7.bias"]); assert(conv4); conv4->setStrideNd(DimsHW{1, 1}); conv4->setPaddingNd(DimsHW{1, 1}); auto relu4 = network->addActivation(*conv4->getOutput(0), ActivationType::kRELU); assert(relu4); auto pool2 = network->addPoolingNd(*relu4->getOutput(0), PoolingType::kMAX, DimsHW{2, 2}); assert(pool2); pool2->setStrideNd(DimsHW{2, 2}); auto conv5 = network->addConvolutionNd( *pool2->getOutput(0), 256, DimsHW{3, 3}, weightMap["frontend.10.weight"], weightMap["frontend.10.bias"]); assert(conv5); conv5->setStrideNd(DimsHW{1, 1}); 
conv5->setPaddingNd(DimsHW{1, 1}); auto relu5 = network->addActivation(*conv5->getOutput(0), ActivationType::kRELU); assert(relu5); auto conv6 = network->addConvolutionNd( *relu5->getOutput(0), 256, DimsHW{3, 3}, weightMap["frontend.12.weight"], weightMap["frontend.12.bias"]); assert(conv6); conv6->setStrideNd(DimsHW{1, 1}); conv6->setPaddingNd(DimsHW{1, 1}); auto relu6 = network->addActivation(*conv6->getOutput(0), ActivationType::kRELU); assert(relu6); auto conv7 = network->addConvolutionNd( *relu6->getOutput(0), 256, DimsHW{3, 3}, weightMap["frontend.14.weight"], weightMap["frontend.14.bias"]); assert(conv7); conv7->setStrideNd(DimsHW{1, 1}); conv7->setPaddingNd(DimsHW{1, 1}); auto relu7 = network->addActivation(*conv7->getOutput(0), ActivationType::kRELU); assert(relu7); auto pool3 = network->addPoolingNd(*relu7->getOutput(0), PoolingType::kMAX, DimsHW{2, 2}); assert(pool3); pool3->setStrideNd(DimsHW{2, 2}); auto conv8 = network->addConvolutionNd( *pool3->getOutput(0), 512, DimsHW{3, 3}, weightMap["frontend.17.weight"], weightMap["frontend.17.bias"]); assert(conv8); conv8->setStrideNd(DimsHW{1, 1}); conv8->setPaddingNd(DimsHW{1, 1}); auto relu8 = network->addActivation(*conv8->getOutput(0), ActivationType::kRELU); assert(relu8); auto conv9 = network->addConvolutionNd( *relu8->getOutput(0), 512, DimsHW{3, 3}, weightMap["frontend.19.weight"], weightMap["frontend.19.bias"]); assert(conv9); conv9->setStrideNd(DimsHW{1, 1}); conv9->setPaddingNd(DimsHW{1, 1}); auto relu9 = network->addActivation(*conv9->getOutput(0), ActivationType::kRELU); assert(relu9); auto conv10 = network->addConvolutionNd( *relu9->getOutput(0), 512, DimsHW{3, 3}, weightMap["frontend.21.weight"], weightMap["frontend.21.bias"]); assert(conv10); conv10->setStrideNd(DimsHW{1, 1}); conv10->setPaddingNd(DimsHW{1, 1}); auto relu10 = network->addActivation(*conv10->getOutput(0), ActivationType::kRELU); assert(relu10); // backend auto conv11 = network->addConvolutionNd( *relu10->getOutput(0), 512, 
DimsHW{3, 3}, weightMap["backend.0.weight"], weightMap["backend.0.bias"]); assert(conv11); conv11->setPaddingNd(DimsHW{2, 2}); conv11->setStrideNd(DimsHW{1, 1}); conv11->setDilationNd(DimsHW{2, 2}); auto relu11 = network->addActivation(*conv11->getOutput(0), ActivationType::kRELU); assert(relu11); auto conv12 = network->addConvolutionNd( *relu11->getOutput(0), 512, DimsHW{3, 3}, weightMap["backend.2.weight"], weightMap["backend.2.bias"]); assert(conv12); conv12->setPaddingNd(DimsHW{2, 2}); conv12->setStrideNd(DimsHW{1, 1}); conv12->setDilationNd(DimsHW{2, 2}); auto relu12 = network->addActivation(*conv12->getOutput(0), ActivationType::kRELU); assert(relu12); auto conv13 = network->addConvolutionNd( *relu12->getOutput(0), 512, DimsHW{3, 3}, weightMap["backend.4.weight"], weightMap["backend.4.bias"]); assert(conv13); conv13->setPaddingNd(DimsHW{2, 2}); conv13->setStrideNd(DimsHW{1, 1}); conv13->setDilationNd(DimsHW{2, 2}); auto relu13 = network->addActivation(*conv13->getOutput(0), ActivationType::kRELU); assert(relu13); auto conv14 = network->addConvolutionNd( *relu13->getOutput(0), 256, DimsHW{3, 3}, weightMap["backend.6.weight"], weightMap["backend.6.bias"]); assert(conv14); conv14->setPaddingNd(DimsHW{2, 2}); conv14->setStrideNd(DimsHW{1, 1}); conv14->setDilationNd(DimsHW{2, 2}); auto relu14 = network->addActivation(*conv14->getOutput(0), ActivationType::kRELU); assert(relu14); auto conv15 = network->addConvolutionNd( *relu14->getOutput(0), 128, DimsHW{3, 3}, weightMap["backend.8.weight"], weightMap["backend.8.bias"]); assert(conv15); conv15->setPaddingNd(DimsHW{2, 2}); conv15->setStrideNd(DimsHW{1, 1}); conv15->setDilationNd(DimsHW{2, 2}); auto relu15 = network->addActivation(*conv15->getOutput(0), ActivationType::kRELU); assert(relu15); auto conv16 = network->addConvolutionNd( *relu15->getOutput(0), 64, DimsHW{3, 3}, weightMap["backend.10.weight"], weightMap["backend.10.bias"]); assert(conv16); conv16->setPaddingNd(DimsHW{2, 2}); conv16->setStrideNd(DimsHW{1, 
1}); conv16->setDilationNd(DimsHW{2, 2}); auto relu16 = network->addActivation(*conv16->getOutput(0), ActivationType::kRELU); assert(relu16); auto conv17 = network->addConvolutionNd( *relu16->getOutput(0), 1, DimsHW{1, 1}, weightMap["output_layer.weight"], weightMap["output_layer.bias"]); assert(conv17); conv17->setStrideNd(DimsHW{1, 1}); conv17->getOutput(0)->setName(kOutputTensorName); network->markOutput(*conv17->getOutput(0)); IOptimizationProfile *profile = builder->createOptimizationProfile(); profile->setDimensions(kInputTensorName, OptProfileSelector::kMIN, Dims4(1, 3, MIN_INPUT_SIZE, MIN_INPUT_SIZE)); profile->setDimensions(kInputTensorName, OptProfileSelector::kOPT, Dims4(1, 3, OPT_INPUT_H, OPT_INPUT_W)); profile->setDimensions(kInputTensorName, OptProfileSelector::kMAX, Dims4(1, 3, MAX_INPUT_SIZE, MAX_INPUT_SIZE)); config->addOptimizationProfile(profile); builder->setMaxBatchSize(kBatchSize); config->setMaxWorkspaceSize(16 << 20); #ifdef USE_FP16 config->setFlag(BuilderFlag::kFP16); #endif ICudaEngine *engine = builder->buildEngineWithConfig(*network, *config); printf("build engine successfully : %s\n", kEngineFile); // Don't need the network any more network->destroy(); // Release host memory for (auto &mem : weightMap) { free((void *)(mem.second.values)); } return engine; } void APIToModel(unsigned int maxBatchSize, IHostMemory **modelStream) { // Create builder IBuilder *builder = createInferBuilder(gLogger); IBuilderConfig *config = builder->createBuilderConfig(); // Create model to populate the network, then set the outputs and create an // engine ICudaEngine *engine = createEngine(maxBatchSize, builder, config, DataType::kFLOAT); assert(engine != nullptr); // Serialize the engine (*modelStream) = engine->serialize(); // Close everything down engine->destroy(); config->destroy(); builder->destroy(); } int read_files_in_dir(const char *p_dir_name, std::vector &file_names) { DIR *p_dir = opendir(p_dir_name); if (p_dir == nullptr) { return -1; } struct 
dirent *p_file = nullptr; while ((p_file = readdir(p_dir)) != nullptr) { if (strcmp(p_file->d_name, ".") != 0 && strcmp(p_file->d_name, "..") != 0) { std::string cur_file_name(p_file->d_name); file_names.push_back(cur_file_name); } } closedir(p_dir); return 0; } int main(int argc, char **argv) { if (argc != 3) { std::cerr << "arguments not right!" << std::endl; std::cerr << "./csrnet -s ./csrnet.wts // serialize model to plan file" << std::endl; std::cerr << "./csrnet -d ../images // deserialize plan file and run inference" << std::endl; return -1; } char *trtModelStream{nullptr}; size_t size{0}; if (std::string(argv[1]) == "-s") { IHostMemory *modelStream{nullptr}; kWTSFile = argv[2]; APIToModel(kBatchSize, &modelStream); assert(modelStream != nullptr); std::ofstream p(kEngineFile, std::ios::binary); if (!p) { std::cerr << "could not open plan output file" << std::endl; return -1; } p.write(reinterpret_cast(modelStream->data()), modelStream->size()); modelStream->destroy(); return 1; } else if (std::string(argv[1]) == "-d") { std::ifstream file(kEngineFile, std::ios::binary); if (file.good()) { file.seekg(0, file.end); size = file.tellg(); file.seekg(0, file.beg); trtModelStream = new char[size]; assert(trtModelStream); file.read(trtModelStream, size); file.close(); } } else { return -1; } IRuntime *runtime = createInferRuntime(gLogger); assert(runtime != nullptr); ICudaEngine *engine = runtime->deserializeCudaEngine(trtModelStream, size); assert(engine != nullptr); IExecutionContext *context = engine->createExecutionContext(); assert(context != nullptr); delete[] trtModelStream; std::vector file_names; if (read_files_in_dir(argv[2], file_names) < 0) { std::cout << "read_files_in_dir failed." 
<< std::endl; return -1; } std::vector mean_value{0.406, 0.456, 0.485}; // BGR std::vector std_value{0.225, 0.224, 0.229}; int fcount = 0; float *data = new float[kMaxInputImageSize]; float *prob = new float[kMaxOutputProbSize]; for (auto f : file_names) { fcount++; cv::Mat src_img = cv::imread(std::string(argv[2]) + "/" + f); if (src_img.empty()) continue; int i = 0; for (int row = 0; row < src_img.rows; ++row) { uchar *uc_pixel = src_img.data + row * src_img.step; for (int col = 0; col < src_img.cols; ++col) { data[i] = (uc_pixel[2] / 255.0 - mean_value[2]) / std_value[2]; data[i + src_img.rows * src_img.cols] = (uc_pixel[1] / 255.0 - mean_value[1]) / std_value[1]; data[i + 2 * src_img.rows * src_img.cols] = (uc_pixel[0] / 255.0 - mean_value[0]) / std_value[0]; uc_pixel += 3; ++i; } } // Run inference auto start = std::chrono::system_clock::now(); doInference(*context, data, prob, src_img.rows, src_img.cols); auto end = std::chrono::system_clock::now(); std::cout << "detect time:" << std::chrono::duration_cast(end - start) .count() << "ms" << std::endl; float num = std::accumulate( prob, prob + ((src_img.rows * src_img.cols) >> 6), 0.0f); cv::Mat densityMap(src_img.rows >> 3, src_img.cols >> 3, CV_32FC1, (void *)prob); cv::Mat densityMapScaled; cv::normalize(densityMap, densityMapScaled, 0, 255, cv::NORM_MINMAX, CV_8UC1); cv::Mat densityColorMap; cv::applyColorMap(densityMapScaled, densityColorMap, cv::COLORMAP_VIRIDIS); cv::resize(densityColorMap, densityColorMap, src_img.size()); cv::addWeighted(densityColorMap, 0.5, src_img, 0.5, 0, src_img); // write to jpg cv::putText(src_img, std::string("people num: ") + std::to_string(num), cv::Point(10, 50), cv::FONT_HERSHEY_SIMPLEX, 0.5, cv::Scalar(255, 255, 255), 1); std::string write_path = std::string(argv[2]) + "result_" + f; std::cout << "people num :" << num << " write_path: " << write_path << std::endl; cv::imwrite(write_path, src_img); } delete[] data; delete[] prob; return 0; } 
================================================ FILE: csrnet/gen_wts.py ================================================ from torch.nn.modules import module from model import CSRNet import torch import os import struct save_path = os.path.join(os.path.dirname( __file__), "output", os.path.basename(__file__).split('.')[0]) os.makedirs(save_path, exist_ok=True) wts_file = os.path.join(save_path, "csrnet.wts") # load model model_path = "partBmodel_best.pth.tar" model = CSRNet() checkpoint = torch.load(model_path) model.load_state_dict(checkpoint['state_dict']) # save to wts print(f'Writing into {wts_file}') with open(wts_file, 'w') as f: f.write('{}\n'.format(len(model.state_dict().keys()))) for k, v in model.state_dict().items(): vr = v.reshape(-1).cpu().numpy() f.write('{} {} '.format(k, len(vr))) for vv in vr: f.write(' ') f.write(struct.pack('>f', float(vv)).hex()) f.write('\n') ================================================ FILE: csrnet/logging.h ================================================ /* * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
*/ #ifndef TENSORRT_LOGGING_H #define TENSORRT_LOGGING_H #include "NvInferRuntimeCommon.h" #include "macros.h" #include #include #include #include #include #include #include using Severity = nvinfer1::ILogger::Severity; class LogStreamConsumerBuffer : public std::stringbuf { public: LogStreamConsumerBuffer(std::ostream &stream, const std::string &prefix, bool shouldLog) : mOutput(stream), mPrefix(prefix), mShouldLog(shouldLog) {} LogStreamConsumerBuffer(LogStreamConsumerBuffer &&other) : mOutput(other.mOutput) {} ~LogStreamConsumerBuffer() { // std::streambuf::pbase() gives a pointer to the beginning of the buffered // part of the output sequence std::streambuf::pptr() gives a pointer to the // current position of the output sequence if the pointer to the beginning // is not equal to the pointer to the current position, call putOutput() to // log the output to the stream if (pbase() != pptr()) { putOutput(); } } // synchronizes the stream buffer and returns 0 on success // synchronizing the stream buffer consists of inserting the buffer contents // into the stream, resetting the buffer and flushing the stream virtual int sync() { putOutput(); return 0; } void putOutput() { if (mShouldLog) { // prepend timestamp std::time_t timestamp = std::time(nullptr); tm *tm_local = std::localtime(×tamp); std::cout << "["; std::cout << std::setw(2) << std::setfill('0') << 1 + tm_local->tm_mon << "/"; std::cout << std::setw(2) << std::setfill('0') << tm_local->tm_mday << "/"; std::cout << std::setw(4) << std::setfill('0') << 1900 + tm_local->tm_year << "-"; std::cout << std::setw(2) << std::setfill('0') << tm_local->tm_hour << ":"; std::cout << std::setw(2) << std::setfill('0') << tm_local->tm_min << ":"; std::cout << std::setw(2) << std::setfill('0') << tm_local->tm_sec << "] "; // std::stringbuf::str() gets the string contents of the buffer // insert the buffer contents pre-appended by the appropriate prefix into // the stream mOutput << mPrefix << str(); // set the buffer to 
empty str(""); // flush the stream mOutput.flush(); } } void setShouldLog(bool shouldLog) { mShouldLog = shouldLog; } private: std::ostream &mOutput; std::string mPrefix; bool mShouldLog; }; //! //! \class LogStreamConsumerBase //! \brief Convenience object used to initialize LogStreamConsumerBuffer before //! std::ostream in LogStreamConsumer //! class LogStreamConsumerBase { public: LogStreamConsumerBase(std::ostream &stream, const std::string &prefix, bool shouldLog) : mBuffer(stream, prefix, shouldLog) {} protected: LogStreamConsumerBuffer mBuffer; }; //! //! \class LogStreamConsumer //! \brief Convenience object used to facilitate use of C++ stream syntax when //! logging messages. //! Order of base classes is LogStreamConsumerBase and then std::ostream. //! This is because the LogStreamConsumerBase class is used to initialize the //! LogStreamConsumerBuffer member field in LogStreamConsumer and then the //! address of the buffer is passed to std::ostream. This is necessary to //! prevent the address of an uninitialized buffer from being passed to //! std::ostream. Please do not change the order of the parent classes. //! class LogStreamConsumer : protected LogStreamConsumerBase, public std::ostream { public: //! \brief Creates a LogStreamConsumer which logs messages with level //! severity. //! Reportable severity determines if the messages are severe enough to be //! logged. 
LogStreamConsumer(Severity reportableSeverity, Severity severity) : LogStreamConsumerBase(severityOstream(severity), severityPrefix(severity), severity <= reportableSeverity), std::ostream(&mBuffer) // links the stream buffer with the stream , mShouldLog(severity <= reportableSeverity), mSeverity(severity) {} LogStreamConsumer(LogStreamConsumer &&other) : LogStreamConsumerBase(severityOstream(other.mSeverity), severityPrefix(other.mSeverity), other.mShouldLog), std::ostream(&mBuffer) // links the stream buffer with the stream , mShouldLog(other.mShouldLog), mSeverity(other.mSeverity) {} void setReportableSeverity(Severity reportableSeverity) { mShouldLog = mSeverity <= reportableSeverity; mBuffer.setShouldLog(mShouldLog); } private: static std::ostream &severityOstream(Severity severity) { return severity >= Severity::kINFO ? std::cout : std::cerr; } static std::string severityPrefix(Severity severity) { switch (severity) { case Severity::kINTERNAL_ERROR: return "[F] "; case Severity::kERROR: return "[E] "; case Severity::kWARNING: return "[W] "; case Severity::kINFO: return "[I] "; case Severity::kVERBOSE: return "[V] "; default: assert(0); return ""; } } bool mShouldLog; Severity mSeverity; }; //! \class Logger //! //! \brief Class which manages logging of TensorRT tools and samples //! //! \details This class provides a common interface for TensorRT tools and //! samples to log information to the console, and supports logging two types of //! messages: //! //! - Debugging messages with an associated severity (info, warning, error, or //! internal error/fatal) //! - Test pass/fail messages //! //! The advantage of having all samples use this class for logging as opposed to //! emitting directly to stdout/stderr is that the logic for controlling the //! verbosity and formatting of sample output is centralized in one location. //! //! In the future, this class could be extended to support dumping test results //! 
to a file in some standard format (for example, JUnit XML), and providing //! additional metadata (e.g. timing the duration of a test run). //! //! TODO: For backwards compatibility with existing samples, this class inherits //! directly from the nvinfer1::ILogger interface, which is problematic since //! there isn't a clean separation between messages coming from the TensorRT //! library and messages coming from the sample. //! //! In the future (once all samples are updated to use Logger::getTRTLogger() to //! access the ILogger) we can refactor the class to eliminate the inheritance //! and instead make the nvinfer1::ILogger implementation a member of the Logger //! object. class Logger : public nvinfer1::ILogger { public: Logger(Severity severity = Severity::kWARNING) : mReportableSeverity(severity) {} //! //! \enum TestResult //! \brief Represents the state of a given test //! enum class TestResult { kRUNNING, //!< The test is running kPASSED, //!< The test passed kFAILED, //!< The test failed kWAIVED //!< The test was waived }; //! //! \brief Forward-compatible method for retrieving the nvinfer::ILogger //! associated with this Logger \return The nvinfer1::ILogger associated with //! this Logger //! //! TODO Once all samples are updated to use this method to register the //! logger with TensorRT, we can eliminate the inheritance of Logger from //! ILogger //! nvinfer1::ILogger &getTRTLogger() { return *this; } //! //! \brief Implementation of the nvinfer1::ILogger::log() virtual method //! //! Note samples should not be calling this function directly; it will //! eventually go away once we eliminate the inheritance from //! nvinfer1::ILogger //! void log(Severity severity, const char *msg) TRT_NOEXCEPT override { LogStreamConsumer(mReportableSeverity, severity) << "[TRT] " << std::string(msg) << std::endl; } //! //! \brief Method for controlling the verbosity of logging output //! //! 
\param severity The logger will only emit messages that have severity of //! this level or higher. //! void setReportableSeverity(Severity severity) { mReportableSeverity = severity; } //! //! \brief Opaque handle that holds logging information for a particular test //! //! This object is an opaque handle to information used by the Logger to print //! test results. The sample must call Logger::defineTest() in order to obtain //! a TestAtom that can be used with Logger::reportTest{Start,End}(). //! class TestAtom { public: TestAtom(TestAtom &&) = default; private: friend class Logger; TestAtom(bool started, const std::string &name, const std::string &cmdline) : mStarted(started), mName(name), mCmdline(cmdline) {} bool mStarted; std::string mName; std::string mCmdline; }; //! //! \brief Define a test for logging //! //! \param[in] name The name of the test. This should be a string starting //! with //! "TensorRT" and containing dot-separated strings //! containing the characters [A-Za-z0-9_]. For example, //! "TensorRT.sample_googlenet" //! \param[in] cmdline The command line used to reproduce the test // //! \return a TestAtom that can be used in Logger::reportTest{Start,End}(). //! static TestAtom defineTest(const std::string &name, const std::string &cmdline) { return TestAtom(false, name, cmdline); } //! //! \brief A convenience overloaded version of defineTest() that accepts an //! array of command-line arguments //! as input //! //! \param[in] name The name of the test //! \param[in] argc The number of command-line arguments //! \param[in] argv The array of command-line arguments (given as C strings) //! //! \return a TestAtom that can be used in Logger::reportTest{Start,End}(). static TestAtom defineTest(const std::string &name, int argc, char const *const *argv) { auto cmdline = genCmdlineString(argc, argv); return defineTest(name, cmdline); } //! //! \brief Report that a test has started. //! //! 
\pre reportTestStart() has not been called yet for the given testAtom //! //! \param[in] testAtom The handle to the test that has started //! static void reportTestStart(TestAtom &testAtom) { reportTestResult(testAtom, TestResult::kRUNNING); assert(!testAtom.mStarted); testAtom.mStarted = true; } //! //! \brief Report that a test has ended. //! //! \pre reportTestStart() has been called for the given testAtom //! //! \param[in] testAtom The handle to the test that has ended //! \param[in] result The result of the test. Should be one of //! TestResult::kPASSED, //! TestResult::kFAILED, TestResult::kWAIVED //! static void reportTestEnd(const TestAtom &testAtom, TestResult result) { assert(result != TestResult::kRUNNING); assert(testAtom.mStarted); reportTestResult(testAtom, result); } static int reportPass(const TestAtom &testAtom) { reportTestEnd(testAtom, TestResult::kPASSED); return EXIT_SUCCESS; } static int reportFail(const TestAtom &testAtom) { reportTestEnd(testAtom, TestResult::kFAILED); return EXIT_FAILURE; } static int reportWaive(const TestAtom &testAtom) { reportTestEnd(testAtom, TestResult::kWAIVED); return EXIT_SUCCESS; } static int reportTest(const TestAtom &testAtom, bool pass) { return pass ? reportPass(testAtom) : reportFail(testAtom); } Severity getReportableSeverity() const { return mReportableSeverity; } private: //! //! \brief returns an appropriate string for prefixing a log message with the //! given severity //! static const char *severityPrefix(Severity severity) { switch (severity) { case Severity::kINTERNAL_ERROR: return "[F] "; case Severity::kERROR: return "[E] "; case Severity::kWARNING: return "[W] "; case Severity::kINFO: return "[I] "; case Severity::kVERBOSE: return "[V] "; default: assert(0); return ""; } } //! //! \brief returns an appropriate string for prefixing a test result message //! with the given result //! 
static const char *testResultString(TestResult result) { switch (result) { case TestResult::kRUNNING: return "RUNNING"; case TestResult::kPASSED: return "PASSED"; case TestResult::kFAILED: return "FAILED"; case TestResult::kWAIVED: return "WAIVED"; default: assert(0); return ""; } } //! //! \brief returns an appropriate output stream (cout or cerr) to use with the //! given severity //! static std::ostream &severityOstream(Severity severity) { return severity >= Severity::kINFO ? std::cout : std::cerr; } //! //! \brief method that implements logging test results //! static void reportTestResult(const TestAtom &testAtom, TestResult result) { severityOstream(Severity::kINFO) << "&&&& " << testResultString(result) << " " << testAtom.mName << " # " << testAtom.mCmdline << std::endl; } //! //! \brief generate a command line string from the given (argc, argv) values //! static std::string genCmdlineString(int argc, char const *const *argv) { std::stringstream ss; for (int i = 0; i < argc; i++) { if (i > 0) ss << " "; ss << argv[i]; } return ss.str(); } Severity mReportableSeverity; }; namespace { //! //! \brief produces a LogStreamConsumer object that can be used to log messages //! of severity kVERBOSE //! //! Example usage: //! //! LOG_VERBOSE(logger) << "hello world" << std::endl; //! inline LogStreamConsumer LOG_VERBOSE(const Logger &logger) { return LogStreamConsumer(logger.getReportableSeverity(), Severity::kVERBOSE); } //! //! \brief produces a LogStreamConsumer object that can be used to log messages //! of severity kINFO //! //! Example usage: //! //! LOG_INFO(logger) << "hello world" << std::endl; //! inline LogStreamConsumer LOG_INFO(const Logger &logger) { return LogStreamConsumer(logger.getReportableSeverity(), Severity::kINFO); } //! //! \brief produces a LogStreamConsumer object that can be used to log messages //! of severity kWARNING //! //! Example usage: //! //! LOG_WARN(logger) << "hello world" << std::endl; //! 
inline LogStreamConsumer LOG_WARN(const Logger &logger) { return LogStreamConsumer(logger.getReportableSeverity(), Severity::kWARNING); } //! //! \brief produces a LogStreamConsumer object that can be used to log messages //! of severity kERROR //! //! Example usage: //! //! LOG_ERROR(logger) << "hello world" << std::endl; //! inline LogStreamConsumer LOG_ERROR(const Logger &logger) { return LogStreamConsumer(logger.getReportableSeverity(), Severity::kERROR); } //! //! \brief produces a LogStreamConsumer object that can be used to log messages //! of severity kINTERNAL_ERROR // ("fatal" severity) //! //! Example usage: //! //! LOG_FATAL(logger) << "hello world" << std::endl; //! inline LogStreamConsumer LOG_FATAL(const Logger &logger) { return LogStreamConsumer(logger.getReportableSeverity(), Severity::kINTERNAL_ERROR); } } // anonymous namespace #endif // TENSORRT_LOGGING_H ================================================ FILE: csrnet/macros.h ================================================ #ifndef __MACROS_H #define __MACROS_H #if NV_TENSORRT_MAJOR >= 8 #define TRT_NOEXCEPT noexcept #define TRT_CONST_ENQUEUE const #else #define TRT_NOEXCEPT #define TRT_CONST_ENQUEUE #endif #endif // __MACROS_H ================================================ FILE: dbnet/CMakeLists.txt ================================================ cmake_minimum_required(VERSION 2.6) project(dbnet) add_definitions(-std=c++11) option(CUDA_USE_STATIC_CUDA_RUNTIME OFF) set(CMAKE_CXX_STANDARD 11) set(CMAKE_BUILD_TYPE Debug) find_package(CUDA REQUIRED) include_directories(${PROJECT_SOURCE_DIR}/include) # cuda include_directories(/usr/local/cuda/include) link_directories(/usr/local/cuda/lib64) # tensorrt include_directories(/usr/include/x86_64-linux-gnu/) link_directories(/usr/lib/x86_64-linux-gnu/) find_package(OpenCV) include_directories(${OpenCV_INCLUDE_DIRS}) aux_source_directory(. 
DIRSRCS) # clipper include_directories(./ ./clipper) add_subdirectory(clipper) add_executable(dbnet ${DIRSRCS}) target_link_libraries(dbnet clipper) target_link_libraries(dbnet nvinfer) target_link_libraries(dbnet cudart) target_link_libraries(dbnet ${OpenCV_LIBS}) add_definitions(-O2 -pthread) ================================================ FILE: dbnet/README.md ================================================ # DBNet The Pytorch implementation is [DBNet](https://github.com/BaofengZan/DBNet.pytorch).

## How to Run * 1. Generate `.wts` Download the code and model from [DBNet](https://github.com/BaofengZan/DBNet.pytorch) and configure your environment. Open the file `tools/predict.py`, set `--save_wts` to `True`, then run it; `DBNet.wts` will be generated. An ONNX model can also be exported by setting `--onnx` to `True`. * 2. cmake and make ``` mkdir build cd build cmake .. make cp /your_wts_path/DBNet.wts . sudo ./dbnet -s // serialize model to plan file i.e. 'DBNet.engine' sudo ./dbnet -d ./test_imgs // deserialize plan file and run inference, all images in test_imgs folder will be processed. ``` ## For Windows https://github.com/BaofengZan/DBNet-TensorRT ## Todo - [x] 1. In `common.hpp`, the following two functions can be merged. ```c++ ILayer* convBnLeaky(INetworkDefinition *network, std::map<std::string, Weights>& weightMap, ITensor& input, int outch, int ksize, int s, int g, std::string lname, bool bias = true) ``` ```c++ ILayer* convBnLeaky2(INetworkDefinition *network, std::map<std::string, Weights>& weightMap, ITensor& input, int outch, int ksize, int s, int g, std::string lname, bool bias = true) ``` - [x] 2. The postprocess method here should be optimized; it is a little different from the PyTorch side. - [x] 3. The input image here is resized to `640 x 640` directly, while the PyTorch side uses the `letterbox` method. ================================================ FILE: dbnet/clipper/CMakeLists.txt ================================================ cmake_minimum_required(VERSION 2.6) aux_source_directory(. 
DIR_CLIPPER_SRCS) add_library(clipper ${DIR_CLIPPER_SRCS}) ================================================ FILE: dbnet/clipper/clipper.cpp ================================================ /******************************************************************************* * * * Author : Angus Johnson * * Version : 6.4.2 * * Date : 27 February 2017 * * Website : http://www.angusj.com * * Copyright : Angus Johnson 2010-2017 * * * * License: * * Use, modification & distribution is subject to Boost Software License Ver 1. * * http://www.boost.org/LICENSE_1_0.txt * * * * Attributions: * * The code in this library is an extension of Bala Vatti's clipping algorithm: * * "A generic solution to polygon clipping" * * Communications of the ACM, Vol 35, Issue 7 (July 1992) pp 56-63. * * http://portal.acm.org/citation.cfm?id=129906 * * * * Computer graphics and geometric modeling: implementation and algorithms * * By Max K. Agoston * * Springer; 1 edition (January 4, 2005) * * http://books.google.com/books?q=vatti+clipping+agoston * * * * See also: * * "Polygon Offsetting by Computing Winding Numbers" * * Paper no. DETC2005-85513 pp. 565-575 * * ASME 2005 International Design Engineering Technical Conferences * * and Computers and Information in Engineering Conference (IDETC/CIE2005) * * September 24-28, 2005 , Long Beach, California, USA * * http://www.me.berkeley.edu/~mcmains/pubs/DAC05OffsetPolygon.pdf * * * *******************************************************************************/ /******************************************************************************* * * * This is a translation of the Delphi Clipper library and the naming style * * used has retained a Delphi flavour. 
* * * *******************************************************************************/ #include "clipper.hpp" #include #include #include #include #include #include #include #include namespace ClipperLib { static double const pi = 3.141592653589793238; static double const two_pi = pi *2; static double const def_arc_tolerance = 0.25; enum Direction { dRightToLeft, dLeftToRight }; static int const Unassigned = -1; //edge not currently 'owning' a solution static int const Skip = -2; //edge that would otherwise close a path #define HORIZONTAL (-1.0E+40) #define TOLERANCE (1.0e-20) #define NEAR_ZERO(val) (((val) > -TOLERANCE) && ((val) < TOLERANCE)) struct TEdge { IntPoint Bot; IntPoint Curr; //current (updated for every new scanbeam) IntPoint Top; double Dx; PolyType PolyTyp; EdgeSide Side; //side only refers to current side of solution poly int WindDelta; //1 or -1 depending on winding direction int WindCnt; int WindCnt2; //winding count of the opposite polytype int OutIdx; TEdge *Next; TEdge *Prev; TEdge *NextInLML; TEdge *NextInAEL; TEdge *PrevInAEL; TEdge *NextInSEL; TEdge *PrevInSEL; }; struct IntersectNode { TEdge *Edge1; TEdge *Edge2; IntPoint Pt; }; struct LocalMinimum { cInt Y; TEdge *LeftBound; TEdge *RightBound; }; struct OutPt; //OutRec: contains a path in the clipping solution. Edges in the AEL will //carry a pointer to an OutRec when they are part of the clipping solution. 
struct OutRec { int Idx; bool IsHole; bool IsOpen; OutRec *FirstLeft; //see comments in clipper.pas PolyNode *PolyNd; OutPt *Pts; OutPt *BottomPt; }; struct OutPt { int Idx; IntPoint Pt; OutPt *Next; OutPt *Prev; }; struct Join { OutPt *OutPt1; OutPt *OutPt2; IntPoint OffPt; }; struct LocMinSorter { inline bool operator()(const LocalMinimum& locMin1, const LocalMinimum& locMin2) { return locMin2.Y < locMin1.Y; } }; //------------------------------------------------------------------------------ //------------------------------------------------------------------------------ inline cInt Round(double val) { if ((val < 0)) return static_cast(val - 0.5); else return static_cast(val + 0.5); } //------------------------------------------------------------------------------ inline cInt Abs(cInt val) { return val < 0 ? -val : val; } //------------------------------------------------------------------------------ // PolyTree methods ... //------------------------------------------------------------------------------ void PolyTree::Clear() { for (PolyNodes::size_type i = 0; i < AllNodes.size(); ++i) delete AllNodes[i]; AllNodes.resize(0); Childs.resize(0); } //------------------------------------------------------------------------------ PolyNode* PolyTree::GetFirst() const { if (!Childs.empty()) return Childs[0]; else return 0; } //------------------------------------------------------------------------------ int PolyTree::Total() const { int result = (int)AllNodes.size(); //with negative offsets, ignore the hidden outer polygon ... if (result > 0 && Childs[0] != AllNodes[0]) result--; return result; } //------------------------------------------------------------------------------ // PolyNode methods ... 
//------------------------------------------------------------------------------ PolyNode::PolyNode(): Parent(0), Index(0), m_IsOpen(false) { } //------------------------------------------------------------------------------ int PolyNode::ChildCount() const { return (int)Childs.size(); } //------------------------------------------------------------------------------ void PolyNode::AddChild(PolyNode& child) { unsigned cnt = (unsigned)Childs.size(); Childs.push_back(&child); child.Parent = this; child.Index = cnt; } //------------------------------------------------------------------------------ PolyNode* PolyNode::GetNext() const { if (!Childs.empty()) return Childs[0]; else return GetNextSiblingUp(); } //------------------------------------------------------------------------------ PolyNode* PolyNode::GetNextSiblingUp() const { if (!Parent) //protects against PolyTree.GetNextSiblingUp() return 0; else if (Index == Parent->Childs.size() - 1) return Parent->GetNextSiblingUp(); else return Parent->Childs[Index + 1]; } //------------------------------------------------------------------------------ bool PolyNode::IsHole() const { bool result = true; PolyNode* node = Parent; while (node) { result = !result; node = node->Parent; } return result; } //------------------------------------------------------------------------------ bool PolyNode::IsOpen() const { return m_IsOpen; } //------------------------------------------------------------------------------ #ifndef use_int32 //------------------------------------------------------------------------------ // Int128 class (enables safe math on signed 64bit integers) // eg Int128 val1((long64)9223372036854775807); //ie 2^63 -1 // Int128 val2((long64)9223372036854775807); // Int128 val3 = val1 * val2; // val3.AsString => "85070591730234615847396907784232501249" (8.5e+37) //------------------------------------------------------------------------------ class Int128 { public: ulong64 lo; long64 hi; Int128(long64 _lo = 0) { lo = 
(ulong64)_lo; if (_lo < 0) hi = -1; else hi = 0; } Int128(const Int128 &val): lo(val.lo), hi(val.hi){} Int128(const long64& _hi, const ulong64& _lo): lo(_lo), hi(_hi){} Int128& operator = (const long64 &val) { lo = (ulong64)val; if (val < 0) hi = -1; else hi = 0; return *this; } bool operator == (const Int128 &val) const {return (hi == val.hi && lo == val.lo);} bool operator != (const Int128 &val) const { return !(*this == val);} bool operator > (const Int128 &val) const { if (hi != val.hi) return hi > val.hi; else return lo > val.lo; } bool operator < (const Int128 &val) const { if (hi != val.hi) return hi < val.hi; else return lo < val.lo; } bool operator >= (const Int128 &val) const { return !(*this < val);} bool operator <= (const Int128 &val) const { return !(*this > val);} Int128& operator += (const Int128 &rhs) { hi += rhs.hi; lo += rhs.lo; if (lo < rhs.lo) hi++; return *this; } Int128 operator + (const Int128 &rhs) const { Int128 result(*this); result+= rhs; return result; } Int128& operator -= (const Int128 &rhs) { *this += -rhs; return *this; } Int128 operator - (const Int128 &rhs) const { Int128 result(*this); result -= rhs; return result; } Int128 operator-() const //unary negation { if (lo == 0) return Int128(-hi, 0); else return Int128(~hi, ~lo + 1); } operator double() const { const double shift64 = 18446744073709551616.0; //2^64 if (hi < 0) { if (lo == 0) return (double)hi * shift64; else return -(double)(~lo + ~hi * shift64); } else return (double)(lo + hi * shift64); } }; //------------------------------------------------------------------------------ Int128 Int128Mul (long64 lhs, long64 rhs) { bool negate = (lhs < 0) != (rhs < 0); if (lhs < 0) lhs = -lhs; ulong64 int1Hi = ulong64(lhs) >> 32; ulong64 int1Lo = ulong64(lhs & 0xFFFFFFFF); if (rhs < 0) rhs = -rhs; ulong64 int2Hi = ulong64(rhs) >> 32; ulong64 int2Lo = ulong64(rhs & 0xFFFFFFFF); //nb: see comments in clipper.pas ulong64 a = int1Hi * int2Hi; ulong64 b = int1Lo * int2Lo; ulong64 c = 
int1Hi * int2Lo + int1Lo * int2Hi; Int128 tmp; tmp.hi = long64(a + (c >> 32)); tmp.lo = long64(c << 32); tmp.lo += long64(b); if (tmp.lo < b) tmp.hi++; if (negate) tmp = -tmp; return tmp; }; #endif //------------------------------------------------------------------------------ // Miscellaneous global functions //------------------------------------------------------------------------------ bool Orientation(const Path &poly) { return Area(poly) >= 0; } //------------------------------------------------------------------------------ double Area(const Path &poly) { int size = (int)poly.size(); if (size < 3) return 0; double a = 0; for (int i = 0, j = size -1; i < size; ++i) { a += ((double)poly[j].X + poly[i].X) * ((double)poly[j].Y - poly[i].Y); j = i; } return -a * 0.5; } //------------------------------------------------------------------------------ double Area(const OutPt *op) { const OutPt *startOp = op; if (!op) return 0; double a = 0; do { a += (double)(op->Prev->Pt.X + op->Pt.X) * (double)(op->Prev->Pt.Y - op->Pt.Y); op = op->Next; } while (op != startOp); return a * 0.5; } //------------------------------------------------------------------------------ double Area(const OutRec &outRec) { return Area(outRec.Pts); } //------------------------------------------------------------------------------ bool PointIsVertex(const IntPoint &Pt, OutPt *pp) { OutPt *pp2 = pp; do { if (pp2->Pt == Pt) return true; pp2 = pp2->Next; } while (pp2 != pp); return false; } //------------------------------------------------------------------------------ //See "The Point in Polygon Problem for Arbitrary Polygons" by Hormann & Agathos //http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.88.5498&rep=rep1&type=pdf int PointInPolygon(const IntPoint &pt, const Path &path) { //returns 0 if false, +1 if true, -1 if pt ON polygon boundary int result = 0; size_t cnt = path.size(); if (cnt < 3) return 0; IntPoint ip = path[0]; for(size_t i = 1; i <= cnt; ++i) { IntPoint ipNext = (i == 
cnt ? path[0] : path[i]); if (ipNext.Y == pt.Y) { if ((ipNext.X == pt.X) || (ip.Y == pt.Y && ((ipNext.X > pt.X) == (ip.X < pt.X)))) return -1; } if ((ip.Y < pt.Y) != (ipNext.Y < pt.Y)) { if (ip.X >= pt.X) { if (ipNext.X > pt.X) result = 1 - result; else { double d = (double)(ip.X - pt.X) * (ipNext.Y - pt.Y) - (double)(ipNext.X - pt.X) * (ip.Y - pt.Y); if (!d) return -1; if ((d > 0) == (ipNext.Y > ip.Y)) result = 1 - result; } } else { if (ipNext.X > pt.X) { double d = (double)(ip.X - pt.X) * (ipNext.Y - pt.Y) - (double)(ipNext.X - pt.X) * (ip.Y - pt.Y); if (!d) return -1; if ((d > 0) == (ipNext.Y > ip.Y)) result = 1 - result; } } } ip = ipNext; } return result; } //------------------------------------------------------------------------------ int PointInPolygon (const IntPoint &pt, OutPt *op) { //returns 0 if false, +1 if true, -1 if pt ON polygon boundary int result = 0; OutPt* startOp = op; for(;;) { if (op->Next->Pt.Y == pt.Y) { if ((op->Next->Pt.X == pt.X) || (op->Pt.Y == pt.Y && ((op->Next->Pt.X > pt.X) == (op->Pt.X < pt.X)))) return -1; } if ((op->Pt.Y < pt.Y) != (op->Next->Pt.Y < pt.Y)) { if (op->Pt.X >= pt.X) { if (op->Next->Pt.X > pt.X) result = 1 - result; else { double d = (double)(op->Pt.X - pt.X) * (op->Next->Pt.Y - pt.Y) - (double)(op->Next->Pt.X - pt.X) * (op->Pt.Y - pt.Y); if (!d) return -1; if ((d > 0) == (op->Next->Pt.Y > op->Pt.Y)) result = 1 - result; } } else { if (op->Next->Pt.X > pt.X) { double d = (double)(op->Pt.X - pt.X) * (op->Next->Pt.Y - pt.Y) - (double)(op->Next->Pt.X - pt.X) * (op->Pt.Y - pt.Y); if (!d) return -1; if ((d > 0) == (op->Next->Pt.Y > op->Pt.Y)) result = 1 - result; } } } op = op->Next; if (startOp == op) break; } return result; } //------------------------------------------------------------------------------ bool Poly2ContainsPoly1(OutPt *OutPt1, OutPt *OutPt2) { OutPt* op = OutPt1; do { //nb: PointInPolygon returns 0 if false, +1 if true, -1 if pt on polygon int res = PointInPolygon(op->Pt, OutPt2); if (res >= 0) return 
res > 0; op = op->Next; } while (op != OutPt1); return true; } //---------------------------------------------------------------------- bool SlopesEqual(const TEdge &e1, const TEdge &e2, bool UseFullInt64Range) { #ifndef use_int32 if (UseFullInt64Range) return Int128Mul(e1.Top.Y - e1.Bot.Y, e2.Top.X - e2.Bot.X) == Int128Mul(e1.Top.X - e1.Bot.X, e2.Top.Y - e2.Bot.Y); else #endif return (e1.Top.Y - e1.Bot.Y) * (e2.Top.X - e2.Bot.X) == (e1.Top.X - e1.Bot.X) * (e2.Top.Y - e2.Bot.Y); } //------------------------------------------------------------------------------ bool SlopesEqual(const IntPoint pt1, const IntPoint pt2, const IntPoint pt3, bool UseFullInt64Range) { #ifndef use_int32 if (UseFullInt64Range) return Int128Mul(pt1.Y-pt2.Y, pt2.X-pt3.X) == Int128Mul(pt1.X-pt2.X, pt2.Y-pt3.Y); else #endif return (pt1.Y-pt2.Y)*(pt2.X-pt3.X) == (pt1.X-pt2.X)*(pt2.Y-pt3.Y); } //------------------------------------------------------------------------------ bool SlopesEqual(const IntPoint pt1, const IntPoint pt2, const IntPoint pt3, const IntPoint pt4, bool UseFullInt64Range) { #ifndef use_int32 if (UseFullInt64Range) return Int128Mul(pt1.Y-pt2.Y, pt3.X-pt4.X) == Int128Mul(pt1.X-pt2.X, pt3.Y-pt4.Y); else #endif return (pt1.Y-pt2.Y)*(pt3.X-pt4.X) == (pt1.X-pt2.X)*(pt3.Y-pt4.Y); } //------------------------------------------------------------------------------ inline bool IsHorizontal(TEdge &e) { return e.Dx == HORIZONTAL; } //------------------------------------------------------------------------------ inline double GetDx(const IntPoint pt1, const IntPoint pt2) { return (pt1.Y == pt2.Y) ? 
HORIZONTAL : (double)(pt2.X - pt1.X) / (pt2.Y - pt1.Y); } //--------------------------------------------------------------------------- inline void SetDx(TEdge &e) { cInt dy = (e.Top.Y - e.Bot.Y); if (dy == 0) e.Dx = HORIZONTAL; else e.Dx = (double)(e.Top.X - e.Bot.X) / dy; } //--------------------------------------------------------------------------- inline void SwapSides(TEdge &Edge1, TEdge &Edge2) { EdgeSide Side = Edge1.Side; Edge1.Side = Edge2.Side; Edge2.Side = Side; } //------------------------------------------------------------------------------ inline void SwapPolyIndexes(TEdge &Edge1, TEdge &Edge2) { int OutIdx = Edge1.OutIdx; Edge1.OutIdx = Edge2.OutIdx; Edge2.OutIdx = OutIdx; } //------------------------------------------------------------------------------ inline cInt TopX(TEdge &edge, const cInt currentY) { return ( currentY == edge.Top.Y ) ? edge.Top.X : edge.Bot.X + Round(edge.Dx *(currentY - edge.Bot.Y)); } //------------------------------------------------------------------------------ void IntersectPoint(TEdge &Edge1, TEdge &Edge2, IntPoint &ip) { #ifdef use_xyz ip.Z = 0; #endif double b1, b2; if (Edge1.Dx == Edge2.Dx) { ip.Y = Edge1.Curr.Y; ip.X = TopX(Edge1, ip.Y); return; } else if (Edge1.Dx == 0) { ip.X = Edge1.Bot.X; if (IsHorizontal(Edge2)) ip.Y = Edge2.Bot.Y; else { b2 = Edge2.Bot.Y - (Edge2.Bot.X / Edge2.Dx); ip.Y = Round(ip.X / Edge2.Dx + b2); } } else if (Edge2.Dx == 0) { ip.X = Edge2.Bot.X; if (IsHorizontal(Edge1)) ip.Y = Edge1.Bot.Y; else { b1 = Edge1.Bot.Y - (Edge1.Bot.X / Edge1.Dx); ip.Y = Round(ip.X / Edge1.Dx + b1); } } else { b1 = Edge1.Bot.X - Edge1.Bot.Y * Edge1.Dx; b2 = Edge2.Bot.X - Edge2.Bot.Y * Edge2.Dx; double q = (b2-b1) / (Edge1.Dx - Edge2.Dx); ip.Y = Round(q); if (std::fabs(Edge1.Dx) < std::fabs(Edge2.Dx)) ip.X = Round(Edge1.Dx * q + b1); else ip.X = Round(Edge2.Dx * q + b2); } if (ip.Y < Edge1.Top.Y || ip.Y < Edge2.Top.Y) { if (Edge1.Top.Y > Edge2.Top.Y) ip.Y = Edge1.Top.Y; else ip.Y = Edge2.Top.Y; if 
(std::fabs(Edge1.Dx) < std::fabs(Edge2.Dx)) ip.X = TopX(Edge1, ip.Y); else ip.X = TopX(Edge2, ip.Y); } //finally, don't allow 'ip' to be BELOW curr.Y (ie bottom of scanbeam) ... if (ip.Y > Edge1.Curr.Y) { ip.Y = Edge1.Curr.Y; //use the more vertical edge to derive X ... if (std::fabs(Edge1.Dx) > std::fabs(Edge2.Dx)) ip.X = TopX(Edge2, ip.Y); else ip.X = TopX(Edge1, ip.Y); } } //------------------------------------------------------------------------------ void ReversePolyPtLinks(OutPt *pp) { if (!pp) return; OutPt *pp1, *pp2; pp1 = pp; do { pp2 = pp1->Next; pp1->Next = pp1->Prev; pp1->Prev = pp2; pp1 = pp2; } while( pp1 != pp ); } //------------------------------------------------------------------------------ void DisposeOutPts(OutPt*& pp) { if (pp == 0) return; pp->Prev->Next = 0; while( pp ) { OutPt *tmpPp = pp; pp = pp->Next; delete tmpPp; } } //------------------------------------------------------------------------------ inline void InitEdge(TEdge* e, TEdge* eNext, TEdge* ePrev, const IntPoint& Pt) { std::memset(e, 0, sizeof(TEdge)); e->Next = eNext; e->Prev = ePrev; e->Curr = Pt; e->OutIdx = Unassigned; } //------------------------------------------------------------------------------ void InitEdge2(TEdge& e, PolyType Pt) { if (e.Curr.Y >= e.Next->Curr.Y) { e.Bot = e.Curr; e.Top = e.Next->Curr; } else { e.Top = e.Curr; e.Bot = e.Next->Curr; } SetDx(e); e.PolyTyp = Pt; } //------------------------------------------------------------------------------ TEdge* RemoveEdge(TEdge* e) { //removes e from double_linked_list (but without removing from memory) e->Prev->Next = e->Next; e->Next->Prev = e->Prev; TEdge* result = e->Next; e->Prev = 0; //flag as removed (see ClipperBase.Clear) return result; } //------------------------------------------------------------------------------ inline void ReverseHorizontal(TEdge &e) { //swap horizontal edges' Top and Bottom x's so they follow the natural //progression of the bounds - ie so their xbots will align with the 
//adjoining lower edge. [Helpful in the ProcessHorizontal() method.] std::swap(e.Top.X, e.Bot.X); #ifdef use_xyz std::swap(e.Top.Z, e.Bot.Z); #endif } //------------------------------------------------------------------------------ void SwapPoints(IntPoint &pt1, IntPoint &pt2) { IntPoint tmp = pt1; pt1 = pt2; pt2 = tmp; } //------------------------------------------------------------------------------ bool GetOverlapSegment(IntPoint pt1a, IntPoint pt1b, IntPoint pt2a, IntPoint pt2b, IntPoint &pt1, IntPoint &pt2) { //precondition: segments are Collinear. if (Abs(pt1a.X - pt1b.X) > Abs(pt1a.Y - pt1b.Y)) { if (pt1a.X > pt1b.X) SwapPoints(pt1a, pt1b); if (pt2a.X > pt2b.X) SwapPoints(pt2a, pt2b); if (pt1a.X > pt2a.X) pt1 = pt1a; else pt1 = pt2a; if (pt1b.X < pt2b.X) pt2 = pt1b; else pt2 = pt2b; return pt1.X < pt2.X; } else { if (pt1a.Y < pt1b.Y) SwapPoints(pt1a, pt1b); if (pt2a.Y < pt2b.Y) SwapPoints(pt2a, pt2b); if (pt1a.Y < pt2a.Y) pt1 = pt1a; else pt1 = pt2a; if (pt1b.Y > pt2b.Y) pt2 = pt1b; else pt2 = pt2b; return pt1.Y > pt2.Y; } } //------------------------------------------------------------------------------ bool FirstIsBottomPt(const OutPt* btmPt1, const OutPt* btmPt2) { OutPt *p = btmPt1->Prev; while ((p->Pt == btmPt1->Pt) && (p != btmPt1)) p = p->Prev; double dx1p = std::fabs(GetDx(btmPt1->Pt, p->Pt)); p = btmPt1->Next; while ((p->Pt == btmPt1->Pt) && (p != btmPt1)) p = p->Next; double dx1n = std::fabs(GetDx(btmPt1->Pt, p->Pt)); p = btmPt2->Prev; while ((p->Pt == btmPt2->Pt) && (p != btmPt2)) p = p->Prev; double dx2p = std::fabs(GetDx(btmPt2->Pt, p->Pt)); p = btmPt2->Next; while ((p->Pt == btmPt2->Pt) && (p != btmPt2)) p = p->Next; double dx2n = std::fabs(GetDx(btmPt2->Pt, p->Pt)); if (std::max(dx1p, dx1n) == std::max(dx2p, dx2n) && std::min(dx1p, dx1n) == std::min(dx2p, dx2n)) return Area(btmPt1) > 0; //if otherwise identical use orientation else return (dx1p >= dx2p && dx1p >= dx2n) || (dx1n >= dx2p && dx1n >= dx2n); } 
//------------------------------------------------------------------------------ OutPt* GetBottomPt(OutPt *pp) { OutPt* dups = 0; OutPt* p = pp->Next; while (p != pp) { if (p->Pt.Y > pp->Pt.Y) { pp = p; dups = 0; } else if (p->Pt.Y == pp->Pt.Y && p->Pt.X <= pp->Pt.X) { if (p->Pt.X < pp->Pt.X) { dups = 0; pp = p; } else { if (p->Next != pp && p->Prev != pp) dups = p; } } p = p->Next; } if (dups) { //there appears to be at least 2 vertices at BottomPt so ... while (dups != p) { if (!FirstIsBottomPt(p, dups)) pp = dups; dups = dups->Next; while (dups->Pt != pp->Pt) dups = dups->Next; } } return pp; } //------------------------------------------------------------------------------ bool Pt2IsBetweenPt1AndPt3(const IntPoint pt1, const IntPoint pt2, const IntPoint pt3) { if ((pt1 == pt3) || (pt1 == pt2) || (pt3 == pt2)) return false; else if (pt1.X != pt3.X) return (pt2.X > pt1.X) == (pt2.X < pt3.X); else return (pt2.Y > pt1.Y) == (pt2.Y < pt3.Y); } //------------------------------------------------------------------------------ bool HorzSegmentsOverlap(cInt seg1a, cInt seg1b, cInt seg2a, cInt seg2b) { if (seg1a > seg1b) std::swap(seg1a, seg1b); if (seg2a > seg2b) std::swap(seg2a, seg2b); return (seg1a < seg2b) && (seg2a < seg1b); } //------------------------------------------------------------------------------ // ClipperBase class methods ... 
//------------------------------------------------------------------------------ ClipperBase::ClipperBase() //constructor { m_CurrentLM = m_MinimaList.begin(); //begin() == end() here m_UseFullRange = false; } //------------------------------------------------------------------------------ ClipperBase::~ClipperBase() //destructor { Clear(); } //------------------------------------------------------------------------------ void RangeTest(const IntPoint& Pt, bool& useFullRange) { if (useFullRange) { if (Pt.X > hiRange || Pt.Y > hiRange || -Pt.X > hiRange || -Pt.Y > hiRange) throw clipperException("Coordinate outside allowed range"); } else if (Pt.X > loRange|| Pt.Y > loRange || -Pt.X > loRange || -Pt.Y > loRange) { useFullRange = true; RangeTest(Pt, useFullRange); } } //------------------------------------------------------------------------------ TEdge* FindNextLocMin(TEdge* E) { for (;;) { while (E->Bot != E->Prev->Bot || E->Curr == E->Top) E = E->Next; if (!IsHorizontal(*E) && !IsHorizontal(*E->Prev)) break; while (IsHorizontal(*E->Prev)) E = E->Prev; TEdge* E2 = E; while (IsHorizontal(*E)) E = E->Next; if (E->Top.Y == E->Prev->Bot.Y) continue; //ie just an intermediate horz. if (E2->Prev->Bot.X < E->Bot.X) E = E2; break; } return E; } //------------------------------------------------------------------------------ TEdge* ClipperBase::ProcessBound(TEdge* E, bool NextIsForward) { TEdge *Result = E; TEdge *Horz = 0; if (E->OutIdx == Skip) { //if edges still remain in the current bound beyond the skip edge then //create another LocMin and call ProcessBound once more if (NextIsForward) { while (E->Top.Y == E->Next->Bot.Y) E = E->Next; //don't include top horizontals when parsing a bound a second time, //they will be contained in the opposite bound ... 
while (E != Result && IsHorizontal(*E)) E = E->Prev; } else { while (E->Top.Y == E->Prev->Bot.Y) E = E->Prev; while (E != Result && IsHorizontal(*E)) E = E->Next; } if (E == Result) { if (NextIsForward) Result = E->Next; else Result = E->Prev; } else { //there are more edges in the bound beyond result starting with E if (NextIsForward) E = Result->Next; else E = Result->Prev; MinimaList::value_type locMin; locMin.Y = E->Bot.Y; locMin.LeftBound = 0; locMin.RightBound = E; E->WindDelta = 0; Result = ProcessBound(E, NextIsForward); m_MinimaList.push_back(locMin); } return Result; } TEdge *EStart; if (IsHorizontal(*E)) { //We need to be careful with open paths because this may not be a //true local minima (ie E may be following a skip edge). //Also, consecutive horz. edges may start heading left before going right. if (NextIsForward) EStart = E->Prev; else EStart = E->Next; if (IsHorizontal(*EStart)) //ie an adjoining horizontal skip edge { if (EStart->Bot.X != E->Bot.X && EStart->Top.X != E->Bot.X) ReverseHorizontal(*E); } else if (EStart->Bot.X != E->Bot.X) ReverseHorizontal(*E); } EStart = E; if (NextIsForward) { while (Result->Top.Y == Result->Next->Bot.Y && Result->Next->OutIdx != Skip) Result = Result->Next; if (IsHorizontal(*Result) && Result->Next->OutIdx != Skip) { //nb: at the top of a bound, horizontals are added to the bound //only when the preceding edge attaches to the horizontal's left vertex //unless a Skip edge is encountered when that becomes the top divide Horz = Result; while (IsHorizontal(*Horz->Prev)) Horz = Horz->Prev; if (Horz->Prev->Top.X > Result->Next->Top.X) Result = Horz->Prev; } while (E != Result) { E->NextInLML = E->Next; if (IsHorizontal(*E) && E != EStart && E->Bot.X != E->Prev->Top.X) ReverseHorizontal(*E); E = E->Next; } if (IsHorizontal(*E) && E != EStart && E->Bot.X != E->Prev->Top.X) ReverseHorizontal(*E); Result = Result->Next; //move to the edge just beyond current bound } else { while (Result->Top.Y == Result->Prev->Bot.Y && 
Result->Prev->OutIdx != Skip) Result = Result->Prev; if (IsHorizontal(*Result) && Result->Prev->OutIdx != Skip) { Horz = Result; while (IsHorizontal(*Horz->Next)) Horz = Horz->Next; if (Horz->Next->Top.X == Result->Prev->Top.X || Horz->Next->Top.X > Result->Prev->Top.X) Result = Horz->Next; } while (E != Result) { E->NextInLML = E->Prev; if (IsHorizontal(*E) && E != EStart && E->Bot.X != E->Next->Top.X) ReverseHorizontal(*E); E = E->Prev; } if (IsHorizontal(*E) && E != EStart && E->Bot.X != E->Next->Top.X) ReverseHorizontal(*E); Result = Result->Prev; //move to the edge just beyond current bound } return Result; } //------------------------------------------------------------------------------ bool ClipperBase::AddPath(const Path &pg, PolyType PolyTyp, bool Closed) { #ifdef use_lines if (!Closed && PolyTyp == ptClip) throw clipperException("AddPath: Open paths must be subject."); #else if (!Closed) throw clipperException("AddPath: Open paths have been disabled."); #endif int highI = (int)pg.size() -1; if (Closed) while (highI > 0 && (pg[highI] == pg[0])) --highI; while (highI > 0 && (pg[highI] == pg[highI -1])) --highI; if ((Closed && highI < 2) || (!Closed && highI < 1)) return false; //create a new edge array ... TEdge *edges = new TEdge [highI +1]; bool IsFlat = true; //1. Basic (first) edge initialization ... try { edges[1].Curr = pg[1]; RangeTest(pg[0], m_UseFullRange); RangeTest(pg[highI], m_UseFullRange); InitEdge(&edges[0], &edges[1], &edges[highI], pg[0]); InitEdge(&edges[highI], &edges[0], &edges[highI-1], pg[highI]); for (int i = highI - 1; i >= 1; --i) { RangeTest(pg[i], m_UseFullRange); InitEdge(&edges[i], &edges[i+1], &edges[i-1], pg[i]); } } catch(...) { delete [] edges; throw; //range test fails } TEdge *eStart = &edges[0]; //2. Remove duplicate vertices, and (when closed) collinear edges ... TEdge *E = eStart, *eLoopStop = eStart; for (;;) { //nb: allows matching start and end points when not Closed ... 
if (E->Curr == E->Next->Curr && (Closed || E->Next != eStart)) { if (E == E->Next) break; if (E == eStart) eStart = E->Next; E = RemoveEdge(E); eLoopStop = E; continue; } if (E->Prev == E->Next) break; //only two vertices else if (Closed && SlopesEqual(E->Prev->Curr, E->Curr, E->Next->Curr, m_UseFullRange) && (!m_PreserveCollinear || !Pt2IsBetweenPt1AndPt3(E->Prev->Curr, E->Curr, E->Next->Curr))) { //Collinear edges are allowed for open paths but in closed paths //the default is to merge adjacent collinear edges into a single edge. //However, if the PreserveCollinear property is enabled, only overlapping //collinear edges (ie spikes) will be removed from closed paths. if (E == eStart) eStart = E->Next; E = RemoveEdge(E); E = E->Prev; eLoopStop = E; continue; } E = E->Next; if ((E == eLoopStop) || (!Closed && E->Next == eStart)) break; } if ((!Closed && (E == E->Next)) || (Closed && (E->Prev == E->Next))) { delete [] edges; return false; } if (!Closed) { m_HasOpenPaths = true; eStart->Prev->OutIdx = Skip; } //3. Do second stage of edge initialization ... E = eStart; do { InitEdge2(*E, PolyTyp); E = E->Next; if (IsFlat && E->Curr.Y != eStart->Curr.Y) IsFlat = false; } while (E != eStart); //4. Finally, add edge bounds to LocalMinima list ... //Totally flat paths must be handled differently when adding them //to LocalMinima list to avoid endless loops etc ... 
if (IsFlat) { if (Closed) { delete [] edges; return false; } E->Prev->OutIdx = Skip; MinimaList::value_type locMin; locMin.Y = E->Bot.Y; locMin.LeftBound = 0; locMin.RightBound = E; locMin.RightBound->Side = esRight; locMin.RightBound->WindDelta = 0; for (;;) { if (E->Bot.X != E->Prev->Top.X) ReverseHorizontal(*E); if (E->Next->OutIdx == Skip) break; E->NextInLML = E->Next; E = E->Next; } m_MinimaList.push_back(locMin); m_edges.push_back(edges); return true; } m_edges.push_back(edges); bool leftBoundIsForward; TEdge* EMin = 0; //workaround to avoid an endless loop in the while loop below when //open paths have matching start and end points ... if (E->Prev->Bot == E->Prev->Top) E = E->Next; for (;;) { E = FindNextLocMin(E); if (E == EMin) break; else if (!EMin) EMin = E; //E and E.Prev now share a local minima (left aligned if horizontal). //Compare their slopes to find which starts which bound ... MinimaList::value_type locMin; locMin.Y = E->Bot.Y; if (E->Dx < E->Prev->Dx) { locMin.LeftBound = E->Prev; locMin.RightBound = E; leftBoundIsForward = false; //Q.nextInLML = Q.prev } else { locMin.LeftBound = E; locMin.RightBound = E->Prev; leftBoundIsForward = true; //Q.nextInLML = Q.next } if (!Closed) locMin.LeftBound->WindDelta = 0; else if (locMin.LeftBound->Next == locMin.RightBound) locMin.LeftBound->WindDelta = -1; else locMin.LeftBound->WindDelta = 1; locMin.RightBound->WindDelta = -locMin.LeftBound->WindDelta; E = ProcessBound(locMin.LeftBound, leftBoundIsForward); if (E->OutIdx == Skip) E = ProcessBound(E, leftBoundIsForward); TEdge* E2 = ProcessBound(locMin.RightBound, !leftBoundIsForward); if (E2->OutIdx == Skip) E2 = ProcessBound(E2, !leftBoundIsForward); if (locMin.LeftBound->OutIdx == Skip) locMin.LeftBound = 0; else if (locMin.RightBound->OutIdx == Skip) locMin.RightBound = 0; m_MinimaList.push_back(locMin); if (!leftBoundIsForward) E = E2; } return true; } //------------------------------------------------------------------------------ bool 
ClipperBase::AddPaths(const Paths &ppg, PolyType PolyTyp, bool Closed) { bool result = false; for (Paths::size_type i = 0; i < ppg.size(); ++i) if (AddPath(ppg[i], PolyTyp, Closed)) result = true; return result; } //------------------------------------------------------------------------------ void ClipperBase::Clear() { DisposeLocalMinimaList(); for (EdgeList::size_type i = 0; i < m_edges.size(); ++i) { TEdge* edges = m_edges[i]; delete [] edges; } m_edges.clear(); m_UseFullRange = false; m_HasOpenPaths = false; } //------------------------------------------------------------------------------ void ClipperBase::Reset() { m_CurrentLM = m_MinimaList.begin(); if (m_CurrentLM == m_MinimaList.end()) return; //ie nothing to process std::sort(m_MinimaList.begin(), m_MinimaList.end(), LocMinSorter()); m_Scanbeam = ScanbeamList(); //clears/resets priority_queue //reset all edges ... for (MinimaList::iterator lm = m_MinimaList.begin(); lm != m_MinimaList.end(); ++lm) { InsertScanbeam(lm->Y); TEdge* e = lm->LeftBound; if (e) { e->Curr = e->Bot; e->Side = esLeft; e->OutIdx = Unassigned; } e = lm->RightBound; if (e) { e->Curr = e->Bot; e->Side = esRight; e->OutIdx = Unassigned; } } m_ActiveEdges = 0; m_CurrentLM = m_MinimaList.begin(); } //------------------------------------------------------------------------------ void ClipperBase::DisposeLocalMinimaList() { m_MinimaList.clear(); m_CurrentLM = m_MinimaList.begin(); } //------------------------------------------------------------------------------ bool ClipperBase::PopLocalMinima(cInt Y, const LocalMinimum *&locMin) { if (m_CurrentLM == m_MinimaList.end() || (*m_CurrentLM).Y != Y) return false; locMin = &(*m_CurrentLM); ++m_CurrentLM; return true; } //------------------------------------------------------------------------------ IntRect ClipperBase::GetBounds() { IntRect result; MinimaList::iterator lm = m_MinimaList.begin(); if (lm == m_MinimaList.end()) { result.left = result.top = result.right = result.bottom = 0; return 
result; } result.left = lm->LeftBound->Bot.X; result.top = lm->LeftBound->Bot.Y; result.right = lm->LeftBound->Bot.X; result.bottom = lm->LeftBound->Bot.Y; while (lm != m_MinimaList.end()) { //todo - needs fixing for open paths result.bottom = std::max(result.bottom, lm->LeftBound->Bot.Y); TEdge* e = lm->LeftBound; for (;;) { TEdge* bottomE = e; while (e->NextInLML) { if (e->Bot.X < result.left) result.left = e->Bot.X; if (e->Bot.X > result.right) result.right = e->Bot.X; e = e->NextInLML; } result.left = std::min(result.left, e->Bot.X); result.right = std::max(result.right, e->Bot.X); result.left = std::min(result.left, e->Top.X); result.right = std::max(result.right, e->Top.X); result.top = std::min(result.top, e->Top.Y); if (bottomE == lm->LeftBound) e = lm->RightBound; else break; } ++lm; } return result; } //------------------------------------------------------------------------------ void ClipperBase::InsertScanbeam(const cInt Y) { m_Scanbeam.push(Y); } //------------------------------------------------------------------------------ bool ClipperBase::PopScanbeam(cInt &Y) { if (m_Scanbeam.empty()) return false; Y = m_Scanbeam.top(); m_Scanbeam.pop(); while (!m_Scanbeam.empty() && Y == m_Scanbeam.top()) { m_Scanbeam.pop(); } // Pop duplicates. 
return true; } //------------------------------------------------------------------------------ void ClipperBase::DisposeAllOutRecs(){ for (PolyOutList::size_type i = 0; i < m_PolyOuts.size(); ++i) DisposeOutRec(i); m_PolyOuts.clear(); } //------------------------------------------------------------------------------ void ClipperBase::DisposeOutRec(PolyOutList::size_type index) { OutRec *outRec = m_PolyOuts[index]; if (outRec->Pts) DisposeOutPts(outRec->Pts); delete outRec; m_PolyOuts[index] = 0; } //------------------------------------------------------------------------------ void ClipperBase::DeleteFromAEL(TEdge *e) { TEdge* AelPrev = e->PrevInAEL; TEdge* AelNext = e->NextInAEL; if (!AelPrev && !AelNext && (e != m_ActiveEdges)) return; //already deleted if (AelPrev) AelPrev->NextInAEL = AelNext; else m_ActiveEdges = AelNext; if (AelNext) AelNext->PrevInAEL = AelPrev; e->NextInAEL = 0; e->PrevInAEL = 0; } //------------------------------------------------------------------------------ OutRec* ClipperBase::CreateOutRec() { OutRec* result = new OutRec; result->IsHole = false; result->IsOpen = false; result->FirstLeft = 0; result->Pts = 0; result->BottomPt = 0; result->PolyNd = 0; m_PolyOuts.push_back(result); result->Idx = (int)m_PolyOuts.size() - 1; return result; } //------------------------------------------------------------------------------ void ClipperBase::SwapPositionsInAEL(TEdge *Edge1, TEdge *Edge2) { //check that one or other edge hasn't already been removed from AEL ... 
if (Edge1->NextInAEL == Edge1->PrevInAEL || Edge2->NextInAEL == Edge2->PrevInAEL) return; if (Edge1->NextInAEL == Edge2) { TEdge* Next = Edge2->NextInAEL; if (Next) Next->PrevInAEL = Edge1; TEdge* Prev = Edge1->PrevInAEL; if (Prev) Prev->NextInAEL = Edge2; Edge2->PrevInAEL = Prev; Edge2->NextInAEL = Edge1; Edge1->PrevInAEL = Edge2; Edge1->NextInAEL = Next; } else if (Edge2->NextInAEL == Edge1) { TEdge* Next = Edge1->NextInAEL; if (Next) Next->PrevInAEL = Edge2; TEdge* Prev = Edge2->PrevInAEL; if (Prev) Prev->NextInAEL = Edge1; Edge1->PrevInAEL = Prev; Edge1->NextInAEL = Edge2; Edge2->PrevInAEL = Edge1; Edge2->NextInAEL = Next; } else { TEdge* Next = Edge1->NextInAEL; TEdge* Prev = Edge1->PrevInAEL; Edge1->NextInAEL = Edge2->NextInAEL; if (Edge1->NextInAEL) Edge1->NextInAEL->PrevInAEL = Edge1; Edge1->PrevInAEL = Edge2->PrevInAEL; if (Edge1->PrevInAEL) Edge1->PrevInAEL->NextInAEL = Edge1; Edge2->NextInAEL = Next; if (Edge2->NextInAEL) Edge2->NextInAEL->PrevInAEL = Edge2; Edge2->PrevInAEL = Prev; if (Edge2->PrevInAEL) Edge2->PrevInAEL->NextInAEL = Edge2; } if (!Edge1->PrevInAEL) m_ActiveEdges = Edge1; else if (!Edge2->PrevInAEL) m_ActiveEdges = Edge2; } //------------------------------------------------------------------------------ void ClipperBase::UpdateEdgeIntoAEL(TEdge *&e) { if (!e->NextInLML) throw clipperException("UpdateEdgeIntoAEL: invalid call"); e->NextInLML->OutIdx = e->OutIdx; TEdge* AelPrev = e->PrevInAEL; TEdge* AelNext = e->NextInAEL; if (AelPrev) AelPrev->NextInAEL = e->NextInLML; else m_ActiveEdges = e->NextInLML; if (AelNext) AelNext->PrevInAEL = e->NextInLML; e->NextInLML->Side = e->Side; e->NextInLML->WindDelta = e->WindDelta; e->NextInLML->WindCnt = e->WindCnt; e->NextInLML->WindCnt2 = e->WindCnt2; e = e->NextInLML; e->Curr = e->Bot; e->PrevInAEL = AelPrev; e->NextInAEL = AelNext; if (!IsHorizontal(*e)) InsertScanbeam(e->Top.Y); } //------------------------------------------------------------------------------ bool 
ClipperBase::LocalMinimaPending() { return (m_CurrentLM != m_MinimaList.end()); } //------------------------------------------------------------------------------ // TClipper methods ... //------------------------------------------------------------------------------ Clipper::Clipper(int initOptions) : ClipperBase() //constructor { m_ExecuteLocked = false; m_UseFullRange = false; m_ReverseOutput = ((initOptions & ioReverseSolution) != 0); m_StrictSimple = ((initOptions & ioStrictlySimple) != 0); m_PreserveCollinear = ((initOptions & ioPreserveCollinear) != 0); m_HasOpenPaths = false; #ifdef use_xyz m_ZFill = 0; #endif } //------------------------------------------------------------------------------ #ifdef use_xyz void Clipper::ZFillFunction(ZFillCallback zFillFunc) { m_ZFill = zFillFunc; } //------------------------------------------------------------------------------ #endif bool Clipper::Execute(ClipType clipType, Paths &solution, PolyFillType fillType) { return Execute(clipType, solution, fillType, fillType); } //------------------------------------------------------------------------------ bool Clipper::Execute(ClipType clipType, PolyTree &polytree, PolyFillType fillType) { return Execute(clipType, polytree, fillType, fillType); } //------------------------------------------------------------------------------ bool Clipper::Execute(ClipType clipType, Paths &solution, PolyFillType subjFillType, PolyFillType clipFillType) { if( m_ExecuteLocked ) return false; if (m_HasOpenPaths) throw clipperException("Error: PolyTree struct is needed for open path clipping."); m_ExecuteLocked = true; solution.resize(0); m_SubjFillType = subjFillType; m_ClipFillType = clipFillType; m_ClipType = clipType; m_UsingPolyTree = false; bool succeeded = ExecuteInternal(); if (succeeded) BuildResult(solution); DisposeAllOutRecs(); m_ExecuteLocked = false; return succeeded; } //------------------------------------------------------------------------------ bool Clipper::Execute(ClipType 
clipType, PolyTree& polytree, PolyFillType subjFillType, PolyFillType clipFillType) { if( m_ExecuteLocked ) return false; m_ExecuteLocked = true; m_SubjFillType = subjFillType; m_ClipFillType = clipFillType; m_ClipType = clipType; m_UsingPolyTree = true; bool succeeded = ExecuteInternal(); if (succeeded) BuildResult2(polytree); DisposeAllOutRecs(); m_ExecuteLocked = false; return succeeded; } //------------------------------------------------------------------------------ void Clipper::FixHoleLinkage(OutRec &outrec) { //skip OutRecs that (a) contain outermost polygons or //(b) already have the correct owner/child linkage ... if (!outrec.FirstLeft || (outrec.IsHole != outrec.FirstLeft->IsHole && outrec.FirstLeft->Pts)) return; OutRec* orfl = outrec.FirstLeft; while (orfl && ((orfl->IsHole == outrec.IsHole) || !orfl->Pts)) orfl = orfl->FirstLeft; outrec.FirstLeft = orfl; } //------------------------------------------------------------------------------ bool Clipper::ExecuteInternal() { bool succeeded = true; try { Reset(); m_Maxima = MaximaList(); m_SortedEdges = 0; succeeded = true; cInt botY, topY; if (!PopScanbeam(botY)) return false; InsertLocalMinimaIntoAEL(botY); while (PopScanbeam(topY) || LocalMinimaPending()) { ProcessHorizontals(); ClearGhostJoins(); if (!ProcessIntersections(topY)) { succeeded = false; break; } ProcessEdgesAtTopOfScanbeam(topY); botY = topY; InsertLocalMinimaIntoAEL(botY); } } catch(...) { succeeded = false; } if (succeeded) { //fix orientations ... 
for (PolyOutList::size_type i = 0; i < m_PolyOuts.size(); ++i) { OutRec *outRec = m_PolyOuts[i]; if (!outRec->Pts || outRec->IsOpen) continue; if ((outRec->IsHole ^ m_ReverseOutput) == (Area(*outRec) > 0)) ReversePolyPtLinks(outRec->Pts); } if (!m_Joins.empty()) JoinCommonEdges(); //unfortunately FixupOutPolygon() must be done after JoinCommonEdges() for (PolyOutList::size_type i = 0; i < m_PolyOuts.size(); ++i) { OutRec *outRec = m_PolyOuts[i]; if (!outRec->Pts) continue; if (outRec->IsOpen) FixupOutPolyline(*outRec); else FixupOutPolygon(*outRec); } if (m_StrictSimple) DoSimplePolygons(); } ClearJoins(); ClearGhostJoins(); return succeeded; } //------------------------------------------------------------------------------ void Clipper::SetWindingCount(TEdge &edge) { TEdge *e = edge.PrevInAEL; //find the edge of the same polytype that immediately preceeds 'edge' in AEL while (e && ((e->PolyTyp != edge.PolyTyp) || (e->WindDelta == 0))) e = e->PrevInAEL; if (!e) { if (edge.WindDelta == 0) { PolyFillType pft = (edge.PolyTyp == ptSubject ? m_SubjFillType : m_ClipFillType); edge.WindCnt = (pft == pftNegative ? -1 : 1); } else edge.WindCnt = edge.WindDelta; edge.WindCnt2 = 0; e = m_ActiveEdges; //ie get ready to calc WindCnt2 } else if (edge.WindDelta == 0 && m_ClipType != ctUnion) { edge.WindCnt = 1; edge.WindCnt2 = e->WindCnt2; e = e->NextInAEL; //ie get ready to calc WindCnt2 } else if (IsEvenOddFillType(edge)) { //EvenOdd filling ... if (edge.WindDelta == 0) { //are we inside a subj polygon ... bool Inside = true; TEdge *e2 = e->PrevInAEL; while (e2) { if (e2->PolyTyp == e->PolyTyp && e2->WindDelta != 0) Inside = !Inside; e2 = e2->PrevInAEL; } edge.WindCnt = (Inside ? 0 : 1); } else { edge.WindCnt = edge.WindDelta; } edge.WindCnt2 = e->WindCnt2; e = e->NextInAEL; //ie get ready to calc WindCnt2 } else { //nonZero, Positive or Negative filling ... 
if (e->WindCnt * e->WindDelta < 0) { //prev edge is 'decreasing' WindCount (WC) toward zero //so we're outside the previous polygon ... if (Abs(e->WindCnt) > 1) { //outside prev poly but still inside another. //when reversing direction of prev poly use the same WC if (e->WindDelta * edge.WindDelta < 0) edge.WindCnt = e->WindCnt; //otherwise continue to 'decrease' WC ... else edge.WindCnt = e->WindCnt + edge.WindDelta; } else //now outside all polys of same polytype so set own WC ... edge.WindCnt = (edge.WindDelta == 0 ? 1 : edge.WindDelta); } else { //prev edge is 'increasing' WindCount (WC) away from zero //so we're inside the previous polygon ... if (edge.WindDelta == 0) edge.WindCnt = (e->WindCnt < 0 ? e->WindCnt - 1 : e->WindCnt + 1); //if wind direction is reversing prev then use same WC else if (e->WindDelta * edge.WindDelta < 0) edge.WindCnt = e->WindCnt; //otherwise add to WC ... else edge.WindCnt = e->WindCnt + edge.WindDelta; } edge.WindCnt2 = e->WindCnt2; e = e->NextInAEL; //ie get ready to calc WindCnt2 } //update WindCnt2 ... if (IsEvenOddAltFillType(edge)) { //EvenOdd filling ... while (e != &edge) { if (e->WindDelta != 0) edge.WindCnt2 = (edge.WindCnt2 == 0 ? 1 : 0); e = e->NextInAEL; } } else { //nonZero, Positive or Negative filling ... 
while ( e != &edge ) { edge.WindCnt2 += e->WindDelta; e = e->NextInAEL; } } } //------------------------------------------------------------------------------ bool Clipper::IsEvenOddFillType(const TEdge& edge) const { if (edge.PolyTyp == ptSubject) return m_SubjFillType == pftEvenOdd; else return m_ClipFillType == pftEvenOdd; } //------------------------------------------------------------------------------ bool Clipper::IsEvenOddAltFillType(const TEdge& edge) const { if (edge.PolyTyp == ptSubject) return m_ClipFillType == pftEvenOdd; else return m_SubjFillType == pftEvenOdd; } //------------------------------------------------------------------------------ bool Clipper::IsContributing(const TEdge& edge) const { PolyFillType pft, pft2; if (edge.PolyTyp == ptSubject) { pft = m_SubjFillType; pft2 = m_ClipFillType; } else { pft = m_ClipFillType; pft2 = m_SubjFillType; } switch(pft) { case pftEvenOdd: //return false if a subj line has been flagged as inside a subj polygon if (edge.WindDelta == 0 && edge.WindCnt != 1) return false; break; case pftNonZero: if (Abs(edge.WindCnt) != 1) return false; break; case pftPositive: if (edge.WindCnt != 1) return false; break; default: //pftNegative if (edge.WindCnt != -1) return false; } switch(m_ClipType) { case ctIntersection: switch(pft2) { case pftEvenOdd: case pftNonZero: return (edge.WindCnt2 != 0); case pftPositive: return (edge.WindCnt2 > 0); default: return (edge.WindCnt2 < 0); } break; case ctUnion: switch(pft2) { case pftEvenOdd: case pftNonZero: return (edge.WindCnt2 == 0); case pftPositive: return (edge.WindCnt2 <= 0); default: return (edge.WindCnt2 >= 0); } break; case ctDifference: if (edge.PolyTyp == ptSubject) switch(pft2) { case pftEvenOdd: case pftNonZero: return (edge.WindCnt2 == 0); case pftPositive: return (edge.WindCnt2 <= 0); default: return (edge.WindCnt2 >= 0); } else switch(pft2) { case pftEvenOdd: case pftNonZero: return (edge.WindCnt2 != 0); case pftPositive: return (edge.WindCnt2 > 0); default: return 
(edge.WindCnt2 < 0); } break; case ctXor: if (edge.WindDelta == 0) //XOr always contributing unless open switch(pft2) { case pftEvenOdd: case pftNonZero: return (edge.WindCnt2 == 0); case pftPositive: return (edge.WindCnt2 <= 0); default: return (edge.WindCnt2 >= 0); } else return true; break; default: return true; } } //------------------------------------------------------------------------------ OutPt* Clipper::AddLocalMinPoly(TEdge *e1, TEdge *e2, const IntPoint &Pt) { OutPt* result; TEdge *e, *prevE; if (IsHorizontal(*e2) || ( e1->Dx > e2->Dx )) { result = AddOutPt(e1, Pt); e2->OutIdx = e1->OutIdx; e1->Side = esLeft; e2->Side = esRight; e = e1; if (e->PrevInAEL == e2) prevE = e2->PrevInAEL; else prevE = e->PrevInAEL; } else { result = AddOutPt(e2, Pt); e1->OutIdx = e2->OutIdx; e1->Side = esRight; e2->Side = esLeft; e = e2; if (e->PrevInAEL == e1) prevE = e1->PrevInAEL; else prevE = e->PrevInAEL; } if (prevE && prevE->OutIdx >= 0 && prevE->Top.Y < Pt.Y && e->Top.Y < Pt.Y) { cInt xPrev = TopX(*prevE, Pt.Y); cInt xE = TopX(*e, Pt.Y); if (xPrev == xE && (e->WindDelta != 0) && (prevE->WindDelta != 0) && SlopesEqual(IntPoint(xPrev, Pt.Y), prevE->Top, IntPoint(xE, Pt.Y), e->Top, m_UseFullRange)) { OutPt* outPt = AddOutPt(prevE, Pt); AddJoin(result, outPt, e->Top); } } return result; } //------------------------------------------------------------------------------ void Clipper::AddLocalMaxPoly(TEdge *e1, TEdge *e2, const IntPoint &Pt) { AddOutPt( e1, Pt ); if (e2->WindDelta == 0) AddOutPt(e2, Pt); if( e1->OutIdx == e2->OutIdx ) { e1->OutIdx = Unassigned; e2->OutIdx = Unassigned; } else if (e1->OutIdx < e2->OutIdx) AppendPolygon(e1, e2); else AppendPolygon(e2, e1); } //------------------------------------------------------------------------------ void Clipper::AddEdgeToSEL(TEdge *edge) { //SEL pointers in PEdge are reused to build a list of horizontal edges. //However, we don't need to worry about order with horizontal edge processing. 
if( !m_SortedEdges ) { m_SortedEdges = edge; edge->PrevInSEL = 0; edge->NextInSEL = 0; } else { edge->NextInSEL = m_SortedEdges; edge->PrevInSEL = 0; m_SortedEdges->PrevInSEL = edge; m_SortedEdges = edge; } } //------------------------------------------------------------------------------ bool Clipper::PopEdgeFromSEL(TEdge *&edge) { if (!m_SortedEdges) return false; edge = m_SortedEdges; DeleteFromSEL(m_SortedEdges); return true; } //------------------------------------------------------------------------------ void Clipper::CopyAELToSEL() { TEdge* e = m_ActiveEdges; m_SortedEdges = e; while ( e ) { e->PrevInSEL = e->PrevInAEL; e->NextInSEL = e->NextInAEL; e = e->NextInAEL; } } //------------------------------------------------------------------------------ void Clipper::AddJoin(OutPt *op1, OutPt *op2, const IntPoint OffPt) { Join* j = new Join; j->OutPt1 = op1; j->OutPt2 = op2; j->OffPt = OffPt; m_Joins.push_back(j); } //------------------------------------------------------------------------------ void Clipper::ClearJoins() { for (JoinList::size_type i = 0; i < m_Joins.size(); i++) delete m_Joins[i]; m_Joins.resize(0); } //------------------------------------------------------------------------------ void Clipper::ClearGhostJoins() { for (JoinList::size_type i = 0; i < m_GhostJoins.size(); i++) delete m_GhostJoins[i]; m_GhostJoins.resize(0); } //------------------------------------------------------------------------------ void Clipper::AddGhostJoin(OutPt *op, const IntPoint OffPt) { Join* j = new Join; j->OutPt1 = op; j->OutPt2 = 0; j->OffPt = OffPt; m_GhostJoins.push_back(j); } //------------------------------------------------------------------------------ void Clipper::InsertLocalMinimaIntoAEL(const cInt botY) { const LocalMinimum *lm; while (PopLocalMinima(botY, lm)) { TEdge* lb = lm->LeftBound; TEdge* rb = lm->RightBound; OutPt *Op1 = 0; if (!lb) { //nb: don't insert LB into either AEL or SEL InsertEdgeIntoAEL(rb, 0); SetWindingCount(*rb); if 
(IsContributing(*rb)) Op1 = AddOutPt(rb, rb->Bot); } else if (!rb) { InsertEdgeIntoAEL(lb, 0); SetWindingCount(*lb); if (IsContributing(*lb)) Op1 = AddOutPt(lb, lb->Bot); InsertScanbeam(lb->Top.Y); } else { InsertEdgeIntoAEL(lb, 0); InsertEdgeIntoAEL(rb, lb); SetWindingCount( *lb ); rb->WindCnt = lb->WindCnt; rb->WindCnt2 = lb->WindCnt2; if (IsContributing(*lb)) Op1 = AddLocalMinPoly(lb, rb, lb->Bot); InsertScanbeam(lb->Top.Y); } if (rb) { if (IsHorizontal(*rb)) { AddEdgeToSEL(rb); if (rb->NextInLML) InsertScanbeam(rb->NextInLML->Top.Y); } else InsertScanbeam( rb->Top.Y ); } if (!lb || !rb) continue; //if any output polygons share an edge, they'll need joining later ... if (Op1 && IsHorizontal(*rb) && m_GhostJoins.size() > 0 && (rb->WindDelta != 0)) { for (JoinList::size_type i = 0; i < m_GhostJoins.size(); ++i) { Join* jr = m_GhostJoins[i]; //if the horizontal Rb and a 'ghost' horizontal overlap, then convert //the 'ghost' join to a real join ready for later ... if (HorzSegmentsOverlap(jr->OutPt1->Pt.X, jr->OffPt.X, rb->Bot.X, rb->Top.X)) AddJoin(jr->OutPt1, Op1, jr->OffPt); } } if (lb->OutIdx >= 0 && lb->PrevInAEL && lb->PrevInAEL->Curr.X == lb->Bot.X && lb->PrevInAEL->OutIdx >= 0 && SlopesEqual(lb->PrevInAEL->Bot, lb->PrevInAEL->Top, lb->Curr, lb->Top, m_UseFullRange) && (lb->WindDelta != 0) && (lb->PrevInAEL->WindDelta != 0)) { OutPt *Op2 = AddOutPt(lb->PrevInAEL, lb->Bot); AddJoin(Op1, Op2, lb->Top); } if(lb->NextInAEL != rb) { if (rb->OutIdx >= 0 && rb->PrevInAEL->OutIdx >= 0 && SlopesEqual(rb->PrevInAEL->Curr, rb->PrevInAEL->Top, rb->Curr, rb->Top, m_UseFullRange) && (rb->WindDelta != 0) && (rb->PrevInAEL->WindDelta != 0)) { OutPt *Op2 = AddOutPt(rb->PrevInAEL, rb->Bot); AddJoin(Op1, Op2, rb->Top); } TEdge* e = lb->NextInAEL; if (e) { while( e != rb ) { //nb: For calculating winding counts etc, IntersectEdges() assumes //that param1 will be to the Right of param2 ABOVE the intersection ... 
IntersectEdges(rb , e , lb->Curr); //order important here
          e = e->NextInAEL;
        }
      }
    }
  }
}
//------------------------------------------------------------------------------

//Unlinks 'e' from the SEL (the list threaded through PrevInSEL/NextInSEL and
//headed by m_SortedEdges) and clears e's own SEL links.
void Clipper::DeleteFromSEL(TEdge *e)
{
  TEdge* SelPrev = e->PrevInSEL;
  TEdge* SelNext = e->NextInSEL;
  //no neighbours and not the list head means 'e' isn't linked into the SEL
  if( !SelPrev && !SelNext && (e != m_SortedEdges) ) return; //already deleted
  if( SelPrev ) SelPrev->NextInSEL = SelNext;
  else m_SortedEdges = SelNext; //'e' was the head, so its successor takes over
  if( SelNext ) SelNext->PrevInSEL = SelPrev;
  e->NextInSEL = 0;
  e->PrevInSEL = 0;
}
//------------------------------------------------------------------------------

#ifdef use_xyz
//Assigns a Z value to 'pt' unless it already has a non-zero Z or no m_ZFill
//callback is set. When pt coincides with an end point (Bot/Top) of e1 or e2
//that end point's Z is copied, otherwise the m_ZFill callback is invoked with
//the four end points and pt.
void Clipper::SetZ(IntPoint& pt, TEdge& e1, TEdge& e2)
{
  if (pt.Z != 0 || !m_ZFill) return;
  else if (pt == e1.Bot) pt.Z = e1.Bot.Z;
  else if (pt == e1.Top) pt.Z = e1.Top.Z;
  else if (pt == e2.Bot) pt.Z = e2.Bot.Z;
  else if (pt == e2.Top) pt.Z = e2.Top.Z;
  else (*m_ZFill)(e1.Bot, e1.Top, e2.Bot, e2.Top, pt);
}
//------------------------------------------------------------------------------
#endif

//Processes the intersection of edges e1 and e2 at Pt. It first updates both
//edges' winding counts and then, depending on which of them is currently
//contributing to output (OutIdx >= 0) and on the clip and fill types, adds
//output points, swaps the edges' sides and polygon indexes, or starts/ends an
//output polygon (AddLocalMinPoly/AddLocalMaxPoly).
//e1 is assumed to be to the right of e2 above the intersection.
//Pt is taken by non-const reference because SetZ() may write Pt.Z (use_xyz).
void Clipper::IntersectEdges(TEdge *e1, TEdge *e2, IntPoint &Pt)
{
  //capture the contributing state up front: the calls below can change OutIdx
  bool e1Contributing = ( e1->OutIdx >= 0 );
  bool e2Contributing = ( e2->OutIdx >= 0 );

#ifdef use_xyz
  SetZ(Pt, *e1, *e2);
#endif

#ifdef use_lines
  //if either edge is on an OPEN path ...
  if (e1->WindDelta == 0 || e2->WindDelta == 0)
  {
    //ignore subject-subject open path intersections UNLESS they
    //are both open paths, AND they are both 'contributing maximas' ...
    if (e1->WindDelta == 0 && e2->WindDelta == 0) return;

    //if intersecting a subj line with a subj poly ...
    else if (e1->PolyTyp == e2->PolyTyp &&
      e1->WindDelta != e2->WindDelta && m_ClipType == ctUnion)
    {
      if (e1->WindDelta == 0)
      {
        if (e2Contributing)
        {
          AddOutPt(e1, Pt);
          if (e1Contributing) e1->OutIdx = Unassigned;
        }
      }
      else
      {
        if (e1Contributing)
        {
          AddOutPt(e2, Pt);
          if (e2Contributing) e2->OutIdx = Unassigned;
        }
      }
    }
    else if (e1->PolyTyp != e2->PolyTyp)
    {
      //toggle subj open path OutIdx on/off when Abs(clip.WndCnt) == 1 ...
      if ((e1->WindDelta == 0) && abs(e2->WindCnt) == 1 &&
        (m_ClipType != ctUnion || e2->WindCnt2 == 0))
      {
        AddOutPt(e1, Pt);
        if (e1Contributing) e1->OutIdx = Unassigned;
      }
      else if ((e2->WindDelta == 0) && (abs(e1->WindCnt) == 1) &&
        (m_ClipType != ctUnion || e1->WindCnt2 == 0))
      {
        AddOutPt(e2, Pt);
        if (e2Contributing) e2->OutIdx = Unassigned;
      }
    }
    //open-path intersections never reach the closed-path logic below
    return;
  }
#endif

  //update winding counts...
  //assumes that e1 will be to the Right of e2 ABOVE the intersection
  if ( e1->PolyTyp == e2->PolyTyp )
  {
    //same polygon type: adjust each edge's own winding count (WindCnt)
    if ( IsEvenOddFillType( *e1) )
    {
      int oldE1WindCnt = e1->WindCnt;
      e1->WindCnt = e2->WindCnt;
      e2->WindCnt = oldE1WindCnt;
    }
    else
    {
      //when the sum would be zero the count is negated instead
      if (e1->WindCnt + e2->WindDelta == 0 ) e1->WindCnt = -e1->WindCnt;
      else e1->WindCnt += e2->WindDelta;
      if ( e2->WindCnt - e1->WindDelta == 0 ) e2->WindCnt = -e2->WindCnt;
      else e2->WindCnt -= e1->WindDelta;
    }
  }
  else
  {
    //different polygon types: adjust the 'other type' count (WindCnt2)
    if (!IsEvenOddFillType(*e2)) e1->WindCnt2 += e2->WindDelta;
    else e1->WindCnt2 = ( e1->WindCnt2 == 0 ) ? 1 : 0;
    if (!IsEvenOddFillType(*e1)) e2->WindCnt2 -= e1->WindDelta;
    else e2->WindCnt2 = ( e2->WindCnt2 == 0 ) ? 1 : 0;
  }

  //xFillType is the fill rule of the edge's own polygon type, xFillType2 the
  //fill rule of the other polygon type
  PolyFillType e1FillType, e2FillType, e1FillType2, e2FillType2;
  if (e1->PolyTyp == ptSubject)
  {
    e1FillType = m_SubjFillType;
    e1FillType2 = m_ClipFillType;
  }
  else
  {
    e1FillType = m_ClipFillType;
    e1FillType2 = m_SubjFillType;
  }
  if (e2->PolyTyp == ptSubject)
  {
    e2FillType = m_SubjFillType;
    e2FillType2 = m_ClipFillType;
  }
  else
  {
    e2FillType = m_ClipFillType;
    e2FillType2 = m_SubjFillType;
  }

  //fold each winding count according to its fill rule: as-is for positive,
  //negated for negative and absolute value for every other fill type
  cInt e1Wc, e2Wc;
  switch (e1FillType)
  {
    case pftPositive: e1Wc = e1->WindCnt; break;
    case pftNegative: e1Wc = -e1->WindCnt; break;
    default: e1Wc = Abs(e1->WindCnt);
  }
  switch(e2FillType)
  {
    case pftPositive: e2Wc = e2->WindCnt; break;
    case pftNegative: e2Wc = -e2->WindCnt; break;
    default: e2Wc = Abs(e2->WindCnt);
  }

  if ( e1Contributing && e2Contributing )
  {
    if ((e1Wc != 0 && e1Wc != 1) || (e2Wc != 0 && e2Wc != 1) ||
      (e1->PolyTyp != e2->PolyTyp && m_ClipType != ctXor) )
    {
      AddLocalMaxPoly(e1, e2, Pt);
    }
    else
    {
      AddOutPt(e1, Pt);
      AddOutPt(e2, Pt);
      SwapSides( *e1 , *e2 );
      SwapPolyIndexes( *e1 , *e2 );
    }
  }
  else if ( e1Contributing )
  {
    if (e2Wc == 0 || e2Wc == 1)
    {
      AddOutPt(e1, Pt);
      SwapSides(*e1, *e2);
      SwapPolyIndexes(*e1, *e2);
    }
  }
  else if ( e2Contributing )
  {
    if (e1Wc == 0 || e1Wc == 1)
    {
      AddOutPt(e2, Pt);
      SwapSides(*e1, *e2);
      SwapPolyIndexes(*e1, *e2);
    }
  }
  else if ( (e1Wc == 0 || e1Wc == 1) && (e2Wc == 0 || e2Wc == 1))
  {
    //neither edge is currently contributing ...

    //fold the 'other type' winding counts the same way as above
    cInt e1Wc2, e2Wc2;
    switch (e1FillType2)
    {
      case pftPositive: e1Wc2 = e1->WindCnt2; break;
      case pftNegative : e1Wc2 = -e1->WindCnt2; break;
      default: e1Wc2 = Abs(e1->WindCnt2);
    }
    switch (e2FillType2)
    {
      case pftPositive: e2Wc2 = e2->WindCnt2; break;
      case pftNegative: e2Wc2 = -e2->WindCnt2; break;
      default: e2Wc2 = Abs(e2->WindCnt2);
    }

    if (e1->PolyTyp != e2->PolyTyp)
    {
      AddLocalMinPoly(e1, e2, Pt);
    }
    else if (e1Wc == 1 && e2Wc == 1)
      //same polygon type: whether a new polygon starts depends on the clip op
      switch( m_ClipType ) {
        case ctIntersection:
          if (e1Wc2 > 0 && e2Wc2 > 0)
            AddLocalMinPoly(e1, e2, Pt);
          break;
        case ctUnion:
          if ( e1Wc2 <= 0 && e2Wc2 <= 0 )
            AddLocalMinPoly(e1, e2, Pt);
          break;
        case ctDifference:
          if (((e1->PolyTyp == ptClip) && (e1Wc2 > 0) && (e2Wc2 > 0)) ||
              ((e1->PolyTyp == ptSubject) && (e1Wc2 <= 0) && (e2Wc2 <= 0)))
                AddLocalMinPoly(e1, e2, Pt);
          break;
        case ctXor:
          AddLocalMinPoly(e1, e2, Pt);
      }
    else
      SwapSides( *e1, *e2 );
  }
}
//------------------------------------------------------------------------------

//Sets outrec->FirstLeft and outrec->IsHole by walking the AEL leftwards from
//'e'. Contributing edges with WindDelta != 0 are matched up by OutIdx; eTmp
//ends up as the edge left without a partner (or 0 if every edge was paired).
void Clipper::SetHoleState(TEdge *e, OutRec *outrec)
{
  TEdge *e2 = e->PrevInAEL;
  TEdge *eTmp = 0;
  while (e2)
  {
    if (e2->OutIdx >= 0 && e2->WindDelta != 0)
    {
      if (!eTmp) eTmp = e2;
      else if (eTmp->OutIdx == e2->OutIdx) eTmp = 0; //same polygon: cancel out
    }
    e2 = e2->PrevInAEL;
  }
  if (!eTmp)
  {
    outrec->FirstLeft = 0;
    outrec->IsHole = false;
  }
  else
  {
    //the hole state is the opposite of that of the polygon found to the left
    outrec->FirstLeft = m_PolyOuts[eTmp->OutIdx];
    outrec->IsHole = !outrec->FirstLeft->IsHole;
  }
}
//------------------------------------------------------------------------------

OutRec* GetLowermostRec(OutRec *outRec1, OutRec *outRec2)
{
  //work out which polygon fragment has the correct hole state ...
if (!outRec1->BottomPt)
    outRec1->BottomPt = GetBottomPt(outRec1->Pts); //computed lazily and cached
  if (!outRec2->BottomPt)
    outRec2->BottomPt = GetBottomPt(outRec2->Pts);
  OutPt *OutPt1 = outRec1->BottomPt;
  OutPt *OutPt2 = outRec2->BottomPt;
  //prefer the larger Y, then the smaller X; on a complete tie a single-point
  //record loses and FirstIsBottomPt() decides the remaining cases
  if (OutPt1->Pt.Y > OutPt2->Pt.Y) return outRec1;
  else if (OutPt1->Pt.Y < OutPt2->Pt.Y) return outRec2;
  else if (OutPt1->Pt.X < OutPt2->Pt.X) return outRec1;
  else if (OutPt1->Pt.X > OutPt2->Pt.X) return outRec2;
  else if (OutPt1->Next == OutPt1) return outRec2;
  else if (OutPt2->Next == OutPt2) return outRec1;
  else if (FirstIsBottomPt(OutPt1, OutPt2)) return outRec1;
  else return outRec2;
}
//------------------------------------------------------------------------------

//Returns true when outRec2 is reachable from outRec1 by following the
//FirstLeft chain (outRec1 itself is not tested, only its FirstLeft ancestors).
bool OutRec1RightOfOutRec2(OutRec* outRec1, OutRec* outRec2)
{
  do
  {
    outRec1 = outRec1->FirstLeft;
    if (outRec1 == outRec2) return true;
  } while (outRec1);
  return false;
}
//------------------------------------------------------------------------------

//Returns the OutRec that index Idx currently resolves to: follows the Idx
//member of each record until a record is found that sits at its own index in
//m_PolyOuts (AppendPolygon redirects a merged record's Idx to its new owner).
OutRec* Clipper::GetOutRec(int Idx)
{
  OutRec* outrec = m_PolyOuts[Idx];
  while (outrec != m_PolyOuts[outrec->Idx])
    outrec = m_PolyOuts[outrec->Idx];
  return outrec;
}
//------------------------------------------------------------------------------

//Merges the output polygon of e2 into the output polygon of e1. The point
//rings are spliced according to each edge's Side, e2's OutRec is emptied and
//redirected to e1's, and both edges are left with OutIdx == Unassigned.
void Clipper::AppendPolygon(TEdge *e1, TEdge *e2)
{
  //look up the output records both edges currently contribute to ...
  OutRec *outRec1 = m_PolyOuts[e1->OutIdx];
  OutRec *outRec2 = m_PolyOuts[e2->OutIdx];

  //decide which record's hole state (IsHole/FirstLeft) the merged polygon keeps
  OutRec *holeStateRec;
  if (OutRec1RightOfOutRec2(outRec1, outRec2))
    holeStateRec = outRec2;
  else if (OutRec1RightOfOutRec2(outRec2, outRec1))
    holeStateRec = outRec1;
  else
    holeStateRec = GetLowermostRec(outRec1, outRec2);

  //get the start and ends of both output polygons and
  //join e2 poly onto e1 poly and delete pointers to e2 ...

  OutPt* p1_lft = outRec1->Pts;
  OutPt* p1_rt = p1_lft->Prev;
  OutPt* p2_lft = outRec2->Pts;
  OutPt* p2_rt = p2_lft->Prev;

  //join e2 poly onto e1 poly and delete pointers to e2 ...
  //(the letter diagrams show the resulting point order: x y z is polygon 2,
  //a b c is polygon 1)
  if(  e1->Side == esLeft )
  {
    if(  e2->Side == esLeft )
    {
      //z y x a b c
      ReversePolyPtLinks(p2_lft);
      p2_lft->Next = p1_lft;
      p1_lft->Prev = p2_lft;
      p1_rt->Next = p2_rt;
      p2_rt->Prev = p1_rt;
      outRec1->Pts = p2_rt;
    } else
    {
      //x y z a b c
      p2_rt->Next = p1_lft;
      p1_lft->Prev = p2_rt;
      p2_lft->Prev = p1_rt;
      p1_rt->Next = p2_lft;
      outRec1->Pts = p2_lft;
    }
  } else
  {
    if(  e2->Side == esRight )
    {
      //a b c z y x
      ReversePolyPtLinks(p2_lft);
      p1_rt->Next = p2_rt;
      p2_rt->Prev = p1_rt;
      p2_lft->Next = p1_lft;
      p1_lft->Prev = p2_lft;
    } else
    {
      //a b c x y z
      p1_rt->Next = p2_lft;
      p2_lft->Prev = p1_rt;
      p1_lft->Prev = p2_rt;
      p2_rt->Next = p1_lft;
    }
  }

  //the cached bottom point is stale now that the ring has changed
  outRec1->BottomPt = 0;
  if (holeStateRec == outRec2)
  {
    if (outRec2->FirstLeft != outRec1)
      outRec1->FirstLeft = outRec2->FirstLeft;
    outRec1->IsHole = outRec2->IsHole;
  }
  //outRec2 no longer owns any points; it now just refers to outRec1
  outRec2->Pts = 0;
  outRec2->BottomPt = 0;
  outRec2->FirstLeft = outRec1;

  int OKIdx = e1->OutIdx;
  int ObsoleteIdx = e2->OutIdx;

  e1->OutIdx = Unassigned; //nb: safe because we only get here via AddLocalMaxPoly
  e2->OutIdx = Unassigned;

  //re-point the first active edge still using the obsolete index (the loop
  //stops after one match)
  TEdge* e = m_ActiveEdges;
  while( e )
  {
    if( e->OutIdx == ObsoleteIdx )
    {
      e->OutIdx = OKIdx;
      e->Side = e1->Side;
      break;
    }
    e = e->NextInAEL;
  }

  outRec2->Idx = outRec1->Idx;
}
//------------------------------------------------------------------------------

//Adds point 'pt' to the output polygon of edge 'e' and returns the OutPt that
//holds it. If 'e' has no output polygon yet (OutIdx < 0) a new OutRec is
//created with pt as its only point. Otherwise pt goes to the front of the ring
//for a left-side edge and to the back for any other side; if that end already
//holds pt the existing OutPt is returned instead of adding a duplicate.
OutPt* Clipper::AddOutPt(TEdge *e, const IntPoint &pt)
{
  if(  e->OutIdx < 0 )
  {
    OutRec *outRec = CreateOutRec();
    outRec->IsOpen = (e->WindDelta == 0);
    OutPt* newOp = new OutPt;
    outRec->Pts = newOp;
    newOp->Idx = outRec->Idx;
    newOp->Pt = pt;
    //a single point forms a ring with itself
    newOp->Next = newOp;
    newOp->Prev = newOp;
    if (!outRec->IsOpen)
      SetHoleState(e, outRec);
    e->OutIdx = outRec->Idx;
    return newOp;
  } else
  {
    OutRec *outRec = m_PolyOuts[e->OutIdx];
    //OutRec.Pts is the 'Left-most' point & OutRec.Pts.Prev is the 'Right-most'
    OutPt* op = outRec->Pts;

    bool ToFront = (e->Side == esLeft);
    if (ToFront && (pt == op->Pt)) return op;
    else if (!ToFront && (pt == op->Prev->Pt)) return op->Prev;

    OutPt* newOp = new OutPt;
    newOp->Idx = outRec->Idx;
    newOp->Pt = pt;
newOp->Next = op; newOp->Prev = op->Prev; newOp->Prev->Next = newOp; op->Prev = newOp; if (ToFront) outRec->Pts = newOp; return newOp; } } //------------------------------------------------------------------------------ OutPt* Clipper::GetLastOutPt(TEdge *e) { OutRec *outRec = m_PolyOuts[e->OutIdx]; if (e->Side == esLeft) return outRec->Pts; else return outRec->Pts->Prev; } //------------------------------------------------------------------------------ void Clipper::ProcessHorizontals() { TEdge* horzEdge; while (PopEdgeFromSEL(horzEdge)) ProcessHorizontal(horzEdge); } //------------------------------------------------------------------------------ inline bool IsMinima(TEdge *e) { return e && (e->Prev->NextInLML != e) && (e->Next->NextInLML != e); } //------------------------------------------------------------------------------ inline bool IsMaxima(TEdge *e, const cInt Y) { return e && e->Top.Y == Y && !e->NextInLML; } //------------------------------------------------------------------------------ inline bool IsIntermediate(TEdge *e, const cInt Y) { return e->Top.Y == Y && e->NextInLML; } //------------------------------------------------------------------------------ TEdge *GetMaximaPair(TEdge *e) { if ((e->Next->Top == e->Top) && !e->Next->NextInLML) return e->Next; else if ((e->Prev->Top == e->Top) && !e->Prev->NextInLML) return e->Prev; else return 0; } //------------------------------------------------------------------------------ TEdge *GetMaximaPairEx(TEdge *e) { //as GetMaximaPair() but returns 0 if MaxPair isn't in AEL (unless it's horizontal) TEdge* result = GetMaximaPair(e); if (result && (result->OutIdx == Skip || (result->NextInAEL == result->PrevInAEL && !IsHorizontal(*result)))) return 0; return result; } //------------------------------------------------------------------------------ void Clipper::SwapPositionsInSEL(TEdge *Edge1, TEdge *Edge2) { if( !( Edge1->NextInSEL ) && !( Edge1->PrevInSEL ) ) return; if( !( Edge2->NextInSEL ) && !( 
Edge2->PrevInSEL ) ) return;
  //(both early returns above: an edge with no SEL neighbours at all is left
  //untouched)

  if(  Edge1->NextInSEL == Edge2 )
  {
    //adjacent, Edge1 directly before Edge2
    TEdge* Next = Edge2->NextInSEL;
    if( Next ) Next->PrevInSEL = Edge1;
    TEdge* Prev = Edge1->PrevInSEL;
    if( Prev ) Prev->NextInSEL = Edge2;
    Edge2->PrevInSEL = Prev;
    Edge2->NextInSEL = Edge1;
    Edge1->PrevInSEL = Edge2;
    Edge1->NextInSEL = Next;
  }
  else if(  Edge2->NextInSEL == Edge1 )
  {
    //adjacent, Edge2 directly before Edge1
    TEdge* Next = Edge1->NextInSEL;
    if( Next ) Next->PrevInSEL = Edge2;
    TEdge* Prev = Edge2->PrevInSEL;
    if( Prev ) Prev->NextInSEL = Edge1;
    Edge1->PrevInSEL = Prev;
    Edge1->NextInSEL = Edge2;
    Edge2->PrevInSEL = Edge1;
    Edge2->NextInSEL = Next;
  }
  else
  {
    //not adjacent: exchange the link pairs and patch up all four neighbours
    TEdge* Next = Edge1->NextInSEL;
    TEdge* Prev = Edge1->PrevInSEL;
    Edge1->NextInSEL = Edge2->NextInSEL;
    if( Edge1->NextInSEL ) Edge1->NextInSEL->PrevInSEL = Edge1;
    Edge1->PrevInSEL = Edge2->PrevInSEL;
    if( Edge1->PrevInSEL ) Edge1->PrevInSEL->NextInSEL = Edge1;
    Edge2->NextInSEL = Next;
    if( Edge2->NextInSEL ) Edge2->NextInSEL->PrevInSEL = Edge2;
    Edge2->PrevInSEL = Prev;
    if( Edge2->PrevInSEL ) Edge2->PrevInSEL->NextInSEL = Edge2;
  }

  //whichever edge now has no predecessor is the new head of the SEL
  if( !Edge1->PrevInSEL ) m_SortedEdges = Edge1;
  else if( !Edge2->PrevInSEL ) m_SortedEdges = Edge2;
}
//------------------------------------------------------------------------------

//Returns e's AEL neighbour in the direction of travel 'dir'.
TEdge* GetNextInAEL(TEdge *e, Direction dir)
{
  return dir == dLeftToRight ? e->NextInAEL : e->PrevInAEL;
}
//------------------------------------------------------------------------------

//Outputs the X extent of HorzEdge (Left <= Right) and the direction in which
//it runs from Bot to Top. Equal Bot.X and Top.X yield dRightToLeft.
void GetHorzDirection(TEdge& HorzEdge, Direction& Dir, cInt& Left, cInt& Right)
{
  if (HorzEdge.Bot.X < HorzEdge.Top.X)
  {
    Left = HorzEdge.Bot.X;
    Right = HorzEdge.Top.X;
    Dir = dLeftToRight;
  } else
  {
    Left = HorzEdge.Top.X;
    Right = HorzEdge.Bot.X;
    Dir = dRightToLeft;
  }
}
//------------------------------------------------------------------------

/*******************************************************************************
* Notes: Horizontal edges (HEs) at scanline intersections (ie at the Top or    *
* Bottom of a scanbeam) are processed as if layered. The order in which HEs    *
* are processed doesn't matter. HEs intersect with other HE Bot.Xs only [#]    *
* (or they could intersect with Top.Xs only, ie EITHER Bot.Xs OR Top.Xs),      *
* and with other non-horizontal edges [*]. Once these intersections are        *
* processed, intermediate HEs then 'promote' the Edge above (NextInLML) into   *
* the AEL. These 'promoted' edges may in turn intersect [%] with other HEs.    *
*******************************************************************************/

//Processes one horizontal edge together with any consecutive horizontals that
//follow it in its bound (NextInLML): walks the AEL in the edge's direction,
//intersecting it with every edge inside its X range, records joins with
//overlapping horizontals in the SEL, and finally either promotes the edge to
//its successor or removes it from the AEL.
void Clipper::ProcessHorizontal(TEdge *horzEdge)
{
  Direction dir;
  cInt horzLeft, horzRight;
  bool IsOpen = (horzEdge->WindDelta == 0);

  GetHorzDirection(*horzEdge, dir, horzLeft, horzRight);

  //find the last of the consecutive horizontals; only if the bound ends there
  //can there be a maxima pair
  TEdge* eLastHorz = horzEdge, *eMaxPair = 0;
  while (eLastHorz->NextInLML && IsHorizontal(*eLastHorz->NextInLML))
    eLastHorz = eLastHorz->NextInLML;
  if (!eLastHorz->NextInLML)
    eMaxPair = GetMaximaPair(eLastHorz);

  //only one of the two iterators is used, depending on 'dir'
  MaximaList::const_iterator maxIt;
  MaximaList::const_reverse_iterator maxRit;
  if (m_Maxima.size() > 0)
  {
    //get the first maxima in range (X) ...
    if (dir == dLeftToRight)
    {
      maxIt = m_Maxima.begin();
      while (maxIt != m_Maxima.end() && *maxIt <= horzEdge->Bot.X) maxIt++;
      if (maxIt != m_Maxima.end() && *maxIt >= eLastHorz->Top.X)
        maxIt = m_Maxima.end();
    }
    else
    {
      maxRit = m_Maxima.rbegin();
      while (maxRit != m_Maxima.rend() && *maxRit > horzEdge->Bot.X) maxRit++;
      if (maxRit != m_Maxima.rend() && *maxRit <= eLastHorz->Top.X)
        maxRit = m_Maxima.rend();
    }
  }

  OutPt* op1 = 0;

  for (;;) //loop through consec. horizontal edges
  {

    bool IsLastHorz = (horzEdge == eLastHorz);
    TEdge* e = GetNextInAEL(horzEdge, dir);
    while(e)
    {

      //this code block inserts extra coords into horizontal edges (in output
      //polygons) whereever maxima touch these horizontal edges. This helps
      //'simplifying' polygons (ie if the Simplify property is set).
      if (m_Maxima.size() > 0)
      {
        if (dir == dLeftToRight)
        {
          while (maxIt != m_Maxima.end() && *maxIt < e->Curr.X)
          {
            if (horzEdge->OutIdx >= 0 && !IsOpen)
              AddOutPt(horzEdge, IntPoint(*maxIt, horzEdge->Bot.Y));
            maxIt++;
          }
        }
        else
        {
          while (maxRit != m_Maxima.rend() && *maxRit > e->Curr.X)
          {
            if (horzEdge->OutIdx >= 0 && !IsOpen)
              AddOutPt(horzEdge, IntPoint(*maxRit, horzEdge->Bot.Y));
            maxRit++;
          }
        }
      };

      //stop once 'e' lies beyond the far end of the horizontal
      if ((dir == dLeftToRight && e->Curr.X > horzRight) ||
        (dir == dRightToLeft && e->Curr.X < horzLeft)) break;

      //Also break if we've got to the end of an intermediate horizontal edge ...
      //nb: Smaller Dx's are to the right of larger Dx's ABOVE the horizontal.
      if (e->Curr.X == horzEdge->Top.X && horzEdge->NextInLML &&
        e->Dx < horzEdge->NextInLML->Dx) break;

      if (horzEdge->OutIdx >= 0 && !IsOpen)  //note: may be done multiple times
      {
#ifdef use_xyz
        if (dir == dLeftToRight) SetZ(e->Curr, *horzEdge, *e);
        else SetZ(e->Curr, *e, *horzEdge);
#endif
        op1 = AddOutPt(horzEdge, e->Curr);
        //register a join with every contributing horizontal still waiting in
        //the SEL whose X range overlaps this one
        TEdge* eNextHorz = m_SortedEdges;
        while (eNextHorz)
        {
          if (eNextHorz->OutIdx >= 0 &&
            HorzSegmentsOverlap(horzEdge->Bot.X,
            horzEdge->Top.X, eNextHorz->Bot.X, eNextHorz->Top.X))
          {
            OutPt* op2 = GetLastOutPt(eNextHorz);
            AddJoin(op2, op1, eNextHorz->Top);
          }
          eNextHorz = eNextHorz->NextInSEL;
        }
        AddGhostJoin(op1, horzEdge->Bot);
      }

      //OK, so far we're still in range of the horizontal Edge  but make sure
      //we're at the last of consec. horizontals when matching with eMaxPair
      if(e == eMaxPair && IsLastHorz)
      {
        if (horzEdge->OutIdx >= 0)
          AddLocalMaxPoly(horzEdge, eMaxPair, horzEdge->Top);
        DeleteFromAEL(horzEdge);
        DeleteFromAEL(eMaxPair);
        return;
      }

      //argument order matters: IntersectEdges() expects its first edge to be
      //to the right of the second above the intersection
      if(dir == dLeftToRight)
      {
        IntPoint Pt = IntPoint(e->Curr.X, horzEdge->Curr.Y);
        IntersectEdges(horzEdge, e, Pt);
      }
      else
      {
        IntPoint Pt = IntPoint(e->Curr.X, horzEdge->Curr.Y);
        IntersectEdges( e, horzEdge, Pt);
      }
      //fetch the next edge before the swap changes e's AEL neighbours
      TEdge* eNext = GetNextInAEL(e, dir);
      SwapPositionsInAEL( horzEdge, e );
      e = eNext;
    } //end while(e)

    //Break out of loop if HorzEdge.NextInLML is not also horizontal ...
if (!horzEdge->NextInLML || !IsHorizontal(*horzEdge->NextInLML)) break; UpdateEdgeIntoAEL(horzEdge); if (horzEdge->OutIdx >= 0) AddOutPt(horzEdge, horzEdge->Bot); GetHorzDirection(*horzEdge, dir, horzLeft, horzRight); } //end for (;;) if (horzEdge->OutIdx >= 0 && !op1) { op1 = GetLastOutPt(horzEdge); TEdge* eNextHorz = m_SortedEdges; while (eNextHorz) { if (eNextHorz->OutIdx >= 0 && HorzSegmentsOverlap(horzEdge->Bot.X, horzEdge->Top.X, eNextHorz->Bot.X, eNextHorz->Top.X)) { OutPt* op2 = GetLastOutPt(eNextHorz); AddJoin(op2, op1, eNextHorz->Top); } eNextHorz = eNextHorz->NextInSEL; } AddGhostJoin(op1, horzEdge->Top); } if (horzEdge->NextInLML) { if(horzEdge->OutIdx >= 0) { op1 = AddOutPt( horzEdge, horzEdge->Top); UpdateEdgeIntoAEL(horzEdge); if (horzEdge->WindDelta == 0) return; //nb: HorzEdge is no longer horizontal here TEdge* ePrev = horzEdge->PrevInAEL; TEdge* eNext = horzEdge->NextInAEL; if (ePrev && ePrev->Curr.X == horzEdge->Bot.X && ePrev->Curr.Y == horzEdge->Bot.Y && ePrev->WindDelta != 0 && (ePrev->OutIdx >= 0 && ePrev->Curr.Y > ePrev->Top.Y && SlopesEqual(*horzEdge, *ePrev, m_UseFullRange))) { OutPt* op2 = AddOutPt(ePrev, horzEdge->Bot); AddJoin(op1, op2, horzEdge->Top); } else if (eNext && eNext->Curr.X == horzEdge->Bot.X && eNext->Curr.Y == horzEdge->Bot.Y && eNext->WindDelta != 0 && eNext->OutIdx >= 0 && eNext->Curr.Y > eNext->Top.Y && SlopesEqual(*horzEdge, *eNext, m_UseFullRange)) { OutPt* op2 = AddOutPt(eNext, horzEdge->Bot); AddJoin(op1, op2, horzEdge->Top); } } else UpdateEdgeIntoAEL(horzEdge); } else { if (horzEdge->OutIdx >= 0) AddOutPt(horzEdge, horzEdge->Top); DeleteFromAEL(horzEdge); } } //------------------------------------------------------------------------------ bool Clipper::ProcessIntersections(const cInt topY) { if( !m_ActiveEdges ) return true; try { BuildIntersectList(topY); size_t IlSize = m_IntersectList.size(); if (IlSize == 0) return true; if (IlSize == 1 || FixupIntersectionOrder()) ProcessIntersectList(); else return false; } 
catch(...) { m_SortedEdges = 0; DisposeIntersectNodes(); throw clipperException("ProcessIntersections error"); } m_SortedEdges = 0; return true; } //------------------------------------------------------------------------------ void Clipper::DisposeIntersectNodes() { for (size_t i = 0; i < m_IntersectList.size(); ++i ) delete m_IntersectList[i]; m_IntersectList.clear(); } //------------------------------------------------------------------------------ void Clipper::BuildIntersectList(const cInt topY) { if ( !m_ActiveEdges ) return; //prepare for sorting ... TEdge* e = m_ActiveEdges; m_SortedEdges = e; while( e ) { e->PrevInSEL = e->PrevInAEL; e->NextInSEL = e->NextInAEL; e->Curr.X = TopX( *e, topY ); e = e->NextInAEL; } //bubblesort ... bool isModified; do { isModified = false; e = m_SortedEdges; while( e->NextInSEL ) { TEdge *eNext = e->NextInSEL; IntPoint Pt; if(e->Curr.X > eNext->Curr.X) { IntersectPoint(*e, *eNext, Pt); if (Pt.Y < topY) Pt = IntPoint(TopX(*e, topY), topY); IntersectNode * newNode = new IntersectNode; newNode->Edge1 = e; newNode->Edge2 = eNext; newNode->Pt = Pt; m_IntersectList.push_back(newNode); SwapPositionsInSEL(e, eNext); isModified = true; } else e = eNext; } if( e->PrevInSEL ) e->PrevInSEL->NextInSEL = 0; else break; } while ( isModified ); m_SortedEdges = 0; //important } //------------------------------------------------------------------------------ void Clipper::ProcessIntersectList() { for (size_t i = 0; i < m_IntersectList.size(); ++i) { IntersectNode* iNode = m_IntersectList[i]; { IntersectEdges( iNode->Edge1, iNode->Edge2, iNode->Pt); SwapPositionsInAEL( iNode->Edge1 , iNode->Edge2 ); } delete iNode; } m_IntersectList.clear(); } //------------------------------------------------------------------------------ bool IntersectListSort(IntersectNode* node1, IntersectNode* node2) { return node2->Pt.Y < node1->Pt.Y; } //------------------------------------------------------------------------------ inline bool EdgesAdjacent(const 
IntersectNode &inode)
{
  //true when the node's two edges are direct neighbours in the SEL
  return (inode.Edge1->NextInSEL == inode.Edge2) ||
    (inode.Edge1->PrevInSEL == inode.Edge2);
}
//------------------------------------------------------------------------------

//Sorts the intersect list and then reorders it where necessary so that every
//intersection is between edges that are adjacent (in the SEL copy of the AEL)
//at the moment it is applied. Returns false if no such ordering is found.
bool Clipper::FixupIntersectionOrder()
{
  //pre-condition: intersections are sorted Bottom-most first.
  //Now it's crucial that intersections are made only between adjacent edges,
  //so to ensure this the order of intersections may need adjusting ...
  CopyAELToSEL();
  std::sort(m_IntersectList.begin(), m_IntersectList.end(), IntersectListSort);
  size_t cnt = m_IntersectList.size();
  for (size_t i = 0; i < cnt; ++i)
  {
    if (!EdgesAdjacent(*m_IntersectList[i]))
    {
      //pull forward the first later intersection whose edges are adjacent now
      size_t j = i + 1;
      while (j < cnt && !EdgesAdjacent(*m_IntersectList[j])) j++;
      if (j == cnt)  return false;
      std::swap(m_IntersectList[i], m_IntersectList[j]);
    }
    //mirror the effect of this intersection on the SEL ordering
    SwapPositionsInSEL(m_IntersectList[i]->Edge1, m_IntersectList[i]->Edge2);
  }
  return true;
}
//------------------------------------------------------------------------------

//Handles edge 'e' reaching its maximum: intersects it with every edge lying
//between it and its maxima pair in the AEL, then closes the output polygon (if
//both edges contribute) and removes both edges from the AEL. Without a usable
//pair, 'e' alone is finished off and removed.
//Throws clipperException when the two edges' output states are inconsistent.
void Clipper::DoMaxima(TEdge *e)
{
  TEdge* eMaxPair = GetMaximaPairEx(e);
  if (!eMaxPair)
  {
    if (e->OutIdx >= 0)
      AddOutPt(e, e->Top);
    DeleteFromAEL(e);
    return;
  }

  //each swap moves 'e' one place along, so e->NextInAEL is re-read every time
  TEdge* eNext = e->NextInAEL;
  while(eNext && eNext != eMaxPair)
  {
    IntersectEdges(e, eNext, e->Top);
    SwapPositionsInAEL(e, eNext);
    eNext = e->NextInAEL;
  }

  if(e->OutIdx == Unassigned && eMaxPair->OutIdx == Unassigned)
  {
    DeleteFromAEL(e);
    DeleteFromAEL(eMaxPair);
  }
  else if( e->OutIdx >= 0 && eMaxPair->OutIdx >= 0 )
  {
    //note: this inner test repeats part of the enclosing condition
    if (e->OutIdx >= 0) AddLocalMaxPoly(e, eMaxPair, e->Top);
    DeleteFromAEL(e);
    DeleteFromAEL(eMaxPair);
  }
#ifdef use_lines
  else if (e->WindDelta == 0)
  {
    if (e->OutIdx >= 0)
    {
      AddOutPt(e, e->Top);
      e->OutIdx = Unassigned;
    }
    DeleteFromAEL(e);

    if (eMaxPair->OutIdx >= 0)
    {
      AddOutPt(eMaxPair, e->Top);
      eMaxPair->OutIdx = Unassigned;
    }
    DeleteFromAEL(eMaxPair);
  }
#endif
  else throw clipperException("DoMaxima error");
}
//------------------------------------------------------------------------------

//Runs the end-of-scanbeam pass over the AEL for scanline topY, in four steps:
//1. maxima are finished off, 2. the remaining edges are either promoted to a
//following horizontal or moved up to topY, 3. the horizontals collected in the
//SEL are processed, 4. intermediate vertices are promoted to their next edge.
void Clipper::ProcessEdgesAtTopOfScanbeam(const cInt topY)
{
  TEdge* e = m_ActiveEdges;
  while( e )
  {
    //1. process maxima, treating them as if they're 'bent' horizontal edges,
    //   but exclude maxima with horizontal edges. nb: e can't be a horizontal.
    bool IsMaximaEdge = IsMaxima(e, topY);

    if(IsMaximaEdge)
    {
      TEdge* eMaxPair = GetMaximaPairEx(e);
      IsMaximaEdge = (!eMaxPair || !IsHorizontal(*eMaxPair));
    }

    if(IsMaximaEdge)
    {
      if (m_StrictSimple) m_Maxima.push_back(e->Top.X);
      //DoMaxima() removes 'e' from the AEL, so continue from its predecessor
      TEdge* ePrev = e->PrevInAEL;
      DoMaxima(e);
      if( !ePrev ) e = m_ActiveEdges;
      else e = ePrev->NextInAEL;
    }
    else
    {
      //2. promote horizontal edges, otherwise update Curr.X and Curr.Y ...
      if (IsIntermediate(e, topY) && IsHorizontal(*e->NextInLML))
      {
        UpdateEdgeIntoAEL(e);
        if (e->OutIdx >= 0)
          AddOutPt(e, e->Bot);
        AddEdgeToSEL(e);
      }
      else
      {
        e->Curr.X = TopX( *e, topY );
        e->Curr.Y = topY;
#ifdef use_xyz
        e->Curr.Z = topY == e->Top.Y ? e->Top.Z : (topY == e->Bot.Y ? e->Bot.Z : 0);
#endif
      }

      //When StrictlySimple and 'e' is being touched by another edge, then
      //make sure both edges have a vertex here ...
      if (m_StrictSimple)
      {
        TEdge* ePrev = e->PrevInAEL;
        if ((e->OutIdx >= 0) && (e->WindDelta != 0) && ePrev && (ePrev->OutIdx >= 0) &&
          (ePrev->Curr.X == e->Curr.X) && (ePrev->WindDelta != 0))
        {
          IntPoint pt = e->Curr;
#ifdef use_xyz
          SetZ(pt, *ePrev, *e);
#endif
          OutPt* op = AddOutPt(ePrev, pt);
          OutPt* op2 = AddOutPt(e, pt);
          AddJoin(op, op2, pt); //StrictlySimple (type-3) join
        }
      }

      e = e->NextInAEL;
    }
  }

  //3. Process horizontals at the Top of the scanbeam ...
  m_Maxima.sort();
  ProcessHorizontals();
  m_Maxima.clear();

  //4. Promote intermediate vertices ...
  e = m_ActiveEdges;
  while(e)
  {
    if(IsIntermediate(e, topY))
    {
      //'op' stays null for a non-contributing edge, which disables both join
      //tests below
      OutPt* op = 0;
      if( e->OutIdx >= 0 )
        op = AddOutPt(e, e->Top);
      UpdateEdgeIntoAEL(e);

      //if output polygons share an edge, they'll need joining later ...
      TEdge* ePrev = e->PrevInAEL;
      TEdge* eNext = e->NextInAEL;
      if (ePrev && ePrev->Curr.X == e->Bot.X &&
        ePrev->Curr.Y == e->Bot.Y && op &&
        ePrev->OutIdx >= 0 && ePrev->Curr.Y > ePrev->Top.Y &&
        SlopesEqual(e->Curr, e->Top, ePrev->Curr, ePrev->Top, m_UseFullRange) &&
        (e->WindDelta != 0) && (ePrev->WindDelta != 0))
      {
        OutPt* op2 = AddOutPt(ePrev, e->Bot);
        AddJoin(op, op2, e->Top);
      }
      else if (eNext && eNext->Curr.X == e->Bot.X &&
        eNext->Curr.Y == e->Bot.Y && op &&
        eNext->OutIdx >= 0 && eNext->Curr.Y > eNext->Top.Y &&
        SlopesEqual(e->Curr, e->Top, eNext->Curr, eNext->Top, m_UseFullRange) &&
        (e->WindDelta != 0) && (eNext->WindDelta != 0))
      {
        OutPt* op2 = AddOutPt(eNext, e->Bot);
        AddJoin(op, op2, e->Top);
      }
    }
    e = e->NextInAEL;
  }
}
//------------------------------------------------------------------------------

//Removes consecutive duplicate points from the point ring of 'outrec'. If only
//a single point is left, the ring is disposed of and outrec.Pts is set to 0.
void Clipper::FixupOutPolyline(OutRec &outrec)
{
  OutPt *pp = outrec.Pts;
  OutPt *lastPP = pp->Prev;
  while (pp != lastPP)
  {
    pp = pp->Next;
    if (pp->Pt == pp->Prev->Pt)
    {
      //keep the loop's end marker valid when the last point is the duplicate
      if (pp == lastPP) lastPP = pp->Prev;
      OutPt *tmpPP = pp->Prev;
      tmpPP->Next = pp->Next;
      pp->Next->Prev = tmpPP;
      delete pp;
      pp = tmpPP;
    }
  }

  if (pp == pp->Prev)
  {
    DisposeOutPts(pp);
    outrec.Pts = 0;
    return;
  }
}
//------------------------------------------------------------------------------

void Clipper::FixupOutPolygon(OutRec &outrec)
{
    //FixupOutPolygon() - removes duplicate points and simplifies consecutive
    //parallel edges by removing the middle vertex.
    OutPt *lastOK = 0;
    outrec.BottomPt = 0;
    OutPt *pp = outrec.Pts;
    bool preserveCol = m_PreserveCollinear || m_StrictSimple;

    for (;;)
    {
        //fewer than three points left: not a polygon, so discard it
        if (pp->Prev == pp || pp->Prev == pp->Next)
        {
            DisposeOutPts(pp);
            outrec.Pts = 0;
            return;
        }

        //test for duplicate points and collinear edges ...
if ((pp->Pt == pp->Next->Pt) || (pp->Pt == pp->Prev->Pt) || (SlopesEqual(pp->Prev->Pt, pp->Pt, pp->Next->Pt, m_UseFullRange) && (!preserveCol || !Pt2IsBetweenPt1AndPt3(pp->Prev->Pt, pp->Pt, pp->Next->Pt)))) { lastOK = 0; OutPt *tmp = pp; pp->Prev->Next = pp->Next; pp->Next->Prev = pp->Prev; pp = pp->Prev; delete tmp; } else if (pp == lastOK) break; else { if (!lastOK) lastOK = pp; pp = pp->Next; } } outrec.Pts = pp; } //------------------------------------------------------------------------------ int PointCount(OutPt *Pts) { if (!Pts) return 0; int result = 0; OutPt* p = Pts; do { result++; p = p->Next; } while (p != Pts); return result; } //------------------------------------------------------------------------------ void Clipper::BuildResult(Paths &polys) { polys.reserve(m_PolyOuts.size()); for (PolyOutList::size_type i = 0; i < m_PolyOuts.size(); ++i) { if (!m_PolyOuts[i]->Pts) continue; Path pg; OutPt* p = m_PolyOuts[i]->Pts->Prev; int cnt = PointCount(p); if (cnt < 2) continue; pg.reserve(cnt); for (int i = 0; i < cnt; ++i) { pg.push_back(p->Pt); p = p->Prev; } polys.push_back(pg); } } //------------------------------------------------------------------------------ void Clipper::BuildResult2(PolyTree& polytree) { polytree.Clear(); polytree.AllNodes.reserve(m_PolyOuts.size()); //add each output polygon/contour to polytree ... for (PolyOutList::size_type i = 0; i < m_PolyOuts.size(); i++) { OutRec* outRec = m_PolyOuts[i]; int cnt = PointCount(outRec->Pts); if ((outRec->IsOpen && cnt < 2) || (!outRec->IsOpen && cnt < 3)) continue; FixHoleLinkage(*outRec); PolyNode* pn = new PolyNode(); //nb: polytree takes ownership of all the PolyNodes polytree.AllNodes.push_back(pn); outRec->PolyNd = pn; pn->Parent = 0; pn->Index = 0; pn->Contour.reserve(cnt); OutPt *op = outRec->Pts->Prev; for (int j = 0; j < cnt; j++) { pn->Contour.push_back(op->Pt); op = op->Prev; } } //fixup PolyNode links etc ... 
polytree.Childs.reserve(m_PolyOuts.size()); for (PolyOutList::size_type i = 0; i < m_PolyOuts.size(); i++) { OutRec* outRec = m_PolyOuts[i]; if (!outRec->PolyNd) continue; if (outRec->IsOpen) { outRec->PolyNd->m_IsOpen = true; polytree.AddChild(*outRec->PolyNd); } else if (outRec->FirstLeft && outRec->FirstLeft->PolyNd) outRec->FirstLeft->PolyNd->AddChild(*outRec->PolyNd); else polytree.AddChild(*outRec->PolyNd); } } //------------------------------------------------------------------------------ void SwapIntersectNodes(IntersectNode &int1, IntersectNode &int2) { //just swap the contents (because fIntersectNodes is a single-linked-list) IntersectNode inode = int1; //gets a copy of Int1 int1.Edge1 = int2.Edge1; int1.Edge2 = int2.Edge2; int1.Pt = int2.Pt; int2.Edge1 = inode.Edge1; int2.Edge2 = inode.Edge2; int2.Pt = inode.Pt; } //------------------------------------------------------------------------------ inline bool E2InsertsBeforeE1(TEdge &e1, TEdge &e2) { if (e2.Curr.X == e1.Curr.X) { if (e2.Top.Y > e1.Top.Y) return e2.Top.X < TopX(e1, e2.Top.Y); else return e1.Top.X > TopX(e2, e1.Top.Y); } else return e2.Curr.X < e1.Curr.X; } //------------------------------------------------------------------------------ bool GetOverlap(const cInt a1, const cInt a2, const cInt b1, const cInt b2, cInt& Left, cInt& Right) { if (a1 < a2) { if (b1 < b2) {Left = std::max(a1,b1); Right = std::min(a2,b2);} else {Left = std::max(a1,b2); Right = std::min(a2,b1);} } else { if (b1 < b2) {Left = std::max(a2,b1); Right = std::min(a1,b2);} else {Left = std::max(a2,b2); Right = std::min(a1,b1);} } return Left < Right; } //------------------------------------------------------------------------------ inline void UpdateOutPtIdxs(OutRec& outrec) { OutPt* op = outrec.Pts; do { op->Idx = outrec.Idx; op = op->Prev; } while(op != outrec.Pts); } //------------------------------------------------------------------------------ void Clipper::InsertEdgeIntoAEL(TEdge *edge, TEdge* startEdge) { 
if(!m_ActiveEdges)
  {
    //empty AEL: 'edge' becomes the only entry
    edge->PrevInAEL = 0;
    edge->NextInAEL = 0;
    m_ActiveEdges = edge;
  }
  else if(!startEdge && E2InsertsBeforeE1(*m_ActiveEdges, *edge))
  {
    //no start hint and 'edge' sorts before the current head: new head
    edge->PrevInAEL = 0;
    edge->NextInAEL = m_ActiveEdges;
    m_ActiveEdges->PrevInAEL = edge;
    m_ActiveEdges = edge;
  }
  else
  {
    //scan forward from startEdge (or from the head) to the last edge that
    //'edge' does not sort before, and link 'edge' in directly after it
    if(!startEdge) startEdge = m_ActiveEdges;
    while(startEdge->NextInAEL  &&
      !E2InsertsBeforeE1(*startEdge->NextInAEL , *edge))
        startEdge = startEdge->NextInAEL;
    edge->NextInAEL = startEdge->NextInAEL;
    if(startEdge->NextInAEL) startEdge->NextInAEL->PrevInAEL = edge;
    edge->PrevInAEL = startEdge;
    startEdge->NextInAEL = edge;
  }
}
//----------------------------------------------------------------------

//Allocates a copy of outPt (same Pt and Idx) and links it into the ring
//directly after outPt when InsertAfter is true, otherwise directly before it.
//Returns the new point.
OutPt* DupOutPt(OutPt* outPt, bool InsertAfter)
{
  OutPt* result = new OutPt;
  result->Pt = outPt->Pt;
  result->Idx = outPt->Idx;
  if (InsertAfter)
  {
    result->Next = outPt->Next;
    result->Prev = outPt;
    outPt->Next->Prev = result;
    outPt->Next = result;
  }
  else
  {
    result->Prev = outPt->Prev;
    result->Next = outPt;
    outPt->Prev->Next = result;
    outPt->Prev = result;
  }
  return result;
}
//------------------------------------------------------------------------------

//Joins two horizontal runs of output points at Pt. op1/op1b and op2/op2b are
//the ends of the two runs on entry and are only used to work out each run's
//direction; both 'b' pointers are then replaced by freshly duplicated points.
//Returns false (leaving the rings untouched) when both runs point the same way,
//otherwise relinks the four points and returns true.
bool JoinHorz(OutPt* op1, OutPt* op1b, OutPt* op2, OutPt* op2b,
  const IntPoint Pt, bool DiscardLeft)
{
  Direction Dir1 = (op1->Pt.X > op1b->Pt.X ? dRightToLeft : dLeftToRight);
  Direction Dir2 = (op2->Pt.X > op2b->Pt.X ? dRightToLeft : dLeftToRight);
  if (Dir1 == Dir2) return false;

  //When DiscardLeft, we want Op1b to be on the Left of Op1, otherwise we
  //want Op1b to be on the Right. (And likewise with Op2 and Op2b.)
  //So, to facilitate this while inserting Op1b and Op2b ...
  //when DiscardLeft, make sure we're AT or RIGHT of Pt before adding Op1b,
  //otherwise make sure we're AT or LEFT of Pt. (Likewise with Op2b.)
  if (Dir1 == dLeftToRight)
  {
    //advance op1 along the horizontal while the next point doesn't pass Pt
    while (op1->Next->Pt.X <= Pt.X &&
      op1->Next->Pt.X >= op1->Pt.X && op1->Next->Pt.Y == Pt.Y)
        op1 = op1->Next;
    if (DiscardLeft && (op1->Pt.X != Pt.X)) op1 = op1->Next;
    op1b = DupOutPt(op1, !DiscardLeft);
    if (op1b->Pt != Pt)
    {
      //the duplicate isn't at Pt: move it there and duplicate once more
      op1 = op1b;
      op1->Pt = Pt;
      op1b = DupOutPt(op1, !DiscardLeft);
    }
  }
  else
  {
    while (op1->Next->Pt.X >= Pt.X &&
      op1->Next->Pt.X <= op1->Pt.X && op1->Next->Pt.Y == Pt.Y)
        op1 = op1->Next;
    if (!DiscardLeft && (op1->Pt.X != Pt.X)) op1 = op1->Next;
    op1b = DupOutPt(op1, DiscardLeft);
    if (op1b->Pt != Pt)
    {
      op1 = op1b;
      op1->Pt = Pt;
      op1b = DupOutPt(op1, DiscardLeft);
    }
  }

  //same again for the second run
  if (Dir2 == dLeftToRight)
  {
    while (op2->Next->Pt.X <= Pt.X &&
      op2->Next->Pt.X >= op2->Pt.X && op2->Next->Pt.Y == Pt.Y)
        op2 = op2->Next;
    if (DiscardLeft && (op2->Pt.X != Pt.X)) op2 = op2->Next;
    op2b = DupOutPt(op2, !DiscardLeft);
    if (op2b->Pt != Pt)
    {
      op2 = op2b;
      op2->Pt = Pt;
      op2b = DupOutPt(op2, !DiscardLeft);
    };
  } else
  {
    while (op2->Next->Pt.X >= Pt.X &&
      op2->Next->Pt.X <= op2->Pt.X && op2->Next->Pt.Y == Pt.Y)
        op2 = op2->Next;
    if (!DiscardLeft && (op2->Pt.X != Pt.X)) op2 = op2->Next;
    op2b = DupOutPt(op2, DiscardLeft);
    if (op2b->Pt != Pt)
    {
      op2 = op2b;
      op2->Pt = Pt;
      op2b = DupOutPt(op2, DiscardLeft);
    };
  };

  //cross-link the two runs; which way round depends on direction and side
  if ((Dir1 == dLeftToRight) == DiscardLeft)
  {
    op1->Prev = op2;
    op2->Next = op1;
    op1b->Next = op2b;
    op2b->Prev = op1b;
  }
  else
  {
    op1->Next = op2;
    op2->Prev = op1;
    op1b->Prev = op2b;
    op2b->Next = op1b;
  }
  return true;
}
//------------------------------------------------------------------------------

bool Clipper::JoinPoints(Join *j, OutRec* outRec1, OutRec* outRec2)
{
  OutPt *op1 = j->OutPt1, *op1b;
  OutPt *op2 = j->OutPt2, *op2b;

  //There are 3 kinds of joins for output polygons ...
  //1. Horizontal joins where Join.OutPt1 & Join.OutPt2 are vertices anywhere
  //along (horizontal) collinear edges (& Join.OffPt is on the same horizontal).
  //2.
  //Non-horizontal joins where Join.OutPt1 & Join.OutPt2 are at the same
  //location at the Bottom of the overlapping segment (& Join.OffPt is above).
  //3. StrictSimple joins where edges touch but are not collinear and where
  //Join.OutPt1, Join.OutPt2 & Join.OffPt all share the same point.
  bool isHorizontal = (j->OutPt1->Pt.Y == j->OffPt.Y);

  if (isHorizontal  && (j->OffPt == j->OutPt1->Pt) &&
  (j->OffPt == j->OutPt2->Pt))
  {
    //Strictly Simple join ...
    if (outRec1 != outRec2) return false;
    //find the first following vertex that differs from OffPt on each side
    op1b = j->OutPt1->Next;
    while (op1b != op1 && (op1b->Pt == j->OffPt))
      op1b = op1b->Next;
    bool reverse1 = (op1b->Pt.Y > j->OffPt.Y);
    op2b = j->OutPt2->Next;
    while (op2b != op2 && (op2b->Pt == j->OffPt))
      op2b = op2b->Next;
    bool reverse2 = (op2b->Pt.Y > j->OffPt.Y);
    if (reverse1 == reverse2) return false;
    if (reverse1)
    {
      op1b = DupOutPt(op1, false);
      op2b = DupOutPt(op2, true);
      op1->Prev = op2;
      op2->Next = op1;
      op1b->Next = op2b;
      op2b->Prev = op1b;
      j->OutPt1 = op1;
      j->OutPt2 = op1b;
      return true;
    } else
    {
      op1b = DupOutPt(op1, true);
      op2b = DupOutPt(op2, false);
      op1->Next = op2;
      op2->Prev = op1;
      op1b->Prev = op2b;
      op2b->Next = op1b;
      j->OutPt1 = op1;
      j->OutPt2 = op1b;
      return true;
    }
  }
  else if (isHorizontal)
  {
    //treat horizontal joins differently to non-horizontal joins since with
    //them we're not yet sure where the overlapping is. OutPt1.Pt & OutPt2.Pt
    //may be anywhere along the horizontal edge.
    op1b = op1;
    while (op1->Prev->Pt.Y == op1->Pt.Y && op1->Prev != op1b && op1->Prev != op2)
      op1 = op1->Prev;
    while (op1b->Next->Pt.Y == op1b->Pt.Y && op1b->Next != op1 && op1b->Next != op2)
      op1b = op1b->Next;
    if (op1b->Next == op1 || op1b->Next == op2) return false; //a flat 'polygon'

    op2b = op2;
    while (op2->Prev->Pt.Y == op2->Pt.Y && op2->Prev != op2b && op2->Prev != op1b)
      op2 = op2->Prev;
    while (op2b->Next->Pt.Y == op2b->Pt.Y && op2b->Next != op2 && op2b->Next != op1)
      op2b = op2b->Next;
    if (op2b->Next == op2 || op2b->Next == op1) return false; //a flat 'polygon'

    cInt Left, Right;
    //Op1 --> Op1b & Op2 --> Op2b are the extremities of the horizontal edges
    if (!GetOverlap(op1->Pt.X, op1b->Pt.X, op2->Pt.X, op2b->Pt.X, Left, Right))
      return false;

    //DiscardLeftSide: when overlapping edges are joined, a spike will be created
    //which needs to be cleaned up. However, we don't want Op1 or Op2 caught up
    //on the discard Side as either may still be needed for other joins ...
    IntPoint Pt;
    bool DiscardLeftSide;
    if (op1->Pt.X >= Left && op1->Pt.X <= Right)
    {
      Pt = op1->Pt; DiscardLeftSide = (op1->Pt.X > op1b->Pt.X);
    }
    else if (op2->Pt.X >= Left&& op2->Pt.X <= Right)
    {
      Pt = op2->Pt; DiscardLeftSide = (op2->Pt.X > op2b->Pt.X);
    }
    else if (op1b->Pt.X >= Left && op1b->Pt.X <= Right)
    {
      Pt = op1b->Pt; DiscardLeftSide = op1b->Pt.X > op1->Pt.X;
    }
    else
    {
      Pt = op2b->Pt; DiscardLeftSide = (op2b->Pt.X > op2->Pt.X);
    }
    j->OutPt1 = op1; j->OutPt2 = op2;
    return JoinHorz(op1, op1b, op2, op2b, Pt, DiscardLeftSide);
  } else
  {
    //nb: For non-horizontal joins ...
    //    1. Jr.OutPt1.Pt.Y == Jr.OutPt2.Pt.Y
    //    2. Jr.OutPt1.Pt > Jr.OffPt.Y

    //make sure the polygons are correctly oriented ...
    op1b = op1->Next;
    while ((op1b->Pt == op1->Pt) && (op1b != op1)) op1b = op1b->Next;
    bool Reverse1 = ((op1b->Pt.Y > op1->Pt.Y) ||
      !SlopesEqual(op1->Pt, op1b->Pt, j->OffPt, m_UseFullRange));
    if (Reverse1)
    {
      //the Next side didn't qualify, so try the Prev side instead
      op1b = op1->Prev;
      while ((op1b->Pt == op1->Pt) && (op1b != op1)) op1b = op1b->Prev;
      if ((op1b->Pt.Y > op1->Pt.Y) ||
        !SlopesEqual(op1->Pt, op1b->Pt, j->OffPt, m_UseFullRange)) return false;
    };
    op2b = op2->Next;
    while ((op2b->Pt == op2->Pt) && (op2b != op2))op2b = op2b->Next;
    bool Reverse2 = ((op2b->Pt.Y > op2->Pt.Y) ||
      !SlopesEqual(op2->Pt, op2b->Pt, j->OffPt, m_UseFullRange));
    if (Reverse2)
    {
      op2b = op2->Prev;
      while ((op2b->Pt == op2->Pt) && (op2b != op2)) op2b = op2b->Prev;
      if ((op2b->Pt.Y > op2->Pt.Y) ||
        !SlopesEqual(op2->Pt, op2b->Pt, j->OffPt, m_UseFullRange)) return false;
    }

    if ((op1b == op1) || (op2b == op2) || (op1b == op2b) ||
      ((outRec1 == outRec2) && (Reverse1 == Reverse2))) return false;

    if (Reverse1)
    {
      op1b = DupOutPt(op1, false);
      op2b = DupOutPt(op2, true);
      op1->Prev = op2;
      op2->Next = op1;
      op1b->Next = op2b;
      op2b->Prev = op1b;
      j->OutPt1 = op1;
      j->OutPt2 = op1b;
      return true;
    } else
    {
      op1b = DupOutPt(op1, true);
      op2b = DupOutPt(op2, false);
      op1->Next = op2;
      op2->Prev = op1;
      op1b->Prev = op2b;
      op2b->Next = op1b;
      j->OutPt1 = op1;
      j->OutPt2 = op1b;
      return true;
    }
  }
}
//----------------------------------------------------------------------

//ParseFirstLeft: follows the FirstLeft chain past any OutRec whose Pts is
//null and returns the first one that still has points (or null).
static OutRec* ParseFirstLeft(OutRec* FirstLeft)
{
  while (FirstLeft && !FirstLeft->Pts)
    FirstLeft = FirstLeft->FirstLeft;
  return FirstLeft;
}
//------------------------------------------------------------------------------

void Clipper::FixupFirstLefts1(OutRec* OldOutRec, OutRec* NewOutRec)
{
  //tests if NewOutRec contains the polygon before reassigning FirstLeft
  for (PolyOutList::size_type i = 0; i < m_PolyOuts.size(); ++i)
  {
    OutRec* outRec = m_PolyOuts[i];
    OutRec* firstLeft = ParseFirstLeft(outRec->FirstLeft);
    if (outRec->Pts  && firstLeft == OldOutRec)
    {
      if (Poly2ContainsPoly1(outRec->Pts, NewOutRec->Pts))
//(tail of Clipper::FixupFirstLefts1: body of its innermost 'if')
        outRec->FirstLeft = NewOutRec;
    }
  }
}
//----------------------------------------------------------------------

void Clipper::FixupFirstLefts2(OutRec* InnerOutRec, OutRec* OuterOutRec)
{
  //A polygon has split into two such that one is now the inner of the other.
  //It's possible that these polygons now wrap around other polygons, so check
  //every polygon that's also contained by OuterOutRec's FirstLeft container
  //(including 0) to see if they've become inner to the new inner polygon ...
  OutRec* orfl = OuterOutRec->FirstLeft;
  for (PolyOutList::size_type i = 0; i < m_PolyOuts.size(); ++i)
  {
    OutRec* outRec = m_PolyOuts[i];

    if (!outRec->Pts || outRec == OuterOutRec || outRec == InnerOutRec)
      continue;
    OutRec* firstLeft = ParseFirstLeft(outRec->FirstLeft);
    //only records owned by orfl, the inner or the outer are candidates
    if (firstLeft != orfl && firstLeft != InnerOutRec && firstLeft != OuterOutRec)
      continue;
    if (Poly2ContainsPoly1(outRec->Pts, InnerOutRec->Pts))
      outRec->FirstLeft = InnerOutRec;
    else if (Poly2ContainsPoly1(outRec->Pts, OuterOutRec->Pts))
      outRec->FirstLeft = OuterOutRec;
    else if (outRec->FirstLeft == InnerOutRec || outRec->FirstLeft == OuterOutRec)
      outRec->FirstLeft = orfl;
  }
}
//----------------------------------------------------------------------

void Clipper::FixupFirstLefts3(OutRec* OldOutRec, OutRec* NewOutRec)
{
  //reassigns FirstLeft WITHOUT testing if NewOutRec contains the polygon
  for (PolyOutList::size_type i = 0; i < m_PolyOuts.size(); ++i)
  {
    OutRec* outRec = m_PolyOuts[i];
    OutRec* firstLeft = ParseFirstLeft(outRec->FirstLeft);
    if (outRec->Pts && firstLeft == OldOutRec)
      outRec->FirstLeft = NewOutRec;
  }
}
//----------------------------------------------------------------------

//JoinCommonEdges: walks m_Joins and, for every join that JoinPoints()
//accepts, either splits one OutRec into two (when both ends belong to the
//same OutRec) or merges outRec2 into outRec1, fixing up IsHole / FirstLeft.
void Clipper::JoinCommonEdges()
{
  for (JoinList::size_type i = 0; i < m_Joins.size(); i++)
  {
    Join* join = m_Joins[i];

    OutRec *outRec1 = GetOutRec(join->OutPt1->Idx);
    OutRec *outRec2 = GetOutRec(join->OutPt2->Idx);

    if (!outRec1->Pts || !outRec2->Pts) continue;
    if (outRec1->IsOpen || outRec2->IsOpen) continue;

    //get the polygon fragment with the correct hole state (FirstLeft)
    //before calling JoinPoints() ...
    OutRec *holeStateRec;
    if (outRec1 == outRec2) holeStateRec = outRec1;
    else if (OutRec1RightOfOutRec2(outRec1, outRec2)) holeStateRec = outRec2;
    else if (OutRec1RightOfOutRec2(outRec2, outRec1)) holeStateRec = outRec1;
    else holeStateRec = GetLowermostRec(outRec1, outRec2);

    if (!JoinPoints(join, outRec1, outRec2)) continue;

    if (outRec1 == outRec2)
    {
      //instead of joining two polygons, we've just created a new one by
      //splitting one polygon into two.
      outRec1->Pts = join->OutPt1;
      outRec1->BottomPt = 0;
      outRec2 = CreateOutRec();
      outRec2->Pts = join->OutPt2;

      //update all OutRec2.Pts Idx's ...
      UpdateOutPtIdxs(*outRec2);

      if (Poly2ContainsPoly1(outRec2->Pts, outRec1->Pts))
      {
        //outRec1 contains outRec2 ...
        outRec2->IsHole = !outRec1->IsHole;
        outRec2->FirstLeft = outRec1;

        if (m_UsingPolyTree) FixupFirstLefts2(outRec2, outRec1);

        if ((outRec2->IsHole ^ m_ReverseOutput) == (Area(*outRec2) > 0))
          ReversePolyPtLinks(outRec2->Pts);

      } else if (Poly2ContainsPoly1(outRec1->Pts, outRec2->Pts))
      {
        //outRec2 contains outRec1 ...
        outRec2->IsHole = outRec1->IsHole;
        outRec1->IsHole = !outRec2->IsHole;
        outRec2->FirstLeft = outRec1->FirstLeft;
        outRec1->FirstLeft = outRec2;

        if (m_UsingPolyTree) FixupFirstLefts2(outRec1, outRec2);

        if ((outRec1->IsHole ^ m_ReverseOutput) == (Area(*outRec1) > 0))
          ReversePolyPtLinks(outRec1->Pts);
      }
      else
      {
        //the 2 polygons are completely separate ...
        outRec2->IsHole = outRec1->IsHole;
        outRec2->FirstLeft = outRec1->FirstLeft;

        //fixup FirstLeft pointers that may need reassigning to OutRec2
        if (m_UsingPolyTree) FixupFirstLefts1(outRec1, outRec2);
      }

    } else
    {
      //joined 2 polygons together ...

      outRec2->Pts = 0;
      outRec2->BottomPt = 0;
      outRec2->Idx = outRec1->Idx;

      outRec1->IsHole = holeStateRec->IsHole;
      if (holeStateRec == outRec2)
        outRec1->FirstLeft = outRec2->FirstLeft;
      outRec2->FirstLeft = outRec1;

      if (m_UsingPolyTree) FixupFirstLefts3(outRec2, outRec1);
    }
  }
}

//------------------------------------------------------------------------------
// ClipperOffset support functions ...
//------------------------------------------------------------------------------

//GetUnitNormal: returns (dy, -dx) of the normalised vector pt1 -> pt2,
//or (0, 0) when the two points coincide.
DoublePoint GetUnitNormal(const IntPoint &pt1, const IntPoint &pt2)
{
  if(pt2.X == pt1.X && pt2.Y == pt1.Y)
    return DoublePoint(0, 0);

  double Dx = (double)(pt2.X - pt1.X);
  double dy = (double)(pt2.Y - pt1.Y);
  double f = 1 *1.0/ std::sqrt( Dx*Dx + dy*dy );
  Dx *= f;
  dy *= f;
  return DoublePoint(dy, -Dx);
}

//------------------------------------------------------------------------------
// ClipperOffset class
//------------------------------------------------------------------------------

ClipperOffset::ClipperOffset(double miterLimit, double arcTolerance)
{
  this->MiterLimit = miterLimit;
  this->ArcTolerance = arcTolerance;
  m_lowest.X = -1; //a negative X marks 'no lowest point recorded yet'
}
//------------------------------------------------------------------------------

ClipperOffset::~ClipperOffset()
{
  Clear();
}
//------------------------------------------------------------------------------

//Clear: deletes every child node held by m_polyNodes and resets m_lowest.
void ClipperOffset::Clear()
{
  for (int i = 0; i < m_polyNodes.ChildCount(); ++i)
    delete m_polyNodes.Childs[i];
  m_polyNodes.Childs.clear();
  m_lowest.X = -1;
}
//------------------------------------------------------------------------------

void ClipperOffset::AddPath(const Path& path, JoinType joinType, EndType endType)
{
  int highI = (int)path.size() - 1;
  if (highI < 0) return;
  PolyNode* newNode = new PolyNode();
  newNode->m_jointype = joinType;
  newNode->m_endtype = endType;

  //strip duplicate points from path and also get index to the lowest point ...
//(continuation of ClipperOffset::AddPath)
  if (endType == etClosedLine || endType == etClosedPolygon)
    while (highI > 0 && path[0] == path[highI]) highI--;
  newNode->Contour.reserve(highI + 1);
  newNode->Contour.push_back(path[0]);
  //j = index of the last point kept, k = index of the lowest point so far
  //(largest Y, ties broken by smallest X)
  int j = 0, k = 0;
  for (int i = 1; i <= highI; i++)
    if (newNode->Contour[j] != path[i])
    {
      j++;
      newNode->Contour.push_back(path[i]);
      if (path[i].Y > newNode->Contour[k].Y ||
        (path[i].Y == newNode->Contour[k].Y &&
        path[i].X < newNode->Contour[k].X)) k = j;
    }
  //a closed polygon needs at least 3 distinct points
  if (endType == etClosedPolygon && j < 2)
  {
    delete newNode;
    return;
  }
  m_polyNodes.AddChild(*newNode);

  //if this path's lowest pt is lower than all the others then update m_lowest
  if (endType != etClosedPolygon) return;
  if (m_lowest.X < 0)
    m_lowest = IntPoint(m_polyNodes.ChildCount() - 1, k);
  else
  {
    //m_lowest stores (child index, contour index) rather than a coordinate
    IntPoint ip = m_polyNodes.Childs[(int)m_lowest.X]->Contour[(int)m_lowest.Y];
    if (newNode->Contour[k].Y > ip.Y ||
      (newNode->Contour[k].Y == ip.Y &&
      newNode->Contour[k].X < ip.X))
      m_lowest = IntPoint(m_polyNodes.ChildCount() - 1, k);
  }
}
//------------------------------------------------------------------------------

//AddPaths: calls AddPath for each path with the same join and end type.
void ClipperOffset::AddPaths(const Paths& paths, JoinType joinType, EndType endType)
{
  for (Paths::size_type i = 0; i < paths.size(); ++i)
    AddPath(paths[i], joinType, endType);
}
//------------------------------------------------------------------------------

void ClipperOffset::FixOrientations()
{
  //fixup orientations of all closed paths if the orientation of the
  //closed path with the lowermost vertex is wrong ...
  if (m_lowest.X >= 0 &&
    !Orientation(m_polyNodes.Childs[(int)m_lowest.X]->Contour))
  {
    for (int i = 0; i < m_polyNodes.ChildCount(); ++i)
    {
      PolyNode& node = *m_polyNodes.Childs[i];
      if (node.m_endtype == etClosedPolygon ||
        (node.m_endtype == etClosedLine && Orientation(node.Contour)))
          ReversePath(node.Contour);
    }
  } else
  {
    for (int i = 0; i < m_polyNodes.ChildCount(); ++i)
    {
      PolyNode& node = *m_polyNodes.Childs[i];
      if (node.m_endtype == etClosedLine && !Orientation(node.Contour))
        ReversePath(node.Contour);
    }
  }
}
//------------------------------------------------------------------------------

//Execute (Paths overload): offsets the stored paths by delta, then unions
//the raw offset polygons in a Clipper to produce 'solution'.
void ClipperOffset::Execute(Paths& solution, double delta)
{
  solution.clear();
  FixOrientations();
  DoOffset(delta);

  //now clean up 'corners' ...
  Clipper clpr;
  clpr.AddPaths(m_destPolys, ptSubject, true);
  if (delta > 0)
  {
    clpr.Execute(ctUnion, solution, pftPositive, pftPositive);
  }
  else
  {
    //add a rectangle 10 units larger than the bounds on every side, union
    //with negative filling, then drop the first result path
    IntRect r = clpr.GetBounds();
    Path outer(4);
    outer[0] = IntPoint(r.left - 10, r.bottom + 10);
    outer[1] = IntPoint(r.right + 10, r.bottom + 10);
    outer[2] = IntPoint(r.right + 10, r.top - 10);
    outer[3] = IntPoint(r.left - 10, r.top - 10);

    clpr.AddPath(outer, ptSubject, true);
    clpr.ReverseSolution(true);
    clpr.Execute(ctUnion, solution, pftNegative, pftNegative);
    if (solution.size() > 0) solution.erase(solution.begin());
  }
}
//------------------------------------------------------------------------------

void ClipperOffset::Execute(PolyTree& solution, double delta)
{
  solution.Clear();
  FixOrientations();
  DoOffset(delta);

  //now clean up 'corners' ...
//(continuation of ClipperOffset::Execute(PolyTree&, double))
  Clipper clpr;
  clpr.AddPaths(m_destPolys, ptSubject, true);
  if (delta > 0)
  {
    clpr.Execute(ctUnion, solution, pftPositive, pftPositive);
  }
  else
  {
    IntRect r = clpr.GetBounds();
    Path outer(4);
    outer[0] = IntPoint(r.left - 10, r.bottom + 10);
    outer[1] = IntPoint(r.right + 10, r.bottom + 10);
    outer[2] = IntPoint(r.right + 10, r.top - 10);
    outer[3] = IntPoint(r.left - 10, r.top - 10);

    clpr.AddPath(outer, ptSubject, true);
    clpr.ReverseSolution(true);
    clpr.Execute(ctUnion, solution, pftNegative, pftNegative);
    //remove the outer PolyNode rectangle ...
    if (solution.ChildCount() == 1 && solution.Childs[0]->ChildCount() > 0)
    {
      //promote the rectangle's children to top-level children of solution
      PolyNode* outerNode = solution.Childs[0];
      solution.Childs.reserve(outerNode->ChildCount());
      solution.Childs[0] = outerNode->Childs[0];
      solution.Childs[0]->Parent = outerNode->Parent;
      for (int i = 1; i < outerNode->ChildCount(); ++i)
        solution.AddChild(*outerNode->Childs[i]);
    }
    else
      solution.Clear();
  }
}
//------------------------------------------------------------------------------

//DoOffset: fills m_destPolys with the raw (not yet unioned) offset outline
//of every stored path, offset by delta.
void ClipperOffset::DoOffset(double delta)
{
  m_destPolys.clear();
  m_delta = delta;

  //if Zero offset, just copy any CLOSED polygons to m_p and return ...
  if (NEAR_ZERO(delta))
  {
    m_destPolys.reserve(m_polyNodes.ChildCount());
    for (int i = 0; i < m_polyNodes.ChildCount(); i++)
    {
      PolyNode& node = *m_polyNodes.Childs[i];
      if (node.m_endtype == etClosedPolygon)
        m_destPolys.push_back(node.Contour);
    }
    return;
  }

  //see offset_triginometry3.svg in the documentation folder ...
  if (MiterLimit > 2) m_miterLim = 2/(MiterLimit * MiterLimit);
  else m_miterLim = 0.5;

  //y = effective arc tolerance, capped at |delta| * def_arc_tolerance
  double y;
  if (ArcTolerance <= 0.0) y = def_arc_tolerance;
  else if (ArcTolerance > std::fabs(delta) * def_arc_tolerance)
    y = std::fabs(delta) * def_arc_tolerance;
  else y = ArcTolerance;
  //see offset_triginometry2.svg in the documentation folder ...
  double steps = pi / std::acos(1 - y / std::fabs(delta));
  if (steps > std::fabs(delta) * pi)
    steps = std::fabs(delta) * pi;  //ie excessive precision check
  m_sin = std::sin(two_pi / steps);
  m_cos = std::cos(two_pi / steps);
  m_StepsPerRad = steps / two_pi;
  if (delta < 0.0) m_sin = -m_sin;

  m_destPolys.reserve(m_polyNodes.ChildCount() * 2);
  for (int i = 0; i < m_polyNodes.ChildCount(); i++)
  {
    PolyNode& node = *m_polyNodes.Childs[i];
    m_srcPoly = node.Contour;

    int len = (int)m_srcPoly.size();
    //non-positive deltas are only applied to closed polygons of 3+ points
    if (len == 0 || (delta <= 0 && (len < 3 || node.m_endtype != etClosedPolygon)))
        continue;

    m_destPoly.clear();
    if (len == 1)
    {
      //a single point becomes a circle (jtRound) or a square around it
      if (node.m_jointype == jtRound)
      {
        double X = 1.0, Y = 0.0;
        for (cInt j = 1; j <= steps; j++)
        {
          m_destPoly.push_back(IntPoint(
            Round(m_srcPoly[0].X + X * delta),
            Round(m_srcPoly[0].Y + Y * delta)));
          double X2 = X;
          X = X * m_cos - m_sin * Y;
          Y = X2 * m_sin + Y * m_cos;
        }
      }
      else
      {
        double X = -1.0, Y = -1.0;
        for (int j = 0; j < 4; ++j)
        {
          m_destPoly.push_back(IntPoint(
            Round(m_srcPoly[0].X + X * delta),
            Round(m_srcPoly[0].Y + Y * delta)));
          if (X < 0) X = 1;
          else if (Y < 0) Y = 1;
          else X = -1;
        }
      }
      m_destPolys.push_back(m_destPoly);
      continue;
    }
    //build m_normals ...
    m_normals.clear();
    m_normals.reserve(len);
    for (int j = 0; j < len - 1; ++j)
      m_normals.push_back(GetUnitNormal(m_srcPoly[j], m_srcPoly[j + 1]));
    if (node.m_endtype == etClosedLine || node.m_endtype == etClosedPolygon)
      m_normals.push_back(GetUnitNormal(m_srcPoly[len - 1], m_srcPoly[0]));
    else
      m_normals.push_back(DoublePoint(m_normals[len - 2]));

    if (node.m_endtype == etClosedPolygon)
    {
      int k = len - 1;
      for (int j = 0; j < len; ++j)
        OffsetPoint(j, k, node.m_jointype);
      m_destPolys.push_back(m_destPoly);
    }
    else if (node.m_endtype == etClosedLine)
    {
      //closed line: one outline going forwards, a second going backwards
      int k = len - 1;
      for (int j = 0; j < len; ++j)
        OffsetPoint(j, k, node.m_jointype);
      m_destPolys.push_back(m_destPoly);
      m_destPoly.clear();
      //re-build m_normals ...
      DoublePoint n = m_normals[len -1];
      for (int j = len - 1; j > 0; j--)
        m_normals[j] = DoublePoint(-m_normals[j - 1].X, -m_normals[j - 1].Y);
      m_normals[0] = DoublePoint(-n.X, -n.Y);
      k = 0;
      for (int j = len - 1; j >= 0; j--)
        OffsetPoint(j, k, node.m_jointype);
      m_destPolys.push_back(m_destPoly);
    }
    else
    {
      //open path: forward side, end cap, reversed side, start cap
      int k = 0;
      for (int j = 1; j < len - 1; ++j)
        OffsetPoint(j, k, node.m_jointype);

      IntPoint pt1;
      if (node.m_endtype == etOpenButt)
      {
        int j = len - 1;
        pt1 = IntPoint((cInt)Round(m_srcPoly[j].X + m_normals[j].X *
          delta), (cInt)Round(m_srcPoly[j].Y + m_normals[j].Y * delta));
        m_destPoly.push_back(pt1);
        pt1 = IntPoint((cInt)Round(m_srcPoly[j].X - m_normals[j].X *
          delta), (cInt)Round(m_srcPoly[j].Y - m_normals[j].Y * delta));
        m_destPoly.push_back(pt1);
      }
      else
      {
        int j = len - 1;
        k = len - 2;
        m_sinA = 0;
        m_normals[j] = DoublePoint(-m_normals[j].X, -m_normals[j].Y);
        if (node.m_endtype == etOpenSquare)
          DoSquare(j, k);
        else
          DoRound(j, k);
      }

      //re-build m_normals ...
      for (int j = len - 1; j > 0; j--)
        m_normals[j] = DoublePoint(-m_normals[j - 1].X, -m_normals[j - 1].Y);
      m_normals[0] = DoublePoint(-m_normals[1].X, -m_normals[1].Y);

      k = len - 1;
      for (int j = k - 1; j > 0; --j) OffsetPoint(j, k, node.m_jointype);

      if (node.m_endtype == etOpenButt)
      {
        pt1 = IntPoint((cInt)Round(m_srcPoly[0].X - m_normals[0].X * delta),
          (cInt)Round(m_srcPoly[0].Y - m_normals[0].Y * delta));
        m_destPoly.push_back(pt1);
        pt1 = IntPoint((cInt)Round(m_srcPoly[0].X + m_normals[0].X * delta),
          (cInt)Round(m_srcPoly[0].Y + m_normals[0].Y * delta));
        m_destPoly.push_back(pt1);
      }
      else
      {
        k = 1;
        m_sinA = 0;
        if (node.m_endtype == etOpenSquare)
          DoSquare(0, 1);
        else
          DoRound(0, 1);
      }
      m_destPolys.push_back(m_destPoly);
    }
  }
}
//------------------------------------------------------------------------------

//OffsetPoint: appends to m_destPoly the offset vertex/vertices for source
//point j, given that the previous edge's normal is m_normals[k]. Sets k = j
//on exit, except on the early return for near-straight joins.
void ClipperOffset::OffsetPoint(int j, int& k, JoinType jointype)
{
  //cross product ...
  m_sinA = (m_normals[k].X * m_normals[j].Y - m_normals[j].X * m_normals[k].Y);
  if (std::fabs(m_sinA * m_delta) < 1.0)
  {
    //dot product ...
    double cosA = (m_normals[k].X * m_normals[j].X + m_normals[j].Y * m_normals[k].Y );
    if (cosA > 0) // angle => 0 degrees
    {
      m_destPoly.push_back(IntPoint(Round(m_srcPoly[j].X + m_normals[k].X * m_delta),
        Round(m_srcPoly[j].Y + m_normals[k].Y * m_delta)));
      return;
    }
    //else angle => 180 degrees
  }
  else if (m_sinA > 1.0) m_sinA = 1.0;
  else if (m_sinA < -1.0) m_sinA = -1.0;

  if (m_sinA * m_delta < 0)
  {
    //emit both edge offsets with the source vertex itself between them
    m_destPoly.push_back(IntPoint(Round(m_srcPoly[j].X + m_normals[k].X * m_delta),
      Round(m_srcPoly[j].Y + m_normals[k].Y * m_delta)));
    m_destPoly.push_back(m_srcPoly[j]);
    m_destPoly.push_back(IntPoint(Round(m_srcPoly[j].X + m_normals[j].X * m_delta),
      Round(m_srcPoly[j].Y + m_normals[j].Y * m_delta)));
  }
  else
    switch (jointype)
    {
      case jtMiter:
        {
          double r = 1 + (m_normals[j].X * m_normals[k].X +
            m_normals[j].Y * m_normals[k].Y);
          //fall back to a squared join when the miter would exceed the limit
          if (r >= m_miterLim) DoMiter(j, k, r); else DoSquare(j, k);
          break;
        }
      case jtSquare: DoSquare(j, k); break;
      case jtRound: DoRound(j, k); break;
    }
  k = j;
}
//------------------------------------------------------------------------------

//DoSquare: appends the two corner points of a squared-off join at point j.
void ClipperOffset::DoSquare(int j, int k)
{
  double dx = std::tan(std::atan2(m_sinA,
      m_normals[k].X * m_normals[j].X + m_normals[k].Y * m_normals[j].Y) / 4);
  m_destPoly.push_back(IntPoint(
      Round(m_srcPoly[j].X + m_delta * (m_normals[k].X - m_normals[k].Y * dx)),
      Round(m_srcPoly[j].Y + m_delta * (m_normals[k].Y + m_normals[k].X * dx))));
  m_destPoly.push_back(IntPoint(
      Round(m_srcPoly[j].X + m_delta * (m_normals[j].X + m_normals[j].Y * dx)),
      Round(m_srcPoly[j].Y + m_delta * (m_normals[j].Y - m_normals[j].X * dx))));
}
//------------------------------------------------------------------------------

//DoMiter: appends the single miter point for point j; r is the value
//1 + dot(normal j, normal k) computed by the caller.
void ClipperOffset::DoMiter(int j, int k, double r)
{
  double q = m_delta / r;
  m_destPoly.push_back(IntPoint(Round(m_srcPoly[j].X + (m_normals[k].X + m_normals[j].X) * q),
      Round(m_srcPoly[j].Y + (m_normals[k].Y + m_normals[j].Y) * q)));
}
//------------------------------------------------------------------------------

void
ClipperOffset::DoRound(int j, int k)
{
  //DoRound: appends an arc of points around source point j, starting at
  //normal k and finishing exactly on normal j.
  double a = std::atan2(m_sinA,
  m_normals[k].X * m_normals[j].X + m_normals[k].Y * m_normals[j].Y);
  int steps = std::max((int)Round(m_StepsPerRad * std::fabs(a)), 1);

  double X = m_normals[k].X, Y = m_normals[k].Y, X2;
  for (int i = 0; i < steps; ++i)
  {
    m_destPoly.push_back(IntPoint(
        Round(m_srcPoly[j].X + X * m_delta),
        Round(m_srcPoly[j].Y + Y * m_delta)));
    //rotate (X, Y) by one step using the precomputed m_sin / m_cos
    X2 = X;
    X = X * m_cos - m_sin * Y;
    Y = X2 * m_sin + Y * m_cos;
  }
  m_destPoly.push_back(IntPoint(
  Round(m_srcPoly[j].X + m_normals[j].X * m_delta),
  Round(m_srcPoly[j].Y + m_normals[j].Y * m_delta)));
}

//------------------------------------------------------------------------------
// Miscellaneous public functions
//------------------------------------------------------------------------------

//DoSimplePolygons: for every closed OutRec, looks for two non-adjacent
//vertices at the same location and splits the ring there into two OutRecs,
//then fixes IsHole / FirstLeft according to which ring contains the other.
void Clipper::DoSimplePolygons()
{
  PolyOutList::size_type i = 0;
  while (i < m_PolyOuts.size())
  {
    OutRec* outrec = m_PolyOuts[i++];
    OutPt* op = outrec->Pts;
    if (!op || outrec->IsOpen) continue;
    do //for each Pt in Polygon until duplicate found do ...
    {
      OutPt* op2 = op->Next;
      while (op2 != outrec->Pts)
      {
        if ((op->Pt == op2->Pt) && op2->Next != op && op2->Prev != op)
        {
          //split the polygon into two ...
          OutPt* op3 = op->Prev;
          OutPt* op4 = op2->Prev;
          op->Prev = op4;
          op4->Next = op;
          op2->Prev = op3;
          op3->Next = op2;

          outrec->Pts = op;
          OutRec* outrec2 = CreateOutRec();
          outrec2->Pts = op2;
          UpdateOutPtIdxs(*outrec2);
          if (Poly2ContainsPoly1(outrec2->Pts, outrec->Pts))
          {
            //OutRec2 is contained by OutRec1 ...
            outrec2->IsHole = !outrec->IsHole;
            outrec2->FirstLeft = outrec;
            if (m_UsingPolyTree) FixupFirstLefts2(outrec2, outrec);
          }
          else
            if (Poly2ContainsPoly1(outrec->Pts, outrec2->Pts))
          {
            //OutRec1 is contained by OutRec2 ...
            outrec2->IsHole = outrec->IsHole;
            outrec->IsHole = !outrec2->IsHole;
            outrec2->FirstLeft = outrec->FirstLeft;
            outrec->FirstLeft = outrec2;
            if (m_UsingPolyTree) FixupFirstLefts2(outrec, outrec2);
            }
            else
          {
            //the 2 polygons are separate ...
            outrec2->IsHole = outrec->IsHole;
            outrec2->FirstLeft = outrec->FirstLeft;
            if (m_UsingPolyTree) FixupFirstLefts1(outrec, outrec2);
            }
          op2 = op; //ie get ready for the Next iteration
        }
        op2 = op2->Next;
      }
      op = op->Next;
    }
    while (op != outrec->Pts);
  }
}
//------------------------------------------------------------------------------

//ReversePath: reverses the vertex order of p in place.
void ReversePath(Path& p)
{
  std::reverse(p.begin(), p.end());
}
//------------------------------------------------------------------------------

//ReversePaths: reverses the vertex order of every path in p.
void ReversePaths(Paths& p)
{
  for (Paths::size_type i = 0; i < p.size(); ++i)
    ReversePath(p[i]);
}
//------------------------------------------------------------------------------

//SimplifyPolygon: unions in_poly with itself in a StrictlySimple Clipper
//and writes the result to out_polys.
void SimplifyPolygon(const Path &in_poly, Paths &out_polys, PolyFillType fillType)
{
  Clipper c;
  c.StrictlySimple(true);
  c.AddPath(in_poly, ptSubject, true);
  c.Execute(ctUnion, out_polys, fillType, fillType);
}
//------------------------------------------------------------------------------

//SimplifyPolygons: as SimplifyPolygon, for a set of input polygons.
void SimplifyPolygons(const Paths &in_polys, Paths &out_polys, PolyFillType fillType)
{
  Clipper c;
  c.StrictlySimple(true);
  c.AddPaths(in_polys, ptSubject, true);
  c.Execute(ctUnion, out_polys, fillType, fillType);
}
//------------------------------------------------------------------------------

//in-place overload: forwards with polys as both input and output
void SimplifyPolygons(Paths &polys, PolyFillType fillType)
{
  SimplifyPolygons(polys, polys, fillType);
}
//------------------------------------------------------------------------------

//DistanceSqrd: squared Euclidean distance between pt1 and pt2, as a double.
inline double DistanceSqrd(const IntPoint& pt1, const IntPoint& pt2)
{
  double Dx = ((double)pt1.X - pt2.X);
  double dy = ((double)pt1.Y - pt2.Y);
  return (Dx*Dx + dy*dy);
}
//------------------------------------------------------------------------------

double DistanceFromLineSqrd(
  const IntPoint& pt, const IntPoint& ln1, const IntPoint& ln2)
{
  //The equation of a line in general form (Ax + By + C = 0)
  //given 2 points (x1,y1) & (x2,y2) is ...
  //(y1 - y2)x + (x2 - x1)y + (y2 - y1)x1 - (x2 - x1)y1 = 0
  //A = (y1 - y2); B = (x2 - x1); C = (y2 - y1)x1 - (x2 - x1)y1
  //perpendicular distance of point (x3,y3) = (Ax3 + By3 + C)/Sqrt(A^2 + B^2)
  //see http://en.wikipedia.org/wiki/Perpendicular_distance
  //nb: the value returned is the SQUARE of that distance; when ln1 == ln2
  //both A and B are zero and the division below is 0/0.
  double A = double(ln1.Y - ln2.Y);
  double B = double(ln2.X - ln1.X);
  double C = A * ln1.X  + B * ln1.Y;
  C = A * pt.X + B * pt.Y - C;
  return (C * C) / (A * A + B * B);
}
//---------------------------------------------------------------------------

//SlopesNearCollinear: true when the point lying between the other two
//(judged along the dominant axis of pt1 -> pt2) is closer than
//sqrt(distSqrd) to the line through those other two.
bool SlopesNearCollinear(const IntPoint& pt1,
    const IntPoint& pt2, const IntPoint& pt3, double distSqrd)
{
  //this function is more accurate when the point that's geometrically
  //between the other 2 points is the one that's tested for distance.
  //ie makes it more likely to pick up 'spikes' ...
  if (Abs(pt1.X - pt2.X) > Abs(pt1.Y - pt2.Y))
  {
    if ((pt1.X > pt2.X) == (pt1.X < pt3.X))
      return DistanceFromLineSqrd(pt1, pt2, pt3) < distSqrd;
    else if ((pt2.X > pt1.X) == (pt2.X < pt3.X))
      return DistanceFromLineSqrd(pt2, pt1, pt3) < distSqrd;
    else
      return DistanceFromLineSqrd(pt3, pt1, pt2) < distSqrd;
  }
  else
  {
    if ((pt1.Y > pt2.Y) == (pt1.Y < pt3.Y))
      return DistanceFromLineSqrd(pt1, pt2, pt3) < distSqrd;
    else if ((pt2.Y > pt1.Y) == (pt2.Y < pt3.Y))
      return DistanceFromLineSqrd(pt2, pt1, pt3) < distSqrd;
    else
      return DistanceFromLineSqrd(pt3, pt1, pt2) < distSqrd;
  }
}
//------------------------------------------------------------------------------

//PointsAreClose: true when the squared distance between pt1 and pt2 is
//less than or equal to distSqrd.
bool PointsAreClose(IntPoint pt1, IntPoint pt2, double distSqrd)
{
  double Dx = (double)pt1.X - pt2.X;
  double dy = (double)pt1.Y - pt2.Y;
  return ((Dx * Dx) + (dy * dy) <= distSqrd);
}
//------------------------------------------------------------------------------

//ExcludeOp: unlinks op from its ring and returns op's predecessor, whose
//Idx is reset to 0. The node itself is not freed here.
OutPt* ExcludeOp(OutPt* op)
{
  OutPt* result = op->Prev;
  result->Next = op->Next;
  op->Next->Prev = result;
  result->Idx = 0;
  return result;
}
//------------------------------------------------------------------------------

void CleanPolygon(const Path& in_poly, Path& out_poly, double distance)
{ //distance = proximity in units/pixels below which vertices //will be stripped. Default ~= sqrt(2). size_t size = in_poly.size(); if (size == 0) { out_poly.clear(); return; } OutPt* outPts = new OutPt[size]; for (size_t i = 0; i < size; ++i) { outPts[i].Pt = in_poly[i]; outPts[i].Next = &outPts[(i + 1) % size]; outPts[i].Next->Prev = &outPts[i]; outPts[i].Idx = 0; } double distSqrd = distance * distance; OutPt* op = &outPts[0]; while (op->Idx == 0 && op->Next != op->Prev) { if (PointsAreClose(op->Pt, op->Prev->Pt, distSqrd)) { op = ExcludeOp(op); size--; } else if (PointsAreClose(op->Prev->Pt, op->Next->Pt, distSqrd)) { ExcludeOp(op->Next); op = ExcludeOp(op); size -= 2; } else if (SlopesNearCollinear(op->Prev->Pt, op->Pt, op->Next->Pt, distSqrd)) { op = ExcludeOp(op); size--; } else { op->Idx = 1; op = op->Next; } } if (size < 3) size = 0; out_poly.resize(size); for (size_t i = 0; i < size; ++i) { out_poly[i] = op->Pt; op = op->Next; } delete [] outPts; } //------------------------------------------------------------------------------ void CleanPolygon(Path& poly, double distance) { CleanPolygon(poly, poly, distance); } //------------------------------------------------------------------------------ void CleanPolygons(const Paths& in_polys, Paths& out_polys, double distance) { out_polys.resize(in_polys.size()); for (Paths::size_type i = 0; i < in_polys.size(); ++i) CleanPolygon(in_polys[i], out_polys[i], distance); } //------------------------------------------------------------------------------ void CleanPolygons(Paths& polys, double distance) { CleanPolygons(polys, polys, distance); } //------------------------------------------------------------------------------ void Minkowski(const Path& poly, const Path& path, Paths& solution, bool isSum, bool isClosed) { int delta = (isClosed ? 
1 : 0); size_t polyCnt = poly.size(); size_t pathCnt = path.size(); Paths pp; pp.reserve(pathCnt); if (isSum) for (size_t i = 0; i < pathCnt; ++i) { Path p; p.reserve(polyCnt); for (size_t j = 0; j < poly.size(); ++j) p.push_back(IntPoint(path[i].X + poly[j].X, path[i].Y + poly[j].Y)); pp.push_back(p); } else for (size_t i = 0; i < pathCnt; ++i) { Path p; p.reserve(polyCnt); for (size_t j = 0; j < poly.size(); ++j) p.push_back(IntPoint(path[i].X - poly[j].X, path[i].Y - poly[j].Y)); pp.push_back(p); } solution.clear(); solution.reserve((pathCnt + delta) * (polyCnt + 1)); for (size_t i = 0; i < pathCnt - 1 + delta; ++i) for (size_t j = 0; j < polyCnt; ++j) { Path quad; quad.reserve(4); quad.push_back(pp[i % pathCnt][j % polyCnt]); quad.push_back(pp[(i + 1) % pathCnt][j % polyCnt]); quad.push_back(pp[(i + 1) % pathCnt][(j + 1) % polyCnt]); quad.push_back(pp[i % pathCnt][(j + 1) % polyCnt]); if (!Orientation(quad)) ReversePath(quad); solution.push_back(quad); } } //------------------------------------------------------------------------------ void MinkowskiSum(const Path& pattern, const Path& path, Paths& solution, bool pathIsClosed) { Minkowski(pattern, path, solution, true, pathIsClosed); Clipper c; c.AddPaths(solution, ptSubject, true); c.Execute(ctUnion, solution, pftNonZero, pftNonZero); } //------------------------------------------------------------------------------ void TranslatePath(const Path& input, Path& output, const IntPoint delta) { //precondition: input != output output.resize(input.size()); for (size_t i = 0; i < input.size(); ++i) output[i] = IntPoint(input[i].X + delta.X, input[i].Y + delta.Y); } //------------------------------------------------------------------------------ void MinkowskiSum(const Path& pattern, const Paths& paths, Paths& solution, bool pathIsClosed) { Clipper c; for (size_t i = 0; i < paths.size(); ++i) { Paths tmp; Minkowski(pattern, paths[i], tmp, true, pathIsClosed); c.AddPaths(tmp, ptSubject, true); if (pathIsClosed) { Path 
tmp2; TranslatePath(paths[i], tmp2, pattern[0]); c.AddPath(tmp2, ptClip, true); } } c.Execute(ctUnion, solution, pftNonZero, pftNonZero); } //------------------------------------------------------------------------------ void MinkowskiDiff(const Path& poly1, const Path& poly2, Paths& solution) { Minkowski(poly1, poly2, solution, false, true); Clipper c; c.AddPaths(solution, ptSubject, true); c.Execute(ctUnion, solution, pftNonZero, pftNonZero); } //------------------------------------------------------------------------------ enum NodeType {ntAny, ntOpen, ntClosed}; void AddPolyNodeToPaths(const PolyNode& polynode, NodeType nodetype, Paths& paths) { bool match = true; if (nodetype == ntClosed) match = !polynode.IsOpen(); else if (nodetype == ntOpen) return; if (!polynode.Contour.empty() && match) paths.push_back(polynode.Contour); for (int i = 0; i < polynode.ChildCount(); ++i) AddPolyNodeToPaths(*polynode.Childs[i], nodetype, paths); } //------------------------------------------------------------------------------ void PolyTreeToPaths(const PolyTree& polytree, Paths& paths) { paths.resize(0); paths.reserve(polytree.Total()); AddPolyNodeToPaths(polytree, ntAny, paths); } //------------------------------------------------------------------------------ void ClosedPathsFromPolyTree(const PolyTree& polytree, Paths& paths) { paths.resize(0); paths.reserve(polytree.Total()); AddPolyNodeToPaths(polytree, ntClosed, paths); } //------------------------------------------------------------------------------ void OpenPathsFromPolyTree(PolyTree& polytree, Paths& paths) { paths.resize(0); paths.reserve(polytree.Total()); //Open paths are top level only, so ... 
for (int i = 0; i < polytree.ChildCount(); ++i) if (polytree.Childs[i]->IsOpen()) paths.push_back(polytree.Childs[i]->Contour); } //------------------------------------------------------------------------------ std::ostream& operator <<(std::ostream &s, const IntPoint &p) { s << "(" << p.X << "," << p.Y << ")"; return s; } //------------------------------------------------------------------------------ std::ostream& operator <<(std::ostream &s, const Path &p) { if (p.empty()) return s; Path::size_type last = p.size() -1; for (Path::size_type i = 0; i < last; i++) s << "(" << p[i].X << "," << p[i].Y << "), "; s << "(" << p[last].X << "," << p[last].Y << ")\n"; return s; } //------------------------------------------------------------------------------ std::ostream& operator <<(std::ostream &s, const Paths &p) { for (Paths::size_type i = 0; i < p.size(); i++) s << p[i]; s << "\n"; return s; } //------------------------------------------------------------------------------ } //ClipperLib namespace ================================================ FILE: dbnet/clipper/clipper.hpp ================================================ /******************************************************************************* * * * Author : Angus Johnson * * Version : 6.4.2 * * Date : 27 February 2017 * * Website : http://www.angusj.com * * Copyright : Angus Johnson 2010-2017 * * * * License: * * Use, modification & distribution is subject to Boost Software License Ver 1. * * http://www.boost.org/LICENSE_1_0.txt * * * * Attributions: * * The code in this library is an extension of Bala Vatti's clipping algorithm: * * "A generic solution to polygon clipping" * * Communications of the ACM, Vol 35, Issue 7 (July 1992) pp 56-63. * * http://portal.acm.org/citation.cfm?id=129906 * * * * Computer graphics and geometric modeling: implementation and algorithms * * By Max K. 
Agoston * * Springer; 1 edition (January 4, 2005) * * http://books.google.com/books?q=vatti+clipping+agoston * * * * See also: * * "Polygon Offsetting by Computing Winding Numbers" * * Paper no. DETC2005-85513 pp. 565-575 * * ASME 2005 International Design Engineering Technical Conferences * * and Computers and Information in Engineering Conference (IDETC/CIE2005) * * September 24-28, 2005 , Long Beach, California, USA * * http://www.me.berkeley.edu/~mcmains/pubs/DAC05OffsetPolygon.pdf * * * *******************************************************************************/ #ifndef clipper_hpp #define clipper_hpp #define CLIPPER_VERSION "6.4.2" //use_int32: When enabled 32bit ints are used instead of 64bit ints. This //improve performance but coordinate values are limited to the range +/- 46340 //#define use_int32 //use_xyz: adds a Z member to IntPoint. Adds a minor cost to perfomance. //#define use_xyz //use_lines: Enables line clipping. Adds a very minor cost to performance. #define use_lines //use_deprecated: Enables temporary support for the obsolete functions //#define use_deprecated #include #include #include #include #include #include #include #include #include namespace ClipperLib { enum ClipType { ctIntersection, ctUnion, ctDifference, ctXor }; enum PolyType { ptSubject, ptClip }; //By far the most widely used winding rules for polygon filling are //EvenOdd & NonZero (GDI, GDI+, XLib, OpenGL, Cairo, AGG, Quartz, SVG, Gr32) //Others rules include Positive, Negative and ABS_GTR_EQ_TWO (only in OpenGL) //see http://glprogramming.com/red/chapter11.html enum PolyFillType { pftEvenOdd, pftNonZero, pftPositive, pftNegative }; #ifdef use_int32 typedef int cInt; static cInt const loRange = 0x7FFF; static cInt const hiRange = 0x7FFF; #else typedef signed long long cInt; static cInt const loRange = 0x3FFFFFFF; static cInt const hiRange = 0x3FFFFFFFFFFFFFFFLL; typedef signed long long long64; //used by Int128 class typedef unsigned long long ulong64; #endif struct 
IntPoint { cInt X; cInt Y; #ifdef use_xyz cInt Z; IntPoint(cInt x = 0, cInt y = 0, cInt z = 0): X(x), Y(y), Z(z) {}; #else IntPoint(cInt x = 0, cInt y = 0): X(x), Y(y) {}; #endif friend inline bool operator== (const IntPoint& a, const IntPoint& b) { return a.X == b.X && a.Y == b.Y; } friend inline bool operator!= (const IntPoint& a, const IntPoint& b) { return a.X != b.X || a.Y != b.Y; } }; //------------------------------------------------------------------------------ typedef std::vector< IntPoint > Path; typedef std::vector< Path > Paths; inline Path& operator <<(Path& poly, const IntPoint& p) {poly.push_back(p); return poly;} inline Paths& operator <<(Paths& polys, const Path& p) {polys.push_back(p); return polys;} std::ostream& operator <<(std::ostream &s, const IntPoint &p); std::ostream& operator <<(std::ostream &s, const Path &p); std::ostream& operator <<(std::ostream &s, const Paths &p); struct DoublePoint { double X; double Y; DoublePoint(double x = 0, double y = 0) : X(x), Y(y) {} DoublePoint(IntPoint ip) : X((double)ip.X), Y((double)ip.Y) {} }; //------------------------------------------------------------------------------ #ifdef use_xyz typedef void (*ZFillCallback)(IntPoint& e1bot, IntPoint& e1top, IntPoint& e2bot, IntPoint& e2top, IntPoint& pt); #endif enum InitOptions {ioReverseSolution = 1, ioStrictlySimple = 2, ioPreserveCollinear = 4}; enum JoinType {jtSquare, jtRound, jtMiter}; enum EndType {etClosedPolygon, etClosedLine, etOpenButt, etOpenSquare, etOpenRound}; class PolyNode; typedef std::vector< PolyNode* > PolyNodes; class PolyNode { public: PolyNode(); virtual ~PolyNode(){}; Path Contour; PolyNodes Childs; PolyNode* Parent; PolyNode* GetNext() const; bool IsHole() const; bool IsOpen() const; int ChildCount() const; private: //PolyNode& operator =(PolyNode& other); unsigned Index; //node index in Parent.Childs bool m_IsOpen; JoinType m_jointype; EndType m_endtype; PolyNode* GetNextSiblingUp() const; void AddChild(PolyNode& child); friend 
class Clipper; //to access Index friend class ClipperOffset; }; class PolyTree: public PolyNode { public: ~PolyTree(){ Clear(); }; PolyNode* GetFirst() const; void Clear(); int Total() const; private: //PolyTree& operator =(PolyTree& other); PolyNodes AllNodes; friend class Clipper; //to access AllNodes }; bool Orientation(const Path &poly); double Area(const Path &poly); int PointInPolygon(const IntPoint &pt, const Path &path); void SimplifyPolygon(const Path &in_poly, Paths &out_polys, PolyFillType fillType = pftEvenOdd); void SimplifyPolygons(const Paths &in_polys, Paths &out_polys, PolyFillType fillType = pftEvenOdd); void SimplifyPolygons(Paths &polys, PolyFillType fillType = pftEvenOdd); void CleanPolygon(const Path& in_poly, Path& out_poly, double distance = 1.415); void CleanPolygon(Path& poly, double distance = 1.415); void CleanPolygons(const Paths& in_polys, Paths& out_polys, double distance = 1.415); void CleanPolygons(Paths& polys, double distance = 1.415); void MinkowskiSum(const Path& pattern, const Path& path, Paths& solution, bool pathIsClosed); void MinkowskiSum(const Path& pattern, const Paths& paths, Paths& solution, bool pathIsClosed); void MinkowskiDiff(const Path& poly1, const Path& poly2, Paths& solution); void PolyTreeToPaths(const PolyTree& polytree, Paths& paths); void ClosedPathsFromPolyTree(const PolyTree& polytree, Paths& paths); void OpenPathsFromPolyTree(PolyTree& polytree, Paths& paths); void ReversePath(Path& p); void ReversePaths(Paths& p); struct IntRect { cInt left; cInt top; cInt right; cInt bottom; }; //enums that are used internally ... enum EdgeSide { esLeft = 1, esRight = 2}; //forward declarations (for stuff used internally) ... 
struct TEdge; struct IntersectNode; struct LocalMinimum; struct OutPt; struct OutRec; struct Join; typedef std::vector < OutRec* > PolyOutList; typedef std::vector < TEdge* > EdgeList; typedef std::vector < Join* > JoinList; typedef std::vector < IntersectNode* > IntersectList; //------------------------------------------------------------------------------ //ClipperBase is the ancestor to the Clipper class. It should not be //instantiated directly. This class simply abstracts the conversion of sets of //polygon coordinates into edge objects that are stored in a LocalMinima list. class ClipperBase { public: ClipperBase(); virtual ~ClipperBase(); virtual bool AddPath(const Path &pg, PolyType PolyTyp, bool Closed); bool AddPaths(const Paths &ppg, PolyType PolyTyp, bool Closed); virtual void Clear(); IntRect GetBounds(); bool PreserveCollinear() {return m_PreserveCollinear;}; void PreserveCollinear(bool value) {m_PreserveCollinear = value;}; protected: void DisposeLocalMinimaList(); TEdge* AddBoundsToLML(TEdge *e, bool IsClosed); virtual void Reset(); TEdge* ProcessBound(TEdge* E, bool IsClockwise); void InsertScanbeam(const cInt Y); bool PopScanbeam(cInt &Y); bool LocalMinimaPending(); bool PopLocalMinima(cInt Y, const LocalMinimum *&locMin); OutRec* CreateOutRec(); void DisposeAllOutRecs(); void DisposeOutRec(PolyOutList::size_type index); void SwapPositionsInAEL(TEdge *edge1, TEdge *edge2); void DeleteFromAEL(TEdge *e); void UpdateEdgeIntoAEL(TEdge *&e); typedef std::vector MinimaList; MinimaList::iterator m_CurrentLM; MinimaList m_MinimaList; bool m_UseFullRange; EdgeList m_edges; bool m_PreserveCollinear; bool m_HasOpenPaths; PolyOutList m_PolyOuts; TEdge *m_ActiveEdges; typedef std::priority_queue ScanbeamList; ScanbeamList m_Scanbeam; }; //------------------------------------------------------------------------------ class Clipper : public virtual ClipperBase { public: Clipper(int initOptions = 0); bool Execute(ClipType clipType, Paths &solution, PolyFillType 
fillType = pftEvenOdd); bool Execute(ClipType clipType, Paths &solution, PolyFillType subjFillType, PolyFillType clipFillType); bool Execute(ClipType clipType, PolyTree &polytree, PolyFillType fillType = pftEvenOdd); bool Execute(ClipType clipType, PolyTree &polytree, PolyFillType subjFillType, PolyFillType clipFillType); bool ReverseSolution() { return m_ReverseOutput; }; void ReverseSolution(bool value) {m_ReverseOutput = value;}; bool StrictlySimple() {return m_StrictSimple;}; void StrictlySimple(bool value) {m_StrictSimple = value;}; //set the callback function for z value filling on intersections (otherwise Z is 0) #ifdef use_xyz void ZFillFunction(ZFillCallback zFillFunc); #endif protected: virtual bool ExecuteInternal(); private: JoinList m_Joins; JoinList m_GhostJoins; IntersectList m_IntersectList; ClipType m_ClipType; typedef std::list MaximaList; MaximaList m_Maxima; TEdge *m_SortedEdges; bool m_ExecuteLocked; PolyFillType m_ClipFillType; PolyFillType m_SubjFillType; bool m_ReverseOutput; bool m_UsingPolyTree; bool m_StrictSimple; #ifdef use_xyz ZFillCallback m_ZFill; //custom callback #endif void SetWindingCount(TEdge& edge); bool IsEvenOddFillType(const TEdge& edge) const; bool IsEvenOddAltFillType(const TEdge& edge) const; void InsertLocalMinimaIntoAEL(const cInt botY); void InsertEdgeIntoAEL(TEdge *edge, TEdge* startEdge); void AddEdgeToSEL(TEdge *edge); bool PopEdgeFromSEL(TEdge *&edge); void CopyAELToSEL(); void DeleteFromSEL(TEdge *e); void SwapPositionsInSEL(TEdge *edge1, TEdge *edge2); bool IsContributing(const TEdge& edge) const; bool IsTopHorz(const cInt XPos); void DoMaxima(TEdge *e); void ProcessHorizontals(); void ProcessHorizontal(TEdge *horzEdge); void AddLocalMaxPoly(TEdge *e1, TEdge *e2, const IntPoint &pt); OutPt* AddLocalMinPoly(TEdge *e1, TEdge *e2, const IntPoint &pt); OutRec* GetOutRec(int idx); void AppendPolygon(TEdge *e1, TEdge *e2); void IntersectEdges(TEdge *e1, TEdge *e2, IntPoint &pt); OutPt* AddOutPt(TEdge *e, const 
IntPoint &pt); OutPt* GetLastOutPt(TEdge *e); bool ProcessIntersections(const cInt topY); void BuildIntersectList(const cInt topY); void ProcessIntersectList(); void ProcessEdgesAtTopOfScanbeam(const cInt topY); void BuildResult(Paths& polys); void BuildResult2(PolyTree& polytree); void SetHoleState(TEdge *e, OutRec *outrec); void DisposeIntersectNodes(); bool FixupIntersectionOrder(); void FixupOutPolygon(OutRec &outrec); void FixupOutPolyline(OutRec &outrec); bool IsHole(TEdge *e); bool FindOwnerFromSplitRecs(OutRec &outRec, OutRec *&currOrfl); void FixHoleLinkage(OutRec &outrec); void AddJoin(OutPt *op1, OutPt *op2, const IntPoint offPt); void ClearJoins(); void ClearGhostJoins(); void AddGhostJoin(OutPt *op, const IntPoint offPt); bool JoinPoints(Join *j, OutRec* outRec1, OutRec* outRec2); void JoinCommonEdges(); void DoSimplePolygons(); void FixupFirstLefts1(OutRec* OldOutRec, OutRec* NewOutRec); void FixupFirstLefts2(OutRec* InnerOutRec, OutRec* OuterOutRec); void FixupFirstLefts3(OutRec* OldOutRec, OutRec* NewOutRec); #ifdef use_xyz void SetZ(IntPoint& pt, TEdge& e1, TEdge& e2); #endif }; //------------------------------------------------------------------------------ class ClipperOffset { public: ClipperOffset(double miterLimit = 2.0, double roundPrecision = 0.25); ~ClipperOffset(); void AddPath(const Path& path, JoinType joinType, EndType endType); void AddPaths(const Paths& paths, JoinType joinType, EndType endType); void Execute(Paths& solution, double delta); void Execute(PolyTree& solution, double delta); void Clear(); double MiterLimit; double ArcTolerance; private: Paths m_destPolys; Path m_srcPoly; Path m_destPoly; std::vector m_normals; double m_delta, m_sinA, m_sin, m_cos; double m_miterLim, m_StepsPerRad; IntPoint m_lowest; PolyNode m_polyNodes; void FixOrientations(); void DoOffset(double delta); void OffsetPoint(int j, int& k, JoinType jointype); void DoSquare(int j, int k); void DoMiter(int j, int k, double r); void DoRound(int j, int k); }; 
//------------------------------------------------------------------------------ class clipperException : public std::exception { public: clipperException(const char* description): m_descr(description) {} virtual ~clipperException() throw() {} virtual const char* what() const throw() {return m_descr.c_str();} private: std::string m_descr; }; //------------------------------------------------------------------------------ } //ClipperLib namespace #endif //clipper_hpp ================================================ FILE: dbnet/common.hpp ================================================ #ifndef DBNET_COMMON_H_ #define DBNET_COMMON_H_ #include #include #include #include #include #include #include "dirent.h" #include "NvInfer.h" #include #define CHECK(status) \ do\ {\ auto ret = (status);\ if (ret != 0)\ {\ std::cerr << "Cuda failure: " << ret << std::endl;\ abort();\ }\ } while (0) using namespace nvinfer1; // TensorRT weight files have a simple space delimited format: // [type] [size] std::map loadWeights(const std::string file) { std::cout << "Loading weights: " << file << std::endl; std::map weightMap; // Open weights file std::ifstream input(file); assert(input.is_open() && "Unable to load weight file."); // Read number of weight blobs int32_t count; input >> count; assert(count > 0 && "Invalid weight map file."); while (count--) { Weights wt{ DataType::kFLOAT, nullptr, 0 }; uint32_t size; // Read name and type of blob std::string name; input >> name >> std::dec >> size; wt.type = DataType::kFLOAT; // Load blob uint32_t* val = reinterpret_cast(malloc(sizeof(val) * size)); for (uint32_t x = 0, y = size; x < y; ++x) { input >> std::hex >> val[x]; } wt.values = val; wt.count = size; weightMap[name] = wt; } return weightMap; } IScaleLayer* addBatchNorm2d(INetworkDefinition *network, std::map& weightMap, ITensor& input, std::string lname, float eps) { float *gamma = (float*)weightMap[lname + ".weight"].values; float *beta = (float*)weightMap[lname + ".bias"].values; 
float *mean = (float*)weightMap[lname + ".running_mean"].values; float *var = (float*)weightMap[lname + ".running_var"].values; int len = weightMap[lname + ".running_var"].count; float *scval = reinterpret_cast(malloc(sizeof(float) * len)); for (int i = 0; i < len; i++) { scval[i] = gamma[i] / sqrt(var[i] + eps); } Weights scale{ DataType::kFLOAT, scval, len }; float *shval = reinterpret_cast(malloc(sizeof(float) * len)); for (int i = 0; i < len; i++) { shval[i] = beta[i] - mean[i] * gamma[i] / sqrt(var[i] + eps); } Weights shift{ DataType::kFLOAT, shval, len }; float *pval = reinterpret_cast(malloc(sizeof(float) * len)); for (int i = 0; i < len; i++) { pval[i] = 1.0; } Weights power{ DataType::kFLOAT, pval, len }; weightMap[lname + ".scale"] = scale; weightMap[lname + ".shift"] = shift; weightMap[lname + ".power"] = power; IScaleLayer* scale_1 = network->addScale(input, ScaleMode::kCHANNEL, shift, scale, power); assert(scale_1); return scale_1; } ILayer* convBnLeaky(INetworkDefinition *network, std::map& weightMap, ITensor& input, int outch, int ksize, int s, int g, std::string lname, std::string bnname, bool bias = true) { Weights emptywts{ DataType::kFLOAT, nullptr, 0 }; int p = ksize / 2; IConvolutionLayer* conv1 = nullptr; if (bias) { conv1 = network->addConvolutionNd(input, outch, DimsHW{ ksize, ksize }, weightMap[lname + ".weight"], weightMap[lname + ".bias"]); } else { conv1 = network->addConvolutionNd(input, outch, DimsHW{ ksize, ksize }, weightMap[lname + ".weight"], emptywts); } assert(conv1); conv1->setStrideNd(DimsHW{ s, s }); conv1->setPaddingNd(DimsHW{ p, p }); conv1->setNbGroups(g); //IScaleLayer* bn1 = addBatchNorm2d(network, weightMap, *conv1->getOutput(0), lname + ".bn", 1e-4); IScaleLayer* bn1 = addBatchNorm2d(network, weightMap, *conv1->getOutput(0), lname.substr(0, lname.find_last_of(".")) + bnname, 1e-5); auto lr = network->addActivation(*bn1->getOutput(0), ActivationType::kRELU); lr->setAlpha(0.1); return lr; } IActivationLayer* 
basicBlock(INetworkDefinition *network, std::map& weightMap, ITensor& input, int inch, int outch, int stride, std::string lname) { Weights emptywts{ DataType::kFLOAT, nullptr, 0 }; IConvolutionLayer* conv1 = network->addConvolutionNd(input, outch, DimsHW{ 3, 3 }, weightMap[lname + "conv1.weight"], emptywts); assert(conv1); conv1->setStrideNd(DimsHW{ stride, stride }); conv1->setPaddingNd(DimsHW{ 1, 1 }); IScaleLayer* bn1 = addBatchNorm2d(network, weightMap, *conv1->getOutput(0), lname + "bn1", 1e-5); IActivationLayer* relu1 = network->addActivation(*bn1->getOutput(0), ActivationType::kRELU); assert(relu1); IConvolutionLayer* conv2 = network->addConvolutionNd(*relu1->getOutput(0), outch, DimsHW{ 3, 3 }, weightMap[lname + "conv2.weight"], emptywts); assert(conv2); conv2->setPaddingNd(DimsHW{ 1, 1 }); IScaleLayer* bn2 = addBatchNorm2d(network, weightMap, *conv2->getOutput(0), lname + "bn2", 1e-5); IElementWiseLayer* ew1; if (inch != outch) { IConvolutionLayer* conv3 = network->addConvolutionNd(input, outch, DimsHW{ 1, 1 }, weightMap[lname + "downsample.0.weight"], emptywts); assert(conv3); conv3->setStrideNd(DimsHW{ stride, stride }); IScaleLayer* bn3 = addBatchNorm2d(network, weightMap, *conv3->getOutput(0), lname + "downsample.1", 1e-5); ew1 = network->addElementWise(*bn3->getOutput(0), *bn2->getOutput(0), ElementWiseOperation::kSUM); } else { ew1 = network->addElementWise(input, *bn2->getOutput(0), ElementWiseOperation::kSUM); } IActivationLayer* relu2 = network->addActivation(*ew1->getOutput(0), ActivationType::kRELU); assert(relu2); return relu2; } int read_files_in_dir(const char *p_dir_name, std::vector &file_names) { DIR *p_dir = opendir(p_dir_name); if (p_dir == nullptr) { return -1; } struct dirent* p_file = nullptr; while ((p_file = readdir(p_dir)) != nullptr) { if (strcmp(p_file->d_name, ".") != 0 && strcmp(p_file->d_name, "..") != 0) { //std::string cur_file_name(p_dir_name); //cur_file_name += "/"; //cur_file_name += p_file->d_name; std::string 
cur_file_name(p_file->d_name); file_names.push_back(cur_file_name); } } closedir(p_dir); return 0; } #endif ================================================ FILE: dbnet/dbnet.cpp ================================================ #include #include #include "cuda_runtime_api.h" #include "logging.h" #include "common.hpp" #include #include "clipper.hpp" #define USE_FP16 // comment out this if want to use FP32 #define DEVICE 0 // GPU id #define EXPANDRATIO 1.5 #define BOX_MINI_SIZE 5 #define SCORE_THRESHOLD 0.3 #define BOX_THRESHOLD 0.7 static const int SHORT_INPUT = 640; static const int MAX_INPUT_SIZE = 1440; // 32x static const int MIN_INPUT_SIZE = 608; static const int OPT_INPUT_W = 1152; static const int OPT_INPUT_H = 640; const char* INPUT_BLOB_NAME = "data"; const char* OUTPUT_BLOB_NAME = "out"; static Logger gLogger; cv::RotatedRect expandBox(cv::Point2f temp[], float ratio) { ClipperLib::Path path = { {ClipperLib::cInt(temp[0].x), ClipperLib::cInt(temp[0].y)}, {ClipperLib::cInt(temp[1].x), ClipperLib::cInt(temp[1].y)}, {ClipperLib::cInt(temp[2].x), ClipperLib::cInt(temp[2].y)}, {ClipperLib::cInt(temp[3].x), ClipperLib::cInt(temp[3].y)}}; double area = ClipperLib::Area(path); double distance; double length = 0.0; for (int i = 0; i < 4; i++) { length = length + sqrtf(powf((temp[i].x - temp[(i + 1) % 4].x), 2) + powf((temp[i].y - temp[(i + 1) % 4].y), 2)); } distance = area * ratio / length; ClipperLib::ClipperOffset offset; offset.AddPath(path, ClipperLib::JoinType::jtRound, ClipperLib::EndType::etClosedPolygon); ClipperLib::Paths paths; offset.Execute(paths, distance); std::vector contour; for (int i = 0; i < paths[0].size(); i++) { contour.emplace_back(paths[0][i].X, paths[0][i].Y); } offset.Clear(); return cv::minAreaRect(contour); } float paddimg(cv::Mat& In_Out_img, int shortsize = 960) { int w = In_Out_img.cols; int h = In_Out_img.rows; float scale = 1.f; if (w < h) { scale = (float)shortsize / w; h = scale * h; w = shortsize; } else { scale = 
(float)shortsize / h; w = scale * w; h = shortsize; } if (h % 32 != 0) { h = (h / 32 + 1) * 32; } if (w % 32 != 0) { w = (w / 32 + 1) * 32; } cv::resize(In_Out_img, In_Out_img, cv::Size(w, h)); return scale; } // Creat the engine using only the API and not any parser. ICudaEngine* createEngine(unsigned int maxBatchSize, IBuilder* builder, IBuilderConfig* config, DataType dt) { const auto explicitBatch = 1U << static_cast(NetworkDefinitionCreationFlag::kEXPLICIT_BATCH); INetworkDefinition* network = builder->createNetworkV2(explicitBatch); // Create input tensor of shape {3, INPUT_H, INPUT_W} with name INPUT_BLOB_NAME ITensor* data = network->addInput(INPUT_BLOB_NAME, dt, Dims4{ 1, 3, -1, -1 }); assert(data); std::map weightMap = loadWeights("./DBNet.wts"); Weights emptywts{ DataType::kFLOAT, nullptr, 0 }; /* ------ Resnet18 backbone------ */ // Add convolution layer with 6 outputs and a 5x5 filter. IConvolutionLayer* conv1 = network->addConvolutionNd(*data, 64, DimsHW{ 7, 7 }, weightMap["backbone.conv1.weight"], emptywts); assert(conv1); conv1->setStrideNd(DimsHW{ 2, 2 }); conv1->setPaddingNd(DimsHW{ 3, 3 }); IScaleLayer* bn1 = addBatchNorm2d(network, weightMap, *conv1->getOutput(0), "backbone.bn1", 1e-5); IActivationLayer* relu1 = network->addActivation(*bn1->getOutput(0), ActivationType::kRELU); assert(relu1); IPoolingLayer* pool1 = network->addPoolingNd(*relu1->getOutput(0), PoolingType::kMAX, DimsHW{ 3, 3 }); assert(pool1); pool1->setStrideNd(DimsHW{ 2, 2 }); pool1->setPaddingNd(DimsHW{ 1, 1 }); IActivationLayer* relu2 = basicBlock(network, weightMap, *pool1->getOutput(0), 64, 64, 1, "backbone.layer1.0."); IActivationLayer* relu3 = basicBlock(network, weightMap, *relu2->getOutput(0), 64, 64, 1, "backbone.layer1.1."); // x2 IActivationLayer* relu4 = basicBlock(network, weightMap, *relu3->getOutput(0), 64, 128, 2, "backbone.layer2.0."); IActivationLayer* relu5 = basicBlock(network, weightMap, *relu4->getOutput(0), 128, 128, 1, "backbone.layer2.1."); // x3 
IActivationLayer* relu6 = basicBlock(network, weightMap, *relu5->getOutput(0), 128, 256, 2, "backbone.layer3.0."); IActivationLayer* relu7 = basicBlock(network, weightMap, *relu6->getOutput(0), 256, 256, 1, "backbone.layer3.1."); //x4 IActivationLayer* relu8 = basicBlock(network, weightMap, *relu7->getOutput(0), 256, 512, 2, "backbone.layer4.0."); IActivationLayer* relu9 = basicBlock(network, weightMap, *relu8->getOutput(0), 512, 512, 1, "backbone.layer4.1."); //x5 /* ------- FPN neck ------- */ ILayer* p5 = convBnLeaky(network, weightMap, *relu9->getOutput(0), 64, 1, 1, 1, "neck.reduce_conv_c5.conv", ".bn"); // k=1 s = 1 p = k/2=1/2=0 ILayer* c4_1 = convBnLeaky(network, weightMap, *relu7->getOutput(0), 64, 1, 1, 1, "neck.reduce_conv_c4.conv", ".bn"); float *deval = reinterpret_cast(malloc(sizeof(float) * 64 * 2 * 2)); for (int i = 0; i < 64 * 2 * 2; i++) { deval[i] = 1.0; } Weights deconvwts1{ DataType::kFLOAT, deval, 64 * 2 * 2 }; IDeconvolutionLayer* p4_1 = network->addDeconvolutionNd(*p5->getOutput(0), 64, DimsHW{ 2, 2 }, deconvwts1, emptywts); p4_1->setStrideNd(DimsHW{ 2, 2 }); p4_1->setNbGroups(64); weightMap["deconv1"] = deconvwts1; IElementWiseLayer* p4_add = network->addElementWise(*p4_1->getOutput(0), *c4_1->getOutput(0), ElementWiseOperation::kSUM); ILayer* p4 = convBnLeaky(network, weightMap, *p4_add->getOutput(0), 64, 3, 1, 1, "neck.smooth_p4.conv", ".bn"); // smooth ILayer* c3_1 = convBnLeaky(network, weightMap, *relu5->getOutput(0), 64, 1, 1, 1, "neck.reduce_conv_c3.conv", ".bn"); Weights deconvwts2{ DataType::kFLOAT, deval, 64 * 2 * 2 }; IDeconvolutionLayer* p3_1 = network->addDeconvolutionNd(*p4->getOutput(0), 64, DimsHW{ 2, 2 }, deconvwts2, emptywts); p3_1->setStrideNd(DimsHW{ 2, 2 }); p3_1->setNbGroups(64); IElementWiseLayer* p3_add = network->addElementWise(*p3_1->getOutput(0), *c3_1->getOutput(0), ElementWiseOperation::kSUM); ILayer* p3 = convBnLeaky(network, weightMap, *p3_add->getOutput(0), 64, 3, 1, 1, "neck.smooth_p3.conv", ".bn"); // 
smooth ILayer* c2_1 = convBnLeaky(network, weightMap, *relu3->getOutput(0), 64, 1, 1, 1, "neck.reduce_conv_c2.conv", ".bn"); Weights deconvwts3{ DataType::kFLOAT, deval, 64 * 2 * 2 }; IDeconvolutionLayer* p2_1 = network->addDeconvolutionNd(*p3->getOutput(0), 64, DimsHW{ 2, 2 }, deconvwts3, emptywts); p2_1->setStrideNd(DimsHW{ 2, 2 }); p2_1->setNbGroups(64); IElementWiseLayer* p2_add = network->addElementWise(*p2_1->getOutput(0), *c2_1->getOutput(0), ElementWiseOperation::kSUM); ILayer* p2 = convBnLeaky(network, weightMap, *p2_add->getOutput(0), 64, 3, 1, 1, "neck.smooth_p2.conv", ".bn"); // smooth Weights deconvwts4{ DataType::kFLOAT, deval, 64 * 2 * 2 }; IDeconvolutionLayer* p3_up_p2 = network->addDeconvolutionNd(*p3->getOutput(0), 64, DimsHW{ 2, 2 }, deconvwts4, emptywts); p3_up_p2->setStrideNd(DimsHW{ 2, 2 }); p3_up_p2->setNbGroups(64); float *deval2 = reinterpret_cast(malloc(sizeof(float) * 64 * 8 * 8)); for (int i = 0; i < 64 * 8 * 8; i++) { deval2[i] = 1.0; } Weights deconvwts5{ DataType::kFLOAT, deval2, 64 * 8 * 8 }; IDeconvolutionLayer* p4_up_p2 = network->addDeconvolutionNd(*p4->getOutput(0), 64, DimsHW{ 8, 8 }, deconvwts5, emptywts); p4_up_p2->setPaddingNd(DimsHW{ 2, 2 }); p4_up_p2->setStrideNd(DimsHW{ 4, 4 }); p4_up_p2->setNbGroups(64); weightMap["deconv2"] = deconvwts5; Weights deconvwts6{ DataType::kFLOAT, deval2, 64 * 8 * 8 }; IDeconvolutionLayer* p5_up_p2 = network->addDeconvolutionNd(*p5->getOutput(0), 64, DimsHW{ 8, 8 }, deconvwts6, emptywts); p5_up_p2->setStrideNd(DimsHW{ 8, 8 }); p5_up_p2->setNbGroups(64); // torch.cat([p2, p3, p4, p5], dim=1) ITensor* inputTensors[] = { p2->getOutput(0), p3_up_p2->getOutput(0), p4_up_p2->getOutput(0), p5_up_p2->getOutput(0) }; IConcatenationLayer* neck_cat = network->addConcatenation(inputTensors, 4); ILayer* neck_out = convBnLeaky(network, weightMap, *neck_cat->getOutput(0), 256, 3, 1, 1, "neck.conv.0", ".1"); // smooth assert(neck_out); ILayer* binarize1 = convBnLeaky(network, weightMap, 
*neck_out->getOutput(0), 64, 3, 1, 1, "head.binarize.0", ".1"); // Weights deconvwts7{ DataType::kFLOAT, deval, 64 * 2 * 2 }; IDeconvolutionLayer* binarizeup = network->addDeconvolutionNd(*binarize1->getOutput(0), 64, DimsHW{ 2, 2 }, deconvwts7, emptywts); binarizeup->setStrideNd(DimsHW{ 2, 2 }); binarizeup->setNbGroups(64); IScaleLayer* binarizebn1 = addBatchNorm2d(network, weightMap, *binarizeup->getOutput(0), "head.binarize.4", 1e-5); IActivationLayer* binarizerelu1 = network->addActivation(*binarizebn1->getOutput(0), ActivationType::kRELU); assert(binarizerelu1); Weights deconvwts8{ DataType::kFLOAT, deval, 64 * 2 * 2 }; IDeconvolutionLayer* binarizeup2 = network->addDeconvolutionNd(*binarizerelu1->getOutput(0), 64, DimsHW{ 2, 2 }, deconvwts8, emptywts); binarizeup2->setStrideNd(DimsHW{ 2, 2 }); binarizeup2->setNbGroups(64); IConvolutionLayer* binarize3 = network->addConvolutionNd(*binarizeup2->getOutput(0), 1, DimsHW{ 3, 3 }, weightMap["head.binarize.7.weight"], weightMap["head.binarize.7.bias"]); assert(binarize3); binarize3->setStrideNd(DimsHW{ 1, 1 }); binarize3->setPaddingNd(DimsHW{ 1, 1 }); IActivationLayer* binarize4 = network->addActivation(*binarize3->getOutput(0), ActivationType::kSIGMOID); assert(binarize4); //threshold_maps = self.thresh(x) ILayer* thresh1 = convBnLeaky(network, weightMap, *neck_out->getOutput(0), 64, 3, 1, 1, "head.thresh.0", ".1", false); // Weights deconvwts9{ DataType::kFLOAT, deval, 64 * 2 * 2 }; IDeconvolutionLayer* threshup = network->addDeconvolutionNd(*thresh1->getOutput(0), 64, DimsHW{ 2, 2 }, deconvwts9, emptywts); threshup->setStrideNd(DimsHW{ 2, 2 }); threshup->setNbGroups(64); IConvolutionLayer* thresh2 = network->addConvolutionNd(*threshup->getOutput(0), 64, DimsHW{ 3, 3 }, weightMap["head.thresh.3.1.weight"], weightMap["head.thresh.3.1.bias"]); assert(thresh2); thresh2->setStrideNd(DimsHW{ 1, 1 }); thresh2->setPaddingNd(DimsHW{ 1, 1 }); IScaleLayer* threshbn1 = addBatchNorm2d(network, weightMap, 
*thresh2->getOutput(0), "head.thresh.4", 1e-5); IActivationLayer* threshrelu1 = network->addActivation(*threshbn1->getOutput(0), ActivationType::kRELU); assert(threshrelu1); Weights deconvwts10{ DataType::kFLOAT, deval, 64 * 2 * 2 }; IDeconvolutionLayer* threshup2 = network->addDeconvolutionNd(*threshrelu1->getOutput(0), 64, DimsHW{ 2, 2 }, deconvwts10, emptywts); threshup2->setStrideNd(DimsHW{ 2, 2 }); threshup2->setNbGroups(64); IConvolutionLayer* thresh3 = network->addConvolutionNd(*threshup2->getOutput(0), 1, DimsHW{ 3, 3 }, weightMap["head.thresh.6.1.weight"], weightMap["head.thresh.6.1.bias"]); assert(thresh3); thresh3->setStrideNd(DimsHW{ 1, 1 }); thresh3->setPaddingNd(DimsHW{ 1, 1 }); IActivationLayer* thresh4 = network->addActivation(*thresh3->getOutput(0), ActivationType::kSIGMOID); assert(thresh4); ITensor* inputTensors2[] = { binarize4->getOutput(0), thresh4->getOutput(0) }; IConcatenationLayer* head_out = network->addConcatenation(inputTensors2, 2); // y = F.interpolate(y, size=(H, W)) head_out->getOutput(0)->setName(OUTPUT_BLOB_NAME); network->markOutput(*head_out->getOutput(0)); IOptimizationProfile* profile = builder->createOptimizationProfile(); profile->setDimensions(INPUT_BLOB_NAME, OptProfileSelector::kMIN, Dims4(1, 3, MIN_INPUT_SIZE, MIN_INPUT_SIZE)); profile->setDimensions(INPUT_BLOB_NAME, OptProfileSelector::kOPT, Dims4(1, 3, OPT_INPUT_H, OPT_INPUT_W)); profile->setDimensions(INPUT_BLOB_NAME, OptProfileSelector::kMAX, Dims4(1, 3, MAX_INPUT_SIZE, MAX_INPUT_SIZE)); config->addOptimizationProfile(profile); // Build engine builder->setMaxBatchSize(maxBatchSize); config->setMaxWorkspaceSize(16 * (1 << 20)); // 16MB #ifdef USE_FP16 config->setFlag(BuilderFlag::kFP16); #endif std::cout << "Building engine, please wait for a while..." << std::endl; ICudaEngine* engine = builder->buildEngineWithConfig(*network, *config); std::cout << "Build engine successfully!" 
<< std::endl; // Don't need the network any more network->destroy(); // Release host memory for (auto& mem : weightMap) { free((void*)(mem.second.values)); } return engine; } void APIToModel(unsigned int maxBatchSize, IHostMemory** modelStream) { // Create builder IBuilder* builder = createInferBuilder(gLogger); IBuilderConfig* config = builder->createBuilderConfig(); // Create model to populate the network, then set the outputs and create an engine ICudaEngine* engine = createEngine(maxBatchSize, builder, config, DataType::kFLOAT); //ICudaEngine* engine = createEngine(maxBatchSize, builder, config, DataType::kFLOAT); assert(engine != nullptr); // Serialize the engine (*modelStream) = engine->serialize(); // Close everything down engine->destroy(); builder->destroy(); } void doInference(IExecutionContext& context, float* input, float* output, int h_scale, int w_scale) { const ICudaEngine& engine = context.getEngine(); // Pointers to input and output device buffers to pass to engine. // Engine requires exactly IEngine::getNbBindings() number of buffers. assert(engine.getNbBindings() == 2); void* buffers[2]; // In order to bind the buffers, we need to know the names of the input and output tensors. 
// Note that indices are guaranteed to be less than IEngine::getNbBindings() const int inputIndex = engine.getBindingIndex(INPUT_BLOB_NAME); const int outputIndex = engine.getBindingIndex(OUTPUT_BLOB_NAME); context.setBindingDimensions(inputIndex, Dims4(1, 3, h_scale, w_scale)); // Create GPU buffers on device CHECK(cudaMalloc(&buffers[inputIndex], 3 * h_scale * w_scale * sizeof(float))); CHECK(cudaMalloc(&buffers[outputIndex], 2 * h_scale * w_scale * sizeof(float))); // Create stream cudaStream_t stream; CHECK(cudaStreamCreate(&stream)); // DMA input batch data to device, infer on the batch asynchronously, and DMA output back to host CHECK(cudaMemcpyAsync(buffers[inputIndex], input, 3 * h_scale * w_scale * sizeof(float), cudaMemcpyHostToDevice, stream)); context.enqueueV2(buffers, stream, nullptr); CHECK(cudaMemcpyAsync(output, buffers[outputIndex], h_scale * w_scale * 2 * sizeof(float), cudaMemcpyDeviceToHost, stream)); cudaStreamSynchronize(stream); // Release stream and buffers cudaStreamDestroy(stream); CHECK(cudaFree(buffers[inputIndex])); CHECK(cudaFree(buffers[outputIndex])); } bool get_mini_boxes(cv::RotatedRect& rotated_rect, cv::Point2f rect[], int min_size) { cv::Point2f temp_rect[4]; rotated_rect.points(temp_rect); for (int i = 0; i < 4; i++) { for (int j = i + 1; j < 4; j++) { if (temp_rect[i].x > temp_rect[j].x) { cv::Point2f temp; temp = temp_rect[i]; temp_rect[i] = temp_rect[j]; temp_rect[j] = temp; } } } int index0 = 0; int index1 = 1; int index2 = 2; int index3 = 3; if (temp_rect[1].y > temp_rect[0].y) { index0 = 0; index3 = 1; } else { index0 = 1; index3 = 0; } if (temp_rect[3].y > temp_rect[2].y) { index1 = 2; index2 = 3; } else { index1 = 3; index2 = 2; } rect[0] = temp_rect[index0]; // Left top coordinate rect[1] = temp_rect[index1]; // Left bottom coordinate rect[2] = temp_rect[index2]; // Right bottom coordinate rect[3] = temp_rect[index3]; // Right top coordinate if (rotated_rect.size.width < min_size || rotated_rect.size.height < 
min_size) { return false; } else { return true; } } float get_box_score(float* map, cv::Point2f rect[], int width, int height, float threshold) { int xmin = width - 1; int ymin = height - 1; int xmax = 0; int ymax = 0; for (int j = 0; j < 4; j++) { if (rect[j].x < xmin) { xmin = rect[j].x; } if (rect[j].y < ymin) { ymin = rect[j].y; } if (rect[j].x > xmax) { xmax = rect[j].x; } if (rect[j].y > ymax) { ymax = rect[j].y; } } float sum = 0; int num = 0; for (int i = ymin; i <= ymax; i++) { for (int j = xmin; j <= xmax; j++) { if (map[i * width + j] > threshold) { sum = sum + map[i * width + j]; num++; } } } return sum / num; } int main(int argc, char** argv) { cudaSetDevice(DEVICE); // create a model using the API directly and serialize it to a stream char *trtModelStream{ nullptr }; size_t size{ 0 }; if (argc == 2 && std::string(argv[1]) == "-s") { IHostMemory* modelStream{ nullptr }; APIToModel(1, &modelStream); assert(modelStream != nullptr); std::ofstream p("DBNet.engine", std::ios::binary); if (!p) { std::cerr << "could not open plan output file" << std::endl; return -1; } p.write(reinterpret_cast(modelStream->data()), modelStream->size()); modelStream->destroy(); return 0; } else if (argc == 3 && std::string(argv[1]) == "-d") { std::ifstream file("DBNet.engine", std::ios::binary); if (file.good()) { file.seekg(0, file.end); size = file.tellg(); file.seekg(0, file.beg); trtModelStream = new char[size]; assert(trtModelStream); file.read(trtModelStream, size); file.close(); } } else { std::cerr << "arguments not right!" 
<< std::endl; std::cerr << "./debnet -s // serialize model to plan file" << std::endl; std::cerr << "./debnet -d ../samples // deserialize plan file and run inference" << std::endl; return -1; } // prepare input data --------------------------- IRuntime* runtime = createInferRuntime(gLogger); assert(runtime != nullptr); ICudaEngine* engine = runtime->deserializeCudaEngine(trtModelStream, size); assert(engine != nullptr); IExecutionContext* context = engine->createExecutionContext(); assert(context != nullptr); delete[] trtModelStream; std::vector file_names; if (read_files_in_dir(argv[2], file_names) < 0) { std::cout << "read_files_in_dir failed." << std::endl; return -1; } // icdar2015.yaml Hyperparameter std::vector mean_value{ 0.406, 0.456, 0.485 }; // BGR std::vector std_value{ 0.225, 0.224, 0.229 }; int fcount = 0; for (auto f : file_names) { fcount++; std::cout << fcount << " " << f << std::endl; cv::Mat pr_img = cv::imread(std::string(argv[2]) + "/" + f); cv::Mat src_img = pr_img.clone(); if (pr_img.empty()) continue; float scale = paddimg(pr_img, SHORT_INPUT); // resize the image std::cout << "letterbox shape: " << pr_img.cols << ", " << pr_img.rows << std::endl; if (pr_img.cols < MIN_INPUT_SIZE || pr_img.rows < MIN_INPUT_SIZE) continue; float* data = new float[3 * pr_img.rows * pr_img.cols]; auto start = std::chrono::system_clock::now(); int i = 0; for (int row = 0; row < pr_img.rows; ++row) { uchar* uc_pixel = pr_img.data + row * pr_img.step; for (int col = 0; col < pr_img.cols; ++col) { data[i] = (uc_pixel[2] / 255.0 - mean_value[2]) / std_value[2]; data[i + pr_img.rows * pr_img.cols] = (uc_pixel[1] / 255.0 - mean_value[1]) / std_value[1]; data[i + 2 * pr_img.rows * pr_img.cols] = (uc_pixel[0] / 255.0 - mean_value[0]) / std_value[0]; uc_pixel += 3; ++i; } } auto end = std::chrono::system_clock::now(); std::cout << "pre time:"<< std::chrono::duration_cast(end - start).count() << "ms" << std::endl; float* prob = new float[pr_img.rows *pr_img.cols * 2]; // 
Run inference start = std::chrono::system_clock::now(); doInference(*context, data, prob, pr_img.rows, pr_img.cols); end = std::chrono::system_clock::now(); std::cout << "detect time:"<< std::chrono::duration_cast(end - start).count() << "ms" << std::endl; // prob shape is 2*640*640, get the first one cv::Mat map = cv::Mat::zeros(cv::Size(pr_img.cols, pr_img.rows), CV_8UC1); for (int h = 0; h < pr_img.rows; ++h) { uchar *ptr = map.ptr(h); for (int w = 0; w < pr_img.cols; ++w) { ptr[w] = (prob[h * pr_img.cols + w] > 0.3) ? 255 : 0; } } // Extracting minimum circumscribed rectangle std::vector> contours; std::vector hierarcy; cv::findContours(map, contours, hierarcy, CV_RETR_LIST, CV_CHAIN_APPROX_SIMPLE); std::vector boundRect(contours.size()); std::vector box(contours.size()); cv::Point2f rect[4]; cv::Point2f order_rect[4]; for (int i = 0; i < contours.size(); i++) { cv::RotatedRect rotated_rect = cv::minAreaRect(cv::Mat(contours[i])); if (!get_mini_boxes(rotated_rect, rect, BOX_MINI_SIZE)) { std::cout << "box too small" << std::endl; continue; } // drop low score boxes float score = get_box_score(prob, rect, pr_img.cols, pr_img.rows, SCORE_THRESHOLD); if (score < BOX_THRESHOLD) { std::cout << "score too low = " << score << ", threshold = " << BOX_THRESHOLD << std::endl; continue; } // Scaling the predict boxes depend on EXPANDRATIO cv::RotatedRect expandbox = expandBox(rect, EXPANDRATIO); expandbox.points(rect); if (!get_mini_boxes(expandbox, rect, BOX_MINI_SIZE + 2)) { continue; } // Restore the coordinates to the original image for (int k = 0; k < 4; k++) { order_rect[k] = rect[k]; order_rect[k].x = int(order_rect[k].x / pr_img.cols * src_img.cols); order_rect[k].y = int(order_rect[k].y / pr_img.rows * src_img.rows); } cv::rectangle(src_img, cv::Point(order_rect[0].x,order_rect[0].y), cv::Point(order_rect[2].x,order_rect[2].y), cv::Scalar(0, 0, 255), 2, 8); //std::cout << "After LT = " << order_rect[0] << ", After RD = " << order_rect[2] << std::endl; } 
cv::imwrite("_" + f, src_img); std::cout << "write image done." << std::endl; //cv::waitKey(0); delete prob; delete data; } return 0; } ================================================ FILE: dbnet/logging.h ================================================ /* * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ #ifndef TENSORRT_LOGGING_H #define TENSORRT_LOGGING_H #include "NvInferRuntimeCommon.h" #include #include #include #include #include #include #include using Severity = nvinfer1::ILogger::Severity; class LogStreamConsumerBuffer : public std::stringbuf { public: LogStreamConsumerBuffer(std::ostream& stream, const std::string& prefix, bool shouldLog) : mOutput(stream) , mPrefix(prefix) , mShouldLog(shouldLog) { } LogStreamConsumerBuffer(LogStreamConsumerBuffer&& other) : mOutput(other.mOutput) { } ~LogStreamConsumerBuffer() { // std::streambuf::pbase() gives a pointer to the beginning of the buffered part of the output sequence // std::streambuf::pptr() gives a pointer to the current position of the output sequence // if the pointer to the beginning is not equal to the pointer to the current position, // call putOutput() to log the output to the stream if (pbase() != pptr()) { putOutput(); } } // synchronizes the stream buffer and returns 0 on success // synchronizing the stream buffer consists of inserting the buffer contents into the stream, // resetting the buffer and flushing the stream virtual int sync() { 
putOutput(); return 0; } void putOutput() { if (mShouldLog) { // prepend timestamp std::time_t timestamp = std::time(nullptr); tm* tm_local = std::localtime(×tamp); std::cout << "["; std::cout << std::setw(2) << std::setfill('0') << 1 + tm_local->tm_mon << "/"; std::cout << std::setw(2) << std::setfill('0') << tm_local->tm_mday << "/"; std::cout << std::setw(4) << std::setfill('0') << 1900 + tm_local->tm_year << "-"; std::cout << std::setw(2) << std::setfill('0') << tm_local->tm_hour << ":"; std::cout << std::setw(2) << std::setfill('0') << tm_local->tm_min << ":"; std::cout << std::setw(2) << std::setfill('0') << tm_local->tm_sec << "] "; // std::stringbuf::str() gets the string contents of the buffer // insert the buffer contents pre-appended by the appropriate prefix into the stream mOutput << mPrefix << str(); // set the buffer to empty str(""); // flush the stream mOutput.flush(); } } void setShouldLog(bool shouldLog) { mShouldLog = shouldLog; } private: std::ostream& mOutput; std::string mPrefix; bool mShouldLog; }; //! //! \class LogStreamConsumerBase //! \brief Convenience object used to initialize LogStreamConsumerBuffer before std::ostream in LogStreamConsumer //! class LogStreamConsumerBase { public: LogStreamConsumerBase(std::ostream& stream, const std::string& prefix, bool shouldLog) : mBuffer(stream, prefix, shouldLog) { } protected: LogStreamConsumerBuffer mBuffer; }; //! //! \class LogStreamConsumer //! \brief Convenience object used to facilitate use of C++ stream syntax when logging messages. //! Order of base classes is LogStreamConsumerBase and then std::ostream. //! This is because the LogStreamConsumerBase class is used to initialize the LogStreamConsumerBuffer member field //! in LogStreamConsumer and then the address of the buffer is passed to std::ostream. //! This is necessary to prevent the address of an uninitialized buffer from being passed to std::ostream. //! Please do not change the order of the parent classes. //! 
class LogStreamConsumer : protected LogStreamConsumerBase, public std::ostream { public: //! \brief Creates a LogStreamConsumer which logs messages with level severity. //! Reportable severity determines if the messages are severe enough to be logged. LogStreamConsumer(Severity reportableSeverity, Severity severity) : LogStreamConsumerBase(severityOstream(severity), severityPrefix(severity), severity <= reportableSeverity) , std::ostream(&mBuffer) // links the stream buffer with the stream , mShouldLog(severity <= reportableSeverity) , mSeverity(severity) { } LogStreamConsumer(LogStreamConsumer&& other) : LogStreamConsumerBase(severityOstream(other.mSeverity), severityPrefix(other.mSeverity), other.mShouldLog) , std::ostream(&mBuffer) // links the stream buffer with the stream , mShouldLog(other.mShouldLog) , mSeverity(other.mSeverity) { } void setReportableSeverity(Severity reportableSeverity) { mShouldLog = mSeverity <= reportableSeverity; mBuffer.setShouldLog(mShouldLog); } private: static std::ostream& severityOstream(Severity severity) { return severity >= Severity::kINFO ? std::cout : std::cerr; } static std::string severityPrefix(Severity severity) { switch (severity) { case Severity::kINTERNAL_ERROR: return "[F] "; case Severity::kERROR: return "[E] "; case Severity::kWARNING: return "[W] "; case Severity::kINFO: return "[I] "; case Severity::kVERBOSE: return "[V] "; default: assert(0); return ""; } } bool mShouldLog; Severity mSeverity; }; //! \class Logger //! //! \brief Class which manages logging of TensorRT tools and samples //! //! \details This class provides a common interface for TensorRT tools and samples to log information to the console, //! and supports logging two types of messages: //! //! - Debugging messages with an associated severity (info, warning, error, or internal error/fatal) //! - Test pass/fail messages //! //! The advantage of having all samples use this class for logging as opposed to emitting directly to stdout/stderr is //! 
that the logic for controlling the verbosity and formatting of sample output is centralized in one location. //! //! In the future, this class could be extended to support dumping test results to a file in some standard format //! (for example, JUnit XML), and providing additional metadata (e.g. timing the duration of a test run). //! //! TODO: For backwards compatibility with existing samples, this class inherits directly from the nvinfer1::ILogger //! interface, which is problematic since there isn't a clean separation between messages coming from the TensorRT //! library and messages coming from the sample. //! //! In the future (once all samples are updated to use Logger::getTRTLogger() to access the ILogger) we can refactor the //! class to eliminate the inheritance and instead make the nvinfer1::ILogger implementation a member of the Logger //! object. class Logger : public nvinfer1::ILogger { public: Logger(Severity severity = Severity::kWARNING) : mReportableSeverity(severity) { } //! //! \enum TestResult //! \brief Represents the state of a given test //! enum class TestResult { kRUNNING, //!< The test is running kPASSED, //!< The test passed kFAILED, //!< The test failed kWAIVED //!< The test was waived }; //! //! \brief Forward-compatible method for retrieving the nvinfer::ILogger associated with this Logger //! \return The nvinfer1::ILogger associated with this Logger //! //! TODO Once all samples are updated to use this method to register the logger with TensorRT, //! we can eliminate the inheritance of Logger from ILogger //! nvinfer1::ILogger& getTRTLogger() { return *this; } //! //! \brief Implementation of the nvinfer1::ILogger::log() virtual method //! //! Note samples should not be calling this function directly; it will eventually go away once we eliminate the //! inheritance from nvinfer1::ILogger //! 
void log(Severity severity, const char* msg) override { LogStreamConsumer(mReportableSeverity, severity) << "[TRT] " << std::string(msg) << std::endl; } //! //! \brief Method for controlling the verbosity of logging output //! //! \param severity The logger will only emit messages that have severity of this level or higher. //! void setReportableSeverity(Severity severity) { mReportableSeverity = severity; } //! //! \brief Opaque handle that holds logging information for a particular test //! //! This object is an opaque handle to information used by the Logger to print test results. //! The sample must call Logger::defineTest() in order to obtain a TestAtom that can be used //! with Logger::reportTest{Start,End}(). //! class TestAtom { public: TestAtom(TestAtom&&) = default; private: friend class Logger; TestAtom(bool started, const std::string& name, const std::string& cmdline) : mStarted(started) , mName(name) , mCmdline(cmdline) { } bool mStarted; std::string mName; std::string mCmdline; }; //! //! \brief Define a test for logging //! //! \param[in] name The name of the test. This should be a string starting with //! "TensorRT" and containing dot-separated strings containing //! the characters [A-Za-z0-9_]. //! For example, "TensorRT.sample_googlenet" //! \param[in] cmdline The command line used to reproduce the test // //! \return a TestAtom that can be used in Logger::reportTest{Start,End}(). //! static TestAtom defineTest(const std::string& name, const std::string& cmdline) { return TestAtom(false, name, cmdline); } //! //! \brief A convenience overloaded version of defineTest() that accepts an array of command-line arguments //! as input //! //! \param[in] name The name of the test //! \param[in] argc The number of command-line arguments //! \param[in] argv The array of command-line arguments (given as C strings) //! //! \return a TestAtom that can be used in Logger::reportTest{Start,End}(). 
static TestAtom defineTest(const std::string& name, int argc, char const* const* argv) { auto cmdline = genCmdlineString(argc, argv); return defineTest(name, cmdline); } //! //! \brief Report that a test has started. //! //! \pre reportTestStart() has not been called yet for the given testAtom //! //! \param[in] testAtom The handle to the test that has started //! static void reportTestStart(TestAtom& testAtom) { reportTestResult(testAtom, TestResult::kRUNNING); assert(!testAtom.mStarted); testAtom.mStarted = true; } //! //! \brief Report that a test has ended. //! //! \pre reportTestStart() has been called for the given testAtom //! //! \param[in] testAtom The handle to the test that has ended //! \param[in] result The result of the test. Should be one of TestResult::kPASSED, //! TestResult::kFAILED, TestResult::kWAIVED //! static void reportTestEnd(const TestAtom& testAtom, TestResult result) { assert(result != TestResult::kRUNNING); assert(testAtom.mStarted); reportTestResult(testAtom, result); } static int reportPass(const TestAtom& testAtom) { reportTestEnd(testAtom, TestResult::kPASSED); return EXIT_SUCCESS; } static int reportFail(const TestAtom& testAtom) { reportTestEnd(testAtom, TestResult::kFAILED); return EXIT_FAILURE; } static int reportWaive(const TestAtom& testAtom) { reportTestEnd(testAtom, TestResult::kWAIVED); return EXIT_SUCCESS; } static int reportTest(const TestAtom& testAtom, bool pass) { return pass ? reportPass(testAtom) : reportFail(testAtom); } Severity getReportableSeverity() const { return mReportableSeverity; } private: //! //! \brief returns an appropriate string for prefixing a log message with the given severity //! static const char* severityPrefix(Severity severity) { switch (severity) { case Severity::kINTERNAL_ERROR: return "[F] "; case Severity::kERROR: return "[E] "; case Severity::kWARNING: return "[W] "; case Severity::kINFO: return "[I] "; case Severity::kVERBOSE: return "[V] "; default: assert(0); return ""; } } //! //! 
\brief returns an appropriate string for prefixing a test result message with the given result //! static const char* testResultString(TestResult result) { switch (result) { case TestResult::kRUNNING: return "RUNNING"; case TestResult::kPASSED: return "PASSED"; case TestResult::kFAILED: return "FAILED"; case TestResult::kWAIVED: return "WAIVED"; default: assert(0); return ""; } } //! //! \brief returns an appropriate output stream (cout or cerr) to use with the given severity //! static std::ostream& severityOstream(Severity severity) { return severity >= Severity::kINFO ? std::cout : std::cerr; } //! //! \brief method that implements logging test results //! static void reportTestResult(const TestAtom& testAtom, TestResult result) { severityOstream(Severity::kINFO) << "&&&& " << testResultString(result) << " " << testAtom.mName << " # " << testAtom.mCmdline << std::endl; } //! //! \brief generate a command line string from the given (argc, argv) values //! static std::string genCmdlineString(int argc, char const* const* argv) { std::stringstream ss; for (int i = 0; i < argc; i++) { if (i > 0) ss << " "; ss << argv[i]; } return ss.str(); } Severity mReportableSeverity; }; namespace { //! //! \brief produces a LogStreamConsumer object that can be used to log messages of severity kVERBOSE //! //! Example usage: //! //! LOG_VERBOSE(logger) << "hello world" << std::endl; //! inline LogStreamConsumer LOG_VERBOSE(const Logger& logger) { return LogStreamConsumer(logger.getReportableSeverity(), Severity::kVERBOSE); } //! //! \brief produces a LogStreamConsumer object that can be used to log messages of severity kINFO //! //! Example usage: //! //! LOG_INFO(logger) << "hello world" << std::endl; //! inline LogStreamConsumer LOG_INFO(const Logger& logger) { return LogStreamConsumer(logger.getReportableSeverity(), Severity::kINFO); } //! //! \brief produces a LogStreamConsumer object that can be used to log messages of severity kWARNING //! //! Example usage: //! //! 
LOG_WARN(logger) << "hello world" << std::endl; //! inline LogStreamConsumer LOG_WARN(const Logger& logger) { return LogStreamConsumer(logger.getReportableSeverity(), Severity::kWARNING); } //! //! \brief produces a LogStreamConsumer object that can be used to log messages of severity kERROR //! //! Example usage: //! //! LOG_ERROR(logger) << "hello world" << std::endl; //! inline LogStreamConsumer LOG_ERROR(const Logger& logger) { return LogStreamConsumer(logger.getReportableSeverity(), Severity::kERROR); } //! //! \brief produces a LogStreamConsumer object that can be used to log messages of severity kINTERNAL_ERROR // ("fatal" severity) //! //! Example usage: //! //! LOG_FATAL(logger) << "hello world" << std::endl; //! inline LogStreamConsumer LOG_FATAL(const Logger& logger) { return LogStreamConsumer(logger.getReportableSeverity(), Severity::kINTERNAL_ERROR); } } // anonymous namespace #endif // TENSORRT_LOGGING_H ================================================ FILE: dbnet/utils.h ================================================ #ifndef __TRT_UTILS_H_ #define __TRT_UTILS_H_ #include #include #include #include #ifndef CUDA_CHECK #define CUDA_CHECK(callstr) \ { \ cudaError_t error_code = callstr; \ if (error_code != cudaSuccess) { \ std::cerr << "CUDA error " << error_code << " at " << __FILE__ << ":" << __LINE__; \ assert(0); \ } \ } #endif namespace Tn { class Profiler : public nvinfer1::IProfiler { public: void printLayerTimes(int itrationsTimes) { float totalTime = 0; for (size_t i = 0; i < mProfile.size(); i++) { printf("%-40.40s %4.3fms\n", mProfile[i].first.c_str(), mProfile[i].second / itrationsTimes); totalTime += mProfile[i].second; } printf("Time over all layers: %4.3f\n", totalTime / itrationsTimes); } private: typedef std::pair Record; std::vector mProfile; virtual void reportLayerTime(const char* layerName, float ms) { auto record = std::find_if(mProfile.begin(), mProfile.end(), [&](const Record& r){ return r.first == layerName; }); if (record == 
mProfile.end()) mProfile.push_back(std::make_pair(layerName, ms)); else record->second += ms; } }; //Logger for TensorRT info/warning/errors class Logger : public nvinfer1::ILogger { public: Logger(): Logger(Severity::kWARNING) {} Logger(Severity severity): reportableSeverity(severity) {} void log(Severity severity, const char* msg) override { // suppress messages with severity enum value greater than the reportable if (severity > reportableSeverity) return; switch (severity) { case Severity::kINTERNAL_ERROR: std::cerr << "INTERNAL_ERROR: "; break; case Severity::kERROR: std::cerr << "ERROR: "; break; case Severity::kWARNING: std::cerr << "WARNING: "; break; case Severity::kINFO: std::cerr << "INFO: "; break; default: std::cerr << "UNKNOWN: "; break; } std::cerr << msg << std::endl; } Severity reportableSeverity{Severity::kWARNING}; }; template void write(char*& buffer, const T& val) { *reinterpret_cast(buffer) = val; buffer += sizeof(T); } template void read(const char*& buffer, T& val) { val = *reinterpret_cast(buffer); buffer += sizeof(T); } } #endif ================================================ FILE: densenet/CMakeLists.txt ================================================ cmake_minimum_required(VERSION 3.10) # set the project name project(densenet) add_definitions(-std=c++11) # get main project dir to include common files get_filename_component(MAIN_DIR ../ ABSOLUTE) # When enabled the static version of the # CUDA runtime library will be used in CUDA_LIBRARIES option(CUDA_USE_STATIC_CUDA_RUNTIME OFF) # specify the C++ standard set(CMAKE_CXX_STANDARD 11) set(CMAKE_CXX_STANDARD_REQUIRED True) set(CMAKE_BUILD_TYPE Debug) # include # include and link cuda include_directories(/usr/local/cuda/include) link_directories(/usr/local/cuda/lib64) # include and link tensorrt include_directories(/usr/include/x86_64-linux-gnu) link_directories(/usr/lib/x86_64-linux-gnu) # add the executable add_executable(densenet ${PROJECT_SOURCE_DIR}/densenet121.cpp) 
target_link_libraries(densenet nvinfer) target_link_libraries(densenet cudart) add_definitions(-O2 -pthread) ================================================ FILE: densenet/README.md ================================================ # Densenet121 The Pytorch implementation is [makaveli10/densenet](https://github.com/makaveli10/torchtrtz/tree/main/densenet). Model from torchvision. The tensorrt implemenation is taken from [makaveli10/cpptensorrtz](https://github.com/makaveli10/cpptensorrtz/). ## How to Run 1. generate densenet121.wts from pytorch ``` git clone https://github.com/wang-xinyu/tensorrtx.git git clone https://github.com/makaveli10/torchtrtz.git // go to torchtrtz/densenet // Enter these two commands to create densenet121.wts python models.py python gen_trtwts.py ``` 2. build densenet and run ``` // put densenet121.wts into tensorrtx/densenet // go to tensorrtx/densenet mkdir build cd build cmake .. make sudo ./densenet -s // serialize model to file i.e. 'densenet.engine' sudo ./densenet -d // deserialize model and run inference ``` 3. 
Verify output from [torch impl](https://github.com/makaveli10/torchtrtz/blob/main/densenet/README.md) TensorRT output[:5]: ``` [-0.587389, -0.329202, -1.83404, -1.89935, -0.928404] ``` ================================================ FILE: densenet/densenet121.cpp ================================================ #include "NvInfer.h" #include "cuda_runtime_api.h" #include "logging.h" #include #include #include #include #include #include #include #define CHECK(status) \ do\ {\ auto ret = (status);\ if (ret != 0)\ {\ std::cerr << "Cuda failure: " << ret << std::endl;\ abort();\ }\ } while (0) // stuff we know about the network and the input/output blobs static const int INPUT_H = 224; static const int INPUT_W = 224; static const int OUTPUT_SIZE = 1000; const char* INPUT_BLOB_NAME = "data"; const char* OUTPUT_BLOB_NAME = "prob"; using namespace nvinfer1; static Logger gLogger; // Load weights from files shared with TensorRT samples. // TensorRT weight files have a simple space delimited format: // [type] [size] std::map loadWeights(const std::string file) { std::cout << "Loading weights: " << file << std::endl; std::map weightMap; // Open weights file std::ifstream input(file); assert(input.is_open() && "Unable to load weight file."); // Read number of weight blobs int32_t count; input >> count; assert(count > 0 && "Invalid weight map file."); while (count--) { Weights wt{DataType::kFLOAT, nullptr, 0}; uint32_t size; // Read name and type of blob std::string name; input >> name >> std::dec >> size; wt.type = DataType::kFLOAT; // Load blob uint32_t* val = reinterpret_cast(malloc(sizeof(val) * size)); for (uint32_t x = 0, y = size; x < y; ++x) { input >> std::hex >> val[x]; } wt.values = val; wt.count = size; weightMap[name] = wt; } return weightMap; } IScaleLayer* addBatchNorm2d(INetworkDefinition *network, std::map& weightMap, ITensor& input, std::string lname, float eps) { float *gamma = (float*)weightMap[lname + ".weight"].values; float *beta = 
(float*)weightMap[lname + ".bias"].values; float *mean = (float*)weightMap[lname + ".running_mean"].values; float *var = (float*)weightMap[lname + ".running_var"].values; int len = weightMap[lname + ".running_var"].count; std::cout << "len " << len << std::endl; float *scval = reinterpret_cast(malloc(sizeof(float) * len)); for (int i = 0; i < len; i++) { scval[i] = gamma[i] / sqrt(var[i] + eps); } Weights scale{DataType::kFLOAT, scval, len}; float *shval = reinterpret_cast(malloc(sizeof(float) * len)); for (int i = 0; i < len; i++) { shval[i] = beta[i] - mean[i] * gamma[i] / sqrt(var[i] + eps); } Weights shift{DataType::kFLOAT, shval, len}; float *pval = reinterpret_cast(malloc(sizeof(float) * len)); for (int i = 0; i < len; i++) { pval[i] = 1.0; } Weights power{DataType::kFLOAT, pval, len}; weightMap[lname + ".scale"] = scale; weightMap[lname + ".shift"] = shift; weightMap[lname + ".power"] = power; IScaleLayer* scale_1 = network->addScale(input, ScaleMode::kCHANNEL, shift, scale, power); assert(scale_1); return scale_1; } IConvolutionLayer* addDenseLayer(INetworkDefinition* network, ITensor* input, std::map& weightMap, std::string lname, float eps) { // add Batchnorm IScaleLayer* bn1 = addBatchNorm2d(network, weightMap, *input, lname + ".norm1", eps); // add relu IActivationLayer* relu1 = network -> addActivation(*bn1->getOutput(0), ActivationType::kRELU); assert(relu1); // add conv Weights emptywts{DataType::kFLOAT, nullptr, 0}; IConvolutionLayer* conv1 = network -> addConvolutionNd(*relu1->getOutput(0), 128, DimsHW{1, 1}, weightMap[lname + ".conv1.weight"], emptywts); assert(conv1); conv1 -> setStrideNd(DimsHW{1, 1}); // add Batchnorm IScaleLayer* bn2 = addBatchNorm2d(network, weightMap, *conv1 -> getOutput(0), lname + ".norm2", eps); // add relu IActivationLayer* relu2 = network -> addActivation(*bn2->getOutput(0), ActivationType::kRELU); assert(relu2); // add conv IConvolutionLayer* conv2 = network -> addConvolutionNd(*relu2->getOutput(0), 32, DimsHW{3, 3}, 
weightMap[lname + ".conv2.weight"], emptywts); assert(conv2); conv2 -> setStrideNd(DimsHW{1, 1}); conv2 -> setPaddingNd(DimsHW{1, 1}); return conv2; } IPoolingLayer* addTransition(INetworkDefinition* network, ITensor& input, std::map& weightMap, int outch, std::string lname, float eps) { // add batch norm IScaleLayer* bn1 = addBatchNorm2d(network, weightMap,input, lname + ".norm", eps); // add relu activation IActivationLayer* relu1 = network -> addActivation(*bn1->getOutput(0), ActivationType::kRELU); assert(relu1); // add convolution layer // empty weights for no bias Weights emptywts{DataType::kFLOAT, nullptr, 0}; IConvolutionLayer* conv1 = network -> addConvolutionNd(*relu1->getOutput(0), outch, DimsHW{1, 1}, weightMap[lname + ".conv.weight"], emptywts); assert(conv1); conv1 -> setStrideNd(DimsHW{1, 1}); // add pooling IPoolingLayer* pool1 = network->addPoolingNd(*conv1->getOutput(0), PoolingType::kAVERAGE, DimsHW{2, 2}); assert(pool1); pool1 -> setStrideNd(DimsHW{2, 2}); pool1 -> setPaddingNd(DimsHW{0,0}); return pool1; } IConcatenationLayer* addDenseBlock(INetworkDefinition* network, ITensor* input, std::map& weightMap, int numDenseLayers, std::string lname, float eps) { IConvolutionLayer* c{nullptr}; IConcatenationLayer* concat{nullptr}; ITensor* inputTensors[numDenseLayers+1]; inputTensors[0] = input; c = addDenseLayer(network, input, weightMap, lname + ".denselayer" + std::to_string(1), eps); int i; for(i=1; i getOutput(0); concat = network -> addConcatenation(inputTensors, i+1); assert(concat); c = addDenseLayer(network, concat->getOutput(0), weightMap, lname + ".denselayer" + std::to_string(i+1), eps); } inputTensors[numDenseLayers] = c -> getOutput(0); concat = network -> addConcatenation(inputTensors, numDenseLayers+1); assert(concat); return concat; } /** * Uses the TensorRT API to create the network engine. 
**/ ICudaEngine* createEngine(unsigned int maxBatchSize, IBuilder* builder, IBuilderConfig* config, DataType dt) { // Initialize NetworkDefinition INetworkDefinition* network = builder -> createNetworkV2(0U); auto data = network -> addInput(INPUT_BLOB_NAME, dt, Dims3{3, INPUT_H, INPUT_W}); assert(data); std::map weightMap = loadWeights("../densenet121.wts"); Weights emptywts{DataType::kFLOAT, nullptr, 0}; auto conv0 = network -> addConvolutionNd(*data, 64, DimsHW{7, 7}, weightMap["features.conv0.weight"], emptywts); assert(conv0); conv0 -> setStrideNd(DimsHW{2, 2}); conv0 -> setPaddingNd(DimsHW{3, 3}); auto norm0 = addBatchNorm2d(network, weightMap, *conv0 -> getOutput(0), "features.norm0", 1e-5); auto relu0 = network -> addActivation(*norm0 -> getOutput(0), ActivationType::kRELU); assert(relu0); auto pool0 = network -> addPoolingNd(*relu0 -> getOutput(0), PoolingType::kMAX, DimsHW{3, 3}); assert(pool0); pool0 -> setStrideNd(DimsHW{2, 2}); pool0 -> setPaddingNd(DimsHW{1, 1}); auto dense1 = addDenseBlock(network, pool0 -> getOutput(0), weightMap, 6, "features.denseblock1", 1e-5); auto transition1 = addTransition(network, *dense1 -> getOutput(0), weightMap, 128, "features.transition1", 1e-5); auto dense2 = addDenseBlock(network, transition1 -> getOutput(0), weightMap, 12, "features.denseblock2", 1e-5); auto transition2 = addTransition(network, *dense2 -> getOutput(0), weightMap, 256, "features.transition2", 1e-5); auto dense3 = addDenseBlock(network, transition2 -> getOutput(0), weightMap, 24, "features.denseblock3", 1e-5); auto transition3 = addTransition(network, *dense3 -> getOutput(0), weightMap, 512, "features.transition3", 1e-5); auto dense4 = addDenseBlock(network, transition3 -> getOutput(0), weightMap, 16, "features.denseblock4", 1e-5); auto bn5 = addBatchNorm2d(network, weightMap, *dense4 -> getOutput(0), "features.norm5", 1e-5); auto relu5 = network -> addActivation(*bn5 -> getOutput(0), ActivationType::kRELU); // adaptive average pool => pytorch 
(F.adaptive_avg_pool2d(input, (1, 1))) auto pool5 = network -> addPoolingNd(*relu5 -> getOutput(0), PoolingType::kAVERAGE, DimsHW{7,7}); auto fc1 = network -> addFullyConnected(*pool5 -> getOutput(0), 1000, weightMap["classifier.weight"], weightMap["classifier.bias"]); assert(fc1); // set ouput blob name fc1 -> getOutput(0) -> setName(OUTPUT_BLOB_NAME); std::cout << "set name out" << std::endl; // mark the output network -> markOutput(*fc1 -> getOutput(0)); // set batchsize and workspace size builder -> setMaxBatchSize(maxBatchSize); config -> setMaxWorkspaceSize(1 << 28); // 256 MiB // build engine ICudaEngine* engine = builder -> buildEngineWithConfig(*network, *config); std::cout << "build out" << std::endl; // destroy network -> destroy(); // fere host mem for(auto& mem: weightMap) { free((void*)(mem.second.values)); } return engine; } void APIToModel(unsigned int maxBatchSize, IHostMemory** modelStream) { // Create builder IBuilder* builder = createInferBuilder(gLogger); IBuilderConfig* config = builder->createBuilderConfig(); // Create model to populate the network, then set the outputs and create an engine ICudaEngine* engine = createEngine(maxBatchSize, builder, config, DataType::kFLOAT); assert(engine != nullptr); // Serialize the engine (*modelStream) = engine->serialize(); // Close everything down engine->destroy(); builder->destroy(); config->destroy(); } /** * Performs inference on the given input and * writes the output from device to host memory. **/ void doInference(IExecutionContext& context, float* input, float* output, int batchSize) { const ICudaEngine& engine = context.getEngine(); // Pointers to input and output device buffers to pass to engine. // Engine requires exactly IEngine::getNbBindings() number of buffers. assert(engine.getNbBindings() == 2); void* buffers[2]; // In order to bind the buffers, we need to know the names of the input and output tensors. 
// Note that indices are guaranteed to be less than IEngine::getNbBindings() const int inputIndex = engine.getBindingIndex(INPUT_BLOB_NAME); const int outputIndex = engine.getBindingIndex(OUTPUT_BLOB_NAME); // Create GPU buffers on device CHECK(cudaMalloc(&buffers[inputIndex], batchSize * 3 * INPUT_H * INPUT_W * sizeof(float))); CHECK(cudaMalloc(&buffers[outputIndex], batchSize * OUTPUT_SIZE * sizeof(float))); // Create stream cudaStream_t stream; CHECK(cudaStreamCreate(&stream)); // DMA input batch data to device, infer on the batch asynchronously, and DMA output back to host CHECK(cudaMemcpyAsync(buffers[inputIndex], input, batchSize * 3 * INPUT_H * INPUT_W * sizeof(float), cudaMemcpyHostToDevice, stream)); context.enqueue(batchSize, buffers, stream, nullptr); CHECK(cudaMemcpyAsync(output, buffers[outputIndex], batchSize * OUTPUT_SIZE * sizeof(float), cudaMemcpyDeviceToHost, stream)); cudaStreamSynchronize(stream); // Release stream and buffers cudaStreamDestroy(stream); CHECK(cudaFree(buffers[inputIndex])); CHECK(cudaFree(buffers[outputIndex])); } int main(int argc, char** argv) { if (argc != 2) { std::cerr << "arguments not right!" 
<< std::endl; std::cerr << "./densenet -s // serialize model to plan file" << std::endl; std::cerr << "./densenet -d // deserialize plan file and run inference" << std::endl; return -1; } // create a model using the API directly and serialize it to a stream char *trtModelStream{nullptr}; size_t size{0}; if (std::string(argv[1]) == "-s") { IHostMemory* modelStream{nullptr}; APIToModel(1, &modelStream); assert(modelStream != nullptr); std::ofstream p("densenet.engine", std::ios::binary); if (!p) { std::cerr << "could not open plan output file" << std::endl; return -1; } p.write(reinterpret_cast(modelStream->data()), modelStream->size()); modelStream->destroy(); return 1; } else if (std::string(argv[1]) == "-d") { std::ifstream file("densenet.engine", std::ios::binary); if (file.good()) { file.seekg(0, file.end); size = file.tellg(); file.seekg(0, file.beg); trtModelStream = new char[size]; assert(trtModelStream); file.read(trtModelStream, size); file.close(); } } else { return -1; } // Subtract mean from image static float data[3 * INPUT_H * INPUT_W]; for (int i = 0; i < 3 * INPUT_H * INPUT_W; i++) data[i] = 1.0; IRuntime* runtime = createInferRuntime(gLogger); assert(runtime != nullptr); ICudaEngine* engine = runtime->deserializeCudaEngine(trtModelStream, size, nullptr); assert(engine != nullptr); IExecutionContext* context = engine->createExecutionContext(); assert(context != nullptr); delete[] trtModelStream; // Run inference static float prob[OUTPUT_SIZE]; for (int i = 0; i < 100; i++) { auto start = std::chrono::system_clock::now(); doInference(*context, data, prob, 1); auto end = std::chrono::system_clock::now(); std::cout << std::chrono::duration_cast(end - start).count() << "ms" << std::endl; } // Destroy the engine context->destroy(); engine->destroy(); runtime->destroy(); // Print histogram of the output distribution std::cout << "\nOutput:\n\n"; for (unsigned int i = 0; i < OUTPUT_SIZE; i++) { std::cout << prob[i] << ", "; if (i % 10 == 0) std::cout << i / 
10 << std::endl; } std::cout << std::endl; return 0; } ================================================ FILE: densenet/densenet121.py ================================================ import os import sys import struct import argparse import numpy as np import pycuda.autoinit import pycuda.driver as cuda import tensorrt as trt BATCH_SIZE = 1 INPUT_H = 224 INPUT_W = 224 OUTPUT_SIZE = 1000 INPUT_BLOB_NAME = "data" OUTPUT_BLOB_NAME = "prob" EPS = 1e-5 WEIGHT_PATH = "./densenet121.wts" ENGINE_PATH = "./densenet121.engine" TRT_LOGGER = trt.Logger(trt.Logger.INFO) def load_weights(file): print(f"Loading weights: {file}") assert os.path.exists(file), 'Unable to load weight file.' weight_map = {} with open(file, "r") as f: lines = [line.strip() for line in f] count = int(lines[0]) assert count == len(lines) - 1 for i in range(1, count + 1): splits = lines[i].split(" ") name = splits[0] cur_count = int(splits[1]) assert cur_count + 2 == len(splits) values = [] for j in range(2, len(splits)): # hex string to bytes to float values.append(struct.unpack(">f", bytes.fromhex(splits[j]))) weight_map[name] = np.array(values, dtype=np.float32) return weight_map def add_batch_norm_2d(network, weight_map, input, layer_name): gamma = weight_map[layer_name + ".weight"] beta = weight_map[layer_name + ".bias"] mean = weight_map[layer_name + ".running_mean"] var = weight_map[layer_name + ".running_var"] var = np.sqrt(var + EPS) scale = gamma / var shift = -mean / var * gamma + beta return network.add_scale(input=input, mode=trt.ScaleMode.CHANNEL, shift=shift, scale=scale) def add_dense_layer(network, input, weight_map, lname): bn1 = add_batch_norm_2d(network, weight_map, input, lname + ".norm1") relu1 = network.add_activation(bn1.get_output(0), type=trt.ActivationType.RELU) assert relu1 conv1 = network.add_convolution(input=relu1.get_output(0), num_output_maps=128, kernel_shape=(1, 1), kernel=weight_map[lname + ".conv1.weight"], bias=trt.Weights()) assert conv1 conv1.stride = (1, 1) bn2 = 
add_batch_norm_2d(network, weight_map, conv1.get_output(0), lname + ".norm2") relu2 = network.add_activation(bn2.get_output(0), type=trt.ActivationType.RELU) assert relu2 conv2 = network.add_convolution(input=relu2.get_output(0), num_output_maps=32, kernel_shape=(3, 3), kernel=weight_map[lname + ".conv2.weight"], bias=trt.Weights()) assert conv2 conv2.stride = (1, 1) conv2.padding = (1, 1) return conv2 def add_transition(network, input, weight_map, outch, lname): bn1 = add_batch_norm_2d(network, weight_map, input, lname + ".norm") relu1 = network.add_activation(bn1.get_output(0), type=trt.ActivationType.RELU) assert relu1 conv1 = network.add_convolution(input=relu1.get_output(0), num_output_maps=outch, kernel_shape=(1, 1), kernel=weight_map[lname + ".conv.weight"], bias=trt.Weights()) assert conv1 conv1.stride = (1, 1) pool1 = network.add_pooling(input=conv1.get_output(0), type=trt.PoolingType.AVERAGE, window_size=trt.DimsHW(2, 2)) assert pool1 pool1.stride_nd = (2, 2) pool1.padding_nd = (0, 0) return pool1 def add_dense_block(network, input, weight_map, num_dense_layers, lname): input_tensors = [None for _ in range(num_dense_layers+1)] input_tensors[0] = input c = add_dense_layer(network, input, weight_map, lname + ".denselayer" + str(1)) for i in range(1, num_dense_layers): input_tensors[i] = c.get_output(0) concat = network.add_concatenation(input_tensors[:i+1]) assert concat c = add_dense_layer(network, concat.get_output(0), weight_map, lname + ".denselayer" + str(i+1)) input_tensors[num_dense_layers] = c.get_output(0) concat = network.add_concatenation(input_tensors) assert concat return concat def create_engine(max_batch_size, builder, config, dt): weight_map = load_weights(WEIGHT_PATH) network = builder.create_network() data = network.add_input(INPUT_BLOB_NAME, dt, (3, INPUT_H, INPUT_W)) assert data conv0 = network.add_convolution(input=data, num_output_maps=64, kernel_shape=(7, 7), kernel=weight_map["features.conv0.weight"], bias=trt.Weights()) assert conv0 
conv0.stride = (2, 2) conv0.padding = (3, 3) bn0 = add_batch_norm_2d(network, weight_map, conv0.get_output(0), "features.norm0") relu0 = network.add_activation(bn0.get_output(0), type=trt.ActivationType.RELU) assert relu0 pool0 = network.add_pooling(input=relu0.get_output(0), type=trt.PoolingType.MAX, window_size=trt.DimsHW(3, 3)) assert pool0 pool0.stride_nd = (2, 2) pool0.padding_nd = (1, 1) dense1 = add_dense_block(network, pool0.get_output(0), weight_map, 6, "features.denseblock1") transition1 = add_transition(network, dense1.get_output(0), weight_map, 128, "features.transition1") dense2 = add_dense_block(network, transition1.get_output(0), weight_map, 12, "features.denseblock2") transition2 = add_transition(network, dense2.get_output(0), weight_map, 256, "features.transition2") dense3 = add_dense_block(network, transition2.get_output(0), weight_map, 24, "features.denseblock3") transition3 = add_transition(network, dense3.get_output(0), weight_map, 512, "features.transition3") dense4 = add_dense_block(network, transition3.get_output(0), weight_map, 16, "features.denseblock4") bn5 = add_batch_norm_2d(network, weight_map, dense4.get_output(0), "features.norm5") relu5 = network.add_activation(bn5.get_output(0), type=trt.ActivationType.RELU) pool5 = network.add_pooling(relu5.get_output(0), type=trt.PoolingType.AVERAGE, window_size=trt.DimsHW(7, 7)) fc1 = network.add_fully_connected(input=pool5.get_output(0), num_outputs=OUTPUT_SIZE, kernel=weight_map["classifier.weight"], bias=weight_map["classifier.bias"]) assert fc1 fc1.get_output(0).name = OUTPUT_BLOB_NAME network.mark_output(fc1.get_output(0)) # Build Engine builder.max_batch_size = max_batch_size builder.max_workspace_size = 1 << 20 engine = builder.build_engine(network, config) del network del weight_map return engine def API_to_model(max_batch_size): builder = trt.Builder(TRT_LOGGER) config = builder.create_builder_config() engine = create_engine(max_batch_size, builder, config, trt.float32) assert engine 
with open(ENGINE_PATH, "wb") as f: f.write(engine.serialize()) del engine del builder del config class HostDeviceMem(object): def __init__(self, host_mem, device_mem): self.host = host_mem self.device = device_mem def __str__(self): return "Host:\n" + str(self.host) + "\nDevice:\n" + str(self.device) def __repr__(self): return self.__str__() def allocate_buffers(engine): inputs = [] outputs = [] bindings = [] stream = cuda.Stream() for binding in engine: size = trt.volume(engine.get_binding_shape(binding)) * engine.max_batch_size dtype = trt.nptype(engine.get_binding_dtype(binding)) # Allocate host and device buffers host_mem = cuda.pagelocked_empty(size, dtype) device_mem = cuda.mem_alloc(host_mem.nbytes) # Append the device buffer to device bindings. bindings.append(int(device_mem)) # Append to the appropriate list. if engine.binding_is_input(binding): inputs.append(HostDeviceMem(host_mem, device_mem)) else: outputs.append(HostDeviceMem(host_mem, device_mem)) return inputs, outputs, bindings, stream def do_inference(context, bindings, inputs, outputs, stream, batch_size=1): # Transfer input data to the GPU. [cuda.memcpy_htod_async(inp.device, inp.host, stream) for inp in inputs] # Run inference. context.execute_async(batch_size=batch_size, bindings=bindings, stream_handle=stream.handle) # Transfer predictions back from the GPU. [cuda.memcpy_dtoh_async(out.host, out.device, stream) for out in outputs] # Synchronize the stream stream.synchronize() # Return only the host outputs. 
return [out.host for out in outputs] if __name__ == "__main__": parser = argparse.ArgumentParser() parser.add_argument("-s", action='store_true') parser.add_argument("-d", action='store_true') args = parser.parse_args() if not (args.s ^ args.d): print( "arguments not right!\n" "python densenet121.py -s # serialize model to plan file\n" "python densenet121.py -d # deserialize plan file and run inference" ) sys.exit() if args.s: API_to_model(BATCH_SIZE) else: runtime = trt.Runtime(TRT_LOGGER) assert runtime with open(ENGINE_PATH, "rb") as f: engine = runtime.deserialize_cuda_engine(f.read()) assert engine context = engine.create_execution_context() assert context data = np.ones((BATCH_SIZE * 3 * INPUT_H * INPUT_W), dtype=np.float32) inputs, outputs, bindings, stream = allocate_buffers(engine) inputs[0].host = data trt_outputs = do_inference(context, bindings=bindings, inputs=inputs, outputs=outputs, stream=stream) print(f'Output: \n{trt_outputs[0][:10]}\n{trt_outputs[0][-10:]}') ================================================ FILE: densenet/logging.h ================================================ /* * Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
*/ #ifndef TENSORRT_LOGGING_H #define TENSORRT_LOGGING_H #include "NvInferRuntimeCommon.h" #include #include #include #include #include #include #include using Severity = nvinfer1::ILogger::Severity; class LogStreamConsumerBuffer : public std::stringbuf { public: LogStreamConsumerBuffer(std::ostream& stream, const std::string& prefix, bool shouldLog) : mOutput(stream) , mPrefix(prefix) , mShouldLog(shouldLog) { } LogStreamConsumerBuffer(LogStreamConsumerBuffer&& other) : mOutput(other.mOutput) , mPrefix(other.mPrefix) , mShouldLog(other.mShouldLog) { } ~LogStreamConsumerBuffer() { // std::streambuf::pbase() gives a pointer to the beginning of the buffered part of the output sequence // std::streambuf::pptr() gives a pointer to the current position of the output sequence // if the pointer to the beginning is not equal to the pointer to the current position, // call putOutput() to log the output to the stream if (pbase() != pptr()) { putOutput(); } } // synchronizes the stream buffer and returns 0 on success // synchronizing the stream buffer consists of inserting the buffer contents into the stream, // resetting the buffer and flushing the stream virtual int sync() { putOutput(); return 0; } void putOutput() { if (mShouldLog) { // prepend timestamp std::time_t timestamp = std::time(nullptr); tm* tm_local = std::localtime(×tamp); std::cout << "["; std::cout << std::setw(2) << std::setfill('0') << 1 + tm_local->tm_mon << "/"; std::cout << std::setw(2) << std::setfill('0') << tm_local->tm_mday << "/"; std::cout << std::setw(4) << std::setfill('0') << 1900 + tm_local->tm_year << "-"; std::cout << std::setw(2) << std::setfill('0') << tm_local->tm_hour << ":"; std::cout << std::setw(2) << std::setfill('0') << tm_local->tm_min << ":"; std::cout << std::setw(2) << std::setfill('0') << tm_local->tm_sec << "] "; // std::stringbuf::str() gets the string contents of the buffer // insert the buffer contents pre-appended by the appropriate prefix into the stream mOutput << 
mPrefix << str(); // set the buffer to empty str(""); // flush the stream mOutput.flush(); } } void setShouldLog(bool shouldLog) { mShouldLog = shouldLog; } private: std::ostream& mOutput; std::string mPrefix; bool mShouldLog; }; //! //! \class LogStreamConsumerBase //! \brief Convenience object used to initialize LogStreamConsumerBuffer before std::ostream in LogStreamConsumer //! class LogStreamConsumerBase { public: LogStreamConsumerBase(std::ostream& stream, const std::string& prefix, bool shouldLog) : mBuffer(stream, prefix, shouldLog) { } protected: LogStreamConsumerBuffer mBuffer; }; //! //! \class LogStreamConsumer //! \brief Convenience object used to facilitate use of C++ stream syntax when logging messages. //! Order of base classes is LogStreamConsumerBase and then std::ostream. //! This is because the LogStreamConsumerBase class is used to initialize the LogStreamConsumerBuffer member field //! in LogStreamConsumer and then the address of the buffer is passed to std::ostream. //! This is necessary to prevent the address of an uninitialized buffer from being passed to std::ostream. //! Please do not change the order of the parent classes. //! class LogStreamConsumer : protected LogStreamConsumerBase, public std::ostream { public: //! \brief Creates a LogStreamConsumer which logs messages with level severity. //! Reportable severity determines if the messages are severe enough to be logged. 
LogStreamConsumer(Severity reportableSeverity, Severity severity) : LogStreamConsumerBase(severityOstream(severity), severityPrefix(severity), severity <= reportableSeverity) , std::ostream(&mBuffer) // links the stream buffer with the stream , mShouldLog(severity <= reportableSeverity) , mSeverity(severity) { } LogStreamConsumer(LogStreamConsumer&& other) : LogStreamConsumerBase(severityOstream(other.mSeverity), severityPrefix(other.mSeverity), other.mShouldLog) , std::ostream(&mBuffer) // links the stream buffer with the stream , mShouldLog(other.mShouldLog) , mSeverity(other.mSeverity) { } void setReportableSeverity(Severity reportableSeverity) { mShouldLog = mSeverity <= reportableSeverity; mBuffer.setShouldLog(mShouldLog); } private: static std::ostream& severityOstream(Severity severity) { return severity >= Severity::kINFO ? std::cout : std::cerr; } static std::string severityPrefix(Severity severity) { switch (severity) { case Severity::kINTERNAL_ERROR: return "[F] "; case Severity::kERROR: return "[E] "; case Severity::kWARNING: return "[W] "; case Severity::kINFO: return "[I] "; case Severity::kVERBOSE: return "[V] "; default: assert(0); return ""; } } bool mShouldLog; Severity mSeverity; }; //! \class Logger //! //! \brief Class which manages logging of TensorRT tools and samples //! //! \details This class provides a common interface for TensorRT tools and samples to log information to the console, //! and supports logging two types of messages: //! //! - Debugging messages with an associated severity (info, warning, error, or internal error/fatal) //! - Test pass/fail messages //! //! The advantage of having all samples use this class for logging as opposed to emitting directly to stdout/stderr is //! that the logic for controlling the verbosity and formatting of sample output is centralized in one location. //! //! In the future, this class could be extended to support dumping test results to a file in some standard format //! 
(for example, JUnit XML), and providing additional metadata (e.g. timing the duration of a test run). //! //! TODO: For backwards compatibility with existing samples, this class inherits directly from the nvinfer1::ILogger //! interface, which is problematic since there isn't a clean separation between messages coming from the TensorRT //! library and messages coming from the sample. //! //! In the future (once all samples are updated to use Logger::getTRTLogger() to access the ILogger) we can refactor the //! class to eliminate the inheritance and instead make the nvinfer1::ILogger implementation a member of the Logger //! object. class Logger : public nvinfer1::ILogger { public: Logger(Severity severity = Severity::kWARNING) : mReportableSeverity(severity) { } //! //! \enum TestResult //! \brief Represents the state of a given test //! enum class TestResult { kRUNNING, //!< The test is running kPASSED, //!< The test passed kFAILED, //!< The test failed kWAIVED //!< The test was waived }; //! //! \brief Forward-compatible method for retrieving the nvinfer::ILogger associated with this Logger //! \return The nvinfer1::ILogger associated with this Logger //! //! TODO Once all samples are updated to use this method to register the logger with TensorRT, //! we can eliminate the inheritance of Logger from ILogger //! nvinfer1::ILogger& getTRTLogger() { return *this; } //! //! \brief Implementation of the nvinfer1::ILogger::log() virtual method //! //! Note samples should not be calling this function directly; it will eventually go away once we eliminate the //! inheritance from nvinfer1::ILogger //! void log(Severity severity, const char* msg) override { LogStreamConsumer(mReportableSeverity, severity) << "[TRT] " << std::string(msg) << std::endl; } //! //! \brief Method for controlling the verbosity of logging output //! //! \param severity The logger will only emit messages that have severity of this level or higher. //! 
void setReportableSeverity(Severity severity) { mReportableSeverity = severity; } //! //! \brief Opaque handle that holds logging information for a particular test //! //! This object is an opaque handle to information used by the Logger to print test results. //! The sample must call Logger::defineTest() in order to obtain a TestAtom that can be used //! with Logger::reportTest{Start,End}(). //! class TestAtom { public: TestAtom(TestAtom&&) = default; private: friend class Logger; TestAtom(bool started, const std::string& name, const std::string& cmdline) : mStarted(started) , mName(name) , mCmdline(cmdline) { } bool mStarted; std::string mName; std::string mCmdline; }; //! //! \brief Define a test for logging //! //! \param[in] name The name of the test. This should be a string starting with //! "TensorRT" and containing dot-separated strings containing //! the characters [A-Za-z0-9_]. //! For example, "TensorRT.sample_googlenet" //! \param[in] cmdline The command line used to reproduce the test // //! \return a TestAtom that can be used in Logger::reportTest{Start,End}(). //! static TestAtom defineTest(const std::string& name, const std::string& cmdline) { return TestAtom(false, name, cmdline); } //! //! \brief A convenience overloaded version of defineTest() that accepts an array of command-line arguments //! as input //! //! \param[in] name The name of the test //! \param[in] argc The number of command-line arguments //! \param[in] argv The array of command-line arguments (given as C strings) //! //! \return a TestAtom that can be used in Logger::reportTest{Start,End}(). static TestAtom defineTest(const std::string& name, int argc, char const* const* argv) { auto cmdline = genCmdlineString(argc, argv); return defineTest(name, cmdline); } //! //! \brief Report that a test has started. //! //! \pre reportTestStart() has not been called yet for the given testAtom //! //! \param[in] testAtom The handle to the test that has started //! 
static void reportTestStart(TestAtom& testAtom) { reportTestResult(testAtom, TestResult::kRUNNING); assert(!testAtom.mStarted); testAtom.mStarted = true; } //! //! \brief Report that a test has ended. //! //! \pre reportTestStart() has been called for the given testAtom //! //! \param[in] testAtom The handle to the test that has ended //! \param[in] result The result of the test. Should be one of TestResult::kPASSED, //! TestResult::kFAILED, TestResult::kWAIVED //! static void reportTestEnd(const TestAtom& testAtom, TestResult result) { assert(result != TestResult::kRUNNING); assert(testAtom.mStarted); reportTestResult(testAtom, result); } static int reportPass(const TestAtom& testAtom) { reportTestEnd(testAtom, TestResult::kPASSED); return EXIT_SUCCESS; } static int reportFail(const TestAtom& testAtom) { reportTestEnd(testAtom, TestResult::kFAILED); return EXIT_FAILURE; } static int reportWaive(const TestAtom& testAtom) { reportTestEnd(testAtom, TestResult::kWAIVED); return EXIT_SUCCESS; } static int reportTest(const TestAtom& testAtom, bool pass) { return pass ? reportPass(testAtom) : reportFail(testAtom); } Severity getReportableSeverity() const { return mReportableSeverity; } private: //! //! \brief returns an appropriate string for prefixing a log message with the given severity //! static const char* severityPrefix(Severity severity) { switch (severity) { case Severity::kINTERNAL_ERROR: return "[F] "; case Severity::kERROR: return "[E] "; case Severity::kWARNING: return "[W] "; case Severity::kINFO: return "[I] "; case Severity::kVERBOSE: return "[V] "; default: assert(0); return ""; } } //! //! \brief returns an appropriate string for prefixing a test result message with the given result //! 
static const char* testResultString(TestResult result) { switch (result) { case TestResult::kRUNNING: return "RUNNING"; case TestResult::kPASSED: return "PASSED"; case TestResult::kFAILED: return "FAILED"; case TestResult::kWAIVED: return "WAIVED"; default: assert(0); return ""; } } //! //! \brief returns an appropriate output stream (cout or cerr) to use with the given severity //! static std::ostream& severityOstream(Severity severity) { return severity >= Severity::kINFO ? std::cout : std::cerr; } //! //! \brief method that implements logging test results //! static void reportTestResult(const TestAtom& testAtom, TestResult result) { severityOstream(Severity::kINFO) << "&&&& " << testResultString(result) << " " << testAtom.mName << " # " << testAtom.mCmdline << std::endl; } //! //! \brief generate a command line string from the given (argc, argv) values //! static std::string genCmdlineString(int argc, char const* const* argv) { std::stringstream ss; for (int i = 0; i < argc; i++) { if (i > 0) { ss << " "; } ss << argv[i]; } return ss.str(); } Severity mReportableSeverity; }; namespace { //! //! \brief produces a LogStreamConsumer object that can be used to log messages of severity kVERBOSE //! //! Example usage: //! //! LOG_VERBOSE(logger) << "hello world" << std::endl; //! inline LogStreamConsumer LOG_VERBOSE(const Logger& logger) { return LogStreamConsumer(logger.getReportableSeverity(), Severity::kVERBOSE); } //! //! \brief produces a LogStreamConsumer object that can be used to log messages of severity kINFO //! //! Example usage: //! //! LOG_INFO(logger) << "hello world" << std::endl; //! inline LogStreamConsumer LOG_INFO(const Logger& logger) { return LogStreamConsumer(logger.getReportableSeverity(), Severity::kINFO); } //! //! \brief produces a LogStreamConsumer object that can be used to log messages of severity kWARNING //! //! Example usage: //! //! LOG_WARN(logger) << "hello world" << std::endl; //! 
inline LogStreamConsumer LOG_WARN(const Logger& logger) {
    return LogStreamConsumer(logger.getReportableSeverity(), Severity::kWARNING);
}

//!
//! \brief produces a LogStreamConsumer object that can be used to log messages of severity kERROR
//!
//! Example usage:
//!
//!     LOG_ERROR(logger) << "hello world" << std::endl;
//!
inline LogStreamConsumer LOG_ERROR(const Logger& logger) {
    return LogStreamConsumer(logger.getReportableSeverity(), Severity::kERROR);
}

//!
//! \brief produces a LogStreamConsumer object that can be used to log messages of severity kINTERNAL_ERROR
//!         ("fatal" severity)
//!
//! Example usage:
//!
//!     LOG_FATAL(logger) << "hello world" << std::endl;
//!
inline LogStreamConsumer LOG_FATAL(const Logger& logger) {
    return LogStreamConsumer(logger.getReportableSeverity(), Severity::kINTERNAL_ERROR);
}

}  // anonymous namespace

#endif  // TENSORRT_LOGGING_H

================================================
FILE: detr/CMakeLists.txt
================================================
cmake_minimum_required(VERSION 2.6)

project(detr)

# NOTE(review): C++11 is requested three times in this file (here, via
# CMAKE_CXX_STANDARD and again inside CMAKE_CXX_FLAGS) - confirm which one is intended.
add_definitions(-std=c++11)

# NOTE(review): option() normally takes a help string as its second argument;
# as written "OFF" sits in that position - confirm against the CMake docs.
option(CUDA_USE_STATIC_CUDA_RUNTIME OFF)
set(CMAKE_CXX_STANDARD 11)
set(CMAKE_BUILD_TYPE Debug)

find_package(CUDA REQUIRED)

include_directories(${PROJECT_SOURCE_DIR}/include)
# Include and link directories of CUDA and TensorRT are hard-coded below;
# adapt them if your installation lives elsewhere.
# cuda
include_directories(/usr/local/cuda/include)
link_directories(/usr/local/cuda/lib64)
# tensorrt
include_directories(/data/app/TensorRT-8.4.3.1/include)
link_directories(/data/app/TensorRT-8.4.3.1/lib)

set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11 -Wall -Ofast -Wfatal-errors -D_MWAITXINTRIN_H_INCLUDED")

find_package(OpenCV)
include_directories(${OpenCV_INCLUDE_DIRS})

# Single executable built from detr.cpp, linked against TensorRT, the CUDA runtime and OpenCV.
add_executable(detr ${PROJECT_SOURCE_DIR}/detr.cpp)
target_link_libraries(detr nvinfer)
target_link_libraries(detr cudart)
target_link_libraries(detr ${OpenCV_LIBS})

# NOTE(review): -O2 here and -Ofast in CMAKE_CXX_FLAGS are both passed - confirm which should win.
add_definitions(-O2 -pthread)

================================================
FILE: detr/README.md
================================================
# DETR

The PyTorch implementation is [facebookresearch/detr](https://github.com/facebookresearch/detr).

For details see [End-to-End Object Detection with Transformers](https://ai.facebook.com/research/publications/end-to-end-object-detection-with-transformers).

## Test Environment

- GTX2080Ti / Ubuntu16.04 / cuda10.2 / cudnn8.0.4 / TensorRT7.2.1 / OpenCV4.2
- GTX2080Ti / win10 / cuda10.2 / cudnn8.0.4 / TensorRT7.2.1 / OpenCV4.2 / VS2017

## How to Run

1. Generate the .wts file from the PyTorch .pth checkpoint

```
// git clone https://github.com/facebookresearch/detr.git
// go to facebookresearch/detr
// download https://dl.fbaipublicfiles.com/detr/detr-r50-e632da11.pth
// download https://raw.githubusercontent.com/freedenS/TestImage/main/demo.jpg
// copy tensorrtx/detr/gen_wts.py and demo.jpg into facebookresearch/detr
python gen_wts.py
// a file 'detr.wts' will be generated.
```

2. Build tensorrtx/detr and run

```
// put detr.wts into tensorrtx/detr
// go to tensorrtx/detr
// update the parameters in detr.cpp if your model was trained on a custom dataset. The parameters correspond to the config in detr.
mkdir build
cd build
cmake ..
make
sudo ./detr -s [.wts] [.engine] // serialize model to plan file
sudo ./detr -d [.engine] [image folder] // deserialize and run inference, the images in [image folder] will be processed
// For example
sudo ./detr -s ../detr.wts detr.engine
sudo ./detr -d detr.engine ../samples
```

3. Check the generated images, e.g. _demo.jpg and so on.
## Backbone

#### R50

```
1. download the pretrained model
  https://dl.fbaipublicfiles.com/detr/detr-r50-e632da11.pth
2. export wts
  set the first parameter of Backbone in gen_wts.py (line 23) to resnet50
  set the path of the pretrained model (line 87 in gen_wts.py)
3. set resnet_type in BuildResNet (line 546 in detr.cpp) to R50
```

#### R101

```
1. download the pretrained model
  https://dl.fbaipublicfiles.com/detr/detr-r101-2c7b67e5.pth
2. export wts
  set the first parameter of Backbone in gen_wts.py (line 23) to resnet101
  set the path of the pretrained model (line 87 in gen_wts.py)
3. set resnet_type in BuildResNet (line 546 in detr.cpp) to R101
```

## NOTE

- TensorRT uses a fixed input size; if the size of your data differs from the size the engine was built with, you need to adjust both your input data and the results.
- Image preprocessing in C++ differs slightly from Python (OpenCV vs PIL).

## Quantization

1. quantizationType: fp32, fp16, int8. See BuildDETRModel (detr.cpp line 613) for details.
2. int8 is used in the same way as in [tensorrtx/yolov5](../yolov5/README.md).
## Latency

average cost of doInference(in detr.cpp) from second time with batch=1 under the ubuntu environment above

| | fp32 | fp16 | int8 |
| ---- | ------- | ------- | ------ |
| R50 | 19.57ms | 9.424ms | 8.38ms |
| R101 | 30.82ms | 12.4ms | 9.59ms |

================================================
FILE: detr/backbone.hpp
================================================
#pragma once
#include
#include "common.hpp"

// Supported ResNet depths.
enum RESNETTYPE {
    R18 = 0,
    R34,
    R50,
    R101,
    R152
};

// Number of residual blocks in each of the four stages, keyed by ResNet depth.
const std::map> num_blocks_per_stage = {
    {R18, {2, 2, 2, 2}},
    {R34, {3, 4, 6, 3}},
    {R50, {3, 4, 6, 3}},
    {R101, {3, 4, 23, 3}},
    {R152, {3, 8, 36, 3}}
};

// Adds a batch-norm layer expressed as a per-channel IScaleLayer.
//
// Reads lname + ".weight" / ".bias" / ".running_mean" / ".running_var" from
// weightMap and folds them into
//     scale = gamma / sqrt(var + eps)
//     shift = beta - mean * gamma / sqrt(var + eps)
//     power = 1
// The three malloc'd buffers are stored back into weightMap under
// lname + ".scale" / ".shift" / ".power"; nothing in this function frees them.
// Returns the created scale layer.
IScaleLayer* addBatchNorm2d(
    INetworkDefinition *network,
    std::unordered_map& weightMap,
    ITensor& input,
    const std::string& lname,
    float eps = 1e-5
) {
    float *gamma = (float*)(weightMap[lname + ".weight"].values);
    float *beta = (float*)(weightMap[lname + ".bias"].values);
    float *mean = (float*)(weightMap[lname + ".running_mean"].values);
    float *var = (float*)(weightMap[lname + ".running_var"].values);
    // Channel count is taken from the running_var blob.
    int len = weightMap[lname + ".running_var"].count;

    float *scval = reinterpret_cast(malloc(sizeof(float) * len));
    for (int i = 0; i < len; i++) {
        scval[i] = gamma[i] / sqrt(var[i] + eps);
    }
    Weights scale{ DataType::kFLOAT, scval, len };

    float *shval = reinterpret_cast(malloc(sizeof(float) * len));
    for (int i = 0; i < len; i++) {
        shval[i] = beta[i] - mean[i] * gamma[i] / sqrt(var[i] + eps);
    }
    Weights shift{ DataType::kFLOAT, shval, len };

    float *pval = reinterpret_cast(malloc(sizeof(float) * len));
    for (int i = 0; i < len; i++) {
        pval[i] = 1.0;
    }
    Weights power{ DataType::kFLOAT, pval, len };

    // Keep the buffers reachable through the weight map.
    weightMap[lname + ".scale"] = scale;
    weightMap[lname + ".shift"] = shift;
    weightMap[lname + ".power"] = power;
    IScaleLayer* scale_1 = network->addScale(input, ScaleMode::kCHANNEL, shift, scale, power);
    assert(scale_1);
    return scale_1;
}

// Builds the network stem: 7x7 stride-2 convolution (padding 3, no bias),
// batch norm, ReLU, then 3x3 stride-2 max pooling (padding 1).
// Returns the pooling layer.
ILayer* BasicStem(
    INetworkDefinition *network,
    std::unordered_map& weightMap,
    const std::string& lname,
    ITensor& input,
    int
out_channels, int group_num = 1 ) { // conv1 Weights emptywts{ DataType::kFLOAT, nullptr, 0 }; IConvolutionLayer* conv1 = network->addConvolutionNd( input, out_channels, DimsHW{ 7, 7 }, weightMap[lname + ".conv1.weight"], emptywts); assert(conv1); conv1->setStrideNd(DimsHW{ 2, 2 }); conv1->setPaddingNd(DimsHW{ 3, 3 }); conv1->setNbGroups(group_num); auto bn1 = addBatchNorm2d(network, weightMap, *conv1->getOutput(0), lname + ".bn1"); assert(bn1); auto r1 = network->addActivation(*bn1->getOutput(0), ActivationType::kRELU); assert(r1); auto max_pool2d = network->addPoolingNd(*r1->getOutput(0), PoolingType::kMAX, DimsHW{ 3, 3 }); max_pool2d->setStrideNd(DimsHW{ 2, 2 }); max_pool2d->setPaddingNd(DimsHW{ 1, 1 }); auto mp_dim = max_pool2d->getOutput(0)->getDimensions(); return max_pool2d; } ITensor* BasicBlock( INetworkDefinition *network, std::unordered_map& weightMap, const std::string& lname, ITensor& input, int in_channels, int out_channels, int stride = 1 ) { // conv1 IConvolutionLayer* conv1 = network->addConvolutionNd( input, out_channels, DimsHW{ 3, 3 }, weightMap[lname + ".conv1.weight"], weightMap[lname + ".conv1.bias"]); assert(conv1); conv1->setStrideNd(DimsHW{ stride, stride }); conv1->setPaddingNd(DimsHW{ 1, 1 }); auto r1 = network->addActivation(*conv1->getOutput(0), ActivationType::kRELU); assert(r1); // conv2 IConvolutionLayer* conv2 = network->addConvolutionNd( *r1->getOutput(0), out_channels, DimsHW{ 3, 3 }, weightMap[lname + ".conv2.weight"], weightMap[lname + ".conv2.bias"]); assert(conv2); conv2->setStrideNd(DimsHW{ 1, 1 }); conv2->setPaddingNd(DimsHW{ 1, 1 }); // shortcut ITensor* shortcut_value = nullptr; if (in_channels != out_channels) { auto shortcut = network->addConvolutionNd( input, out_channels, DimsHW{ 1, 1 }, weightMap[lname + ".shortcut.weight"], weightMap[lname + ".shortcut.bias"]); assert(shortcut); shortcut->setStrideNd(DimsHW{ stride, stride }); shortcut_value = shortcut->getOutput(0); } else { shortcut_value = &input; } // add auto 
ew = network->addElementWise(*conv2->getOutput(0), *shortcut_value, ElementWiseOperation::kSUM);
    assert(ew);

    auto r3 = network->addActivation(*ew->getOutput(0), ActivationType::kRELU);
    assert(r3);

    return r3->getOutput(0);
}

// Builds a bottleneck residual block:
//   1x1 conv -> BN -> ReLU -> 3x3 conv (stride, dilation) -> BN -> ReLU
//   -> 1x1 conv -> BN, summed with the shortcut and followed by ReLU.
// All convolutions are bias-free (empty Weights) and use group_num groups.
// When in_channels != out_channels the shortcut is a strided 1x1 convolution
// plus batch norm read from lname + ".downsample.0" / ".downsample.1";
// otherwise the block input is added directly.
// Returns the output tensor of the final ReLU.
ITensor* BottleneckBlock(
    INetworkDefinition *network,
    std::unordered_map& weightMap,
    const std::string& lname,
    ITensor& input,
    int in_channels,
    int bottleneck_channels,
    int out_channels,
    int stride = 1,
    int dilation = 1,
    int group_num = 1
) {
    Weights emptywts{ DataType::kFLOAT, nullptr, 0 };
    // conv1
    IConvolutionLayer* conv1 = network->addConvolutionNd(
        input,
        bottleneck_channels,
        DimsHW{ 1, 1 },
        weightMap[lname + ".conv1.weight"],
        emptywts);
    assert(conv1);
    conv1->setStrideNd(DimsHW{ 1, 1 });
    conv1->setNbGroups(group_num);

    auto bn1 = addBatchNorm2d(network, weightMap, *conv1->getOutput(0), lname + ".bn1");
    assert(bn1);

    auto r1 = network->addActivation(*bn1->getOutput(0), ActivationType::kRELU);
    assert(r1);

    // conv2: the only convolution that carries the block stride and dilation;
    // padding equals the dilation.
    IConvolutionLayer* conv2 = network->addConvolutionNd(
        *r1->getOutput(0),
        bottleneck_channels,
        DimsHW{ 3, 3 },
        weightMap[lname + ".conv2.weight"],
        emptywts);
    assert(conv2);
    conv2->setStrideNd(DimsHW{ stride, stride });
    conv2->setPaddingNd(DimsHW{ 1 * dilation, 1 * dilation });
    conv2->setDilationNd(DimsHW{ dilation, dilation });
    conv2->setNbGroups(group_num);

    auto bn2 = addBatchNorm2d(network, weightMap, *conv2->getOutput(0), lname + ".bn2");
    assert(bn2);

    auto r2 = network->addActivation(*bn2->getOutput(0), ActivationType::kRELU);
    assert(r2);

    // conv3
    IConvolutionLayer* conv3 = network->addConvolutionNd(
        *r2->getOutput(0),
        out_channels,
        DimsHW{ 1, 1 },
        weightMap[lname + ".conv3.weight"],
        emptywts);
    assert(conv3);
    conv3->setStrideNd(DimsHW{ 1, 1 });
    conv3->setNbGroups(group_num);

    auto bn3 = addBatchNorm2d(network, weightMap, *conv3->getOutput(0), lname + ".bn3");
    assert(bn3);

    // shortcut
    ITensor* shortcut_value = nullptr;
    if (in_channels != out_channels) {
        auto shortcut = network->addConvolutionNd(
            input,
            out_channels,
            DimsHW{ 1, 1 },
            weightMap[lname + ".downsample.0.weight"],
            emptywts);
        assert(shortcut);
        shortcut->setStrideNd(DimsHW{stride, stride});
        shortcut->setNbGroups(group_num);

        auto shortcut_bn = addBatchNorm2d(network, weightMap, *shortcut->getOutput(0), lname + ".downsample.1");
        assert(shortcut_bn);
        shortcut_value = shortcut_bn->getOutput(0);
    } else {
        shortcut_value = &input;
    }

    // add
    auto ew = network->addElementWise(*bn3->getOutput(0), *shortcut_value, ElementWiseOperation::kSUM);
    assert(ew);

    auto r3 = network->addActivation(*ew->getOutput(0), ActivationType::kRELU);
    assert(r3);

    return r3->getOutput(0);
}

// Chains `stage` residual blocks named lname + ".<i>".
// R18/R34 use BasicBlock, every other type uses BottleneckBlock.
// Only the first block receives first_stride; later blocks use stride 1 and
// take out_channels as their input channel count.
// Returns the output tensor of the last block (or &input when stage == 0).
ITensor* MakeStage(
    INetworkDefinition *network,
    std::unordered_map& weightMap,
    const std::string& lname,
    ITensor& input,
    int stage,
    RESNETTYPE resnet_type,
    int in_channels,
    int bottleneck_channels,
    int out_channels,
    int first_stride = 1,
    int dilation = 1
) {
    ITensor* out = &input;
    for (int i = 0; i < stage; i++) {
        std::string layerName = lname + "." + std::to_string(i);
        int stride = i == 0 ? first_stride : 1;

        if (resnet_type == R18 || resnet_type == R34)
            out = BasicBlock(network, weightMap, layerName, *out, in_channels, out_channels, stride);
        else
            out = BottleneckBlock(
                network,
                weightMap,
                layerName,
                *out,
                in_channels,
                bottleneck_channels,
                out_channels,
                stride,
                dilation);

        // After the first block the channel count no longer changes.
        in_channels = out_channels;
    }
    return out;
}

// Builds the ResNet backbone: the stem followed by four stages read from
// "backbone.0.body.layer1" .. "backbone.0.body.layer4".
// bottleneck_channels and out_channels double after every stage.
// The first stage uses stride 1; the last stage also uses stride 1 when
// res5_dilation == 2, every other stage uses stride 2.
// Returns the output tensor of the last stage.
ITensor* BuildResNet(
    INetworkDefinition *network,
    std::unordered_map& weightMap,
    ITensor& input,
    RESNETTYPE resnet_type,
    int stem_out_channels,
    int bottleneck_channels,
    int res2_out_channels,
    int res5_dilation = 1
) {
    assert(res5_dilation == 1 || res5_dilation == 2);  // "res5_dilation must be 1 or 2"
    if (resnet_type == R18 || resnet_type == R34) {
        assert(res2_out_channels == 64);  // "res2_out_channels must be 64 for R18/R34")
        assert(res5_dilation == 1);  // "res5_dilation must be 1 for R18/R34")
    }

    int out_channels = res2_out_channels;
    ITensor* out = nullptr;
    // stem
    auto stem = BasicStem(network, weightMap, "backbone.0.body", input, stem_out_channels);
    out =
stem->getOutput(0);

    // res
    for (int i = 0; i < 4; i++) {
        // Dilation is only applied to the last stage.
        int dilation = (i == 3) ? res5_dilation : 1;
        int first_stride = (i == 0 || (i == 3 && dilation == 2)) ? 1 : 2;
        out = MakeStage(
            network,
            weightMap,
            "backbone.0.body.layer" + std::to_string(i + 1),
            *out,
            num_blocks_per_stage.at(resnet_type)[i],
            resnet_type,
            stem_out_channels,
            bottleneck_channels,
            out_channels,
            first_stride,
            dilation);
        // The next stage consumes this stage's output channels.
        stem_out_channels = out_channels;
        bottleneck_channels *= 2;
        out_channels *= 2;
    }
    return out;
}

================================================
FILE: detr/calibrator.hpp
================================================
#pragma once
#include "NvInfer.h"
#include
#include
#include
#include
#include
#include
#include "common.hpp"
#include "macros.h"

//! \class Int8EntropyCalibrator2
//!
//! \brief Implements Entropy calibrator 2.
//!  CalibrationAlgoType is kENTROPY_CALIBRATION_2.
//!
//! Feeds batches of preprocessed images from a directory to the builder and
//! reads/writes the calibration cache file named by calib_table_name.
//!
class Int8EntropyCalibrator2 : public nvinfer1::IInt8EntropyCalibrator2 {
 public:
    Int8EntropyCalibrator2(int batchsize, int input_w, int input_h, const char* img_dir,
        const char* calib_table_name, const char* input_blob_name, bool read_cache = true);

    virtual ~Int8EntropyCalibrator2();
    int getBatchSize() const TRT_NOEXCEPT override;
    bool getBatch(void* bindings[], const char* names[], int nbBindings) TRT_NOEXCEPT override;
    const void* readCalibrationCache(size_t& length) TRT_NOEXCEPT override;
    void writeCalibrationCache(const void* cache, size_t length) TRT_NOEXCEPT override;

 private:
    int batchsize_;
    int input_w_;
    int input_h_;
    int img_idx_;  // index of the next image to feed
    std::string img_dir_;
    std::vector img_files_;  // file names (without directory) found in img_dir_
    size_t input_count_;  // floats per batch: 3 * input_w * input_h * batchsize
    std::string calib_table_name_;
    const char* input_blob_name_;
    bool read_cache_;
    void* device_input_;  // device buffer holding one batch
    std::vector calib_cache_;
};

// Stores the configuration, allocates the device buffer for one batch and
// lists the calibration images found in img_dir.
Int8EntropyCalibrator2::Int8EntropyCalibrator2(int batchsize, int input_w, int input_h, const char* img_dir,
    const char* calib_table_name, const char* input_blob_name, bool read_cache)
    : batchsize_(batchsize)
    , input_w_(input_w)
    , input_h_(input_h)
    , img_idx_(0)
    ,
img_dir_(img_dir)
    , calib_table_name_(calib_table_name)
    , input_blob_name_(input_blob_name)
    , read_cache_(read_cache) {
    // One batch of planar float images with 3 channels each.
    input_count_ = 3 * input_w * input_h * batchsize;
    CUDA_CHECK(cudaMalloc(&device_input_, input_count_ * sizeof(float)));
    // NOTE(review): the return code (-1 when the directory cannot be opened) is ignored.
    read_files_in_dir(img_dir, img_files_);
}

// Releases the device buffer allocated in the constructor.
Int8EntropyCalibrator2::~Int8EntropyCalibrator2() {
    CUDA_CHECK(cudaFree(device_input_));
}

// Returns the batch size given at construction.
int Int8EntropyCalibrator2::getBatchSize() const TRT_NOEXCEPT {
    return batchsize_;
}

// Loads the next batchsize_ images, preprocesses them, packs them as planar
// (channel-major) floats and uploads them to the device buffer, which is then
// published through bindings[0].
// Returns false when fewer than batchsize_ images remain or an image cannot
// be read.
bool Int8EntropyCalibrator2::getBatch(void* bindings[], const char* names[], int nbBindings) TRT_NOEXCEPT {
    if (img_idx_ + batchsize_ > static_cast<int>(img_files_.size())) {
        return false;
    }

    const int plane_size = input_h_ * input_w_;
    std::vector<float> input_imgs_(input_count_, 0);
    for (int i = img_idx_; i < img_idx_ + batchsize_; i++) {
        std::cout << img_files_[i] << "  " << i << std::endl;
        // img_dir_ and the file name are concatenated as-is, so img_dir_ has to
        // end with a path separator.
        cv::Mat temp = cv::imread(img_dir_ + img_files_[i]);
        if (temp.empty()) {
            std::cerr << "Fatal error: image cannot open!" << std::endl;
            return false;
        }
        // preprocessImg takes (img, newh, neww). The height must come first;
        // otherwise the image is resized to input_h_ x input_w_ transposed and
        // the (h, w) indexing below walks outside the rows.
        preprocessImg(temp, input_h_, input_w_);
        float* dst = input_imgs_.data() + (i - img_idx_) * plane_size * 3;
        for (int c = 0; c < 3; c++) {
            for (int h = 0; h < input_h_; h++) {
                for (int w = 0; w < input_w_; w++) {
                    dst[c * plane_size + h * input_w_ + w] = temp.at<cv::Vec3f>(h, w)[c];
                }
            }
        }
    }
    img_idx_ += batchsize_;

    CUDA_CHECK(cudaMemcpy(device_input_, input_imgs_.data(), input_count_ * sizeof(float), cudaMemcpyHostToDevice));
    assert(!strcmp(names[0], input_blob_name_));
    bindings[0] = device_input_;
    return true;
}

// Reads the calibration cache file into calib_cache_ when read_cache_ is set
// and the file can be opened. Sets length to the number of bytes read and
// returns a pointer to them, or nullptr when nothing was read.
const void* Int8EntropyCalibrator2::readCalibrationCache(size_t& length) TRT_NOEXCEPT {
    std::cout << "reading calib cache: " << calib_table_name_ << std::endl;
    calib_cache_.clear();
    std::ifstream input(calib_table_name_, std::ios::binary);
    // Keep whitespace bytes: the cache is copied byte for byte.
    input >> std::noskipws;
    if (read_cache_ && input.good()) {
        std::copy(std::istream_iterator<char>(input), std::istream_iterator<char>(), std::back_inserter(calib_cache_));
    }
    length = calib_cache_.size();
    return length ?
calib_cache_.data() : nullptr; } void Int8EntropyCalibrator2::writeCalibrationCache(const void* cache, size_t length) TRT_NOEXCEPT { std::cout << "writing calib cache: " << calib_table_name_ << " size: " << length << std::endl; std::ofstream output(calib_table_name_, std::ios::binary); output.write(reinterpret_cast(cache), length); } ================================================ FILE: detr/common.hpp ================================================ #pragma once #include #include #include #include #include #include #include #include #include "./logging.h" #include #include static Logger gLogger; using namespace nvinfer1; void loadWeights(const std::string file, std::unordered_map& weightMap) { std::cout << "Loading weights: " << file << std::endl; // Open weights file std::ifstream input(file); assert(input.is_open() && "Unable to load weight file. please check if the .wts file path is right!!!!!!"); // Read number of weight blobs int32_t count; input >> count; assert(count > 0 && "Invalid weight map file."); while (count--) { Weights wt{ DataType::kFLOAT, nullptr, 0 }; uint32_t size; // Read name and type of blob std::string name; input >> name >> std::dec >> size; wt.type = DataType::kFLOAT; // Load blob uint32_t* val = reinterpret_cast(malloc(sizeof(val) * size)); for (uint32_t x = 0, y = size; x < y; ++x) { input >> std::hex >> val[x]; } wt.values = val; wt.count = size; weightMap[name] = wt; } } int CalculateSize(Dims a) { int res = 1; for (int i = 0; i < a.nbDims; i++) { res *= a.d[i]; } return res; } static inline int read_files_in_dir(const char *p_dir_name, std::vector &file_names) { DIR *p_dir = opendir(p_dir_name); if (p_dir == nullptr) { return -1; } struct dirent* p_file = nullptr; while ((p_file = readdir(p_dir)) != nullptr) { if (strcmp(p_file->d_name, ".") != 0 && strcmp(p_file->d_name, "..") != 0) { // std::string cur_file_name(p_dir_name); // cur_file_name += "/"; // cur_file_name += p_file->d_name; std::string cur_file_name(p_file->d_name); 
file_names.push_back(cur_file_name);
        }
    }

    closedir(p_dir);
    return 0;
}

// Preprocesses img in place: BGR -> RGB, resize to neww x newh, convert to
// 32-bit float, scale to [0, 1] by dividing by 255, then subtract the
// per-channel mean and divide by the per-channel standard deviation.
// Note the argument order: height first, then width.
void preprocessImg(cv::Mat& img, int newh, int neww) {
    // convert to rgb
    cv::cvtColor(img, img, cv::COLOR_BGR2RGB);
    cv::resize(img, img, cv::Size(neww, newh));
    img.convertTo(img, CV_32FC3);
    img /= 255;
    img -= cv::Scalar(0.485, 0.456, 0.406);
    img /= cv::Scalar(0.229, 0.224, 0.225);
}

// Evaluates a CUDA runtime call; on failure prints the error code together
// with file and line to std::cerr and hits assert(0).
#ifndef CUDA_CHECK
#define CUDA_CHECK(callstr)\
    {\
        cudaError_t error_code = callstr;\
        if (error_code != cudaSuccess) {\
            std::cerr << "CUDA error " << error_code << " at " << __FILE__ << ":" << __LINE__;\
            assert(0);\
        }\
    }
#endif  // CUDA_CHECK

================================================
FILE: detr/detr.cpp
================================================
#pragma once
#include
#include
#include "./logging.h"
#include "backbone.hpp"
#include "calibrator.hpp"

#define DEVICE 0
#define BATCH_SIZE 1

// 1 / math.sqrt(head_dim) https://github.com/pytorch/pytorch/blob/master/torch/csrc/api/include/torch/nn/functional/activation.h#623
// (0.17677... equals 1 / sqrt(32), i.e. D_MODEL / NHEAD = 256 / 8)
static const float SCALING = 0.17677669529663687;
static const int INPUT_H = 800;
static const int INPUT_W = 1066;
static const int NUM_CLASS = 92;  // include background
// Constants used to express x^2 as a scale layer and the LayerNorm epsilon.
static const float SCALING_ONE = 1.0;
static const float SHIFT_ZERO = 0.0;
static const float POWER_TWO = 2.0;
static const float EPS = 0.00001;
// Transformer hyper-parameters.
static const int D_MODEL = 256;
static const int NHEAD = 8;
static const int DIM_FEEDFORWARD = 2048;
static const int NUM_ENCODE_LAYERS = 6;
static const int NUM_DECODE_LAYERS = 6;
static const int NUM_QUERIES = 100;
static const float SCORE_THRESH = 0.5;
const char* INPUT_NODE_NAME = "images";
const std::vector OUTPUT_NAMES = { "scores", "boxes"};

// Computes the sine/cosine position embedding on the host and adds it to the
// network as a constant of shape {h * w, 2 * num_pos_feats, 1, 1}, where h and
// w are dimensions 1 and 2 of input. The buffer is stored in weightMap["pos"].
// Returns the output tensor of the constant layer.
ITensor* PositionEmbeddingSine(
    INetworkDefinition *network,
    std::unordered_map& weightMap,
    ITensor& input,
    int num_pos_feats = 64,
    int temperature = 10000
) {
    // refer to https://github.com/facebookresearch/detr/blob/master/models/position_encoding.py#12
    // TODO: improve this implementation
    auto mask_dim = input.getDimensions();
    int h =
mask_dim.d[1], w = mask_dim.d[2];
    // y_embed[i][j] = i + 1 and x_embed[i][j] = j + 1 (1-based row / column index).
    std::vector> y_embed(h);
    for (int i = 0; i < h; i++)
        y_embed[i] = std::vector(w, i + 1);
    std::vector sub_embed(w, 0);
    for (int i = 0; i < w; i++)
        sub_embed[i] = i + 1;
    std::vector> x_embed(h, sub_embed);

    // normalize: map the indices to (0, 2*pi]
    float eps = 1e-6, scale = 2.0 * 3.1415926;
    for (int i = 0; i < h; i++) {
        for (int j = 0; j < w; j++) {
            y_embed[i][j] = y_embed[i][j] / (h + eps) * scale;
            x_embed[i][j] = x_embed[i][j] / (w + eps) * scale;
        }
    }

    // dim_t: temperature^(2 * floor(i / 2) / num_pos_feats); i / 2 is an integer division
    std::vector dim_t(num_pos_feats, 0);
    for (int i = 0; i < num_pos_feats; i++) {
        dim_t[i] = pow(temperature, (2 * (i / 2) / static_cast(num_pos_feats)));
    }

    // pos_x, pos_y: sine on even feature indices, cosine on odd ones
    std::vector>> pos_x(h, std::vector>(w, std::vector(num_pos_feats, 0)));
    std::vector>> pos_y(h, std::vector>(w, std::vector(num_pos_feats, 0)));
    for (int i = 0; i < h; i++) {
        for (int j = 0; j < w; j++) {
            for (int k = 0; k < num_pos_feats; k++) {
                float value_x = x_embed[i][j] / dim_t[k];
                float value_y = y_embed[i][j] / dim_t[k];
                if (k & 1) {
                    pos_x[i][j][k] = std::cos(value_x);
                    pos_y[i][j][k] = std::cos(value_y);
                } else {
                    pos_x[i][j][k] = std::sin(value_x);
                    pos_y[i][j][k] = std::sin(value_y);
                }
            }
        }
    }

    // pos: for every (i, j) the y features are written first, then the x features
    float *pval = reinterpret_cast(malloc(sizeof(float) * h * w * num_pos_feats * 2));
    float *pNext = pval;
    for (int i = 0; i < h; i++) {
        for (int j = 0; j < w; j++) {
            for (int k = 0; k < num_pos_feats; k++) {
                *pNext = pos_y[i][j][k];
                ++pNext;
            }
            for (int k = 0; k < num_pos_feats; k++) {
                *pNext = pos_x[i][j][k];
                ++pNext;
            }
        }
    }
    Weights pos_embed_weight{ DataType::kFLOAT, pval, h * w * num_pos_feats * 2 };
    // Keep the host buffer reachable through the weight map.
    weightMap["pos"] = pos_embed_weight;
    auto pos_embed = network->addConstant(Dims4{ h * w, num_pos_feats * 2, 1, 1 }, pos_embed_weight);
    assert(pos_embed);
    return pos_embed->getOutput(0);
}

// Builds multi-head attention from TensorRT layers.
//
// query / key / value are projected with fully connected layers
// (lname + ".in_proj_weight_{q,k,v}" / ".in_proj_bias_{q,k,v}"), the projected
// query is multiplied by SCALING, all three are reshaped to
// {-1, num_heads, head_dim} and transposed to head-major order. The result is
// softmax(q * k^T) * v, transposed back, reshaped to {tgt_len, -1, 1, 1} and
// passed through the output projection (lname + ".out_proj.*").
// Returns the output tensor of the output projection.
// NOTE(review): SCALING is a file-level constant equal to 1/sqrt(32); it is not
// derived from embed_dim / num_heads, so other values need a matching constant.
ITensor* MultiHeadAttention(
    INetworkDefinition *network,
    std::unordered_map& weightMap,
    const std::string& lname,
    ITensor& query,
    ITensor& key,
    ITensor& value,
    int embed_dim = 256,
    int num_heads = 8
) {
    int tgt_len = query.getDimensions().d[0];
    int head_dim = embed_dim / num_heads;

    // q
    auto linear_q = network->addFullyConnected(
        query,
        embed_dim,
        weightMap[lname + ".in_proj_weight_q"],
        weightMap[lname + ".in_proj_bias_q"]);
    assert(linear_q);

    // k
    auto linear_k = network->addFullyConnected(
        key,
        embed_dim,
        weightMap[lname + ".in_proj_weight_k"],
        weightMap[lname + ".in_proj_bias_k"]);
    assert(linear_k);

    // v
    auto linear_v = network->addFullyConnected(
        value,
        embed_dim,
        weightMap[lname + ".in_proj_weight_v"],
        weightMap[lname + ".in_proj_bias_v"]);
    assert(linear_v);

    // Scale the query by the constant SCALING through an element-wise product.
    auto scaling_t = network->addConstant(Dims4{ 1, 1, 1, 1 }, Weights{ DataType::kFLOAT, &SCALING, 1 });
    assert(scaling_t);
    auto q_scaling = network->addElementWise(
        *linear_q->getOutput(0),
        *scaling_t->getOutput(0),
        ElementWiseOperation::kPROD);
    assert(q_scaling);

    // Reshape to {-1, num_heads, head_dim}, then swap the first two dimensions.
    auto q_shuffle = network->addShuffle(*q_scaling->getOutput(0));
    assert(q_shuffle);
    q_shuffle->setName((lname + ".q_shuffle").c_str());
    q_shuffle->setReshapeDimensions(Dims3{ -1, num_heads, head_dim });
    q_shuffle->setSecondTranspose(Permutation{1, 0, 2});

    auto k_shuffle = network->addShuffle(*linear_k->getOutput(0));
    assert(k_shuffle);
    k_shuffle->setName((lname + ".k_shuffle").c_str());
    k_shuffle->setReshapeDimensions(Dims3{ -1, num_heads, head_dim });
    k_shuffle->setSecondTranspose(Permutation{ 1, 0, 2 });

    auto v_shuffle = network->addShuffle(*linear_v->getOutput(0));
    assert(v_shuffle);
    v_shuffle->setName((lname + ".v_shuffle").c_str());
    v_shuffle->setReshapeDimensions(Dims3{ -1, num_heads, head_dim });
    v_shuffle->setSecondTranspose(Permutation{ 1, 0, 2 });

    // q * k^T; TensorRT 8 takes MatrixOperation enums, older versions take bools.
#if NV_TENSORRT_MAJOR >= 8
    auto q_product_k = network->addMatrixMultiply(*q_shuffle->getOutput(0), nvinfer1::MatrixOperation::kNONE, *k_shuffle->getOutput(0), nvinfer1::MatrixOperation::kTRANSPOSE);
#else
    auto q_product_k = network->addMatrixMultiply(*q_shuffle->getOutput(0), false, *k_shuffle->getOutput(0), true);
#endif
    assert(q_product_k);

    // src_key_padding_mask are all false, so do nothing here
    // see https://github.com/pytorch/pytorch/blob/master/torch/csrc/api/include/torch/nn/functional/activation.h#826-#839

    auto softmax = network->addSoftMax(*q_product_k->getOutput(0));
    assert(softmax);
    // NOTE(review): setAxes presumably takes a bit mask, so 4 (1 << 2) would select dimension 2 - confirm.
    softmax->setAxes(4);

#if NV_TENSORRT_MAJOR >= 8
    auto attn_product_v = network->addMatrixMultiply(*softmax->getOutput(0), nvinfer1::MatrixOperation::kNONE, *v_shuffle->getOutput(0), nvinfer1::MatrixOperation::kNONE);
#else
    auto attn_product_v = network->addMatrixMultiply(*softmax->getOutput(0), false, *v_shuffle->getOutput(0), false);
#endif
    assert(attn_product_v);

    // Undo the head-major transpose and merge the heads again.
    auto attn_shuffle = network->addShuffle(*attn_product_v->getOutput(0));
    assert(attn_shuffle);
    attn_shuffle->setName((lname + ".attn_shuffle").c_str());
    attn_shuffle->setFirstTranspose(Permutation{ 1, 0, 2 });
    attn_shuffle->setReshapeDimensions(Dims4{ tgt_len, -1, 1, 1 });

    auto linear_attn = network->addFullyConnected(
        *attn_shuffle->getOutput(0),
        embed_dim,
        weightMap[lname + ".out_proj.weight"],
        weightMap[lname + ".out_proj.bias"]);
    assert(linear_attn);

    return linear_attn->getOutput(0);
}

// Builds layer normalisation from primitive layers:
//   (x - mean(x)) / sqrt(mean((x - mean(x))^2) + EPS), followed by a
//   per-channel scale layer using lname + ".weight" and lname + ".bias".
// The square is implemented as a uniform scale layer with power 2.
// A buffer of d_model ones is allocated and stored in weightMap as lname + ".power".
// Returns the output tensor of the scale layer.
ITensor* LayerNorm(
    INetworkDefinition *network,
    ITensor& input,
    std::unordered_map& weightMap,
    const std::string& lname,
    int d_model = 256
) {
    // TODO: maybe a better implementation https://github.com/NVIDIA/TensorRT/blob/master/plugin/common/common.cuh#212
    // NOTE(review): the reduce-axes argument 2 is presumably a bit mask selecting dimension 1 - confirm.
    auto mean = network->addReduce(input, ReduceOperation::kAVG, 2, true);
    assert(mean);

    auto sub_mean = network->addElementWise(input, *mean->getOutput(0), ElementWiseOperation::kSUB);
    assert(sub_mean);

    // implement pow2 with scale
    Weights scale{ DataType::kFLOAT, &SCALING_ONE, 1 };
    Weights shift{ DataType::kFLOAT, &SHIFT_ZERO, 1 };
    Weights power{ DataType::kFLOAT, &POWER_TWO, 1 };
    auto pow2 = network->addScaleNd(*sub_mean->getOutput(0), ScaleMode::kUNIFORM, shift, scale, power, 0);
    assert(pow2);

    auto pow_mean = network->addReduce(*pow2->getOutput(0), ReduceOperation::kAVG, 2, true);
    assert(pow_mean);

    auto eps = network->addConstant(Dims4{ 1, 1, 1, 1 }, Weights{ DataType::kFLOAT,
&EPS, 1 });
    assert(eps);

    auto add_eps = network->addElementWise(*pow_mean->getOutput(0), *eps->getOutput(0), ElementWiseOperation::kSUM);
    assert(add_eps);

    // The local named `sqrt` is the unary square-root layer, not the math function.
    auto sqrt = network->addUnary(*add_eps->getOutput(0), UnaryOperation::kSQRT);
    assert(sqrt);

    auto div = network->addElementWise(*sub_mean->getOutput(0), *sqrt->getOutput(0), ElementWiseOperation::kDIV);
    assert(div);

    // Power weights of all ones for the final per-channel scale layer.
    float *pval = reinterpret_cast(malloc(sizeof(float) * d_model));
    for (int i = 0; i < d_model; i++) {
        pval[i] = 1.0;
    }
    Weights norm1_power{ DataType::kFLOAT, pval, d_model };
    // Keep the buffer reachable through the weight map.
    weightMap[lname + ".power"] = norm1_power;
    auto affine = network->addScaleNd(
        *div->getOutput(0),
        ScaleMode::kCHANNEL,
        weightMap[lname + ".bias"],
        weightMap[lname + ".weight"],
        norm1_power,
        1);
    assert(affine);
    return affine->getOutput(0);
}

// Builds one transformer encoder layer:
//   q = k = src + pos, v = src -> self attention -> residual add -> LayerNorm
//   -> linear1 -> ReLU -> linear2 -> residual add -> LayerNorm.
// Weights are read from lname + ".self_attn", ".norm1", ".linear1",
// ".linear2" and ".norm2". Returns the output of the second LayerNorm.
// NOTE(review): d_model is forwarded to MultiHeadAttention and linear2 but not
// to LayerNorm, which falls back to its default of 256.
ITensor* TransformerEncoderLayer(
    INetworkDefinition *network,
    std::unordered_map& weightMap,
    const std::string& lname,
    ITensor& src,
    ITensor& pos,
    int d_model = 256,
    int nhead = 8,
    int dim_feedforward = 2048
) {
    auto pos_embed = network->addElementWise(src, pos, ElementWiseOperation::kSUM);
    assert(pos_embed);

    ITensor* src2 = MultiHeadAttention(
        network,
        weightMap,
        lname + ".self_attn",
        *pos_embed->getOutput(0),
        *pos_embed->getOutput(0),
        src,
        d_model,
        nhead);

    auto shortcut1 = network->addElementWise(src, *src2, ElementWiseOperation::kSUM);
    assert(shortcut1);

    ITensor* norm1 = LayerNorm(network, *shortcut1->getOutput(0), weightMap, lname + ".norm1");

    // Feed-forward part: linear1 -> ReLU -> linear2.
    auto linear1 = network->addFullyConnected(
        *norm1,
        dim_feedforward,
        weightMap[lname + ".linear1.weight"],
        weightMap[lname + ".linear1.bias"]);
    assert(linear1);

    auto relu = network->addActivation(*linear1->getOutput(0), ActivationType::kRELU);
    assert(relu);

    auto linear2 = network->addFullyConnected(
        *relu->getOutput(0),
        d_model,
        weightMap[lname + ".linear2.weight"],
        weightMap[lname + ".linear2.bias"]);
    assert(linear2);

    auto shortcut2 = network->addElementWise(*norm1, *linear2->getOutput(0), ElementWiseOperation::kSUM);
    assert(shortcut2);

    ITensor* norm2
= LayerNorm(network, *shortcut2->getOutput(0), weightMap, lname + ".norm2");
    return norm2;
}

// Stacks num_layers encoder layers named lname + ".layers.<i>"; every layer
// receives the same pos tensor. Returns the output of the last layer
// (or &src when num_layers == 0).
// Each layer is built with TransformerEncoderLayer's default d_model / nhead /
// dim_feedforward, since none are passed here.
ITensor* TransformerEncoder(
    INetworkDefinition *network,
    std::unordered_map& weightMap,
    const std::string& lname,
    ITensor& src,
    ITensor& pos,
    int num_layers = 6
) {
    ITensor* out = &src;
    for (int i = 0; i < num_layers; i++) {
        std::string layer_name = lname + ".layers." + std::to_string(i);
        out = TransformerEncoderLayer(network, weightMap, layer_name, *out, pos);
    }
    return out;
}

// Builds one transformer decoder layer:
//   self attention   (q = k = tgt + query_pos, v = tgt)          -> add -> LayerNorm
//   cross attention  (q = norm1 + query_pos, k = memory + pos,
//                     v = memory)                                -> add -> LayerNorm
//   feed forward     (linear1 -> ReLU -> linear2)                -> add -> LayerNorm
// Weights are read from lname + ".self_attn", ".multihead_attn", ".norm1..3",
// ".linear1" and ".linear2". Returns the output of the third LayerNorm.
// NOTE(review): d_model and nhead are not forwarded to MultiHeadAttention or
// LayerNorm here; those calls use their defaults (256 / 8).
ITensor* TransformerDecoderLayer(
    INetworkDefinition *network,
    std::unordered_map& weightMap,
    const std::string& lname,
    ITensor& tgt,
    ITensor& memory,
    ITensor& pos,
    ITensor& query_pos,
    int d_model = 256,
    int nhead = 8,
    int dim_feedforward = 2048
) {
    auto pos_embed = network->addElementWise(tgt, query_pos, ElementWiseOperation::kSUM);
    assert(pos_embed);

    ITensor* tgt2 = MultiHeadAttention(
        network,
        weightMap,
        lname + ".self_attn",
        *pos_embed->getOutput(0),
        *pos_embed->getOutput(0),
        tgt);

    auto shortcut1 = network->addElementWise(tgt, *tgt2, ElementWiseOperation::kSUM);
    assert(shortcut1);

    ITensor* norm1 = LayerNorm(network, *shortcut1->getOutput(0), weightMap, lname + ".norm1");

    // Cross attention between the decoder state and the encoder memory.
    auto query_embed = network->addElementWise(*norm1, query_pos, ElementWiseOperation::kSUM);
    assert(query_embed);

    auto key_embed = network->addElementWise(memory, pos, ElementWiseOperation::kSUM);
    assert(key_embed);

    ITensor* mha2 = MultiHeadAttention(
        network,
        weightMap,
        lname + ".multihead_attn",
        *query_embed->getOutput(0),
        *key_embed->getOutput(0),
        memory);

    auto shortcut2 = network->addElementWise(*norm1, *mha2, ElementWiseOperation::kSUM);
    assert(shortcut2);

    ITensor* norm2 = LayerNorm(network, *shortcut2->getOutput(0), weightMap, lname + ".norm2");

    // Feed-forward part: linear1 -> ReLU -> linear2.
    auto linear1 = network->addFullyConnected(
        *norm2,
        dim_feedforward,
        weightMap[lname + ".linear1.weight"],
        weightMap[lname + ".linear1.bias"]);
    assert(linear1);

    auto relu = network->addActivation(*linear1->getOutput(0), ActivationType::kRELU);
    assert(relu);

    auto linear2 =
network->addFullyConnected( *relu->getOutput(0), d_model, weightMap[lname + ".linear2.weight"], weightMap[lname + ".linear2.bias"]); assert(linear2); auto shortcut3 = network->addElementWise(*norm2, *linear2->getOutput(0), ElementWiseOperation::kSUM); assert(shortcut3); ITensor* norm3 = LayerNorm(network, *shortcut3->getOutput(0), weightMap, lname + ".norm3"); return norm3; } ITensor* TransformerDecoder( INetworkDefinition *network, std::unordered_map& weightMap, const std::string& lname, ITensor& tgt, ITensor& memory, ITensor& pos, ITensor& query_pos, int num_layers = 6, int d_model = 256, int nhead = 8, int dim_feedforward = 2048 ) { ITensor* out = &tgt; for (int i = 0; i < num_layers; i++) { std::string layer_name = lname + ".layers." + std::to_string(i); out = TransformerDecoderLayer( network, weightMap, layer_name, *out, memory, pos, query_pos, d_model, nhead, dim_feedforward); } ITensor* norm = LayerNorm(network, *out, weightMap, lname + ".norm", d_model); return norm; } ITensor* Transformer( INetworkDefinition *network, std::unordered_map& weightMap, const std::string& lname, ITensor& src, ITensor& pos_embed, int num_queries = 100, int num_encoder_layers = 6, int num_decoder_layers = 6, int d_model = 256, int nhead = 8, int dim_feedforward = 2048 ) { auto memory = TransformerEncoder(network, weightMap, lname + ".encoder", src, pos_embed, num_encoder_layers); // construct tgt float *pval = reinterpret_cast(malloc(sizeof(float) * num_queries * d_model)); for (int i = 0; i < num_queries * d_model; i++) { pval[i] = 0.0; } Weights tgt_weight{ DataType::kFLOAT, pval, num_queries * d_model }; weightMap[lname + ".tgt_weight"] = tgt_weight; auto tgt = network->addConstant(Dims4{ num_queries, d_model, 1, 1 }, tgt_weight); assert(tgt); // construct query_pos auto query_pos = network->addConstant(Dims4{ num_queries, d_model, 1, 1 }, weightMap["query_embed.weight"]); assert(query_pos); auto out = TransformerDecoder( network, weightMap, lname + ".decoder", 
*tgt->getOutput(0), *memory, pos_embed, *query_pos->getOutput(0), num_decoder_layers, d_model, nhead, dim_feedforward); return out; } ITensor* MLP( INetworkDefinition *network, std::unordered_map& weightMap, const std::string& lname, ITensor& src, int num_layers = 3, int hidden_dim = 256, int output_dim = 4 ) { ITensor* out = &src; for (int i = 0; i < num_layers; i++) { std::string layer_name = lname + "." + std::to_string(i); if (i != num_layers - 1) { auto fc = network->addFullyConnected( *out, hidden_dim, weightMap[layer_name + ".weight"], weightMap[layer_name + ".bias"]); assert(fc); auto relu = network->addActivation(*fc->getOutput(0), ActivationType::kRELU); assert(relu); out = relu->getOutput(0); } else { auto fc = network->addFullyConnected( *out, output_dim, weightMap[layer_name + ".weight"], weightMap[layer_name + ".bias"]); assert(fc); out = fc->getOutput(0); } } return out; } std::vector Predict( INetworkDefinition *network, std::unordered_map& weightMap, ITensor& src ) { auto class_embed = network->addFullyConnected( src, NUM_CLASS, weightMap["class_embed.weight"], weightMap["class_embed.bias"]); assert(class_embed); auto class_softmax = network->addSoftMax(*class_embed->getOutput(0)); assert(class_softmax); class_softmax->setAxes(2); ITensor* bbox = MLP(network, weightMap, "bbox_embed.layers", src); auto bbox_sig = network->addActivation(*bbox, ActivationType::kSIGMOID); assert(bbox_sig); std::vector output = { class_softmax->getOutput(0), bbox_sig->getOutput(0) }; return output; } ICudaEngine* createEngine_r50detr( unsigned int maxBatchSize, const std::string& wtsfile, IBuilder* builder, IBuilderConfig* config, DataType dt, const std::string& modelType = "fp16" ) { /* description: after fuse bn */ INetworkDefinition* network = builder->createNetworkV2(0U); // Create input tensor of shape {3, INPUT_H, INPUT_W} with name INPUT_BLOB_NAME ITensor* data = network->addInput(INPUT_NODE_NAME, dt, Dims3{ 3, INPUT_H, INPUT_W }); // preprocess 
std::unordered_map weightMap; loadWeights(wtsfile, weightMap); // backbone auto features = BuildResNet(network, weightMap, *data, R50, 64, 64, 256); ITensor* pos_embed = PositionEmbeddingSine(network, weightMap, *features, 128); auto input_proj = network->addConvolutionNd( *features, D_MODEL, DimsHW{ 1, 1 }, weightMap["input_proj.weight"], weightMap["input_proj.bias"]); assert(input_proj); input_proj->setStrideNd(DimsHW{ 1, 1 }); auto flatten = network->addShuffle(*input_proj->getOutput(0)); assert(flatten); flatten->setReshapeDimensions(Dims4{ input_proj->getOutput(0)->getDimensions().d[0], -1, 1, 1 }); flatten->setSecondTranspose(Permutation{ 1, 0, 2, 3 }); auto out1 = Transformer( network, weightMap, "transformer", *flatten->getOutput(0), *pos_embed, NUM_QUERIES, NUM_ENCODE_LAYERS, NUM_DECODE_LAYERS, D_MODEL, NHEAD, DIM_FEEDFORWARD); std::vector results = Predict(network, weightMap, *out1); // build output for (int i = 0; i < results.size(); i++) { network->markOutput(*results[i]); results[i]->setName(OUTPUT_NAMES[i].c_str()); } // build engine builder->setMaxBatchSize(maxBatchSize); config->setMaxWorkspaceSize(1ULL << 30); if (modelType == "fp32") { } else if (modelType == "fp16") { config->setFlag(BuilderFlag::kFP16); } else if (modelType == "int8") { std::cout << "Your platform support int8: " << (builder->platformHasFastInt8() ? "true" : "false") << std::endl; assert(builder->platformHasFastInt8()); config->setFlag(BuilderFlag::kINT8); Int8EntropyCalibrator2* calibrator = new Int8EntropyCalibrator2(BATCH_SIZE, INPUT_W, INPUT_H, "./coco_calib/", "int8calib.table", INPUT_NODE_NAME); config->setInt8Calibrator(calibrator); } else { throw("does not support model type"); } std::cout << "Building engine, please wait for a while..." << std::endl; ICudaEngine* engine = builder->buildEngineWithConfig(*network, *config); std::cout << "Build engine successfully!" 
<< std::endl; // destroy network network->destroy(); // Release host memory for (auto& mem : weightMap) { free((void*)(mem.second.values)); } return engine; } void BuildDETRModel(unsigned int maxBatchSize, IHostMemory** modelStream, const std::string& wtsfile, std::string modelType = "fp32") { // Create builder IBuilder* builder = createInferBuilder(gLogger); IBuilderConfig* config = builder->createBuilderConfig(); // Create model to populate the network, then set the outputs and create an engine ICudaEngine* engine = createEngine_r50detr(maxBatchSize, wtsfile, builder, config, DataType::kFLOAT, modelType); assert(engine != nullptr); // Serialize the engine (*modelStream) = engine->serialize(); // Close everything down engine->destroy(); builder->destroy(); } void doInference(IExecutionContext& context, cudaStream_t& stream, std::vector& buffers, std::vector& input, std::vector& output) { CUDA_CHECK(cudaMemcpyAsync(buffers[0], input.data(), input.size() * sizeof(float), cudaMemcpyHostToDevice, stream)); context.enqueue(BATCH_SIZE, buffers.data(), stream, nullptr); CUDA_CHECK(cudaMemcpyAsync(output[0], buffers[1], BATCH_SIZE * NUM_QUERIES * NUM_CLASS * sizeof(float), cudaMemcpyDeviceToHost, stream)); CUDA_CHECK(cudaMemcpyAsync(output[1], buffers[2], BATCH_SIZE * NUM_QUERIES * 4 * sizeof(float), cudaMemcpyDeviceToHost, stream)); cudaStreamSynchronize(stream); } bool parse_args(int argc, char** argv, std::string& wtsFile, std::string& engineFile, std::string& imgDir) { if (argc < 4) return false; if (std::string(argv[1]) == "-s") { wtsFile = std::string(argv[2]); engineFile = std::string(argv[3]); } else if (std::string(argv[1]) == "-d") { engineFile = std::string(argv[2]); imgDir = std::string(argv[3]); } else { return false; } return true; } int main(int argc, char** argv) { cudaSetDevice(DEVICE); std::string wtsFile = ""; std::string engineFile = ""; std::string imgDir; if (!parse_args(argc, argv, wtsFile, engineFile, imgDir)) { std::cerr << "arguments not right!" 
<< std::endl; std::cerr << "./detr -s [.wts] [.engine] // serialize model to plan file" << std::endl; std::cerr << "./detr -d [.engine] ../samples // deserialize plan file and run inference" << std::endl; return -1; } if (!wtsFile.empty()) { IHostMemory* modelStream{ nullptr }; BuildDETRModel(BATCH_SIZE, &modelStream, wtsFile, "fp32"); assert(modelStream != nullptr); std::ofstream p(engineFile, std::ios::binary); if (!p) { std::cerr << "could not open plan output file" << std::endl; return -1; } p.write(reinterpret_cast(modelStream->data()), modelStream->size()); modelStream->destroy(); return 0; } // deserialize the .engine and run inference std::ifstream file(engineFile, std::ios::binary); if (!file.good()) { std::cerr << "read " << engineFile << " error!" << std::endl; return -1; } std::string trtModelStream; size_t modelSize{ 0 }; file.seekg(0, file.end); modelSize = file.tellg(); file.seekg(0, file.beg); trtModelStream.resize(modelSize); assert(!trtModelStream.empty()); file.read(const_cast(trtModelStream.c_str()), modelSize); file.close(); // build engine std::cout << "build engine" << std::endl; IRuntime* runtime = createInferRuntime(gLogger); assert(runtime != nullptr); ICudaEngine* engine = runtime->deserializeCudaEngine(trtModelStream.c_str(), modelSize); assert(engine != nullptr); IExecutionContext* context = engine->createExecutionContext(); assert(context != nullptr); runtime->destroy(); cudaStream_t stream; CUDA_CHECK(cudaStreamCreate(&stream)); // prepare input file std::vector fileList; if (read_files_in_dir(imgDir.c_str(), fileList) < 0) { std::cerr << "read_files_in_dir failed." 
<< std::endl; return -1; } // calculate input size int input_size = CalculateSize(context->getBindingDimensions(0)); // prepare input data std::vector data(BATCH_SIZE * input_size, 0); void *data_d, *scores_d, *boxes_d; CUDA_CHECK(cudaMalloc(&data_d, BATCH_SIZE * input_size * sizeof(float))); CUDA_CHECK(cudaMalloc(&scores_d, BATCH_SIZE * NUM_QUERIES * NUM_CLASS * sizeof(float))); CUDA_CHECK(cudaMalloc(&boxes_d, BATCH_SIZE * NUM_QUERIES * 4 * sizeof(float))); std::vector scores_h(BATCH_SIZE * NUM_QUERIES * NUM_CLASS); std::vector boxes_h(BATCH_SIZE * NUM_QUERIES * 4); std::vector buffers = { data_d, scores_d, boxes_d }; std::vector outputs = {scores_h.data(), boxes_h.data()}; int fcount = 0; int fileLen = fileList.size(); for (int f = 0; f < fileLen; f++) { fcount++; if (fcount < BATCH_SIZE && f + 1 != fileLen) continue; for (int b = 0; b < fcount; b++) { cv::Mat img = cv::imread(imgDir + "/" + fileList[f - fcount + 1 + b]); if (img.empty()) continue; preprocessImg(img, INPUT_H, INPUT_W); assert(img.cols * img.rows * 3 == input_size); for (int c = 0; c < 3; c++) { for (int h = 0; h < img.rows; h++) { for (int w = 0; w < img.cols; w++) { data[b * input_size + c * img.rows * img.cols + h * img.cols + w] = img.at(h, w)[c]; } } } } // Run inference auto start = std::chrono::system_clock::now(); doInference(*context, stream, buffers, data, outputs); auto end = std::chrono::system_clock::now(); std::cout << std::chrono::duration_cast(end - start).count() << "ms" << std::endl; for (int b = 0; b < fcount; b++) { cv::Mat img = cv::imread(imgDir + "/" + fileList[f - fcount + 1 + b]); for (int i = 0; i < scores_h.size(); i += NUM_CLASS) { int label = -1; float score = -1; for (int j = i; j < i + NUM_CLASS; j++) { if (score < scores_h[j]) { label = j; score = scores_h[j]; } } if (score > SCORE_THRESH && (label % NUM_CLASS != NUM_CLASS - 1)) { int ind = label / NUM_CLASS; label = label % NUM_CLASS; float cx = boxes_h[ind * 4]; float cy = boxes_h[ind * 4 + 1]; float w = 
boxes_h[ind * 4 + 2]; float h = boxes_h[ind * 4 + 3]; float x1 = (cx - w / 2.0) * img.cols; float y1 = (cy - h / 2.0) * img.rows; float x2 = (cx + w / 2.0) * img.cols; float y2 = (cy + h / 2.0) * img.rows; cv::Rect r(x1, y1, x2 - x1, y2 - y1); cv::rectangle(img, r, cv::Scalar(0x27, 0xC1, 0x36), 2); cv::putText(img, std::to_string(label), cv::Point(r.x, r.y - 1), cv::FONT_HERSHEY_PLAIN, 1.2, cv::Scalar(0xFF, 0xFF, 0xFF), 2); } } cv::imwrite("_" + fileList[f - fcount + 1 + b], img); } fcount = 0; } cudaStreamDestroy(stream); CUDA_CHECK(cudaFree(data_d)); CUDA_CHECK(cudaFree(scores_d)); CUDA_CHECK(cudaFree(boxes_d)); context->destroy(); engine->destroy(); return 0; } ================================================ FILE: detr/gen_wts.py ================================================ import cv2 import torch from models.transformer import Transformer from models.position_encoding import PositionEmbeddingSine from models.backbone import Backbone, Joiner from models.detr import DETR import torchvision.transforms as T from PIL import Image import struct def box_cxcywh_to_xyxy(x): x_c, y_c, w, h = x.unbind(-1) b = [(x_c - 0.5 * w), (y_c - 0.5 * h), (x_c + 0.5 * w), (y_c + 0.5 * h)] return torch.stack(b, dim=-1) def build_backbone(): N_steps = 256 // 2 position_embedding = PositionEmbeddingSine(N_steps, normalize=True) train_backbone = True return_interm_layers = False backbone = Backbone('resnet50', train_backbone, return_interm_layers, False) model = Joiner(backbone, position_embedding) model.num_channels = backbone.num_channels return model def gen_wts(model, filename): f = open(filename + '.wts', 'w') f.write('{}\n'.format(len(model.state_dict().keys()) + 72)) for k, v in model.state_dict().items(): if 'in_proj' in k: dim = int(v.size(0) / 3) q_weight = v[:dim].reshape(-1).cpu().numpy() k_weight = v[dim:2*dim].reshape(-1).cpu().numpy() v_weight = v[2*dim:].reshape(-1).cpu().numpy() f.write('{} {} '.format(k + '_q', len(q_weight))) for vv in q_weight: f.write(' ') 
f.write(struct.pack('>f', float(vv)).hex()) f.write('\n') f.write('{} {} '.format(k + '_k', len(k_weight))) for vv in k_weight: f.write(' ') f.write(struct.pack('>f', float(vv)).hex()) f.write('\n') f.write('{} {} '.format(k + '_v', len(v_weight))) for vv in v_weight: f.write(' ') f.write(struct.pack('>f', float(vv)).hex()) f.write('\n') else: vr = v.reshape(-1).cpu().numpy() f.write('{} {} '.format(k, len(vr))) for vv in vr: f.write(' ') f.write(struct.pack('>f',float(vv)).hex()) f.write('\n') f.close() def main(): num_classes = 91 device = torch.device('cuda') backbone = build_backbone() transformer = Transformer( d_model=256, dropout=0.1, nhead=8, dim_feedforward=2048, num_encoder_layers=6, num_decoder_layers=6, normalize_before=False, return_intermediate_dec=True, ) model = DETR( backbone, transformer, num_classes=num_classes, num_queries=100, aux_loss=True, ) checkpoint = torch.load('./detr-r50-e632da11.pth') model.load_state_dict(checkpoint['model']) model.to(device) model.eval() gen_wts(model, "detr") # test # with torch.no_grad(): # transform = T.Compose([T.Resize(800), T.ToTensor(), T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])]) # im = Image.open('./image/demo.jpg') # img = transform(im).unsqueeze(0) # img = img.to(device) # res = model(img) # logits = res['pred_logits'] # pred_boxes = res['pred_boxes'] # out_prob = logits.softmax(-1)[0, :, :-1] # keep = out_prob.max(-1).values > 0.5 # label = out_prob[keep].argmax(dim=1) # out_bbox = pred_boxes[0, keep] # out_bbox = out_bbox.to(torch.device('cpu')) # out_bbox = box_cxcywh_to_xyxy(out_bbox) # out_bbox = out_bbox * torch.tensor([640, 480, 640, 480]) # image = cv2.imread('./image/demo.jpg') # for ob in out_bbox: # x0 = int(ob[0].item()) # y0 = int(ob[1].item()) # x1 = int(ob[2].item()) # y1 = int(ob[3].item()) # cv2.rectangle(image, (x0, y0), (x1, y1), (0,0,255), 1) # cv2.imwrite('res.jpg', image) if __name__ == '__main__': main() ================================================ FILE: 
detr/logging.h ================================================ /* * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ #ifndef TENSORRT_LOGGING_H #define TENSORRT_LOGGING_H #include "NvInferRuntimeCommon.h" #include #include #include #include #include #include #include #include "macros.h" using Severity = nvinfer1::ILogger::Severity; class LogStreamConsumerBuffer : public std::stringbuf { public: LogStreamConsumerBuffer(std::ostream& stream, const std::string& prefix, bool shouldLog) : mOutput(stream) , mPrefix(prefix) , mShouldLog(shouldLog) {} LogStreamConsumerBuffer(LogStreamConsumerBuffer&& other) : mOutput(other.mOutput) {} ~LogStreamConsumerBuffer() { // std::streambuf::pbase() gives a pointer to the beginning of the buffered part of the output sequence // std::streambuf::pptr() gives a pointer to the current position of the output sequence // if the pointer to the beginning is not equal to the pointer to the current position, // call putOutput() to log the output to the stream if (pbase() != pptr()) { putOutput(); } } // synchronizes the stream buffer and returns 0 on success // synchronizing the stream buffer consists of inserting the buffer contents into the stream, // resetting the buffer and flushing the stream virtual int sync() { putOutput(); return 0; } void putOutput() { if (mShouldLog) { // prepend timestamp std::time_t timestamp = std::time(nullptr); tm* tm_local = std::localtime(×tamp); std::cout << 
"["; std::cout << std::setw(2) << std::setfill('0') << 1 + tm_local->tm_mon << "/"; std::cout << std::setw(2) << std::setfill('0') << tm_local->tm_mday << "/"; std::cout << std::setw(4) << std::setfill('0') << 1900 + tm_local->tm_year << "-"; std::cout << std::setw(2) << std::setfill('0') << tm_local->tm_hour << ":"; std::cout << std::setw(2) << std::setfill('0') << tm_local->tm_min << ":"; std::cout << std::setw(2) << std::setfill('0') << tm_local->tm_sec << "] "; // std::stringbuf::str() gets the string contents of the buffer // insert the buffer contents pre-appended by the appropriate prefix into the stream mOutput << mPrefix << str(); // set the buffer to empty str(""); // flush the stream mOutput.flush(); } } void setShouldLog(bool shouldLog) { mShouldLog = shouldLog; } private: std::ostream& mOutput; std::string mPrefix; bool mShouldLog; }; //! //! \class LogStreamConsumerBase //! \brief Convenience object used to initialize LogStreamConsumerBuffer before std::ostream in LogStreamConsumer //! class LogStreamConsumerBase { public: LogStreamConsumerBase(std::ostream& stream, const std::string& prefix, bool shouldLog) : mBuffer(stream, prefix, shouldLog) {} protected: LogStreamConsumerBuffer mBuffer; }; //! //! \class LogStreamConsumer //! \brief Convenience object used to facilitate use of C++ stream syntax when logging messages. //! Order of base classes is LogStreamConsumerBase and then std::ostream. //! This is because the LogStreamConsumerBase class is used to initialize the LogStreamConsumerBuffer member field //! in LogStreamConsumer and then the address of the buffer is passed to std::ostream. //! This is necessary to prevent the address of an uninitialized buffer from being passed to std::ostream. //! Please do not change the order of the parent classes. //! class LogStreamConsumer : protected LogStreamConsumerBase, public std::ostream { public: //! \brief Creates a LogStreamConsumer which logs messages with level severity. //! 
//!  Reportable severity determines if the messages are severe enough to be logged.
    LogStreamConsumer(Severity reportableSeverity, Severity severity)
        : LogStreamConsumerBase(severityOstream(severity), severityPrefix(severity), severity <= reportableSeverity)
        , std::ostream(&mBuffer)  // links the stream buffer with the stream
        , mShouldLog(severity <= reportableSeverity)
        , mSeverity(severity) {}

    // Builds a fresh buffer from the other consumer's severity and logging flag; text still
    // buffered in `other` is not transferred.
    LogStreamConsumer(LogStreamConsumer&& other)
        : LogStreamConsumerBase(severityOstream(other.mSeverity), severityPrefix(other.mSeverity), other.mShouldLog)
        , std::ostream(&mBuffer)  // links the stream buffer with the stream
        , mShouldLog(other.mShouldLog)
        , mSeverity(other.mSeverity) {}

    // Re-evaluates whether this consumer's severity passes the new threshold and tells the buffer.
    void setReportableSeverity(Severity reportableSeverity) {
        mShouldLog = mSeverity <= reportableSeverity;
        mBuffer.setShouldLog(mShouldLog);
    }

   private:
    // kINFO and every severity comparing greater go to std::cout; the rest go to std::cerr.
    static std::ostream& severityOstream(Severity severity) {
        return severity >= Severity::kINFO ? std::cout : std::cerr;
    }

    static std::string severityPrefix(Severity severity) {
        switch (severity) {
            case Severity::kINTERNAL_ERROR:
                return "[F] ";
            case Severity::kERROR:
                return "[E] ";
            case Severity::kWARNING:
                return "[W] ";
            case Severity::kINFO:
                return "[I] ";
            case Severity::kVERBOSE:
                return "[V] ";
            default:
                assert(0);
                return "";
        }
    }

    bool mShouldLog;
    Severity mSeverity;
};

//! \class Logger
//!
//! \brief Class which manages logging of TensorRT tools and samples
//!
//! \details This class provides a common interface for TensorRT tools and samples to log information to the console,
//! and supports logging two types of messages:
//!
//! - Debugging messages with an associated severity (info, warning, error, or internal error/fatal)
//! - Test pass/fail messages
//!
//! The advantage of having all samples use this class for logging as opposed to emitting directly to stdout/stderr is
//! that the logic for controlling the verbosity and formatting of sample output is centralized in one location.
//!
//! In the future, this class could be extended to support dumping test results to a file in some standard format
//! (for example, JUnit XML), and providing additional metadata (e.g. timing the duration of a test run).
//!
//! TODO: For backwards compatibility with existing samples, this class inherits directly from the nvinfer1::ILogger
//! interface, which is problematic since there isn't a clean separation between messages coming from the TensorRT
//! library and messages coming from the sample.
//!
//! In the future (once all samples are updated to use Logger::getTRTLogger() to access the ILogger) we can refactor the
//! class to eliminate the inheritance and instead make the nvinfer1::ILogger implementation a member of the Logger
//! object.
class Logger : public nvinfer1::ILogger {
   public:
    explicit Logger(Severity severity = Severity::kWARNING)
        : mReportableSeverity(severity) {}

    //!
    //! \enum TestResult
    //! \brief Represents the state of a given test
    //!
    enum class TestResult {
        kRUNNING,  //!< The test is running
        kPASSED,   //!< The test passed
        kFAILED,   //!< The test failed
        kWAIVED    //!< The test was waived
    };

    //!
    //! \brief Forward-compatible method for retrieving the nvinfer::ILogger associated with this Logger
    //! \return The nvinfer1::ILogger associated with this Logger
    //!
    //! TODO Once all samples are updated to use this method to register the logger with TensorRT,
    //! we can eliminate the inheritance of Logger from ILogger
    //!
    nvinfer1::ILogger& getTRTLogger() { return *this; }

    //!
    //! \brief Implementation of the nvinfer1::ILogger::log() virtual method
    //!
    //! Note samples should not be calling this function directly; it will eventually go away once we eliminate the
    //! inheritance from nvinfer1::ILogger
    //!
    void log(Severity severity, const char* msg) TRT_NOEXCEPT override {
        // The temporary consumer emits the message when it is destroyed at the end of the statement.
        LogStreamConsumer(mReportableSeverity, severity) << "[TRT] " << std::string(msg) << std::endl;
    }

    //!
    //! \brief Method for controlling the verbosity of logging output
    //!
    //! \param severity The logger will only emit messages that have severity of this level or higher.
    //!
    void setReportableSeverity(Severity severity) { mReportableSeverity = severity; }

    //!
    //! \brief Opaque handle that holds logging information for a particular test
    //!
    //! This object is an opaque handle to information used by the Logger to print test results.
    //! The sample must call Logger::defineTest() in order to obtain a TestAtom that can be used
    //! with Logger::reportTest{Start,End}().
    //!
    class TestAtom {
       public:
        TestAtom(TestAtom&&) = default;

       private:
        friend class Logger;

        TestAtom(bool started, const std::string& name, const std::string& cmdline)
            : mStarted(started)
            , mName(name)
            , mCmdline(cmdline) {}

        bool mStarted;
        std::string mName;
        std::string mCmdline;
    };

    //!
    //! \brief Define a test for logging
    //!
    //! \param[in] name The name of the test.  This should be a string starting with
    //!                  "TensorRT" and containing dot-separated strings containing
    //!                  the characters [A-Za-z0-9_].
    //!                  For example, "TensorRT.sample_googlenet"
    //! \param[in] cmdline The command line used to reproduce the test
    //
    //! \return a TestAtom that can be used in Logger::reportTest{Start,End}().
    //!
    static TestAtom defineTest(const std::string& name, const std::string& cmdline) {
        return TestAtom(false, name, cmdline);
    }

    //!
    //! \brief A convenience overloaded version of defineTest() that accepts an array of command-line arguments
    //!        as input
    //!
    //! \param[in] name The name of the test
    //! \param[in] argc The number of command-line arguments
    //! \param[in] argv The array of command-line arguments (given as C strings)
    //!
    //! \return a TestAtom that can be used in Logger::reportTest{Start,End}().
    static TestAtom defineTest(const std::string& name, int argc, char const* const* argv) {
        auto cmdline = genCmdlineString(argc, argv);
        return defineTest(name, cmdline);
    }

    //!
    //! \brief Report that a test has started.
    //!
    //! \pre reportTestStart() has not been called yet for the given testAtom
    //!
    //! \param[in] testAtom The handle to the test that has started
    //!
    static void reportTestStart(TestAtom& testAtom) {
        // The RUNNING line is printed before the precondition is asserted.
        reportTestResult(testAtom, TestResult::kRUNNING);
        assert(!testAtom.mStarted);
        testAtom.mStarted = true;
    }

    //!
    //! \brief Report that a test has ended.
    //!
    //! \pre reportTestStart() has been called for the given testAtom
    //!
    //! \param[in] testAtom The handle to the test that has ended
    //! \param[in] result The result of the test. Should be one of TestResult::kPASSED,
    //!                   TestResult::kFAILED, TestResult::kWAIVED
    //!
    static void reportTestEnd(const TestAtom& testAtom, TestResult result) {
        assert(result != TestResult::kRUNNING);
        assert(testAtom.mStarted);
        reportTestResult(testAtom, result);
    }

    // The helpers below report the end of a test and return a process exit code.
    static int reportPass(const TestAtom& testAtom) {
        reportTestEnd(testAtom, TestResult::kPASSED);
        return EXIT_SUCCESS;
    }

    static int reportFail(const TestAtom& testAtom) {
        reportTestEnd(testAtom, TestResult::kFAILED);
        return EXIT_FAILURE;
    }

    static int reportWaive(const TestAtom& testAtom) {
        reportTestEnd(testAtom, TestResult::kWAIVED);
        return EXIT_SUCCESS;
    }

    static int reportTest(const TestAtom& testAtom, bool pass) {
        return pass ? reportPass(testAtom) : reportFail(testAtom);
    }

    Severity getReportableSeverity() const { return mReportableSeverity; }

   private:
    //!
    //! \brief returns an appropriate string for prefixing a log message with the given severity
    //!
    static const char* severityPrefix(Severity severity) {
        switch (severity) {
            case Severity::kINTERNAL_ERROR:
                return "[F] ";
            case Severity::kERROR:
                return "[E] ";
            case Severity::kWARNING:
                return "[W] ";
            case Severity::kINFO:
                return "[I] ";
            case Severity::kVERBOSE:
                return "[V] ";
            default:
                assert(0);
                return "";
        }
    }

    //!
    //! \brief returns an appropriate string for prefixing a test result message with the given result
    //!
    static const char* testResultString(TestResult result) {
        switch (result) {
            case TestResult::kRUNNING:
                return "RUNNING";
            case TestResult::kPASSED:
                return "PASSED";
            case TestResult::kFAILED:
                return "FAILED";
            case TestResult::kWAIVED:
                return "WAIVED";
            default:
                assert(0);
                return "";
        }
    }

    //!
    //! \brief returns an appropriate output stream (cout or cerr) to use with the given severity
    //!
    static std::ostream& severityOstream(Severity severity) {
        return severity >= Severity::kINFO ? std::cout : std::cerr;
    }

    //!
    //! \brief method that implements logging test results
    //!
    static void reportTestResult(const TestAtom& testAtom, TestResult result) {
        severityOstream(Severity::kINFO) << "&&&& " << testResultString(result) << " " << testAtom.mName << " # "
                                         << testAtom.mCmdline << std::endl;
    }

    //!
    //! \brief generate a command line string from the given (argc, argv) values
    //!
    static std::string genCmdlineString(int argc, char const* const* argv) {
        std::stringstream ss;
        for (int i = 0; i < argc; i++) {
            if (i > 0)
                ss << " ";
            ss << argv[i];
        }
        return ss.str();
    }

    Severity mReportableSeverity;
};

namespace {

//!
//! \brief produces a LogStreamConsumer object that can be used to log messages of severity kVERBOSE
//!
//! Example usage:
//!
//!     LOG_VERBOSE(logger) << "hello world" << std::endl;
//!
inline LogStreamConsumer LOG_VERBOSE(const Logger& logger) {
    return LogStreamConsumer(logger.getReportableSeverity(), Severity::kVERBOSE);
}

//!
//! \brief produces a LogStreamConsumer object that can be used to log messages of severity kINFO
//!
//! Example usage:
//!
//!     LOG_INFO(logger) << "hello world" << std::endl;
//!
inline LogStreamConsumer LOG_INFO(const Logger& logger) {
    return LogStreamConsumer(logger.getReportableSeverity(), Severity::kINFO);
}

//!
//! \brief produces a LogStreamConsumer object that can be used to log messages of severity kWARNING
//!
//! Example usage:
//!
//!     LOG_WARN(logger) << "hello world" << std::endl;
//!
inline LogStreamConsumer LOG_WARN(const Logger& logger) { return LogStreamConsumer(logger.getReportableSeverity(), Severity::kWARNING); } //! //! \brief produces a LogStreamConsumer object that can be used to log messages of severity kERROR //! //! Example usage: //! //! LOG_ERROR(logger) << "hello world" << std::endl; //! inline LogStreamConsumer LOG_ERROR(const Logger& logger) { return LogStreamConsumer(logger.getReportableSeverity(), Severity::kERROR); } //! //! \brief produces a LogStreamConsumer object that can be used to log messages of severity kINTERNAL_ERROR // ("fatal" severity) //! //! Example usage: //! //! LOG_FATAL(logger) << "hello world" << std::endl; //! inline LogStreamConsumer LOG_FATAL(const Logger& logger) { return LogStreamConsumer(logger.getReportableSeverity(), Severity::kINTERNAL_ERROR); } } // anonymous namespace #endif // TENSORRT_LOGGING_H ================================================ FILE: detr/macros.h ================================================ #ifndef __MACROS_H #define __MACROS_H #include "NvInfer.h" #ifdef API_EXPORTS #if defined(_MSC_VER) #define API __declspec(dllexport) #else #define API __attribute__((visibility("default"))) #endif #else #if defined(_MSC_VER) #define API __declspec(dllimport)a #else #define API #endif #endif // API_EXPORTS #if NV_TENSORRT_MAJOR >= 8 #define TRT_NOEXCEPT noexcept #define TRT_CONST_ENQUEUE const #else #define TRT_NOEXCEPT #define TRT_CONST_ENQUEUE #endif #endif // __MACROS_H ================================================ FILE: docker/README.md ================================================ # Tutorials ## Introduction This folder contains the docker and docker-compose file to build the development environment without pain. ## Prerequisites * OS: Linux or WSL2 * docker * nvidia-container-toolkit * (Optional but **recommended**) docker-compose ## Usage 1. 
(With docker-compose) configure the `.env` file, change `DATA_DIR` to your mount point, such as your code or data folder, etc, comment the `volumes` in docker compose file if not necessarily needed 2. Build image: ```bash docker compose -f tensorrtx-docker-compose.yml build ``` 3. Run a container in the background: ```bash docker compose -f tensorrtx-docker-compose.yml up -d ``` 4. Attach to this container with your IDE and have fun! ## HowTos ### How to build and run with docker? ```bash docker build -f docker/x86_64.dockerfile -v . docker run -it --gpus all --privileged --net=host --ipc=host -v /bin/bash ``` ### How to build image with other TensorRT version? Change the `TAG` at the top of the `.dockerfile`. Note: all images are officially owned by NVIDIA NGC, which requires a registration before pulling. For this repo, the mainly used `TAG` would be: | Container Image | Container OS | Driver | CUDA | TensorRT | Torch | Recommended | | :----: | :----: | :----: | :----: | :----: | :----: | :----: | | 20.12-py3 | Ubuntu 20.04 | 455 | 11.2 | 7.2.2 | 1.8.0 | ❌ | | 24.01-py3 | Ubuntu 22.04 | 545 | 12.3 | 8.6.1 | 2.2.0 | ✅ | | 24.04-py3 | Ubuntu 22.04 | 545 | 12.4 | 8.6.3 | 2.3.0 | ✅ | | 24.09-py3 | Ubuntu 22.04 | 560 | 12.6 | 10.4.0 | 2.5.0 | ✅ | For more detail of the support matrix, please check [HERE](https://docs.nvidia.com/deeplearning/frameworks/support-matrix/index.html) ### How to customize the opencv in the image? If the prebuilt package from apt cannot meet your requirements, please refer to the demo code in `.dockerfile` to build opencv from source. ### How to solve failures when building image? For *443 timeout* or any similar network issues, a proxy may be required. 
To make your host proxy work for building env of docker, please change the `build` node inside docker-compose file like this: ```YAML build: dockerfile: x86_64.dockerfile args: HTTP_PROXY: ${PROXY} HTTPS_PROXY: ${PROXY} ALL_PROXY: ${PROXY} http_proxy: ${PROXY} https_proxy: ${PROXY} all_proxy: ${PROXY} ``` then add `PROXY="http://xxx:xxx"` in `.env` file ## Note The older version support, like TensorRT version **< 8**, may be deprecated in the future. ================================================ FILE: docker/tensorrtx-docker-compose.yml ================================================ services: tensorrt: image: tensortx:1.0.1 container_name: tensortx environment: - NVIDIA_VISIBLE_DEVICES=all build: dockerfile: x86_64.dockerfile cap_add: - CAP_SYS_ADMIN security_opt: - seccomp:unconfined privileged: true stdin_open: true tty: true shm_size: '8gb' ulimits: memlock: soft: -1 hard: -1 devices: - /dev:/dev:rw volumes: #### user #### - ${HOME}:/workspace/localhome:rw #### custom #### - mount:/mnt:rw deploy: restart_policy: condition: on-failure max_attempts: 1 delay: 5s resources: reservations: devices: - driver: nvidia capabilities: [gpu] count: all volumes: mount: driver: local driver_opts: type: none o: bind device: ${DATA_DIR} ================================================ FILE: docker/x86_64.dockerfile ================================================ ARG TAG=24.01-py3 FROM nvcr.io/nvidia/tensorrt:${TAG} AS tensorrtx ENV DEBIAN_FRONTEND noninteractive # basic tools RUN apt update && apt-get install -y --fix-missing --no-install-recommends \ sudo wget curl git ca-certificates ninja-build tzdata pkg-config \ gdb libglib2.0-dev libmount-dev locales \ && rm -rf /var/lib/apt/lists/* RUN pip install --no-cache-dir yapf isort cmake-format pre-commit ## fix a potential pre-commit error RUN locale-gen "en_US.UTF-8" ## override older cmake RUN find /usr/local/share -type d -name "cmake-*" -exec rm -rf {} + \ && curl -fsSL 
"https://github.com/Kitware/CMake/releases/download/v3.30.0/cmake-3.30.0-linux-x86_64.sh" \ -o cmake.sh && bash cmake.sh --skip-license --exclude-subdir --prefix=/usr/local && rm cmake.sh RUN apt update && apt-get install -y \ libopencv-dev \ && rm -rf /var/lib/apt/lists/* ## a template to build opencv and opencv_contrib from source # RUN git clone -b 4.x https://github.com/opencv/opencv_contrib.git \ # && git clone -b 4.x https://github.com/opencv/opencv.git opencv \ # && cmake -S opencv -B opencv/build -G Ninja \ # -DBUILD_LIST=core,calib3d,imgproc,imgcodecs,highgui \ # -DOPENCV_EXTRA_MODULES_PATH="/workspace/opencv_contrib/modules" \ # -DCMAKE_BUILD_TYPE=RELEASE \ # -DCMAKE_INSTALL_PREFIX=/usr/local \ # -DENABLE_FAST_MATH=ON \ # -DOPENCV_GENERATE_PKGCONFIG=ON \ # -DBUILD_opencv_python2=OFF \ # -DBUILD_opencv_python3=OFF \ # -DBUILD_JAVA=OFF \ # -DBUILD_DOCS=OFF \ # -DBUILD_PERF_TESTS=OFF \ # -DBUILD_TESTS=OFF \ # && ninja -C opencv/build install ================================================ FILE: efficient_ad/CMakeLists.txt ================================================ cmake_minimum_required(VERSION 3.12) project(EfficientAD-M) add_definitions(-w) add_definitions(-D API_EXPORTS) set(CMAKE_CXX_STANDARD 11) set(CMAKE_BUILD_TYPE "Debug") set(CMAKE_CUDA_ARCHITECTURES 61 75 86 89) set(THREADS_PREFER_PTHREAD_FLAG ON) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /od") ### nvcc set(CMAKE_CUDA_COMPILER "D:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v11.8/bin/nvcc.exe") enable_language(CUDA) ### cuda include_directories("D:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v11.8/include") link_directories("D:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v11.8/lib/x64") ### tensorrt set(TRT_DIR "D:/Program Files/NVIDIA GPU Computing Toolkit/TensorRT-8.5.3.1/") include_directories(${TRT_DIR}/include) link_directories(${TRT_DIR}/lib) ### opencv set(OpenCV_DIR "E:/OpenCV/OpenCV_4.6.0/opencv/build") find_package(OpenCV) include_directories(${OpenCV_INCLUDE_DIRS}) ### 
dirent include_directories("E:/SDK/dirent-1.24/include") include_directories(${PROJECT_SOURCE_DIR}/src/) file(GLOB_RECURSE SRCS ${PROJECT_SOURCE_DIR}/src/*.cpp ${PROJECT_SOURCE_DIR}/src/*.cu) add_executable(efficientAD_det "./efficientAD_det.cpp" ${SRCS}) target_link_libraries(efficientAD_det nvinfer cudart nvinfer_plugin ${OpenCV_LIBS} ) ================================================ FILE: efficient_ad/README.md ================================================ # EfficientAd EfficientAd: Accurate Visual Anomaly Detection at Millisecond-Level Latencies. The Pytorch implementation is [openvinotoolkit/anomalib](https://github.com/openvinotoolkit/anomalib).

# Test Environment RTX3080 / Windows10 22H2 / cuda11.8 / cudnn8.9.7 / TensorRT8.5.3 / OpenCV4.6 # How to Run 1. training to generate weight files (`efficientAD_[category].pt`) ``` // Please refer to Anomalib's tutorial for details: // https://github.com/openvinotoolkit/anomalib?tab=readme-ov-file#-training ``` 2. generate `.wts` from pytorch with `.pt` ``` cd ./datas/models/ // copy your `.pt` file to the current directory. python gen_wts.py // a file `efficientAD_[category].wts` will be generated. ``` 3. build and run ``` mkdir build cd build cmake .. make sudo ./efficientAD_det -s [.wts] [.engine] // serialize model to plan file sudo ./efficientAD_det -d [.engine] [image folder] // deserialize and run inference, the images in [image folder] will be processed ``` # Latency average cost of `infer` (in `efficientAD_det.cpp`) from the second run onward, with batch=1, under the Windows environment above | | FP32 | | :-----------: | :--: | | EfficientAD-M | 12ms | ================================================ FILE: efficient_ad/efficientAD_det.cpp ================================================ #include #include #include #include #include #include #include "config.h" #include "cuda_utils.h" #include "logging.h" #include "model.h" #include "postprocess.h" #include "utils.h" using namespace nvinfer1; static Logger gLogger; // const static int kOutputSize = kMaxNumOutputBbox * sizeof(Detection) / sizeof(float) + 1; const static int kInputSize = 3 * 256 * 256; const static int kOutputSize = 1 * 256 * 256; bool parse_args(int argc, char** argv, std::string& wts, std::string& engine, float& gd, float& gw, std::string& img_dir) { if (argc != 4) return false; if (std::string(argv[1]) == "-s") { wts = std::string(argv[2]); engine = std::string(argv[3]); } else if (std::string(argv[1]) == "-d") { engine = std::string(argv[2]); img_dir = std::string(argv[3]); } else { return false; } return true; } void prepare_infer_buffers(ICudaEngine* engine, float** gpu_input_buffer, float** 
gpu_output_buffer, float** cpu_output_buffer) { // assert(engine->getNbIOTensors() == 2); assert(engine->getNbBindings() == 2); // In order to bind the buffers, we need to know the names of the input and output tensors. // Note that indices are guaranteed to be less than IEngine::getNbBindings() const int inputIndex = engine->getBindingIndex(kInputTensorName); const int outputIndex = engine->getBindingIndex(kOutputTensorName); // nvinfer1::Dims outputDims = engine->getBindingDimensions(outputIndex); assert(inputIndex == 0); assert(outputIndex == 1); // Create GPU in/output buffers on device CUDA_CHECK(cudaMalloc((void**)gpu_input_buffer, kBatchSize * 3 * kInputH * kInputW * sizeof(float))); CUDA_CHECK(cudaMalloc((void**)gpu_output_buffer, kBatchSize * 1 * kOutputSize * sizeof(float))); // 3 or 1 ?? // Create CPU output buffers on host *cpu_output_buffer = new float[kBatchSize * kOutputSize]; } void preprocessImg(cv::Mat& img, int newh, int neww) { cv::cvtColor(img, img, cv::COLOR_BGR2RGB); cv::resize(img, img, cv::Size(neww, newh)); img.convertTo(img, CV_32FC3); // ImageNet normalize img /= 255.0f; img -= cv::Scalar(0.485, 0.456, 0.406); img /= cv::Scalar(0.229, 0.224, 0.225); } void infer(IExecutionContext& context, cudaStream_t& stream, std::vector& gpu_buffers, std::vector& cpu_input_data, std::vector& cpu_output_data, int batchsize) { // copy input data from host (CPU) to device (GPU) CUDA_CHECK(cudaMemcpyAsync(gpu_buffers[0], cpu_input_data.data(), cpu_input_data.size() * sizeof(float), cudaMemcpyHostToDevice, stream)); // execute inference using context provided by engine context.enqueue(batchsize, gpu_buffers.data(), stream, nullptr); // copy output back from device (GPU) to host (CPU) CUDA_CHECK(cudaMemcpyAsync(cpu_output_data.data(), gpu_buffers[1], batchsize * kOutputSize * sizeof(float), cudaMemcpyDeviceToHost, stream)); // synchronize the stream to prevent issues (block CUDA and wait for CUDA operations to be completed) cudaStreamSynchronize(stream); } 
void serialize_engine(unsigned int max_batchsize, float& gd, float& gw, std::string& wts_name, std::string& engine_name) { // Create builder IBuilder* builder = createInferBuilder(gLogger); IBuilderConfig* config = builder->createBuilderConfig(); // Create model to populate the network, then set the outputs and create an engine ICudaEngine* engine = nullptr; engine = build_efficientAD_engine(max_batchsize, builder, config, DataType::kFLOAT, gd, gw, wts_name); assert(engine != nullptr); // Serialize the engine IHostMemory* serialized_engine = engine->serialize(); assert(serialized_engine != nullptr); // Save engine to file std::ofstream p(engine_name, std::ios::binary); if (!p) { std::cerr << "Could not open plan output file" << std::endl; assert(false); } p.write(reinterpret_cast(serialized_engine->data()), serialized_engine->size()); // Close everything down engine->destroy(); config->destroy(); serialized_engine->destroy(); builder->destroy(); } void deserialize_engine(std::string& engine_name, IRuntime** runtime, ICudaEngine** engine, IExecutionContext** context) { std::ifstream file(engine_name, std::ios::binary); if (!file.good()) { std::cerr << "read " << engine_name << " error!" << std::endl; assert(false); } size_t size = 0; file.seekg(0, file.end); size = file.tellg(); file.seekg(0, file.beg); char* serialized_engine = new char[size]; assert(serialized_engine); file.read(serialized_engine, size); file.close(); *runtime = createInferRuntime(gLogger); assert(*runtime); *engine = (*runtime)->deserializeCudaEngine(serialized_engine, size); assert(*engine != nullptr); *context = (*engine)->createExecutionContext(); assert(*context); delete[] serialized_engine; } int main(int argc, char** argv) { cudaSetDevice(kGpuId); std::string wts_name = ""; std::string engine_name = ""; float gd = 1.0f, gw = 1.0f; std::string img_dir; if (!parse_args(argc, argv, wts_name, engine_name, gd, gw, img_dir)) { std::cerr << "arguments not right!" 
<< std::endl; std::cerr << "./efficientad_det -s [.wts] [.engine] // serialize model to plan file" << std::endl; std::cerr << "./efficientad_det -d [.engine] [../../datas/images/...] // deserialize plan file and run inference" << std::endl; return -1; } // Create a model using the API directly and serialize it to a file if (!wts_name.empty()) { serialize_engine(kBatchSize, gd, gw, wts_name, engine_name); return 0; } // Deserialize the engine from file IRuntime* runtime = nullptr; ICudaEngine* engine = nullptr; IExecutionContext* context = nullptr; deserialize_engine(engine_name, &runtime, &engine, &context); // create CUDA stream for simultaneous CUDA operations cudaStream_t stream; CUDA_CHECK(cudaStreamCreate(&stream)); // prepare cpu and gpu buffers void *gpu_input_buffer, *gpu_output_buffer; CUDA_CHECK(cudaMalloc(&gpu_input_buffer, kBatchSize * 3 * kInputH * kInputW * sizeof(float))); CUDA_CHECK(cudaMalloc(&gpu_output_buffer, kBatchSize * 1 * kOutputSize * sizeof(float))); // 3 or 1 ?? std::vector gpu_buffers = {gpu_input_buffer, gpu_output_buffer}; std::vector cpu_input_data(kBatchSize * kInputSize, 0); std::vector cpu_output_data(kBatchSize * kOutputSize, 0); // read images from directory std::vector file_names; if (read_files_in_dir(img_dir.c_str(), file_names) < 0) { std::cerr << "read_files_in_dir failed." 
<< std::endl; return -1; } std::vector originImg_batch; for (size_t i = 0; i < file_names.size(); i += kBatchSize) { // get a batch of images std::vector img_batch; std::vector img_name_batch; for (size_t j = i; j < i + kBatchSize && j < file_names.size(); j++) { cv::Mat img = cv::imread(img_dir + "/" + file_names[j]); originImg_batch.push_back(img.clone()); preprocessImg(img, kInputW, kInputH); assert(img.cols * img.rows * 3 == 3 * 256 * 256); for (int c = 0; c < 3; c++) { for (int h = 0; h < img.rows; h++) { for (int w = 0; w < img.cols; w++) { cpu_input_data[c * img.rows * img.cols + h * img.cols + w] = img.at(h, w)[c]; } } } img_batch.push_back(img); img_name_batch.push_back(file_names[j]); } // Run inference auto start = std::chrono::system_clock::now(); // infer(*context, stream, (void**)gpu_buffers, cpu_input_data, cpu_output_buffer, kBatchSize); infer(*context, stream, gpu_buffers, cpu_input_data, cpu_output_data, kBatchSize); // change to save into vec `cpu_output_data` auto end = std::chrono::system_clock::now(); std::cout << "inference time: " << std::chrono::duration_cast(end - start).count() << "ms" << std::endl; // postProcess cv::Mat img_1(256, 256, CV_8UC1); for (int row = 0; row < 256; row++) { for (int col = 0; col < 256; col++) { float value = cpu_output_data[row * 256 + col]; if (value < 0) // clip(0,1) value = 0; else if (value > 1) value = 1; img_1.at(row, col) = static_cast(value * 255); } } cv::Mat HeatMap, colorMap; // genHeatMap(img_batch[0], img_1, HeatMap); cv::applyColorMap(img_1, colorMap, cv::COLORMAP_JET); cv::resize(originImg_batch[i], originImg_batch[i], cv::Size(256, 256)); cv::cvtColor(originImg_batch[i], originImg_batch[i], cv::COLOR_RGB2BGR); cv::addWeighted(originImg_batch[i], 0.5, colorMap, 0.5, 0, HeatMap); // Save images for (size_t j = 0; j < img_batch.size(); j++) { cv::imwrite("_output" + img_name_batch[j], img_1); cv::imwrite("_heatmap" + img_name_batch[j], HeatMap); } } // Release stream and buffers 
cudaStreamDestroy(stream); CUDA_CHECK(cudaFree(gpu_buffers[0])); CUDA_CHECK(cudaFree(gpu_buffers[1])); // Destroy the engine context->destroy(); engine->destroy(); runtime->destroy(); return 0; } ================================================ FILE: efficient_ad/src/config.h ================================================ #pragma once /* -------------------------------------------------------- * These configs are related to tensorrt model, if these are changed, * please re-compile and re-serialize the tensorrt model. * --------------------------------------------------------*/ // For INT8, you need prepare the calibration dataset, please refer to #define USE_FP32 // set USE_INT8 or USE_FP16 or USE_FP32 // These are used to define input/output tensor names, // you can set them to whatever you want. const static char* kInputTensorName = "data"; const static char* kOutputTensorName = "prob"; constexpr static int kBatchSize = 1; // input width and height must by divisible by 32 constexpr static int kInputH = 256; constexpr static int kInputW = 256; /* -------------------------------------------------------- * These configs are NOT related to tensorrt model, if these are changed, * please re-compile, but no need to re-serialize the tensorrt model. 
* --------------------------------------------------------*/ // default GPU_id const static int kGpuId = 0; // If your image size is larger than 4096 * 3112, please increase this value const static int kMaxInputImageSize = 4096 * 3112; ================================================ FILE: efficient_ad/src/cuda_utils.h ================================================ #ifndef TRTX_CUDA_UTILS_H_ #define TRTX_CUDA_UTILS_H_ #include #ifndef CUDA_CHECK #define CUDA_CHECK(callstr) \ { \ cudaError_t error_code = callstr; \ if (error_code != cudaSuccess) { \ std::cerr << "CUDA error " << error_code << " at " << __FILE__ << ":" << __LINE__; \ assert(0); \ } \ } #endif // CUDA_CHECK #endif // TRTX_CUDA_UTILS_H_ ================================================ FILE: efficient_ad/src/logging.h ================================================ /* * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
*/ #ifndef TENSORRT_LOGGING_H #define TENSORRT_LOGGING_H #include #include #include #include #include #include #include #include "NvInferRuntimeCommon.h" #include "macros.h" using Severity = nvinfer1::ILogger::Severity; class LogStreamConsumerBuffer : public std::stringbuf { public: LogStreamConsumerBuffer(std::ostream& stream, const std::string& prefix, bool shouldLog) : mOutput(stream), mPrefix(prefix), mShouldLog(shouldLog) {} LogStreamConsumerBuffer(LogStreamConsumerBuffer&& other) : mOutput(other.mOutput) {} ~LogStreamConsumerBuffer() { // std::streambuf::pbase() gives a pointer to the beginning of the buffered part of the output sequence // std::streambuf::pptr() gives a pointer to the current position of the output sequence // if the pointer to the beginning is not equal to the pointer to the current position, // call putOutput() to log the output to the stream if (pbase() != pptr()) { putOutput(); } } // synchronizes the stream buffer and returns 0 on success // synchronizing the stream buffer consists of inserting the buffer contents into the stream, // resetting the buffer and flushing the stream virtual int sync() { putOutput(); return 0; } void putOutput() { if (mShouldLog) { // prepend timestamp std::time_t timestamp = std::time(nullptr); tm* tm_local = std::localtime(×tamp); std::cout << "["; std::cout << std::setw(2) << std::setfill('0') << 1 + tm_local->tm_mon << "/"; std::cout << std::setw(2) << std::setfill('0') << tm_local->tm_mday << "/"; std::cout << std::setw(4) << std::setfill('0') << 1900 + tm_local->tm_year << "-"; std::cout << std::setw(2) << std::setfill('0') << tm_local->tm_hour << ":"; std::cout << std::setw(2) << std::setfill('0') << tm_local->tm_min << ":"; std::cout << std::setw(2) << std::setfill('0') << tm_local->tm_sec << "] "; // std::stringbuf::str() gets the string contents of the buffer // insert the buffer contents pre-appended by the appropriate prefix into the stream mOutput << mPrefix << str(); // set the buffer to empty 
str(""); // flush the stream mOutput.flush(); } } void setShouldLog(bool shouldLog) { mShouldLog = shouldLog; } private: std::ostream& mOutput; std::string mPrefix; bool mShouldLog; }; //! //! \class LogStreamConsumerBase //! \brief Convenience object used to initialize LogStreamConsumerBuffer before std::ostream in LogStreamConsumer //! class LogStreamConsumerBase { public: LogStreamConsumerBase(std::ostream& stream, const std::string& prefix, bool shouldLog) : mBuffer(stream, prefix, shouldLog) {} protected: LogStreamConsumerBuffer mBuffer; }; //! //! \class LogStreamConsumer //! \brief Convenience object used to facilitate use of C++ stream syntax when logging messages. //! Order of base classes is LogStreamConsumerBase and then std::ostream. //! This is because the LogStreamConsumerBase class is used to initialize the LogStreamConsumerBuffer member field //! in LogStreamConsumer and then the address of the buffer is passed to std::ostream. //! This is necessary to prevent the address of an uninitialized buffer from being passed to std::ostream. //! Please do not change the order of the parent classes. //! class LogStreamConsumer : protected LogStreamConsumerBase, public std::ostream { public: //! \brief Creates a LogStreamConsumer which logs messages with level severity. //! Reportable severity determines if the messages are severe enough to be logged. 
LogStreamConsumer(Severity reportableSeverity, Severity severity) : LogStreamConsumerBase(severityOstream(severity), severityPrefix(severity), severity <= reportableSeverity), std::ostream(&mBuffer) // links the stream buffer with the stream , mShouldLog(severity <= reportableSeverity), mSeverity(severity) {} LogStreamConsumer(LogStreamConsumer&& other) : LogStreamConsumerBase(severityOstream(other.mSeverity), severityPrefix(other.mSeverity), other.mShouldLog), std::ostream(&mBuffer) // links the stream buffer with the stream , mShouldLog(other.mShouldLog), mSeverity(other.mSeverity) {} void setReportableSeverity(Severity reportableSeverity) { mShouldLog = mSeverity <= reportableSeverity; mBuffer.setShouldLog(mShouldLog); } private: static std::ostream& severityOstream(Severity severity) { return severity >= Severity::kINFO ? std::cout : std::cerr; } static std::string severityPrefix(Severity severity) { switch (severity) { case Severity::kINTERNAL_ERROR: return "[F] "; case Severity::kERROR: return "[E] "; case Severity::kWARNING: return "[W] "; case Severity::kINFO: return "[I] "; case Severity::kVERBOSE: return "[V] "; default: assert(0); return ""; } } bool mShouldLog; Severity mSeverity; }; //! \class Logger //! //! \brief Class which manages logging of TensorRT tools and samples //! //! \details This class provides a common interface for TensorRT tools and samples to log information to the console, //! and supports logging two types of messages: //! //! - Debugging messages with an associated severity (info, warning, error, or internal error/fatal) //! - Test pass/fail messages //! //! The advantage of having all samples use this class for logging as opposed to emitting directly to stdout/stderr is //! that the logic for controlling the verbosity and formatting of sample output is centralized in one location. //! //! In the future, this class could be extended to support dumping test results to a file in some standard format //! 
(for example, JUnit XML), and providing additional metadata (e.g. timing the duration of a test run). //! //! TODO: For backwards compatibility with existing samples, this class inherits directly from the nvinfer1::ILogger //! interface, which is problematic since there isn't a clean separation between messages coming from the TensorRT //! library and messages coming from the sample. //! //! In the future (once all samples are updated to use Logger::getTRTLogger() to access the ILogger) we can refactor the //! class to eliminate the inheritance and instead make the nvinfer1::ILogger implementation a member of the Logger //! object. class Logger : public nvinfer1::ILogger { public: Logger(Severity severity = Severity::kWARNING) : mReportableSeverity(severity) {} //! //! \enum TestResult //! \brief Represents the state of a given test //! enum class TestResult { kRUNNING, //!< The test is running kPASSED, //!< The test passed kFAILED, //!< The test failed kWAIVED //!< The test was waived }; //! //! \brief Forward-compatible method for retrieving the nvinfer::ILogger associated with this Logger //! \return The nvinfer1::ILogger associated with this Logger //! //! TODO Once all samples are updated to use this method to register the logger with TensorRT, //! we can eliminate the inheritance of Logger from ILogger //! nvinfer1::ILogger& getTRTLogger() { return *this; } //! //! \brief Implementation of the nvinfer1::ILogger::log() virtual method //! //! Note samples should not be calling this function directly; it will eventually go away once we eliminate the //! inheritance from nvinfer1::ILogger //! void log(Severity severity, const char* msg) TRT_NOEXCEPT override { LogStreamConsumer(mReportableSeverity, severity) << "[TRT] " << std::string(msg) << std::endl; } //! //! \brief Method for controlling the verbosity of logging output //! //! \param severity The logger will only emit messages that have severity of this level or higher. //! 
void setReportableSeverity(Severity severity) { mReportableSeverity = severity; } //! //! \brief Opaque handle that holds logging information for a particular test //! //! This object is an opaque handle to information used by the Logger to print test results. //! The sample must call Logger::defineTest() in order to obtain a TestAtom that can be used //! with Logger::reportTest{Start,End}(). //! class TestAtom { public: TestAtom(TestAtom&&) = default; private: friend class Logger; TestAtom(bool started, const std::string& name, const std::string& cmdline) : mStarted(started), mName(name), mCmdline(cmdline) {} bool mStarted; std::string mName; std::string mCmdline; }; //! //! \brief Define a test for logging //! //! \param[in] name The name of the test. This should be a string starting with //! "TensorRT" and containing dot-separated strings containing //! the characters [A-Za-z0-9_]. //! For example, "TensorRT.sample_googlenet" //! \param[in] cmdline The command line used to reproduce the test // //! \return a TestAtom that can be used in Logger::reportTest{Start,End}(). //! static TestAtom defineTest(const std::string& name, const std::string& cmdline) { return TestAtom(false, name, cmdline); } //! //! \brief A convenience overloaded version of defineTest() that accepts an array of command-line arguments //! as input //! //! \param[in] name The name of the test //! \param[in] argc The number of command-line arguments //! \param[in] argv The array of command-line arguments (given as C strings) //! //! \return a TestAtom that can be used in Logger::reportTest{Start,End}(). static TestAtom defineTest(const std::string& name, int argc, char const* const* argv) { auto cmdline = genCmdlineString(argc, argv); return defineTest(name, cmdline); } //! //! \brief Report that a test has started. //! //! \pre reportTestStart() has not been called yet for the given testAtom //! //! \param[in] testAtom The handle to the test that has started //! 
static void reportTestStart(TestAtom& testAtom) { reportTestResult(testAtom, TestResult::kRUNNING); assert(!testAtom.mStarted); testAtom.mStarted = true; } //! //! \brief Report that a test has ended. //! //! \pre reportTestStart() has been called for the given testAtom //! //! \param[in] testAtom The handle to the test that has ended //! \param[in] result The result of the test. Should be one of TestResult::kPASSED, //! TestResult::kFAILED, TestResult::kWAIVED //! static void reportTestEnd(const TestAtom& testAtom, TestResult result) { assert(result != TestResult::kRUNNING); assert(testAtom.mStarted); reportTestResult(testAtom, result); } static int reportPass(const TestAtom& testAtom) { reportTestEnd(testAtom, TestResult::kPASSED); return EXIT_SUCCESS; } static int reportFail(const TestAtom& testAtom) { reportTestEnd(testAtom, TestResult::kFAILED); return EXIT_FAILURE; } static int reportWaive(const TestAtom& testAtom) { reportTestEnd(testAtom, TestResult::kWAIVED); return EXIT_SUCCESS; } static int reportTest(const TestAtom& testAtom, bool pass) { return pass ? reportPass(testAtom) : reportFail(testAtom); } Severity getReportableSeverity() const { return mReportableSeverity; } private: //! //! \brief returns an appropriate string for prefixing a log message with the given severity //! static const char* severityPrefix(Severity severity) { switch (severity) { case Severity::kINTERNAL_ERROR: return "[F] "; case Severity::kERROR: return "[E] "; case Severity::kWARNING: return "[W] "; case Severity::kINFO: return "[I] "; case Severity::kVERBOSE: return "[V] "; default: assert(0); return ""; } } //! //! \brief returns an appropriate string for prefixing a test result message with the given result //! 
static const char* testResultString(TestResult result) {
    switch (result) {
        case TestResult::kRUNNING:
            return "RUNNING";
        case TestResult::kPASSED:
            return "PASSED";
        case TestResult::kFAILED:
            return "FAILED";
        case TestResult::kWAIVED:
            return "WAIVED";
        default:
            // Unknown result: abort in debug builds, empty string otherwise.
            assert(0);
            return "";
    }
}

//!
//! \brief returns an appropriate output stream (cout or cerr) to use with the given severity
//!
// NOTE(review): relies on the numeric ordering of the Severity enumerators, which are
// declared outside this file - presumably kINFO and kVERBOSE compare >= kINFO; confirm.
static std::ostream& severityOstream(Severity severity) { return severity >= Severity::kINFO ? std::cout : std::cerr; }

//!
//! \brief method that implements logging test results
//!
//! Writes one line of the form "&&&& <RESULT> <name> # <cmdline>" to the kINFO stream.
//!
static void reportTestResult(const TestAtom& testAtom, TestResult result) {
    severityOstream(Severity::kINFO) << "&&&& " << testResultString(result) << " " << testAtom.mName << " # "
                                     << testAtom.mCmdline << std::endl;
}

//!
//! \brief generate a command line string from the given (argc, argv) values
//!
//! Arguments are joined with single spaces; no quoting or escaping is applied.
//!
static std::string genCmdlineString(int argc, char const* const* argv) {
    std::stringstream ss;
    for (int i = 0; i < argc; i++) {
        if (i > 0)
            ss << " ";
        ss << argv[i];
    }
    return ss.str();
}

// Severity threshold stored by setReportableSeverity() and returned by getReportableSeverity().
Severity mReportableSeverity;
};

namespace {

//!
//! \brief produces a LogStreamConsumer object that can be used to log messages of severity kVERBOSE
//!
//! Example usage:
//!
//!     LOG_VERBOSE(logger) << "hello world" << std::endl;
//!
inline LogStreamConsumer LOG_VERBOSE(const Logger& logger) {
    return LogStreamConsumer(logger.getReportableSeverity(), Severity::kVERBOSE);
}

//!
//! \brief produces a LogStreamConsumer object that can be used to log messages of severity kINFO
//!
//! Example usage:
//!
//!     LOG_INFO(logger) << "hello world" << std::endl;
//!
inline LogStreamConsumer LOG_INFO(const Logger& logger) {
    return LogStreamConsumer(logger.getReportableSeverity(), Severity::kINFO);
}

//!
//! \brief produces a LogStreamConsumer object that can be used to log messages of severity kWARNING
//!
//! Example usage:
//!
//!     LOG_WARN(logger) << "hello world" << std::endl;
//!
inline LogStreamConsumer LOG_WARN(const Logger& logger) { return LogStreamConsumer(logger.getReportableSeverity(), Severity::kWARNING); } //! //! \brief produces a LogStreamConsumer object that can be used to log messages of severity kERROR //! //! Example usage: //! //! LOG_ERROR(logger) << "hello world" << std::endl; //! inline LogStreamConsumer LOG_ERROR(const Logger& logger) { return LogStreamConsumer(logger.getReportableSeverity(), Severity::kERROR); } //! //! \brief produces a LogStreamConsumer object that can be used to log messages of severity kINTERNAL_ERROR // ("fatal" severity) //! //! Example usage: //! //! LOG_FATAL(logger) << "hello world" << std::endl; //! inline LogStreamConsumer LOG_FATAL(const Logger& logger) { return LogStreamConsumer(logger.getReportableSeverity(), Severity::kINTERNAL_ERROR); } } // anonymous namespace #endif // TENSORRT_LOGGING_H ================================================ FILE: efficient_ad/src/macros.h ================================================ #ifndef __MACROS_H #define __MACROS_H #include #ifdef API_EXPORTS #if defined(_MSC_VER) #define API __declspec(dllexport) #else #define API __attribute__((visibility("default"))) #endif #else #if defined(_MSC_VER) #define API __declspec(dllimport) #else #define API #endif #endif // API_EXPORTS #if NV_TENSORRT_MAJOR >= 8 #define TRT_NOEXCEPT noexcept #define TRT_CONST_ENQUEUE const #else #define TRT_NOEXCEPT #define TRT_CONST_ENQUEUE #endif #endif // __MACROS_H ================================================ FILE: efficient_ad/src/model.cpp ================================================ #include "model.h" #include #include #include #include #include #include #include #include #include #include "config.h" using namespace nvinfer1; // TensorRT weight files have a simple space delimited format: // [type] [size] static std::map loadWeights(const std::string file) { std::cout << "Loading weights: " << file << std::endl; std::map weightMap; // Open weights file std::ifstream 
input(file); assert(input.is_open() && "Unable to load weight file. please check if the .wts file path is right!!!!!!"); // Read number of weight blobs int32_t count; input >> count; assert(count > 0 && "Invalid weight map file."); while (count--) { Weights wt{DataType::kFLOAT, nullptr, 0}; uint32_t size; // Read name and type of blob std::string name; input >> name >> std::dec >> size; wt.type = DataType::kFLOAT; // Load blob uint32_t* val = reinterpret_cast(malloc(sizeof(val) * size)); for (uint32_t x = 0, y = size; x < y; ++x) { input >> std::hex >> val[x]; } wt.values = val; wt.count = size; weightMap[name] = wt; } return weightMap; } void printNetworkLayers(INetworkDefinition* network) { int numLayers = network->getNbLayers(); // std::cout << "currently num of layers: " << numLayers << std::endl; auto dataTypeToString = [](DataType type) { switch (type) { case DataType::kFLOAT: return "kFLOAT"; case DataType::kHALF: return "kHALF"; case DataType::kINT8: return "kINT8"; case DataType::kINT32: return "kINT32"; case DataType::kBOOL: return "kBOOL"; default: return "Unknown"; } }; for (int i = 0; i < numLayers; ++i) { ILayer* layer = network->getLayer(i); std::cout << "--- Layer" << i << " = " << layer->getName() << std::endl; std::cout << "input & output tensor type: " << dataTypeToString(layer->getInput(0)->getType()) << "\t" << dataTypeToString(layer->getOutput(0)->getType()) << std::endl; // input int inTensorNum = layer->getNbInputs(); for (int j = 0; j < inTensorNum; ++j) { // std::cout << layer->getInput(j)->getDimensions().nbDims; Dims dims_in = layer->getInput(j)->getDimensions(); std::cout << "input shape[" << j << "]: ("; for (int k = 0; k < dims_in.nbDims; ++k) { std::cout << dims_in.d[k]; if (k < dims_in.nbDims - 1) { std::cout << ", "; } } std::cout << ")\t"; } std::cout << std::endl; // output int outTensorNum = layer->getNbOutputs(); for (int j = 0; j < outTensorNum; ++j) { // std::cout << layer->getOutput(j)->getName(); Dims dims_out = 
layer->getOutput(j)->getDimensions(); std::cout << "output shape: ("; for (int k = 0; k < dims_out.nbDims; ++k) { std::cout << dims_out.d[k]; if (k < dims_out.nbDims - 1) { std::cout << ", "; } } std::cout << ")"; } std::cout << "\n" << std::endl; } } static IScaleLayer* NormalizeInput(INetworkDefinition* network, ITensor& input) { float meanValues[3] = {-0.485f, -0.456f, -0.406f}; float stdValues[3] = {1.0f / 0.229f, 1.0f / 0.224f, 1.0f / 0.225f}; Weights meanWeights{DataType::kFLOAT, meanValues, 3}; Weights stdWeights{DataType::kFLOAT, stdValues, 3}; IScaleLayer* NormaLayer = network->addScale(input, ScaleMode::kCHANNEL, meanWeights, stdWeights, Weights{}); assert(NormaLayer != nullptr); return NormaLayer; } static IScaleLayer* NormalizeTeacherMap(INetworkDefinition* network, std::map& weightMap, ITensor& input) { float* mean = (float*)weightMap["mean_std.mean"].values; float* std = (float*)weightMap["mean_std.std"].values; int len = weightMap["mean_std.mean"].count; // 1.scale float* scaleVal = reinterpret_cast(malloc(sizeof(float) * len)); for (int i = 0; i < len; i++) { scaleVal[i] = 1.0 / std[i]; } Weights scale{DataType::kFLOAT, scaleVal, len}; // 2.shift float* shiftVal = nullptr; shiftVal = reinterpret_cast(malloc(sizeof(float) * len)); for (int i = 0; i < len; i++) { shiftVal[i] = -mean[i]; } Weights shift{DataType::kFLOAT, shiftVal, len}; IScaleLayer* scale_1 = network->addScale(input, ScaleMode::kCHANNEL, shift, Weights{}, Weights{}); assert(scale_1); IScaleLayer* scale_2 = network->addScale(*scale_1->getOutput(0), ScaleMode::kCHANNEL, Weights{}, scale, Weights{}); assert(scale_2); return scale_2; } static ILayer* NormalizeFinalMap(INetworkDefinition* network, std::map& weightMap, ITensor& input, std::string name) { float* qa = (float*)weightMap["quantiles.qa_" + name].values; float* qb = (float*)weightMap["quantiles.qb_" + name].values; int len = weightMap["quantiles.qa_" + name].count; Weights qbWeight_2{DataType::kFLOAT, qb, len}; // fmap_st - qa_st 
float* shiftVal_1 = nullptr; shiftVal_1 = reinterpret_cast(malloc(sizeof(float) * len)); for (int i = 0; i < len; i++) { shiftVal_1[i] = -qa[i]; } Weights qa_shiftWeight_1{DataType::kFLOAT, shiftVal_1, len}; IScaleLayer* mapNorm_subLayer_1 = network->addScale(input, ScaleMode::kUNIFORM, qa_shiftWeight_1, Weights{}, Weights{}); assert(mapNorm_subLayer_1); // qb_st - qa_st float* shiftVal_2 = nullptr; shiftVal_2 = reinterpret_cast(malloc(sizeof(float) * len)); for (int i = 0; i < len; i++) { shiftVal_2[i] = qb[i] - qa[i]; } // (fmap_st - qa_st) / (qb_st - qa_st) float* scaleVal_1 = nullptr; scaleVal_1 = reinterpret_cast(malloc(sizeof(float) * len)); for (int i = 0; i < len; i++) { scaleVal_1[i] = 1.0f / shiftVal_2[i]; } Weights scaleWeight_1{DataType::kFLOAT, scaleVal_1, len}; IScaleLayer* mapNorm_divLayer_1 = network->addScale(*mapNorm_subLayer_1->getOutput(0), ScaleMode::kUNIFORM, Weights{}, scaleWeight_1, Weights{}); assert(mapNorm_divLayer_1); // ((fmap_st - qa_st) / (qb_st - qa_st)) * 0.1 float* scaleVal_2 = nullptr; scaleVal_2 = reinterpret_cast(malloc(sizeof(float) * len)); for (int i = 0; i < len; i++) { scaleVal_2[i] = 0.1f; } Weights scaleWeight_2{DataType::kFLOAT, scaleVal_2, 1}; IScaleLayer* mapNorm_Layer = network->addScale(*mapNorm_divLayer_1->getOutput(0), ScaleMode::kUNIFORM, Weights{}, scaleWeight_2, Weights{}); assert(mapNorm_Layer); return mapNorm_Layer; } static ILayer* convRelu(INetworkDefinition* network, std::map& weightMap, ITensor& input, int outch, int ksize, int s, int p, int g, std::string lname, bool withRelu) { Weights emptywts{DataType::kFLOAT, nullptr, 0}; IConvolutionLayer* conv1 = network->addConvolutionNd( input, outch, DimsHW{ksize, ksize}, weightMap[lname + ".weight"], weightMap[lname + ".bias"]); // if without bias weights, the results won't match with torch version assert(conv1); conv1->setStrideNd(DimsHW{s, s}); conv1->setPaddingNd(DimsHW{p, p}); conv1->setNbGroups(g); conv1->setName((lname).c_str()); if (!withRelu) return 
conv1; auto relu = network->addActivation(*conv1->getOutput(0), ActivationType::kRELU); assert(relu); return relu; } static IResizeLayer* interpolate(INetworkDefinition* network, ITensor& input, Dims upsampleScale, ResizeMode resizeMode) { IResizeLayer* interpolateLayer = network->addResize(input); assert(interpolateLayer); interpolateLayer->setOutputDimensions(upsampleScale); interpolateLayer->setResizeMode(resizeMode); return interpolateLayer; } static ILayer* interpConvRelu(INetworkDefinition* network, std::map& weightMap, ITensor& input, int outch, int ksize, int s, int p, int g, std::string lname, int dim) { IResizeLayer* interpolateLayer = network->addResize(input); assert(interpolateLayer != nullptr); interpolateLayer->setOutputDimensions(Dims3{input.getDimensions().d[0], dim, dim}); interpolateLayer->setResizeMode(ResizeMode::kLINEAR); IConvolutionLayer* conv1 = network->addConvolutionNd(*interpolateLayer->getOutput(0), outch, DimsHW{ksize, ksize}, weightMap[lname + ".weight"], weightMap[lname + ".bias"]); assert(conv1); conv1->setStrideNd(DimsHW{s, s}); conv1->setPaddingNd(DimsHW{p, p}); conv1->setNbGroups(g); conv1->setName((lname + ".conv").c_str()); auto relu = network->addActivation(*conv1->getOutput(0), ActivationType::kRELU); assert(relu); return relu; } static IPoolingLayer* avgPool2d(INetworkDefinition* network, ITensor& input, int kernelSize, int stride, int padding) { IPoolingLayer* poolLayer = network->addPooling(input, PoolingType::kAVERAGE, DimsHW{kernelSize, kernelSize}); assert(poolLayer); poolLayer->setStride(DimsHW{stride, stride}); poolLayer->setPadding(DimsHW{padding, padding}); return poolLayer; } static void slice(INetworkDefinition* network, ITensor& input, std::vector& layer_vec) { Dims inputDims = input.getDimensions(); ISliceLayer* slice1 = network->addSlice(input, Dims3{0, 0, 0}, Dims3{inputDims.d[0] / 2, inputDims.d[1], inputDims.d[2]}, Dims3{1, 1, 1}); assert(slice1); ISliceLayer* slice2 = network->addSlice(input, 
Dims3{inputDims.d[0] / 2, 0, 0}, Dims3{inputDims.d[0] / 2, inputDims.d[1], inputDims.d[2]}, Dims3{1, 1, 1}); assert(slice2); layer_vec.push_back(slice1->getOutput(0)); layer_vec.push_back(slice2->getOutput(0)); } static IElementWiseLayer* mergeMap(INetworkDefinition* network, ITensor& input1, ITensor& input2) { float* scaleVal = nullptr; scaleVal = reinterpret_cast(malloc(sizeof(float) * 1)); for (int i = 0; i < 1; i++) { scaleVal[i] = 0.5f; } Weights scaleWeight{DataType::kFLOAT, scaleVal, 1}; IScaleLayer* mergeMapLayer1 = network->addScale(input1, ScaleMode::kUNIFORM, Weights{}, scaleWeight, Weights{}); assert(mergeMapLayer1); IScaleLayer* mergeMapLayer2 = network->addScale(input2, ScaleMode::kUNIFORM, Weights{}, scaleWeight, Weights{}); assert(mergeMapLayer2); IElementWiseLayer* mergedMapLayer = network->addElementWise( *mergeMapLayer1->getOutput(0), *mergeMapLayer2->getOutput(0), ElementWiseOperation::kSUM); assert(mergedMapLayer); return mergedMapLayer; } ICudaEngine* build_efficientAD_engine(unsigned int maxBatchSize, IBuilder* builder, IBuilderConfig* config, DataType dt, float& gd, float& gw, std::string& wts_name) { /* create network object */ INetworkDefinition* network = builder->createNetworkV2(0U); /* create input tensor {3, kInputH, kInputW} */ ITensor* InputData = network->addInput(kInputTensorName, dt, Dims3{3, kInputH, kInputW}); assert(InputData); /* create weight map */ std::map weightMap = loadWeights(wts_name); /* AE */ // auto BN1 = NormalizeInput(network, *InputData); // encoder auto enconv1 = convRelu(network, weightMap, *InputData, 32, 4, 2, 1, 1, "ae.encoder.enconv1", true); auto enconv2 = convRelu(network, weightMap, *enconv1->getOutput(0), 32, 4, 2, 1, 1, "ae.encoder.enconv2", true); auto enconv3 = convRelu(network, weightMap, *enconv2->getOutput(0), 64, 4, 2, 1, 1, "ae.encoder.enconv3", true); auto enconv4 = convRelu(network, weightMap, *enconv3->getOutput(0), 64, 4, 2, 1, 1, "ae.encoder.enconv4", true); auto enconv5 = convRelu(network, 
weightMap, *enconv4->getOutput(0), 64, 4, 2, 1, 1, "ae.encoder.enconv5", true); auto enconv6 = convRelu(network, weightMap, *enconv5->getOutput(0), 64, 8, 1, 0, 1, "ae.encoder.enconv6", false); // decoder auto deconv1 = interpConvRelu(network, weightMap, *enconv6->getOutput(0), 64, 4, 1, 2, 1, "ae.decoder.deconv1", 3); auto deconv2 = interpConvRelu(network, weightMap, *deconv1->getOutput(0), 64, 4, 1, 2, 1, "ae.decoder.deconv2", 8); auto deconv3 = interpConvRelu(network, weightMap, *deconv2->getOutput(0), 64, 4, 1, 2, 1, "ae.decoder.deconv3", 15); auto deconv4 = interpConvRelu(network, weightMap, *deconv3->getOutput(0), 64, 4, 1, 2, 1, "ae.decoder.deconv4", 32); auto deconv5 = interpConvRelu(network, weightMap, *deconv4->getOutput(0), 64, 4, 1, 2, 1, "ae.decoder.deconv5", 63); auto deconv6 = interpConvRelu(network, weightMap, *deconv5->getOutput(0), 64, 4, 1, 2, 1, "ae.decoder.deconv6", 127); auto deconv7 = interpConvRelu(network, weightMap, *deconv6->getOutput(0), 64, 3, 1, 1, 1, "ae.decoder.deconv7", 56); auto deconv8 = convRelu(network, weightMap, *deconv7->getOutput(0), 384, 3, 1, 1, 1, "ae.decoder.deconv8", false); /* PDN_medium_teacher */ // no BN added after the convolutional layer auto teacher1 = convRelu(network, weightMap, *InputData, 256, 4, 1, 0, 1, "teacher.conv1", true); auto avgPool1 = avgPool2d(network, *teacher1->getOutput(0), 2, 2, 0); auto teacher2 = convRelu(network, weightMap, *avgPool1->getOutput(0), 512, 4, 1, 0, 1, "teacher.conv2", true); auto avgPool2 = avgPool2d(network, *teacher2->getOutput(0), 2, 2, 0); auto teacher3 = convRelu(network, weightMap, *avgPool2->getOutput(0), 512, 1, 1, 0, 1, "teacher.conv3", true); auto teacher4 = convRelu(network, weightMap, *teacher3->getOutput(0), 512, 3, 1, 0, 1, "teacher.conv4", true); auto teacher5 = convRelu(network, weightMap, *teacher4->getOutput(0), 384, 4, 1, 0, 1, "teacher.conv5", true); auto teacher6 = convRelu(network, weightMap, *teacher5->getOutput(0), 384, 1, 1, 0, 1, "teacher.conv6", 
false); /* PDN_medium_student */ auto student1 = convRelu(network, weightMap, *InputData, 256, 4, 1, 0, 1, "student.conv1", true); auto avgPool3 = avgPool2d(network, *student1->getOutput(0), 2, 2, 0); auto student2 = convRelu(network, weightMap, *avgPool3->getOutput(0), 512, 4, 1, 0, 1, "student.conv2", true); auto avgPool4 = avgPool2d(network, *student2->getOutput(0), 2, 2, 0); auto student3 = convRelu(network, weightMap, *avgPool4->getOutput(0), 512, 1, 1, 0, 1, "student.conv3", true); auto student4 = convRelu(network, weightMap, *student3->getOutput(0), 512, 3, 1, 0, 1, "student.conv4", true); auto student5 = convRelu(network, weightMap, *student4->getOutput(0), 768, 4, 1, 0, 1, "student.conv5", true); auto student6 = convRelu(network, weightMap, *student5->getOutput(0), 768, 1, 1, 0, 1, "student.conv6", false); /* postCalculate */ auto normal_teacher_output = NormalizeTeacherMap(network, weightMap, *teacher6->getOutput(0)); std::vector layer_vec{}; slice(network, *student6->getOutput(0), layer_vec); ITensor* y_st = layer_vec[0]; ITensor* y_stae = layer_vec[1]; // distance_st IElementWiseLayer* sub_st = network->addElementWise(*normal_teacher_output->getOutput(0), *y_st, ElementWiseOperation::kSUB); assert(sub_st); IElementWiseLayer* distance_st = network->addElementWise(*sub_st->getOutput(0), *sub_st->getOutput(0), ElementWiseOperation::kPROD); assert(distance_st); // distance_stae IElementWiseLayer* sub_stae = network->addElementWise(*deconv8->getOutput(0), *y_stae, ElementWiseOperation::kSUB); assert(sub_stae); IElementWiseLayer* distance_stae = network->addElementWise(*sub_stae->getOutput(0), *sub_stae->getOutput(0), ElementWiseOperation::kPROD); assert(distance_stae); IReduceLayer* map_st = network->addReduce(*distance_st->getOutput(0), ReduceOperation::kAVG, 1, true); assert(map_st); IReduceLayer* map_stae = network->addReduce(*distance_stae->getOutput(0), ReduceOperation::kAVG, 1, true); assert(map_stae); IPaddingLayer* padMap_st = 
network->addPadding(*map_st->getOutput(0), DimsHW{4, 4}, DimsHW{4, 4}); assert(padMap_st); IPaddingLayer* padMap_stae = network->addPadding(*map_stae->getOutput(0), DimsHW{4, 4}, DimsHW{4, 4}); assert(padMap_stae); IResizeLayer* interpMap_st = interpolate(network, *padMap_st->getOutput(0), Dims3{padMap_st->getOutput(0)->getDimensions().d[0], 256, 256}, ResizeMode::kLINEAR); assert(interpMap_st); IResizeLayer* interpMap_stae = interpolate(network, *padMap_stae->getOutput(0), Dims3{padMap_stae->getOutput(0)->getDimensions().d[0], 256, 256}, ResizeMode::kLINEAR); assert(interpMap_stae); ILayer* normalizedMap_st = NormalizeFinalMap(network, weightMap, *interpMap_st->getOutput(0), "st"); assert(normalizedMap_st); ILayer* normalizedMap_stae = NormalizeFinalMap(network, weightMap, *interpMap_stae->getOutput(0), "ae"); assert(normalizedMap_stae); IElementWiseLayer* mergedMapLayer = mergeMap(network, *normalizedMap_st->getOutput(0), *normalizedMap_st->getOutput(0)); printNetworkLayers(network); /* ouput */ mergedMapLayer->getOutput(0)->setName(kOutputTensorName); network->markOutput(*mergedMapLayer->getOutput(0)); /* Engine config */ builder->setMaxBatchSize(maxBatchSize); config->setMaxWorkspaceSize(16 * (1 << 20)); // 16MB #if defined(USE_FP16) config->setFlag(BuilderFlag::kFP16); #elif defined(USE_INT8) std::cout << "Your platform support int8: " << (builder->platformHasFastInt8() ? "true" : "false") << std::endl; assert(builder->platformHasFastInt8()); config->setFlag(BuilderFlag::kINT8); Int8EntropyCalibrator2* calibrator = new Int8EntropyCalibrator2(1, kInputW, kInputH, "./coco_calib/", "int8calib.table", kInputTensorName); config->setInt8Calibrator(calibrator); #endif std::cout << "Building engine, please wait for a while..." << std::endl; ICudaEngine* engine = builder->buildEngineWithConfig(*network, *config); std::cout << "Build engine successfully!" 
<< std::endl; // Don't need the network any more network->destroy(); // Release host memory for (auto& mem : weightMap) { free((void*)(mem.second.values)); } return engine; } ================================================ FILE: efficient_ad/src/model.h ================================================ #pragma once #include #include nvinfer1::ICudaEngine* build_efficientAD_engine(unsigned int maxBatchSize, nvinfer1::IBuilder* builder, nvinfer1::IBuilderConfig* config, nvinfer1::DataType dt, float& gd, float& gw, std::string& wts_name); ================================================ FILE: efficient_ad/src/postprocess.h ================================================ #pragma once #include void genHeatMap(cv::Mat originImg, cv::Mat& anomalyGrayMap, cv::Mat& HeatMap) { cv::Mat colorMap; cv::applyColorMap(colorMap, anomalyGrayMap, cv::COLORMAP_JET); cv::addWeighted(originImg, 0.5, colorMap, 0.5, 0, HeatMap); } ================================================ FILE: efficient_ad/src/utils.h ================================================ #pragma once #include #include #include #include #include #include #include static inline int read_files_in_dir(const char* p_dir_name, std::vector& file_names) { DIR* p_dir = opendir(p_dir_name); if (p_dir == nullptr) { return -1; } struct dirent* p_file = nullptr; while ((p_file = readdir(p_dir)) != nullptr) { if (strcmp(p_file->d_name, ".") != 0 && strcmp(p_file->d_name, "..") != 0) { //std::string cur_file_name(p_dir_name); //cur_file_name += "/"; //cur_file_name += p_file->d_name; std::string cur_file_name(p_file->d_name); file_names.push_back(cur_file_name); } } closedir(p_dir); return 0; } ================================================ FILE: efficientnet/CMakeLists.txt ================================================ cmake_minimum_required(VERSION 2.6) project(efficientnet) add_definitions(-std=c++11) option(CUDA_USE_STATIC_CUDA_RUNTIME OFF) set(CMAKE_CXX_STANDARD 11) set(CMAKE_BUILD_TYPE Debug) find_package(CUDA REQUIRED) 
include_directories(${PROJECT_SOURCE_DIR}/include)
# include and link dirs of cuda and tensorrt; you need to adapt them if yours are different
# cuda
include_directories(/usr/local/cuda/include)
link_directories(/usr/local/cuda/lib64)
# tensorrt
include_directories(/usr/include/x86_64-linux-gnu/)
link_directories(/usr/lib/x86_64-linux-gnu/)

add_executable(efficientnet ${PROJECT_SOURCE_DIR}/efficientnet.cpp)
target_link_libraries(efficientnet nvinfer)
target_link_libraries(efficientnet cudart)

add_definitions(-O2 -pthread)

================================================
FILE: efficientnet/README.md
================================================
# EfficientNet

A TensorRT implementation of EfficientNet.
For the PyTorch implementation, you can refer to [EfficientNet-PyTorch](https://github.com/lukemelas/EfficientNet-PyTorch)

## How to run

1. install `efficientnet_pytorch`
```
pip install efficientnet_pytorch
```

2. generate the `.wts` file
```
python gen_wts.py
```

3. build
```
mkdir build
cd build
cmake ..
make
```

4. serialize the model to an engine file
```
./efficientnet -s [.wts] [.engine] [b0 b1 b2 b3 ... b7]  // serialize model to engine file
```
such as
```
./efficientnet -s ../efficientnet-b3.wts efficientnet-b3.engine b3
```

5. deserialize the engine and run inference
```
./efficientnet -d [.engine] [b0 b1 b2 b3 ... b7]   // deserialize engine file and run inference
```
such as
```
./efficientnet -d efficientnet-b3.engine b3
```

6. 
see if the output is same as pytorch side For more models, please refer to [tensorrtx](https://github.com/wang-xinyu/tensorrtx) ================================================ FILE: efficientnet/efficientnet.cpp ================================================ #include "NvInfer.h" #include "cuda_runtime_api.h" #include "logging.h" #include #include #include #include #include #include #include "utils.hpp" #define USE_FP32 //USE_FP16 #define INPUT_NAME "data" #define OUTPUT_NAME "prob" #define MAX_BATCH_SIZE 8 using namespace nvinfer1; static Logger gLogger; static std::vector block_args_list = { BlockArgs{1, 3, 1, 1, 32, 16, 0.25, true}, BlockArgs{2, 3, 2, 6, 16, 24, 0.25, true}, BlockArgs{2, 5, 2, 6, 24, 40, 0.25, true}, BlockArgs{3, 3, 2, 6, 40, 80, 0.25, true}, BlockArgs{3, 5, 1, 6, 80, 112, 0.25, true}, BlockArgs{4, 5, 2, 6, 112, 192, 0.25, true}, BlockArgs{1, 3, 1, 6, 192, 320, 0.25, true}}; static std::map global_params_map = { // input_h,input_w,num_classes,batch_norm_epsilon, // width_coefficient,depth_coefficient,depth_divisor, min_depth {"b0", GlobalParams{224, 224, 1000, 0.001, 1.0, 1.0, 8, -1}}, {"b1", GlobalParams{240, 240, 1000, 0.001, 1.0, 1.1, 8, -1}}, {"b2", GlobalParams{260, 260, 1000, 0.001, 1.1, 1.2, 8, -1}}, {"b3", GlobalParams{300, 300, 1000, 0.001, 1.2, 1.4, 8, -1}}, {"b4", GlobalParams{380, 380, 1000, 0.001, 1.4, 1.8, 8, -1}}, {"b5", GlobalParams{456, 456, 1000, 0.001, 1.6, 2.2, 8, -1}}, {"b6", GlobalParams{528, 528, 1000, 0.001, 1.8, 2.6, 8, -1}}, {"b7", GlobalParams{600, 600, 1000, 0.001, 2.0, 3.1, 8, -1}}, {"b8", GlobalParams{672, 672, 1000, 0.001, 2.2, 3.6, 8, -1}}, {"l2", GlobalParams{800, 800, 1000, 0.001, 4.3, 5.3, 8, -1}}, }; ICudaEngine *createEngine(unsigned int maxBatchSize, IBuilder *builder, IBuilderConfig *config, DataType dt, std::string path_wts, std::vector block_args_list, GlobalParams global_params) { float bn_eps = global_params.batch_norm_epsilon; DimsHW image_size = DimsHW{global_params.input_h, global_params.input_w}; 
std::map weightMap = loadWeights(path_wts); Weights emptywts{DataType::kFLOAT, nullptr, 0}; INetworkDefinition *network = builder->createNetworkV2(0U); ITensor *data = network->addInput(INPUT_NAME, dt, Dims3{3, global_params.input_h, global_params.input_w}); assert(data); int out_channels = roundFilters(32, global_params); auto conv_stem = addSamePaddingConv2d(network, weightMap, *data, out_channels, 3, 2, 1, 1, image_size, "_conv_stem"); auto bn0 = addBatchNorm2d(network, weightMap, *conv_stem->getOutput(0), "_bn0", bn_eps); auto swish0 = addSwish(network, *bn0->getOutput(0)); ITensor *x = swish0->getOutput(0); image_size = calculateOutputImageSize(image_size, 2); int block_id = 0; for (int i = 0; i < block_args_list.size(); i++) { BlockArgs block_args = block_args_list[i]; block_args.input_filters = roundFilters(block_args.input_filters, global_params); block_args.output_filters = roundFilters(block_args.output_filters, global_params); block_args.num_repeat = roundRepeats(block_args.num_repeat, global_params); x = MBConvBlock(network, weightMap, *x, "_blocks." + std::to_string(block_id), block_args, global_params, image_size); assert(x); block_id++; image_size = calculateOutputImageSize(image_size, block_args.stride); if (block_args.num_repeat > 1) { block_args.input_filters = block_args.output_filters; block_args.stride = 1; } for (int r = 0; r < block_args.num_repeat - 1; r++) { x = MBConvBlock(network, weightMap, *x, "_blocks." 
+ std::to_string(block_id), block_args, global_params, image_size); block_id++; } } out_channels = roundFilters(1280, global_params); auto conv_head = addSamePaddingConv2d(network, weightMap, *x, out_channels, 1, 1, 1, 1, image_size, "_conv_head", false); auto bn1 = addBatchNorm2d(network, weightMap, *conv_head->getOutput(0), "_bn1", bn_eps); auto swish1 = addSwish(network, *bn1->getOutput(0)); auto avg_pool = network->addPoolingNd(*swish1->getOutput(0), PoolingType::kAVERAGE, image_size); IFullyConnectedLayer *final = network->addFullyConnected(*avg_pool->getOutput(0), global_params.num_classes, weightMap["_fc.weight"], weightMap["_fc.bias"]); assert(final); final->getOutput(0)->setName(OUTPUT_NAME); network->markOutput(*final->getOutput(0)); // Build engine builder->setMaxBatchSize(maxBatchSize); config->setMaxWorkspaceSize(1 << 20); #ifdef USE_FP16 config->setFlag(BuilderFlag::kFP16); #endif std::cout << "build engine ..." << std::endl; ICudaEngine *engine = builder->buildEngineWithConfig(*network, *config); assert(engine != nullptr); std::cout << "build finished" << std::endl; // Don't need the network any more network->destroy(); // Release host memory for (auto &mem : weightMap) { free((void *)(mem.second.values)); } return engine; } void APIToModel(unsigned int maxBatchSize, IHostMemory **modelStream, std::string wtsPath, std::vector block_args_list, GlobalParams global_params) { // Create builder IBuilder *builder = createInferBuilder(gLogger); IBuilderConfig *config = builder->createBuilderConfig(); // Create model to populate the network, then set the outputs and create an engine ICudaEngine *engine = createEngine(maxBatchSize, builder, config, DataType::kFLOAT, wtsPath, block_args_list, global_params); assert(engine != nullptr); // Serialize the engine (*modelStream) = engine->serialize(); // Close everything down engine->destroy(); builder->destroy(); config->destroy(); } void doInference(IExecutionContext &context, float *input, float *output, int 
batchSize, GlobalParams global_params) { const ICudaEngine &engine = context.getEngine(); // Pointers to input and output device buffers to pass to engine. // Engine requires exactly IEngine::getNbBindings() number of buffers. assert(engine.getNbBindings() == 2); void *buffers[2]; // In order to bind the buffers, we need to know the names of the input and output tensors. // Note that indices are guaranteed to be less than IEngine::getNbBindings() const int inputIndex = engine.getBindingIndex(INPUT_NAME); const int outputIndex = engine.getBindingIndex(OUTPUT_NAME); // Create GPU buffers on device CHECK(cudaMalloc(&buffers[inputIndex], batchSize * 3 * global_params.input_h * global_params.input_w * sizeof(float))); CHECK(cudaMalloc(&buffers[outputIndex], batchSize * global_params.num_classes * sizeof(float))); // Create stream cudaStream_t stream; CHECK(cudaStreamCreate(&stream)); // DMA input batch data to device, infer on the batch asynchronously, and DMA output back to host CHECK(cudaMemcpyAsync(buffers[inputIndex], input, batchSize * 3 * global_params.input_h * global_params.input_w * sizeof(float), cudaMemcpyHostToDevice, stream)); context.enqueue(batchSize, buffers, stream, nullptr); CHECK(cudaMemcpyAsync(output, buffers[outputIndex], batchSize * global_params.num_classes * sizeof(float), cudaMemcpyDeviceToHost, stream)); cudaStreamSynchronize(stream); // Release stream and buffers cudaStreamDestroy(stream); CHECK(cudaFree(buffers[inputIndex])); CHECK(cudaFree(buffers[outputIndex])); } bool parse_args(int argc, char **argv, std::string &wts, std::string &engine, std::string &backbone) { if (std::string(argv[1]) == "-s" && argc == 5) { wts = std::string(argv[2]); engine = std::string(argv[3]); backbone = std::string(argv[4]); } else if (std::string(argv[1]) == "-d" && argc == 4) { engine = std::string(argv[2]); backbone = std::string(argv[3]); } else { return false; } return true; } int main(int argc, char **argv) { std::string wtsPath = ""; std::string 
engine_name = ""; std::string backbone = ""; if (!parse_args(argc, argv, wtsPath, engine_name, backbone)) { std::cerr << "arguments not right!" << std::endl; std::cerr << "./efficientnet -s [.wts] [.engine] [b0 b1 b2 b3 ... b7] // serialize model to engine file" << std::endl; std::cerr << "./efficientnet -d [.engine] [b0 b1 b2 b3 ... b7] // deserialize engine file and run inference" << std::endl; return -1; } GlobalParams global_params = global_params_map[backbone]; // create a model using the API directly and serialize it to a stream if (!wtsPath.empty()) { IHostMemory *modelStream{nullptr}; APIToModel(MAX_BATCH_SIZE, &modelStream, wtsPath, block_args_list, global_params); assert(modelStream != nullptr); std::ofstream p(engine_name, std::ios::binary); if (!p) { std::cerr << "could not open plan output file" << std::endl; return -1; } p.write(reinterpret_cast(modelStream->data()), modelStream->size()); modelStream->destroy(); return 1; } char *trtModelStream{nullptr}; size_t size{0}; std::ifstream file(engine_name, std::ios::binary); if (file.good()) { file.seekg(0, file.end); size = file.tellg(); file.seekg(0, file.beg); trtModelStream = new char[size]; assert(trtModelStream); file.read(trtModelStream, size); file.close(); } else { std::cerr << "could not open plan file" << std::endl; return -1; } // dummy input float *data = new float[3 * global_params.input_h * global_params.input_w]; for (int i = 0; i < 3 * global_params.input_h * global_params.input_w; i++) data[i] = 0.1; IRuntime *runtime = createInferRuntime(gLogger); assert(runtime != nullptr); ICudaEngine *engine = runtime->deserializeCudaEngine(trtModelStream, size, nullptr); assert(engine != nullptr); IExecutionContext *context = engine->createExecutionContext(); assert(context != nullptr); delete[] trtModelStream; // Run inference float *prob = new float[global_params.num_classes]; for (int i = 0; i < 100; i++) { auto start = std::chrono::system_clock::now(); doInference(*context, data, prob, 1, 
global_params); auto end = std::chrono::system_clock::now(); std::cout << std::chrono::duration_cast(end - start).count() << "ms" << std::endl; } for (unsigned int i = 0; i < 20; i++) { std::cout << prob[i] << ", "; } std::cout << std::endl; // Destroy the engine context->destroy(); engine->destroy(); runtime->destroy(); delete data; delete prob; return 0; } ================================================ FILE: efficientnet/gen_wts.py ================================================ import torch import struct from efficientnet_pytorch import EfficientNet model = EfficientNet.from_pretrained('efficientnet-b3') model.eval() f = open('efficientnet-b3.wts', 'w') f.write('{}\n'.format(len(model.state_dict().keys()))) for k, v in model.state_dict().items(): vr = v.reshape(-1).cpu().numpy() f.write('{} {} '.format(k, len(vr))) for vv in vr: f.write(' ') f.write(struct.pack('>f',float(vv)).hex()) f.write('\n') f.close() ================================================ FILE: efficientnet/logging.h ================================================ /* * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
*/ #ifndef TENSORRT_LOGGING_H #define TENSORRT_LOGGING_H #include "NvInferRuntimeCommon.h" #include #include #include #include #include #include #include using Severity = nvinfer1::ILogger::Severity; class LogStreamConsumerBuffer : public std::stringbuf { public: LogStreamConsumerBuffer(std::ostream& stream, const std::string& prefix, bool shouldLog) : mOutput(stream) , mPrefix(prefix) , mShouldLog(shouldLog) { } LogStreamConsumerBuffer(LogStreamConsumerBuffer&& other) : mOutput(other.mOutput) { } ~LogStreamConsumerBuffer() { // std::streambuf::pbase() gives a pointer to the beginning of the buffered part of the output sequence // std::streambuf::pptr() gives a pointer to the current position of the output sequence // if the pointer to the beginning is not equal to the pointer to the current position, // call putOutput() to log the output to the stream if (pbase() != pptr()) { putOutput(); } } // synchronizes the stream buffer and returns 0 on success // synchronizing the stream buffer consists of inserting the buffer contents into the stream, // resetting the buffer and flushing the stream virtual int sync() { putOutput(); return 0; } void putOutput() { if (mShouldLog) { // prepend timestamp std::time_t timestamp = std::time(nullptr); tm* tm_local = std::localtime(×tamp); std::cout << "["; std::cout << std::setw(2) << std::setfill('0') << 1 + tm_local->tm_mon << "/"; std::cout << std::setw(2) << std::setfill('0') << tm_local->tm_mday << "/"; std::cout << std::setw(4) << std::setfill('0') << 1900 + tm_local->tm_year << "-"; std::cout << std::setw(2) << std::setfill('0') << tm_local->tm_hour << ":"; std::cout << std::setw(2) << std::setfill('0') << tm_local->tm_min << ":"; std::cout << std::setw(2) << std::setfill('0') << tm_local->tm_sec << "] "; // std::stringbuf::str() gets the string contents of the buffer // insert the buffer contents pre-appended by the appropriate prefix into the stream mOutput << mPrefix << str(); // set the buffer to empty str(""); // flush 
the stream mOutput.flush(); } } void setShouldLog(bool shouldLog) { mShouldLog = shouldLog; } private: std::ostream& mOutput; std::string mPrefix; bool mShouldLog; }; //! //! \class LogStreamConsumerBase //! \brief Convenience object used to initialize LogStreamConsumerBuffer before std::ostream in LogStreamConsumer //! class LogStreamConsumerBase { public: LogStreamConsumerBase(std::ostream& stream, const std::string& prefix, bool shouldLog) : mBuffer(stream, prefix, shouldLog) { } protected: LogStreamConsumerBuffer mBuffer; }; //! //! \class LogStreamConsumer //! \brief Convenience object used to facilitate use of C++ stream syntax when logging messages. //! Order of base classes is LogStreamConsumerBase and then std::ostream. //! This is because the LogStreamConsumerBase class is used to initialize the LogStreamConsumerBuffer member field //! in LogStreamConsumer and then the address of the buffer is passed to std::ostream. //! This is necessary to prevent the address of an uninitialized buffer from being passed to std::ostream. //! Please do not change the order of the parent classes. //! class LogStreamConsumer : protected LogStreamConsumerBase, public std::ostream { public: //! \brief Creates a LogStreamConsumer which logs messages with level severity. //! Reportable severity determines if the messages are severe enough to be logged. 
LogStreamConsumer(Severity reportableSeverity, Severity severity) : LogStreamConsumerBase(severityOstream(severity), severityPrefix(severity), severity <= reportableSeverity) , std::ostream(&mBuffer) // links the stream buffer with the stream , mShouldLog(severity <= reportableSeverity) , mSeverity(severity) { } LogStreamConsumer(LogStreamConsumer&& other) : LogStreamConsumerBase(severityOstream(other.mSeverity), severityPrefix(other.mSeverity), other.mShouldLog) , std::ostream(&mBuffer) // links the stream buffer with the stream , mShouldLog(other.mShouldLog) , mSeverity(other.mSeverity) { } void setReportableSeverity(Severity reportableSeverity) { mShouldLog = mSeverity <= reportableSeverity; mBuffer.setShouldLog(mShouldLog); } private: static std::ostream& severityOstream(Severity severity) { return severity >= Severity::kINFO ? std::cout : std::cerr; } static std::string severityPrefix(Severity severity) { switch (severity) { case Severity::kINTERNAL_ERROR: return "[F] "; case Severity::kERROR: return "[E] "; case Severity::kWARNING: return "[W] "; case Severity::kINFO: return "[I] "; case Severity::kVERBOSE: return "[V] "; default: assert(0); return ""; } } bool mShouldLog; Severity mSeverity; }; //! \class Logger //! //! \brief Class which manages logging of TensorRT tools and samples //! //! \details This class provides a common interface for TensorRT tools and samples to log information to the console, //! and supports logging two types of messages: //! //! - Debugging messages with an associated severity (info, warning, error, or internal error/fatal) //! - Test pass/fail messages //! //! The advantage of having all samples use this class for logging as opposed to emitting directly to stdout/stderr is //! that the logic for controlling the verbosity and formatting of sample output is centralized in one location. //! //! In the future, this class could be extended to support dumping test results to a file in some standard format //! 
(for example, JUnit XML), and providing additional metadata (e.g. timing the duration of a test run). //! //! TODO: For backwards compatibility with existing samples, this class inherits directly from the nvinfer1::ILogger //! interface, which is problematic since there isn't a clean separation between messages coming from the TensorRT //! library and messages coming from the sample. //! //! In the future (once all samples are updated to use Logger::getTRTLogger() to access the ILogger) we can refactor the //! class to eliminate the inheritance and instead make the nvinfer1::ILogger implementation a member of the Logger //! object. class Logger : public nvinfer1::ILogger { public: Logger(Severity severity = Severity::kWARNING) : mReportableSeverity(severity) { } //! //! \enum TestResult //! \brief Represents the state of a given test //! enum class TestResult { kRUNNING, //!< The test is running kPASSED, //!< The test passed kFAILED, //!< The test failed kWAIVED //!< The test was waived }; //! //! \brief Forward-compatible method for retrieving the nvinfer::ILogger associated with this Logger //! \return The nvinfer1::ILogger associated with this Logger //! //! TODO Once all samples are updated to use this method to register the logger with TensorRT, //! we can eliminate the inheritance of Logger from ILogger //! nvinfer1::ILogger& getTRTLogger() { return *this; } //! //! \brief Implementation of the nvinfer1::ILogger::log() virtual method //! //! Note samples should not be calling this function directly; it will eventually go away once we eliminate the //! inheritance from nvinfer1::ILogger //! void log(Severity severity, const char* msg) override { LogStreamConsumer(mReportableSeverity, severity) << "[TRT] " << std::string(msg) << std::endl; } //! //! \brief Method for controlling the verbosity of logging output //! //! \param severity The logger will only emit messages that have severity of this level or higher. //! 
void setReportableSeverity(Severity severity) { mReportableSeverity = severity; } //! //! \brief Opaque handle that holds logging information for a particular test //! //! This object is an opaque handle to information used by the Logger to print test results. //! The sample must call Logger::defineTest() in order to obtain a TestAtom that can be used //! with Logger::reportTest{Start,End}(). //! class TestAtom { public: TestAtom(TestAtom&&) = default; private: friend class Logger; TestAtom(bool started, const std::string& name, const std::string& cmdline) : mStarted(started) , mName(name) , mCmdline(cmdline) { } bool mStarted; std::string mName; std::string mCmdline; }; //! //! \brief Define a test for logging //! //! \param[in] name The name of the test. This should be a string starting with //! "TensorRT" and containing dot-separated strings containing //! the characters [A-Za-z0-9_]. //! For example, "TensorRT.sample_googlenet" //! \param[in] cmdline The command line used to reproduce the test // //! \return a TestAtom that can be used in Logger::reportTest{Start,End}(). //! static TestAtom defineTest(const std::string& name, const std::string& cmdline) { return TestAtom(false, name, cmdline); } //! //! \brief A convenience overloaded version of defineTest() that accepts an array of command-line arguments //! as input //! //! \param[in] name The name of the test //! \param[in] argc The number of command-line arguments //! \param[in] argv The array of command-line arguments (given as C strings) //! //! \return a TestAtom that can be used in Logger::reportTest{Start,End}(). static TestAtom defineTest(const std::string& name, int argc, char const* const* argv) { auto cmdline = genCmdlineString(argc, argv); return defineTest(name, cmdline); } //! //! \brief Report that a test has started. //! //! \pre reportTestStart() has not been called yet for the given testAtom //! //! \param[in] testAtom The handle to the test that has started //! 
static void reportTestStart(TestAtom& testAtom) { reportTestResult(testAtom, TestResult::kRUNNING); assert(!testAtom.mStarted); testAtom.mStarted = true; } //! //! \brief Report that a test has ended. //! //! \pre reportTestStart() has been called for the given testAtom //! //! \param[in] testAtom The handle to the test that has ended //! \param[in] result The result of the test. Should be one of TestResult::kPASSED, //! TestResult::kFAILED, TestResult::kWAIVED //! static void reportTestEnd(const TestAtom& testAtom, TestResult result) { assert(result != TestResult::kRUNNING); assert(testAtom.mStarted); reportTestResult(testAtom, result); } static int reportPass(const TestAtom& testAtom) { reportTestEnd(testAtom, TestResult::kPASSED); return EXIT_SUCCESS; } static int reportFail(const TestAtom& testAtom) { reportTestEnd(testAtom, TestResult::kFAILED); return EXIT_FAILURE; } static int reportWaive(const TestAtom& testAtom) { reportTestEnd(testAtom, TestResult::kWAIVED); return EXIT_SUCCESS; } static int reportTest(const TestAtom& testAtom, bool pass) { return pass ? reportPass(testAtom) : reportFail(testAtom); } Severity getReportableSeverity() const { return mReportableSeverity; } private: //! //! \brief returns an appropriate string for prefixing a log message with the given severity //! static const char* severityPrefix(Severity severity) { switch (severity) { case Severity::kINTERNAL_ERROR: return "[F] "; case Severity::kERROR: return "[E] "; case Severity::kWARNING: return "[W] "; case Severity::kINFO: return "[I] "; case Severity::kVERBOSE: return "[V] "; default: assert(0); return ""; } } //! //! \brief returns an appropriate string for prefixing a test result message with the given result //! 
static const char* testResultString(TestResult result) { switch (result) { case TestResult::kRUNNING: return "RUNNING"; case TestResult::kPASSED: return "PASSED"; case TestResult::kFAILED: return "FAILED"; case TestResult::kWAIVED: return "WAIVED"; default: assert(0); return ""; } } //! //! \brief returns an appropriate output stream (cout or cerr) to use with the given severity //! static std::ostream& severityOstream(Severity severity) { return severity >= Severity::kINFO ? std::cout : std::cerr; } //! //! \brief method that implements logging test results //! static void reportTestResult(const TestAtom& testAtom, TestResult result) { severityOstream(Severity::kINFO) << "&&&& " << testResultString(result) << " " << testAtom.mName << " # " << testAtom.mCmdline << std::endl; } //! //! \brief generate a command line string from the given (argc, argv) values //! static std::string genCmdlineString(int argc, char const* const* argv) { std::stringstream ss; for (int i = 0; i < argc; i++) { if (i > 0) ss << " "; ss << argv[i]; } return ss.str(); } Severity mReportableSeverity; }; namespace { //! //! \brief produces a LogStreamConsumer object that can be used to log messages of severity kVERBOSE //! //! Example usage: //! //! LOG_VERBOSE(logger) << "hello world" << std::endl; //! inline LogStreamConsumer LOG_VERBOSE(const Logger& logger) { return LogStreamConsumer(logger.getReportableSeverity(), Severity::kVERBOSE); } //! //! \brief produces a LogStreamConsumer object that can be used to log messages of severity kINFO //! //! Example usage: //! //! LOG_INFO(logger) << "hello world" << std::endl; //! inline LogStreamConsumer LOG_INFO(const Logger& logger) { return LogStreamConsumer(logger.getReportableSeverity(), Severity::kINFO); } //! //! \brief produces a LogStreamConsumer object that can be used to log messages of severity kWARNING //! //! Example usage: //! //! LOG_WARN(logger) << "hello world" << std::endl; //! 
inline LogStreamConsumer LOG_WARN(const Logger& logger) { return LogStreamConsumer(logger.getReportableSeverity(), Severity::kWARNING); } //! //! \brief produces a LogStreamConsumer object that can be used to log messages of severity kERROR //! //! Example usage: //! //! LOG_ERROR(logger) << "hello world" << std::endl; //! inline LogStreamConsumer LOG_ERROR(const Logger& logger) { return LogStreamConsumer(logger.getReportableSeverity(), Severity::kERROR); } //! //! \brief produces a LogStreamConsumer object that can be used to log messages of severity kINTERNAL_ERROR // ("fatal" severity) //! //! Example usage: //! //! LOG_FATAL(logger) << "hello world" << std::endl; //! inline LogStreamConsumer LOG_FATAL(const Logger& logger) { return LogStreamConsumer(logger.getReportableSeverity(), Severity::kINTERNAL_ERROR); } } // anonymous namespace #endif // TENSORRT_LOGGING_H ================================================ FILE: efficientnet/utils.hpp ================================================ #include "NvInfer.h" #include "cuda_runtime_api.h" #include "logging.h" #include #include #include using namespace nvinfer1; #define CHECK(status) \ do \ { \ auto ret = (status); \ if (ret != 0) \ { \ std::cerr << "Cuda failure: " << ret << std::endl; \ abort(); \ } \ } while (0) // Load weights from files shared with TensorRT samples. 
// TensorRT weight files have a simple space delimited format: // [type] [size] std::map loadWeights(const std::string file) { std::cout << "Loading weights: " << file << std::endl; std::map weightMap; // Open weights file std::ifstream input(file); assert(input.is_open() && "Unable to load weight file."); // Read number of weight blobs int32_t count; input >> count; assert(count > 0 && "Invalid weight map file."); while (count--) { Weights wt{DataType::kFLOAT, nullptr, 0}; uint32_t size; // Read name and type of blob std::string name; input >> name >> std::dec >> size; wt.type = DataType::kFLOAT; // Load blob uint32_t *val = reinterpret_cast(malloc(sizeof(val) * size)); for (uint32_t x = 0, y = size; x < y; ++x) { input >> std::hex >> val[x]; } wt.values = val; wt.count = size; weightMap[name] = wt; } return weightMap; } struct BlockArgs { int num_repeat; int kernel_size; int stride; float expand_ratio; int input_filters; int output_filters; float se_ratio; bool id_skip; }; struct GlobalParams { int input_h; int input_w; int num_classes; float batch_norm_epsilon; float width_coefficient; float depth_coefficient; int depth_divisor; int min_depth; }; int roundFilters(int filters, GlobalParams global_params) { float multiplier = global_params.width_coefficient; int divisor = global_params.depth_divisor; int min_depth = global_params.min_depth; filters = int(filters * multiplier); if (min_depth < 0) { min_depth = divisor; } // follow the formula transferred from official TensorFlow implementation int new_filters = std::max(min_depth, int(int(filters + divisor / 2) / divisor) * divisor); if (new_filters < 0.9 * filters) // prevent rounding by more than 10% new_filters += divisor; return int(new_filters); } DimsHW calculateOutputImageSize(DimsHW image_size, int stride) { int image_h = int(ceil(float(image_size.h()) / float(stride))); int image_w = int(ceil(float(image_size.w()) / float(stride))); return DimsHW{image_h, image_w}; } int roundRepeats(int repeats, 
GlobalParams global_params) { float multiplier = global_params.depth_coefficient; // follow the formula transferred from official TensorFlow implementation int new_repeats = int(ceil(multiplier * repeats)); return new_repeats; } IScaleLayer *addBatchNorm2d(INetworkDefinition *network, std::map &weightMap, ITensor &input, std::string lname, float eps) { float *gamma = (float *)weightMap[lname + ".weight"].values; float *beta = (float *)weightMap[lname + ".bias"].values; float *mean = (float *)weightMap[lname + ".running_mean"].values; float *var = (float *)weightMap[lname + ".running_var"].values; int len = weightMap[lname + ".running_var"].count; float *scval = reinterpret_cast(malloc(sizeof(float) * len)); for (int i = 0; i < len; i++) { scval[i] = gamma[i] / sqrt(var[i] + eps); } Weights scale{DataType::kFLOAT, scval, len}; float *shval = reinterpret_cast(malloc(sizeof(float) * len)); for (int i = 0; i < len; i++) { shval[i] = beta[i] - mean[i] * gamma[i] / sqrt(var[i] + eps); } Weights shift{DataType::kFLOAT, shval, len}; float *pval = reinterpret_cast(malloc(sizeof(float) * len)); for (int i = 0; i < len; i++) { pval[i] = 1.0; } Weights power{DataType::kFLOAT, pval, len}; weightMap[lname + ".scale"] = scale; weightMap[lname + ".shift"] = shift; weightMap[lname + ".power"] = power; IScaleLayer *scale_1 = network->addScale(input, ScaleMode::kCHANNEL, shift, scale, power); assert(scale_1); return scale_1; } IConvolutionLayer *addSamePaddingConv2d(INetworkDefinition *network, std::map &weightMap, ITensor &input, int outch, int kernel_size, int stride, int dilation, int groups, DimsHW image_size, std::string lname, bool bias = true) { int ih = image_size.h(); int iw = image_size.w(); int kh = kernel_size; int kw = kernel_size; int sh = stride; int sw = stride; int oh = ceil(float(ih) / float(sh)); int ow = ceil(float(iw) / float(sw)); int pad_h = std::max((oh - 1) * stride + (kh - 1) * dilation + 1 - ih, 0); int pad_w = std::max((ow - 1) * stride + (kw - 1) * 
dilation + 1 - iw, 0); int pad_left = 0; int pad_right = 0; int pad_top = 0; int pad_bottom = 0; if (pad_h > 0 || pad_w > 0) { pad_left = int(pad_w / 2); pad_right = pad_w - int(pad_w / 2); pad_top = int(pad_h / 2); pad_bottom = pad_h - int(pad_h / 2); } Weights bias_wt{DataType::kFLOAT, nullptr, 0}; if (bias) { bias_wt = weightMap[lname + ".bias"]; } IConvolutionLayer *conv = network->addConvolutionNd(input, outch, DimsHW{kh, kw}, weightMap[lname + ".weight"], bias_wt); conv->setPrePadding(DimsHW{pad_top, pad_left}); conv->setPostPadding(DimsHW{pad_bottom, pad_right}); conv->setStrideNd(DimsHW{stride, stride}); conv->setDilationNd(DimsHW{dilation, dilation}); conv->setNbGroups(groups); return conv; } ILayer *addSwish(INetworkDefinition *network, ITensor &input) { //swish auto *sigmoid = network->addActivation(input, ActivationType::kSIGMOID); auto *ew = network->addElementWise(input, *sigmoid->getOutput(0), ElementWiseOperation::kPROD); return ew; } ITensor *MBConvBlock(INetworkDefinition *network, std::map &weightMap, ITensor &input, std::string lname, BlockArgs block_args, GlobalParams global_params, DimsHW image_size) { bool has_se = block_args.se_ratio > 0 && block_args.se_ratio <= 1; bool id_skip = block_args.id_skip; float bn_eps = global_params.batch_norm_epsilon; int input_filters = block_args.input_filters; int output_filters = block_args.output_filters; Weights emptywts{DataType::kFLOAT, nullptr, 0}; ITensor *x = &input; int inp = block_args.input_filters; int oup = int(block_args.input_filters * block_args.expand_ratio); // expand_ratio != 1 if (fabs(block_args.expand_ratio - 1) > 1e-5) { auto expand_conv = addSamePaddingConv2d(network, weightMap, input, oup, 1, 1, 1, 1, image_size, lname + "._expand_conv"); auto bn0 = addBatchNorm2d(network, weightMap, *expand_conv->getOutput(0), lname + "._bn0", bn_eps); auto swish0 = addSwish(network, *bn0->getOutput(0)); x = swish0->getOutput(0); } int k = block_args.kernel_size; int s = block_args.stride; auto 
depthwise_conv = addSamePaddingConv2d(network, weightMap, *x, oup, k, s, 1, oup, image_size, lname + "._depthwise_conv", false); auto bn1 = addBatchNorm2d(network, weightMap, *depthwise_conv->getOutput(0), lname + "._bn1", bn_eps); //swish auto swish1 = addSwish(network, *bn1->getOutput(0)); x = swish1->getOutput(0); image_size = calculateOutputImageSize(image_size, s); if (has_se) { auto avg_pool = network->addPoolingNd(*x, PoolingType::kAVERAGE, image_size); int num_squeezed_channels = std::max(1, int(input_filters * block_args.se_ratio)); auto se_reduce = addSamePaddingConv2d(network, weightMap, *avg_pool->getOutput(0), num_squeezed_channels, 1, 1, 1, 1, DimsHW{1, 1}, lname + "._se_reduce"); auto swish2 = addSwish(network, *se_reduce->getOutput(0)); auto se_expand = addSamePaddingConv2d(network, weightMap, *swish2->getOutput(0), oup, 1, 1, 1, 1, DimsHW{1, 1}, lname + "._se_expand"); auto *sigmoid = network->addActivation(*se_expand->getOutput(0), ActivationType::kSIGMOID); auto *ew = network->addElementWise(*x, *sigmoid->getOutput(0), ElementWiseOperation::kPROD); x = ew->getOutput(0); } int final_oup = block_args.output_filters; auto project_conv = addSamePaddingConv2d(network, weightMap, *x, final_oup, 1, 1, 1, 1, image_size, lname + "._project_conv"); auto bn2 = addBatchNorm2d(network, weightMap, *project_conv->getOutput(0), lname + "._bn2", bn_eps); x = bn2->getOutput(0); if (id_skip && block_args.stride == 1 && input_filters == output_filters) { auto *ew = network->addElementWise(input, *x, ElementWiseOperation::kSUM); x = ew->getOutput(0); } return x; } ================================================ FILE: ghostnet/README.md ================================================ # GhostNet GhostNetv1 architecture is from the paper "GhostNet: More Features from Cheap Operations" [(https://arxiv.org/abs/1911.11907)](https://arxiv.org/abs/1911.11907). 
GhostNetv2 architecture is from the paper "GhostNetV2: Enhance Cheap Operation with Long-Range Attention" [(https://arxiv.org/abs/2211.12905)](https://arxiv.org/abs/2211.12905).

For the PyTorch implementations, you can refer to [huawei-noah/ghostnet](https://github.com/huawei-noah/ghostnet).

Both versions use the following techniques in their TensorRT implementations:

- **BatchNorm** layer is implemented by TensorRT's **Scale** layer.
- **Ghost Modules** are used to generate more features from cheap operations, as described in the paper.
- **Global Average Pooling** is implemented with `IReduceLayer` instead of `IPoolingLayer`. The `IReduceLayer` allows you to perform reduction operations (such as sum, average, max) over specified dimensions without being constrained by the kernel size limitations of pooling layers.

## Project Structure

```plaintext
ghostnet
│
├── ghostnetv1
│   ├── CMakeLists.txt
│   ├── gen_wts.py
│   ├── ghostnetv1.cpp
│   └── logging.h
│
├── ghostnetv2
│   ├── CMakeLists.txt
│   ├── gen_wts.py
│   ├── ghostnetv2.cpp
│   └── logging.h
│
└── README.md
```

## Steps to use GhostNet in TensorRT

### 1. Generate `.wts` files for both GhostNetv1 and GhostNetv2

```bash
# For ghostnetv1
python ghostnetv1/gen_wts.py

# For ghostnetv2
python ghostnetv2/gen_wts.py
```

### 2. Build the project

Each version has its own `CMakeLists.txt`, so build inside the version's directory (repeat for the other version):

```bash
cd tensorrtx/ghostnet/ghostnetv1   # or tensorrtx/ghostnet/ghostnetv2
mkdir build
cd build
cmake ..
make
```

### 3. Serialize the models to engine files

Use the following commands to serialize the PyTorch models into TensorRT engine files (`ghostnetv1.engine` and `ghostnetv2.engine`):

```bash
# For ghostnetv1
sudo ./ghostnetv1 -s

# For ghostnetv2
sudo ./ghostnetv2 -s
```

### 4. Run inference using the engine files

Once the engine files are generated, you can run inference with the following commands:

```bash
# For ghostnetv1
sudo ./ghostnetv1 -d

# For ghostnetv2
sudo ./ghostnetv2 -d
```

### 5.
Verify output Compare the output with the PyTorch implementation from [huawei-noah/ghostnet](https://github.com/huawei-noah/ghostnet) to ensure that the TensorRT results are consistent with the PyTorch model. ================================================ FILE: ghostnet/ghostnetv1/CMakeLists.txt ================================================ cmake_minimum_required(VERSION 2.6) project(ghostnetv1) add_definitions(-std=c++11) option(CUDA_USE_STATIC_CUDA_RUNTIME OFF) set(CMAKE_CXX_STANDARD 11) set(CMAKE_BUILD_TYPE Debug) include_directories(${PROJECT_SOURCE_DIR}/include) # include and link dirs of cuda and tensorrt, you need adapt them if yours are different # cuda include_directories(/usr/local/cuda/include) link_directories(/usr/local/cuda/lib64) # tensorrt include_directories(/usr/include/x86_64-linux-gnu/) link_directories(/usr/lib/x86_64-linux-gnu/) add_executable(ghostnetv1 ${PROJECT_SOURCE_DIR}/ghostnetv1.cpp) target_link_libraries(ghostnetv1 nvinfer) target_link_libraries(ghostnetv1 cudart) add_definitions(-O2 -pthread) ================================================ FILE: ghostnet/ghostnetv1/gen_wts.py ================================================ """ Creates a GhostNet Model as defined in: GhostNet: More Features from Cheap Operations By Kai Han, Yunhe Wang, Qi Tian, Jianyuan Guo, Chunjing Xu, Chang Xu. https://arxiv.org/abs/1911.11907 Modified from https://github.com/d-li14/mobilenetv3.pytorch and https://github.com/rwightman/pytorch-image-models """ import torch import torch.nn as nn import torch.onnx import struct import torch import torch.nn.functional as F import math def _make_divisible(v, divisor, min_value=None): """ This function is taken from the original tf repo. 
It ensures that all layers have a channel number that is divisible by 8 It can be seen here: https://github.com/tensorflow/models/blob/master/research/slim/nets/mobilenet/mobilenet.py """ if min_value is None: min_value = divisor new_v = max(min_value, int(v + divisor / 2) // divisor * divisor) # Make sure that round down does not go down by more than 10%. if new_v < 0.9 * v: new_v += divisor return new_v def hard_sigmoid(x, inplace: bool = False): if inplace: return x.add_(3.).clamp_(0., 6.).div_(6.) else: return F.relu6(x + 3.) / 6. class SqueezeExcite(nn.Module): def __init__(self, in_chs, se_ratio=0.25, reduced_base_chs=None, act_layer=nn.ReLU, gate_fn=hard_sigmoid, divisor=4, **_): super(SqueezeExcite, self).__init__() self.gate_fn = gate_fn reduced_chs = _make_divisible((reduced_base_chs or in_chs) * se_ratio, divisor) self.avg_pool = nn.AdaptiveAvgPool2d(1) self.conv_reduce = nn.Conv2d(in_chs, reduced_chs, 1, bias=True) self.act1 = act_layer(inplace=True) self.conv_expand = nn.Conv2d(reduced_chs, in_chs, 1, bias=True) def forward(self, x): x_se = self.avg_pool(x) x_se = self.conv_reduce(x_se) x_se = self.act1(x_se) x_se = self.conv_expand(x_se) x = x * self.gate_fn(x_se) return x class ConvBnAct(nn.Module): def __init__(self, in_chs, out_chs, kernel_size, stride=1, act_layer=nn.ReLU): super(ConvBnAct, self).__init__() self.conv = nn.Conv2d(in_chs, out_chs, kernel_size, stride, kernel_size//2, bias=False) self.bn1 = nn.BatchNorm2d(out_chs) self.act1 = act_layer(inplace=True) def forward(self, x): x = self.conv(x) x = self.bn1(x) x = self.act1(x) return x class GhostModule(nn.Module): def __init__(self, inp, oup, kernel_size=1, ratio=2, dw_size=3, stride=1, relu=True): super(GhostModule, self).__init__() self.oup = oup init_channels = math.ceil(oup / ratio) new_channels = init_channels*(ratio-1) self.primary_conv = nn.Sequential( nn.Conv2d(inp, init_channels, kernel_size, stride, kernel_size//2, bias=False), nn.BatchNorm2d(init_channels), nn.ReLU(inplace=True) 
if relu else nn.Sequential(), ) self.cheap_operation = nn.Sequential( nn.Conv2d(init_channels, new_channels, dw_size, 1, dw_size//2, groups=init_channels, bias=False), nn.BatchNorm2d(new_channels), nn.ReLU(inplace=True) if relu else nn.Sequential(), ) def forward(self, x): x1 = self.primary_conv(x) x2 = self.cheap_operation(x1) out = torch.cat([x1, x2], dim=1) return out[:, :self.oup, :, :] class GhostBottleneck(nn.Module): """ Ghost bottleneck w/ optional SE""" def __init__(self, in_chs, mid_chs, out_chs, dw_kernel_size=3, stride=1, act_layer=nn.ReLU, se_ratio=0.): super(GhostBottleneck, self).__init__() has_se = se_ratio is not None and se_ratio > 0. self.stride = stride # Point-wise expansion self.ghost1 = GhostModule(in_chs, mid_chs, relu=True) # Depth-wise convolution if self.stride > 1: self.conv_dw = nn.Conv2d(mid_chs, mid_chs, dw_kernel_size, stride=stride, padding=(dw_kernel_size-1)//2, groups=mid_chs, bias=False) self.bn_dw = nn.BatchNorm2d(mid_chs) # Squeeze-and-excitation if has_se: self.se = SqueezeExcite(mid_chs, se_ratio=se_ratio) else: self.se = None # Point-wise linear projection self.ghost2 = GhostModule(mid_chs, out_chs, relu=False) # shortcut if (in_chs == out_chs and self.stride == 1): self.shortcut = nn.Sequential() else: self.shortcut = nn.Sequential( nn.Conv2d(in_chs, in_chs, dw_kernel_size, stride=stride, padding=(dw_kernel_size-1)//2, groups=in_chs, bias=False), nn.BatchNorm2d(in_chs), nn.Conv2d(in_chs, out_chs, 1, stride=1, padding=0, bias=False), nn.BatchNorm2d(out_chs), ) def forward(self, x): residual = x # 1st ghost bottleneck x = self.ghost1(x) # Depth-wise convolution if self.stride > 1: x = self.conv_dw(x) x = self.bn_dw(x) # Squeeze-and-excitation if self.se is not None: x = self.se(x) # 2nd ghost bottleneck x = self.ghost2(x) x += self.shortcut(residual) return x class GhostNet(nn.Module): def __init__(self, cfgs, num_classes=1000, width=1.0, dropout=0.2): super(GhostNet, self).__init__() # setting of inverted residual blocks 
self.cfgs = cfgs self.dropout = dropout # building first layer output_channel = _make_divisible(16 * width, 4) self.conv_stem = nn.Conv2d(3, output_channel, 3, 2, 1, bias=False) self.bn1 = nn.BatchNorm2d(output_channel) self.act1 = nn.ReLU(inplace=True) input_channel = output_channel # building inverted residual blocks stages = [] block = GhostBottleneck for cfg in self.cfgs: layers = [] for k, exp_size, c, se_ratio, s in cfg: output_channel = _make_divisible(c * width, 4) hidden_channel = _make_divisible(exp_size * width, 4) layers.append(block(input_channel, hidden_channel, output_channel, k, s, se_ratio=se_ratio)) input_channel = output_channel stages.append(nn.Sequential(*layers)) output_channel = _make_divisible(exp_size * width, 4) stages.append(nn.Sequential(ConvBnAct(input_channel, output_channel, 1))) input_channel = output_channel self.blocks = nn.Sequential(*stages) # building last several layers output_channel = 1280 self.global_pool = nn.AdaptiveAvgPool2d((1, 1)) self.conv_head = nn.Conv2d(input_channel, output_channel, 1, 1, 0, bias=True) self.act2 = nn.ReLU(inplace=True) self.classifier = nn.Linear(output_channel, num_classes) def forward(self, x): x = self.conv_stem(x) x = self.bn1(x) x = self.act1(x) x = self.blocks(x) x = self.global_pool(x) x = self.conv_head(x) x = self.act2(x) x = x.view(x.size(0), -1) if self.dropout > 0.: x = F.dropout(x, p=self.dropout, training=self.training) x = self.classifier(x) return x def ghostnet(**kwargs): """ Constructs a GhostNet model """ cfgs = [ # k, t, c, SE, s # stage1 [[3, 16, 16, 0, 1]], # stage2 [[3, 48, 24, 0, 2]], [[3, 72, 24, 0, 1]], # stage3 [[5, 72, 40, 0.25, 2]], [[5, 120, 40, 0.25, 1]], # stage4 [[3, 240, 80, 0, 2]], [[3, 200, 80, 0, 1], [3, 184, 80, 0, 1], [3, 184, 80, 0, 1], [3, 480, 112, 0.25, 1], [3, 672, 112, 0.25, 1]], # stage5 [[5, 672, 160, 0.25, 2]], [[5, 960, 160, 0, 1], [5, 960, 160, 0.25, 1], [5, 960, 160, 0, 1], [5, 960, 160, 0.25, 1]] ] return GhostNet(cfgs, **kwargs) def 
setup_seed(seed): torch.manual_seed(seed) torch.cuda.manual_seed_all(seed) torch.backends.cudnn.deterministic = True # Function to export weights in the specified format def export_weight(model): f = open("ghostnetv1.weights", 'w') f.write("{}\n".format(len(model.state_dict().keys()))) # Convert weights to hexadecimal format for k, v in model.state_dict().items(): print('exporting ... {}: {}'.format(k, v.shape)) # Reshape the weights to 1D vr = v.reshape(-1).cpu().numpy() f.write("{} {}".format(k, len(vr))) for vv in vr: f.write(" ") f.write(struct.pack(">f", float(vv)).hex()) f.write("\n") f.close() # Function to evaluate the model (optional) def eval_model(input, model): output = model(input) print("------from inference------") print(input) print(output) if __name__ == "__main__": setup_seed(1) model = ghostnet(num_classes=1000, width=1.0, dropout=0.2) model.eval() input = torch.full((32, 3, 320, 256), 10.0) export_weight(model) eval_model(input, model) ================================================ FILE: ghostnet/ghostnetv1/ghostnetv1.cpp ================================================ #include #include #include #include #include #include #include #include "NvInfer.h" #include "cuda_runtime_api.h" #include "logging.h" using namespace std; #define CHECK(status) \ do { \ auto ret = (status); \ if (ret != 0) { \ std::cerr << "Cuda failure: " << ret << std::endl; \ abort(); \ } \ } while (0) // stuff we know about the network and the input/output blobs static const int INPUT_H = 256; static const int INPUT_W = 320; static const int OUTPUT_SIZE = 1000; static const int batchSize = 32; const char* INPUT_BLOB_NAME = "data"; const char* OUTPUT_BLOB_NAME = "prob"; using namespace nvinfer1; static Logger gLogger; // Load weights from files shared with TensorRT samples. 
// TensorRT weight files have a simple space delimited format: // [type] [size] std::map loadWeights(const std::string file) { std::cout << "Loading weights: " << file << std::endl; std::map weightMap; // Open weights file std::ifstream input(file); if (!input.is_open()) { std::cerr << "Unable to load weight file." << std::endl; exit(EXIT_FAILURE); } // Read number of weight blobs int32_t count; input >> count; if (count <= 0) { std::cerr << "Invalid weight map file." << std::endl; exit(EXIT_FAILURE); } while (count--) { Weights wt{DataType::kFLOAT, nullptr, 0}; uint32_t size; // Read name and type of blob std::string name; input >> name >> std::dec >> size; wt.type = DataType::kFLOAT; // Load blob uint32_t* val = reinterpret_cast(malloc(sizeof(uint32_t) * size)); for (uint32_t x = 0, y = size; x < y; ++x) { input >> std::hex >> val[x]; } wt.values = val; wt.count = size; weightMap[name] = wt; } return weightMap; } int _make_divisible(int v, int divisor, int min_value = -1) { if (min_value == -1) { min_value = divisor; } int new_v = std::max(min_value, (v + divisor / 2) / divisor * divisor); if (new_v < static_cast(0.9 * v)) { new_v += divisor; } return new_v; } ILayer* hardSigmoid(INetworkDefinition* network, ITensor& input) { IActivationLayer* scale_layer = network->addActivation(input, ActivationType::kHARD_SIGMOID); return scale_layer; } IScaleLayer* addBatchNorm2d(INetworkDefinition* network, std::map& weightMap, ITensor& input, std::string lname, float eps) { float* gamma = (float*)weightMap[lname + ".weight"].values; float* beta = (float*)weightMap[lname + ".bias"].values; float* mean = (float*)weightMap[lname + ".running_mean"].values; float* var = (float*)weightMap[lname + ".running_var"].values; int len = weightMap[lname + ".running_var"].count; float* scval = reinterpret_cast(malloc(sizeof(float) * len)); for (int i = 0; i < len; i++) { scval[i] = gamma[i] / sqrt(var[i] + eps); } Weights scale{DataType::kFLOAT, scval, len}; float* shval = 
reinterpret_cast(malloc(sizeof(float) * len)); for (int i = 0; i < len; i++) { shval[i] = beta[i] - mean[i] * gamma[i] / sqrt(var[i] + eps); } Weights shift{DataType::kFLOAT, shval, len}; float* pval = reinterpret_cast(malloc(sizeof(float) * len)); for (int i = 0; i < len; i++) { pval[i] = 1.0; } Weights power{DataType::kFLOAT, pval, len}; weightMap[lname + ".scale"] = scale; weightMap[lname + ".shift"] = shift; weightMap[lname + ".power"] = power; IScaleLayer* scale_1 = network->addScale(input, ScaleMode::kCHANNEL, shift, scale, power); assert(scale_1); return scale_1; } IActivationLayer* convBnReluStem(INetworkDefinition* network, std::map& weightMap, ITensor& input, int outch, std::string lname) { Weights emptywts{DataType::kFLOAT, nullptr, 0}; IConvolutionLayer* conv1 = network->addConvolutionNd(input, outch, DimsHW{3, 3}, weightMap[lname + ".weight"], emptywts); assert(conv1); conv1->setStrideNd(DimsHW{2, 2}); // Stride = 2 conv1->setPaddingNd(DimsHW{1, 1}); // Padding = 1 IScaleLayer* bn1 = addBatchNorm2d(network, weightMap, *conv1->getOutput(0), "bn1", 1e-5); IActivationLayer* relu1 = network->addActivation(*bn1->getOutput(0), ActivationType::kRELU); assert(relu1); return relu1; } ILayer* convBnAct(INetworkDefinition* network, std::map& weightMap, ITensor& input, int out_channels, std::string lname, ActivationType actType = ActivationType::kRELU) { Weights emptywts{DataType::kFLOAT, nullptr, 0}; IConvolutionLayer* conv = network->addConvolutionNd(input, out_channels, DimsHW{1, 1}, weightMap[lname + ".conv.weight"], emptywts); assert(conv); conv->setStrideNd(DimsHW{1, 1}); IScaleLayer* bn = addBatchNorm2d(network, weightMap, *conv->getOutput(0), lname + ".bn1", 1e-5); IActivationLayer* act = network->addActivation(*bn->getOutput(0), actType); assert(act); return act; } ILayer* squeezeExcite(INetworkDefinition* network, ITensor& input, std::map& weightMap, int in_chs, float se_ratio = 0.25, std::string lname = "", float eps = 1e-5) { IReduceLayer* avg_pool = 
network->addReduce(input, ReduceOperation::kAVG, 1 << 2 | 1 << 3, true); assert(avg_pool); // Reduce channels with 1x1 convolution int reduced_chs = _make_divisible(static_cast(in_chs * se_ratio), 4); IConvolutionLayer* conv_reduce = network->addConvolutionNd(*avg_pool->getOutput(0), reduced_chs, DimsHW{1, 1}, weightMap[lname + ".conv_reduce.weight"], weightMap[lname + ".conv_reduce.bias"]); assert(conv_reduce); IActivationLayer* relu1 = network->addActivation(*conv_reduce->getOutput(0), ActivationType::kRELU); assert(relu1); // Expand channels back with another 1x1 convolution IConvolutionLayer* conv_expand = network->addConvolutionNd(*relu1->getOutput(0), in_chs, DimsHW{1, 1}, weightMap[lname + ".conv_expand.weight"], weightMap[lname + ".conv_expand.bias"]); assert(conv_expand); cout << "SE conv_expand -> " << printTensorShape(conv_expand->getOutput(0)) << endl; // Apply hardSigmoid function ILayer* hard_sigmoid = hardSigmoid(network, *conv_expand->getOutput(0)); cout << "hard_sigmoid conv_expand -> " << printTensorShape(hard_sigmoid->getOutput(0)) << endl; // Elementwise multiplication of input and gated SE output IElementWiseLayer* scale = network->addElementWise(input, *hard_sigmoid->getOutput(0), ElementWiseOperation::kPROD); assert(scale); return scale; } ILayer* ghostModule(INetworkDefinition* network, ITensor& input, std::map& weightMap, int inp, int oup, int kernel_size = 1, int ratio = 2, int dw_size = 3, int stride = 1, bool relu = true, std::string lname = "") { int init_channels = std::ceil(oup / ratio); int new_channels = init_channels * (ratio - 1); // Primary convolution IConvolutionLayer* primary_conv = network->addConvolutionNd(input, init_channels, DimsHW{kernel_size, kernel_size}, weightMap[lname + ".primary_conv.0.weight"], Weights{}); primary_conv->setStrideNd(DimsHW{stride, stride}); primary_conv->setPaddingNd(DimsHW{kernel_size / 2, kernel_size / 2}); IScaleLayer* bn1 = addBatchNorm2d(network, weightMap, *primary_conv->getOutput(0), lname + 
".primary_conv.1", 1e-5); // Cheap operation (Depthwise Convolution) IConvolutionLayer* cheap_conv = network->addConvolutionNd(*bn1->getOutput(0), new_channels, DimsHW{dw_size, dw_size}, weightMap[lname + ".cheap_operation.0.weight"], Weights{}); cheap_conv->setStrideNd(DimsHW{1, 1}); cheap_conv->setPaddingNd(DimsHW{dw_size / 2, dw_size / 2}); cheap_conv->setNbGroups(init_channels); IScaleLayer* bn2 = addBatchNorm2d(network, weightMap, *cheap_conv->getOutput(0), lname + ".cheap_operation.1", 1e-5); // Define relu1 and relu2 IActivationLayer* relu1 = network->addActivation(*bn1->getOutput(0), ActivationType::kRELU); IActivationLayer* relu2 = network->addActivation(*bn2->getOutput(0), ActivationType::kRELU); // Initialize inputs array based on the `relu` flag std::vector inputs_vec; if (relu) { inputs_vec = {relu1->getOutput(0), relu2->getOutput(0)}; } else { inputs_vec = {bn1->getOutput(0), bn2->getOutput(0)}; } ITensor* inputs[] = {inputs_vec[0], inputs_vec[1]}; IConcatenationLayer* concat = network->addConcatenation(inputs, 2); std::cout << printTensorShape(concat->getOutput(0)) << std::endl; // Slice the output to keep only the first `oup` channels Dims start{4, {0, 0, 0, 0}}; // Starting from batch=0, channel=0, height=0, width=0 Dims size{4, {concat->getOutput(0)->getDimensions().d[0], oup, concat->getOutput(0)->getDimensions().d[2], concat->getOutput(0) ->getDimensions() .d[3]}}; // Keep all batches, first `oup` channels, all heights and widths Dims stride_{4, {1, 1, 1, 1}}; // Stride is 1 for all dimensions ISliceLayer* slice = network->addSlice(*concat->getOutput(0), start, size, stride_); cout << "slice" << printTensorShape(slice->getOutput(0)) << endl; return slice; } ILayer* ghostBottleneck(INetworkDefinition* network, ITensor& input, std::map& weightMap, int in_chs, int mid_chs, int out_chs, int dw_kernel_size = 3, int stride = 1, float se_ratio = 0.0f, std::string lname = "") { ILayer* ghost1 = ghostModule(network, input, weightMap, in_chs, mid_chs, 1, 
2, 3, 1, true, lname + ".ghost1"); ILayer* depthwise_conv = ghost1; if (stride > 1) { IConvolutionLayer* conv_dw = network->addConvolutionNd(*ghost1->getOutput(0), mid_chs, DimsHW{dw_kernel_size, dw_kernel_size}, weightMap[lname + ".conv_dw.weight"], Weights{}); conv_dw->setStrideNd(DimsHW{stride, stride}); conv_dw->setPaddingNd(DimsHW{(dw_kernel_size - 1) / 2, (dw_kernel_size - 1) / 2}); conv_dw->setNbGroups(mid_chs); // Depth-wise convolution IScaleLayer* bn_dw = addBatchNorm2d(network, weightMap, *conv_dw->getOutput(0), lname + ".bn_dw", 1e-5); depthwise_conv = bn_dw; } ILayer* se_layer = depthwise_conv; if (se_ratio > 0.0f) { se_layer = squeezeExcite(network, *depthwise_conv->getOutput(0), weightMap, mid_chs, se_ratio, lname + ".se"); } ILayer* ghost2 = ghostModule(network, *se_layer->getOutput(0), weightMap, mid_chs, out_chs, 1, 2, 3, 1, false, lname + ".ghost2"); ILayer* shortcut_layer = nullptr; if (in_chs == out_chs && stride == 1) { shortcut_layer = network->addIdentity(input); } else { IConvolutionLayer* conv_shortcut_dw = network->addConvolutionNd(input, in_chs, DimsHW{dw_kernel_size, dw_kernel_size}, weightMap[lname + ".shortcut.0.weight"], Weights{}); conv_shortcut_dw->setStrideNd(DimsHW{stride, stride}); conv_shortcut_dw->setPaddingNd(DimsHW{(dw_kernel_size - 1) / 2, (dw_kernel_size - 1) / 2}); conv_shortcut_dw->setNbGroups(in_chs); // Depth-wise convolution IScaleLayer* bn_shortcut_dw = addBatchNorm2d(network, weightMap, *conv_shortcut_dw->getOutput(0), lname + ".shortcut.1", 1e-5); IConvolutionLayer* conv_shortcut_pw = network->addConvolutionNd(*bn_shortcut_dw->getOutput(0), out_chs, DimsHW{1, 1}, weightMap[lname + ".shortcut.2.weight"], Weights{}); IScaleLayer* bn_shortcut_pw = addBatchNorm2d(network, weightMap, *conv_shortcut_pw->getOutput(0), lname + ".shortcut.3", 1e-5); shortcut_layer = bn_shortcut_pw; } IElementWiseLayer* ew_sum = network->addElementWise(*ghost2->getOutput(0), *shortcut_layer->getOutput(0), ElementWiseOperation::kSUM); return 
ew_sum; } ICudaEngine* createEngine(IBuilder* builder, IBuilderConfig* config, DataType dt) { INetworkDefinition* network = builder->createNetworkV2(1U << static_cast(NetworkDefinitionCreationFlag::kEXPLICIT_BATCH)); // Create input tensor of shape {batchSize, 3, INPUT_H, INPUT_W} with name INPUT_BLOB_NAME ITensor* data = network->addInput(INPUT_BLOB_NAME, dt, Dims4{batchSize, 3, INPUT_H, INPUT_W}); assert(data); std::map weightMap = loadWeights("../ghostnetv1.weights"); Weights emptywts{DataType::kFLOAT, nullptr, 0}; // Conv Stem IActivationLayer* conv_stem = convBnReluStem(network, weightMap, *data, 16, "conv_stem"); ILayer* current_layer = conv_stem; current_layer = ghostBottleneck(network, *current_layer->getOutput(0), weightMap, 16, 16, 16, 3, 1, 0, "blocks.0.0"); current_layer = ghostBottleneck(network, *current_layer->getOutput(0), weightMap, 16, 48, 24, 3, 2, 0, "blocks.1.0"); current_layer = ghostBottleneck(network, *current_layer->getOutput(0), weightMap, 24, 72, 24, 3, 1, 0, "blocks.2.0"); current_layer = ghostBottleneck(network, *current_layer->getOutput(0), weightMap, 24, 72, 40, 5, 2, 0.25, "blocks.3.0"); current_layer = ghostBottleneck(network, *current_layer->getOutput(0), weightMap, 40, 120, 40, 5, 1, 0.25, "blocks.4.0"); current_layer = ghostBottleneck(network, *current_layer->getOutput(0), weightMap, 40, 240, 80, 3, 2, 0, "blocks.5.0"); current_layer = ghostBottleneck(network, *current_layer->getOutput(0), weightMap, 80, 200, 80, 3, 1, 0, "blocks.6.0"); current_layer = ghostBottleneck(network, *current_layer->getOutput(0), weightMap, 80, 184, 80, 3, 1, 0, "blocks.6.1"); current_layer = ghostBottleneck(network, *current_layer->getOutput(0), weightMap, 80, 184, 80, 3, 1, 0, "blocks.6.2"); current_layer = ghostBottleneck(network, *current_layer->getOutput(0), weightMap, 80, 480, 112, 3, 1, 0.25, "blocks.6.3"); current_layer = ghostBottleneck(network, *current_layer->getOutput(0), weightMap, 112, 672, 112, 3, 1, 0.25, "blocks.6.4"); current_layer = 
ghostBottleneck(network, *current_layer->getOutput(0), weightMap, 112, 672, 160, 5, 2, 0.25, "blocks.7.0"); current_layer = ghostBottleneck(network, *current_layer->getOutput(0), weightMap, 160, 960, 160, 5, 1, 0, "blocks.8.0"); current_layer = ghostBottleneck(network, *current_layer->getOutput(0), weightMap, 160, 960, 160, 5, 1, 0.25, "blocks.8.1"); current_layer = ghostBottleneck(network, *current_layer->getOutput(0), weightMap, 160, 960, 160, 5, 1, 0, "blocks.8.2"); current_layer = ghostBottleneck(network, *current_layer->getOutput(0), weightMap, 160, 960, 160, 5, 1, 0.25, "blocks.8.3"); // Apply ConvBnAct current_layer = convBnAct(network, weightMap, *current_layer->getOutput(0), 960, "blocks.9.0"); // Global Average Pooling IReduceLayer* global_pool = network->addReduce(*current_layer->getOutput(0), ReduceOperation::kAVG, 1 << 2 | 1 << 3, true); assert(global_pool); // Conv Head IConvolutionLayer* conv_head = network->addConvolutionNd( *global_pool->getOutput(0), 1280, DimsHW{1, 1}, weightMap["conv_head.weight"], weightMap["conv_head.bias"]); IActivationLayer* act2 = network->addActivation(*conv_head->getOutput(0), ActivationType::kRELU); // Fully Connected Layer (Classifier) IFullyConnectedLayer* classifier = network->addFullyConnected( *act2->getOutput(0), 1000, weightMap["classifier.weight"], weightMap["classifier.bias"]); classifier->getOutput(0)->setName(OUTPUT_BLOB_NAME); network->markOutput(*classifier->getOutput(0)); // Build engine config->setMaxWorkspaceSize(1 << 24); ICudaEngine* engine = builder->buildEngineWithConfig(*network, *config); // Don't need the network any more network->destroy(); // Release host memory for (auto& mem : weightMap) { free((void*)(mem.second.values)); } return engine; } void APIToModel(IHostMemory** modelStream) { // Create builder IBuilder* builder = createInferBuilder(gLogger); IBuilderConfig* config = builder->createBuilderConfig(); // Create model to populate the network, then set the outputs and create an engine 
ICudaEngine* engine = createEngine(builder, config, DataType::kFLOAT); assert(engine != nullptr); // Serialize the engine (*modelStream) = engine->serialize(); // Close everything down engine->destroy(); config->destroy(); builder->destroy(); } void doInference(IExecutionContext& context, float* input, float* output, int batchSize) { const ICudaEngine& engine = context.getEngine(); const int inputIndex = engine.getBindingIndex(INPUT_BLOB_NAME); const int outputIndex = engine.getBindingIndex(OUTPUT_BLOB_NAME); // Pointers to input and output device buffers to pass to engine. void* buffers[2]; // Create GPU buffers on device CHECK(cudaMalloc(&buffers[inputIndex], batchSize * 3 * INPUT_H * INPUT_W * sizeof(float))); CHECK(cudaMalloc(&buffers[outputIndex], batchSize * OUTPUT_SIZE * sizeof(float))); // Create stream cudaStream_t stream; CHECK(cudaStreamCreate(&stream)); // DMA input batch data to device, infer on the batch asynchronously, and DMA output back to host CHECK(cudaMemcpyAsync(buffers[inputIndex], input, batchSize * 3 * INPUT_H * INPUT_W * sizeof(float), cudaMemcpyHostToDevice, stream)); context.enqueueV2(buffers, stream, nullptr); CHECK(cudaMemcpyAsync(output, buffers[outputIndex], batchSize * OUTPUT_SIZE * sizeof(float), cudaMemcpyDeviceToHost, stream)); cudaStreamSynchronize(stream); // Release stream and buffers cudaStreamDestroy(stream); CHECK(cudaFree(buffers[inputIndex])); CHECK(cudaFree(buffers[outputIndex])); } int main(int argc, char** argv) { if (argc != 2) { std::cerr << "arguments not right!" 
<< std::endl; std::cerr << "./ghostnetv1 -s // serialize model to plan file" << std::endl; std::cerr << "./ghostnetv1 -d // deserialize plan file and run inference" << std::endl; return -1; } // create a model using the API directly and serialize it to a stream char* trtModelStream{nullptr}; size_t size{0}; if (std::string(argv[1]) == "-s") { IHostMemory* modelStream{nullptr}; APIToModel(&modelStream); assert(modelStream != nullptr); std::ofstream p("ghostnetv1.engine", std::ios::binary); if (!p) { std::cerr << "could not open plan output file" << std::endl; return -1; } p.write(reinterpret_cast(modelStream->data()), modelStream->size()); modelStream->destroy(); return 0; } else if (std::string(argv[1]) == "-d") { std::ifstream file("ghostnetv1.engine", std::ios::binary); if (file.good()) { file.seekg(0, file.end); size = file.tellg(); file.seekg(0, file.beg); trtModelStream = new char[size]; assert(trtModelStream); file.read(trtModelStream, size); file.close(); } } else { return -1; } float* data = new float[batchSize * 3 * INPUT_H * INPUT_W]; for (int i = 0; i < batchSize * 3 * INPUT_H * INPUT_W; i++) data[i] = 10.0; float* prob = new float[batchSize * OUTPUT_SIZE]; IRuntime* runtime = createInferRuntime(gLogger); assert(runtime != nullptr); ICudaEngine* engine = runtime->deserializeCudaEngine(trtModelStream, size, nullptr); assert(engine != nullptr); IExecutionContext* context = engine->createExecutionContext(); assert(context != nullptr); delete[] trtModelStream; doInference(*context, data, prob, batchSize); std::cout << "\nOutput:\n\n"; for (int i = 0; i < batchSize; i++) { std::cout << "Batch " << i << ":\n"; for (unsigned int j = 0; j < OUTPUT_SIZE; j++) { std::cout << prob[i * OUTPUT_SIZE + j] << ", "; if (j % 10 == 0) std::cout << j / 10 << std::endl; } std::cout << "\n"; } context->destroy(); engine->destroy(); runtime->destroy(); delete[] data; delete[] prob; return 0; } ================================================ FILE: ghostnet/ghostnetv1/logging.h 
================================================ /* * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ #ifndef TENSORRT_LOGGING_H #define TENSORRT_LOGGING_H #include #include #include #include #include #include #include #include "NvInferRuntimeCommon.h" using Severity = nvinfer1::ILogger::Severity; class LogStreamConsumerBuffer : public std::stringbuf { public: LogStreamConsumerBuffer(std::ostream& stream, const std::string& prefix, bool shouldLog) : mOutput(stream), mPrefix(prefix), mShouldLog(shouldLog) {} LogStreamConsumerBuffer(LogStreamConsumerBuffer&& other) : mOutput(other.mOutput) {} ~LogStreamConsumerBuffer() { // std::streambuf::pbase() gives a pointer to the beginning of the buffered part of the output sequence // std::streambuf::pptr() gives a pointer to the current position of the output sequence // if the pointer to the beginning is not equal to the pointer to the current position, // call putOutput() to log the output to the stream if (pbase() != pptr()) { putOutput(); } } // synchronizes the stream buffer and returns 0 on success // synchronizing the stream buffer consists of inserting the buffer contents into the stream, // resetting the buffer and flushing the stream virtual int sync() { putOutput(); return 0; } void putOutput() { if (mShouldLog) { // prepend timestamp std::time_t timestamp = std::time(nullptr); tm* tm_local = std::localtime(×tamp); std::cout << "["; std::cout << std::setw(2) << 
std::setfill('0') << 1 + tm_local->tm_mon << "/"; std::cout << std::setw(2) << std::setfill('0') << tm_local->tm_mday << "/"; std::cout << std::setw(4) << std::setfill('0') << 1900 + tm_local->tm_year << "-"; std::cout << std::setw(2) << std::setfill('0') << tm_local->tm_hour << ":"; std::cout << std::setw(2) << std::setfill('0') << tm_local->tm_min << ":"; std::cout << std::setw(2) << std::setfill('0') << tm_local->tm_sec << "] "; // std::stringbuf::str() gets the string contents of the buffer // insert the buffer contents pre-appended by the appropriate prefix into the stream mOutput << mPrefix << str(); // set the buffer to empty str(""); // flush the stream mOutput.flush(); } } void setShouldLog(bool shouldLog) { mShouldLog = shouldLog; } private: std::ostream& mOutput; std::string mPrefix; bool mShouldLog; }; //! //! \class LogStreamConsumerBase //! \brief Convenience object used to initialize LogStreamConsumerBuffer before std::ostream in LogStreamConsumer //! class LogStreamConsumerBase { public: LogStreamConsumerBase(std::ostream& stream, const std::string& prefix, bool shouldLog) : mBuffer(stream, prefix, shouldLog) {} protected: LogStreamConsumerBuffer mBuffer; }; //! //! \class LogStreamConsumer //! \brief Convenience object used to facilitate use of C++ stream syntax when logging messages. //! Order of base classes is LogStreamConsumerBase and then std::ostream. //! This is because the LogStreamConsumerBase class is used to initialize the LogStreamConsumerBuffer member field //! in LogStreamConsumer and then the address of the buffer is passed to std::ostream. //! This is necessary to prevent the address of an uninitialized buffer from being passed to std::ostream. //! Please do not change the order of the parent classes. //! class LogStreamConsumer : protected LogStreamConsumerBase, public std::ostream { public: //! \brief Creates a LogStreamConsumer which logs messages with level severity. //! 
Reportable severity determines if the messages are severe enough to be logged. LogStreamConsumer(Severity reportableSeverity, Severity severity) : LogStreamConsumerBase(severityOstream(severity), severityPrefix(severity), severity <= reportableSeverity), std::ostream(&mBuffer) // links the stream buffer with the stream , mShouldLog(severity <= reportableSeverity), mSeverity(severity) {} LogStreamConsumer(LogStreamConsumer&& other) : LogStreamConsumerBase(severityOstream(other.mSeverity), severityPrefix(other.mSeverity), other.mShouldLog), std::ostream(&mBuffer) // links the stream buffer with the stream , mShouldLog(other.mShouldLog), mSeverity(other.mSeverity) {} void setReportableSeverity(Severity reportableSeverity) { mShouldLog = mSeverity <= reportableSeverity; mBuffer.setShouldLog(mShouldLog); } private: static std::ostream& severityOstream(Severity severity) { return severity >= Severity::kINFO ? std::cout : std::cerr; } static std::string severityPrefix(Severity severity) { switch (severity) { case Severity::kINTERNAL_ERROR: return "[F] "; case Severity::kERROR: return "[E] "; case Severity::kWARNING: return "[W] "; case Severity::kINFO: return "[I] "; case Severity::kVERBOSE: return "[V] "; default: assert(0); return ""; } } bool mShouldLog; Severity mSeverity; }; //! \class Logger //! //! \brief Class which manages logging of TensorRT tools and samples //! //! \details This class provides a common interface for TensorRT tools and samples to log information to the console, //! and supports logging two types of messages: //! //! - Debugging messages with an associated severity (info, warning, error, or internal error/fatal) //! - Test pass/fail messages //! //! The advantage of having all samples use this class for logging as opposed to emitting directly to stdout/stderr is //! that the logic for controlling the verbosity and formatting of sample output is centralized in one location. //! //! 
//! In the future, this class could be extended to support dumping test results to a file in some standard format
//! (for example, JUnit XML), and providing additional metadata (e.g. timing the duration of a test run).
//!
//! TODO: For backwards compatibility with existing samples, this class inherits directly from the nvinfer1::ILogger
//! interface, which is problematic since there isn't a clean separation between messages coming from the TensorRT
//! library and messages coming from the sample.
//!
//! In the future (once all samples are updated to use Logger::getTRTLogger() to access the ILogger) we can refactor the
//! class to eliminate the inheritance and instead make the nvinfer1::ILogger implementation a member of the Logger
//! object.

class Logger : public nvinfer1::ILogger {
   public:
    //! \brief Creates a logger that reports messages of the given severity or more severe (kWARNING by default).
    Logger(Severity severity = Severity::kWARNING) : mReportableSeverity(severity) {}

    //!
    //! \enum TestResult
    //! \brief Represents the state of a given test
    //!
    enum class TestResult {
        kRUNNING,  //!< The test is running
        kPASSED,   //!< The test passed
        kFAILED,   //!< The test failed
        kWAIVED    //!< The test was waived
    };

    //!
    //! \brief Forward-compatible method for retrieving the nvinfer::ILogger associated with this Logger
    //! \return The nvinfer1::ILogger associated with this Logger
    //!
    //! TODO Once all samples are updated to use this method to register the logger with TensorRT,
    //! we can eliminate the inheritance of Logger from ILogger
    //!
    nvinfer1::ILogger& getTRTLogger() { return *this; }

    //!
    //! \brief Implementation of the nvinfer1::ILogger::log() virtual method
    //!
    //! Note samples should not be calling this function directly; it will eventually go away once we eliminate the
    //! inheritance from nvinfer1::ILogger
    //!
    void log(Severity severity, const char* msg) noexcept override {
        // the temporary consumer emits the message when it is destroyed at the end of the statement
        LogStreamConsumer(mReportableSeverity, severity) << "[TRT] " << std::string(msg) << std::endl;
    }

    //!
    //! \brief Method for controlling the verbosity of logging output
    //!
    //! \param severity The logger will only emit messages that have severity of this level or higher.
    //!
    void setReportableSeverity(Severity severity) { mReportableSeverity = severity; }

    //!
    //! \brief Opaque handle that holds logging information for a particular test
    //!
    //! This object is an opaque handle to information used by the Logger to print test results.
    //! The sample must call Logger::defineTest() in order to obtain a TestAtom that can be used
    //! with Logger::reportTest{Start,End}().
    //!
    class TestAtom {
       public:
        TestAtom(TestAtom&&) = default;

       private:
        friend class Logger;

        // only Logger (via defineTest) can create a TestAtom
        TestAtom(bool started, const std::string& name, const std::string& cmdline)
            : mStarted(started), mName(name), mCmdline(cmdline) {}

        bool mStarted;
        std::string mName;
        std::string mCmdline;
    };

    //!
    //! \brief Define a test for logging
    //!
    //! \param[in] name The name of the test.  This should be a string starting with
    //!                  "TensorRT" and containing dot-separated strings containing
    //!                  the characters [A-Za-z0-9_].
    //!                  For example, "TensorRT.sample_googlenet"
    //! \param[in] cmdline The command line used to reproduce the test
    //
    //! \return a TestAtom that can be used in Logger::reportTest{Start,End}().
    //!
    static TestAtom defineTest(const std::string& name, const std::string& cmdline) {
        return TestAtom(false, name, cmdline);
    }

    //!
    //! \brief A convenience overloaded version of defineTest() that accepts an array of command-line arguments
    //!        as input
    //!
    //! \param[in] name The name of the test
    //! \param[in] argc The number of command-line arguments
    //! \param[in] argv The array of command-line arguments (given as C strings)
    //!
    //! \return a TestAtom that can be used in Logger::reportTest{Start,End}().
    static TestAtom defineTest(const std::string& name, int argc, char const* const* argv) {
        auto cmdline = genCmdlineString(argc, argv);
        return defineTest(name, cmdline);
    }

    //!
    //! \brief Report that a test has started.
    //!
    //! \pre reportTestStart() has not been called yet for the given testAtom
    //!
    //! \param[in] testAtom The handle to the test that has started
    //!
    static void reportTestStart(TestAtom& testAtom) {
        reportTestResult(testAtom, TestResult::kRUNNING);
        assert(!testAtom.mStarted);
        testAtom.mStarted = true;
    }

    //!
    //! \brief Report that a test has ended.
    //!
    //! \pre reportTestStart() has been called for the given testAtom
    //!
    //! \param[in] testAtom The handle to the test that has ended
    //! \param[in] result The result of the test. Should be one of TestResult::kPASSED,
    //!                   TestResult::kFAILED, TestResult::kWAIVED
    //!
    static void reportTestEnd(const TestAtom& testAtom, TestResult result) {
        assert(result != TestResult::kRUNNING);
        assert(testAtom.mStarted);
        reportTestResult(testAtom, result);
    }

    //! \brief Report the test as passed and return EXIT_SUCCESS.
    static int reportPass(const TestAtom& testAtom) {
        reportTestEnd(testAtom, TestResult::kPASSED);
        return EXIT_SUCCESS;
    }

    //! \brief Report the test as failed and return EXIT_FAILURE.
    static int reportFail(const TestAtom& testAtom) {
        reportTestEnd(testAtom, TestResult::kFAILED);
        return EXIT_FAILURE;
    }

    //! \brief Report the test as waived and return EXIT_SUCCESS.
    static int reportWaive(const TestAtom& testAtom) {
        reportTestEnd(testAtom, TestResult::kWAIVED);
        return EXIT_SUCCESS;
    }

    //! \brief Report pass or fail depending on `pass` and return the matching exit code.
    static int reportTest(const TestAtom& testAtom, bool pass) {
        return pass ? reportPass(testAtom) : reportFail(testAtom);
    }

    Severity getReportableSeverity() const { return mReportableSeverity; }

   private:
    //!
    //! \brief returns an appropriate string for prefixing a log message with the given severity
    //!
    static const char* severityPrefix(Severity severity) {
        switch (severity) {
            case Severity::kINTERNAL_ERROR:
                return "[F] ";
            case Severity::kERROR:
                return "[E] ";
            case Severity::kWARNING:
                return "[W] ";
            case Severity::kINFO:
                return "[I] ";
            case Severity::kVERBOSE:
                return "[V] ";
            default:
                assert(0);
                return "";
        }
    }

    //!
    //! \brief returns an appropriate string for prefixing a test result message with the given result
    //!
    static const char* testResultString(TestResult result) {
        switch (result) {
            case TestResult::kRUNNING:
                return "RUNNING";
            case TestResult::kPASSED:
                return "PASSED";
            case TestResult::kFAILED:
                return "FAILED";
            case TestResult::kWAIVED:
                return "WAIVED";
            default:
                assert(0);
                return "";
        }
    }

    //!
    //! \brief returns an appropriate output stream (cout or cerr) to use with the given severity
    //!
    static std::ostream& severityOstream(Severity severity) {
        return severity >= Severity::kINFO ? std::cout : std::cerr;
    }

    //!
    //! \brief method that implements logging test results
    //!
    static void reportTestResult(const TestAtom& testAtom, TestResult result) {
        severityOstream(Severity::kINFO) << "&&&& " << testResultString(result) << " " << testAtom.mName << " # "
                                         << testAtom.mCmdline << std::endl;
    }

    //!
    //! \brief generate a command line string from the given (argc, argv) values
    //!
    static std::string genCmdlineString(int argc, char const* const* argv) {
        std::stringstream ss;
        for (int i = 0; i < argc; i++) {
            if (i > 0)
                ss << " ";
            ss << argv[i];
        }
        return ss.str();
    }

    Severity mReportableSeverity;
};

namespace {

//!
//! \brief produces a LogStreamConsumer object that can be used to log messages of severity kVERBOSE
//!
//! Example usage:
//!
//!     LOG_VERBOSE(logger) << "hello world" << std::endl;
//!
inline LogStreamConsumer LOG_VERBOSE(const Logger& logger) {
    return LogStreamConsumer(logger.getReportableSeverity(), Severity::kVERBOSE);
}

//!
//! \brief produces a LogStreamConsumer object that can be used to log messages of severity kINFO
//!
//! Example usage:
//!
//!     LOG_INFO(logger) << "hello world" << std::endl;
//!
inline LogStreamConsumer LOG_INFO(const Logger& logger) {
    return LogStreamConsumer(logger.getReportableSeverity(), Severity::kINFO);
}

//!
//! \brief produces a LogStreamConsumer object that can be used to log messages of severity kWARNING
//!
//! Example usage:
//!
//!     LOG_WARN(logger) << "hello world" << std::endl;
//!
inline LogStreamConsumer LOG_WARN(const Logger& logger) { return LogStreamConsumer(logger.getReportableSeverity(), Severity::kWARNING); } //! //! \brief produces a LogStreamConsumer object that can be used to log messages of severity kERROR //! //! Example usage: //! //! LOG_ERROR(logger) << "hello world" << std::endl; //! inline LogStreamConsumer LOG_ERROR(const Logger& logger) { return LogStreamConsumer(logger.getReportableSeverity(), Severity::kERROR); } //! //! \brief produces a LogStreamConsumer object that can be used to log messages of severity kINTERNAL_ERROR // ("fatal" severity) //! //! Example usage: //! //! LOG_FATAL(logger) << "hello world" << std::endl; //! inline LogStreamConsumer LOG_FATAL(const Logger& logger) { return LogStreamConsumer(logger.getReportableSeverity(), Severity::kINTERNAL_ERROR); } } // anonymous namespace #endif // TENSORRT_LOGGING_H ================================================ FILE: ghostnet/ghostnetv2/CMakeLists.txt ================================================ cmake_minimum_required(VERSION 2.6) project(ghostnetv2) add_definitions(-std=c++11) option(CUDA_USE_STATIC_CUDA_RUNTIME OFF) set(CMAKE_CXX_STANDARD 11) set(CMAKE_BUILD_TYPE Debug) include_directories(${PROJECT_SOURCE_DIR}/include) # include and link dirs of cuda and tensorrt, you need adapt them if yours are different # cuda include_directories(/usr/local/cuda/include) link_directories(/usr/local/cuda/lib64) # tensorrt include_directories(/usr/include/x86_64-linux-gnu/) link_directories(/usr/lib/x86_64-linux-gnu/) add_executable(ghostnetv2 ${PROJECT_SOURCE_DIR}/ghostnetv2.cpp) target_link_libraries(ghostnetv2 nvinfer) target_link_libraries(ghostnetv2 cudart) add_definitions(-O2 -pthread) ================================================ FILE: ghostnet/ghostnetv2/gen_wts.py ================================================ import torch import torch.nn as nn import torch.onnx import struct import torch import torch.nn.functional as F import math from 
timm.models.registry import register_model def _make_divisible(v, divisor, min_value=None): """ This function is taken from the original tf repo. It ensures that all layers have a channel number that is divisible by 8 It can be seen here: https://github.com/tensorflow/models/blob/master/research/slim/nets/mobilenet/mobilenet.py """ if min_value is None: min_value = divisor new_v = max(min_value, int(v + divisor / 2) // divisor * divisor) # Make sure that round down does not go down by more than 10%. if new_v < 0.9 * v: new_v += divisor return new_v def hard_sigmoid(x, inplace: bool = False): if inplace: return x.add_(3.).clamp_(0., 6.).div_(6.) else: return F.relu6(x + 3.) / 6. class SqueezeExcite(nn.Module): def __init__(self, in_chs, se_ratio=0.25, reduced_base_chs=None, act_layer=nn.ReLU, gate_fn=hard_sigmoid, divisor=4, **_): super(SqueezeExcite, self).__init__() self.gate_fn = gate_fn reduced_chs = _make_divisible((reduced_base_chs or in_chs) * se_ratio, divisor) self.avg_pool = nn.AdaptiveAvgPool2d(1) self.conv_reduce = nn.Conv2d(in_chs, reduced_chs, 1, bias=True) self.act1 = act_layer(inplace=True) self.conv_expand = nn.Conv2d(reduced_chs, in_chs, 1, bias=True) def forward(self, x): x_se = self.avg_pool(x) x_se = self.conv_reduce(x_se) x_se = self.act1(x_se) x_se = self.conv_expand(x_se) x = x * self.gate_fn(x_se) return x class ConvBnAct(nn.Module): def __init__(self, in_chs, out_chs, kernel_size, stride=1, act_layer=nn.ReLU): super(ConvBnAct, self).__init__() self.conv = nn.Conv2d(in_chs, out_chs, kernel_size, stride, kernel_size//2, bias=False) self.bn1 = nn.BatchNorm2d(out_chs) self.act1 = act_layer(inplace=True) def forward(self, x): x = self.conv(x) x = self.bn1(x) x = self.act1(x) return x class GhostModuleV2(nn.Module): def __init__(self, inp, oup, kernel_size=1, ratio=2, dw_size=3, stride=1, relu=True, mode=None, args=None): super(GhostModuleV2, self).__init__() self.mode = mode self.gate_fn = nn.Sigmoid() if self.mode in ['original']: self.oup = 
oup init_channels = math.ceil(oup / ratio) new_channels = init_channels*(ratio-1) self.primary_conv = nn.Sequential( nn.Conv2d(inp, init_channels, kernel_size, stride, kernel_size//2, bias=False), nn.BatchNorm2d(init_channels), nn.ReLU(inplace=True) if relu else nn.Sequential(), ) self.cheap_operation = nn.Sequential( nn.Conv2d(init_channels, new_channels, dw_size, 1, dw_size//2, groups=init_channels, bias=False), nn.BatchNorm2d(new_channels), nn.ReLU(inplace=True) if relu else nn.Sequential(), ) elif self.mode in ['attn']: self.oup = oup init_channels = math.ceil(oup / ratio) new_channels = init_channels*(ratio-1) self.primary_conv = nn.Sequential( nn.Conv2d(inp, init_channels, kernel_size, stride, kernel_size//2, bias=False), nn.BatchNorm2d(init_channels), nn.ReLU(inplace=True) if relu else nn.Sequential(), ) self.cheap_operation = nn.Sequential( nn.Conv2d(init_channels, new_channels, dw_size, 1, dw_size//2, groups=init_channels, bias=False), nn.BatchNorm2d(new_channels), nn.ReLU(inplace=True) if relu else nn.Sequential(), ) self.short_conv = nn.Sequential( nn.Conv2d(inp, oup, kernel_size, stride, kernel_size//2, bias=False), nn.BatchNorm2d(oup), nn.Conv2d(oup, oup, kernel_size=(1, 5), stride=1, padding=(0, 2), groups=oup, bias=False), nn.BatchNorm2d(oup), nn.Conv2d(oup, oup, kernel_size=(5, 1), stride=1, padding=(2, 0), groups=oup, bias=False), nn.BatchNorm2d(oup), ) def forward(self, x): if self.mode in ['original']: x1 = self.primary_conv(x) x2 = self.cheap_operation(x1) out = torch.cat([x1, x2], dim=1) return out[:, :self.oup, :, :] elif self.mode in ['attn']: res = self.short_conv(F.avg_pool2d(x, kernel_size=2, stride=2)) x1 = self.primary_conv(x) x2 = self.cheap_operation(x1) out = torch.cat([x1, x2], dim=1) return out[:, :self.oup, :, :]*F.interpolate(self.gate_fn(res), size=(out.shape[-2], out.shape[-1]), mode='nearest') class GhostBottleneckV2(nn.Module): def __init__(self, in_chs, mid_chs, out_chs, dw_kernel_size=3, stride=1, act_layer=nn.ReLU, 
se_ratio=0., layer_id=None, args=None): super(GhostBottleneckV2, self).__init__() has_se = se_ratio is not None and se_ratio > 0. self.stride = stride # Point-wise expansion if layer_id <= 1: self.ghost1 = GhostModuleV2(in_chs, mid_chs, relu=True, mode='original', args=args) else: self.ghost1 = GhostModuleV2(in_chs, mid_chs, relu=True, mode='attn', args=args) # Depth-wise convolution if self.stride > 1: self.conv_dw = nn.Conv2d(mid_chs, mid_chs, dw_kernel_size, stride=stride, padding=(dw_kernel_size-1)//2, groups=mid_chs, bias=False) self.bn_dw = nn.BatchNorm2d(mid_chs) # Squeeze-and-excitation if has_se: self.se = SqueezeExcite(mid_chs, se_ratio=se_ratio) else: self.se = None self.ghost2 = GhostModuleV2(mid_chs, out_chs, relu=False, mode='original', args=args) # shortcut if (in_chs == out_chs and self.stride == 1): self.shortcut = nn.Sequential() else: self.shortcut = nn.Sequential( nn.Conv2d(in_chs, in_chs, dw_kernel_size, stride=stride, padding=(dw_kernel_size-1)//2, groups=in_chs, bias=False), nn.BatchNorm2d(in_chs), nn.Conv2d(in_chs, out_chs, 1, stride=1, padding=0, bias=False), nn.BatchNorm2d(out_chs), ) def forward(self, x): residual = x x = self.ghost1(x) if self.stride > 1: x = self.conv_dw(x) x = self.bn_dw(x) if self.se is not None: x = self.se(x) x = self.ghost2(x) x += self.shortcut(residual) return x class GhostNetV2(nn.Module): def __init__(self, cfgs, num_classes=1000, width=1.0, dropout=0.2, block=GhostBottleneckV2, args=None): super(GhostNetV2, self).__init__() self.cfgs = cfgs self.dropout = dropout # building first layer output_channel = _make_divisible(16 * width, 4) self.conv_stem = nn.Conv2d(3, output_channel, 3, 2, 1, bias=False) self.bn1 = nn.BatchNorm2d(output_channel) self.act1 = nn.ReLU(inplace=True) input_channel = output_channel # building inverted residual blocks stages = [] layer_id = 0 for cfg in self.cfgs: layers = [] for k, exp_size, c, se_ratio, s in cfg: output_channel = _make_divisible(c * width, 4) hidden_channel = 
_make_divisible(exp_size * width, 4) layers.append(block(input_channel, hidden_channel, output_channel, k, s, se_ratio=se_ratio, layer_id=layer_id, args=args)) input_channel = output_channel layer_id += 1 stages.append(nn.Sequential(*layers)) output_channel = _make_divisible(exp_size * width, 4) stages.append(nn.Sequential(ConvBnAct(input_channel, output_channel, 1))) input_channel = output_channel self.blocks = nn.Sequential(*stages) # building last several layers output_channel = 1280 self.global_pool = nn.AdaptiveAvgPool2d((1, 1)) self.conv_head = nn.Conv2d(input_channel, output_channel, 1, 1, 0, bias=True) self.act2 = nn.ReLU(inplace=True) self.classifier = nn.Linear(output_channel, num_classes) def forward(self, x): x = self.conv_stem(x) x = self.bn1(x) x = self.act1(x) x = self.blocks(x) x = self.global_pool(x) x = self.conv_head(x) x = self.act2(x) x = x.view(x.size(0), -1) if self.dropout > 0.: x = F.dropout(x, p=self.dropout, training=self.training) x = self.classifier(x) return x @register_model def ghostnetv2(**kwargs): cfgs = [ # k, t, c, SE, s [[3, 16, 16, 0, 1]], [[3, 48, 24, 0, 2]], [[3, 72, 24, 0, 1]], [[5, 72, 40, 0.25, 2]], [[5, 120, 40, 0.25, 1]], [[3, 240, 80, 0, 2]], [[3, 200, 80, 0, 1], [3, 184, 80, 0, 1], [3, 184, 80, 0, 1], [3, 480, 112, 0.25, 1], [3, 672, 112, 0.25, 1]], [[5, 672, 160, 0.25, 2]], [[5, 960, 160, 0, 1], [5, 960, 160, 0.25, 1], [5, 960, 160, 0, 1], [5, 960, 160, 0.25, 1]] ] return GhostNetV2(cfgs, num_classes=kwargs['num_classes'], width=kwargs['width'], dropout=kwargs['dropout'], args=kwargs['args']) def setup_seed(seed): torch.manual_seed(seed) torch.cuda.manual_seed_all(seed) torch.backends.cudnn.deterministic = True # Function to export weights in the specified format def export_weight(model): f = open("ghostnetv2.weights", 'w') f.write("{}\n".format(len(model.state_dict().keys()))) # Convert weights to hexadecimal format for k, v in model.state_dict().items(): print('exporting ... 
{}: {}'.format(k, v.shape)) # Reshape the weights to 1D vr = v.reshape(-1).cpu().numpy() f.write("{} {}".format(k, len(vr))) for vv in vr: f.write(" ") f.write(struct.pack(">f", float(vv)).hex()) f.write("\n") f.close() # Function to evaluate the model (optional) def eval_model(input, model): output = model(input) print("------from inference------") print(input) print(output) if __name__ == "__main__": setup_seed(1) # Create an instance of GhostNetV2 model = ghostnetv2(width=1.0, num_classes=1000, dropout=0.2, args=None) model.eval() # Dummy input tensor (adjust the shape as per your requirement) input = torch.full((32, 3, 320, 256), 10.0) # Export the model weights export_weight(model) # Evaluate the model eval_model(input, model) ================================================ FILE: ghostnet/ghostnetv2/ghostnetv2.cpp ================================================ #include #include #include #include #include #include #include #include "NvInfer.h" #include "cuda_runtime_api.h" #include "logging.h" using namespace std; #define CHECK(status) \ do { \ auto ret = (status); \ if (ret != 0) { \ std::cerr << "Cuda failure: " << ret << std::endl; \ abort(); \ } \ } while (0) // Define input/output parameters static const int INPUT_H = 256; static const int INPUT_W = 320; static const int OUTPUT_SIZE = 1000; static const int batchSize = 32; const char* INPUT_BLOB_NAME = "data"; const char* OUTPUT_BLOB_NAME = "prob"; using namespace nvinfer1; static Logger gLogger; // Load weight file std::map loadWeights(const std::string file) { std::cout << "Loading weights: " << file << std::endl; std::map weightMap; // Open the weight file std::ifstream input(file); if (!input.is_open()) { std::cerr << "Unable to load weight file." << std::endl; exit(EXIT_FAILURE); } // Read the number of weights int32_t count; input >> count; if (count <= 0) { std::cerr << "Invalid weight map file." 
<< std::endl; exit(EXIT_FAILURE); } while (count--) { Weights wt{DataType::kFLOAT, nullptr, 0}; uint32_t size; // Read the name and size std::string name; input >> name >> std::dec >> size; wt.type = DataType::kFLOAT; // Load weight data uint32_t* val = reinterpret_cast(malloc(sizeof(uint32_t) * size)); for (uint32_t x = 0, y = size; x < y; ++x) { input >> std::hex >> val[x]; } wt.values = val; wt.count = size; weightMap[name] = wt; } return weightMap; } int _make_divisible(int v, int divisor, int min_value = -1) { // If min_value is not specified, set it to divisor if (min_value == -1) { min_value = divisor; } // Calculate new channel size to be divisible by divisor int new_v = std::max(min_value, (v + divisor / 2) / divisor * divisor); // Ensure rounding down does not reduce by more than 10% if (new_v < static_cast(0.9 * v)) { new_v += divisor; } return new_v; } ILayer* hardSigmoid(INetworkDefinition* network, ITensor& input) { // Apply Hard Sigmoid activation function IActivationLayer* scale_layer = network->addActivation(input, ActivationType::kHARD_SIGMOID); // Return the output after activation return scale_layer; } IScaleLayer* addBatchNorm2d(INetworkDefinition* network, std::map& weightMap, ITensor& input, std::string lname, float eps) { float* gamma = (float*)weightMap[lname + ".weight"].values; float* beta = (float*)weightMap[lname + ".bias"].values; float* mean = (float*)weightMap[lname + ".running_mean"].values; float* var = (float*)weightMap[lname + ".running_var"].values; int len = weightMap[lname + ".running_var"].count; float* scval = reinterpret_cast(malloc(sizeof(float) * len)); for (int i = 0; i < len; i++) { scval[i] = gamma[i] / sqrt(var[i] + eps); } Weights scale{DataType::kFLOAT, scval, len}; float* shval = reinterpret_cast(malloc(sizeof(float) * len)); for (int i = 0; i < len; i++) { shval[i] = beta[i] - mean[i] * gamma[i] / sqrt(var[i] + eps); } Weights shift{DataType::kFLOAT, shval, len}; float* pval = reinterpret_cast(malloc(sizeof(float) 
* len)); for (int i = 0; i < len; i++) { pval[i] = 1.0; } Weights power{DataType::kFLOAT, pval, len}; weightMap[lname + ".scale"] = scale; weightMap[lname + ".shift"] = shift; weightMap[lname + ".power"] = power; IScaleLayer* scale_1 = network->addScale(input, ScaleMode::kCHANNEL, shift, scale, power); assert(scale_1); return scale_1; } IActivationLayer* convBnReluStem(INetworkDefinition* network, std::map& weightMap, ITensor& input, int outch, std::string lname) { Weights emptywts{DataType::kFLOAT, nullptr, 0}; // Step 1: Convolution layer IConvolutionLayer* conv1 = network->addConvolutionNd(input, outch, DimsHW{3, 3}, weightMap[lname + ".weight"], emptywts); assert(conv1); conv1->setStrideNd(DimsHW{2, 2}); // Stride of 2 conv1->setPaddingNd(DimsHW{1, 1}); // Padding of 1 // Step 2: Batch normalization layer IScaleLayer* bn1 = addBatchNorm2d(network, weightMap, *conv1->getOutput(0), "bn1", 1e-5); // Step 3: ReLU activation IActivationLayer* relu1 = network->addActivation(*bn1->getOutput(0), ActivationType::kRELU); assert(relu1); return relu1; // Return the result after activation } ILayer* convBnAct(INetworkDefinition* network, std::map& weightMap, ITensor& input, int out_channels, std::string lname, ActivationType actType = ActivationType::kRELU) { Weights emptywts{DataType::kFLOAT, nullptr, 0}; // Add convolution layer IConvolutionLayer* conv = network->addConvolutionNd(input, out_channels, DimsHW{1, 1}, weightMap[lname + ".conv.weight"], emptywts); assert(conv); conv->setStrideNd(DimsHW{1, 1}); // Add batch normalization layer IScaleLayer* bn = addBatchNorm2d(network, weightMap, *conv->getOutput(0), lname + ".bn1", 1e-5); // Add activation layer (default is ReLU) IActivationLayer* act = network->addActivation(*bn->getOutput(0), actType); assert(act); return act; } ILayer* squeezeExcite(INetworkDefinition* network, ITensor& input, std::map& weightMap, int in_chs, float se_ratio = 0.25, std::string lname = "", float eps = 1e-5) { // Step 1: Global average pooling 
IReduceLayer* avg_pool = network->addReduce(input, ReduceOperation::kAVG, 1 << 2 | 1 << 3, true); assert(avg_pool); // Step 2: 1x1 convolution for dimension reduction int reduced_chs = _make_divisible(static_cast(in_chs * se_ratio), 4); IConvolutionLayer* conv_reduce = network->addConvolutionNd(*avg_pool->getOutput(0), reduced_chs, DimsHW{1, 1}, weightMap[lname + ".conv_reduce.weight"], weightMap[lname + ".conv_reduce.bias"]); assert(conv_reduce); // Step 3: ReLU activation IActivationLayer* relu1 = network->addActivation(*conv_reduce->getOutput(0), ActivationType::kRELU); assert(relu1); // Step 4: 1x1 convolution for dimension expansion IConvolutionLayer* conv_expand = network->addConvolutionNd(*relu1->getOutput(0), in_chs, DimsHW{1, 1}, weightMap[lname + ".conv_expand.weight"], weightMap[lname + ".conv_expand.bias"]); assert(conv_expand); // Step 5: Hard Sigmoid activation ILayer* hard_sigmoid = hardSigmoid(network, *conv_expand->getOutput(0)); // Step 6: Multiply input by the output of SE module IElementWiseLayer* scale = network->addElementWise(input, *hard_sigmoid->getOutput(0), ElementWiseOperation::kPROD); assert(scale); return scale; } ILayer* ghostModuleV2(INetworkDefinition* network, ITensor& input, std::map& weightMap, int inp, int oup, int kernel_size = 1, int ratio = 2, int dw_size = 3, int stride = 1, bool relu = true, std::string lname = "", std::string mode = "original") { int init_channels = std::ceil(oup / ratio); int new_channels = init_channels * (ratio - 1); // Primary convolution IConvolutionLayer* primary_conv = network->addConvolutionNd(input, init_channels, DimsHW{kernel_size, kernel_size}, weightMap[lname + ".primary_conv.0.weight"], Weights{}); primary_conv->setStrideNd(DimsHW{stride, stride}); primary_conv->setPaddingNd(DimsHW{kernel_size / 2, kernel_size / 2}); IScaleLayer* bn1 = addBatchNorm2d(network, weightMap, *primary_conv->getOutput(0), lname + ".primary_conv.1", 1e-5); ITensor* act1_output = bn1->getOutput(0); if (relu) { 
IActivationLayer* relu1 = network->addActivation(*bn1->getOutput(0), ActivationType::kRELU); act1_output = relu1->getOutput(0); } // Cheap operation IConvolutionLayer* cheap_conv = network->addConvolutionNd(*act1_output, new_channels, DimsHW{dw_size, dw_size}, weightMap[lname + ".cheap_operation.0.weight"], Weights{}); cheap_conv->setStrideNd(DimsHW{1, 1}); cheap_conv->setPaddingNd(DimsHW{dw_size / 2, dw_size / 2}); cheap_conv->setNbGroups(init_channels); IScaleLayer* bn2 = addBatchNorm2d(network, weightMap, *cheap_conv->getOutput(0), lname + ".cheap_operation.1", 1e-5); ITensor* act2_output = bn2->getOutput(0); if (relu) { IActivationLayer* relu2 = network->addActivation(*bn2->getOutput(0), ActivationType::kRELU); act2_output = relu2->getOutput(0); } // Concatenate ITensor* concat_inputs[] = {act1_output, act2_output}; IConcatenationLayer* concat = network->addConcatenation(concat_inputs, 2); // Slice to oup channels Dims start{4, {0, 0, 0, 0}}; Dims size = concat->getOutput(0)->getDimensions(); size.d[1] = oup; Dims stride_{4, {1, 1, 1, 1}}; ISliceLayer* slice = network->addSlice(*concat->getOutput(0), start, size, stride_); ITensor* out = slice->getOutput(0); if (mode == "original") { return slice; } else if (mode == "attn") { // Attention mechanism // Average pooling IPoolingLayer* avg_pool = network->addPoolingNd(input, PoolingType::kAVERAGE, DimsHW{2, 2}); avg_pool->setStrideNd(DimsHW{2, 2}); ITensor* avg_pooled = avg_pool->getOutput(0); // Short convolution branch IConvolutionLayer* short_conv1 = network->addConvolutionNd(*avg_pooled, oup, DimsHW{kernel_size, kernel_size}, weightMap[lname + ".short_conv.0.weight"], Weights{}); short_conv1->setStrideNd(DimsHW{1, 1}); short_conv1->setPaddingNd(DimsHW{kernel_size / 2, kernel_size / 2}); IScaleLayer* short_bn1 = addBatchNorm2d(network, weightMap, *short_conv1->getOutput(0), lname + ".short_conv.1", 1e-5); // Conv with kernel size (1,5) IConvolutionLayer* short_conv2 = network->addConvolutionNd( 
*short_bn1->getOutput(0), oup, DimsHW{1, 5}, weightMap[lname + ".short_conv.2.weight"], Weights{}); short_conv2->setStrideNd(DimsHW{1, 1}); short_conv2->setPaddingNd(DimsHW{0, 2}); short_conv2->setNbGroups(oup); IScaleLayer* short_bn2 = addBatchNorm2d(network, weightMap, *short_conv2->getOutput(0), lname + ".short_conv.3", 1e-5); // Conv with kernel size (5,1) IConvolutionLayer* short_conv3 = network->addConvolutionNd( *short_bn2->getOutput(0), oup, DimsHW{5, 1}, weightMap[lname + ".short_conv.4.weight"], Weights{}); short_conv3->setStrideNd(DimsHW{1, 1}); short_conv3->setPaddingNd(DimsHW{2, 0}); short_conv3->setNbGroups(oup); IScaleLayer* short_bn3 = addBatchNorm2d(network, weightMap, *short_conv3->getOutput(0), lname + ".short_conv.5", 1e-5); ITensor* res = short_bn3->getOutput(0); // Sigmoid activation IActivationLayer* gate = network->addActivation(*res, ActivationType::kSIGMOID); // Upsample to the same size as out IResizeLayer* gate_upsampled = network->addResize(*gate->getOutput(0)); gate_upsampled->setResizeMode(ResizeMode::kNEAREST); Dims out_dims = out->getDimensions(); gate_upsampled->setOutputDimensions(out_dims); // Element-wise multiplication IElementWiseLayer* scaled_out = network->addElementWise(*out, *gate_upsampled->getOutput(0), ElementWiseOperation::kPROD); return scaled_out; } else { std::cerr << "Invalid mode: " << mode << " in ghostModuleV2" << std::endl; return nullptr; } } ILayer* ghostBottleneck(INetworkDefinition* network, ITensor& input, std::map& weightMap, int in_chs, int mid_chs, int out_chs, int dw_kernel_size = 3, int stride = 1, float se_ratio = 0.0f, std::string lname = "", int layer_id = 0) { // Determine mode based on layer_id std::string mode = (layer_id <= 1) ? 
"original" : "attn"; // ghost1 ILayer* ghost1 = ghostModuleV2(network, input, weightMap, in_chs, mid_chs, 1, 2, 3, 1, true, lname + ".ghost1", mode); ILayer* depthwise_conv = ghost1; if (stride > 1) { IConvolutionLayer* conv_dw = network->addConvolutionNd(*ghost1->getOutput(0), mid_chs, DimsHW{dw_kernel_size, dw_kernel_size}, weightMap[lname + ".conv_dw.weight"], Weights{}); conv_dw->setStrideNd(DimsHW{stride, stride}); conv_dw->setPaddingNd(DimsHW{(dw_kernel_size - 1) / 2, (dw_kernel_size - 1) / 2}); conv_dw->setNbGroups(mid_chs); IScaleLayer* bn_dw = addBatchNorm2d(network, weightMap, *conv_dw->getOutput(0), lname + ".bn_dw", 1e-5); depthwise_conv = bn_dw; } ILayer* se_layer = depthwise_conv; if (se_ratio > 0.0f) { se_layer = squeezeExcite(network, *depthwise_conv->getOutput(0), weightMap, mid_chs, se_ratio, lname + ".se"); } // ghost2 uses original mode ILayer* ghost2 = ghostModuleV2(network, *se_layer->getOutput(0), weightMap, mid_chs, out_chs, 1, 2, 3, 1, false, lname + ".ghost2", "original"); ILayer* shortcut_layer = nullptr; if (in_chs == out_chs && stride == 1) { shortcut_layer = network->addIdentity(input); } else { IConvolutionLayer* conv_shortcut_dw = network->addConvolutionNd(input, in_chs, DimsHW{dw_kernel_size, dw_kernel_size}, weightMap[lname + ".shortcut.0.weight"], Weights{}); conv_shortcut_dw->setStrideNd(DimsHW{stride, stride}); conv_shortcut_dw->setPaddingNd(DimsHW{(dw_kernel_size - 1) / 2, (dw_kernel_size - 1) / 2}); conv_shortcut_dw->setNbGroups(in_chs); IScaleLayer* bn_shortcut_dw = addBatchNorm2d(network, weightMap, *conv_shortcut_dw->getOutput(0), lname + ".shortcut.1", 1e-5); IConvolutionLayer* conv_shortcut_pw = network->addConvolutionNd(*bn_shortcut_dw->getOutput(0), out_chs, DimsHW{1, 1}, weightMap[lname + ".shortcut.2.weight"], Weights{}); IScaleLayer* bn_shortcut_pw = addBatchNorm2d(network, weightMap, *conv_shortcut_pw->getOutput(0), lname + ".shortcut.3", 1e-5); shortcut_layer = bn_shortcut_pw; } IElementWiseLayer* ew_sum = 
network->addElementWise(*ghost2->getOutput(0), *shortcut_layer->getOutput(0), ElementWiseOperation::kSUM); return ew_sum; } ICudaEngine* createEngine(IBuilder* builder, IBuilderConfig* config, DataType dt) { // Use explicit batch mode INetworkDefinition* network = builder->createNetworkV2(1U << static_cast(NetworkDefinitionCreationFlag::kEXPLICIT_BATCH)); // Create input tensor ITensor* data = network->addInput(INPUT_BLOB_NAME, dt, Dims4{batchSize, 3, INPUT_H, INPUT_W}); assert(data); // Load weights std::map weightMap = loadWeights("../ghostnetv2.weights"); Weights emptywts{DataType::kFLOAT, nullptr, 0}; // Step 1: Conv Stem IActivationLayer* conv_stem = convBnReluStem(network, weightMap, *data, 16, "conv_stem"); ILayer* current_layer = conv_stem; current_layer = ghostBottleneck(network, *current_layer->getOutput(0), weightMap, 16, 16, 16, 3, 1, 0.0f, "blocks.0.0", 0); current_layer = ghostBottleneck(network, *current_layer->getOutput(0), weightMap, 16, 48, 24, 3, 2, 0.0f, "blocks.1.0", 1); current_layer = ghostBottleneck(network, *current_layer->getOutput(0), weightMap, 24, 72, 24, 3, 1, 0.0f, "blocks.2.0", 2); current_layer = ghostBottleneck(network, *current_layer->getOutput(0), weightMap, 24, 72, 40, 5, 2, 0.25f, "blocks.3.0", 3); current_layer = ghostBottleneck(network, *current_layer->getOutput(0), weightMap, 40, 120, 40, 5, 1, 0.25f, "blocks.4.0", 4); current_layer = ghostBottleneck(network, *current_layer->getOutput(0), weightMap, 40, 240, 80, 3, 2, 0.0f, "blocks.5.0", 5); current_layer = ghostBottleneck(network, *current_layer->getOutput(0), weightMap, 80, 200, 80, 3, 1, 0.0f, "blocks.6.0", 6); current_layer = ghostBottleneck(network, *current_layer->getOutput(0), weightMap, 80, 184, 80, 3, 1, 0.0f, "blocks.6.1", 7); current_layer = ghostBottleneck(network, *current_layer->getOutput(0), weightMap, 80, 184, 80, 3, 1, 0.0f, "blocks.6.2", 8); current_layer = ghostBottleneck(network, *current_layer->getOutput(0), weightMap, 80, 480, 112, 3, 1, 0.25f, 
"blocks.6.3", 9); current_layer = ghostBottleneck(network, *current_layer->getOutput(0), weightMap, 112, 672, 112, 3, 1, 0.25f, "blocks.6.4", 10); current_layer = ghostBottleneck(network, *current_layer->getOutput(0), weightMap, 112, 672, 160, 5, 2, 0.25f, "blocks.7.0", 11); current_layer = ghostBottleneck(network, *current_layer->getOutput(0), weightMap, 160, 960, 160, 5, 1, 0.0f, "blocks.8.0", 12); current_layer = ghostBottleneck(network, *current_layer->getOutput(0), weightMap, 160, 960, 160, 5, 1, 0.25f, "blocks.8.1", 13); current_layer = ghostBottleneck(network, *current_layer->getOutput(0), weightMap, 160, 960, 160, 5, 1, 0.0f, "blocks.8.2", 14); current_layer = ghostBottleneck(network, *current_layer->getOutput(0), weightMap, 160, 960, 160, 5, 1, 0.25f, "blocks.8.3", 15); // Apply ConvBnAct current_layer = convBnAct(network, weightMap, *current_layer->getOutput(0), 960, "blocks.9.0"); // Global average pooling IReduceLayer* global_pool = network->addReduce(*current_layer->getOutput(0), ReduceOperation::kAVG, 1 << 2 | 1 << 3, true); assert(global_pool); // Conv Head IConvolutionLayer* conv_head = network->addConvolutionNd( *global_pool->getOutput(0), 1280, DimsHW{1, 1}, weightMap["conv_head.weight"], weightMap["conv_head.bias"]); IActivationLayer* act2 = network->addActivation(*conv_head->getOutput(0), ActivationType::kRELU); // Fully connected layer (classifier) IFullyConnectedLayer* classifier = network->addFullyConnected( *act2->getOutput(0), 1000, weightMap["classifier.weight"], weightMap["classifier.bias"]); classifier->getOutput(0)->setName(OUTPUT_BLOB_NAME); network->markOutput(*classifier->getOutput(0)); // Build the engine config->setMaxWorkspaceSize(1 << 24); ICudaEngine* engine = builder->buildEngineWithConfig(*network, *config); // Destroy the network network->destroy(); // Free memory for (auto& mem : weightMap) { free((void*)(mem.second.values)); } return engine; } void APIToModel(IHostMemory** modelStream) { // Create builder IBuilder* builder 
= createInferBuilder(gLogger); IBuilderConfig* config = builder->createBuilderConfig(); // Create model and serialize ICudaEngine* engine = createEngine(builder, config, DataType::kFLOAT); assert(engine != nullptr); // Serialize the engine (*modelStream) = engine->serialize(); // Release resources engine->destroy(); config->destroy(); builder->destroy(); } void doInference(IExecutionContext& context, float* input, float* output, int batchSize) { const ICudaEngine& engine = context.getEngine(); const int inputIndex = engine.getBindingIndex(INPUT_BLOB_NAME); const int outputIndex = engine.getBindingIndex(OUTPUT_BLOB_NAME); // Input and output buffers void* buffers[2]; // Create buffers on device CHECK(cudaMalloc(&buffers[inputIndex], batchSize * 3 * INPUT_H * INPUT_W * sizeof(float))); CHECK(cudaMalloc(&buffers[outputIndex], batchSize * OUTPUT_SIZE * sizeof(float))); // Create stream cudaStream_t stream; CHECK(cudaStreamCreate(&stream)); // Copy input data to device, execute inference, and copy output back to host CHECK(cudaMemcpyAsync(buffers[inputIndex], input, batchSize * 3 * INPUT_H * INPUT_W * sizeof(float), cudaMemcpyHostToDevice, stream)); context.enqueueV2(buffers, stream, nullptr); CHECK(cudaMemcpyAsync(output, buffers[outputIndex], batchSize * OUTPUT_SIZE * sizeof(float), cudaMemcpyDeviceToHost, stream)); cudaStreamSynchronize(stream); // Release stream and buffers cudaStreamDestroy(stream); CHECK(cudaFree(buffers[inputIndex])); CHECK(cudaFree(buffers[outputIndex])); } int main(int argc, char** argv) { if (argc != 2) { std::cerr << "arguments not right!" 
<< std::endl; std::cerr << "./ghostnetv2 -s // serialize model to plan file" << std::endl; std::cerr << "./ghostnetv2 -d // deserialize plan file and run inference" << std::endl; return -1; } // Create model and serialize char* trtModelStream{nullptr}; size_t size{0}; if (std::string(argv[1]) == "-s") { IHostMemory* modelStream{nullptr}; APIToModel(&modelStream); assert(modelStream != nullptr); std::ofstream p("ghostnetv2.engine", std::ios::binary); if (!p) { std::cerr << "could not open plan output file" << std::endl; return -1; } p.write(reinterpret_cast(modelStream->data()), modelStream->size()); modelStream->destroy(); return 0; } else if (std::string(argv[1]) == "-d") { std::ifstream file("ghostnetv2.engine", std::ios::binary); if (file.good()) { file.seekg(0, file.end); size = file.tellg(); file.seekg(0, file.beg); trtModelStream = new char[size]; assert(trtModelStream); file.read(trtModelStream, size); file.close(); } } else { return -1; } // Allocate input and output data float* data = new float[batchSize * 3 * INPUT_H * INPUT_W]; for (int i = 0; i < batchSize * 3 * INPUT_H * INPUT_W; i++) data[i] = 10.0; float* prob = new float[batchSize * OUTPUT_SIZE]; IRuntime* runtime = createInferRuntime(gLogger); assert(runtime != nullptr); ICudaEngine* engine = runtime->deserializeCudaEngine(trtModelStream, size, nullptr); assert(engine != nullptr); IExecutionContext* context = engine->createExecutionContext(); assert(context != nullptr); delete[] trtModelStream; // Execute inference doInference(*context, data, prob, batchSize); // Print output results std::cout << "\nOutput:\n\n"; for (int i = 0; i < batchSize; i++) { std::cout << "Batch " << i << ":\n"; for (unsigned int j = 0; j < OUTPUT_SIZE; j++) { std::cout << prob[i * OUTPUT_SIZE + j] << ", "; if (j % 10 == 0) std::cout << j / 10 << std::endl; } std::cout << "\n"; } // Release resources context->destroy(); engine->destroy(); runtime->destroy(); delete[] data; delete[] prob; return 0; } 
================================================ FILE: ghostnet/ghostnetv2/logging.h ================================================ /* * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ #ifndef TENSORRT_LOGGING_H #define TENSORRT_LOGGING_H #include #include #include #include #include #include #include #include "NvInferRuntimeCommon.h" using Severity = nvinfer1::ILogger::Severity; class LogStreamConsumerBuffer : public std::stringbuf { public: LogStreamConsumerBuffer(std::ostream& stream, const std::string& prefix, bool shouldLog) : mOutput(stream), mPrefix(prefix), mShouldLog(shouldLog) {} LogStreamConsumerBuffer(LogStreamConsumerBuffer&& other) : mOutput(other.mOutput) {} ~LogStreamConsumerBuffer() { // std::streambuf::pbase() gives a pointer to the beginning of the buffered part of the output sequence // std::streambuf::pptr() gives a pointer to the current position of the output sequence // if the pointer to the beginning is not equal to the pointer to the current position, // call putOutput() to log the output to the stream if (pbase() != pptr()) { putOutput(); } } // synchronizes the stream buffer and returns 0 on success // synchronizing the stream buffer consists of inserting the buffer contents into the stream, // resetting the buffer and flushing the stream virtual int sync() { putOutput(); return 0; } void putOutput() { if (mShouldLog) { // prepend timestamp std::time_t timestamp = std::time(nullptr); tm* 
tm_local = std::localtime(×tamp); std::cout << "["; std::cout << std::setw(2) << std::setfill('0') << 1 + tm_local->tm_mon << "/"; std::cout << std::setw(2) << std::setfill('0') << tm_local->tm_mday << "/"; std::cout << std::setw(4) << std::setfill('0') << 1900 + tm_local->tm_year << "-"; std::cout << std::setw(2) << std::setfill('0') << tm_local->tm_hour << ":"; std::cout << std::setw(2) << std::setfill('0') << tm_local->tm_min << ":"; std::cout << std::setw(2) << std::setfill('0') << tm_local->tm_sec << "] "; // std::stringbuf::str() gets the string contents of the buffer // insert the buffer contents pre-appended by the appropriate prefix into the stream mOutput << mPrefix << str(); // set the buffer to empty str(""); // flush the stream mOutput.flush(); } } void setShouldLog(bool shouldLog) { mShouldLog = shouldLog; } private: std::ostream& mOutput; std::string mPrefix; bool mShouldLog; }; //! //! \class LogStreamConsumerBase //! \brief Convenience object used to initialize LogStreamConsumerBuffer before std::ostream in LogStreamConsumer //! class LogStreamConsumerBase { public: LogStreamConsumerBase(std::ostream& stream, const std::string& prefix, bool shouldLog) : mBuffer(stream, prefix, shouldLog) {} protected: LogStreamConsumerBuffer mBuffer; }; //! //! \class LogStreamConsumer //! \brief Convenience object used to facilitate use of C++ stream syntax when logging messages. //! Order of base classes is LogStreamConsumerBase and then std::ostream. //! This is because the LogStreamConsumerBase class is used to initialize the LogStreamConsumerBuffer member field //! in LogStreamConsumer and then the address of the buffer is passed to std::ostream. //! This is necessary to prevent the address of an uninitialized buffer from being passed to std::ostream. //! Please do not change the order of the parent classes. //! class LogStreamConsumer : protected LogStreamConsumerBase, public std::ostream { public: //! 
    //! \brief Creates a LogStreamConsumer which logs messages with level severity.
    //!  Reportable severity determines if the messages are severe enough to be logged.
    LogStreamConsumer(Severity reportableSeverity, Severity severity)
        : LogStreamConsumerBase(severityOstream(severity), severityPrefix(severity), severity <= reportableSeverity),
          std::ostream(&mBuffer)  // links the stream buffer with the stream
          ,
          mShouldLog(severity <= reportableSeverity),
          mSeverity(severity) {}

    // Builds a fresh buffer from other's severity and log flag; text already
    // buffered in `other` is not transferred by this constructor.
    LogStreamConsumer(LogStreamConsumer&& other)
        : LogStreamConsumerBase(severityOstream(other.mSeverity), severityPrefix(other.mSeverity), other.mShouldLog),
          std::ostream(&mBuffer)  // links the stream buffer with the stream
          ,
          mShouldLog(other.mShouldLog),
          mSeverity(other.mSeverity) {}

    void setReportableSeverity(Severity reportableSeverity) {
        mShouldLog = mSeverity <= reportableSeverity;
        mBuffer.setShouldLog(mShouldLog);
    }

   private:
    // kINFO and kVERBOSE go to stdout, everything more severe goes to stderr
    static std::ostream& severityOstream(Severity severity) {
        return severity >= Severity::kINFO ? std::cout : std::cerr;
    }

    static std::string severityPrefix(Severity severity) {
        switch (severity) {
            case Severity::kINTERNAL_ERROR:
                return "[F] ";
            case Severity::kERROR:
                return "[E] ";
            case Severity::kWARNING:
                return "[W] ";
            case Severity::kINFO:
                return "[I] ";
            case Severity::kVERBOSE:
                return "[V] ";
            default:
                assert(0);
                return "";
        }
    }

    bool mShouldLog;
    Severity mSeverity;
};

//! \class Logger
//!
//! \brief Class which manages logging of TensorRT tools and samples
//!
//! \details This class provides a common interface for TensorRT tools and samples to log information to the console,
//! and supports logging two types of messages:
//!
//! - Debugging messages with an associated severity (info, warning, error, or internal error/fatal)
//! - Test pass/fail messages
//!
//! The advantage of having all samples use this class for logging as opposed to emitting directly to stdout/stderr is
//! that the logic for controlling the verbosity and formatting of sample output is centralized in one location.
//!
//! In the future, this class could be extended to support dumping test results to a file in some standard format
//! (for example, JUnit XML), and providing additional metadata (e.g. timing the duration of a test run).
//!
//! TODO: For backwards compatibility with existing samples, this class inherits directly from the nvinfer1::ILogger
//! interface, which is problematic since there isn't a clean separation between messages coming from the TensorRT
//! library and messages coming from the sample.
//!
//! In the future (once all samples are updated to use Logger::getTRTLogger() to access the ILogger) we can refactor the
//! class to eliminate the inheritance and instead make the nvinfer1::ILogger implementation a member of the Logger
//! object.
class Logger : public nvinfer1::ILogger {
   public:
    Logger(Severity severity = Severity::kWARNING) : mReportableSeverity(severity) {}

    //!
    //! \enum TestResult
    //! \brief Represents the state of a given test
    //!
    enum class TestResult {
        kRUNNING,  //!< The test is running
        kPASSED,   //!< The test passed
        kFAILED,   //!< The test failed
        kWAIVED    //!< The test was waived
    };

    //!
    //! \brief Forward-compatible method for retrieving the nvinfer::ILogger associated with this Logger
    //! \return The nvinfer1::ILogger associated with this Logger
    //!
    //! TODO Once all samples are updated to use this method to register the logger with TensorRT,
    //! we can eliminate the inheritance of Logger from ILogger
    //!
    nvinfer1::ILogger& getTRTLogger() { return *this; }

    //!
    //! \brief Implementation of the nvinfer1::ILogger::log() virtual method
    //!
    //! Note samples should not be calling this function directly; it will eventually go away once we eliminate the
    //! inheritance from nvinfer1::ILogger
    //!
    //! The message is streamed through a temporary LogStreamConsumer, which
    //! drops it when severity is not within mReportableSeverity.
    //!
    void log(Severity severity, const char* msg) noexcept override {
        LogStreamConsumer(mReportableSeverity, severity) << "[TRT] " << std::string(msg) << std::endl;
    }

    //!
    //! \brief Method for controlling the verbosity of logging output
    //!
    //! \param severity The logger will only emit messages that have severity of this level or higher.
    //!
    void setReportableSeverity(Severity severity) { mReportableSeverity = severity; }

    //!
    //! \brief Opaque handle that holds logging information for a particular test
    //!
    //! This object is an opaque handle to information used by the Logger to print test results.
    //! The sample must call Logger::defineTest() in order to obtain a TestAtom that can be used
    //! with Logger::reportTest{Start,End}().
    //!
    class TestAtom {
       public:
        TestAtom(TestAtom&&) = default;

       private:
        friend class Logger;

        // Only Logger (a friend) can create atoms, via defineTest()
        TestAtom(bool started, const std::string& name, const std::string& cmdline)
            : mStarted(started), mName(name), mCmdline(cmdline) {}

        bool mStarted;
        std::string mName;
        std::string mCmdline;
    };

    //!
    //! \brief Define a test for logging
    //!
    //! \param[in] name The name of the test.  This should be a string starting with
    //!                  "TensorRT" and containing dot-separated strings containing
    //!                  the characters [A-Za-z0-9_].
    //!                  For example, "TensorRT.sample_googlenet"
    //! \param[in] cmdline The command line used to reproduce the test
    //
    //! \return a TestAtom that can be used in Logger::reportTest{Start,End}().
    //!
    static TestAtom defineTest(const std::string& name, const std::string& cmdline) {
        return TestAtom(false, name, cmdline);
    }

    //!
    //! \brief A convenience overloaded version of defineTest() that accepts an array of command-line arguments
    //!        as input
    //!
    //! \param[in] name The name of the test
    //! \param[in] argc The number of command-line arguments
    //! \param[in] argv The array of command-line arguments (given as C strings)
    //!
    //! \return a TestAtom that can be used in Logger::reportTest{Start,End}().
    static TestAtom defineTest(const std::string& name, int argc, char const* const* argv) {
        auto cmdline = genCmdlineString(argc, argv);
        return defineTest(name, cmdline);
    }

    //!
    //! \brief Report that a test has started.
    //!
    //! \pre reportTestStart() has not been called yet for the given testAtom
    //!
    //! \param[in] testAtom The handle to the test that has started
    //!
    static void reportTestStart(TestAtom& testAtom) {
        reportTestResult(testAtom, TestResult::kRUNNING);
        // NOTE(review): the precondition is asserted only after the RUNNING line was printed
        assert(!testAtom.mStarted);
        testAtom.mStarted = true;
    }

    //!
    //! \brief Report that a test has ended.
    //!
    //! \pre reportTestStart() has been called for the given testAtom
    //!
    //! \param[in] testAtom The handle to the test that has ended
    //! \param[in] result The result of the test. Should be one of TestResult::kPASSED,
    //!                   TestResult::kFAILED, TestResult::kWAIVED
    //!
    static void reportTestEnd(const TestAtom& testAtom, TestResult result) {
        assert(result != TestResult::kRUNNING);
        assert(testAtom.mStarted);
        reportTestResult(testAtom, result);
    }

    // The report* helpers below print the result and return a process exit code
    static int reportPass(const TestAtom& testAtom) {
        reportTestEnd(testAtom, TestResult::kPASSED);
        return EXIT_SUCCESS;
    }

    static int reportFail(const TestAtom& testAtom) {
        reportTestEnd(testAtom, TestResult::kFAILED);
        return EXIT_FAILURE;
    }

    static int reportWaive(const TestAtom& testAtom) {
        reportTestEnd(testAtom, TestResult::kWAIVED);
        return EXIT_SUCCESS;
    }

    static int reportTest(const TestAtom& testAtom, bool pass) {
        return pass ? reportPass(testAtom) : reportFail(testAtom);
    }

    Severity getReportableSeverity() const { return mReportableSeverity; }

   private:
    //!
    //! \brief returns an appropriate string for prefixing a log message with the given severity
    //!
    static const char* severityPrefix(Severity severity) {
        switch (severity) {
            case Severity::kINTERNAL_ERROR:
                return "[F] ";
            case Severity::kERROR:
                return "[E] ";
            case Severity::kWARNING:
                return "[W] ";
            case Severity::kINFO:
                return "[I] ";
            case Severity::kVERBOSE:
                return "[V] ";
            default:
                assert(0);
                return "";
        }
    }

    //!
    //! \brief returns an appropriate string for prefixing a test result message with the given result
    //!
    static const char* testResultString(TestResult result) {
        switch (result) {
            case TestResult::kRUNNING:
                return "RUNNING";
            case TestResult::kPASSED:
                return "PASSED";
            case TestResult::kFAILED:
                return "FAILED";
            case TestResult::kWAIVED:
                return "WAIVED";
            default:
                assert(0);
                return "";
        }
    }

    //!
    //! \brief returns an appropriate output stream (cout or cerr) to use with the given severity
    //!
    static std::ostream& severityOstream(Severity severity) {
        return severity >= Severity::kINFO ? std::cout : std::cerr;
    }

    //!
    //! \brief method that implements logging test results
    //!
    static void reportTestResult(const TestAtom& testAtom, TestResult result) {
        severityOstream(Severity::kINFO) << "&&&& " << testResultString(result) << " " << testAtom.mName << " # "
                                         << testAtom.mCmdline << std::endl;
    }

    //!
    //! \brief generate a command line string from the given (argc, argv) values
    //!
    static std::string genCmdlineString(int argc, char const* const* argv) {
        std::stringstream ss;
        for (int i = 0; i < argc; i++) {
            if (i > 0)
                ss << " ";
            ss << argv[i];
        }
        return ss.str();
    }

    Severity mReportableSeverity;
};

namespace {

//!
//! \brief produces a LogStreamConsumer object that can be used to log messages of severity kVERBOSE
//!
//! Example usage:
//!
//!     LOG_VERBOSE(logger) << "hello world" << std::endl;
//!
inline LogStreamConsumer LOG_VERBOSE(const Logger& logger) {
    return LogStreamConsumer(logger.getReportableSeverity(), Severity::kVERBOSE);
}

//!
//! \brief produces a LogStreamConsumer object that can be used to log messages of severity kINFO
//!
//! Example usage:
//!
//!     LOG_INFO(logger) << "hello world" << std::endl;
//!
inline LogStreamConsumer LOG_INFO(const Logger& logger) {
    return LogStreamConsumer(logger.getReportableSeverity(), Severity::kINFO);
}

//!
//! \brief produces a LogStreamConsumer object that can be used to log messages of severity kWARNING
//!
//! Example usage:
//!
//!     LOG_WARN(logger) << "hello world" << std::endl;
//!
inline LogStreamConsumer LOG_WARN(const Logger& logger) {
    return LogStreamConsumer(logger.getReportableSeverity(), Severity::kWARNING);
}

//!
//! \brief produces a LogStreamConsumer object that can be used to log messages of severity kERROR
//!
//! Example usage:
//!
//!     LOG_ERROR(logger) << "hello world" << std::endl;
//!
inline LogStreamConsumer LOG_ERROR(const Logger& logger) {
    return LogStreamConsumer(logger.getReportableSeverity(), Severity::kERROR);
}

//!
//! \brief produces a LogStreamConsumer object that can be used to log messages of severity kINTERNAL_ERROR
//         ("fatal" severity)
//!
//! Example usage:
//!
//!     LOG_FATAL(logger) << "hello world" << std::endl;
//!
inline LogStreamConsumer LOG_FATAL(const Logger& logger) {
    return LogStreamConsumer(logger.getReportableSeverity(), Severity::kINTERNAL_ERROR);
}

}  // anonymous namespace

#endif  // TENSORRT_LOGGING_H

================================================
FILE: googlenet/CMakeLists.txt
================================================
# Build script for the googlenet sample executable.
cmake_minimum_required(VERSION 3.14)

project(
  googlenet
  VERSION 0.1
  LANGUAGES C CXX CUDA)

# Default set of GPU architectures; override with -DCMAKE_CUDA_ARCHITECTURES=...
if(NOT DEFINED CMAKE_CUDA_ARCHITECTURES)
  set(CMAKE_CUDA_ARCHITECTURES
      60
      70
      72
      75
      80
      86
      89)
endif()

set(CMAKE_CXX_STANDARD 17)
set(CMAKE_CXX_STANDARD_REQUIRED ON)
set(CMAKE_CUDA_STANDARD 17)
set(CMAKE_CUDA_STANDARD_REQUIRED ON)
set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
set(CMAKE_INCLUDE_CURRENT_DIR TRUE)
option(CUDA_USE_STATIC_CUDA_RUNTIME "Use static cudaruntime library" OFF)

find_package(Threads REQUIRED)
find_package(CUDAToolkit REQUIRED)
find_package(OpenCV REQUIRED)

# Only pull in the bundled finder when no parent project already defined the target
if(NOT TARGET TensorRT::TensorRT)
  include(FindTensorRT.cmake)
endif()

add_executable(${PROJECT_NAME} googlenet.cpp)

target_include_directories(${PROJECT_NAME} PUBLIC ${CMAKE_CURRENT_LIST_DIR}
                                                  ${OpenCV_INCLUDE_DIRS})

target_link_libraries(${PROJECT_NAME} PUBLIC Threads::Threads CUDA::cudart
                                             TensorRT::TensorRT ${OpenCV_LIBS})

================================================
FILE: googlenet/FindTensorRT.cmake
================================================
cmake_minimum_required(VERSION 3.17.0)

# _guess_path(<var_name> <required_files> <candidate>...)
#
# Collects every candidate directory that exists and contains all entries of
# the ;-separated list <required_files>, and stores the resulting list in
# <var_name> in the caller's scope. Aborts configuration when no candidate
# qualifies.
function(_guess_path var_name required_files)
  set(_result "")

  foreach(path_entry IN LISTS ARGN)
    if(NOT EXISTS "${path_entry}")
      message(DEBUG "skip non-existing path '${path_entry}'")
      continue()
    endif()

    set(_ok TRUE)
    foreach(required_file IN LISTS required_files)
      if(NOT EXISTS "${path_entry}/${required_file}")
        set(_ok FALSE)
        message(DEBUG "'${path_entry}' missing '${required_file}'")
        break()
      endif()
    endforeach()

    if(_ok)
      list(APPEND _result "${path_entry}")
      message(DEBUG "accept '${path_entry}'")
    else()
      message(DEBUG "reject '${path_entry}'")
    endif()
  endforeach()

  if(_result STREQUAL "")
    message(
      FATAL_ERROR
        "_guess_path(${var_name}) failed: no valid path found. required_files='${required_files}' candidates='${ARGN}'"
    )
  endif()

  set(${var_name}
      "${_result}"
      PARENT_SCOPE)
endfunction()

# add library
add_library(TensorRT IMPORTED INTERFACE)
add_library(TensorRT::TensorRT ALIAS TensorRT)

set(TRT_VERSION
    CACHE
      STRING
      "TensorRT version, e.g. \"8.6.1.6\" or \"8.6.1.6+cuda12.0.1.011\", \"8.6.1.6.Windows10.x86_64.cuda-12.0\" etc"
)

# Every expansion below is quoted: an empty or unset value would otherwise
# vanish from the argument list and make the command itself malformed.
if(NOT "${TRT_VERSION}" STREQUAL "" AND NOT "$ENV{TRT_VERSION}" STREQUAL "")
  message(
    WARNING
      "TRT_VERSION defined by cmake and environment variable both, using the later one"
  )
endif()

if(NOT "$ENV{TRT_VERSION}" STREQUAL "")
  set(TRT_VERSION "$ENV{TRT_VERSION}")
endif()

if("${TRT_VERSION}" STREQUAL "")
  message(
    FATAL_ERROR
      "TRT_VERSION is not set, pass -DTRT_VERSION=<version> or export TRT_VERSION"
  )
endif()

# The major version is the first run of digits in the version string
string(REGEX MATCH "([0-9]+)" _match "${TRT_VERSION}")
set(TRT_MAJOR_VERSION "${_match}")
unset(_match)

if("${TRT_MAJOR_VERSION}" STREQUAL "")
  message(
    FATAL_ERROR "cannot parse a major version from TRT_VERSION='${TRT_VERSION}'")
endif()

if(WIN32)
  set(TensorRT_DIR "C:/Program Files/TensorRT-${TRT_VERSION}")
  if(NOT EXISTS "${TensorRT_DIR}")
    message(FATAL_ERROR "TensorRT_DIR=${TensorRT_DIR} does not exist!")
  endif()

  if(TRT_MAJOR_VERSION GREATER_EQUAL 10)
    set(_modules nvinfer_10 nvinfer_plugin_10 nvinfer_vc_plugin_10
                 nvinfer_dispatch_10 nvinfer_lean_10)
    message(DEBUG "Using ${_modules}")
  else()
    set(_modules nvinfer nvinfer_plugin nvinfer_vc_plugin nvinfer_dispatch
                 nvinfer_lean)
  endif()

  set(TensorRT_LIBRARY_DIR "${TensorRT_DIR}/lib")
  set(TensorRT_INCLUDE_DIR "${TensorRT_DIR}/include")
elseif(UNIX)
  string(TOLOWER "${CMAKE_SYSTEM_PROCESSOR}" _trt_arch)
  set(_trt_include_candidates)
  set(_trt_library_candidates)
  if(_trt_arch MATCHES "^(aarch64|arm64|arch64)$")
    set(_trt_include_candidates
        "/usr/include/aarch64-linux-gnu" "/usr/include"
        "/usr/local/cuda/targets/aarch64-linux/include")
    set(_trt_library_candidates
        "/usr/local/tensorrt/targets/aarch64-linux-gnu/lib"
        "/usr/lib/aarch64-linux-gnu" "/usr/lib/aarch64-linux-gnu/tegra"
        "/usr/lib")
  elseif(_trt_arch MATCHES "^(x86_64|amd64)$")
    set(_trt_include_candidates
        "/usr/local/tensorrt/targets/x86_64-linux-gnu/include"
        "/usr/include/x86_64-linux-gnu" "/usr/include")
    set(_trt_library_candidates
        "/usr/local/tensorrt/targets/x86_64-linux-gnu/lib"
        "/usr/lib/x86_64-linux-gnu" "/usr/lib")
  else()
    message(FATAL_ERROR "Unknown architecture")
  endif()

  set(_modules nvinfer nvinfer_plugin)
  if(TRT_MAJOR_VERSION GREATER_EQUAL 8)
    list(APPEND _modules nvinfer_vc_plugin nvinfer_dispatch nvinfer_lean)
  endif()

  _guess_path(TensorRT_LIBRARY_DIR "libnvinfer.so;libnvinfer_plugin.so"
              ${_trt_library_candidates})
  message(STATUS "TensorRT libraries: ${TensorRT_LIBRARY_DIR}")
  _guess_path(TensorRT_INCLUDE_DIR "NvInfer.h" ${_trt_include_candidates})
  message(STATUS "TensorRT includes: ${TensorRT_INCLUDE_DIR}")
endif()

# Resolve each module to a library file and attach it to the interface target
foreach(lib IN LISTS _modules)
  find_library(
    TensorRT_${lib}_LIBRARY
    NAMES ${lib}
    HINTS ${TensorRT_LIBRARY_DIR})
  list(APPEND TensorRT_LIBRARIES ${TensorRT_${lib}_LIBRARY})
endforeach()

target_link_libraries(TensorRT INTERFACE ${TensorRT_LIBRARIES})

message(STATUS "Found TensorRT libs: ${TensorRT_LIBRARIES}")

set_target_properties(
  TensorRT
  PROPERTIES C_STANDARD 17
             CXX_STANDARD 17
             POSITION_INDEPENDENT_CODE ON
             SKIP_BUILD_RPATH TRUE
             BUILD_WITH_INSTALL_RPATH TRUE
             INSTALL_RPATH "$ORIGIN"
             INTERFACE_INCLUDE_DIRECTORIES "${TensorRT_INCLUDE_DIR}")

unset(TRT_MAJOR_VERSION)
unset(_modules)
unset(_trt_include_candidates)
unset(_trt_library_candidates)
unset(_trt_arch)

================================================
FILE: googlenet/README.md
================================================
# Googlenet

## Introduction

GoogLeNet (Inception v1) model architecture from [Going Deeper with Convolutions](http://arxiv.org/abs/1409.4842). For model details, refer to the code from [torchvision](https://github.com/pytorch/vision/blob/main/torchvision/models/googlenet.py#L29); for generating the `.wts` file, refer to [pytorchx/googlenet](https://github.com/wang-xinyu/pytorchx/tree/master/googlenet).

## Usage

1. use `gen_wts.py` to generate the wts file.

```bash
python3 gen_wts.py
```

2. build C++ code

```bash
pushd tensorrtx/googlenet
cmake -S . -B build -G Ninja --fresh
cmake --build build
```

3. serialize the wts model to an engine file.

```bash
./build/googlenet -s
```

4. deserialize the engine file and run inference

```bash
./build/googlenet -d
```

output looks like:

```bash
...
==== Execution time: 637us -1.823, -0.9841, 0.6483, 0.7607, -0.4659, -1.407, -2.807, -1.175, -0.4034, -1.881, -1.267, -1.654, 0.7542, -1.777, -0.7118, -2.134, -1.542, 0.1852, -3.036, -0.5396, -0.1669, ==== prediction result: Top: 0 idx: 285, logits: 9.9, label: Egyptian cat Top: 1 idx: 281, logits: 8.304, label: tabby, tabby cat Top: 2 idx: 282, logits: 6.859, label: tiger cat ``` ================================================ FILE: googlenet/gen_wts.py ================================================ import struct import cv2 import numpy as np import torch from torchvision.models.googlenet import googlenet def read_imagenet_labels() -> dict[int, str]: """ read ImageNet 1000 labels Returns: dict[int, str]: labels dict """ clsid2label = {} with open("../assets/imagenet1000_clsidx_to_labels.txt", "r") as f: for i in f.readlines(): k, v = i.split(": ") clsid2label.setdefault(int(k), v[1:-3]) return clsid2label def preprocess(img: np.array) -> torch.Tensor: """ a preprocess method align with ImageNet dataset Args: img (np.array): input image Returns: torch.Tensor: preprocessed image in `NCHW` layout """ img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB).astype(np.float32) / 255.0 img = cv2.resize(img, (224, 224), interpolation=cv2.INTER_LINEAR) mean = np.array([0.485, 0.456, 0.406], dtype=np.float32) std = np.array([0.229, 0.224, 0.225], dtype=np.float32) img = (img - mean) / std img = img.transpose(2, 0, 1)[None, ...] 
return torch.from_numpy(img) def main(): labels = read_imagenet_labels() img = cv2.imread("../assets/cats.jpg", cv2.IMREAD_COLOR) img = preprocess(img) model = googlenet(pretrained=True) with torch.inference_mode(): model = model.eval() output = model(img) for i, batch in enumerate(torch.topk(output, k=3).indices): for j, idx in enumerate(batch): print(f"\tBatch: {i}, Top: {j}, logits: {output[i][idx]:.4f}, label: {labels[int(idx)]}") print(f"{'=' * 32}") with open("../models/googlenet.wts", "w") as f: f.write("{}\n".format(len(model.state_dict().keys()))) for k, v in model.state_dict().items(): vr = v.reshape(-1).cpu().numpy() f.write("{} {} ".format(k, len(vr))) print(k, v.shape) for vv in vr: f.write(" ") f.write(struct.pack(">f", float(vv)).hex()) f.write("\n") f.close() if __name__ == "__main__": main() ================================================ FILE: googlenet/googlenet.cpp ================================================ #include #include #include #include #include #include #include "logging.h" #include "utils.h" using WeightMap = std::map; using M = nvinfer1::MatrixOperation; using E = nvinfer1::ElementWiseOperation; using NDCF = nvinfer1::NetworkDefinitionCreationFlag; static Logger gLogger; // stuff we know about googlenet static constexpr const std::size_t N = 1; static constexpr const int32_t INPUT_H = 224; static constexpr const int32_t INPUT_W = 224; static constexpr const std::array SIZES = {3 * INPUT_H * INPUT_W, 1000}; static constexpr const std::array NAMES = {"data", "prob"}; static constexpr const bool TRT_PREPROCESS = TRT_VERSION >= 8510 ? 
true : false; static constexpr const char* WTS_PATH = "../models/googlenet.wts"; static constexpr const char* ENGINE_PATH = "../models/googlenet.engine"; static constexpr const char* LABELS_PATH = "../assets/imagenet1000_clsidx_to_labels.txt"; static constexpr const std::array mean = {0.485f, 0.456f, 0.406f}; static constexpr const std::array stdv = {0.229f, 0.224f, 0.225f}; auto addBatchNorm2d(INetworkDefinition* network, WeightMap& m, ITensor& input, const std::string& lname, float eps = 1e-3) -> ILayer* { static Weights none{DataType::kFLOAT, nullptr, 0ll}; float* gamma = (float*)m[lname + ".weight"].values; float* beta = (float*)m[lname + ".bias"].values; float* mean = (float*)m[lname + ".running_mean"].values; float* var = (float*)m[lname + ".running_var"].values; int64_t len = m[lname + ".running_var"].count; auto* scval = static_cast(malloc(sizeof(float) * len)); for (int i = 0; i < len; i++) { scval[i] = gamma[i] / sqrt(var[i] + eps); } Weights scale{DataType::kFLOAT, scval, len}; auto* shift_val = static_cast(malloc(sizeof(float) * len)); for (int i = 0; i < len; i++) { shift_val[i] = beta[i] - (mean[i] * scval[i]); } Weights shift{DataType::kFLOAT, shift_val, len}; m[lname + ".scale"] = scale; m[lname + ".shift"] = shift; m[lname + ".power"] = none; auto* bn = network->addScale(input, ScaleMode::kCHANNEL, shift, scale, none); assert(bn); bn->setName(lname.c_str()); return bn; } /** * @brief A basic conv2d+bn+relu layer from googlenet * * @param network network definition from TensorRT API * @param weightMap weight map * @param input input tensor * @param outch output channels * @param k kernel size for convolution * @param s stride size for convolution * @param p padding size for convolution * @param lname layer name from weight map * @return ILayer* */ ILayer* basicConv2d(INetworkDefinition* network, WeightMap& weightMap, ITensor& input, const std::string& lname, int32_t outch, int k, int s = 1, int p = 0) { static const Weights none{DataType::kFLOAT, 
nullptr, 0ll}; auto* conv = network->addConvolutionNd(input, outch, DimsHW{k, k}, weightMap[lname + ".conv.weight"], none); auto* bn = addBatchNorm2d(network, weightMap, *conv->getOutput(0), lname + ".bn"); auto* relu = network->addActivation(*bn->getOutput(0), ActivationType::kRELU); assert(conv && bn && relu); conv->setName(lname.c_str()); bn->setName((lname + ".bn").c_str()); relu->setName((lname + ".relu").c_str()); conv->setStrideNd(DimsHW{s, s}); conv->setPaddingNd(DimsHW{p, p}); return relu; } /** * @brief Inception module from googlenet implementation in torchvision, see: * https://github.com/pytorch/vision/blob/v0.24.1/torchvision/models/googlenet.py#L184 * * @param network network definition from TensorRT API * @param weightMap weight map * @param input input tensor * @param lname layer name from weight map * @param ch1x1 * @param ch3x3red * @param ch3x3 * @param ch5x5red * @param ch5x5 * @param pool_proj * @return IConcatenationLayer* */ IConcatenationLayer* inception(INetworkDefinition* network, WeightMap& weightMap, ITensor& input, const std::string& lname, int ch1x1, int ch3x3red, int ch3x3, int ch5x5red, int ch5x5, int pool_proj) { // "cbr" means "Conv-Batchnorm-Relu" auto* cbr1 = basicConv2d(network, weightMap, input, lname + "branch1", ch1x1, 1); auto* cbr2 = basicConv2d(network, weightMap, input, lname + "branch2.0", ch3x3red, 1); auto* cbr3 = basicConv2d(network, weightMap, *cbr2->getOutput(0), lname + "branch2.1", ch3x3, 3, 1, 1); auto* cbr4 = basicConv2d(network, weightMap, input, lname + "branch3.0", ch5x5red, 1); auto* cbr5 = basicConv2d(network, weightMap, *cbr4->getOutput(0), lname + "branch3.1", ch5x5, 3, 1, 1); auto* pool1 = network->addPoolingNd(input, PoolingType::kMAX, DimsHW{3, 3}); auto* cbr6 = basicConv2d(network, weightMap, *pool1->getOutput(0), lname + "branch4.1", pool_proj, 1); assert(cbr1 && cbr2 && cbr3 && cbr4 && cbr5 && pool1 && cbr6); pool1->setStrideNd(DimsHW{1, 1}); pool1->setPaddingNd(DimsHW{1, 1}); 
pool1->setPaddingMode(PaddingMode::kEXPLICIT_ROUND_UP); std::array inputTensors = {cbr1->getOutput(0), cbr3->getOutput(0), cbr5->getOutput(0), cbr6->getOutput(0)}; IConcatenationLayer* cat1 = network->addConcatenation(inputTensors.data(), 4); assert(cat1); return cat1; } // Creat the engine using only the API and not any parser. ICudaEngine* createEngine(int32_t N, IRuntime* runtime, IBuilder* builder, IBuilderConfig* config, DataType dt) { WeightMap weightMap = loadWeights(WTS_PATH); #if TRT_VERSION >= 11200 auto flag = 1U << static_cast(NDCF::kSTRONGLY_TYPED); #elif TRT_VERSION >= 10000 auto flag = 0U; #else auto flag = 1U << static_cast(NDCF::kEXPLICIT_BATCH); #endif auto* network = builder->createNetworkV2(flag); ITensor* input{nullptr}; if constexpr (TRT_PREPROCESS) { dt = DataType::kUINT8; input = network->addInput(NAMES[0], dt, Dims4{N, INPUT_H, INPUT_W, 3}); auto* trans = addTransformLayer(network, *input, true, mean, stdv); input = trans->getOutput(0); } else { input = network->addInput(NAMES[0], dt, Dims4{N, 3, INPUT_H, INPUT_W}); } assert(input); auto* relu1 = basicConv2d(network, weightMap, *input, "conv1", 64, 7, 2, 3); auto* pool1 = network->addPoolingNd(*relu1->getOutput(0), PoolingType::kMAX, DimsHW{3, 3}); assert(pool1); pool1->setStrideNd(DimsHW{2, 2}); pool1->setPaddingMode(PaddingMode::kEXPLICIT_ROUND_UP); pool1->setName("pool1"); auto* relu2 = basicConv2d(network, weightMap, *pool1->getOutput(0), "conv2", 64, 1); auto* relu3 = basicConv2d(network, weightMap, *relu2->getOutput(0), "conv3", 192, 3, 1, 1); auto* pool2 = network->addPoolingNd(*relu3->getOutput(0), PoolingType::kMAX, DimsHW{3, 3}); assert(pool2); pool2->setStrideNd(DimsHW{2, 2}); pool2->setPaddingMode(PaddingMode::kEXPLICIT_ROUND_UP); pool2->setName("pool2"); auto* cat1 = inception(network, weightMap, *pool2->getOutput(0), "inception3a.", 64, 96, 128, 16, 32, 32); auto* cat2 = inception(network, weightMap, *cat1->getOutput(0), "inception3b.", 128, 128, 192, 32, 96, 64); auto* pool3 
= network->addPoolingNd(*cat2->getOutput(0), PoolingType::kMAX, DimsHW{3, 3}); assert(pool3); pool3->setStrideNd(DimsHW{2, 2}); pool3->setPaddingMode(PaddingMode::kEXPLICIT_ROUND_UP); pool3->setName("pool3"); auto* cat3 = inception(network, weightMap, *pool3->getOutput(0), "inception4a.", 192, 96, 208, 16, 48, 64); cat3 = inception(network, weightMap, *cat3->getOutput(0), "inception4b.", 160, 112, 224, 24, 64, 64); cat3 = inception(network, weightMap, *cat3->getOutput(0), "inception4c.", 128, 128, 256, 24, 64, 64); cat3 = inception(network, weightMap, *cat3->getOutput(0), "inception4d.", 112, 144, 288, 32, 64, 64); cat3 = inception(network, weightMap, *cat3->getOutput(0), "inception4e.", 256, 160, 320, 32, 128, 128); IPoolingLayer* pool4 = network->addPoolingNd(*cat3->getOutput(0), PoolingType::kMAX, DimsHW{2, 2}); assert(pool4); pool4->setStrideNd(DimsHW{2, 2}); pool4->setPaddingMode(PaddingMode::kEXPLICIT_ROUND_UP); pool4->setName("pool4"); cat3 = inception(network, weightMap, *pool4->getOutput(0), "inception5a.", 256, 160, 320, 32, 128, 128); cat3 = inception(network, weightMap, *cat3->getOutput(0), "inception5b.", 384, 192, 384, 48, 128, 128); // this is a AdaptiveAvgPool2d in pytorch implementation IPoolingLayer* pool5 = network->addPoolingNd(*cat3->getOutput(0), PoolingType::kAVERAGE, DimsHW{7, 7}); auto* shuffle = network->addShuffle(*pool5->getOutput(0)); assert(pool5 && shuffle); shuffle->setName("shuffle"); shuffle->setReshapeDimensions(Dims2{1, -1}); // "-1" means "1024" auto* fcw = network->addConstant(Dims2{1000, 1024}, weightMap["fc.weight"])->getOutput(0); auto* fcb = network->addConstant(Dims2{1, 1000}, weightMap["fc.bias"])->getOutput(0); auto* fc0 = network->addMatrixMultiply(*shuffle->getOutput(0), M::kNONE, *fcw, M::kTRANSPOSE); auto* fc1 = network->addElementWise(*fc0->getOutput(0), *fcb, E::kSUM); fc1->getOutput(0)->setName(NAMES[1]); network->markOutput(*fc1->getOutput(0)); // Build engine #if TRT_VERSION >= 8000 
config->setMemoryPoolLimit(MemoryPoolType::kWORKSPACE, WORKSPACE_SIZE); IHostMemory* mem = builder->buildSerializedNetwork(*network, *config); ICudaEngine* engine = runtime->deserializeCudaEngine(mem->data(), mem->size()); delete network; #else builder->setMaxBatchSize(N); config->setMaxWorkspaceSize(WORKSPACE_SIZE); ICudaEngine* engine = builder->buildEngineWithConfig(*network, *config); network->destroy(); #endif std::cout << "build finished\n"; // Release host memory for (auto& mem : weightMap) { free((void*)mem.second.values); } return engine; } void APIToModel(int32_t N, IRuntime* runtime, IHostMemory** modelStream) { // Create builder IBuilder* builder = createInferBuilder(gLogger); IBuilderConfig* config = builder->createBuilderConfig(); // Create model to populate the network, then set the outputs and create an engine ICudaEngine* engine = createEngine(N, runtime, builder, config, DataType::kFLOAT); assert(engine != nullptr); // Serialize the engine (*modelStream) = engine->serialize(); #if TRT_VERSION >= 8000 delete engine; delete config; delete builder; #else engine->destroy(); config->destroy(); builder->destroy(); #endif } std::vector> doInference(IExecutionContext& context, void* input, int64_t batchSize) { const auto& engine = context.getEngine(); cudaStream_t stream; CHECK(cudaStreamCreate(&stream)); std::vector buffers; #if TRT_VERSION >= 8000 const int32_t nIO = engine.getNbIOTensors(); #else const int32_t nIO = engine.getNbBindings(); #endif buffers.resize(nIO); for (auto i = 0; i < nIO; ++i) { std::size_t size = 0; #if TRT_VERSION >= 8000 auto* tensor_name = engine.getIOTensorName(i); auto s = getSize(engine.getTensorDataType(tensor_name)); size = s * batchSize * SIZES[i]; CHECK(cudaMalloc(&buffers[i], size)); if (i == 0) { CHECK(cudaMemcpyAsync(buffers[i], input, size, cudaMemcpyHostToDevice, stream)); } context.setTensorAddress(tensor_name, buffers[i]); #else const int32_t idx = engine.getBindingIndex(NAMES[i]); auto s = 
getSize(engine.getBindingDataType(idx)); assert(idx == i); size = s * batchSize * SIZES[i]; CHECK(cudaMalloc(&buffers[i], size)); if (i == 0) { CHECK(cudaMemcpyAsync(buffers[i], input, size, cudaMemcpyHostToDevice, stream)); } #endif } #if TRT_VERSION >= 8000 assert(context.enqueueV3(stream)); #else assert(context.enqueueV2(buffers.data(), stream, nullptr)); #endif std::vector> prob; for (int i = 1; i < nIO; ++i) { std::vector tmp(batchSize * SIZES[i], std::nanf("")); std::size_t size = batchSize * SIZES[i] * sizeof(float); CHECK(cudaMemcpyAsync(tmp.data(), buffers[i], size, cudaMemcpyDeviceToHost, stream)); prob.emplace_back(tmp); } CHECK(cudaStreamSynchronize(stream)); for (auto& buffer : buffers) { CHECK(cudaFree(buffer)); } CHECK(cudaStreamDestroy(stream)); return prob; } int main(int argc, char** argv) { checkTrtEnv(); if (argc != 2) { std::cerr << "arguments not right!\n"; std::cerr << "./googlenet -s // serialize model to plan file\n"; std::cerr << "./googlenet -d // deserialize plan file and run inference\n"; return -1; } // create a model using the API directly and serialize it to a stream IRuntime* runtime = createInferRuntime(gLogger); assert(runtime != nullptr); char* trtModelStream{nullptr}; std::streamsize size{0}; if (std::string(argv[1]) == "-s") { IHostMemory* modelStream{nullptr}; APIToModel(1, runtime, &modelStream); assert(modelStream != nullptr); std::ofstream p(ENGINE_PATH, std::ios::binary | std::ios::trunc); if (!p) { std::cerr << "could not open plan output file\n"; return -1; } if (modelStream->size() > static_cast(std::numeric_limits::max())) { std::cerr << "this model is too large to serialize\n"; return -1; } const auto* data_ptr = reinterpret_cast(modelStream->data()); auto data_size = static_cast(modelStream->size()); p.write(data_ptr, data_size); #if TRT_VERSION >= 8000 delete modelStream; #else modelStream->destroy(); #endif return 0; } else if (std::string(argv[1]) == "-d") { std::ifstream file(ENGINE_PATH, std::ios::binary); if 
(file.good()) { file.seekg(0, file.end); size = file.tellg(); file.seekg(0, file.beg); trtModelStream = new char[size]; assert(trtModelStream); file.read(trtModelStream, size); file.close(); } } else { return 1; } #if TRT_VERSION >= 8000 ICudaEngine* engine = runtime->deserializeCudaEngine(trtModelStream, size); #else ICudaEngine* engine = runtime->deserializeCudaEngine(trtModelStream, size, nullptr); #endif assert(engine != nullptr); IExecutionContext* context = engine->createExecutionContext(); assert(context != nullptr); const std::string img_path = "../assets/cats.jpg"; void* input = nullptr; std::vector flat_img; cv::Mat img = cv::imread(img_path, cv::IMREAD_COLOR); if constexpr (TRT_PREPROCESS) { // for simplicity, resize image on cpu side cv::resize(img, img, cv::Size(INPUT_W, INPUT_H), 0, 0, cv::INTER_LINEAR); input = static_cast(img.data); } else { flat_img = preprocess_img(img, true, mean, stdv, N, INPUT_H, INPUT_W); input = flat_img.data(); } assert(input); for (int32_t i = 0; i < 100; ++i) { auto _start = std::chrono::system_clock::now(); auto prob = doInference(*context, input, 1); auto _end = std::chrono::system_clock::now(); auto _time = std::chrono::duration_cast(_end - _start).count(); std::cout << "Execution time: " << _time << "us\n"; for (const auto& vector : prob) { int idx = 0; for (auto v : vector) { std::cout << std::setprecision(4) << v << ", " << std::flush; if (++idx > 20) { std::cout << "\n====\n"; break; } } } if (i == 99) { std::cout << "prediction result:\n"; auto labels = loadImagenetLabelMap(LABELS_PATH); int _top = 0; for (auto& [idx, logits] : topk(prob[0], 3)) { std::cout << "Top: " << _top++ << " idx: " << idx << ", logits: " << logits << ", label: " << labels[idx] << "\n"; } } } delete[] trtModelStream; #if TRT_VERSION >= 8000 delete context; delete engine; delete runtime; #else context->destroy(); engine->destroy(); runtime->destroy(); #endif return 0; } ================================================ FILE: 
googlenet/logging.h ================================================ /* * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ #ifndef TENSORRT_LOGGING_H #define TENSORRT_LOGGING_H #include #include #include #include #include #include #include #include #include #include "NvInferRuntime.h" #include "macros.h" using Severity = nvinfer1::ILogger::Severity; class LogStreamConsumerBuffer : public std::stringbuf { public: LogStreamConsumerBuffer(std::ostream& stream, std::string prefix, bool shouldLog) : mOutput(stream), mPrefix(std::move(prefix)), mShouldLog(shouldLog) {} LogStreamConsumerBuffer(LogStreamConsumerBuffer&& other) noexcept : mOutput(other.mOutput), mPrefix(std::move(other.mPrefix)), mShouldLog(other.mShouldLog) {} ~LogStreamConsumerBuffer() override { // std::streambuf::pbase() gives a pointer to the beginning of the buffered part of the output sequence // std::streambuf::pptr() gives a pointer to the current position of the output sequence // if the pointer to the beginning is not equal to the pointer to the current position, // call putOutput() to log the output to the stream if (pbase() != pptr()) { putOutput(); } } // synchronizes the stream buffer and returns 0 on success // synchronizing the stream buffer consists of inserting the buffer contents into the stream, // resetting the buffer and flushing the stream int sync() override { putOutput(); return 0; } void putOutput() { if (mShouldLog) { // prepend 
timestamp std::time_t timestamp = std::time(nullptr); tm* tm_local = std::localtime(×tamp); std::cout << "["; std::cout << std::setw(2) << std::setfill('0') << 1 + tm_local->tm_mon << "/"; std::cout << std::setw(2) << std::setfill('0') << tm_local->tm_mday << "/"; std::cout << std::setw(4) << std::setfill('0') << 1900 + tm_local->tm_year << "-"; std::cout << std::setw(2) << std::setfill('0') << tm_local->tm_hour << ":"; std::cout << std::setw(2) << std::setfill('0') << tm_local->tm_min << ":"; std::cout << std::setw(2) << std::setfill('0') << tm_local->tm_sec << "] "; // std::stringbuf::str() gets the string contents of the buffer // insert the buffer contents pre-appended by the appropriate prefix into the stream mOutput << mPrefix << str(); // set the buffer to empty str(""); // flush the stream mOutput.flush(); } } void setShouldLog(bool shouldLog) { mShouldLog = shouldLog; } private: std::ostream& mOutput; std::string mPrefix; bool mShouldLog; }; //! //! \class LogStreamConsumerBase //! \brief Convenience object used to initialize LogStreamConsumerBuffer before std::ostream in LogStreamConsumer //! class LogStreamConsumerBase { public: LogStreamConsumerBase(std::ostream& stream, std::string prefix, bool shouldLog) : mBuffer(stream, std::move(prefix), shouldLog) {} protected: LogStreamConsumerBuffer mBuffer; }; //! //! \class LogStreamConsumer //! \brief Convenience object used to facilitate use of C++ stream syntax when logging messages. //! Order of base classes is LogStreamConsumerBase and then std::ostream. //! This is because the LogStreamConsumerBase class is used to initialize the LogStreamConsumerBuffer member field //! in LogStreamConsumer and then the address of the buffer is passed to std::ostream. //! This is necessary to prevent the address of an uninitialized buffer from being passed to std::ostream. //! Please do not change the order of the parent classes. //! 
class LogStreamConsumer : protected LogStreamConsumerBase, public std::ostream { public: //! \brief Creates a LogStreamConsumer which logs messages with level severity. //! Reportable severity determines if the messages are severe enough to be logged. LogStreamConsumer(Severity reportableSeverity, Severity severity) : LogStreamConsumerBase(severityOstream(severity), severityPrefix(severity), severity <= reportableSeverity), std::ostream(&mBuffer) // links the stream buffer with the stream , mShouldLog(severity <= reportableSeverity), mSeverity(severity) {} LogStreamConsumer(LogStreamConsumer&& other) noexcept : LogStreamConsumerBase(severityOstream(other.mSeverity), severityPrefix(other.mSeverity), other.mShouldLog), std::ostream(&mBuffer) // links the stream buffer with the stream , mShouldLog(other.mShouldLog), mSeverity(other.mSeverity) {} void setReportableSeverity(Severity reportableSeverity) { mShouldLog = mSeverity <= reportableSeverity; mBuffer.setShouldLog(mShouldLog); } private: static std::ostream& severityOstream(Severity severity) { return severity >= Severity::kINFO ? std::cout : std::cerr; } static std::string severityPrefix(Severity severity) { switch (severity) { case Severity::kINTERNAL_ERROR: return "[F] "; case Severity::kERROR: return "[E] "; case Severity::kWARNING: return "[W] "; case Severity::kINFO: return "[I] "; case Severity::kVERBOSE: return "[V] "; default: assert(0); return ""; } } bool mShouldLog; Severity mSeverity; }; //! \class Logger //! //! \brief Class which manages logging of TensorRT tools and samples //! //! \details This class provides a common interface for TensorRT tools and samples to log information to the console, //! and supports logging two types of messages: //! //! - Debugging messages with an associated severity (info, warning, error, or internal error/fatal) //! - Test pass/fail messages //! //! The advantage of having all samples use this class for logging as opposed to emitting directly to stdout/stderr is //! 
that the logic for controlling the verbosity and formatting of sample output is centralized in one location. //! //! In the future, this class could be extended to support dumping test results to a file in some standard format //! (for example, JUnit XML), and providing additional metadata (e.g. timing the duration of a test run). //! //! TODO: For backwards compatibility with existing samples, this class inherits directly from the nvinfer1::ILogger //! interface, which is problematic since there isn't a clean separation between messages coming from the TensorRT //! library and messages coming from the sample. //! //! In the future (once all samples are updated to use Logger::getTRTLogger() to access the ILogger) we can refactor the //! class to eliminate the inheritance and instead make the nvinfer1::ILogger implementation a member of the Logger //! object. class Logger : public nvinfer1::ILogger { private: struct TestInfo; public: Logger(Severity severity = Severity::kWARNING) : mReportableSeverity(severity) {} //! //! \enum TestResult //! \brief Represents the state of a given test //! enum class TestResult : std::uint8_t { kRUNNING, //!< The test is running kPASSED, //!< The test passed kFAILED, //!< The test failed kWAIVED //!< The test was waived }; //! //! \brief Forward-compatible method for retrieving the nvinfer::ILogger associated with this Logger //! \return The nvinfer1::ILogger associated with this Logger //! //! TODO Once all samples are updated to use this method to register the logger with TensorRT, //! we can eliminate the inheritance of Logger from ILogger //! nvinfer1::ILogger& getTRTLogger() { return *this; } //! //! \brief Implementation of the nvinfer1::ILogger::log() virtual method //! //! Note samples should not be calling this function directly; it will eventually go away once we eliminate the //! inheritance from nvinfer1::ILogger //! 
void log(Severity severity, const char* msg) TRT_NOEXCEPT override { LogStreamConsumer(mReportableSeverity, severity) << "[TRT] " << std::string(msg) << '\n'; } //! //! \brief Method for controlling the verbosity of logging output //! //! \param severity The logger will only emit messages that have severity of this level or higher. //! void setReportableSeverity(Severity severity) { mReportableSeverity = severity; } //! //! \brief Opaque handle that holds logging information for a particular test //! //! This object is an opaque handle to information used by the Logger to print test results. //! The sample must call Logger::defineTest() in order to obtain a TestAtom that can be used //! with Logger::reportTest{Start,End}(). //! class TestAtom { public: TestAtom(TestAtom&&) = default; private: friend class Logger; TestAtom(bool started, TestInfo info) : mStarted(started), mName(std::move(info.name)), mCmdline(std::move(info.cmdline)) {} bool mStarted; std::string mName; std::string mCmdline; }; //! //! \brief Define a test for logging //! //! \param[in] name The name of the test. This should be a string starting with //! "TensorRT" and containing dot-separated strings containing //! the characters [A-Za-z0-9_]. //! For example, "TensorRT.sample_googlenet" //! \param[in] cmdline The command line used to reproduce the test // //! \return a TestAtom that can be used in Logger::reportTest{Start,End}(). //! static TestAtom defineTest(const std::string& name, const std::string& cmdline) { return TestAtom{false, TestInfo{name, cmdline}}; } //! //! \brief A convenience overloaded version of defineTest() that accepts an array of command-line arguments //! as input //! //! \param[in] name The name of the test //! \param[in] argc The number of command-line arguments //! \param[in] argv The array of command-line arguments (given as C strings) //! //! \return a TestAtom that can be used in Logger::reportTest{Start,End}(). 
static TestAtom defineTest(const std::string& name, int argc, char const* const* argv) { auto cmdline = genCmdlineString(argc, argv); return defineTest(name, cmdline); } //! //! \brief Report that a test has started. //! //! \pre reportTestStart() has not been called yet for the given testAtom //! //! \param[in] testAtom The handle to the test that has started //! static void reportTestStart(TestAtom& testAtom) { reportTestResult(testAtom, TestResult::kRUNNING); assert(!testAtom.mStarted); testAtom.mStarted = true; } //! //! \brief Report that a test has ended. //! //! \pre reportTestStart() has been called for the given testAtom //! //! \param[in] testAtom The handle to the test that has ended //! \param[in] result The result of the test. Should be one of TestResult::kPASSED, //! TestResult::kFAILED, TestResult::kWAIVED //! static void reportTestEnd(const TestAtom& testAtom, TestResult result) { assert(result != TestResult::kRUNNING); assert(testAtom.mStarted); reportTestResult(testAtom, result); } static int reportPass(const TestAtom& testAtom) { reportTestEnd(testAtom, TestResult::kPASSED); return EXIT_SUCCESS; } static int reportFail(const TestAtom& testAtom) { reportTestEnd(testAtom, TestResult::kFAILED); return EXIT_FAILURE; } static int reportWaive(const TestAtom& testAtom) { reportTestEnd(testAtom, TestResult::kWAIVED); return EXIT_SUCCESS; } static int reportTest(const TestAtom& testAtom, bool pass) { return pass ? reportPass(testAtom) : reportFail(testAtom); } [[nodiscard]] Severity getReportableSeverity() const { return mReportableSeverity; } private: struct TestInfo { std::string name; std::string cmdline; }; //! //! \brief returns an appropriate string for prefixing a log message with the given severity //! 
static const char* severityPrefix(Severity severity) { switch (severity) { case Severity::kINTERNAL_ERROR: return "[F] "; case Severity::kERROR: return "[E] "; case Severity::kWARNING: return "[W] "; case Severity::kINFO: return "[I] "; case Severity::kVERBOSE: return "[V] "; default: assert(0); return ""; } } //! //! \brief returns an appropriate string for prefixing a test result message with the given result //! static const char* testResultString(TestResult result) { switch (result) { case TestResult::kRUNNING: return "RUNNING"; case TestResult::kPASSED: return "PASSED"; case TestResult::kFAILED: return "FAILED"; case TestResult::kWAIVED: return "WAIVED"; default: assert(0); return ""; } } //! //! \brief returns an appropriate output stream (cout or cerr) to use with the given severity //! static std::ostream& severityOstream(Severity severity) { return severity >= Severity::kINFO ? std::cout : std::cerr; } //! //! \brief method that implements logging test results //! static void reportTestResult(const TestAtom& testAtom, TestResult result) { severityOstream(Severity::kINFO) << "&&&& " << testResultString(result) << " " << testAtom.mName << " # " << testAtom.mCmdline << '\n'; } //! //! \brief generate a command line string from the given (argc, argv) values //! static std::string genCmdlineString(int argc, char const* const* argv) { std::stringstream ss; for (int i = 0; i < argc; i++) { if (i > 0) ss << " "; ss << argv[i]; } return ss.str(); } Severity mReportableSeverity; }; namespace { //! //! \brief produces a LogStreamConsumer object that can be used to log messages of severity kVERBOSE //! //! Example usage: //! //! LOG_VERBOSE(logger) << "hello world" << std::endl; //! inline LogStreamConsumer LOG_VERBOSE(const Logger& logger) { return LogStreamConsumer{logger.getReportableSeverity(), Severity::kVERBOSE}; } //! //! \brief produces a LogStreamConsumer object that can be used to log messages of severity kINFO //! //! Example usage: //! //! 
LOG_INFO(logger) << "hello world" << std::endl; //! inline LogStreamConsumer LOG_INFO(const Logger& logger) { return LogStreamConsumer{logger.getReportableSeverity(), Severity::kINFO}; } //! //! \brief produces a LogStreamConsumer object that can be used to log messages of severity kWARNING //! //! Example usage: //! //! LOG_WARN(logger) << "hello world" << std::endl; //! inline LogStreamConsumer LOG_WARN(const Logger& logger) { return LogStreamConsumer{logger.getReportableSeverity(), Severity::kWARNING}; } //! //! \brief produces a LogStreamConsumer object that can be used to log messages of severity kERROR //! //! Example usage: //! //! LOG_ERROR(logger) << "hello world" << std::endl; //! inline LogStreamConsumer LOG_ERROR(const Logger& logger) { return LogStreamConsumer{logger.getReportableSeverity(), Severity::kERROR}; } //! //! \brief produces a LogStreamConsumer object that can be used to log messages of severity kINTERNAL_ERROR // ("fatal" severity) //! //! Example usage: //! //! LOG_FATAL(logger) << "hello world" << std::endl; //! inline LogStreamConsumer LOG_FATAL(const Logger& logger) { return LogStreamConsumer{logger.getReportableSeverity(), Severity::kINTERNAL_ERROR}; } } // anonymous namespace #endif // TENSORRT_LOGGING_H ================================================ FILE: googlenet/macros.h ================================================ #pragma once #include #ifdef API_EXPORTS #if defined(_MSC_VER) #define API __declspec(dllexport) #else #define API __attribute__((visibility("default"))) #endif #else #if defined(_MSC_VER) #define API __declspec(dllimport) #else #define API #endif #endif // API_EXPORTS #define TRT_VERSION \ ((NV_TENSORRT_MAJOR * 1000) + (NV_TENSORRT_MINOR * 100) + (NV_TENSORRT_PATCH * 10) + NV_TENSORRT_BUILD) #if TRT_VERSION < 7220 #error "TensorRT >= 7.2.2 is required for this demo." 
#endif #if TRT_VERSION >= 8000 #define TRT_NOEXCEPT noexcept #define TRT_CONST_ENQUEUE const #else #define TRT_NOEXCEPT #define TRT_CONST_ENQUEUE #endif ================================================ FILE: googlenet/utils.h ================================================ #pragma once #include #include #include #include #include #include #include #include #include #include #include "macros.h" using namespace nvinfer1; constexpr const std::size_t WORKSPACE_SIZE = 16 << 20; #define CHECK(status) \ do { \ auto ret = (status); \ if (ret != cudaSuccess) { \ std::cerr << "Cuda failure: " << ret << "\n"; \ std::abort(); \ } \ } while (0) static void checkTrtEnv(int device = 0) { #if TRT_VERSION < 8000 CHECK(cudaGetDevice(&device)); cudaDeviceProp prop{}; CHECK(cudaGetDeviceProperties(&prop, device)); const int sm = prop.major * 10 + prop.minor; if (sm > 86) { std::cerr << "TensorRT < 8 does not support SM > 86 on this GPU."; std::abort(); } #endif } /** * @brief TensorRT weight files have a simple space delimited format: * [type] [size] * * @param file input weight file path * @return std::map */ static auto loadWeights(const std::string& file) { std::cout << "Loading weights: " << file << "\n"; std::map weightMap; // Open weights file std::ifstream input(file); assert(input.is_open() && "Unable to load weight file."); // Read number of weight blobs int32_t count; input >> count; assert(count > 0 && "Invalid weight map file."); while (count--) { nvinfer1::Weights wt{nvinfer1::DataType::kFLOAT, nullptr, 0}; // Read name and type of blob std::string name; input >> name >> std::dec >> wt.count; // Load blob auto* val = new uint32_t[wt.count]; input >> std::hex; for (auto x = 0ll; x < wt.count; ++x) { input >> val[x]; } wt.values = val; weightMap[name] = wt; } return weightMap; } /** * @brief a preprocess function aligning with ImageNet preprocess in torchvision, only support 3-channel image * * @param img opencv image with BGR layout * @param bgr2rgb whether to convert BGR 
to RGB * @param mean subtract mean * @param std divide std * @param n batch size * @param h resize height * @param w resize width * @return std::vector contiguous flatten image data in float32 type */ static std::vector preprocess_img(cv::Mat& img, bool bgr2rgb, const std::array& mean, const std::array& std, int n, int h, int w) { const auto c = img.channels(); const auto size = c * h * w; if (c != 3) { std::cerr << "this demo only supports 3 channel input image.\n"; std::abort(); } if (bgr2rgb) { cv::cvtColor(img, img, cv::COLOR_BGR2RGB); } cv::resize(img, img, cv::Size(w, h), 0, 0, cv::INTER_LINEAR); img.convertTo(img, CV_32FC3, 1.f / 255); img = (img - cv::Scalar(mean[0], mean[1], mean[2])) / cv::Scalar(std[0], std[1], std[2]); std::vector chw(static_cast(n) * c * h * w, 0.f); // fill all batch with the same input image for (int i = 0; i < n; ++i) { for (int y = 0; y < h; ++y) { for (int x = 0; x < w; ++x) { const cv::Vec3f v = img.at(y, x); chw[i * size + 0 * h * w + y * w + x] = v[0]; chw[i * size + 1 * h * w + y * w + x] = v[1]; chw[i * size + 2 * h * w + y * w + x] = v[2]; } } } return chw; } static auto topk(const std::vector& v, int k) -> std::vector> { if (k <= 0) return {}; auto stride = std::min(k, static_cast(v.size())); std::vector idx(v.size()); std::iota(idx.begin(), idx.end(), 0); std::partial_sort(idx.begin(), idx.begin() + k, idx.end(), [&](int a, int b) { return v[a] > v[b]; }); std::vector> out; out.reserve(stride); for (auto i = 0; i < stride; ++i) out.emplace_back(idx[i], v[idx[i]]); return out; } static std::map loadImagenetLabelMap(const std::string& path) { std::map labels; std::ifstream in(path); if (!in.is_open()) { return labels; } std::string line; while (std::getline(in, line)) { auto colon = line.find(':'); if (colon == std::string::npos) { continue; } auto first_quote = line.find('\'', colon); if (first_quote == std::string::npos) { continue; } auto second_quote = line.find('\'', first_quote + 1); if (second_quote == 
std::string::npos) { continue; } int idx = std::stoi(line.substr(0, colon)); labels[idx] = line.substr(first_quote + 1, second_quote - first_quote - 1); } return labels; } static ILayer* addTransformLayer(INetworkDefinition* network, ITensor& input, bool bgr2rgb, const std::array& mean, const std::array& std) { struct ScaleParams { std::array shift; std::array scale; }; static std::vector> gScaleParams; auto params = std::make_unique(); params->shift = {-mean[0] / std[0], -mean[1] / std[1], -mean[2] / std[2]}; params->scale = {1.f / (std[0] * 255.f), 1.f / (std[1] * 255.f), 1.f / (std[2] * 255.f)}; static const Weights empty{DataType::kFLOAT, nullptr, 0ll}; const Weights shift{DataType::kFLOAT, params->shift.data(), 3ll}; const Weights scale{DataType::kFLOAT, params->scale.data(), 3ll}; gScaleParams.emplace_back(std::move(params)); ITensor* in = &input; if (input.getType() != DataType::kFLOAT) { #if TRT_VERSION >= 8000 auto* cast = network->addCast(input, DataType::kFLOAT); assert(cast); cast->setName("Cast to FP32"); in = cast->getOutput(0); #else auto* identity = network->addIdentity(input); assert(identity); identity->setName("Convert to FP32"); identity->setOutputType(0, DataType::kFLOAT); in = identity->getOutput(0); #endif } // Convert from NHWC to NCHW auto* perm = network->addShuffle(*in); assert(perm); perm->setName("NHWC -> NCHW"); perm->setFirstTranspose(Permutation{0, 3, 1, 2}); // Convert from BGR to RGB (optional) ITensor* data{nullptr}; if (bgr2rgb) { auto add_slice = [&](int c, const char* name) -> ITensor* { auto dims = perm->getOutput(0)->getDimensions(); Dims4 start = {0, c, 0, 0}, stride = {1, 1, 1, 1}; Dims4 size = {dims.d[0], 1, dims.d[2], dims.d[3]}; auto* _slice = network->addSlice(*perm->getOutput(0), start, size, stride); _slice->setName(name); assert(_slice && _slice->getNbOutputs() == 1); return _slice->getOutput(0); }; std::array channels = {add_slice(2, "R"), add_slice(1, "G"), add_slice(0, "B")}; auto* cat = 
network->addConcatenation(channels.data(), 3); assert(cat); cat->setName("RGB"); cat->setAxis(1); data = cat->getOutput(0); } else { data = perm->getOutput(0); } // Normalize auto* trans = network->addScale(*data, ScaleMode::kCHANNEL, shift, scale, empty); assert(trans); trans->setName("mean & std"); #if TRT_VERSION >= 8000 trans->setChannelAxis(1); #endif return trans; } static size_t getSize(DataType dt) { switch (dt) { #if TRT_VERSION >= 8510 case DataType::kUINT8: #endif case DataType::kINT8: return sizeof(int8_t); case DataType::kFLOAT: return sizeof(float); case DataType::kHALF: return sizeof(int16_t); case DataType::kINT32: return sizeof(int32_t); default: { std::cerr << "Unsupported data type\n"; std::abort(); } } } ================================================ FILE: hrnet/hrnet-image-classification/CMakeLists.txt ================================================ cmake_minimum_required(VERSION 2.6) project(hrnet) add_definitions(-std=c++11) option(CUDA_USE_STATIC_CUDA_RUNTIME OFF) set(CMAKE_CXX_STANDARD 11) set(CMAKE_BUILD_TYPE Debug) find_package(CUDA REQUIRED) include_directories(/usr/local/cuda/include) link_directories(/usr/local/cuda/lib64) find_package(OpenCV) include_directories(${OpenCV_INCLUDE_DIRS}) add_executable(hrnet ${PROJECT_SOURCE_DIR}/hrnet.cpp) target_link_libraries(hrnet nvinfer) target_link_libraries(hrnet cudart) target_link_libraries(hrnet ${OpenCV_LIBS}) add_definitions(-O2 -pthread) ================================================ FILE: hrnet/hrnet-image-classification/README.md ================================================ # HRNet The Pytorch implementation is [HRNet-Image-Classification](https://github.com/HRNet/HRNet-Image-Classification). The implemented model is **HRNet-W18-C-Small-v2** ## How to Run * 1. generate .wts Download code and model from [HRNet-Image-Classification](https://github.com/HRNet/HRNet-Image-Classification) and config your environments. 
Put `demo.py` in the `YOUR_ROOT_DIR\HRNet-Image-Classification\tools ` folder, set `savewts in main()` as `True`, and run, the .wts will be generated. * 2. cmake and make ``` mkdir build cd build cmake .. make sudo ./hrnet -s // serialize model to plan file i.e. 'hrnet.engine' sudo ./hrnet -d ../samples // deserialize plan file and run inference, the images in samples will be processed. ``` ## Result The test img: ![](https://user-images.githubusercontent.com/20653176/93732833-ac103200-fc05-11ea-88ff-6f59f316a377.JPEG) Pytorch Result: ![image-20200921115119593](https://user-images.githubusercontent.com/20653176/93731787-225e6580-fc01-11ea-9578-393079cd1873.png) TRT Result: ![image-20200921114959069](https://user-images.githubusercontent.com/20653176/93731788-238f9280-fc01-11ea-954f-2debc20e102a.png) ================================================ FILE: hrnet/hrnet-image-classification/common.hpp ================================================ #pragma once #include #include #include #include #include #include #include "NvInfer.h" #include "NvInferPlugin.h" #include "cuda_runtime_api.h" using namespace nvinfer1; #define CHECK(status) \ do\ {\ auto ret = (status);\ if (ret != 0)\ {\ std::cerr << "Cuda failure: " << ret << std::endl;\ abort();\ }\ } while (0) int read_files_in_dir(const char *p_dir_name, std::vector &file_names) { DIR *p_dir = opendir(p_dir_name); if (p_dir == nullptr) { return -1; } struct dirent* p_file = nullptr; while ((p_file = readdir(p_dir)) != nullptr) { if (strcmp(p_file->d_name, ".") != 0 && strcmp(p_file->d_name, "..") != 0) { //std::string cur_file_name(p_dir_name); //cur_file_name += "/"; //cur_file_name += p_file->d_name; std::string cur_file_name(p_file->d_name); file_names.push_back(cur_file_name); } } closedir(p_dir); return 0; } // TensorRT weight files have a simple space delimited format: // [type] [size] std::map loadWeights(const std::string file) { std::cout << "Loading weights: " << file << std::endl; std::map weightMap; // 
Open weights file std::ifstream input(file); assert(input.is_open() && "Unable to load weight file."); // Read number of weight blobs int32_t count; input >> count; assert(count > 0 && "Invalid weight map file."); while (count--) { Weights wt{ DataType::kFLOAT, nullptr, 0 }; uint32_t size; // Read name and type of blob std::string name; input >> name >> std::dec >> size; wt.type = DataType::kFLOAT; // Load blob uint32_t* val = reinterpret_cast(malloc(sizeof(val) * size)); for (uint32_t x = 0, y = size; x < y; ++x) { input >> std::hex >> val[x]; } wt.values = val; wt.count = size; weightMap[name] = wt; } return weightMap; } IScaleLayer* addBatchNorm2d(INetworkDefinition *network, std::map& weightMap, ITensor& input, std::string lname, float eps) { float *gamma = (float*)weightMap[lname + ".weight"].values; float *beta = (float*)weightMap[lname + ".bias"].values; float *mean = (float*)weightMap[lname + ".running_mean"].values; float *var = (float*)weightMap[lname + ".running_var"].values; int len = weightMap[lname + ".running_var"].count; //std::cout << "len " << len << std::endl; float *scval = reinterpret_cast(malloc(sizeof(float) * len)); for (int i = 0; i < len; i++) { scval[i] = gamma[i] / sqrt(var[i] + eps); } Weights scale{ DataType::kFLOAT, scval, len }; float *shval = reinterpret_cast(malloc(sizeof(float) * len)); for (int i = 0; i < len; i++) { shval[i] = beta[i] - mean[i] * gamma[i] / sqrt(var[i] + eps); } Weights shift{ DataType::kFLOAT, shval, len }; float *pval = reinterpret_cast(malloc(sizeof(float) * len)); for (int i = 0; i < len; i++) { pval[i] = 1.0; } Weights power{ DataType::kFLOAT, pval, len }; weightMap[lname + ".scale"] = scale; weightMap[lname + ".shift"] = shift; weightMap[lname + ".power"] = power; IScaleLayer* scale_1 = network->addScale(input, ScaleMode::kCHANNEL, shift, scale, power); assert(scale_1); return scale_1; } ILayer* convBnLeaky(INetworkDefinition *network, std::map& weightMap, ITensor& input, int outch, int ksize, int s, int 
p, std::string convname, std::string bnname, bool bias = false) { Weights emptywts{ DataType::kFLOAT, nullptr, 0 }; IConvolutionLayer* conv1; //Dims dim; if (!bias) { conv1 = network->addConvolutionNd(input, outch, DimsHW{ ksize, ksize }, weightMap[convname + ".weight"], emptywts); } else { conv1 = network->addConvolutionNd(input, outch, DimsHW{ ksize, ksize }, weightMap[convname + ".weight"], weightMap[convname + ".bias"]); } assert(conv1); conv1->setStrideNd(DimsHW{ s, s }); conv1->setPaddingNd(DimsHW{ p, p }); IScaleLayer* bn1 = addBatchNorm2d(network, weightMap, *conv1->getOutput(0), bnname, 1e-4); auto lr = network->addActivation(*bn1->getOutput(0), ActivationType::kRELU); return lr; } IActivationLayer* ResBlock2Conv(INetworkDefinition *network, std::map& weightMap, ITensor& input, int inch, int outch, int stride, std::string lname) { Weights emptywts{ DataType::kFLOAT, nullptr, 0 }; IConvolutionLayer* conv1 = network->addConvolutionNd(input, inch, DimsHW{ 1, 1 }, weightMap[lname + ".conv1.weight"], emptywts); assert(conv1); conv1->setStrideNd(DimsHW{ stride, stride }); conv1->setPaddingNd(DimsHW{ 0, 0 }); IScaleLayer* bn1 = addBatchNorm2d(network, weightMap, *conv1->getOutput(0), lname + ".bn1", 1e-5); IActivationLayer* relu1 = network->addActivation(*bn1->getOutput(0), ActivationType::kRELU); assert(relu1); /// IConvolutionLayer* conv2 = network->addConvolutionNd(*relu1->getOutput(0), inch, DimsHW{ 3, 3 }, weightMap[lname + ".conv2.weight"], emptywts); assert(conv2); conv2->setStrideNd(DimsHW{ stride, stride }); conv2->setPaddingNd(DimsHW{ 1, 1 }); IScaleLayer* bn2 = addBatchNorm2d(network, weightMap, *conv2->getOutput(0), lname + ".bn2", 1e-5); IActivationLayer* relu2 = network->addActivation(*bn2->getOutput(0), ActivationType::kRELU); assert(relu2); ////// IConvolutionLayer* conv3 = network->addConvolutionNd(*relu2->getOutput(0), outch, DimsHW{ 1, 1 }, weightMap[lname + ".conv3.weight"], emptywts); assert(conv3); conv1->setStrideNd(DimsHW{ stride, stride 
}); conv3->setPaddingNd(DimsHW{ 0, 0 }); IScaleLayer* bn3 = addBatchNorm2d(network, weightMap, *conv3->getOutput(0), lname + ".bn3", 1e-5); IElementWiseLayer* ew1; if (inch != outch) { IConvolutionLayer* conv4 = network->addConvolutionNd(input, outch, DimsHW{ 1, 1 }, weightMap[lname + ".downsample.0.weight"], emptywts); assert(conv4); conv4->setStrideNd(DimsHW{ stride, stride }); conv4->setPaddingNd(DimsHW{ 0, 0 }); IScaleLayer* bn4 = addBatchNorm2d(network, weightMap, *conv4->getOutput(0), lname + ".downsample.1", 1e-5); ew1 = network->addElementWise(*bn4->getOutput(0), *bn3->getOutput(0), ElementWiseOperation::kSUM); } else { ew1 = network->addElementWise(input, *bn3->getOutput(0), ElementWiseOperation::kSUM); } IActivationLayer* relu3 = network->addActivation(*ew1->getOutput(0), ActivationType::kRELU); assert(relu3); return relu3; } IActivationLayer* ResBlock(INetworkDefinition *network, std::map& weightMap, ITensor& input, int inch, int outch, int stride, std::string lname) { Weights emptywts{ DataType::kFLOAT, nullptr, 0 }; // in 256 out 64 IConvolutionLayer* conv1 = network->addConvolutionNd(input, outch, DimsHW{ 1, 1 }, weightMap[lname + ".conv1.weight"], emptywts); assert(conv1); conv1->setStrideNd(DimsHW{ stride, stride }); conv1->setPaddingNd(DimsHW{ 0, 0 }); IScaleLayer* bn1 = addBatchNorm2d(network, weightMap, *conv1->getOutput(0), lname + ".bn1", 1e-5); IActivationLayer* relu1 = network->addActivation(*bn1->getOutput(0), ActivationType::kRELU); assert(relu1); /// IConvolutionLayer* conv2 = network->addConvolutionNd(*relu1->getOutput(0), outch, DimsHW{ 3, 3 }, weightMap[lname + ".conv2.weight"], emptywts); assert(conv2); conv2->setStrideNd(DimsHW{ stride, stride }); conv2->setPaddingNd(DimsHW{ 1, 1 }); IScaleLayer* bn2 = addBatchNorm2d(network, weightMap, *conv2->getOutput(0), lname + ".bn2", 1e-5); IActivationLayer* relu2 = network->addActivation(*bn2->getOutput(0), ActivationType::kRELU); assert(relu2); ////// IConvolutionLayer* conv3 = 
network->addConvolutionNd(*relu2->getOutput(0), inch, DimsHW{ 1, 1 }, weightMap[lname + ".conv3.weight"], emptywts); assert(conv3); conv1->setStrideNd(DimsHW{ stride, stride }); conv1->setPaddingNd(DimsHW{ 0, 0 }); IScaleLayer* bn3 = addBatchNorm2d(network, weightMap, *conv3->getOutput(0), lname + ".bn3", 1e-5); IElementWiseLayer* ew1; ew1 = network->addElementWise(input, *bn3->getOutput(0), ElementWiseOperation::kSUM); IActivationLayer* relu3 = network->addActivation(*ew1->getOutput(0), ActivationType::kRELU); assert(relu3); return relu3; } IActivationLayer* liteResBlock(INetworkDefinition *network, std::map& weightMap, ITensor& input, int outch, std::string lname) { Weights emptywts{ DataType::kFLOAT, nullptr, 0 }; // in 256 out 64 IConvolutionLayer* conv1 = network->addConvolutionNd(input, outch, DimsHW{ 3, 3 }, weightMap[lname + ".conv1.weight"], emptywts); assert(conv1); conv1->setStrideNd(DimsHW{ 1, 1 }); conv1->setPaddingNd(DimsHW{ 1, 1 }); IScaleLayer* bn1 = addBatchNorm2d(network, weightMap, *conv1->getOutput(0), lname + ".bn1", 1e-5); IActivationLayer* relu1 = network->addActivation(*bn1->getOutput(0), ActivationType::kRELU); assert(relu1); /// IConvolutionLayer* conv2 = network->addConvolutionNd(*relu1->getOutput(0), outch, DimsHW{ 3, 3 }, weightMap[lname + ".conv2.weight"], emptywts); assert(conv2); conv2->setStrideNd(DimsHW{ 1, 1 }); conv2->setPaddingNd(DimsHW{ 1, 1 }); IScaleLayer* bn2 = addBatchNorm2d(network, weightMap, *conv2->getOutput(0), lname + ".bn2", 1e-5); IElementWiseLayer* ew1; ew1 = network->addElementWise(input, *bn2->getOutput(0), ElementWiseOperation::kSUM); IActivationLayer* relu3 = network->addActivation(*ew1->getOutput(0), ActivationType::kRELU); assert(relu3); return relu3; } ILayer* netAddUpsample(INetworkDefinition* network, ITensor* input, int inputChannels, int stride){ nvinfer1::Dims inpDims = input->getDimensions(); assert(inpDims.nbDims == 3); // chw assert(inpDims.d[1] == inpDims.d[2]); int h = inpDims.d[1]; int w = 
inpDims.d[2]; // add pre multiply matrix as a constant /* kSPATIA Elements correspond to different spatial data. kCHANNEL Elements correspond to different channels. */ nvinfer1::Dims preDims{ 3, {1, stride * h, w}, {nvinfer1::DimensionType::kCHANNEL, nvinfer1::DimensionType::kSPATIAL, nvinfer1::DimensionType::kSPATIAL} }; int size = stride * h * w; nvinfer1::Weights preMul{ nvinfer1::DataType::kFLOAT, nullptr, size }; float* preWt = new float[size]; /* (2*h * w) [ [1, 0, ..., 0], [1, 0, ..., 0], [0, 1, ..., 0], [0, 1, ..., 0], ..., ..., [0, 0, ..., 1], [0, 0, ..., 1] ] */ for (int i = 0, idx = 0; i < h; ++i) { for (int s = 0; s < stride; ++s) { for (int j = 0; j < w; ++j, ++idx) { preWt[idx] = (i == j) ? 1.0 : 0.0; } } } preMul.values = preWt; nvinfer1::IConstantLayer* preM = network->addConstant(preDims, preMul); assert(preM != nullptr); //std::string preLayerName = "preMul_" + std::to_string(layerIdx); //preM->setName(preLayerName.c_str()); // add post multiply matrix as a constant nvinfer1::Dims postDims{ 3, {1, h, stride * w}, {nvinfer1::DimensionType::kCHANNEL, nvinfer1::DimensionType::kSPATIAL, nvinfer1::DimensionType::kSPATIAL} }; size = stride * h * w; nvinfer1::Weights postMul{ nvinfer1::DataType::kFLOAT, nullptr, size }; float* postWt = new float[size]; /* (h * 2*w) [ [1, 1, 0, 0, ..., 0, 0], [0, 0, 1, 1, ..., 0, 0], ..., ..., [0, 0, 0, 0, ..., 1, 1] ] */ for (int i = 0, idx = 0; i < h; ++i) { for (int j = 0; j < stride * w; ++j, ++idx) { postWt[idx] = (j / stride == i) ? 
1.0 : 0.0; } } postMul.values = postWt; nvinfer1::IConstantLayer* post_m = network->addConstant(postDims, postMul); assert(post_m != nullptr); // add matrix multiply layers for upsampling nvinfer1::IMatrixMultiplyLayer* mm1 = network->addMatrixMultiply(*preM->getOutput(0), nvinfer1::MatrixOperation::kNONE, *input, nvinfer1::MatrixOperation::kNONE); assert(mm1 != nullptr); nvinfer1::IMatrixMultiplyLayer* mm2 = network->addMatrixMultiply(*mm1->getOutput(0), nvinfer1::MatrixOperation::kNONE, *post_m->getOutput(0), nvinfer1::MatrixOperation::kNONE); assert(mm2 != nullptr); return mm2; } ================================================ FILE: hrnet/hrnet-image-classification/demo.py ================================================ # ------------------------------------------------------------------------------ # ------------------------------------------------------------------------------ # Copyright (c) Microsoft # Licensed under the MIT License. # Written by Bin Xiao (Bin.Xiao@microsoft.com) # Modified by Ke Sun (sunk@mail.ustc.edu.cn) # ------------------------------------------------------------------------------ from __future__ import absolute_import from __future__ import division from __future__ import print_function import argparse import os import sys import shutil import pprint import torch import torch.nn.parallel import torch.backends.cudnn as cudnn import torch.optim import torch.utils.data import torch.utils.data.distributed import torchvision.datasets as datasets import torchvision.transforms as transforms import _init_paths import models from config import config from config import update_config from core.function import validate from utils.modelsummary import get_model_summary from utils.utils import create_logger from core.evaluate import accuracy import cv2 import numpy as np from PIL import Image import struct def parse_args(): parser = argparse.ArgumentParser(description='Train keypoints network') parser.add_argument('--cfg', help='experiment 
def parse_args():
    """Parse the demo's command-line options and merge them into ``config``.

    Every option is a string with a default, so the script can be started
    without arguments. After parsing, ``update_config(config, args)`` is
    called with the parsed namespace.

    Returns:
        The ``argparse.Namespace`` holding ``cfg``, ``modelDir``, ``logDir``,
        ``dataDir``, ``testModel`` and ``testImg``.
    """
    parser = argparse.ArgumentParser(description='Train keypoints network')

    # (flag, help text, default value) -- registered in this order.
    option_table = (
        ('--cfg',
         'experiment configure file name',
         r"E:\LearningCodes\GithubRepo\HRNet-Image-Classification\experiments\cls_hrnet_w18_small_v2_sgd_lr5e-2_wd1e-4_bs32_x100.yaml"),
        ('--modelDir', 'model directory', ''),
        ('--logDir', 'log directory', ''),
        ('--dataDir', 'data directory', ''),
        ('--testModel',
         'testModel',
         r'E:\LearningCodes\GithubRepo\HRNet-Image-Classification\hrnet_w18_small_model_v2.pth'),
        ('--testImg',
         'imgs',
         r'E:\Datasets\tiny-imagenet-200\tiny-imagenet-200\val\images\val_41.JPEG'),
    )
    for flag, help_text, default_value in option_table:
        parser.add_argument(flag, help=help_text, type=str, default=default_value)

    parsed = parser.parse_args()
    update_config(config, parsed)

    return parsed
def main():
    """Classify one image with an HRNet checkpoint, or export its weights.

    Steps:
      1. Parse the command line and set up logging / cuDNN flags from ``config``.
      2. Build the network named by ``config.MODEL.NAME`` and load
         ``args.testModel`` into it.
      3. If the local ``savewts`` switch is on, write every entry of the state
         dict to ``HRNetClassify.wts`` and exit. The file starts with the
         number of entries; each following line is
         ``<name> <count>`` followed by the values as big-endian float32 hex.
      4. Otherwise read ``args.testImg``, resize it to
         ``config.MODEL.IMAGE_SIZE``, normalise it with the mean/std constants
         below, run the model and print the top-1 class index.

    Raises:
        FileNotFoundError: if ``args.testImg`` cannot be read as an image.
    """
    savewts = False  # set to True to export the weights instead of running inference
    args = parse_args()

    logger, final_output_dir, tb_log_dir = create_logger(
        config, args.cfg, 'demo')

    logger.info(pprint.pformat(args))
    logger.info(pprint.pformat(config))

    # cudnn related setting
    cudnn.benchmark = config.CUDNN.BENCHMARK
    torch.backends.cudnn.deterministic = config.CUDNN.DETERMINISTIC
    torch.backends.cudnn.enabled = config.CUDNN.ENABLED

    # eval() evaluates a string expression and returns its value; here it
    # resolves models.<MODEL.NAME>.get_cls_net.
    # NOTE(security): the evaluated string comes from the config file, so only
    # run this script with configuration files you trust.
    model = eval('models.'+config.MODEL.NAME+'.get_cls_net')(
        config)

    # The model and the input stay on the CPU in this script, so map the
    # checkpoint there; without map_location a checkpoint saved from a GPU
    # cannot be loaded on a machine without CUDA.
    model.load_state_dict(torch.load(args.testModel, map_location='cpu'))

    if savewts:
        state = model.state_dict()
        # `with` guarantees the file is flushed and closed before exiting.
        with open('HRNetClassify.wts', 'w') as f:
            f.write('{}\n'.format(len(state)))
            for k, v in state.items():
                vr = v.reshape(-1).cpu().numpy()
                f.write('{} {} '.format(k, len(vr)))
                f.write(''.join(
                    ' ' + struct.pack('>f', float(vv)).hex() for vv in vr))
                f.write('\n')
        sys.exit(0)

    # load img: OpenCV returns BGR, 0-255, HWC -- or None when reading fails
    image = cv2.imread(args.testImg)
    if image is None:
        raise FileNotFoundError(
            'Unable to read image: {}'.format(args.testImg))

    resized_img = cv2.resize(
        image, (config.MODEL.IMAGE_SIZE[0], config.MODEL.IMAGE_SIZE[1]))
    resized_img = cv2.cvtColor(resized_img, cv2.COLOR_BGR2RGB)  # RGB

    # normalize: scale to 0-1, then per-channel (R, G, B) mean/std
    mean = [0.485, 0.456, 0.406]
    std = [0.229, 0.224, 0.225]
    inp_image = ((resized_img/255. - mean) / std).astype(np.float32)
    inp_image = inp_image.transpose(2, 0, 1)  # hwc -> chw
    inp_image = torch.from_numpy(inp_image).unsqueeze(0)  # add batch dimension

    model.eval()
    # Inference only: skip autograd bookkeeping.
    with torch.no_grad():
        output = model(inp_image)

    _, pred = output.topk(1)
    pred = pred.t()
    print(pred)
convBnLeaky(network, weightMap, *id_1018->getOutput(0), 18, 3, 1, 1, "transition1.0.0", "transition1.0.1"); auto id_1031 = liteResBlock(network, weightMap, *id_1021->getOutput(0), 18, "stage2.0.branches.0.0"); auto id_1038 = liteResBlock(network, weightMap, *id_1031->getOutput(0), 18, "stage2.0.branches.0.1"); //Ҳ֧ auto id_1024 = convBnLeaky(network, weightMap, *id_1018->getOutput(0), 36, 3, 2, 1, "transition1.1.0.0", "transition1.1.0.1"); auto id_1045 = liteResBlock(network, weightMap, *id_1024->getOutput(0), 36, "stage2.0.branches.1.0"); auto id_1052 = liteResBlock(network, weightMap, *id_1045->getOutput(0), 36, "stage2.0.branches.1.1"); // conv+bn+upsample IConvolutionLayer* id_1053 = network->addConvolutionNd(*id_1052->getOutput(0), 18, DimsHW{ 1, 1 }, weightMap["stage2.0.fuse_layers.0.1.0.weight"], emptywts); assert(id_1053); id_1053->setStrideNd(DimsHW{ 1, 1 }); id_1053->setPaddingNd(DimsHW{ 0, 0 }); IScaleLayer* id_1054 = addBatchNorm2d(network, weightMap, *id_1053->getOutput(0), "stage2.0.fuse_layers.0.1.1", 1e-5); ILayer* id_1083 = netAddUpsample(network, id_1054->getOutput(0), 18, 2); IElementWiseLayer* id_1084 = network->addElementWise(*id_1083->getOutput(0), *id_1038->getOutput(0), ElementWiseOperation::kSUM); IActivationLayer* id_1085 = network->addActivation(*id_1084->getOutput(0), ActivationType::kRELU); // transition1-2 IConvolutionLayer* id_1086 = network->addConvolutionNd(*id_1038->getOutput(0), 36, DimsHW{ 3, 3 }, weightMap["stage2.0.fuse_layers.1.0.0.0.weight"], emptywts); assert(id_1086); id_1086->setStrideNd(DimsHW{ 2, 2 }); id_1086->setPaddingNd(DimsHW{ 1, 1 }); IScaleLayer* id_1087 = addBatchNorm2d(network, weightMap, *id_1086->getOutput(0), "stage2.0.fuse_layers.1.0.0.1", 1e-5); IElementWiseLayer* id_1088 = network->addElementWise(*id_1087->getOutput(0), *id_1052->getOutput(0), ElementWiseOperation::kSUM); IActivationLayer* id_1089 = network->addActivation(*id_1088->getOutput(0), ActivationType::kRELU); /////////////////////////////////// 
// transition2-1 stage_3 auto id_1099 = liteResBlock(network, weightMap, *id_1085->getOutput(0), 18, "stage3.0.branches.0.0"); auto id_1106 = liteResBlock(network, weightMap, *id_1099->getOutput(0), 18, "stage3.0.branches.0.1"); // transition2-2 stage_3 auto id_1113 = liteResBlock(network, weightMap, *id_1089->getOutput(0), 36, "stage3.0.branches.1.0"); auto id_1120 = liteResBlock(network, weightMap, *id_1113->getOutput(0), 36, "stage3.0.branches.1.1"); // transition2-3 stage_3 auto id_1092 = convBnLeaky(network, weightMap, *id_1089->getOutput(0), 72, 3, 2, 1, "transition2.2.0.0", "transition2.2.0.1"); auto id_1127 = liteResBlock(network, weightMap, *id_1092->getOutput(0), 72, "stage3.0.branches.2.0"); auto id_1134 = liteResBlock(network, weightMap, *id_1127->getOutput(0), 72, "stage3.0.branches.2.1"); /////// ֱģ ܼ //conv bn up IConvolutionLayer* id_1135 = network->addConvolutionNd(*id_1120->getOutput(0), 18, DimsHW{ 1, 1 }, weightMap["stage3.0.fuse_layers.0.1.0.weight"], emptywts); assert(id_1135); id_1135->setStrideNd(DimsHW{ 1, 1 }); id_1135->setPaddingNd(DimsHW{ 0, 0 }); IScaleLayer* id_1136 = addBatchNorm2d(network, weightMap, *id_1135->getOutput(0), "stage3.0.fuse_layers.0.1.1", 1e-5); ILayer* id_1165 = netAddUpsample(network, id_1136->getOutput(0), 18, 2); IElementWiseLayer* id_1166 = network->addElementWise(*id_1165->getOutput(0), *id_1106->getOutput(0), ElementWiseOperation::kSUM); IConvolutionLayer* id_1167 = network->addConvolutionNd(*id_1134->getOutput(0), 18, DimsHW{ 1, 1 }, weightMap["stage3.0.fuse_layers.0.2.0.weight"], emptywts); assert(id_1167); id_1167->setStrideNd(DimsHW{ 1, 1 }); id_1167->setPaddingNd(DimsHW{ 0, 0 }); IScaleLayer* id_1168 = addBatchNorm2d(network, weightMap, *id_1167->getOutput(0), "stage3.0.fuse_layers.0.2.1", 1e-5); ILayer* id_1197 = netAddUpsample(network, id_1168->getOutput(0), 18, 4); IElementWiseLayer* id_1198 = network->addElementWise(*id_1166->getOutput(0), *id_1197->getOutput(0), ElementWiseOperation::kSUM); 
IActivationLayer* id_1199 = network->addActivation(*id_1198->getOutput(0), ActivationType::kRELU); //2 IConvolutionLayer* id_1200 = network->addConvolutionNd(*id_1106->getOutput(0), 36, DimsHW{ 3, 3 }, weightMap["stage3.0.fuse_layers.1.0.0.0.weight"], emptywts); assert(id_1200); id_1200->setStrideNd(DimsHW{ 2, 2 }); id_1200->setPaddingNd(DimsHW{ 1, 1 }); IScaleLayer* id_1201 = addBatchNorm2d(network, weightMap, *id_1200->getOutput(0), "stage3.0.fuse_layers.1.0.0.1", 1e-5); IElementWiseLayer* id_1202 = network->addElementWise(*id_1201->getOutput(0), *id_1120->getOutput(0), ElementWiseOperation::kSUM); IConvolutionLayer* id_1203 = network->addConvolutionNd(*id_1134->getOutput(0), 36, DimsHW{ 1, 1 }, weightMap["stage3.0.fuse_layers.1.2.0.weight"], emptywts); assert(id_1203); id_1203->setStrideNd(DimsHW{ 1, 1 }); id_1203->setPaddingNd(DimsHW{ 0, 0 }); IScaleLayer* id_1204 = addBatchNorm2d(network, weightMap, *id_1203->getOutput(0), "stage3.0.fuse_layers.1.2.1", 1e-5); ILayer* id_1233 = netAddUpsample(network, id_1204->getOutput(0), 36, 2); IElementWiseLayer* id_1234 = network->addElementWise(*id_1202->getOutput(0), *id_1233->getOutput(0), ElementWiseOperation::kSUM); IActivationLayer* id_1235 = network->addActivation(*id_1234->getOutput(0), ActivationType::kRELU); // 3 IConvolutionLayer* id_1236 = network->addConvolutionNd(*id_1106->getOutput(0), 18, DimsHW{ 3, 3 }, weightMap["stage3.0.fuse_layers.2.0.0.0.weight"], emptywts); assert(id_1236); id_1236->setStrideNd(DimsHW{ 2, 2 }); id_1236->setPaddingNd(DimsHW{ 1, 1 }); IScaleLayer* id_1237 = addBatchNorm2d(network, weightMap, *id_1236->getOutput(0), "stage3.0.fuse_layers.2.0.0.1", 1e-5); IActivationLayer* id_1238 = network->addActivation(*id_1237->getOutput(0), ActivationType::kRELU); IConvolutionLayer* id_1239 = network->addConvolutionNd(*id_1238->getOutput(0), 72, DimsHW{ 3, 3 }, weightMap["stage3.0.fuse_layers.2.0.1.0.weight"], emptywts); assert(id_1239); id_1239->setStrideNd(DimsHW{ 2, 2 }); 
id_1239->setPaddingNd(DimsHW{ 1, 1 }); IScaleLayer* id_1240 = addBatchNorm2d(network, weightMap, *id_1239->getOutput(0), "stage3.0.fuse_layers.2.0.1.1", 1e-5); IConvolutionLayer* id_1241 = network->addConvolutionNd(*id_1120->getOutput(0), 72, DimsHW{ 3, 3 }, weightMap["stage3.0.fuse_layers.2.1.0.0.weight"], emptywts); assert(id_1241); id_1241->setStrideNd(DimsHW{ 2, 2 }); id_1241->setPaddingNd(DimsHW{ 1, 1 }); IScaleLayer* id_1242 = addBatchNorm2d(network, weightMap, *id_1241->getOutput(0), "stage3.0.fuse_layers.2.1.0.1", 1e-5); IElementWiseLayer* id_1243 = network->addElementWise(*id_1240->getOutput(0), *id_1242->getOutput(0), ElementWiseOperation::kSUM); IElementWiseLayer* id_1244 = network->addElementWise(*id_1243->getOutput(0), *id_1134->getOutput(0), ElementWiseOperation::kSUM); IActivationLayer* id_1245 = network->addActivation(*id_1244->getOutput(0), ActivationType::kRELU); auto id_1252 = liteResBlock(network, weightMap, *id_1199->getOutput(0), 18, "stage3.1.branches.0.0"); auto id_1259 = liteResBlock(network, weightMap, *id_1252->getOutput(0), 18, "stage3.1.branches.0.1"); auto id_1266 = liteResBlock(network, weightMap, *id_1235->getOutput(0), 36, "stage3.1.branches.1.0"); auto id_1273 = liteResBlock(network, weightMap, *id_1266->getOutput(0), 36, "stage3.1.branches.1.1"); auto id_1280 = liteResBlock(network, weightMap, *id_1245->getOutput(0), 72, "stage3.1.branches.2.0"); auto id_1287 = liteResBlock(network, weightMap, *id_1280->getOutput(0), 72, "stage3.1.branches.2.1"); /////// ֱģ ܼ //1: 1259+up(1273)+up(1287) IConvolutionLayer* id_1288 = network->addConvolutionNd(*id_1273->getOutput(0), 18, DimsHW{ 1, 1 }, weightMap["stage3.1.fuse_layers.0.1.0.weight"], emptywts); assert(id_1288); id_1288->setStrideNd(DimsHW{ 1, 1 }); id_1288->setPaddingNd(DimsHW{ 0, 0 }); IScaleLayer* id_1289 = addBatchNorm2d(network, weightMap, *id_1288->getOutput(0), "stage3.1.fuse_layers.0.1.1", 1e-5); ILayer* id_1318 = netAddUpsample(network, id_1289->getOutput(0), 18, 2); 
IElementWiseLayer* id_1319 = network->addElementWise(*id_1259->getOutput(0), *id_1318->getOutput(0), ElementWiseOperation::kSUM); //1-2 up(1287) conv bn up IConvolutionLayer* id_1320 = network->addConvolutionNd(*id_1134->getOutput(0), 18, DimsHW{ 1, 1 }, weightMap["stage3.1.fuse_layers.0.2.0.weight"], emptywts); assert(id_1320); id_1320->setStrideNd(DimsHW{ 1, 1 }); id_1320->setPaddingNd(DimsHW{ 0, 0 }); IScaleLayer* id_1321 = addBatchNorm2d(network, weightMap, *id_1320->getOutput(0), "stage3.1.fuse_layers.0.2.1", 1e-5); ILayer* id_1350 = netAddUpsample(network, id_1321->getOutput(0), 18, 4); IElementWiseLayer* id_1351 = network->addElementWise(*id_1319->getOutput(0), *id_1350->getOutput(0), ElementWiseOperation::kSUM); IActivationLayer* id_1352 = network->addActivation(*id_1351->getOutput(0), ActivationType::kRELU); //2: conv(1259)+1273 + up(1287) IConvolutionLayer* id_1353 = network->addConvolutionNd(*id_1259->getOutput(0), 36, DimsHW{ 3, 3 }, weightMap["stage3.1.fuse_layers.1.0.0.0.weight"], emptywts); assert(id_1353); id_1353->setStrideNd(DimsHW{ 2, 2 }); id_1353->setPaddingNd(DimsHW{ 1, 1 }); IScaleLayer* id_1354 = addBatchNorm2d(network, weightMap, *id_1353->getOutput(0), "stage3.1.fuse_layers.1.0.0.1", 1e-5); IElementWiseLayer* id_1355 = network->addElementWise(*id_1354->getOutput(0), *id_1273->getOutput(0), ElementWiseOperation::kSUM); IConvolutionLayer* id_1356 = network->addConvolutionNd(*id_1287->getOutput(0), 36, DimsHW{ 1, 1 }, weightMap["stage3.1.fuse_layers.1.2.0.weight"], emptywts); assert(id_1356); id_1356->setStrideNd(DimsHW{ 1, 1 }); id_1356->setPaddingNd(DimsHW{ 0, 0 }); IScaleLayer* id_1357 = addBatchNorm2d(network, weightMap, *id_1356->getOutput(0), "stage3.1.fuse_layers.1.2.1", 1e-5); ILayer* id_1386 = netAddUpsample(network, id_1357->getOutput(0), 36, 2); IElementWiseLayer* id_1387 = network->addElementWise(*id_1355->getOutput(0), *id_1386->getOutput(0), ElementWiseOperation::kSUM); IActivationLayer* id_1388 = 
network->addActivation(*id_1387->getOutput(0), ActivationType::kRELU); //3 conv(1259)+conv(1273)+1287 IConvolutionLayer* id_1389 = network->addConvolutionNd(*id_1259->getOutput(0), 18, DimsHW{ 3, 3 }, weightMap["stage3.1.fuse_layers.2.0.0.0.weight"], emptywts); assert(id_1389); id_1389->setStrideNd(DimsHW{ 2, 2 }); id_1389->setPaddingNd(DimsHW{ 1, 1 }); IScaleLayer* id_1390 = addBatchNorm2d(network, weightMap, *id_1389->getOutput(0), "stage3.1.fuse_layers.2.0.0.1", 1e-5); IActivationLayer* id_1391 = network->addActivation(*id_1390->getOutput(0), ActivationType::kRELU); IConvolutionLayer* id_1392 = network->addConvolutionNd(*id_1391->getOutput(0), 72, DimsHW{ 3, 3 }, weightMap["stage3.1.fuse_layers.2.0.1.0.weight"], emptywts); assert(id_1392); id_1392->setStrideNd(DimsHW{ 2, 2 }); id_1392->setPaddingNd(DimsHW{ 1, 1 }); IScaleLayer* id_1393 = addBatchNorm2d(network, weightMap, *id_1392->getOutput(0), "stage3.1.fuse_layers.2.0.1.1", 1e-5); IConvolutionLayer* id_1394 = network->addConvolutionNd(*id_1273->getOutput(0), 72, DimsHW{ 3, 3 }, weightMap["stage3.1.fuse_layers.2.1.0.0.weight"], emptywts); assert(id_1394); id_1394->setStrideNd(DimsHW{ 2, 2 }); id_1394->setPaddingNd(DimsHW{ 1, 1 }); IScaleLayer* id_1395 = addBatchNorm2d(network, weightMap, *id_1394->getOutput(0), "stage3.1.fuse_layers.2.1.0.1", 1e-5); IElementWiseLayer* id_1396 = network->addElementWise(*id_1393->getOutput(0), *id_1395->getOutput(0), ElementWiseOperation::kSUM); IElementWiseLayer* id_1397 = network->addElementWise(*id_1396->getOutput(0), *id_1287->getOutput(0), ElementWiseOperation::kSUM); IActivationLayer* id_1398 = network->addActivation(*id_1397->getOutput(0), ActivationType::kRELU); auto id_1405 = liteResBlock(network, weightMap, *id_1352->getOutput(0), 18, "stage3.2.branches.0.0"); auto id_1412 = liteResBlock(network, weightMap, *id_1405->getOutput(0), 18, "stage3.2.branches.0.1"); auto id_1419 = liteResBlock(network, weightMap, *id_1388->getOutput(0), 36, "stage3.2.branches.1.0"); auto 
id_1426 = liteResBlock(network, weightMap, *id_1419->getOutput(0), 36, "stage3.2.branches.1.1"); auto id_1433 = liteResBlock(network, weightMap, *id_1398->getOutput(0), 72, "stage3.2.branches.2.0"); auto id_1440 = liteResBlock(network, weightMap, *id_1433->getOutput(0), 72, "stage3.2.branches.2.1"); // 1412 + up(1426)+up(1440) IConvolutionLayer* id_1441 = network->addConvolutionNd(*id_1426->getOutput(0), 18, DimsHW{ 1, 1 }, weightMap["stage3.2.fuse_layers.0.1.0.weight"], emptywts); assert(id_1441); id_1441->setStrideNd(DimsHW{ 1, 1 }); id_1441->setPaddingNd(DimsHW{ 0, 0 }); IScaleLayer* id_1442 = addBatchNorm2d(network, weightMap, *id_1441->getOutput(0), "stage3.2.fuse_layers.0.1.1", 1e-5); ILayer* id_1471 = netAddUpsample(network, id_1442->getOutput(0), 18, 2); IElementWiseLayer* id_1472 = network->addElementWise(*id_1412->getOutput(0), *id_1471->getOutput(0), ElementWiseOperation::kSUM); IConvolutionLayer* id_1473 = network->addConvolutionNd(*id_1440->getOutput(0), 18, DimsHW{ 1, 1 }, weightMap["stage3.2.fuse_layers.0.2.0.weight"], emptywts); assert(id_1473); id_1473->setStrideNd(DimsHW{ 1, 1 }); id_1473->setPaddingNd(DimsHW{ 0, 0 }); IScaleLayer* id_1474 = addBatchNorm2d(network, weightMap, *id_1473->getOutput(0), "stage3.2.fuse_layers.0.2.1", 1e-5); ILayer* id_1503 = netAddUpsample(network, id_1474->getOutput(0), 18, 4); IElementWiseLayer* id_1504 = network->addElementWise(*id_1472->getOutput(0), *id_1503->getOutput(0), ElementWiseOperation::kSUM); IActivationLayer* id_1505 = network->addActivation(*id_1504->getOutput(0), ActivationType::kRELU); // conv(1412)+1426+up(1440) IConvolutionLayer* id_1506 = network->addConvolutionNd(*id_1412->getOutput(0), 36, DimsHW{ 3, 3 }, weightMap["stage3.2.fuse_layers.1.0.0.0.weight"], emptywts); assert(id_1506); id_1506->setStrideNd(DimsHW{ 2, 2 }); id_1506->setPaddingNd(DimsHW{ 1, 1 }); IScaleLayer* id_1507 = addBatchNorm2d(network, weightMap, *id_1506->getOutput(0), "stage3.2.fuse_layers.1.0.0.1", 1e-5); IElementWiseLayer* 
id_1508 = network->addElementWise(*id_1507->getOutput(0), *id_1426->getOutput(0), ElementWiseOperation::kSUM); IConvolutionLayer* id_1509 = network->addConvolutionNd(*id_1440->getOutput(0), 36, DimsHW{ 1, 1 }, weightMap["stage3.2.fuse_layers.1.2.0.weight"], emptywts); assert(id_1509); id_1509->setStrideNd(DimsHW{ 1, 1 }); id_1509->setPaddingNd(DimsHW{ 0, 0 }); IScaleLayer* id_1510 = addBatchNorm2d(network, weightMap, *id_1509->getOutput(0), "stage3.2.fuse_layers.1.2.1", 1e-5); ILayer* id_1539 = netAddUpsample(network, id_1510->getOutput(0), 36, 2); IElementWiseLayer* id_1540 = network->addElementWise(*id_1508->getOutput(0), *id_1539->getOutput(0), ElementWiseOperation::kSUM); IActivationLayer* id_1541 = network->addActivation(*id_1540->getOutput(0), ActivationType::kRELU); // conv(1412)+conv(1426)+1440 IConvolutionLayer* id_1542 = network->addConvolutionNd(*id_1412->getOutput(0), 18, DimsHW{ 3, 3 }, weightMap["stage3.2.fuse_layers.2.0.0.0.weight"], emptywts); assert(id_1542); id_1542->setStrideNd(DimsHW{ 2, 2 }); id_1542->setPaddingNd(DimsHW{ 1, 1 }); IScaleLayer* id_1543 = addBatchNorm2d(network, weightMap, *id_1542->getOutput(0), "stage3.2.fuse_layers.2.0.0.1", 1e-5); IActivationLayer* id_1544 = network->addActivation(*id_1543->getOutput(0), ActivationType::kRELU); IConvolutionLayer* id_1545 = network->addConvolutionNd(*id_1544->getOutput(0), 72, DimsHW{ 3, 3 }, weightMap["stage3.2.fuse_layers.2.0.1.0.weight"], emptywts); assert(id_1545); id_1545->setStrideNd(DimsHW{ 2, 2 }); id_1545->setPaddingNd(DimsHW{ 1, 1 }); IScaleLayer* id_1546 = addBatchNorm2d(network, weightMap, *id_1545->getOutput(0), "stage3.2.fuse_layers.2.0.1.1", 1e-5); IConvolutionLayer* id_1547 = network->addConvolutionNd(*id_1426->getOutput(0), 72, DimsHW{ 3, 3 }, weightMap["stage3.2.fuse_layers.2.1.0.0.weight"], emptywts); assert(id_1547); id_1547->setStrideNd(DimsHW{ 2, 2 }); id_1547->setPaddingNd(DimsHW{ 1, 1 }); IScaleLayer* id_1548 = addBatchNorm2d(network, weightMap, *id_1547->getOutput(0), 
"stage3.2.fuse_layers.2.1.0.1", 1e-5); IElementWiseLayer* id_1549 = network->addElementWise(*id_1546->getOutput(0), *id_1548->getOutput(0), ElementWiseOperation::kSUM); IElementWiseLayer* id_1550 = network->addElementWise(*id_1549->getOutput(0), *id_1440->getOutput(0), ElementWiseOperation::kSUM); IActivationLayer* id_1551 = network->addActivation(*id_1550->getOutput(0), ActivationType::kRELU); auto id_1561 = liteResBlock(network, weightMap, *id_1505->getOutput(0), 18, "stage4.0.branches.0.0"); auto id_1568 = liteResBlock(network, weightMap, *id_1561->getOutput(0), 18, "stage4.0.branches.0.1"); auto id_1575 = liteResBlock(network, weightMap, *id_1541->getOutput(0), 36, "stage4.0.branches.1.0"); auto id_1582 = liteResBlock(network, weightMap, *id_1575->getOutput(0), 36, "stage4.0.branches.1.1"); auto id_1589 = liteResBlock(network, weightMap, *id_1551->getOutput(0), 72, "stage4.0.branches.2.0"); auto id_1596 = liteResBlock(network, weightMap, *id_1589->getOutput(0), 72, "stage4.0.branches.2.1"); // transition auto id_1554 = convBnLeaky(network, weightMap, *id_1551->getOutput(0), 144, 3, 2, 1, "transition3.3.0.0", "transition3.3.0.1"); auto id_1603 = liteResBlock(network, weightMap, *id_1554->getOutput(0), 144, "stage4.0.branches.3.0"); auto id_1610 = liteResBlock(network, weightMap, *id_1603->getOutput(0), 144, "stage4.0.branches.3.1"); // 1568+up(1582)+up(1596)+up(1610) IConvolutionLayer* id_1611 = network->addConvolutionNd(*id_1582->getOutput(0), 18, DimsHW{ 1, 1 }, weightMap["stage4.0.fuse_layers.0.1.0.weight"], emptywts); assert(id_1611); id_1611->setStrideNd(DimsHW{ 1, 1 }); id_1611->setPaddingNd(DimsHW{ 0, 0 }); IScaleLayer* id_1612 = addBatchNorm2d(network, weightMap, *id_1611->getOutput(0), "stage4.0.fuse_layers.0.1.1", 1e-5); ILayer* id_1641 = netAddUpsample(network, id_1612->getOutput(0), 18, 2); IElementWiseLayer* id_1642 = network->addElementWise(*id_1641->getOutput(0), *id_1568->getOutput(0), ElementWiseOperation::kSUM); IConvolutionLayer* id_1643 = 
network->addConvolutionNd(*id_1596->getOutput(0), 18, DimsHW{ 1, 1 }, weightMap["stage4.0.fuse_layers.0.2.0.weight"], emptywts); assert(id_1643); id_1643->setStrideNd(DimsHW{ 1, 1 }); id_1643->setPaddingNd(DimsHW{ 0, 0 }); IScaleLayer* id_1644 = addBatchNorm2d(network, weightMap, *id_1643->getOutput(0), "stage4.0.fuse_layers.0.2.1", 1e-5); ILayer* id_1673 = netAddUpsample(network, id_1644->getOutput(0), 18, 4); IElementWiseLayer* id_1674 = network->addElementWise(*id_1642->getOutput(0), *id_1673->getOutput(0), ElementWiseOperation::kSUM); //3 IConvolutionLayer* id_1675 = network->addConvolutionNd(*id_1610->getOutput(0), 18, DimsHW{ 1, 1 }, weightMap["stage4.0.fuse_layers.0.3.0.weight"], emptywts); assert(id_1675); id_1675->setStrideNd(DimsHW{ 1, 1 }); id_1675->setPaddingNd(DimsHW{ 0, 0 }); IScaleLayer* id_1676 = addBatchNorm2d(network, weightMap, *id_1675->getOutput(0), "stage4.0.fuse_layers.0.3.1", 1e-5); ILayer* id_1705 = netAddUpsample(network, id_1676->getOutput(0), 18, 8); IElementWiseLayer* id_1706 = network->addElementWise(*id_1705->getOutput(0), *id_1674->getOutput(0), ElementWiseOperation::kSUM); IActivationLayer* id_1707 = network->addActivation(*id_1706->getOutput(0), ActivationType::kRELU); // conv(1568)+1582+up(1596)+up(1610) IConvolutionLayer* id_1708 = network->addConvolutionNd(*id_1568->getOutput(0), 36, DimsHW{ 3, 3 }, weightMap["stage4.0.fuse_layers.1.0.0.0.weight"], emptywts); assert(id_1708); id_1708->setStrideNd(DimsHW{ 2, 2 }); id_1708->setPaddingNd(DimsHW{ 1, 1 }); IScaleLayer* id_1709 = addBatchNorm2d(network, weightMap, *id_1708->getOutput(0), "stage4.0.fuse_layers.1.0.0.1", 1e-5); IElementWiseLayer* id_1710 = network->addElementWise(*id_1709->getOutput(0), *id_1582->getOutput(0), ElementWiseOperation::kSUM); IConvolutionLayer* id_1711 = network->addConvolutionNd(*id_1596->getOutput(0), 36, DimsHW{ 1, 1 }, weightMap["stage4.0.fuse_layers.1.2.0.weight"], emptywts); assert(id_1711); id_1711->setStrideNd(DimsHW{ 1, 1 }); 
id_1711->setPaddingNd(DimsHW{ 0, 0 }); IScaleLayer* id_1712 = addBatchNorm2d(network, weightMap, *id_1711->getOutput(0), "stage4.0.fuse_layers.1.2.1", 1e-5); ILayer* id_1741 = netAddUpsample(network, id_1712->getOutput(0), 36, 2); IElementWiseLayer* id_1742 = network->addElementWise(*id_1741->getOutput(0), *id_1710->getOutput(0), ElementWiseOperation::kSUM); IConvolutionLayer* id_1743 = network->addConvolutionNd(*id_1610->getOutput(0), 36, DimsHW{ 1, 1 }, weightMap["stage4.0.fuse_layers.1.3.0.weight"], emptywts); assert(id_1743); id_1743->setStrideNd(DimsHW{ 1, 1 }); id_1743->setPaddingNd(DimsHW{ 0, 0 }); IScaleLayer* id_1744 = addBatchNorm2d(network, weightMap, *id_1743->getOutput(0), "stage4.0.fuse_layers.1.3.1", 1e-5); ILayer* id_1773 = netAddUpsample(network, id_1744->getOutput(0), 36, 4); IElementWiseLayer* id_1774 = network->addElementWise(*id_1773->getOutput(0), *id_1742->getOutput(0), ElementWiseOperation::kSUM); IActivationLayer* id_1775 = network->addActivation(*id_1774->getOutput(0), ActivationType::kRELU); // conv(1568)+conv(1582)+1596+up(1610) IConvolutionLayer* id_1776 = network->addConvolutionNd(*id_1568->getOutput(0), 18, DimsHW{ 3, 3 }, weightMap["stage4.0.fuse_layers.2.0.0.0.weight"], emptywts); assert(id_1776); id_1776->setStrideNd(DimsHW{ 2, 2 }); id_1776->setPaddingNd(DimsHW{ 1, 1 }); IScaleLayer* id_1777 = addBatchNorm2d(network, weightMap, *id_1776->getOutput(0), "stage4.0.fuse_layers.2.0.0.1", 1e-5); IActivationLayer* id_1778 = network->addActivation(*id_1777->getOutput(0), ActivationType::kRELU); IConvolutionLayer* id_1779 = network->addConvolutionNd(*id_1778->getOutput(0), 72, DimsHW{ 3, 3 }, weightMap["stage4.0.fuse_layers.2.0.1.0.weight"], emptywts); assert(id_1779); id_1779->setStrideNd(DimsHW{ 2, 2 }); id_1779->setPaddingNd(DimsHW{ 1, 1 }); IScaleLayer* id_1780 = addBatchNorm2d(network, weightMap, *id_1779->getOutput(0), "stage4.0.fuse_layers.2.0.1.1", 1e-5); IConvolutionLayer* id_1781 = 
network->addConvolutionNd(*id_1582->getOutput(0), 72, DimsHW{ 3, 3 }, weightMap["stage4.0.fuse_layers.2.1.0.0.weight"], emptywts); assert(id_1781); id_1781->setStrideNd(DimsHW{ 2, 2 }); id_1781->setPaddingNd(DimsHW{ 1, 1 }); IScaleLayer* id_1782 = addBatchNorm2d(network, weightMap, *id_1781->getOutput(0), "stage4.0.fuse_layers.2.1.0.1", 1e-5); IElementWiseLayer* id_1783 = network->addElementWise(*id_1780->getOutput(0), *id_1782->getOutput(0), ElementWiseOperation::kSUM); IElementWiseLayer* id_1784 = network->addElementWise(*id_1783->getOutput(0), *id_1596->getOutput(0), ElementWiseOperation::kSUM); IConvolutionLayer* id_1785 = network->addConvolutionNd(*id_1610->getOutput(0), 72, DimsHW{ 1, 1 }, weightMap["stage4.0.fuse_layers.2.3.0.weight"], emptywts); assert(id_1785); id_1785->setStrideNd(DimsHW{ 1, 1 }); id_1785->setPaddingNd(DimsHW{ 0, 0 }); IScaleLayer* id_1786 = addBatchNorm2d(network, weightMap, *id_1785->getOutput(0), "stage4.0.fuse_layers.2.3.1", 1e-5); ILayer* id_1815 = netAddUpsample(network, id_1786->getOutput(0), 72, 2); IElementWiseLayer* id_1816 = network->addElementWise(*id_1784->getOutput(0), *id_1815->getOutput(0), ElementWiseOperation::kSUM); IActivationLayer* id_1817 = network->addActivation(*id_1816->getOutput(0), ActivationType::kRELU); // conv(1568)+conv(1582)+conv(1596)+(1610) // 1568(cbr)1820(cbr)1823(cb)1825 IConvolutionLayer* id_1818 = network->addConvolutionNd(*id_1568->getOutput(0), 18, DimsHW{ 3, 3 }, weightMap["stage4.0.fuse_layers.3.0.0.0.weight"], emptywts); assert(id_1818); id_1818->setStrideNd(DimsHW{ 2, 2 }); id_1818->setPaddingNd(DimsHW{ 1, 1 }); IScaleLayer* id_1819 = addBatchNorm2d(network, weightMap, *id_1818->getOutput(0), "stage4.0.fuse_layers.3.0.0.1", 1e-5); IActivationLayer* id_1820 = network->addActivation(*id_1819->getOutput(0), ActivationType::kRELU); IConvolutionLayer* id_1821 = network->addConvolutionNd(*id_1820->getOutput(0), 18, DimsHW{ 3, 3 }, weightMap["stage4.0.fuse_layers.3.0.1.0.weight"], emptywts); 
assert(id_1821); id_1821->setStrideNd(DimsHW{ 2, 2 }); id_1821->setPaddingNd(DimsHW{ 1, 1 }); IScaleLayer* id_1822 = addBatchNorm2d(network, weightMap, *id_1821->getOutput(0), "stage4.0.fuse_layers.3.0.1.1", 1e-5); IActivationLayer* id_1823 = network->addActivation(*id_1822->getOutput(0), ActivationType::kRELU); IConvolutionLayer* id_1824 = network->addConvolutionNd(*id_1823->getOutput(0), 144, DimsHW{ 3, 3 }, weightMap["stage4.0.fuse_layers.3.0.2.0.weight"], emptywts); assert(id_1824); id_1824->setStrideNd(DimsHW{ 2, 2 }); id_1824->setPaddingNd(DimsHW{ 1, 1 }); IScaleLayer* id_1825 = addBatchNorm2d(network, weightMap, *id_1824->getOutput(0), "stage4.0.fuse_layers.3.0.2.1", 1e-5); // 1582(cbr)1828(cb)1830 IConvolutionLayer* id_1826 = network->addConvolutionNd(*id_1582->getOutput(0), 36, DimsHW{ 3, 3 }, weightMap["stage4.0.fuse_layers.3.1.0.0.weight"], emptywts); assert(id_1826); id_1826->setStrideNd(DimsHW{ 2, 2 }); id_1826->setPaddingNd(DimsHW{ 1, 1 }); IScaleLayer* id_1827 = addBatchNorm2d(network, weightMap, *id_1826->getOutput(0), "stage4.0.fuse_layers.3.1.0.1", 1e-5); IActivationLayer* id_1828 = network->addActivation(*id_1827->getOutput(0), ActivationType::kRELU); IConvolutionLayer* id_1829 = network->addConvolutionNd(*id_1828->getOutput(0), 144, DimsHW{ 3, 3 }, weightMap["stage4.0.fuse_layers.3.1.1.0.weight"], emptywts); assert(id_1829); id_1829->setStrideNd(DimsHW{ 2, 2 }); id_1829->setPaddingNd(DimsHW{ 1, 1 }); IScaleLayer* id_1830 = addBatchNorm2d(network, weightMap, *id_1829->getOutput(0), "stage4.0.fuse_layers.3.1.1.1", 1e-5); IElementWiseLayer* id_1831 = network->addElementWise(*id_1830->getOutput(0), *id_1825->getOutput(0), ElementWiseOperation::kSUM); // 1596(cb)1832 IConvolutionLayer* id_1832 = network->addConvolutionNd(*id_1596->getOutput(0), 144, DimsHW{ 3, 3 }, weightMap["stage4.0.fuse_layers.3.2.0.0.weight"], emptywts); assert(id_1832); id_1832->setStrideNd(DimsHW{ 2, 2 }); id_1832->setPaddingNd(DimsHW{ 1, 1 }); IScaleLayer* id_1833 = 
addBatchNorm2d(network, weightMap, *id_1832->getOutput(0), "stage4.0.fuse_layers.3.2.0.1", 1e-5); IElementWiseLayer* id_1834 = network->addElementWise(*id_1833->getOutput(0), *id_1831->getOutput(0), ElementWiseOperation::kSUM); IElementWiseLayer* id_1835 = network->addElementWise(*id_1834->getOutput(0), *id_1610->getOutput(0), ElementWiseOperation::kSUM); IActivationLayer* id_1836 = network->addActivation(*id_1835->getOutput(0), ActivationType::kRELU); auto id_1843 = liteResBlock(network, weightMap, *id_1707->getOutput(0), 18, "stage4.1.branches.0.0"); auto id_1850 = liteResBlock(network, weightMap, *id_1843->getOutput(0), 18, "stage4.1.branches.0.1"); auto id_1857 = liteResBlock(network, weightMap, *id_1775->getOutput(0), 36, "stage4.1.branches.1.0"); auto id_1864 = liteResBlock(network, weightMap, *id_1857->getOutput(0), 36, "stage4.1.branches.1.1"); auto id_1871 = liteResBlock(network, weightMap, *id_1817->getOutput(0), 72, "stage4.1.branches.2.0"); auto id_1878 = liteResBlock(network, weightMap, *id_1871->getOutput(0), 72, "stage4.1.branches.2.1"); auto id_1885 = liteResBlock(network, weightMap, *id_1836->getOutput(0), 144, "stage4.1.branches.3.0"); auto id_1892 = liteResBlock(network, weightMap, *id_1885->getOutput(0), 144, "stage4.1.branches.3.1"); // 1850+up1864+up1878+up1892 IConvolutionLayer* id_1893 = network->addConvolutionNd(*id_1864->getOutput(0), 18, DimsHW{ 1, 1 }, weightMap["stage4.1.fuse_layers.0.1.0.weight"], emptywts); assert(id_1893); id_1893->setStrideNd(DimsHW{ 1, 1 }); id_1893->setPaddingNd(DimsHW{ 0, 0 }); IScaleLayer* id_1894 = addBatchNorm2d(network, weightMap, *id_1893->getOutput(0), "stage4.1.fuse_layers.0.1.1", 1e-5); ILayer* id_1923 = netAddUpsample(network, id_1894->getOutput(0), 18, 2); IElementWiseLayer* id_1924 = network->addElementWise(*id_1850->getOutput(0), *id_1923->getOutput(0), ElementWiseOperation::kSUM); IConvolutionLayer* id_1925 = network->addConvolutionNd(*id_1878->getOutput(0), 18, DimsHW{ 1, 1 }, 
weightMap["stage4.1.fuse_layers.0.2.0.weight"], emptywts); assert(id_1925); id_1925->setStrideNd(DimsHW{ 1, 1 }); id_1925->setPaddingNd(DimsHW{ 0, 0 }); IScaleLayer* id_1926 = addBatchNorm2d(network, weightMap, *id_1925->getOutput(0), "stage4.1.fuse_layers.0.2.1", 1e-5); ILayer* id_1955 = netAddUpsample(network, id_1926->getOutput(0), 18, 4); IElementWiseLayer* id_1956 = network->addElementWise(*id_1924->getOutput(0), *id_1955->getOutput(0), ElementWiseOperation::kSUM); IConvolutionLayer* id_1957 = network->addConvolutionNd(*id_1892->getOutput(0), 18, DimsHW{ 1, 1 }, weightMap["stage4.1.fuse_layers.0.3.0.weight"], emptywts); assert(id_1957); id_1957->setStrideNd(DimsHW{ 1, 1 }); id_1957->setPaddingNd(DimsHW{ 0, 0 }); IScaleLayer* id_1958 = addBatchNorm2d(network, weightMap, *id_1957->getOutput(0), "stage4.1.fuse_layers.0.3.1", 1e-5); ILayer* id_1987 = netAddUpsample(network, id_1958->getOutput(0), 18, 8); IElementWiseLayer* id_1988 = network->addElementWise(*id_1956->getOutput(0), *id_1987->getOutput(0), ElementWiseOperation::kSUM); IActivationLayer* id_1989 = network->addActivation(*id_1988->getOutput(0), ActivationType::kRELU); // conv1850+1864+up1878+up1892 IConvolutionLayer* id_1990 = network->addConvolutionNd(*id_1850->getOutput(0), 36, DimsHW{ 3, 3 }, weightMap["stage4.1.fuse_layers.1.0.0.0.weight"], emptywts); assert(id_1990); id_1990->setStrideNd(DimsHW{ 2, 2 }); id_1990->setPaddingNd(DimsHW{ 1, 1 }); IScaleLayer* id_1991 = addBatchNorm2d(network, weightMap, *id_1990->getOutput(0), "stage4.1.fuse_layers.1.0.0.1", 1e-5); IElementWiseLayer* id_1992 = network->addElementWise(*id_1991->getOutput(0), *id_1864->getOutput(0), ElementWiseOperation::kSUM); IConvolutionLayer* id_1993 = network->addConvolutionNd(*id_1878->getOutput(0), 36, DimsHW{ 1, 1 }, weightMap["stage4.1.fuse_layers.1.2.0.weight"], emptywts); assert(id_1993); id_1993->setStrideNd(DimsHW{ 1, 1 }); id_1993->setPaddingNd(DimsHW{ 0, 0 }); IScaleLayer* id_1994 = addBatchNorm2d(network, weightMap, 
*id_1993->getOutput(0), "stage4.1.fuse_layers.1.2.1", 1e-5); ILayer* id_2023 = netAddUpsample(network, id_1994->getOutput(0), 36, 2); IElementWiseLayer* id_2024 = network->addElementWise(*id_1992->getOutput(0), *id_2023->getOutput(0), ElementWiseOperation::kSUM); IConvolutionLayer* id_2025 = network->addConvolutionNd(*id_1892->getOutput(0), 36, DimsHW{ 1, 1 }, weightMap["stage4.1.fuse_layers.1.3.0.weight"], emptywts); assert(id_2025); id_2025->setStrideNd(DimsHW{ 1, 1 }); id_2025->setPaddingNd(DimsHW{ 0, 0 }); IScaleLayer* id_2026 = addBatchNorm2d(network, weightMap, *id_2025->getOutput(0), "stage4.1.fuse_layers.1.3.1", 1e-5); ILayer* id_2055 = netAddUpsample(network, id_2026->getOutput(0), 36, 4); IElementWiseLayer* id_2056 = network->addElementWise(*id_2024->getOutput(0), *id_2055->getOutput(0), ElementWiseOperation::kSUM); IActivationLayer* id_2057 = network->addActivation(*id_2056->getOutput(0), ActivationType::kRELU); //conv1850 + conv 1864 + 1878 + up1892 IConvolutionLayer* id_2058 = network->addConvolutionNd(*id_1850->getOutput(0), 18, DimsHW{ 3, 3 }, weightMap["stage4.1.fuse_layers.2.0.0.0.weight"], emptywts); assert(id_2058); id_2058->setStrideNd(DimsHW{ 2, 2 }); id_2058->setPaddingNd(DimsHW{ 1, 1 }); IScaleLayer* id_2059 = addBatchNorm2d(network, weightMap, *id_2058->getOutput(0), "stage4.1.fuse_layers.2.0.0.1", 1e-5); IActivationLayer* id_2060 = network->addActivation(*id_2059->getOutput(0), ActivationType::kRELU); IConvolutionLayer* id_2061 = network->addConvolutionNd(*id_2060->getOutput(0), 72, DimsHW{ 3, 3 }, weightMap["stage4.1.fuse_layers.2.0.1.0.weight"], emptywts); assert(id_2061); id_2061->setStrideNd(DimsHW{ 2, 2 }); id_2061->setPaddingNd(DimsHW{ 1, 1 }); IScaleLayer* id_2062 = addBatchNorm2d(network, weightMap, *id_2061->getOutput(0), "stage4.1.fuse_layers.2.0.1.1", 1e-5); IConvolutionLayer* id_2063 = network->addConvolutionNd(*id_1864->getOutput(0), 72, DimsHW{ 3, 3 }, weightMap["stage4.1.fuse_layers.2.1.0.0.weight"], emptywts); 
assert(id_2063); id_2063->setStrideNd(DimsHW{ 2, 2 }); id_2063->setPaddingNd(DimsHW{ 1, 1 }); IScaleLayer* id_2064 = addBatchNorm2d(network, weightMap, *id_2063->getOutput(0), "stage4.1.fuse_layers.2.1.0.1", 1e-5); IElementWiseLayer* id_2065 = network->addElementWise(*id_2062->getOutput(0), *id_2064->getOutput(0), ElementWiseOperation::kSUM); IElementWiseLayer* id_2066 = network->addElementWise(*id_1878->getOutput(0), *id_2065->getOutput(0), ElementWiseOperation::kSUM); IConvolutionLayer* id_2067 = network->addConvolutionNd(*id_1892->getOutput(0), 72, DimsHW{ 1, 1 }, weightMap["stage4.1.fuse_layers.2.3.0.weight"], emptywts); assert(id_2067); id_2067->setStrideNd(DimsHW{ 1, 1 }); id_2067->setPaddingNd(DimsHW{ 0, 0 }); IScaleLayer* id_2068 = addBatchNorm2d(network, weightMap, *id_2067->getOutput(0), "stage4.1.fuse_layers.2.3.1", 1e-5); ILayer* id_2097 = netAddUpsample(network, id_2068->getOutput(0), 72, 2); IElementWiseLayer* id_2098 = network->addElementWise(*id_2097->getOutput(0), *id_2066->getOutput(0), ElementWiseOperation::kSUM); IActivationLayer* id_2099 = network->addActivation(*id_2098->getOutput(0), ActivationType::kRELU); // conv1850+conv1864+conv1878+1892 IConvolutionLayer* id_2100 = network->addConvolutionNd(*id_1850->getOutput(0), 18, DimsHW{ 3, 3 }, weightMap["stage4.1.fuse_layers.3.0.0.0.weight"], emptywts); assert(id_2100); id_2100->setStrideNd(DimsHW{ 2, 2 }); id_2100->setPaddingNd(DimsHW{ 1, 1 }); IScaleLayer* id_2101 = addBatchNorm2d(network, weightMap, *id_2100->getOutput(0), "stage4.1.fuse_layers.3.0.0.1", 1e-5); IActivationLayer* id_2102 = network->addActivation(*id_2101->getOutput(0), ActivationType::kRELU); IConvolutionLayer* id_2103 = network->addConvolutionNd(*id_2102->getOutput(0), 18, DimsHW{ 3, 3 }, weightMap["stage4.1.fuse_layers.3.0.1.0.weight"], emptywts); assert(id_2103); id_2103->setStrideNd(DimsHW{ 2, 2 }); id_2103->setPaddingNd(DimsHW{ 1, 1 }); IScaleLayer* id_2104 = addBatchNorm2d(network, weightMap, *id_2103->getOutput(0), 
"stage4.1.fuse_layers.3.0.1.1", 1e-5); IActivationLayer* id_2105 = network->addActivation(*id_2104->getOutput(0), ActivationType::kRELU); IConvolutionLayer* id_2106 = network->addConvolutionNd(*id_2105->getOutput(0), 144, DimsHW{ 3, 3 }, weightMap["stage4.1.fuse_layers.3.0.2.0.weight"], emptywts); assert(id_2106); id_2106->setStrideNd(DimsHW{ 2, 2 }); id_2106->setPaddingNd(DimsHW{ 1, 1 }); IScaleLayer* id_2107 = addBatchNorm2d(network, weightMap, *id_2106->getOutput(0), "stage4.1.fuse_layers.3.0.2.1", 1e-5); // IConvolutionLayer* id_2108 = network->addConvolutionNd(*id_1864->getOutput(0), 36, DimsHW{ 3, 3 }, weightMap["stage4.1.fuse_layers.3.1.0.0.weight"], emptywts); assert(id_2108); id_2108->setStrideNd(DimsHW{ 2, 2 }); id_2108->setPaddingNd(DimsHW{ 1, 1 }); IScaleLayer* id_2109 = addBatchNorm2d(network, weightMap, *id_2108->getOutput(0), "stage4.1.fuse_layers.3.1.0.1", 1e-5); IActivationLayer* id_2110 = network->addActivation(*id_2109->getOutput(0), ActivationType::kRELU); IConvolutionLayer* id_2111 = network->addConvolutionNd(*id_2110->getOutput(0), 144, DimsHW{ 3, 3 }, weightMap["stage4.1.fuse_layers.3.1.1.0.weight"], emptywts); assert(id_2111); id_2111->setStrideNd(DimsHW{ 2, 2 }); id_2111->setPaddingNd(DimsHW{ 1, 1 }); IScaleLayer* id_2112 = addBatchNorm2d(network, weightMap, *id_2111->getOutput(0), "stage4.1.fuse_layers.3.1.1.1", 1e-5); IElementWiseLayer* id_2113 = network->addElementWise(*id_2107->getOutput(0), *id_2112->getOutput(0), ElementWiseOperation::kSUM); IConvolutionLayer* id_2114 = network->addConvolutionNd(*id_1878->getOutput(0), 144, DimsHW{ 3, 3 }, weightMap["stage4.1.fuse_layers.3.2.0.0.weight"], emptywts); assert(id_2114); id_2114->setStrideNd(DimsHW{ 2, 2 }); id_2114->setPaddingNd(DimsHW{ 1, 1 }); IScaleLayer* id_2115 = addBatchNorm2d(network, weightMap, *id_2114->getOutput(0), "stage4.1.fuse_layers.3.2.0.1", 1e-5); IElementWiseLayer* id_2116 = network->addElementWise(*id_2113->getOutput(0), *id_2115->getOutput(0), 
ElementWiseOperation::kSUM); IElementWiseLayer* id_2117 = network->addElementWise(*id_2116->getOutput(0), *id_1892->getOutput(0), ElementWiseOperation::kSUM); IActivationLayer* id_2118 = network->addActivation(*id_2117->getOutput(0), ActivationType::kRELU); //res auto id_2174 = ResBlock2Conv(network, weightMap, *id_2118->getOutput(0), 256, 1024, 1, "incre_modules.3.0"); auto id_2158 = ResBlock2Conv(network, weightMap, *id_2099->getOutput(0), 128, 512, 1, "incre_modules.2.0"); auto id_2142 = ResBlock2Conv(network, weightMap, *id_2057->getOutput(0), 64, 256, 1, "incre_modules.1.0"); auto id_2130 = ResBlock2Conv(network, weightMap, *id_1989->getOutput(0), 32, 128, 1, "incre_modules.0.0"); auto id_2145 = convBnLeaky(network, weightMap, *id_2130->getOutput(0), 256, 3, 2, 1, "downsamp_modules.0.0", "downsamp_modules.0.1", true); IElementWiseLayer* id_2146 = network->addElementWise(*id_2145->getOutput(0), *id_2142->getOutput(0), ElementWiseOperation::kSUM); auto id_2161 = convBnLeaky(network, weightMap, *id_2146->getOutput(0), 512, 3, 2, 1, "downsamp_modules.1.0", "downsamp_modules.1.1", true); IElementWiseLayer* id_2162 = network->addElementWise(*id_2161->getOutput(0), *id_2158->getOutput(0), ElementWiseOperation::kSUM); auto id_2177 = convBnLeaky(network, weightMap, *id_2162->getOutput(0), 1024, 3, 2, 1, "downsamp_modules.2.0", "downsamp_modules.2.1", true); IElementWiseLayer* id_2178 = network->addElementWise(*id_2177->getOutput(0), *id_2174->getOutput(0), ElementWiseOperation::kSUM); auto id_2181 = convBnLeaky(network, weightMap, *id_2178->getOutput(0), 2048, 1, 1, 0, "final_layer.0", "final_layer.1", true); // y = F.avg_pool2d(y, kernel_size=y.size()[2:]).view(y.size(0), -1) auto pool = network->addPoolingNd(*id_2181->getOutput(0), PoolingType::kAVERAGE, DimsHW{ 7, 7 }); pool->setPaddingNd(DimsHW{ 0, 0 }); pool->setStrideNd(DimsHW{ 1, 1 }); // self.classifier = nn.Linear(2048, 1000) IFullyConnectedLayer* out = network->addFullyConnected(*pool->getOutput(0), 1000, 
weightMap["classifier.weight"], weightMap["classifier.bias"]); assert(out); out->getOutput(0)->setName(OUTPUT_BLOB_NAME); std::cout << "set name out" << std::endl; network->markOutput(*out->getOutput(0)); // Build engine builder->setMaxBatchSize(maxBatchSize); config->setMaxWorkspaceSize((1 << 30)); // 1G #ifdef USE_FP16 config->setFlag(BuilderFlag::kFP16); #endif ICudaEngine* engine = builder->buildEngineWithConfig(*network, *config); std::cout << "build out" << std::endl; // Don't need the network any more network->destroy(); // Release host memory for (auto& mem : weightMap) { free((void*)(mem.second.values)); } return engine; } void APIToModel(unsigned int maxBatchSize, IHostMemory** modelStream) { // Create builder IBuilder* builder = createInferBuilder(gLogger); IBuilderConfig* config = builder->createBuilderConfig(); // Create model to populate the network, then set the outputs and create an engine ICudaEngine* engine = createEngine(maxBatchSize, builder, config, DataType::kFLOAT); assert(engine != nullptr); // Serialize the engine (*modelStream) = engine->serialize(); // Close everything down engine->destroy(); builder->destroy(); } void doInference(IExecutionContext& context, float* input, float* output, int batchSize) { const ICudaEngine& engine = context.getEngine(); // Pointers to input and output device buffers to pass to engine. // Engine requires exactly IEngine::getNbBindings() number of buffers. assert(engine.getNbBindings() == 2); void* buffers[2]; // In order to bind the buffers, we need to know the names of the input and output tensors. 
// Note that indices are guaranteed to be less than IEngine::getNbBindings() const int inputIndex = engine.getBindingIndex(INPUT_BLOB_NAME); const int outputIndex = engine.getBindingIndex(OUTPUT_BLOB_NAME); // Create GPU buffers on device CHECK(cudaMalloc(&buffers[inputIndex], batchSize * 3 * INPUT_H * INPUT_W * sizeof(float))); CHECK(cudaMalloc(&buffers[outputIndex], batchSize * OUTPUT_SIZE * sizeof(float))); // Create stream cudaStream_t stream; CHECK(cudaStreamCreate(&stream)); // DMA input batch data to device, infer on the batch asynchronously, and DMA output back to host CHECK(cudaMemcpyAsync(buffers[inputIndex], input, batchSize * 3 * INPUT_H * INPUT_W * sizeof(float), cudaMemcpyHostToDevice, stream)); context.enqueue(batchSize, buffers, stream, nullptr); CHECK(cudaMemcpyAsync(output, buffers[outputIndex], batchSize * OUTPUT_SIZE * sizeof(float), cudaMemcpyDeviceToHost, stream)); cudaStreamSynchronize(stream); // Release stream and buffers cudaStreamDestroy(stream); CHECK(cudaFree(buffers[inputIndex])); CHECK(cudaFree(buffers[outputIndex])); } int main(int argc, char** argv) { cudaSetDevice(DEVICE); // create a model using the API directly and serialize it to a stream char *trtModelStream{ nullptr }; size_t size{ 0 }; std::string engine_name = "hrnet.engine"; if (argc == 2 && std::string(argv[1]) == "-s") { IHostMemory* modelStream{ nullptr }; APIToModel(BATCH_SIZE, &modelStream); assert(modelStream != nullptr); std::ofstream p(engine_name, std::ios::binary); if (!p) { std::cerr << "could not open plan output file" << std::endl; return -1; } p.write(reinterpret_cast(modelStream->data()), modelStream->size()); modelStream->destroy(); return 0; } else if (argc == 3 && std::string(argv[1]) == "-d") { std::ifstream file(engine_name, std::ios::binary); if (file.good()) { file.seekg(0, file.end); size = file.tellg(); file.seekg(0, file.beg); trtModelStream = new char[size]; assert(trtModelStream); file.read(trtModelStream, size); file.close(); } } else { std::cerr 
<< "arguments not right!" << std::endl; std::cerr << "./yolov5 -s // serialize model to plan file" << std::endl; std::cerr << "./yolov5 -d ../samples // deserialize plan file and run inference" << std::endl; return -1; } std::vector file_names; if (read_files_in_dir(argv[2], file_names) < 0) { std::cout << "read_files_in_dir failed." << std::endl; return -1; } // prepare input data --------------------------- static float data[BATCH_SIZE * 3 * INPUT_H * INPUT_W]; //for (int i = 0; i < 3 * INPUT_H * INPUT_W; i++) // data[i] = 1.0; static float prob[BATCH_SIZE * OUTPUT_SIZE]; IRuntime* runtime = createInferRuntime(gLogger); assert(runtime != nullptr); ICudaEngine* engine = runtime->deserializeCudaEngine(trtModelStream, size); assert(engine != nullptr); IExecutionContext* context = engine->createExecutionContext(); assert(context != nullptr); delete[] trtModelStream; /* mean = [0.485, 0.456, 0.406] std = [0.229, 0.224, 0.225] inp_image = ((resized_img/255. - mean) / std).astype(np.float32) */ int fcount = 0; for (int f = 0; f < (int)file_names.size(); f++) { fcount++; if (fcount < BATCH_SIZE && f + 1 != (int)file_names.size()) continue; for (int b = 0; b < fcount; b++) { cv::Mat img = cv::imread(std::string(argv[2]) + "/" + file_names[f - fcount + 1 + b]); // BGR if (img.empty()) continue; // cv::Mat pr_img = preprocess_img(img); // letterbox BGR to RGB cv::Mat pr_img; cv::resize(img, pr_img, cv::Size(INPUT_W, INPUT_H)); int i = 0; for (int row = 0; row < INPUT_H; ++row) { uchar* uc_pixel = pr_img.data + row * pr_img.step; for (int col = 0; col < INPUT_W; ++col) { data[b * 3 * INPUT_H * INPUT_W + i] = ((float)uc_pixel[2] / 255.0 - 0.485) / 0.229; // R-0.485 data[b * 3 * INPUT_H * INPUT_W + i + INPUT_H * INPUT_W] = ((float)uc_pixel[1] / 255.0 - 0.456) / 0.224; data[b * 3 * INPUT_H * INPUT_W + i + 2 * INPUT_H * INPUT_W] = ((float)uc_pixel[0] / 255.0 - 0.406) / 0.225; uc_pixel += 3; ++i; } } } // Run inference auto start = std::chrono::system_clock::now(); 
doInference(*context, data, prob, BATCH_SIZE); auto end = std::chrono::system_clock::now(); std::cout << "infer time: " << std::chrono::duration_cast(end - start).count() << "ms" << std::endl; float maxp = 0; int index = 0; for (int b = 0; b < fcount; b++) { for (int j = 0; j < 1000; ++j) { float p = prob[b * OUTPUT_SIZE + j]; if (p > maxp) { maxp = p; index = j; } } } std::cout << "out index: " << index << std::endl; } } ================================================ FILE: hrnet/hrnet-image-classification/logging.h ================================================ /* * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
*/ #ifndef TENSORRT_LOGGING_H #define TENSORRT_LOGGING_H #include "NvInferRuntimeCommon.h" #include #include #include #include #include #include #include using Severity = nvinfer1::ILogger::Severity; class LogStreamConsumerBuffer : public std::stringbuf { public: LogStreamConsumerBuffer(std::ostream& stream, const std::string& prefix, bool shouldLog) : mOutput(stream) , mPrefix(prefix) , mShouldLog(shouldLog) { } LogStreamConsumerBuffer(LogStreamConsumerBuffer&& other) : mOutput(other.mOutput) { } ~LogStreamConsumerBuffer() { // std::streambuf::pbase() gives a pointer to the beginning of the buffered part of the output sequence // std::streambuf::pptr() gives a pointer to the current position of the output sequence // if the pointer to the beginning is not equal to the pointer to the current position, // call putOutput() to log the output to the stream if (pbase() != pptr()) { putOutput(); } } // synchronizes the stream buffer and returns 0 on success // synchronizing the stream buffer consists of inserting the buffer contents into the stream, // resetting the buffer and flushing the stream virtual int sync() { putOutput(); return 0; } void putOutput() { if (mShouldLog) { // prepend timestamp std::time_t timestamp = std::time(nullptr); tm* tm_local = std::localtime(×tamp); std::cout << "["; std::cout << std::setw(2) << std::setfill('0') << 1 + tm_local->tm_mon << "/"; std::cout << std::setw(2) << std::setfill('0') << tm_local->tm_mday << "/"; std::cout << std::setw(4) << std::setfill('0') << 1900 + tm_local->tm_year << "-"; std::cout << std::setw(2) << std::setfill('0') << tm_local->tm_hour << ":"; std::cout << std::setw(2) << std::setfill('0') << tm_local->tm_min << ":"; std::cout << std::setw(2) << std::setfill('0') << tm_local->tm_sec << "] "; // std::stringbuf::str() gets the string contents of the buffer // insert the buffer contents pre-appended by the appropriate prefix into the stream mOutput << mPrefix << str(); // set the buffer to empty str(""); // flush 
the stream mOutput.flush(); } } void setShouldLog(bool shouldLog) { mShouldLog = shouldLog; } private: std::ostream& mOutput; std::string mPrefix; bool mShouldLog; }; //! //! \class LogStreamConsumerBase //! \brief Convenience object used to initialize LogStreamConsumerBuffer before std::ostream in LogStreamConsumer //! class LogStreamConsumerBase { public: LogStreamConsumerBase(std::ostream& stream, const std::string& prefix, bool shouldLog) : mBuffer(stream, prefix, shouldLog) { } protected: LogStreamConsumerBuffer mBuffer; }; //! //! \class LogStreamConsumer //! \brief Convenience object used to facilitate use of C++ stream syntax when logging messages. //! Order of base classes is LogStreamConsumerBase and then std::ostream. //! This is because the LogStreamConsumerBase class is used to initialize the LogStreamConsumerBuffer member field //! in LogStreamConsumer and then the address of the buffer is passed to std::ostream. //! This is necessary to prevent the address of an uninitialized buffer from being passed to std::ostream. //! Please do not change the order of the parent classes. //! class LogStreamConsumer : protected LogStreamConsumerBase, public std::ostream { public: //! \brief Creates a LogStreamConsumer which logs messages with level severity. //! Reportable severity determines if the messages are severe enough to be logged. 
LogStreamConsumer(Severity reportableSeverity, Severity severity) : LogStreamConsumerBase(severityOstream(severity), severityPrefix(severity), severity <= reportableSeverity) , std::ostream(&mBuffer) // links the stream buffer with the stream , mShouldLog(severity <= reportableSeverity) , mSeverity(severity) { } LogStreamConsumer(LogStreamConsumer&& other) : LogStreamConsumerBase(severityOstream(other.mSeverity), severityPrefix(other.mSeverity), other.mShouldLog) , std::ostream(&mBuffer) // links the stream buffer with the stream , mShouldLog(other.mShouldLog) , mSeverity(other.mSeverity) { } void setReportableSeverity(Severity reportableSeverity) { mShouldLog = mSeverity <= reportableSeverity; mBuffer.setShouldLog(mShouldLog); } private: static std::ostream& severityOstream(Severity severity) { return severity >= Severity::kINFO ? std::cout : std::cerr; } static std::string severityPrefix(Severity severity) { switch (severity) { case Severity::kINTERNAL_ERROR: return "[F] "; case Severity::kERROR: return "[E] "; case Severity::kWARNING: return "[W] "; case Severity::kINFO: return "[I] "; case Severity::kVERBOSE: return "[V] "; default: assert(0); return ""; } } bool mShouldLog; Severity mSeverity; }; //! \class Logger //! //! \brief Class which manages logging of TensorRT tools and samples //! //! \details This class provides a common interface for TensorRT tools and samples to log information to the console, //! and supports logging two types of messages: //! //! - Debugging messages with an associated severity (info, warning, error, or internal error/fatal) //! - Test pass/fail messages //! //! The advantage of having all samples use this class for logging as opposed to emitting directly to stdout/stderr is //! that the logic for controlling the verbosity and formatting of sample output is centralized in one location. //! //! In the future, this class could be extended to support dumping test results to a file in some standard format //! 
(for example, JUnit XML), and providing additional metadata (e.g. timing the duration of a test run). //! //! TODO: For backwards compatibility with existing samples, this class inherits directly from the nvinfer1::ILogger //! interface, which is problematic since there isn't a clean separation between messages coming from the TensorRT //! library and messages coming from the sample. //! //! In the future (once all samples are updated to use Logger::getTRTLogger() to access the ILogger) we can refactor the //! class to eliminate the inheritance and instead make the nvinfer1::ILogger implementation a member of the Logger //! object. class Logger : public nvinfer1::ILogger { public: Logger(Severity severity = Severity::kWARNING) : mReportableSeverity(severity) { } //! //! \enum TestResult //! \brief Represents the state of a given test //! enum class TestResult { kRUNNING, //!< The test is running kPASSED, //!< The test passed kFAILED, //!< The test failed kWAIVED //!< The test was waived }; //! //! \brief Forward-compatible method for retrieving the nvinfer::ILogger associated with this Logger //! \return The nvinfer1::ILogger associated with this Logger //! //! TODO Once all samples are updated to use this method to register the logger with TensorRT, //! we can eliminate the inheritance of Logger from ILogger //! nvinfer1::ILogger& getTRTLogger() { return *this; } //! //! \brief Implementation of the nvinfer1::ILogger::log() virtual method //! //! Note samples should not be calling this function directly; it will eventually go away once we eliminate the //! inheritance from nvinfer1::ILogger //! void log(Severity severity, const char* msg) override { LogStreamConsumer(mReportableSeverity, severity) << "[TRT] " << std::string(msg) << std::endl; } //! //! \brief Method for controlling the verbosity of logging output //! //! \param severity The logger will only emit messages that have severity of this level or higher. //! 
void setReportableSeverity(Severity severity) { mReportableSeverity = severity; } //! //! \brief Opaque handle that holds logging information for a particular test //! //! This object is an opaque handle to information used by the Logger to print test results. //! The sample must call Logger::defineTest() in order to obtain a TestAtom that can be used //! with Logger::reportTest{Start,End}(). //! class TestAtom { public: TestAtom(TestAtom&&) = default; private: friend class Logger; TestAtom(bool started, const std::string& name, const std::string& cmdline) : mStarted(started) , mName(name) , mCmdline(cmdline) { } bool mStarted; std::string mName; std::string mCmdline; }; //! //! \brief Define a test for logging //! //! \param[in] name The name of the test. This should be a string starting with //! "TensorRT" and containing dot-separated strings containing //! the characters [A-Za-z0-9_]. //! For example, "TensorRT.sample_googlenet" //! \param[in] cmdline The command line used to reproduce the test // //! \return a TestAtom that can be used in Logger::reportTest{Start,End}(). //! static TestAtom defineTest(const std::string& name, const std::string& cmdline) { return TestAtom(false, name, cmdline); } //! //! \brief A convenience overloaded version of defineTest() that accepts an array of command-line arguments //! as input //! //! \param[in] name The name of the test //! \param[in] argc The number of command-line arguments //! \param[in] argv The array of command-line arguments (given as C strings) //! //! \return a TestAtom that can be used in Logger::reportTest{Start,End}(). static TestAtom defineTest(const std::string& name, int argc, char const* const* argv) { auto cmdline = genCmdlineString(argc, argv); return defineTest(name, cmdline); } //! //! \brief Report that a test has started. //! //! \pre reportTestStart() has not been called yet for the given testAtom //! //! \param[in] testAtom The handle to the test that has started //! 
static void reportTestStart(TestAtom& testAtom) { reportTestResult(testAtom, TestResult::kRUNNING); assert(!testAtom.mStarted); testAtom.mStarted = true; } //! //! \brief Report that a test has ended. //! //! \pre reportTestStart() has been called for the given testAtom //! //! \param[in] testAtom The handle to the test that has ended //! \param[in] result The result of the test. Should be one of TestResult::kPASSED, //! TestResult::kFAILED, TestResult::kWAIVED //! static void reportTestEnd(const TestAtom& testAtom, TestResult result) { assert(result != TestResult::kRUNNING); assert(testAtom.mStarted); reportTestResult(testAtom, result); } static int reportPass(const TestAtom& testAtom) { reportTestEnd(testAtom, TestResult::kPASSED); return EXIT_SUCCESS; } static int reportFail(const TestAtom& testAtom) { reportTestEnd(testAtom, TestResult::kFAILED); return EXIT_FAILURE; } static int reportWaive(const TestAtom& testAtom) { reportTestEnd(testAtom, TestResult::kWAIVED); return EXIT_SUCCESS; } static int reportTest(const TestAtom& testAtom, bool pass) { return pass ? reportPass(testAtom) : reportFail(testAtom); } Severity getReportableSeverity() const { return mReportableSeverity; } private: //! //! \brief returns an appropriate string for prefixing a log message with the given severity //! static const char* severityPrefix(Severity severity) { switch (severity) { case Severity::kINTERNAL_ERROR: return "[F] "; case Severity::kERROR: return "[E] "; case Severity::kWARNING: return "[W] "; case Severity::kINFO: return "[I] "; case Severity::kVERBOSE: return "[V] "; default: assert(0); return ""; } } //! //! \brief returns an appropriate string for prefixing a test result message with the given result //! 
static const char* testResultString(TestResult result) { switch (result) { case TestResult::kRUNNING: return "RUNNING"; case TestResult::kPASSED: return "PASSED"; case TestResult::kFAILED: return "FAILED"; case TestResult::kWAIVED: return "WAIVED"; default: assert(0); return ""; } } //! //! \brief returns an appropriate output stream (cout or cerr) to use with the given severity //! static std::ostream& severityOstream(Severity severity) { return severity >= Severity::kINFO ? std::cout : std::cerr; } //! //! \brief method that implements logging test results //! static void reportTestResult(const TestAtom& testAtom, TestResult result) { severityOstream(Severity::kINFO) << "&&&& " << testResultString(result) << " " << testAtom.mName << " # " << testAtom.mCmdline << std::endl; } //! //! \brief generate a command line string from the given (argc, argv) values //! static std::string genCmdlineString(int argc, char const* const* argv) { std::stringstream ss; for (int i = 0; i < argc; i++) { if (i > 0) ss << " "; ss << argv[i]; } return ss.str(); } Severity mReportableSeverity; }; namespace { //! //! \brief produces a LogStreamConsumer object that can be used to log messages of severity kVERBOSE //! //! Example usage: //! //! LOG_VERBOSE(logger) << "hello world" << std::endl; //! inline LogStreamConsumer LOG_VERBOSE(const Logger& logger) { return LogStreamConsumer(logger.getReportableSeverity(), Severity::kVERBOSE); } //! //! \brief produces a LogStreamConsumer object that can be used to log messages of severity kINFO //! //! Example usage: //! //! LOG_INFO(logger) << "hello world" << std::endl; //! inline LogStreamConsumer LOG_INFO(const Logger& logger) { return LogStreamConsumer(logger.getReportableSeverity(), Severity::kINFO); } //! //! \brief produces a LogStreamConsumer object that can be used to log messages of severity kWARNING //! //! Example usage: //! //! LOG_WARN(logger) << "hello world" << std::endl; //! 
inline LogStreamConsumer LOG_WARN(const Logger& logger) { return LogStreamConsumer(logger.getReportableSeverity(), Severity::kWARNING); } //! //! \brief produces a LogStreamConsumer object that can be used to log messages of severity kERROR //! //! Example usage: //! //! LOG_ERROR(logger) << "hello world" << std::endl; //! inline LogStreamConsumer LOG_ERROR(const Logger& logger) { return LogStreamConsumer(logger.getReportableSeverity(), Severity::kERROR); } //! //! \brief produces a LogStreamConsumer object that can be used to log messages of severity kINTERNAL_ERROR // ("fatal" severity) //! //! Example usage: //! //! LOG_FATAL(logger) << "hello world" << std::endl; //! inline LogStreamConsumer LOG_FATAL(const Logger& logger) { return LogStreamConsumer(logger.getReportableSeverity(), Severity::kINTERNAL_ERROR); } } // anonymous namespace #endif // TENSORRT_LOGGING_H ================================================ FILE: hrnet/hrnet-semantic-segmentation/CMakeLists.txt ================================================ cmake_minimum_required(VERSION 2.6) project(hrnetseg) add_definitions(-std=c++11) option(CUDA_USE_STATIC_CUDA_RUNTIME OFF) set(CMAKE_CXX_STANDARD 11) set(CMAKE_BUILD_TYPE Debug) find_package(CUDA REQUIRED) include_directories(${PROJECT_SOURCE_DIR}/include) # include and link dirs of cuda and tensorrt, you need adapt them if yours are different # cuda include_directories(/usr/local/cuda/include) link_directories(/usr/local/cuda/lib64) # tensorrt include_directories(/usr/include/x86_64-linux-gnu/) link_directories(/usr/lib/x86_64-linux-gnu/) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11 -Wall -Ofast -Wfatal-errors -D_MWAITXINTRIN_H_INCLUDED") find_package(OpenCV) include_directories(${OpenCV_INCLUDE_DIRS}) add_executable(hrnet ${PROJECT_SOURCE_DIR}/hrnet.cpp) target_link_libraries(hrnet nvinfer) target_link_libraries(hrnet cudart) target_link_libraries(hrnet ${OpenCV_LIBS}) add_executable(hrnet_ocr ${PROJECT_SOURCE_DIR}/hrnet_ocr.cpp) 
target_link_libraries(hrnet_ocr nvinfer) target_link_libraries(hrnet_ocr cudart) target_link_libraries(hrnet_ocr ${OpenCV_LIBS}) add_definitions(-O2 -pthread) ================================================ FILE: hrnet/hrnet-semantic-segmentation/README.md ================================================ # HRNet-Semantic-Segmentation This repo implements [HRNet-Semantic-Segmentation-v1.1](https://github.com/HRNet/HRNet-Semantic-Segmentation/tree/pytorch-v1.1) and [HRNet-Semantic-Segmentation-OCR](https://github.com/HRNet/HRNet-Semantic-Segmentation/tree/HRNet-OCR). ## How to Run ### For HRNet-Semantic-Segmentation-v1.1 1. Generate the .wts file, using config `experiments/cityscapes/seg_hrnet_w48_train_512x1024_sgd_lr1e-2_wd5e-4_bs_12_epoch484.yaml` and pretrained weight `hrnet_w48_cityscapes_cls19_1024x2048_trainset.pth` as an example. Change `PRETRAINED` in `experiments/cityscapes/seg_hrnet_w48_train_512x1024_sgd_lr1e-2_wd5e-4_bs_12_epoch484.yaml` to `""`. ``` cp gen_wts.py $HRNET-Semantic-Segmentation-PROJECT-ROOT/tools cd $HRNET-Semantic-Segmentation-PROJECT-ROOT python tools/gen_wts.py --cfg experiments/cityscapes/seg_hrnet_w48_train_512x1024_sgd_lr1e-2_wd5e-4_bs_12_epoch484.yaml --ckpt_path hrnet_w48_cityscapes_cls19_1024x2048_trainset.pth --save_path hrnet_w48.wts cp hrnet_w48.wts $HRNET-TENSORRT-ROOT cd $HRNET-TENSORRT-ROOT ``` 2. Build with cmake and make ``` mkdir build cd build cmake .. make ``` First, serialize the model to a plan file: ``` ./hrnet -s [.wts] [.engine] [small or 18 or 32 or 48] # small for W18-Small-v2, 18 for W18, etc. ``` For example: ``` ./hrnet -s ../hrnet_w48.wts ./hrnet_w48.engine 48 ``` Then deserialize the plan file and run inference: ``` ./hrnet -d [.engine] [image dir] ``` For example: ``` ./hrnet -d ./hrnet_w48.engine ../samples ``` ### For HRNet-Semantic-Segmentation-OCR 1. Generate the .wts file, using config `experiments/cityscapes/seg_hrnet_ocr_w48_train_512x1024_sgd_lr1e-2_wd5e-4_bs_12_epoch484.yaml` and pretrained weight `hrnet_ocr_cs_8162_torch11.pth` as an example.
Change `PRETRAINED` in `experiments/cityscapes/seg_hrnet_ocr_w48_train_512x1024_sgd_lr1e-2_wd5e-4_bs_12_epoch484.yaml` to `""`. ``` cp gen_wts.py $HRNET-OCR-PROJECT-ROOT/tools cd $HRNET-OCR-PROJECT-ROOT python tools/gen_wts.py --cfg experiments/cityscapes/seg_hrnet_ocr_w48_train_512x1024_sgd_lr1e-2_wd5e-4_bs_12_epoch484.yaml --ckpt_path hrnet_ocr_cs_8162_torch11.pth --save_path hrnet_ocr_w48.wts cp hrnet_ocr_w48.wts $HRNET-OCR-TENSORRT-ROOT cd $HRNET-OCR-TENSORRT-ROOT ``` 2. Build with cmake and make ``` mkdir build cd build cmake .. make ``` First, serialize the model to a plan file: ``` ./hrnet_ocr -s [.wts] [.engine] [18 or 32 or 48] ``` For example: ``` ./hrnet_ocr -s ../hrnet_ocr_w48.wts ./hrnet_ocr_w48.engine 48 ``` Then deserialize the plan file and run inference: ``` ./hrnet_ocr -d [.engine] [image dir] ``` For example: ``` ./hrnet_ocr -d ./hrnet_ocr_w48.engine ../samples ``` ## Result TensorRT result: ![trtcity](https://user-images.githubusercontent.com/20653176/103136469-a68e2080-46fb-11eb-9f05-06bad81c74b9.png) PyTorch result: ![image-20201225171224159](https://user-images.githubusercontent.com/20653176/103131619-6cf9ed00-46dc-11eb-9369-4374abb65744.png) ## Note * Some of the source code has been changed for simplicity, but the original model can still be used. All "upsample" ops in the source code are changed to `mode='bilinear', align_corners=True`. * Image preprocessing and postprocessing operations are built into the TensorRT engine. * Zero-copy technology (CPU/GPU memory copy) is used.
================================================ FILE: hrnet/hrnet-semantic-segmentation/common.hpp ================================================ #pragma once #include #include #include #include #include #include #include "NvInfer.h" #include "NvInferPlugin.h" #include "cuda_runtime_api.h" using namespace nvinfer1; #define CHECK(status) \ do \ { \ auto ret = (status); \ if (ret != 0) \ { \ std::cerr << "Cuda failure: " << ret << std::endl; \ abort(); \ } \ } while (0) int read_files_in_dir(const char *p_dir_name, std::vector &file_names) { DIR *p_dir = opendir(p_dir_name); if (p_dir == nullptr) { return -1; } struct dirent* p_file = nullptr; while ((p_file = readdir(p_dir)) != nullptr) { if (strcmp(p_file->d_name, ".") != 0 && strcmp(p_file->d_name, "..") != 0) { //std::string cur_file_name(p_dir_name); //cur_file_name += "/"; //cur_file_name += p_file->d_name; std::string cur_file_name(p_file->d_name); file_names.push_back(cur_file_name); } } closedir(p_dir); return 0; } // TensorRT weight files have a simple space delimited format: // [type] [size] void debug_print(ITensor *input_tensor, std::string head) { std::cout << head << " : "; for (int i = 0; i < input_tensor->getDimensions().nbDims; i++) { std::cout << input_tensor->getDimensions().d[i] << " "; } std::cout << std::endl; } std::map loadWeights(const std::string file) { std::cout << "Loading weights: " << file << std::endl; std::map weightMap; // Open weights file std::ifstream input(file); assert(input.is_open() && "Unable to load weight file."); // Read number of weight blobs int32_t count; input >> count; assert(count > 0 && "Invalid weight map file."); while (count--) { Weights wt{DataType::kFLOAT, nullptr, 0}; uint32_t size; // Read name and type of blob std::string name; input >> name >> std::dec >> size; wt.type = DataType::kFLOAT; // Load blob uint32_t *val = reinterpret_cast(malloc(sizeof(val) * size)); for (uint32_t x = 0, y = size; x < y; ++x) { input >> std::hex >> val[x]; } wt.values = val; 
wt.count = size; weightMap[name] = wt; } return weightMap; } cv::Mat createLTU(int len) { cv::Mat lookUpTable(1, 256, CV_8U); uchar *p = lookUpTable.data; for (int j = 0; j < 256; ++j) { p[j] = (j * (256 / len) > 255) ? uchar(255) : (uchar)(j * (256 / len)); } return lookUpTable; } ITensor *MeanStd(INetworkDefinition *network, ITensor *input, float *mean, float *std, bool div255) { if (div255) { Weights Div_225{DataType::kFLOAT, nullptr, 3}; float *wgt = reinterpret_cast(malloc(sizeof(float) * 3)); for (int i = 0; i < 3; ++i) { wgt[i] = 255.0f; } Div_225.values = wgt; IConstantLayer *d = network->addConstant(Dims3{3, 1, 1}, Div_225); input = network->addElementWise(*input, *d->getOutput(0), ElementWiseOperation::kDIV)->getOutput(0); } Weights Mean{DataType::kFLOAT, nullptr, 3}; Mean.values = mean; IConstantLayer *m = network->addConstant(Dims3{3, 1, 1}, Mean); IElementWiseLayer *sub_mean = network->addElementWise(*input, *m->getOutput(0), ElementWiseOperation::kSUB); if (std != nullptr) { Weights Std{DataType::kFLOAT, nullptr, 3}; Std.values = std; IConstantLayer *s = network->addConstant(Dims3{3, 1, 1}, Std); IElementWiseLayer *std_mean = network->addElementWise(*sub_mean->getOutput(0), *s->getOutput(0), ElementWiseOperation::kDIV); return std_mean->getOutput(0); } else { return sub_mean->getOutput(0); } } IScaleLayer *addBatchNorm2d(INetworkDefinition *network, std::map &weightMap, ITensor &input, std::string lname, float eps) { float *gamma = (float *)weightMap[lname + ".weight"].values; float *beta = (float *)weightMap[lname + ".bias"].values; float *mean = (float *)weightMap[lname + ".running_mean"].values; float *var = (float *)weightMap[lname + ".running_var"].values; int len = weightMap[lname + ".running_var"].count; //std::cout << "len " << len << std::endl; float *scval = reinterpret_cast(malloc(sizeof(float) * len)); for (int i = 0; i < len; i++) { scval[i] = gamma[i] / sqrt(var[i] + eps); } Weights scale{DataType::kFLOAT, scval, len}; float *shval = 
reinterpret_cast(malloc(sizeof(float) * len)); for (int i = 0; i < len; i++) { shval[i] = beta[i] - mean[i] * gamma[i] / sqrt(var[i] + eps); } Weights shift{DataType::kFLOAT, shval, len}; float *pval = reinterpret_cast(malloc(sizeof(float) * len)); for (int i = 0; i < len; i++) { pval[i] = 1.0; } Weights power{DataType::kFLOAT, pval, len}; weightMap[lname + ".scale"] = scale; weightMap[lname + ".shift"] = shift; weightMap[lname + ".power"] = power; IScaleLayer *scale_1 = network->addScale(input, ScaleMode::kCHANNEL, shift, scale, power); assert(scale_1); return scale_1; } ILayer *convBnRelu(INetworkDefinition *network, std::map &weightMap, ITensor &input, int outch, int ksize, int s, int p, std::string convname, std::string bnname, bool relu = true, bool bias = false) { Weights emptywts{DataType::kFLOAT, nullptr, 0}; IConvolutionLayer *conv1; //Dims dim; if (!bias) { conv1 = network->addConvolutionNd(input, outch, DimsHW{ksize, ksize}, weightMap[convname + ".weight"], emptywts); } else { conv1 = network->addConvolutionNd(input, outch, DimsHW{ksize, ksize}, weightMap[convname + ".weight"], weightMap[convname + ".bias"]); } assert(conv1); conv1->setStrideNd(DimsHW{s, s}); conv1->setPaddingNd(DimsHW{p, p}); debug_print(conv1->getOutput(0), convname); IScaleLayer *bn1 = addBatchNorm2d(network, weightMap, *conv1->getOutput(0), bnname, 1e-5); debug_print(bn1->getOutput(0), bnname); if (relu) { auto lr = network->addActivation(*bn1->getOutput(0), ActivationType::kRELU); return lr; } return bn1; } IActivationLayer *ResBlock2Conv(INetworkDefinition *network, std::map &weightMap, ITensor &input, int inch, int outch, int stride, std::string lname) { Weights emptywts{DataType::kFLOAT, nullptr, 0}; IConvolutionLayer *conv1 = network->addConvolutionNd(input, inch, DimsHW{1, 1}, weightMap[lname + ".conv1.weight"], emptywts); assert(conv1); conv1->setStrideNd(DimsHW{stride, stride}); conv1->setPaddingNd(DimsHW{0, 0}); debug_print(conv1->getOutput(0), lname + "_1"); IScaleLayer 
*bn1 = addBatchNorm2d(network, weightMap, *conv1->getOutput(0), lname + ".bn1", 1e-5); IActivationLayer *relu1 = network->addActivation(*bn1->getOutput(0), ActivationType::kRELU); assert(relu1); /// IConvolutionLayer *conv2 = network->addConvolutionNd(*relu1->getOutput(0), inch, DimsHW{3, 3}, weightMap[lname + ".conv2.weight"], emptywts); assert(conv2); conv2->setStrideNd(DimsHW{stride, stride}); conv2->setPaddingNd(DimsHW{1, 1}); debug_print(conv2->getOutput(0), lname + "_2"); IScaleLayer *bn2 = addBatchNorm2d(network, weightMap, *conv2->getOutput(0), lname + ".bn2", 1e-5); IActivationLayer *relu2 = network->addActivation(*bn2->getOutput(0), ActivationType::kRELU); assert(relu2); ////// IConvolutionLayer *conv3 = network->addConvolutionNd(*relu2->getOutput(0), outch, DimsHW{1, 1}, weightMap[lname + ".conv3.weight"], emptywts); assert(conv3); conv3->setStrideNd(DimsHW{stride, stride}); conv3->setPaddingNd(DimsHW{0, 0}); debug_print(conv3->getOutput(0), lname + "_3"); IScaleLayer *bn3 = addBatchNorm2d(network, weightMap, *conv3->getOutput(0), lname + ".bn3", 1e-5); IElementWiseLayer *ew1; if (inch != outch) { IConvolutionLayer *conv4 = network->addConvolutionNd(input, outch, DimsHW{1, 1}, weightMap[lname + ".downsample.0.weight"], emptywts); assert(conv4); conv4->setStrideNd(DimsHW{stride, stride}); conv4->setPaddingNd(DimsHW{0, 0}); debug_print(conv4->getOutput(0), lname + "_4"); IScaleLayer *bn4 = addBatchNorm2d(network, weightMap, *conv4->getOutput(0), lname + ".downsample.1", 1e-5); ew1 = network->addElementWise(*bn4->getOutput(0), *bn3->getOutput(0), ElementWiseOperation::kSUM); } else { ew1 = network->addElementWise(input, *bn3->getOutput(0), ElementWiseOperation::kSUM); } IActivationLayer *relu3 = network->addActivation(*ew1->getOutput(0), ActivationType::kRELU); assert(relu3); return relu3; } IActivationLayer *ResBlock(INetworkDefinition *network, std::map &weightMap, ITensor &input, int inch, int outch, int stride, std::string lname) { Weights 
emptywts{DataType::kFLOAT, nullptr, 0}; // in 256 out 64 IConvolutionLayer *conv1 = network->addConvolutionNd(input, outch, DimsHW{1, 1}, weightMap[lname + ".conv1.weight"], emptywts); assert(conv1); conv1->setStrideNd(DimsHW{stride, stride}); conv1->setPaddingNd(DimsHW{0, 0}); debug_print(conv1->getOutput(0), lname + "_1"); IScaleLayer *bn1 = addBatchNorm2d(network, weightMap, *conv1->getOutput(0), lname + ".bn1", 1e-5); IActivationLayer *relu1 = network->addActivation(*bn1->getOutput(0), ActivationType::kRELU); assert(relu1); /// IConvolutionLayer *conv2 = network->addConvolutionNd(*relu1->getOutput(0), outch, DimsHW{3, 3}, weightMap[lname + ".conv2.weight"], emptywts); assert(conv2); conv2->setStrideNd(DimsHW{stride, stride}); conv2->setPaddingNd(DimsHW{1, 1}); debug_print(conv2->getOutput(0), lname + "_2"); IScaleLayer *bn2 = addBatchNorm2d(network, weightMap, *conv2->getOutput(0), lname + ".bn2", 1e-5); IActivationLayer *relu2 = network->addActivation(*bn2->getOutput(0), ActivationType::kRELU); assert(relu2); ////// IConvolutionLayer *conv3 = network->addConvolutionNd(*relu2->getOutput(0), inch, DimsHW{1, 1}, weightMap[lname + ".conv3.weight"], emptywts); assert(conv3); conv3->setStrideNd(DimsHW{stride, stride}); conv3->setPaddingNd(DimsHW{0, 0}); debug_print(conv3->getOutput(0), lname + "_3"); IScaleLayer *bn3 = addBatchNorm2d(network, weightMap, *conv3->getOutput(0), lname + ".bn3", 1e-5); IElementWiseLayer *ew1; ew1 = network->addElementWise(input, *bn3->getOutput(0), ElementWiseOperation::kSUM); IActivationLayer *relu3 = network->addActivation(*ew1->getOutput(0), ActivationType::kRELU); assert(relu3); return relu3; } IActivationLayer *liteResBlock(INetworkDefinition *network, std::map &weightMap, ITensor &input, int outch, std::string lname) { Weights emptywts{DataType::kFLOAT, nullptr, 0}; // in 256 out 64 IConvolutionLayer *conv1 = network->addConvolutionNd(input, outch, DimsHW{3, 3}, weightMap[lname + ".conv1.weight"], emptywts); assert(conv1); 
conv1->setStrideNd(DimsHW{1, 1}); conv1->setPaddingNd(DimsHW{1, 1}); debug_print(conv1->getOutput(0), lname + "_1"); IScaleLayer *bn1 = addBatchNorm2d(network, weightMap, *conv1->getOutput(0), lname + ".bn1", 1e-5); IActivationLayer *relu1 = network->addActivation(*bn1->getOutput(0), ActivationType::kRELU); assert(relu1); /// IConvolutionLayer *conv2 = network->addConvolutionNd(*relu1->getOutput(0), outch, DimsHW{3, 3}, weightMap[lname + ".conv2.weight"], emptywts); assert(conv2); conv2->setStrideNd(DimsHW{1, 1}); conv2->setPaddingNd(DimsHW{1, 1}); debug_print(conv2->getOutput(0), lname + "_2"); IScaleLayer *bn2 = addBatchNorm2d(network, weightMap, *conv2->getOutput(0), lname + ".bn2", 1e-5); IElementWiseLayer *ew1; ew1 = network->addElementWise(input, *bn2->getOutput(0), ElementWiseOperation::kSUM); debug_print(ew1->getOutput(0), lname + "_add"); IActivationLayer *relu3 = network->addActivation(*ew1->getOutput(0), ActivationType::kRELU); assert(relu3); return relu3; } ILayer *convBnAddRelu(INetworkDefinition *network, std::map &weightMap, ITensor &input, ITensor &addinput, int outch, int ksize, int s, int p, std::string convname, std::string bnname, bool bias = false) { Weights emptywts{DataType::kFLOAT, nullptr, 0}; IConvolutionLayer *conv1; //Dims dim; if (!bias) { conv1 = network->addConvolutionNd(input, outch, DimsHW{ksize, ksize}, weightMap[convname + ".weight"], emptywts); } else { conv1 = network->addConvolutionNd(input, outch, DimsHW{ksize, ksize}, weightMap[convname + ".weight"], weightMap[convname + ".bias"]); } assert(conv1); conv1->setStrideNd(DimsHW{s, s}); conv1->setPaddingNd(DimsHW{p, p}); debug_print(conv1->getOutput(0), convname); IScaleLayer *bn1 = addBatchNorm2d(network, weightMap, *conv1->getOutput(0), bnname, 1e-5); auto lr = network->addActivation(*bn1->getOutput(0), ActivationType::kRELU); debug_print(lr->getOutput(0), convname + "_add"); return lr; } ILayer *netAddUpsampleBi(INetworkDefinition *network, ITensor *input, Dims outdims) { // Bi 
+ True IResizeLayer *upSample = network->addResize(*input); upSample->setResizeMode(ResizeMode::kLINEAR); upSample->setOutputDimensions(outdims); upSample->setAlignCorners(true); // tips! return upSample; } IElementWiseLayer *convBnUpAdd(INetworkDefinition *network, std::map &weightMap, ITensor &input, ITensor &addinput, int outch, int ksize, int s, int p, std::string convname, std::string bnname, bool upsample, bool bias = false) { Weights emptywts{DataType::kFLOAT, nullptr, 0}; IConvolutionLayer *conv1; if (!bias) { conv1 = network->addConvolutionNd(input, outch, DimsHW{ksize, ksize}, weightMap[convname + ".weight"], emptywts); } else { conv1 = network->addConvolutionNd(input, outch, DimsHW{ksize, ksize}, weightMap[convname + ".weight"], weightMap[convname + ".bias"]); } assert(conv1); conv1->setStrideNd(DimsHW{s, s}); conv1->setPaddingNd(DimsHW{p, p}); debug_print(conv1->getOutput(0), convname + "_1"); IScaleLayer *bn1 = addBatchNorm2d(network, weightMap, *conv1->getOutput(0), bnname, 1e-5); if (!upsample) { IElementWiseLayer *add = network->addElementWise(*bn1->getOutput(0), addinput, ElementWiseOperation::kSUM); debug_print(add->getOutput(0), convname + "_add"); return add; } else { nvinfer1::Dims dim = addinput.getDimensions(); ILayer *up = netAddUpsampleBi(network, bn1->getOutput(0), dim); IElementWiseLayer *add = network->addElementWise(*up->getOutput(0), addinput, ElementWiseOperation::kSUM); debug_print(conv1->getOutput(0), convname + "_1"); //auto lr = network->addActivation(*add->getOutput(0), ActivationType::kRELU); return add; } } ================================================ FILE: hrnet/hrnet-semantic-segmentation/gen_wts.py ================================================ import argparse import struct import _init_paths import models import torch from config import config, update_config def parse_args(): parser = argparse.ArgumentParser(description="Train keypoints network") parser.add_argument("--cfg", help="experiment configure file name", 
type=str) parser.add_argument("--ckpt_path", help="checkpoint path", required=True, type=str) parser.add_argument("--save_path", help=".wts path", required=True, type=str) parser.add_argument( "opts", help="Modify config options using the command-line", default=None, nargs=argparse.REMAINDER, ) args = parser.parse_args() update_config(config, args) return args def main(): args = parse_args() model = eval("models." + config.MODEL.NAME + ".get_seg_model")(config) print("=> loading model from {}".format(args.ckpt_path)) pretrained_dict = torch.load(args.ckpt_path, map_location="cpu") model_dict = model.state_dict() pretrained_dict = { k[6:]: v for k, v in pretrained_dict.items() if k[6:] in model_dict.keys() } for k, _ in pretrained_dict.items(): print("=> loading {} from pretrained model".format(k)) model_dict.update(pretrained_dict) model.load_state_dict(model_dict) print("=> saving {} ".format(args.save_path)) f = open(args.save_path, "w") f.write("{}\n".format(len(model.state_dict().keys()))) for k, v in model.state_dict().items(): vr = v.reshape(-1).cpu().numpy() f.write("{} {} ".format(k, len(vr))) for vv in vr: f.write(" ") f.write(struct.pack(">f", float(vv)).hex()) f.write("\n") f.close() if __name__ == "__main__": main() ================================================ FILE: hrnet/hrnet-semantic-segmentation/hrnet.cpp ================================================ #include #include #include #include #include #include #include "common.hpp" #include "logging.h" static Logger gLogger; #define USE_FP32 #define DEVICE 0 // GPU id #define BATCH_SIZE 1 const char *INPUT_BLOB_NAME = "data"; const char *OUTPUT_BLOB_NAME = "output"; static const int INPUT_H = 512; static const int INPUT_W = 1024; static const int NUM_CLASSES = 19; static const int OUTPUT_SIZE = INPUT_H * INPUT_W; // Creat the engine using only the API and not any parser. 
ICudaEngine *createEngine(unsigned int maxBatchSize, IBuilder *builder, IBuilderConfig *config, DataType dt, std::string wtsPath, int width) { INetworkDefinition *network = builder->createNetworkV2(0U); // Create input tensor of shape {3, INPUT_H, INPUT_W} with name INPUT_BLOB_NAME ITensor *data = network->addInput(INPUT_BLOB_NAME, dt, Dims3{INPUT_H, INPUT_W, 3}); assert(data); // hwc to chw auto ps = network->addShuffle(*data); ps->setFirstTranspose(nvinfer1::Permutation{2, 0, 1}); float mean[3] = {0.485, 0.456, 0.406}; float std[3] = {0.229, 0.224, 0.225}; ITensor *preinput = MeanStd(network, ps->getOutput(0), mean, std, true); std::map weightMap = loadWeights(wtsPath); auto relu_2 = convBnRelu(network, weightMap, *preinput, 64, 3, 2, 1, "conv1", "bn1"); auto relu_5 = convBnRelu(network, weightMap, *relu_2->getOutput(0), 64, 3, 2, 1, "conv2", "bn2"); auto relu_17 = ResBlock2Conv(network, weightMap, *relu_5->getOutput(0), 64, 256, 1, "layer1.0"); auto relu_27 = ResBlock(network, weightMap, *relu_17->getOutput(0), 256, 64, 1, "layer1.1"); auto relu_37 = ResBlock(network, weightMap, *relu_27->getOutput(0), 256, 64, 1, "layer1.2"); auto relu_47 = ResBlock(network, weightMap, *relu_37->getOutput(0), 256, 64, 1, "layer1.3"); auto relu_50 = convBnRelu(network, weightMap, *relu_47->getOutput(0), width, 3, 1, 1, "transition1.0.0", "transition1.0.1"); auto relu_60 = liteResBlock(network, weightMap, *relu_50->getOutput(0), width, "stage2.0.branches.0.0"); auto relu_67 = liteResBlock(network, weightMap, *relu_60->getOutput(0), width, "stage2.0.branches.0.1"); auto relu_74 = liteResBlock(network, weightMap, *relu_67->getOutput(0), width, "stage2.0.branches.0.2"); auto relu_81 = liteResBlock(network, weightMap, *relu_74->getOutput(0), width, "stage2.0.branches.0.3"); auto relu_53 = convBnRelu(network, weightMap, *relu_47->getOutput(0), width * 2, 3, 2, 1, "transition1.1.0.0", "transition1.1.0.1"); auto relu_88 = liteResBlock(network, weightMap, *relu_53->getOutput(0), width * 
2, "stage2.0.branches.1.0"); auto relu_95 = liteResBlock(network, weightMap, *relu_88->getOutput(0), width * 2, "stage2.0.branches.1.1"); auto relu_102 = liteResBlock(network, weightMap, *relu_95->getOutput(0), width * 2, "stage2.0.branches.1.2"); auto relu_109 = liteResBlock(network, weightMap, *relu_102->getOutput(0), width * 2, "stage2.0.branches.1.3"); auto add_131 = convBnUpAdd(network, weightMap, *relu_109->getOutput(0), *relu_81->getOutput(0), width, 1, 1, 0, "stage2.0.fuse_layers.0.1.0", "stage2.0.fuse_layers.0.1.1", true); auto relu_132 = network->addActivation(*add_131->getOutput(0), ActivationType::kRELU); auto add_135 = convBnUpAdd(network, weightMap, *relu_81->getOutput(0), *relu_109->getOutput(0), width * 2, 3, 2, 1, "stage2.0.fuse_layers.1.0.0.0", "stage2.0.fuse_layers.1.0.0.1", false); auto relu_136 = network->addActivation(*add_135->getOutput(0), ActivationType::kRELU); auto relu_146 = liteResBlock(network, weightMap, *relu_132->getOutput(0), width, "stage3.0.branches.0.0"); auto relu_153 = liteResBlock(network, weightMap, *relu_146->getOutput(0), width, "stage3.0.branches.0.1"); auto relu_160 = liteResBlock(network, weightMap, *relu_153->getOutput(0), width, "stage3.0.branches.0.2"); auto relu_167 = liteResBlock(network, weightMap, *relu_160->getOutput(0), width, "stage3.0.branches.0.3"); auto relu_174 = liteResBlock(network, weightMap, *relu_136->getOutput(0), width * 2, "stage3.0.branches.1.0"); auto relu_181 = liteResBlock(network, weightMap, *relu_174->getOutput(0), width * 2, "stage3.0.branches.1.1"); auto relu_188 = liteResBlock(network, weightMap, *relu_181->getOutput(0), width * 2, "stage3.0.branches.1.2"); auto relu_195 = liteResBlock(network, weightMap, *relu_188->getOutput(0), width * 2, "stage3.0.branches.1.3"); auto relu_139 = convBnRelu(network, weightMap, *relu_136->getOutput(0), width * 4, 3, 2, 1, "transition2.2.0.0", "transition2.2.0.1"); auto relu_202 = liteResBlock(network, weightMap, *relu_139->getOutput(0), width * 4, 
"stage3.0.branches.2.0"); auto relu_209 = liteResBlock(network, weightMap, *relu_202->getOutput(0), width * 4, "stage3.0.branches.2.1"); auto relu_216 = liteResBlock(network, weightMap, *relu_209->getOutput(0), width * 4, "stage3.0.branches.2.2"); auto relu_223 = liteResBlock(network, weightMap, *relu_216->getOutput(0), width * 4, "stage3.0.branches.2.3"); auto add_245 = convBnUpAdd(network, weightMap, *relu_195->getOutput(0), *relu_167->getOutput(0), width, 1, 1, 0, "stage3.0.fuse_layers.0.1.0", "stage3.0.fuse_layers.0.1.1", true); auto add_267 = convBnUpAdd(network, weightMap, *relu_223->getOutput(0), *add_245->getOutput(0), width, 1, 1, 0, "stage3.0.fuse_layers.0.2.0", "stage3.0.fuse_layers.0.2.1", true); auto relu_268 = network->addActivation(*add_267->getOutput(0), ActivationType::kRELU); auto add_271 = convBnUpAdd(network, weightMap, *relu_167->getOutput(0), *relu_195->getOutput(0), width * 2, 3, 2, 1, "stage3.0.fuse_layers.1.0.0.0", "stage3.0.fuse_layers.1.0.0.1", false); auto add_293 = convBnUpAdd(network, weightMap, *relu_223->getOutput(0), *add_271->getOutput(0), width * 2, 1, 1, 0, "stage3.0.fuse_layers.1.2.0", "stage3.0.fuse_layers.1.2.1", true); auto relu_294 = network->addActivation(*add_293->getOutput(0), ActivationType::kRELU); auto relu_297 = convBnRelu(network, weightMap, *relu_167->getOutput(0), width, 3, 2, 1, "stage3.0.fuse_layers.2.0.0.0", "stage3.0.fuse_layers.2.0.0.1"); auto bn_299 = convBnRelu(network, weightMap, *relu_297->getOutput(0), width * 4, 3, 2, 1, "stage3.0.fuse_layers.2.0.1.0", "stage3.0.fuse_layers.2.0.1.1", false); auto add_302 = convBnUpAdd(network, weightMap, *relu_195->getOutput(0), *bn_299->getOutput(0), width * 4, 3, 2, 1, "stage3.0.fuse_layers.2.1.0.0", "stage3.0.fuse_layers.2.1.0.1", false); auto add_303 = network->addElementWise(*add_302->getOutput(0), *relu_223->getOutput(0), ElementWiseOperation::kSUM); auto relu_304 = network->addActivation(*add_303->getOutput(0), ActivationType::kRELU); auto relu_311 = 
liteResBlock(network, weightMap, *relu_268->getOutput(0), width, "stage3.1.branches.0.0"); auto relu_318 = liteResBlock(network, weightMap, *relu_311->getOutput(0), width, "stage3.1.branches.0.1"); auto relu_325 = liteResBlock(network, weightMap, *relu_318->getOutput(0), width, "stage3.1.branches.0.2"); auto relu_332 = liteResBlock(network, weightMap, *relu_325->getOutput(0), width, "stage3.1.branches.0.3"); auto relu_339 = liteResBlock(network, weightMap, *relu_294->getOutput(0), width * 2, "stage3.1.branches.1.0"); auto relu_346 = liteResBlock(network, weightMap, *relu_339->getOutput(0), width * 2, "stage3.1.branches.1.1"); auto relu_353 = liteResBlock(network, weightMap, *relu_346->getOutput(0), width * 2, "stage3.1.branches.1.2"); auto relu_360 = liteResBlock(network, weightMap, *relu_353->getOutput(0), width * 2, "stage3.1.branches.1.3"); auto relu_367 = liteResBlock(network, weightMap, *relu_304->getOutput(0), width * 4, "stage3.1.branches.2.0"); auto relu_374 = liteResBlock(network, weightMap, *relu_367->getOutput(0), width * 4, "stage3.1.branches.2.1"); auto relu_381 = liteResBlock(network, weightMap, *relu_374->getOutput(0), width * 4, "stage3.1.branches.2.2"); auto relu_388 = liteResBlock(network, weightMap, *relu_381->getOutput(0), width * 4, "stage3.1.branches.2.3"); auto add_410 = convBnUpAdd(network, weightMap, *relu_360->getOutput(0), *relu_332->getOutput(0), width, 1, 1, 0, "stage3.1.fuse_layers.0.1.0", "stage3.1.fuse_layers.0.1.1", true); auto add_432 = convBnUpAdd(network, weightMap, *relu_388->getOutput(0), *add_410->getOutput(0), width, 1, 1, 0, "stage3.1.fuse_layers.0.2.0", "stage3.1.fuse_layers.0.2.1", true); auto relu_433 = network->addActivation(*add_432->getOutput(0), ActivationType::kRELU); auto add_436 = convBnUpAdd(network, weightMap, *relu_332->getOutput(0), *relu_360->getOutput(0), width * 2, 3, 2, 1, "stage3.1.fuse_layers.1.0.0.0", "stage3.1.fuse_layers.1.0.0.1", false); auto add_458 = convBnUpAdd(network, weightMap, 
*relu_388->getOutput(0), *add_436->getOutput(0), width * 2, 1, 1, 0, "stage3.1.fuse_layers.1.2.0", "stage3.1.fuse_layers.1.2.1", true); auto relu_459 = network->addActivation(*add_458->getOutput(0), ActivationType::kRELU); auto relu_462 = convBnRelu(network, weightMap, *relu_332->getOutput(0), width, 3, 2, 1, "stage3.1.fuse_layers.2.0.0.0", "stage3.1.fuse_layers.2.0.0.1"); auto bn_464 = convBnRelu(network, weightMap, *relu_462->getOutput(0), width * 4, 3, 2, 1, "stage3.1.fuse_layers.2.0.1.0", "stage3.1.fuse_layers.2.0.1.1", false); auto add_467 = convBnUpAdd(network, weightMap, *relu_360->getOutput(0), *bn_464->getOutput(0), width * 4, 3, 2, 1, "stage3.1.fuse_layers.2.1.0.0", "stage3.1.fuse_layers.2.1.0.1", false); auto add_468 = network->addElementWise(*add_467->getOutput(0), *relu_388->getOutput(0), ElementWiseOperation::kSUM); auto relu_469 = network->addActivation(*add_468->getOutput(0), ActivationType::kRELU); auto relu_476 = liteResBlock(network, weightMap, *relu_433->getOutput(0), width, "stage3.2.branches.0.0"); auto relu_483 = liteResBlock(network, weightMap, *relu_476->getOutput(0), width, "stage3.2.branches.0.1"); auto relu_490 = liteResBlock(network, weightMap, *relu_483->getOutput(0), width, "stage3.2.branches.0.2"); auto relu_497 = liteResBlock(network, weightMap, *relu_490->getOutput(0), width, "stage3.2.branches.0.3"); auto relu_504 = liteResBlock(network, weightMap, *relu_459->getOutput(0), width * 2, "stage3.2.branches.1.0"); auto relu_511 = liteResBlock(network, weightMap, *relu_504->getOutput(0), width * 2, "stage3.2.branches.1.1"); auto relu_518 = liteResBlock(network, weightMap, *relu_511->getOutput(0), width * 2, "stage3.2.branches.1.2"); auto relu_525 = liteResBlock(network, weightMap, *relu_518->getOutput(0), width * 2, "stage3.2.branches.1.3"); auto relu_532 = liteResBlock(network, weightMap, *relu_469->getOutput(0), width * 4, "stage3.2.branches.2.0"); auto relu_539 = liteResBlock(network, weightMap, *relu_532->getOutput(0), width * 4, 
"stage3.2.branches.2.1"); auto relu_546 = liteResBlock(network, weightMap, *relu_539->getOutput(0), width * 4, "stage3.2.branches.2.2"); auto relu_553 = liteResBlock(network, weightMap, *relu_546->getOutput(0), width * 4, "stage3.2.branches.2.3"); auto add_575 = convBnUpAdd(network, weightMap, *relu_525->getOutput(0), *relu_497->getOutput(0), width, 1, 1, 0, "stage3.2.fuse_layers.0.1.0", "stage3.2.fuse_layers.0.1.1", true); auto add_597 = convBnUpAdd(network, weightMap, *relu_553->getOutput(0), *add_575->getOutput(0), width, 1, 1, 0, "stage3.2.fuse_layers.0.2.0", "stage3.2.fuse_layers.0.2.1", true); auto relu_598 = network->addActivation(*add_597->getOutput(0), ActivationType::kRELU); auto add_601 = convBnUpAdd(network, weightMap, *relu_497->getOutput(0), *relu_525->getOutput(0), width * 2, 3, 2, 1, "stage3.2.fuse_layers.1.0.0.0", "stage3.2.fuse_layers.1.0.0.1", false); auto add_623 = convBnUpAdd(network, weightMap, *relu_553->getOutput(0), *add_601->getOutput(0), width * 2, 1, 1, 0, "stage3.2.fuse_layers.1.2.0", "stage3.2.fuse_layers.1.2.1", true); auto relu_624 = network->addActivation(*add_623->getOutput(0), ActivationType::kRELU); auto relu_627 = convBnRelu(network, weightMap, *relu_497->getOutput(0), width, 3, 2, 1, "stage3.2.fuse_layers.2.0.0.0", "stage3.2.fuse_layers.2.0.0.1"); auto bn_629 = convBnRelu(network, weightMap, *relu_627->getOutput(0), width * 4, 3, 2, 1, "stage3.2.fuse_layers.2.0.1.0", "stage3.2.fuse_layers.2.0.1.1", false); auto add_632 = convBnUpAdd(network, weightMap, *relu_525->getOutput(0), *bn_629->getOutput(0), width * 4, 3, 2, 1, "stage3.2.fuse_layers.2.1.0.0", "stage3.2.fuse_layers.2.1.0.1", false); auto add_633 = network->addElementWise(*relu_553->getOutput(0), *add_632->getOutput(0), ElementWiseOperation::kSUM); auto relu_634 = network->addActivation(*add_633->getOutput(0), ActivationType::kRELU); auto relu_641 = liteResBlock(network, weightMap, *relu_598->getOutput(0), width, "stage3.3.branches.0.0"); auto relu_648 = 
liteResBlock(network, weightMap, *relu_641->getOutput(0), width, "stage3.3.branches.0.1"); auto relu_655 = liteResBlock(network, weightMap, *relu_648->getOutput(0), width, "stage3.3.branches.0.2"); auto relu_662 = liteResBlock(network, weightMap, *relu_655->getOutput(0), width, "stage3.3.branches.0.3"); auto relu_669 = liteResBlock(network, weightMap, *relu_624->getOutput(0), width * 2, "stage3.3.branches.1.0"); auto relu_676 = liteResBlock(network, weightMap, *relu_669->getOutput(0), width * 2, "stage3.3.branches.1.1"); auto relu_683 = liteResBlock(network, weightMap, *relu_676->getOutput(0), width * 2, "stage3.3.branches.1.2"); auto relu_690 = liteResBlock(network, weightMap, *relu_683->getOutput(0), width * 2, "stage3.3.branches.1.3"); auto relu_697 = liteResBlock(network, weightMap, *relu_634->getOutput(0), width * 4, "stage3.3.branches.2.0"); auto relu_704 = liteResBlock(network, weightMap, *relu_697->getOutput(0), width * 4, "stage3.3.branches.2.1"); auto relu_711 = liteResBlock(network, weightMap, *relu_704->getOutput(0), width * 4, "stage3.3.branches.2.2"); auto relu_718 = liteResBlock(network, weightMap, *relu_711->getOutput(0), width * 4, "stage3.3.branches.2.3"); auto add_740 = convBnUpAdd(network, weightMap, *relu_690->getOutput(0), *relu_662->getOutput(0), width, 1, 1, 0, "stage3.3.fuse_layers.0.1.0", "stage3.3.fuse_layers.0.1.1", true); auto add_762 = convBnUpAdd(network, weightMap, *relu_718->getOutput(0), *add_740->getOutput(0), width, 1, 1, 0, "stage3.3.fuse_layers.0.2.0", "stage3.3.fuse_layers.0.2.1", true); auto relu_763 = network->addActivation(*add_762->getOutput(0), ActivationType::kRELU); auto add_766 = convBnUpAdd(network, weightMap, *relu_662->getOutput(0), *relu_690->getOutput(0), width * 2, 3, 2, 1, "stage3.3.fuse_layers.1.0.0.0", "stage3.3.fuse_layers.1.0.0.1", false); auto add_788 = convBnUpAdd(network, weightMap, *relu_718->getOutput(0), *add_766->getOutput(0), width * 2, 1, 1, 0, "stage3.3.fuse_layers.1.2.0", 
"stage3.3.fuse_layers.1.2.1", true); auto relu_789 = network->addActivation(*add_788->getOutput(0), ActivationType::kRELU); auto relu_792 = convBnRelu(network, weightMap, *relu_662->getOutput(0), width, 3, 2, 1, "stage3.3.fuse_layers.2.0.0.0", "stage3.3.fuse_layers.2.0.0.1"); auto bn_794 = convBnRelu(network, weightMap, *relu_792->getOutput(0), width * 4, 3, 2, 1, "stage3.3.fuse_layers.2.0.1.0", "stage3.3.fuse_layers.2.0.1.1", false); auto add_797 = convBnUpAdd(network, weightMap, *relu_690->getOutput(0), *bn_794->getOutput(0), width * 4, 3, 2, 1, "stage3.3.fuse_layers.2.1.0.0", "stage3.3.fuse_layers.2.1.0.1", false); auto add_798 = network->addElementWise(*relu_718->getOutput(0), *add_797->getOutput(0), ElementWiseOperation::kSUM); auto relu_799 = network->addActivation(*add_798->getOutput(0), ActivationType::kRELU); auto relu_809 = liteResBlock(network, weightMap, *relu_763->getOutput(0), width, "stage4.0.branches.0.0"); auto relu_816 = liteResBlock(network, weightMap, *relu_809->getOutput(0), width, "stage4.0.branches.0.1"); auto relu_823 = liteResBlock(network, weightMap, *relu_816->getOutput(0), width, "stage4.0.branches.0.2"); auto relu_830 = liteResBlock(network, weightMap, *relu_823->getOutput(0), width, "stage4.0.branches.0.3"); auto relu_837 = liteResBlock(network, weightMap, *relu_789->getOutput(0), width * 2, "stage4.0.branches.1.0"); auto relu_844 = liteResBlock(network, weightMap, *relu_837->getOutput(0), width * 2, "stage4.0.branches.1.1"); auto relu_851 = liteResBlock(network, weightMap, *relu_844->getOutput(0), width * 2, "stage4.0.branches.1.2"); auto relu_858 = liteResBlock(network, weightMap, *relu_851->getOutput(0), width * 2, "stage4.0.branches.1.3"); auto relu_865 = liteResBlock(network, weightMap, *relu_799->getOutput(0), width * 4, "stage4.0.branches.2.0"); auto relu_872 = liteResBlock(network, weightMap, *relu_865->getOutput(0), width * 4, "stage4.0.branches.2.1"); auto relu_879 = liteResBlock(network, weightMap, *relu_872->getOutput(0), 
width * 4, "stage4.0.branches.2.2"); auto relu_886 = liteResBlock(network, weightMap, *relu_879->getOutput(0), width * 4, "stage4.0.branches.2.3"); //======== auto relu_802 = convBnRelu(network, weightMap, *relu_799->getOutput(0), width * 8, 3, 2, 1, "transition3.3.0.0", "transition3.3.0.1"); auto relu_893 = liteResBlock(network, weightMap, *relu_802->getOutput(0), width * 8, "stage4.0.branches.3.0"); auto relu_900 = liteResBlock(network, weightMap, *relu_893->getOutput(0), width * 8, "stage4.0.branches.3.1"); auto relu_907 = liteResBlock(network, weightMap, *relu_900->getOutput(0), width * 8, "stage4.0.branches.3.2"); auto relu_914 = liteResBlock(network, weightMap, *relu_907->getOutput(0), width * 8, "stage4.0.branches.3.3"); auto add_936 = convBnUpAdd(network, weightMap, *relu_858->getOutput(0), *relu_830->getOutput(0), width, 1, 1, 0, "stage4.0.fuse_layers.0.1.0", "stage4.0.fuse_layers.0.1.1", true); auto add_958 = convBnUpAdd(network, weightMap, *relu_886->getOutput(0), *add_936->getOutput(0), width, 1, 1, 0, "stage4.0.fuse_layers.0.2.0", "stage4.0.fuse_layers.0.2.1", true); auto add_980 = convBnUpAdd(network, weightMap, *relu_914->getOutput(0), *add_958->getOutput(0), width, 1, 1, 0, "stage4.0.fuse_layers.0.3.0", "stage4.0.fuse_layers.0.3.1", true); auto relu_981 = network->addActivation(*add_980->getOutput(0), ActivationType::kRELU); auto add_984 = convBnUpAdd(network, weightMap, *relu_830->getOutput(0), *relu_858->getOutput(0), width * 2, 3, 2, 1, "stage4.0.fuse_layers.1.0.0.0", "stage4.0.fuse_layers.1.0.0.1", false); auto add_1006 = convBnUpAdd(network, weightMap, *relu_886->getOutput(0), *add_984->getOutput(0), width * 2, 1, 1, 0, "stage4.0.fuse_layers.1.2.0", "stage4.0.fuse_layers.1.2.1", true); auto add_1028 = convBnUpAdd(network, weightMap, *relu_914->getOutput(0), *add_1006->getOutput(0), width * 2, 1, 1, 0, "stage4.0.fuse_layers.1.3.0", "stage4.0.fuse_layers.1.3.1", true); auto relu_1029 = network->addActivation(*add_1028->getOutput(0), 
ActivationType::kRELU); auto relu_1032 = convBnRelu(network, weightMap, *relu_830->getOutput(0), width, 3, 2, 1, "stage4.0.fuse_layers.2.0.0.0", "stage4.0.fuse_layers.2.0.0.1"); auto bn_1034 = convBnRelu(network, weightMap, *relu_1032->getOutput(0), width * 4, 3, 2, 1, "stage4.0.fuse_layers.2.0.1.0", "stage4.0.fuse_layers.2.0.1.1", false); auto add_1037 = convBnUpAdd(network, weightMap, *relu_858->getOutput(0), *bn_1034->getOutput(0), width * 4, 3, 2, 1, "stage4.0.fuse_layers.2.1.0.0", "stage4.0.fuse_layers.2.1.0.1", false); auto add_1038 = network->addElementWise(*relu_886->getOutput(0), *add_1037->getOutput(0), ElementWiseOperation::kSUM); auto add_1060 = convBnUpAdd(network, weightMap, *relu_914->getOutput(0), *add_1038->getOutput(0), width * 4, 1, 1, 0, "stage4.0.fuse_layers.2.3.0", "stage4.0.fuse_layers.2.3.1", true); auto relu_1061 = network->addActivation(*add_1060->getOutput(0), ActivationType::kRELU); auto relu_1064 = convBnRelu(network, weightMap, *relu_830->getOutput(0), width, 3, 2, 1, "stage4.0.fuse_layers.3.0.0.0", "stage4.0.fuse_layers.3.0.0.1"); auto relu_1067 = convBnRelu(network, weightMap, *relu_1064->getOutput(0), width, 3, 2, 1, "stage4.0.fuse_layers.3.0.1.0", "stage4.0.fuse_layers.3.0.1.1"); auto bn_1069 = convBnRelu(network, weightMap, *relu_1067->getOutput(0), width * 8, 3, 2, 1, "stage4.0.fuse_layers.3.0.2.0", "stage4.0.fuse_layers.3.0.2.1", false); auto relu_1072 = convBnRelu(network, weightMap, *relu_858->getOutput(0), width * 2, 3, 2, 1, "stage4.0.fuse_layers.3.1.0.0", "stage4.0.fuse_layers.3.1.0.1"); auto add_1075 = convBnUpAdd(network, weightMap, *relu_1072->getOutput(0), *bn_1069->getOutput(0), width * 8, 3, 2, 1, "stage4.0.fuse_layers.3.1.1.0", "stage4.0.fuse_layers.3.1.1.1", false); auto add_1078 = convBnUpAdd(network, weightMap, *relu_886->getOutput(0), *add_1075->getOutput(0), width * 8, 3, 2, 1, "stage4.0.fuse_layers.3.2.0.0", "stage4.0.fuse_layers.3.2.0.1", false); auto add_1079 = network->addElementWise(*relu_914->getOutput(0), 
*add_1078->getOutput(0), ElementWiseOperation::kSUM); auto relu_1080 = network->addActivation(*add_1079->getOutput(0), ActivationType::kRELU); auto relu_1087 = liteResBlock(network, weightMap, *relu_981->getOutput(0), width, "stage4.1.branches.0.0"); auto relu_1094 = liteResBlock(network, weightMap, *relu_1087->getOutput(0), width, "stage4.1.branches.0.1"); auto relu_1101 = liteResBlock(network, weightMap, *relu_1094->getOutput(0), width, "stage4.1.branches.0.2"); auto relu_1108 = liteResBlock(network, weightMap, *relu_1101->getOutput(0), width, "stage4.1.branches.0.3"); auto relu_1115 = liteResBlock(network, weightMap, *relu_1029->getOutput(0), width * 2, "stage4.1.branches.1.0"); auto relu_1122 = liteResBlock(network, weightMap, *relu_1115->getOutput(0), width * 2, "stage4.1.branches.1.1"); auto relu_1129 = liteResBlock(network, weightMap, *relu_1122->getOutput(0), width * 2, "stage4.1.branches.1.2"); auto relu_1136 = liteResBlock(network, weightMap, *relu_1129->getOutput(0), width * 2, "stage4.1.branches.1.3"); auto relu_1143 = liteResBlock(network, weightMap, *relu_1061->getOutput(0), width * 4, "stage4.1.branches.2.0"); auto relu_1150 = liteResBlock(network, weightMap, *relu_1143->getOutput(0), width * 4, "stage4.1.branches.2.1"); auto relu_1157 = liteResBlock(network, weightMap, *relu_1150->getOutput(0), width * 4, "stage4.1.branches.2.2"); auto relu_1164 = liteResBlock(network, weightMap, *relu_1157->getOutput(0), width * 4, "stage4.1.branches.2.3"); auto relu_1171 = liteResBlock(network, weightMap, *relu_1080->getOutput(0), width * 8, "stage4.1.branches.3.0"); auto relu_1178 = liteResBlock(network, weightMap, *relu_1171->getOutput(0), width * 8, "stage4.1.branches.3.1"); auto relu_1185 = liteResBlock(network, weightMap, *relu_1178->getOutput(0), width * 8, "stage4.1.branches.3.2"); auto relu_1192 = liteResBlock(network, weightMap, *relu_1185->getOutput(0), width * 8, "stage4.1.branches.3.3"); auto add_1214 = convBnUpAdd(network, weightMap, 
*relu_1136->getOutput(0), *relu_1108->getOutput(0), width, 1, 1, 0, "stage4.1.fuse_layers.0.1.0", "stage4.1.fuse_layers.0.1.1", true); auto add_1236 = convBnUpAdd(network, weightMap, *relu_1164->getOutput(0), *add_1214->getOutput(0), width, 1, 1, 0, "stage4.1.fuse_layers.0.2.0", "stage4.1.fuse_layers.0.2.1", true); auto add_1258 = convBnUpAdd(network, weightMap, *relu_1192->getOutput(0), *add_1236->getOutput(0), width, 1, 1, 0, "stage4.1.fuse_layers.0.3.0", "stage4.1.fuse_layers.0.3.1", true); auto relu_1259 = network->addActivation(*add_1258->getOutput(0), ActivationType::kRELU); auto add_1262 = convBnUpAdd(network, weightMap, *relu_1108->getOutput(0), *relu_1136->getOutput(0), width * 2, 3, 2, 1, "stage4.1.fuse_layers.1.0.0.0", "stage4.1.fuse_layers.1.0.0.1", false); auto add_1284 = convBnUpAdd(network, weightMap, *relu_1164->getOutput(0), *add_1262->getOutput(0), width * 2, 1, 1, 0, "stage4.1.fuse_layers.1.2.0", "stage4.1.fuse_layers.1.2.1", true); auto add_1306 = convBnUpAdd(network, weightMap, *relu_1192->getOutput(0), *add_1284->getOutput(0), width * 2, 1, 1, 0, "stage4.1.fuse_layers.1.3.0", "stage4.1.fuse_layers.1.3.1", true); auto relu_1307 = network->addActivation(*add_1306->getOutput(0), ActivationType::kRELU); auto relu_1310 = convBnRelu(network, weightMap, *relu_1108->getOutput(0), width, 3, 2, 1, "stage4.1.fuse_layers.2.0.0.0", "stage4.1.fuse_layers.2.0.0.1"); auto bn_1312 = convBnRelu(network, weightMap, *relu_1310->getOutput(0), width * 4, 3, 2, 1, "stage4.1.fuse_layers.2.0.1.0", "stage4.1.fuse_layers.2.0.1.1", false); auto add_1315 = convBnUpAdd(network, weightMap, *relu_1136->getOutput(0), *bn_1312->getOutput(0), width * 4, 3, 2, 1, "stage4.1.fuse_layers.2.1.0.0", "stage4.1.fuse_layers.2.1.0.1", false); auto add_1316 = network->addElementWise(*relu_1164->getOutput(0), *add_1315->getOutput(0), ElementWiseOperation::kSUM); auto add_1338 = convBnUpAdd(network, weightMap, *relu_1192->getOutput(0), *add_1316->getOutput(0), width * 4, 1, 1, 0, 
"stage4.1.fuse_layers.2.3.0", "stage4.1.fuse_layers.2.3.1", true); auto relu_1339 = network->addActivation(*add_1338->getOutput(0), ActivationType::kRELU); auto relu_1342 = convBnRelu(network, weightMap, *relu_1108->getOutput(0), width, 3, 2, 1, "stage4.1.fuse_layers.3.0.0.0", "stage4.1.fuse_layers.3.0.0.1"); auto relu_1345 = convBnRelu(network, weightMap, *relu_1342->getOutput(0), width, 3, 2, 1, "stage4.1.fuse_layers.3.0.1.0", "stage4.1.fuse_layers.3.0.1.1"); auto bn_1347 = convBnRelu(network, weightMap, *relu_1345->getOutput(0), width * 8, 3, 2, 1, "stage4.1.fuse_layers.3.0.2.0", "stage4.1.fuse_layers.3.0.2.1", false); auto relu_1350 = convBnRelu(network, weightMap, *relu_1136->getOutput(0), width * 2, 3, 2, 1, "stage4.1.fuse_layers.3.1.0.0", "stage4.1.fuse_layers.3.1.0.1"); auto add_1353 = convBnUpAdd(network, weightMap, *relu_1350->getOutput(0), *bn_1347->getOutput(0), width * 8, 3, 2, 1, "stage4.1.fuse_layers.3.1.1.0", "stage4.1.fuse_layers.3.1.1.1", false); auto add_1356 = convBnUpAdd(network, weightMap, *relu_1164->getOutput(0), *add_1353->getOutput(0), width * 8, 3, 2, 1, "stage4.1.fuse_layers.3.2.0.0", "stage4.1.fuse_layers.3.2.0.1", false); auto add_1357 = network->addElementWise(*relu_1192->getOutput(0), *add_1356->getOutput(0), ElementWiseOperation::kSUM); auto relu_1358 = network->addActivation(*add_1357->getOutput(0), ActivationType::kRELU); auto relu_1365 = liteResBlock(network, weightMap, *relu_1259->getOutput(0), width, "stage4.2.branches.0.0"); auto relu_1372 = liteResBlock(network, weightMap, *relu_1365->getOutput(0), width, "stage4.2.branches.0.1"); auto relu_1379 = liteResBlock(network, weightMap, *relu_1372->getOutput(0), width, "stage4.2.branches.0.2"); auto relu_1386 = liteResBlock(network, weightMap, *relu_1379->getOutput(0), width, "stage4.2.branches.0.3"); auto relu_1393 = liteResBlock(network, weightMap, *relu_1307->getOutput(0), width * 2, "stage4.2.branches.1.0"); auto relu_1400 = liteResBlock(network, weightMap, 
*relu_1393->getOutput(0), width * 2, "stage4.2.branches.1.1"); auto relu_1407 = liteResBlock(network, weightMap, *relu_1400->getOutput(0), width * 2, "stage4.2.branches.1.2"); auto relu_1414 = liteResBlock(network, weightMap, *relu_1407->getOutput(0), width * 2, "stage4.2.branches.1.3"); auto relu_1421 = liteResBlock(network, weightMap, *relu_1339->getOutput(0), width * 4, "stage4.2.branches.2.0"); auto relu_1428 = liteResBlock(network, weightMap, *relu_1421->getOutput(0), width * 4, "stage4.2.branches.2.1"); auto relu_1435 = liteResBlock(network, weightMap, *relu_1428->getOutput(0), width * 4, "stage4.2.branches.2.2"); auto relu_1442 = liteResBlock(network, weightMap, *relu_1435->getOutput(0), width * 4, "stage4.2.branches.2.3"); auto relu_1449 = liteResBlock(network, weightMap, *relu_1358->getOutput(0), width * 8, "stage4.2.branches.3.0"); auto relu_1456 = liteResBlock(network, weightMap, *relu_1449->getOutput(0), width * 8, "stage4.2.branches.3.1"); auto relu_1463 = liteResBlock(network, weightMap, *relu_1456->getOutput(0), width * 8, "stage4.2.branches.3.2"); auto relu_1470 = liteResBlock(network, weightMap, *relu_1463->getOutput(0), width * 8, "stage4.2.branches.3.3"); auto add_1492 = convBnUpAdd(network, weightMap, *relu_1414->getOutput(0), *relu_1386->getOutput(0), width, 1, 1, 0, "stage4.2.fuse_layers.0.1.0", "stage4.2.fuse_layers.0.1.1", true); auto add_1514 = convBnUpAdd(network, weightMap, *relu_1442->getOutput(0), *add_1492->getOutput(0), width, 1, 1, 0, "stage4.2.fuse_layers.0.2.0", "stage4.2.fuse_layers.0.2.1", true); auto add_1536 = convBnUpAdd(network, weightMap, *relu_1470->getOutput(0), *add_1514->getOutput(0), width, 1, 1, 0, "stage4.2.fuse_layers.0.3.0", "stage4.2.fuse_layers.0.3.1", true); auto relu_1537 = network->addActivation(*add_1536->getOutput(0), ActivationType::kRELU); auto add_1540 = convBnUpAdd(network, weightMap, *relu_1386->getOutput(0), *relu_1414->getOutput(0), width * 2, 3, 2, 1, "stage4.2.fuse_layers.1.0.0.0", 
"stage4.2.fuse_layers.1.0.0.1", false); auto add_1562 = convBnUpAdd(network, weightMap, *relu_1442->getOutput(0), *add_1540->getOutput(0), width * 2, 1, 1, 0, "stage4.2.fuse_layers.1.2.0", "stage4.2.fuse_layers.1.2.1", true); auto add_1584 = convBnUpAdd(network, weightMap, *relu_1470->getOutput(0), *add_1562->getOutput(0), width * 2, 1, 1, 0, "stage4.2.fuse_layers.1.3.0", "stage4.2.fuse_layers.1.3.1", true); auto relu_1585 = network->addActivation(*add_1584->getOutput(0), ActivationType::kRELU); auto relu_1588 = convBnRelu(network, weightMap, *relu_1386->getOutput(0), width, 3, 2, 1, "stage4.2.fuse_layers.2.0.0.0", "stage4.2.fuse_layers.2.0.0.1"); auto bn_1590 = convBnRelu(network, weightMap, *relu_1588->getOutput(0), width * 4, 3, 2, 1, "stage4.2.fuse_layers.2.0.1.0", "stage4.2.fuse_layers.2.0.1.1", false); auto add_1593 = convBnUpAdd(network, weightMap, *relu_1414->getOutput(0), *bn_1590->getOutput(0), width * 4, 3, 2, 1, "stage4.2.fuse_layers.2.1.0.0", "stage4.2.fuse_layers.2.1.0.1", false); auto add_1594 = network->addElementWise(*relu_1442->getOutput(0), *add_1593->getOutput(0), ElementWiseOperation::kSUM); auto add_1616 = convBnUpAdd(network, weightMap, *relu_1470->getOutput(0), *add_1594->getOutput(0), width * 4, 1, 1, 0, "stage4.2.fuse_layers.2.3.0", "stage4.2.fuse_layers.2.3.1", true); auto relu_1617 = network->addActivation(*add_1616->getOutput(0), ActivationType::kRELU); auto relu_1620 = convBnRelu(network, weightMap, *relu_1386->getOutput(0), width, 3, 2, 1, "stage4.2.fuse_layers.3.0.0.0", "stage4.2.fuse_layers.3.0.0.1"); auto relu_1623 = convBnRelu(network, weightMap, *relu_1620->getOutput(0), width, 3, 2, 1, "stage4.2.fuse_layers.3.0.1.0", "stage4.2.fuse_layers.3.0.1.1"); auto bn_1625 = convBnRelu(network, weightMap, *relu_1623->getOutput(0), width * 8, 3, 2, 1, "stage4.2.fuse_layers.3.0.2.0", "stage4.2.fuse_layers.3.0.2.1", false); auto relu_1628 = convBnRelu(network, weightMap, *relu_1414->getOutput(0), width * 2, 3, 2, 1, 
"stage4.2.fuse_layers.3.1.0.0", "stage4.2.fuse_layers.3.1.0.1"); auto add_1631 = convBnUpAdd(network, weightMap, *relu_1628->getOutput(0), *bn_1625->getOutput(0), width * 8, 3, 2, 1, "stage4.2.fuse_layers.3.1.1.0", "stage4.2.fuse_layers.3.1.1.1", false); auto add_1634 = convBnUpAdd(network, weightMap, *relu_1442->getOutput(0), *add_1631->getOutput(0), width * 8, 3, 2, 1, "stage4.2.fuse_layers.3.2.0.0", "stage4.2.fuse_layers.3.2.0.1", false); auto add_1635 = network->addElementWise(*relu_1470->getOutput(0), *add_1634->getOutput(0), ElementWiseOperation::kSUM); auto relu_1636 = network->addActivation(*add_1635->getOutput(0), ActivationType::kRELU); nvinfer1::Dims dim = relu_1537->getOutput(0)->getDimensions(); dim.d[0] = relu_1585->getOutput(0)->getDimensions().d[0]; auto resize_1655 = netAddUpsampleBi(network, relu_1585->getOutput(0), dim); dim.d[0] = relu_1617->getOutput(0)->getDimensions().d[0]; auto resize_1668 = netAddUpsampleBi(network, relu_1617->getOutput(0), dim); dim.d[0] = relu_1636->getOutput(0)->getDimensions().d[0]; auto resize_1681 = netAddUpsampleBi(network, relu_1636->getOutput(0), dim); ITensor *concatTensors[] = {relu_1537->getOutput(0), resize_1655->getOutput(0), resize_1668->getOutput(0), resize_1681->getOutput(0)}; auto concat_1682 = network->addConcatenation(concatTensors, 4); concat_1682->setAxis(0); auto relu_1685 = convBnRelu(network, weightMap, *concat_1682->getOutput(0), width * 15, 1, 1, 0, "last_layer.0", "last_layer.1", true, true); auto conv_1686 = network->addConvolutionNd(*relu_1685->getOutput(0), NUM_CLASSES, DimsHW{1, 1}, weightMap["last_layer.3.weight"], weightMap["last_layer.3.bias"]); conv_1686->setStrideNd(DimsHW{1, 1}); conv_1686->setPaddingNd(DimsHW{0, 0}); debug_print(conv_1686->getOutput(0), "conv_1686"); dim.d[0] = NUM_CLASSES; dim.d[1] = INPUT_H; dim.d[2] = INPUT_W; auto feature_map = netAddUpsampleBi(network, conv_1686->getOutput(0), dim); debug_print(feature_map->getOutput(0), "feature_map"); auto topk = 
network->addTopK(*feature_map->getOutput(0), TopKOperation::kMAX, 1, 0X01); debug_print(topk->getOutput(0), "topk"); std::cout << "set name out" << std::endl; // topk->getOutput(1) 1 is index topk->getOutput(1)->setName(OUTPUT_BLOB_NAME); network->markOutput(*topk->getOutput(1)); builder->setMaxBatchSize(maxBatchSize); config->setMaxWorkspaceSize((1 << 30)); // 1G #ifdef USE_FP16 std::cout << "use fp16" << std::endl; config->setFlag(BuilderFlag::kFP16); #endif ICudaEngine *engine = builder->buildEngineWithConfig(*network, *config); std::cout << "build success!" << std::endl; network->destroy(); for (auto &mem : weightMap) { free((void *)(mem.second.values)); } return engine; } void APIToModel(unsigned int maxBatchSize, IHostMemory **modelStream, std::string wtsPath, int width) { IBuilder *builder = createInferBuilder(gLogger); IBuilderConfig *config = builder->createBuilderConfig(); ICudaEngine *engine = createEngine(maxBatchSize, builder, config, DataType::kFLOAT, wtsPath, width); assert(engine != nullptr); (*modelStream) = engine->serialize(); engine->destroy(); builder->destroy(); } bool parse_args(int argc, char **argv, std::string &wts, std::string &engine, int &width, std::string &img_dir) { if (std::string(argv[1]) == "-s" && argc == 5) { wts = std::string(argv[2]); engine = std::string(argv[3]); width = std::stoi(argv[4]); } else if (std::string(argv[1]) == "-d" && argc == 4) { engine = std::string(argv[2]); img_dir = std::string(argv[3]); } else { return false; } return true; } void doInference(IExecutionContext &context, cudaStream_t &stream, void **buffers, int batchSize) { context.enqueue(batchSize, buffers, stream, nullptr); cudaStreamSynchronize(stream); cudaDeviceSynchronize(); } int main(int argc, char **argv) { cudaSetDevice(DEVICE); std::string wtsPath = ""; std::string engine_name = ""; int width; std::string img_dir; // parse args if (!parse_args(argc, argv, wtsPath, engine_name, width, img_dir)) { std::cerr << "arguments not right!" 
<< std::endl; std::cerr << "./hrnet -s [.wts] [.engine] [18 or 32 or 48] // serialize model to plan file" << std::endl; std::cerr << "./hrnet -d [.engine] ../samples // deserialize plan file and run inference" << std::endl; return -1; } // create a model using the API directly and serialize it to a stream if (!wtsPath.empty()) { IHostMemory *modelStream{nullptr}; APIToModel(BATCH_SIZE, &modelStream, wtsPath, width); assert(modelStream != nullptr); std::ofstream p(engine_name, std::ios::binary); if (!p) { std::cerr << "could not open plan output file" << std::endl; return -1; } p.write(reinterpret_cast(modelStream->data()), modelStream->size()); modelStream->destroy(); return 0; } // deserialize the .engine and run inference char *trtModelStream{nullptr}; size_t size{0}; std::ifstream file(engine_name, std::ios::binary); if (file.good()) { file.seekg(0, file.end); size = file.tellg(); file.seekg(0, file.beg); trtModelStream = new char[size]; assert(trtModelStream); file.read(trtModelStream, size); file.close(); } else { std::cerr << "could not open plan file" << std::endl; } std::vector file_names; if (read_files_in_dir(img_dir.c_str(), file_names) < 0) { std::cout << "read_files_in_dir failed." << std::endl; return -1; } // prepare input data --------------------------- cudaSetDeviceFlags(cudaDeviceMapHost); float *data; int *prob; // using int. output is index CHECK(cudaHostAlloc((void **)&data, BATCH_SIZE * 3 * INPUT_H * INPUT_W * sizeof(float), cudaHostAllocMapped)); CHECK(cudaHostAlloc((void **)&prob, BATCH_SIZE * OUTPUT_SIZE * sizeof(int), cudaHostAllocMapped)); IRuntime *runtime = createInferRuntime(gLogger); assert(runtime != nullptr); ICudaEngine *engine = runtime->deserializeCudaEngine(trtModelStream, size); assert(engine != nullptr); IExecutionContext *context = engine->createExecutionContext(); assert(context != nullptr); delete[] trtModelStream; void *buffers[2]; // In order to bind the buffers, we need to know the names of the input and output tensors. 
// Note that indices are guaranteed to be less than IEngine::getNbBindings() const int inputIndex = engine->getBindingIndex(INPUT_BLOB_NAME); const int outputIndex = engine->getBindingIndex(OUTPUT_BLOB_NAME); assert(inputIndex == 0); assert(outputIndex == 1); cudaStream_t stream; CHECK(cudaStreamCreate(&stream)); for (int f = 0; f < (int)file_names.size(); f++) { std::cout << file_names[f] << std::endl; cv::Mat pr_img; cv::Mat img_BGR = cv::imread(img_dir + "/" + file_names[f], 1); // BGR cv::Mat img; cv::cvtColor(img_BGR, img, cv::COLOR_BGR2RGB); if (img.empty()) continue; cv::resize(img, pr_img, cv::Size(INPUT_W, INPUT_H)); img = pr_img.clone(); // for img show pr_img.convertTo(pr_img, CV_32FC3); if (!pr_img.isContinuous()) { pr_img = pr_img.clone(); } std::memcpy(data, pr_img.data, BATCH_SIZE * 3 * INPUT_W * INPUT_H * sizeof(float)); cudaHostGetDevicePointer((void **)&buffers[inputIndex], (void *)data, 0); // buffers[inputIndex]-->data cudaHostGetDevicePointer((void **)&buffers[outputIndex], (void *)prob, 0); // buffers[outputIndex] --> prob // Run inference auto start = std::chrono::high_resolution_clock::now(); doInference(*context, stream, buffers, BATCH_SIZE); auto end = std::chrono::high_resolution_clock::now(); std::cout << "infer time: " << std::chrono::duration_cast(end - start).count() << "ms" << std::endl; cv::Mat outimg(INPUT_H, INPUT_W, CV_8UC1); for (int row = 0; row < INPUT_H; ++row) { uchar *uc_pixel = outimg.data + row * outimg.step; for (int col = 0; col < INPUT_W; ++col) { uc_pixel[col] = (uchar)prob[row * INPUT_W + col]; } } cv::Mat im_color; cv::cvtColor(outimg, im_color, cv::COLOR_GRAY2RGB); cv::Mat lut = createLTU(NUM_CLASSES); cv::LUT(im_color, lut, im_color); // false color cv::cvtColor(im_color, im_color, cv::COLOR_RGB2GRAY); cv::applyColorMap(im_color, im_color, cv::COLORMAP_HOT); // cv::imshow("False Color Map", im_color); cv::imwrite(std::to_string(f) + "_false_color_map.png", im_color); //fusion cv::Mat fusionImg; 
cv::addWeighted(img, 1, im_color, 0.8, 1, fusionImg); // cv::imshow("Fusion Img", fusionImg); // cv::waitKey(0); cv::imwrite(std::to_string(f) + "_fusion_img.png", fusionImg); } // Release stream and buffers cudaStreamDestroy(stream); CHECK(cudaFreeHost(buffers[inputIndex])); CHECK(cudaFreeHost(buffers[outputIndex])); // Destroy the engine context->destroy(); engine->destroy(); runtime->destroy(); return 0; } ================================================ FILE: hrnet/hrnet-semantic-segmentation/hrnet_ocr.cpp ================================================ #include #include #include #include #include #include #include "common.hpp" #include "logging.h" static Logger gLogger; #define USE_FP32 #define DEVICE 0 // GPU id #define BATCH_SIZE 1 // const char *INPUT_BLOB_NAME = "data"; const char *OUTPUT_BLOB_NAME = "output"; static const int INPUT_H = 512; static const int INPUT_W = 1024; static const int NUM_CLASSES = 19; static const int OUTPUT_SIZE = INPUT_H * INPUT_W; // Creat the engine using only the API and not any parser. 
ICudaEngine *createEngine(unsigned int maxBatchSize, IBuilder *builder, IBuilderConfig *config, DataType dt, std::string wtsPath, int width) { INetworkDefinition *network = builder->createNetworkV2(0U); // Create input tensor of shape {3, INPUT_H, INPUT_W} with name INPUT_BLOB_NAME ITensor *data = network->addInput(INPUT_BLOB_NAME, dt, Dims3{INPUT_H, INPUT_W, 3}); assert(data); // hwc to chw auto ps = network->addShuffle(*data); ps->setFirstTranspose(nvinfer1::Permutation{2, 0, 1}); float mean[3] = {0.485, 0.456, 0.406}; float std[3] = {0.229, 0.224, 0.225}; ITensor *preinput = MeanStd(network, ps->getOutput(0), mean, std, true); std::map weightMap = loadWeights(wtsPath); auto relu_2 = convBnRelu(network, weightMap, *preinput, 64, 3, 2, 1, "conv1", "bn1"); auto relu_5 = convBnRelu(network, weightMap, *relu_2->getOutput(0), 64, 3, 2, 1, "conv2", "bn2"); auto relu_17 = ResBlock2Conv(network, weightMap, *relu_5->getOutput(0), 64, 256, 1, "layer1.0"); auto relu_27 = ResBlock(network, weightMap, *relu_17->getOutput(0), 256, 64, 1, "layer1.1"); auto relu_37 = ResBlock(network, weightMap, *relu_27->getOutput(0), 256, 64, 1, "layer1.2"); auto relu_47 = ResBlock(network, weightMap, *relu_37->getOutput(0), 256, 64, 1, "layer1.3"); auto relu_50 = convBnRelu(network, weightMap, *relu_47->getOutput(0), width, 3, 1, 1, "transition1.0.0", "transition1.0.1"); auto relu_60 = liteResBlock(network, weightMap, *relu_50->getOutput(0), width, "stage2.0.branches.0.0"); auto relu_67 = liteResBlock(network, weightMap, *relu_60->getOutput(0), width, "stage2.0.branches.0.1"); auto relu_74 = liteResBlock(network, weightMap, *relu_67->getOutput(0), width, "stage2.0.branches.0.2"); auto relu_81 = liteResBlock(network, weightMap, *relu_74->getOutput(0), width, "stage2.0.branches.0.3"); auto relu_53 = convBnRelu(network, weightMap, *relu_47->getOutput(0), width * 2, 3, 2, 1, "transition1.1.0.0", "transition1.1.0.1"); auto relu_88 = liteResBlock(network, weightMap, *relu_53->getOutput(0), width * 
2, "stage2.0.branches.1.0"); auto relu_95 = liteResBlock(network, weightMap, *relu_88->getOutput(0), width * 2, "stage2.0.branches.1.1"); auto relu_102 = liteResBlock(network, weightMap, *relu_95->getOutput(0), width * 2, "stage2.0.branches.1.2"); auto relu_109 = liteResBlock(network, weightMap, *relu_102->getOutput(0), width * 2, "stage2.0.branches.1.3"); auto add_131 = convBnUpAdd(network, weightMap, *relu_109->getOutput(0), *relu_81->getOutput(0), width, 1, 1, 0, "stage2.0.fuse_layers.0.1.0", "stage2.0.fuse_layers.0.1.1", true); auto relu_132 = network->addActivation(*add_131->getOutput(0), ActivationType::kRELU); auto add_135 = convBnUpAdd(network, weightMap, *relu_81->getOutput(0), *relu_109->getOutput(0), width * 2, 3, 2, 1, "stage2.0.fuse_layers.1.0.0.0", "stage2.0.fuse_layers.1.0.0.1", false); auto relu_136 = network->addActivation(*add_135->getOutput(0), ActivationType::kRELU); auto relu_146 = liteResBlock(network, weightMap, *relu_132->getOutput(0), width, "stage3.0.branches.0.0"); auto relu_153 = liteResBlock(network, weightMap, *relu_146->getOutput(0), width, "stage3.0.branches.0.1"); auto relu_160 = liteResBlock(network, weightMap, *relu_153->getOutput(0), width, "stage3.0.branches.0.2"); auto relu_167 = liteResBlock(network, weightMap, *relu_160->getOutput(0), width, "stage3.0.branches.0.3"); auto relu_174 = liteResBlock(network, weightMap, *relu_136->getOutput(0), width * 2, "stage3.0.branches.1.0"); auto relu_181 = liteResBlock(network, weightMap, *relu_174->getOutput(0), width * 2, "stage3.0.branches.1.1"); auto relu_188 = liteResBlock(network, weightMap, *relu_181->getOutput(0), width * 2, "stage3.0.branches.1.2"); auto relu_195 = liteResBlock(network, weightMap, *relu_188->getOutput(0), width * 2, "stage3.0.branches.1.3"); auto relu_139 = convBnRelu(network, weightMap, *relu_136->getOutput(0), width * 4, 3, 2, 1, "transition2.2.0.0", "transition2.2.0.1"); auto relu_202 = liteResBlock(network, weightMap, *relu_139->getOutput(0), width * 4, 
"stage3.0.branches.2.0"); auto relu_209 = liteResBlock(network, weightMap, *relu_202->getOutput(0), width * 4, "stage3.0.branches.2.1"); auto relu_216 = liteResBlock(network, weightMap, *relu_209->getOutput(0), width * 4, "stage3.0.branches.2.2"); auto relu_223 = liteResBlock(network, weightMap, *relu_216->getOutput(0), width * 4, "stage3.0.branches.2.3"); auto add_245 = convBnUpAdd(network, weightMap, *relu_195->getOutput(0), *relu_167->getOutput(0), width, 1, 1, 0, "stage3.0.fuse_layers.0.1.0", "stage3.0.fuse_layers.0.1.1", true); auto add_267 = convBnUpAdd(network, weightMap, *relu_223->getOutput(0), *add_245->getOutput(0), width, 1, 1, 0, "stage3.0.fuse_layers.0.2.0", "stage3.0.fuse_layers.0.2.1", true); auto relu_268 = network->addActivation(*add_267->getOutput(0), ActivationType::kRELU); auto add_271 = convBnUpAdd(network, weightMap, *relu_167->getOutput(0), *relu_195->getOutput(0), width * 2, 3, 2, 1, "stage3.0.fuse_layers.1.0.0.0", "stage3.0.fuse_layers.1.0.0.1", false); auto add_293 = convBnUpAdd(network, weightMap, *relu_223->getOutput(0), *add_271->getOutput(0), width * 2, 1, 1, 0, "stage3.0.fuse_layers.1.2.0", "stage3.0.fuse_layers.1.2.1", true); auto relu_294 = network->addActivation(*add_293->getOutput(0), ActivationType::kRELU); auto relu_297 = convBnRelu(network, weightMap, *relu_167->getOutput(0), width, 3, 2, 1, "stage3.0.fuse_layers.2.0.0.0", "stage3.0.fuse_layers.2.0.0.1"); auto bn_299 = convBnRelu(network, weightMap, *relu_297->getOutput(0), width * 4, 3, 2, 1, "stage3.0.fuse_layers.2.0.1.0", "stage3.0.fuse_layers.2.0.1.1", false); auto add_302 = convBnUpAdd(network, weightMap, *relu_195->getOutput(0), *bn_299->getOutput(0), width * 4, 3, 2, 1, "stage3.0.fuse_layers.2.1.0.0", "stage3.0.fuse_layers.2.1.0.1", false); auto add_303 = network->addElementWise(*add_302->getOutput(0), *relu_223->getOutput(0), ElementWiseOperation::kSUM); auto relu_304 = network->addActivation(*add_303->getOutput(0), ActivationType::kRELU); auto relu_311 = 
liteResBlock(network, weightMap, *relu_268->getOutput(0), width, "stage3.1.branches.0.0"); auto relu_318 = liteResBlock(network, weightMap, *relu_311->getOutput(0), width, "stage3.1.branches.0.1"); auto relu_325 = liteResBlock(network, weightMap, *relu_318->getOutput(0), width, "stage3.1.branches.0.2"); auto relu_332 = liteResBlock(network, weightMap, *relu_325->getOutput(0), width, "stage3.1.branches.0.3"); auto relu_339 = liteResBlock(network, weightMap, *relu_294->getOutput(0), width * 2, "stage3.1.branches.1.0"); auto relu_346 = liteResBlock(network, weightMap, *relu_339->getOutput(0), width * 2, "stage3.1.branches.1.1"); auto relu_353 = liteResBlock(network, weightMap, *relu_346->getOutput(0), width * 2, "stage3.1.branches.1.2"); auto relu_360 = liteResBlock(network, weightMap, *relu_353->getOutput(0), width * 2, "stage3.1.branches.1.3"); auto relu_367 = liteResBlock(network, weightMap, *relu_304->getOutput(0), width * 4, "stage3.1.branches.2.0"); auto relu_374 = liteResBlock(network, weightMap, *relu_367->getOutput(0), width * 4, "stage3.1.branches.2.1"); auto relu_381 = liteResBlock(network, weightMap, *relu_374->getOutput(0), width * 4, "stage3.1.branches.2.2"); auto relu_388 = liteResBlock(network, weightMap, *relu_381->getOutput(0), width * 4, "stage3.1.branches.2.3"); auto add_410 = convBnUpAdd(network, weightMap, *relu_360->getOutput(0), *relu_332->getOutput(0), width, 1, 1, 0, "stage3.1.fuse_layers.0.1.0", "stage3.1.fuse_layers.0.1.1", true); auto add_432 = convBnUpAdd(network, weightMap, *relu_388->getOutput(0), *add_410->getOutput(0), width, 1, 1, 0, "stage3.1.fuse_layers.0.2.0", "stage3.1.fuse_layers.0.2.1", true); auto relu_433 = network->addActivation(*add_432->getOutput(0), ActivationType::kRELU); auto add_436 = convBnUpAdd(network, weightMap, *relu_332->getOutput(0), *relu_360->getOutput(0), width * 2, 3, 2, 1, "stage3.1.fuse_layers.1.0.0.0", "stage3.1.fuse_layers.1.0.0.1", false); auto add_458 = convBnUpAdd(network, weightMap, 
*relu_388->getOutput(0), *add_436->getOutput(0), width * 2, 1, 1, 0, "stage3.1.fuse_layers.1.2.0", "stage3.1.fuse_layers.1.2.1", true); auto relu_459 = network->addActivation(*add_458->getOutput(0), ActivationType::kRELU); auto relu_462 = convBnRelu(network, weightMap, *relu_332->getOutput(0), width, 3, 2, 1, "stage3.1.fuse_layers.2.0.0.0", "stage3.1.fuse_layers.2.0.0.1"); auto bn_464 = convBnRelu(network, weightMap, *relu_462->getOutput(0), width * 4, 3, 2, 1, "stage3.1.fuse_layers.2.0.1.0", "stage3.1.fuse_layers.2.0.1.1", false); auto add_467 = convBnUpAdd(network, weightMap, *relu_360->getOutput(0), *bn_464->getOutput(0), width * 4, 3, 2, 1, "stage3.1.fuse_layers.2.1.0.0", "stage3.1.fuse_layers.2.1.0.1", false); auto add_468 = network->addElementWise(*add_467->getOutput(0), *relu_388->getOutput(0), ElementWiseOperation::kSUM); auto relu_469 = network->addActivation(*add_468->getOutput(0), ActivationType::kRELU); auto relu_476 = liteResBlock(network, weightMap, *relu_433->getOutput(0), width, "stage3.2.branches.0.0"); auto relu_483 = liteResBlock(network, weightMap, *relu_476->getOutput(0), width, "stage3.2.branches.0.1"); auto relu_490 = liteResBlock(network, weightMap, *relu_483->getOutput(0), width, "stage3.2.branches.0.2"); auto relu_497 = liteResBlock(network, weightMap, *relu_490->getOutput(0), width, "stage3.2.branches.0.3"); auto relu_504 = liteResBlock(network, weightMap, *relu_459->getOutput(0), width * 2, "stage3.2.branches.1.0"); auto relu_511 = liteResBlock(network, weightMap, *relu_504->getOutput(0), width * 2, "stage3.2.branches.1.1"); auto relu_518 = liteResBlock(network, weightMap, *relu_511->getOutput(0), width * 2, "stage3.2.branches.1.2"); auto relu_525 = liteResBlock(network, weightMap, *relu_518->getOutput(0), width * 2, "stage3.2.branches.1.3"); auto relu_532 = liteResBlock(network, weightMap, *relu_469->getOutput(0), width * 4, "stage3.2.branches.2.0"); auto relu_539 = liteResBlock(network, weightMap, *relu_532->getOutput(0), width * 4, 
"stage3.2.branches.2.1"); auto relu_546 = liteResBlock(network, weightMap, *relu_539->getOutput(0), width * 4, "stage3.2.branches.2.2"); auto relu_553 = liteResBlock(network, weightMap, *relu_546->getOutput(0), width * 4, "stage3.2.branches.2.3"); auto add_575 = convBnUpAdd(network, weightMap, *relu_525->getOutput(0), *relu_497->getOutput(0), width, 1, 1, 0, "stage3.2.fuse_layers.0.1.0", "stage3.2.fuse_layers.0.1.1", true); auto add_597 = convBnUpAdd(network, weightMap, *relu_553->getOutput(0), *add_575->getOutput(0), width, 1, 1, 0, "stage3.2.fuse_layers.0.2.0", "stage3.2.fuse_layers.0.2.1", true); auto relu_598 = network->addActivation(*add_597->getOutput(0), ActivationType::kRELU); auto add_601 = convBnUpAdd(network, weightMap, *relu_497->getOutput(0), *relu_525->getOutput(0), width * 2, 3, 2, 1, "stage3.2.fuse_layers.1.0.0.0", "stage3.2.fuse_layers.1.0.0.1", false); auto add_623 = convBnUpAdd(network, weightMap, *relu_553->getOutput(0), *add_601->getOutput(0), width * 2, 1, 1, 0, "stage3.2.fuse_layers.1.2.0", "stage3.2.fuse_layers.1.2.1", true); auto relu_624 = network->addActivation(*add_623->getOutput(0), ActivationType::kRELU); auto relu_627 = convBnRelu(network, weightMap, *relu_497->getOutput(0), width, 3, 2, 1, "stage3.2.fuse_layers.2.0.0.0", "stage3.2.fuse_layers.2.0.0.1"); auto bn_629 = convBnRelu(network, weightMap, *relu_627->getOutput(0), width * 4, 3, 2, 1, "stage3.2.fuse_layers.2.0.1.0", "stage3.2.fuse_layers.2.0.1.1", false); auto add_632 = convBnUpAdd(network, weightMap, *relu_525->getOutput(0), *bn_629->getOutput(0), width * 4, 3, 2, 1, "stage3.2.fuse_layers.2.1.0.0", "stage3.2.fuse_layers.2.1.0.1", false); auto add_633 = network->addElementWise(*relu_553->getOutput(0), *add_632->getOutput(0), ElementWiseOperation::kSUM); auto relu_634 = network->addActivation(*add_633->getOutput(0), ActivationType::kRELU); auto relu_641 = liteResBlock(network, weightMap, *relu_598->getOutput(0), width, "stage3.3.branches.0.0"); auto relu_648 = 
liteResBlock(network, weightMap, *relu_641->getOutput(0), width, "stage3.3.branches.0.1"); auto relu_655 = liteResBlock(network, weightMap, *relu_648->getOutput(0), width, "stage3.3.branches.0.2"); auto relu_662 = liteResBlock(network, weightMap, *relu_655->getOutput(0), width, "stage3.3.branches.0.3"); auto relu_669 = liteResBlock(network, weightMap, *relu_624->getOutput(0), width * 2, "stage3.3.branches.1.0"); auto relu_676 = liteResBlock(network, weightMap, *relu_669->getOutput(0), width * 2, "stage3.3.branches.1.1"); auto relu_683 = liteResBlock(network, weightMap, *relu_676->getOutput(0), width * 2, "stage3.3.branches.1.2"); auto relu_690 = liteResBlock(network, weightMap, *relu_683->getOutput(0), width * 2, "stage3.3.branches.1.3"); auto relu_697 = liteResBlock(network, weightMap, *relu_634->getOutput(0), width * 4, "stage3.3.branches.2.0"); auto relu_704 = liteResBlock(network, weightMap, *relu_697->getOutput(0), width * 4, "stage3.3.branches.2.1"); auto relu_711 = liteResBlock(network, weightMap, *relu_704->getOutput(0), width * 4, "stage3.3.branches.2.2"); auto relu_718 = liteResBlock(network, weightMap, *relu_711->getOutput(0), width * 4, "stage3.3.branches.2.3"); auto add_740 = convBnUpAdd(network, weightMap, *relu_690->getOutput(0), *relu_662->getOutput(0), width, 1, 1, 0, "stage3.3.fuse_layers.0.1.0", "stage3.3.fuse_layers.0.1.1", true); auto add_762 = convBnUpAdd(network, weightMap, *relu_718->getOutput(0), *add_740->getOutput(0), width, 1, 1, 0, "stage3.3.fuse_layers.0.2.0", "stage3.3.fuse_layers.0.2.1", true); auto relu_763 = network->addActivation(*add_762->getOutput(0), ActivationType::kRELU); auto add_766 = convBnUpAdd(network, weightMap, *relu_662->getOutput(0), *relu_690->getOutput(0), width * 2, 3, 2, 1, "stage3.3.fuse_layers.1.0.0.0", "stage3.3.fuse_layers.1.0.0.1", false); auto add_788 = convBnUpAdd(network, weightMap, *relu_718->getOutput(0), *add_766->getOutput(0), width * 2, 1, 1, 0, "stage3.3.fuse_layers.1.2.0", 
"stage3.3.fuse_layers.1.2.1", true); auto relu_789 = network->addActivation(*add_788->getOutput(0), ActivationType::kRELU); auto relu_792 = convBnRelu(network, weightMap, *relu_662->getOutput(0), width, 3, 2, 1, "stage3.3.fuse_layers.2.0.0.0", "stage3.3.fuse_layers.2.0.0.1"); auto bn_794 = convBnRelu(network, weightMap, *relu_792->getOutput(0), width * 4, 3, 2, 1, "stage3.3.fuse_layers.2.0.1.0", "stage3.3.fuse_layers.2.0.1.1", false); auto add_797 = convBnUpAdd(network, weightMap, *relu_690->getOutput(0), *bn_794->getOutput(0), width * 4, 3, 2, 1, "stage3.3.fuse_layers.2.1.0.0", "stage3.3.fuse_layers.2.1.0.1", false); auto add_798 = network->addElementWise(*relu_718->getOutput(0), *add_797->getOutput(0), ElementWiseOperation::kSUM); auto relu_799 = network->addActivation(*add_798->getOutput(0), ActivationType::kRELU); auto relu_809 = liteResBlock(network, weightMap, *relu_763->getOutput(0), width, "stage4.0.branches.0.0"); auto relu_816 = liteResBlock(network, weightMap, *relu_809->getOutput(0), width, "stage4.0.branches.0.1"); auto relu_823 = liteResBlock(network, weightMap, *relu_816->getOutput(0), width, "stage4.0.branches.0.2"); auto relu_830 = liteResBlock(network, weightMap, *relu_823->getOutput(0), width, "stage4.0.branches.0.3"); auto relu_837 = liteResBlock(network, weightMap, *relu_789->getOutput(0), width * 2, "stage4.0.branches.1.0"); auto relu_844 = liteResBlock(network, weightMap, *relu_837->getOutput(0), width * 2, "stage4.0.branches.1.1"); auto relu_851 = liteResBlock(network, weightMap, *relu_844->getOutput(0), width * 2, "stage4.0.branches.1.2"); auto relu_858 = liteResBlock(network, weightMap, *relu_851->getOutput(0), width * 2, "stage4.0.branches.1.3"); auto relu_865 = liteResBlock(network, weightMap, *relu_799->getOutput(0), width * 4, "stage4.0.branches.2.0"); auto relu_872 = liteResBlock(network, weightMap, *relu_865->getOutput(0), width * 4, "stage4.0.branches.2.1"); auto relu_879 = liteResBlock(network, weightMap, *relu_872->getOutput(0), 
width * 4, "stage4.0.branches.2.2"); auto relu_886 = liteResBlock(network, weightMap, *relu_879->getOutput(0), width * 4, "stage4.0.branches.2.3"); //======== auto relu_802 = convBnRelu(network, weightMap, *relu_799->getOutput(0), width * 8, 3, 2, 1, "transition3.3.0.0", "transition3.3.0.1"); auto relu_893 = liteResBlock(network, weightMap, *relu_802->getOutput(0), width * 8, "stage4.0.branches.3.0"); auto relu_900 = liteResBlock(network, weightMap, *relu_893->getOutput(0), width * 8, "stage4.0.branches.3.1"); auto relu_907 = liteResBlock(network, weightMap, *relu_900->getOutput(0), width * 8, "stage4.0.branches.3.2"); auto relu_914 = liteResBlock(network, weightMap, *relu_907->getOutput(0), width * 8, "stage4.0.branches.3.3"); auto add_936 = convBnUpAdd(network, weightMap, *relu_858->getOutput(0), *relu_830->getOutput(0), width, 1, 1, 0, "stage4.0.fuse_layers.0.1.0", "stage4.0.fuse_layers.0.1.1", true); auto add_958 = convBnUpAdd(network, weightMap, *relu_886->getOutput(0), *add_936->getOutput(0), width, 1, 1, 0, "stage4.0.fuse_layers.0.2.0", "stage4.0.fuse_layers.0.2.1", true); auto add_980 = convBnUpAdd(network, weightMap, *relu_914->getOutput(0), *add_958->getOutput(0), width, 1, 1, 0, "stage4.0.fuse_layers.0.3.0", "stage4.0.fuse_layers.0.3.1", true); auto relu_981 = network->addActivation(*add_980->getOutput(0), ActivationType::kRELU); auto add_984 = convBnUpAdd(network, weightMap, *relu_830->getOutput(0), *relu_858->getOutput(0), width * 2, 3, 2, 1, "stage4.0.fuse_layers.1.0.0.0", "stage4.0.fuse_layers.1.0.0.1", false); auto add_1006 = convBnUpAdd(network, weightMap, *relu_886->getOutput(0), *add_984->getOutput(0), width * 2, 1, 1, 0, "stage4.0.fuse_layers.1.2.0", "stage4.0.fuse_layers.1.2.1", true); auto add_1028 = convBnUpAdd(network, weightMap, *relu_914->getOutput(0), *add_1006->getOutput(0), width * 2, 1, 1, 0, "stage4.0.fuse_layers.1.3.0", "stage4.0.fuse_layers.1.3.1", true); auto relu_1029 = network->addActivation(*add_1028->getOutput(0), 
ActivationType::kRELU); auto relu_1032 = convBnRelu(network, weightMap, *relu_830->getOutput(0), width, 3, 2, 1, "stage4.0.fuse_layers.2.0.0.0", "stage4.0.fuse_layers.2.0.0.1"); auto bn_1034 = convBnRelu(network, weightMap, *relu_1032->getOutput(0), width * 4, 3, 2, 1, "stage4.0.fuse_layers.2.0.1.0", "stage4.0.fuse_layers.2.0.1.1", false); auto add_1037 = convBnUpAdd(network, weightMap, *relu_858->getOutput(0), *bn_1034->getOutput(0), width * 4, 3, 2, 1, "stage4.0.fuse_layers.2.1.0.0", "stage4.0.fuse_layers.2.1.0.1", false); auto add_1038 = network->addElementWise(*relu_886->getOutput(0), *add_1037->getOutput(0), ElementWiseOperation::kSUM); auto add_1060 = convBnUpAdd(network, weightMap, *relu_914->getOutput(0), *add_1038->getOutput(0), width * 4, 1, 1, 0, "stage4.0.fuse_layers.2.3.0", "stage4.0.fuse_layers.2.3.1", true); auto relu_1061 = network->addActivation(*add_1060->getOutput(0), ActivationType::kRELU); auto relu_1064 = convBnRelu(network, weightMap, *relu_830->getOutput(0), width, 3, 2, 1, "stage4.0.fuse_layers.3.0.0.0", "stage4.0.fuse_layers.3.0.0.1"); auto relu_1067 = convBnRelu(network, weightMap, *relu_1064->getOutput(0), width, 3, 2, 1, "stage4.0.fuse_layers.3.0.1.0", "stage4.0.fuse_layers.3.0.1.1"); auto bn_1069 = convBnRelu(network, weightMap, *relu_1067->getOutput(0), width * 8, 3, 2, 1, "stage4.0.fuse_layers.3.0.2.0", "stage4.0.fuse_layers.3.0.2.1", false); auto relu_1072 = convBnRelu(network, weightMap, *relu_858->getOutput(0), width * 2, 3, 2, 1, "stage4.0.fuse_layers.3.1.0.0", "stage4.0.fuse_layers.3.1.0.1"); auto add_1075 = convBnUpAdd(network, weightMap, *relu_1072->getOutput(0), *bn_1069->getOutput(0), width * 8, 3, 2, 1, "stage4.0.fuse_layers.3.1.1.0", "stage4.0.fuse_layers.3.1.1.1", false); auto add_1078 = convBnUpAdd(network, weightMap, *relu_886->getOutput(0), *add_1075->getOutput(0), width * 8, 3, 2, 1, "stage4.0.fuse_layers.3.2.0.0", "stage4.0.fuse_layers.3.2.0.1", false); auto add_1079 = network->addElementWise(*relu_914->getOutput(0), 
*add_1078->getOutput(0), ElementWiseOperation::kSUM); auto relu_1080 = network->addActivation(*add_1079->getOutput(0), ActivationType::kRELU); auto relu_1087 = liteResBlock(network, weightMap, *relu_981->getOutput(0), width, "stage4.1.branches.0.0"); auto relu_1094 = liteResBlock(network, weightMap, *relu_1087->getOutput(0), width, "stage4.1.branches.0.1"); auto relu_1101 = liteResBlock(network, weightMap, *relu_1094->getOutput(0), width, "stage4.1.branches.0.2"); auto relu_1108 = liteResBlock(network, weightMap, *relu_1101->getOutput(0), width, "stage4.1.branches.0.3"); auto relu_1115 = liteResBlock(network, weightMap, *relu_1029->getOutput(0), width * 2, "stage4.1.branches.1.0"); auto relu_1122 = liteResBlock(network, weightMap, *relu_1115->getOutput(0), width * 2, "stage4.1.branches.1.1"); auto relu_1129 = liteResBlock(network, weightMap, *relu_1122->getOutput(0), width * 2, "stage4.1.branches.1.2"); auto relu_1136 = liteResBlock(network, weightMap, *relu_1129->getOutput(0), width * 2, "stage4.1.branches.1.3"); auto relu_1143 = liteResBlock(network, weightMap, *relu_1061->getOutput(0), width * 4, "stage4.1.branches.2.0"); auto relu_1150 = liteResBlock(network, weightMap, *relu_1143->getOutput(0), width * 4, "stage4.1.branches.2.1"); auto relu_1157 = liteResBlock(network, weightMap, *relu_1150->getOutput(0), width * 4, "stage4.1.branches.2.2"); auto relu_1164 = liteResBlock(network, weightMap, *relu_1157->getOutput(0), width * 4, "stage4.1.branches.2.3"); auto relu_1171 = liteResBlock(network, weightMap, *relu_1080->getOutput(0), width * 8, "stage4.1.branches.3.0"); auto relu_1178 = liteResBlock(network, weightMap, *relu_1171->getOutput(0), width * 8, "stage4.1.branches.3.1"); auto relu_1185 = liteResBlock(network, weightMap, *relu_1178->getOutput(0), width * 8, "stage4.1.branches.3.2"); auto relu_1192 = liteResBlock(network, weightMap, *relu_1185->getOutput(0), width * 8, "stage4.1.branches.3.3"); auto add_1214 = convBnUpAdd(network, weightMap, 
*relu_1136->getOutput(0), *relu_1108->getOutput(0), width, 1, 1, 0, "stage4.1.fuse_layers.0.1.0", "stage4.1.fuse_layers.0.1.1", true); auto add_1236 = convBnUpAdd(network, weightMap, *relu_1164->getOutput(0), *add_1214->getOutput(0), width, 1, 1, 0, "stage4.1.fuse_layers.0.2.0", "stage4.1.fuse_layers.0.2.1", true); auto add_1258 = convBnUpAdd(network, weightMap, *relu_1192->getOutput(0), *add_1236->getOutput(0), width, 1, 1, 0, "stage4.1.fuse_layers.0.3.0", "stage4.1.fuse_layers.0.3.1", true); auto relu_1259 = network->addActivation(*add_1258->getOutput(0), ActivationType::kRELU); auto add_1262 = convBnUpAdd(network, weightMap, *relu_1108->getOutput(0), *relu_1136->getOutput(0), width * 2, 3, 2, 1, "stage4.1.fuse_layers.1.0.0.0", "stage4.1.fuse_layers.1.0.0.1", false); auto add_1284 = convBnUpAdd(network, weightMap, *relu_1164->getOutput(0), *add_1262->getOutput(0), width * 2, 1, 1, 0, "stage4.1.fuse_layers.1.2.0", "stage4.1.fuse_layers.1.2.1", true); auto add_1306 = convBnUpAdd(network, weightMap, *relu_1192->getOutput(0), *add_1284->getOutput(0), width * 2, 1, 1, 0, "stage4.1.fuse_layers.1.3.0", "stage4.1.fuse_layers.1.3.1", true); auto relu_1307 = network->addActivation(*add_1306->getOutput(0), ActivationType::kRELU); auto relu_1310 = convBnRelu(network, weightMap, *relu_1108->getOutput(0), width, 3, 2, 1, "stage4.1.fuse_layers.2.0.0.0", "stage4.1.fuse_layers.2.0.0.1"); auto bn_1312 = convBnRelu(network, weightMap, *relu_1310->getOutput(0), width * 4, 3, 2, 1, "stage4.1.fuse_layers.2.0.1.0", "stage4.1.fuse_layers.2.0.1.1", false); auto add_1315 = convBnUpAdd(network, weightMap, *relu_1136->getOutput(0), *bn_1312->getOutput(0), width * 4, 3, 2, 1, "stage4.1.fuse_layers.2.1.0.0", "stage4.1.fuse_layers.2.1.0.1", false); auto add_1316 = network->addElementWise(*relu_1164->getOutput(0), *add_1315->getOutput(0), ElementWiseOperation::kSUM); auto add_1338 = convBnUpAdd(network, weightMap, *relu_1192->getOutput(0), *add_1316->getOutput(0), width * 4, 1, 1, 0, 
"stage4.1.fuse_layers.2.3.0", "stage4.1.fuse_layers.2.3.1", true); auto relu_1339 = network->addActivation(*add_1338->getOutput(0), ActivationType::kRELU); auto relu_1342 = convBnRelu(network, weightMap, *relu_1108->getOutput(0), width, 3, 2, 1, "stage4.1.fuse_layers.3.0.0.0", "stage4.1.fuse_layers.3.0.0.1"); auto relu_1345 = convBnRelu(network, weightMap, *relu_1342->getOutput(0), width, 3, 2, 1, "stage4.1.fuse_layers.3.0.1.0", "stage4.1.fuse_layers.3.0.1.1"); auto bn_1347 = convBnRelu(network, weightMap, *relu_1345->getOutput(0), width * 8, 3, 2, 1, "stage4.1.fuse_layers.3.0.2.0", "stage4.1.fuse_layers.3.0.2.1", false); auto relu_1350 = convBnRelu(network, weightMap, *relu_1136->getOutput(0), width * 2, 3, 2, 1, "stage4.1.fuse_layers.3.1.0.0", "stage4.1.fuse_layers.3.1.0.1"); auto add_1353 = convBnUpAdd(network, weightMap, *relu_1350->getOutput(0), *bn_1347->getOutput(0), width * 8, 3, 2, 1, "stage4.1.fuse_layers.3.1.1.0", "stage4.1.fuse_layers.3.1.1.1", false); auto add_1356 = convBnUpAdd(network, weightMap, *relu_1164->getOutput(0), *add_1353->getOutput(0), width * 8, 3, 2, 1, "stage4.1.fuse_layers.3.2.0.0", "stage4.1.fuse_layers.3.2.0.1", false); auto add_1357 = network->addElementWise(*relu_1192->getOutput(0), *add_1356->getOutput(0), ElementWiseOperation::kSUM); auto relu_1358 = network->addActivation(*add_1357->getOutput(0), ActivationType::kRELU); auto relu_1365 = liteResBlock(network, weightMap, *relu_1259->getOutput(0), width, "stage4.2.branches.0.0"); auto relu_1372 = liteResBlock(network, weightMap, *relu_1365->getOutput(0), width, "stage4.2.branches.0.1"); auto relu_1379 = liteResBlock(network, weightMap, *relu_1372->getOutput(0), width, "stage4.2.branches.0.2"); auto relu_1386 = liteResBlock(network, weightMap, *relu_1379->getOutput(0), width, "stage4.2.branches.0.3"); auto relu_1393 = liteResBlock(network, weightMap, *relu_1307->getOutput(0), width * 2, "stage4.2.branches.1.0"); auto relu_1400 = liteResBlock(network, weightMap, 
*relu_1393->getOutput(0), width * 2, "stage4.2.branches.1.1"); auto relu_1407 = liteResBlock(network, weightMap, *relu_1400->getOutput(0), width * 2, "stage4.2.branches.1.2"); auto relu_1414 = liteResBlock(network, weightMap, *relu_1407->getOutput(0), width * 2, "stage4.2.branches.1.3"); auto relu_1421 = liteResBlock(network, weightMap, *relu_1339->getOutput(0), width * 4, "stage4.2.branches.2.0"); auto relu_1428 = liteResBlock(network, weightMap, *relu_1421->getOutput(0), width * 4, "stage4.2.branches.2.1"); auto relu_1435 = liteResBlock(network, weightMap, *relu_1428->getOutput(0), width * 4, "stage4.2.branches.2.2"); auto relu_1442 = liteResBlock(network, weightMap, *relu_1435->getOutput(0), width * 4, "stage4.2.branches.2.3"); auto relu_1449 = liteResBlock(network, weightMap, *relu_1358->getOutput(0), width * 8, "stage4.2.branches.3.0"); auto relu_1456 = liteResBlock(network, weightMap, *relu_1449->getOutput(0), width * 8, "stage4.2.branches.3.1"); auto relu_1463 = liteResBlock(network, weightMap, *relu_1456->getOutput(0), width * 8, "stage4.2.branches.3.2"); auto relu_1470 = liteResBlock(network, weightMap, *relu_1463->getOutput(0), width * 8, "stage4.2.branches.3.3"); auto add_1492 = convBnUpAdd(network, weightMap, *relu_1414->getOutput(0), *relu_1386->getOutput(0), width, 1, 1, 0, "stage4.2.fuse_layers.0.1.0", "stage4.2.fuse_layers.0.1.1", true); auto add_1514 = convBnUpAdd(network, weightMap, *relu_1442->getOutput(0), *add_1492->getOutput(0), width, 1, 1, 0, "stage4.2.fuse_layers.0.2.0", "stage4.2.fuse_layers.0.2.1", true); auto add_1536 = convBnUpAdd(network, weightMap, *relu_1470->getOutput(0), *add_1514->getOutput(0), width, 1, 1, 0, "stage4.2.fuse_layers.0.3.0", "stage4.2.fuse_layers.0.3.1", true); auto relu_1537 = network->addActivation(*add_1536->getOutput(0), ActivationType::kRELU); auto add_1540 = convBnUpAdd(network, weightMap, *relu_1386->getOutput(0), *relu_1414->getOutput(0), width * 2, 3, 2, 1, "stage4.2.fuse_layers.1.0.0.0", 
"stage4.2.fuse_layers.1.0.0.1", false); auto add_1562 = convBnUpAdd(network, weightMap, *relu_1442->getOutput(0), *add_1540->getOutput(0), width * 2, 1, 1, 0, "stage4.2.fuse_layers.1.2.0", "stage4.2.fuse_layers.1.2.1", true); auto add_1584 = convBnUpAdd(network, weightMap, *relu_1470->getOutput(0), *add_1562->getOutput(0), width * 2, 1, 1, 0, "stage4.2.fuse_layers.1.3.0", "stage4.2.fuse_layers.1.3.1", true); auto relu_1585 = network->addActivation(*add_1584->getOutput(0), ActivationType::kRELU); auto relu_1588 = convBnRelu(network, weightMap, *relu_1386->getOutput(0), width, 3, 2, 1, "stage4.2.fuse_layers.2.0.0.0", "stage4.2.fuse_layers.2.0.0.1"); auto bn_1590 = convBnRelu(network, weightMap, *relu_1588->getOutput(0), width * 4, 3, 2, 1, "stage4.2.fuse_layers.2.0.1.0", "stage4.2.fuse_layers.2.0.1.1", false); auto add_1593 = convBnUpAdd(network, weightMap, *relu_1414->getOutput(0), *bn_1590->getOutput(0), width * 4, 3, 2, 1, "stage4.2.fuse_layers.2.1.0.0", "stage4.2.fuse_layers.2.1.0.1", false); auto add_1594 = network->addElementWise(*relu_1442->getOutput(0), *add_1593->getOutput(0), ElementWiseOperation::kSUM); auto add_1616 = convBnUpAdd(network, weightMap, *relu_1470->getOutput(0), *add_1594->getOutput(0), width * 4, 1, 1, 0, "stage4.2.fuse_layers.2.3.0", "stage4.2.fuse_layers.2.3.1", true); auto relu_1617 = network->addActivation(*add_1616->getOutput(0), ActivationType::kRELU); auto relu_1620 = convBnRelu(network, weightMap, *relu_1386->getOutput(0), width, 3, 2, 1, "stage4.2.fuse_layers.3.0.0.0", "stage4.2.fuse_layers.3.0.0.1"); auto relu_1623 = convBnRelu(network, weightMap, *relu_1620->getOutput(0), width, 3, 2, 1, "stage4.2.fuse_layers.3.0.1.0", "stage4.2.fuse_layers.3.0.1.1"); auto bn_1625 = convBnRelu(network, weightMap, *relu_1623->getOutput(0), width * 8, 3, 2, 1, "stage4.2.fuse_layers.3.0.2.0", "stage4.2.fuse_layers.3.0.2.1", false); auto relu_1628 = convBnRelu(network, weightMap, *relu_1414->getOutput(0), width * 2, 3, 2, 1, 
"stage4.2.fuse_layers.3.1.0.0", "stage4.2.fuse_layers.3.1.0.1"); auto add_1631 = convBnUpAdd(network, weightMap, *relu_1628->getOutput(0), *bn_1625->getOutput(0), width * 8, 3, 2, 1, "stage4.2.fuse_layers.3.1.1.0", "stage4.2.fuse_layers.3.1.1.1", false); auto add_1634 = convBnUpAdd(network, weightMap, *relu_1442->getOutput(0), *add_1631->getOutput(0), width * 8, 3, 2, 1, "stage4.2.fuse_layers.3.2.0.0", "stage4.2.fuse_layers.3.2.0.1", false); auto add_1635 = network->addElementWise(*relu_1470->getOutput(0), *add_1634->getOutput(0), ElementWiseOperation::kSUM); auto relu_1636 = network->addActivation(*add_1635->getOutput(0), ActivationType::kRELU); nvinfer1::Dims dim = relu_1537->getOutput(0)->getDimensions(); dim.d[0] = relu_1585->getOutput(0)->getDimensions().d[0]; auto resize_1655 = netAddUpsampleBi(network, relu_1585->getOutput(0), dim); dim.d[0] = relu_1617->getOutput(0)->getDimensions().d[0]; auto resize_1668 = netAddUpsampleBi(network, relu_1617->getOutput(0), dim); dim.d[0] = relu_1636->getOutput(0)->getDimensions().d[0]; auto resize_1681 = netAddUpsampleBi(network, relu_1636->getOutput(0), dim); ITensor *concatTensors[] = {relu_1537->getOutput(0), resize_1655->getOutput(0), resize_1668->getOutput(0), resize_1681->getOutput(0)}; auto concat_1682 = network->addConcatenation(concatTensors, 4); concat_1682->setAxis(0); auto relu_1685 = convBnRelu(network, weightMap, *concat_1682->getOutput(0), width * 15, 1, 1, 0, "aux_head.0", "aux_head.1", true, true); auto conv_1686 = network->addConvolutionNd(*relu_1685->getOutput(0), NUM_CLASSES, DimsHW{1, 1}, weightMap["aux_head.3.weight"], weightMap["aux_head.3.bias"]); conv_1686->setStrideNd(DimsHW{1, 1}); conv_1686->setPaddingNd(DimsHW{0, 0}); auto reshape_1701 = network->addShuffle(*conv_1686->getOutput(0)); nvinfer1::Dims reshape_dim; reshape_dim.nbDims = 2; reshape_dim.d[0] = NUM_CLASSES; reshape_dim.d[1] = -1; reshape_1701->setReshapeDimensions(reshape_dim); auto softmax_1714 = 
network->addSoftMax(*reshape_1701->getOutput(0)); softmax_1714->setAxes(2); auto relu_1689 = convBnRelu(network, weightMap, *concat_1682->getOutput(0), 512, 3, 1, 1, "conv3x3_ocr.0", "conv3x3_ocr.1", true, true); auto reshape_1710 = network->addShuffle(*relu_1689->getOutput(0)); nvinfer1::Dims reshape_dim1; reshape_dim1.nbDims = 2; reshape_dim1.d[0] = 512; reshape_dim1.d[1] = -1; reshape_1710->setReshapeDimensions(reshape_dim1); nvinfer1::Permutation permutation1; permutation1.order[0] = 1; permutation1.order[1] = 0; reshape_1710->setSecondTranspose(permutation1); auto matmul_1715 = network->addMatrixMultiply(*softmax_1714->getOutput(0), MatrixOperation::kNONE, *reshape_1710->getOutput(0), MatrixOperation::kNONE); auto transpose_1716 = network->addShuffle(*matmul_1715->getOutput(0)); nvinfer1::Permutation permutation2; permutation2.order[0] = 1; permutation2.order[1] = 0; transpose_1716->setFirstTranspose(permutation2); auto unsqueeze_1717 = network->addShuffle(*transpose_1716->getOutput(0)); nvinfer1::Dims reshape_dim3; reshape_dim3.nbDims = 3; reshape_dim3.d[0] = 512; reshape_dim3.d[1] = NUM_CLASSES; reshape_dim3.d[2] = 1; unsqueeze_1717->setReshapeDimensions(reshape_dim3); auto relu_1737 = convBnRelu(network, weightMap, *unsqueeze_1717->getOutput(0), 256, 1, 1, 0, "ocr_distri_head.object_context_block.f_object.0", "ocr_distri_head.object_context_block.f_object.1.0", true, true); auto relu_1740 = convBnRelu(network, weightMap, *relu_1737->getOutput(0), 256, 1, 1, 0, "ocr_distri_head.object_context_block.f_object.2", "ocr_distri_head.object_context_block.f_object.3.0", true, true); auto reshape_1747 = network->addShuffle(*relu_1740->getOutput(0)); nvinfer1::Dims reshape_dim4; reshape_dim4.nbDims = 2; reshape_dim4.d[0] = 256; reshape_dim4.d[1] = -1; reshape_1747->setReshapeDimensions(reshape_dim4); auto relu_1723 = convBnRelu(network, weightMap, *relu_1689->getOutput(0), 256, 1, 1, 0, "ocr_distri_head.object_context_block.f_pixel.0", 
"ocr_distri_head.object_context_block.f_pixel.1.0", true, true); auto relu_1726 = convBnRelu(network, weightMap, *relu_1723->getOutput(0), 256, 1, 1, 0, "ocr_distri_head.object_context_block.f_pixel.2", "ocr_distri_head.object_context_block.f_pixel.3.0", true, true); auto reshape_1733 = network->addShuffle(*relu_1726->getOutput(0)); nvinfer1::Dims reshape_dim5; reshape_dim5.nbDims = 2; reshape_dim5.d[0] = 256; reshape_dim5.d[1] = -1; reshape_1733->setReshapeDimensions(reshape_dim5); nvinfer1::Permutation permutation3; permutation3.order[0] = 1; permutation3.order[1] = 0; reshape_1733->setSecondTranspose(permutation3); auto matmul_1759 = network->addMatrixMultiply(*reshape_1733->getOutput(0), MatrixOperation::kNONE, *reshape_1747->getOutput(0), MatrixOperation::kNONE); nvinfer1::Dims constant_dim; constant_dim.nbDims = 2; int allNum = INPUT_H * INPUT_W / 16; constant_dim.d[0] = INPUT_H * INPUT_W / 16; constant_dim.d[1] = 1; Weights wgt{DataType::kFLOAT, nullptr, allNum}; float *w = new float[allNum]; for (int i = 0; i < allNum; i++) { w[i] = 0.0625; } wgt.values = w; auto constant_1761 = network->addConstant(constant_dim, wgt); auto mul_1761 = network->addElementWise(*constant_1761->getOutput(0), *matmul_1759->getOutput(0), ElementWiseOperation::kPROD); auto softmax_1762 = network->addSoftMax(*mul_1761->getOutput(0)); softmax_1762->setAxes(2); auto relu_1750 = convBnRelu(network, weightMap, *unsqueeze_1717->getOutput(0), 256, 1, 1, 0, "ocr_distri_head.object_context_block.f_down.0", "ocr_distri_head.object_context_block.f_down.1.0", true, true); auto reshape_1757 = network->addShuffle(*relu_1750->getOutput(0)); nvinfer1::Dims reshape_dim6; reshape_dim6.nbDims = 2; reshape_dim6.d[0] = 256; reshape_dim6.d[1] = -1; reshape_1757->setReshapeDimensions(reshape_dim6); nvinfer1::Permutation permutation4; permutation4.order[0] = 1; permutation4.order[1] = 0; reshape_1757->setSecondTranspose(permutation4); auto matmul_1763 = 
network->addMatrixMultiply(*softmax_1762->getOutput(0), MatrixOperation::kNONE, *reshape_1757->getOutput(0), MatrixOperation::kNONE); auto reshape_1777 = network->addShuffle(*matmul_1763->getOutput(0)); nvinfer1::Dims reshape_dim7; reshape_dim7.nbDims = 3; reshape_dim7.d[0] = 256; reshape_dim7.d[1] = INPUT_H / 4; reshape_dim7.d[2] = INPUT_W / 4; reshape_1777->setReshapeDimensions(reshape_dim7); nvinfer1::Permutation permutation5; permutation5.order[0] = 1; permutation5.order[1] = 0; reshape_1777->setFirstTranspose(permutation5); auto relu_1780 = convBnRelu(network, weightMap, *reshape_1777->getOutput(0), 512, 1, 1, 0, "ocr_distri_head.object_context_block.f_up.0", "ocr_distri_head.object_context_block.f_up.1.0", true, true); ITensor *concatTensors1[] = {relu_1780->getOutput(0), relu_1689->getOutput(0)}; auto concat_1781 = network->addConcatenation(concatTensors1, 2); auto relu_1784 = convBnRelu(network, weightMap, *concat_1781->getOutput(0), 512, 1, 1, 0, "ocr_distri_head.conv_bn_dropout.0", "ocr_distri_head.conv_bn_dropout.1.0", true, true); auto conv_1785 = network->addConvolutionNd(*relu_1784->getOutput(0), NUM_CLASSES, DimsHW{1, 1}, weightMap["cls_head.weight"], weightMap["cls_head.bias"]); debug_print(conv_1785->getOutput(0), "cls_head"); dim.nbDims = 3; dim.d[0] = NUM_CLASSES; dim.d[1] = INPUT_H; dim.d[2] = INPUT_W; auto feature_map = netAddUpsampleBi(network, conv_1785->getOutput(0), dim); debug_print(feature_map->getOutput(0), "upsample"); auto topk = network->addTopK(*feature_map->getOutput(0), TopKOperation::kMAX, 1, 0X01); debug_print(topk->getOutput(0), "topk"); std::cout << "set name out" << std::endl; topk->getOutput(1)->setName(OUTPUT_BLOB_NAME); network->markOutput(*topk->getOutput(1)); builder->setMaxBatchSize(maxBatchSize); config->setMaxWorkspaceSize((1 << 30)); // 1G #ifdef USE_FP16 std::cout << "use fp16" << std::endl; config->setFlag(BuilderFlag::kFP16); #endif ICudaEngine *engine = builder->buildEngineWithConfig(*network, *config); std::cout 
<< "build success!" << std::endl; network->destroy(); for (auto &mem : weightMap) { free((void *)(mem.second.values)); } return engine; } void APIToModel(unsigned int maxBatchSize, IHostMemory **modelStream, std::string wtsPath, int width) { IBuilder *builder = createInferBuilder(gLogger); IBuilderConfig *config = builder->createBuilderConfig(); ICudaEngine *engine = createEngine(maxBatchSize, builder, config, DataType::kFLOAT, wtsPath, width); assert(engine != nullptr); (*modelStream) = engine->serialize(); engine->destroy(); builder->destroy(); } bool parse_args(int argc, char **argv, std::string &wts, std::string &engine, int &width, std::string &img_dir) { if (std::string(argv[1]) == "-s" && argc == 5) { wts = std::string(argv[2]); engine = std::string(argv[3]); width = std::stoi(argv[4]); } else if (std::string(argv[1]) == "-d" && argc == 4) { engine = std::string(argv[2]); img_dir = std::string(argv[3]); } else { return false; } return true; } void doInference(IExecutionContext &context, cudaStream_t &stream, void **buffers, int batchSize) { context.enqueue(batchSize, buffers, stream, nullptr); cudaStreamSynchronize(stream); cudaDeviceSynchronize(); } int main(int argc, char **argv) { cudaSetDevice(DEVICE); std::string wtsPath = ""; std::string engine_name = ""; int width; std::string img_dir; // parse args if (!parse_args(argc, argv, wtsPath, engine_name, width, img_dir)) { std::cerr << "arguments not right!" 
<< std::endl; std::cerr << "./hrnet_ocr -s [.wts] [.engine] [18 or 32 or 48] // serialize model to plan file" << std::endl; std::cerr << "./hrnet_ocr -d [.engine] ../samples // deserialize plan file and run inference" << std::endl; return -1; } // create a model using the API directly and serialize it to a stream if (!wtsPath.empty()) { IHostMemory *modelStream{nullptr}; APIToModel(BATCH_SIZE, &modelStream, wtsPath, width); assert(modelStream != nullptr); std::ofstream p(engine_name, std::ios::binary); if (!p) { std::cerr << "could not open plan output file" << std::endl; return -1; } p.write(reinterpret_cast(modelStream->data()), modelStream->size()); modelStream->destroy(); return 0; } // deserialize the .engine and run inference char *trtModelStream{nullptr}; size_t size{0}; std::ifstream file(engine_name, std::ios::binary); if (file.good()) { file.seekg(0, file.end); size = file.tellg(); file.seekg(0, file.beg); trtModelStream = new char[size]; assert(trtModelStream); file.read(trtModelStream, size); file.close(); } else { std::cerr << "could not open plan file" << std::endl; } std::vector file_names; if (read_files_in_dir(img_dir.c_str(), file_names) < 0) { std::cout << "read_files_in_dir failed." << std::endl; return -1; } // prepare input data --------------------------- cudaSetDeviceFlags(cudaDeviceMapHost); float *data; int *prob; // using int. 
output is index CHECK(cudaHostAlloc((void **)&data, BATCH_SIZE * 3 * INPUT_H * INPUT_W * sizeof(float), cudaHostAllocMapped)); CHECK(cudaHostAlloc((void **)&prob, BATCH_SIZE * OUTPUT_SIZE * sizeof(int), cudaHostAllocMapped)); IRuntime *runtime = createInferRuntime(gLogger); assert(runtime != nullptr); ICudaEngine *engine = runtime->deserializeCudaEngine(trtModelStream, size); assert(engine != nullptr); IExecutionContext *context = engine->createExecutionContext(); assert(context != nullptr); delete[] trtModelStream; void *buffers[2]; // In order to bind the buffers, we need to know the names of the input and output tensors. // Note that indices are guaranteed to be less than IEngine::getNbBindings() const int inputIndex = engine->getBindingIndex(INPUT_BLOB_NAME); const int outputIndex = engine->getBindingIndex(OUTPUT_BLOB_NAME); assert(inputIndex == 0); assert(outputIndex == 1); cudaStream_t stream; CHECK(cudaStreamCreate(&stream)); for (int f = 0; f < (int)file_names.size(); f++) { std::cout << file_names[f] << std::endl; cv::Mat pr_img; cv::Mat img_BGR = cv::imread(img_dir + "/" + file_names[f], 1); // BGR cv::Mat img; cv::cvtColor(img_BGR, img, cv::COLOR_BGR2RGB); if (img.empty()) continue; cv::resize(img, pr_img, cv::Size(INPUT_W, INPUT_H)); img = pr_img.clone(); // for img show pr_img.convertTo(pr_img, CV_32FC3); if (!pr_img.isContinuous()) { pr_img = pr_img.clone(); } std::memcpy(data, pr_img.data, BATCH_SIZE * 3 * INPUT_W * INPUT_H * sizeof(float)); cudaHostGetDevicePointer((void **)&buffers[inputIndex], (void *)data, 0); // buffers[inputIndex]-->data cudaHostGetDevicePointer((void **)&buffers[outputIndex], (void *)prob, 0); // buffers[outputIndex] --> prob // Run inference auto start = std::chrono::high_resolution_clock::now(); doInference(*context, stream, buffers, BATCH_SIZE); auto end = std::chrono::high_resolution_clock::now(); std::cout << "infer time: " << std::chrono::duration_cast(end - start).count() << "ms" << std::endl; cv::Mat outimg(INPUT_H, 
def get_img_path_batches(batch_size, img_dir):
    """
    description: Walk img_dir recursively and group every file path found into
                 consecutive batches.
    param:
        batch_size: number of paths per batch
        img_dir:    root directory handed to os.walk
    return:
        list of lists of paths; the last batch may be shorter than batch_size,
        and an empty directory tree yields an empty list
    """
    # Flatten the directory walk into one list of paths, in os.walk order.
    all_paths = [
        os.path.join(dirpath, filename)
        for dirpath, _subdirs, filenames in os.walk(img_dir)
        for filename in filenames
    ]

    batches = []
    current = []
    for path in all_paths:
        # A full batch is flushed lazily, right before the next path is added.
        if len(current) == batch_size:
            batches.append(current)
            current = []
        current.append(path)
    if current:
        batches.append(current)
    return batches
""" def __init__(self, engine_file_path): # Create a Context on this device, self.cfx = cuda.Device(0).make_context() stream = cuda.Stream() runtime = trt.Runtime(trt.Logger(trt.Logger.INFO)) assert runtime # Deserialize the engine from file with open(engine_file_path, "rb") as f: engine = runtime.deserialize_cuda_engine(f.read()) context = engine.create_execution_context() host_inputs = [] cuda_inputs = [] host_outputs = [] cuda_outputs = [] bindings = [] for binding in engine: size = trt.volume(engine.get_binding_shape(binding)) * engine.max_batch_size dtype = trt.nptype(engine.get_binding_dtype(binding)) # Allocate host and device buffers host_mem = cuda.pagelocked_empty(size, dtype) cuda_mem = cuda.mem_alloc(host_mem.nbytes) # Append the device buffer to device bindings. bindings.append(int(cuda_mem)) # Append to the appropriate list. if engine.binding_is_input(binding): self.input_w = engine.get_binding_shape(binding)[-2] self.input_h = engine.get_binding_shape(binding)[-3] host_inputs.append(host_mem) cuda_inputs.append(cuda_mem) else: host_outputs.append(host_mem) cuda_outputs.append(cuda_mem) # Store self.stream = stream self.context = context self.engine = engine self.host_inputs = host_inputs self.cuda_inputs = cuda_inputs self.host_outputs = host_outputs self.cuda_outputs = cuda_outputs self.bindings = bindings self.batch_size = engine.max_batch_size def infer(self, image_raw): threading.Thread.__init__(self) # Make self the active context, pushing it on top of the context stack. 
self.cfx.push() # Restore stream = self.stream context = self.context engine = self.engine host_inputs = self.host_inputs cuda_inputs = self.cuda_inputs host_outputs = self.host_outputs cuda_outputs = self.cuda_outputs bindings = self.bindings print('ori_shape: ', image_raw.shape) # if image_raw is constant, image_raw.shape[1] != self.input_w w_ori, h_ori = image_raw.shape[1], image_raw.shape[0] # Do image preprocess input_image = self.preprocess_image(image_raw) # Copy input image to host buffer np.copyto(host_inputs[0], input_image.ravel()) start = time.time() # Transfer input data to the GPU. cuda.memcpy_htod_async(cuda_inputs[0], host_inputs[0], stream) # Run inference. context.execute_async(bindings=bindings, stream_handle=stream.handle) # Transfer predictions back from the GPU. cuda.memcpy_dtoh_async(host_outputs[0], cuda_outputs[0], stream) # Synchronize the stream stream.synchronize() end = time.time() # Remove any context from the top of the context stack, deactivating it. self.cfx.pop() # Here we use the first row of output in that batch_size = 1 output = host_outputs[0] # Do postprocess output = output.reshape(self.input_h, self.input_w).astype('uint8') print('output_shape: ', output.shape) output = cv2.resize(output, (w_ori, h_ori)) return output, end - start def destroy(self): # Remove any context from the top of the context stack, deactivating it. self.cfx.pop() def preprocess_image(self, image_raw): """ description: Read an image from image path, convert it to RGB, resize and pad it to target size. 
class warmUpThread(threading.Thread):
    """
    description: Thread that pushes one all-zero image through the wrapper so
                 the first real inference does not pay the start-up cost.
    """

    def __init__(self, hrnet_wrapper):
        threading.Thread.__init__(self)
        # Object providing get_raw_image_zeros() and infer().
        self.hrnet_wrapper = hrnet_wrapper

    def run(self):
        wrapper = self.hrnet_wrapper
        blank_image = wrapper.get_raw_image_zeros()
        prediction, elapsed = wrapper.infer(blank_image)
        # Report the shape of the first row of the prediction and the time in ms.
        first_row_shape = prediction[0].shape
        elapsed_ms = elapsed * 1000
        print('warm_up->{}, time->{:.2f}ms'.format(first_row_shape, elapsed_ms))
batch size is set to 1! image_dir = "samples/" image_path_batches = get_img_path_batches(hrnet_wrapper.batch_size, image_dir) for i in range(10): # create a new thread to do warm_up thread1 = warmUpThread(hrnet_wrapper) thread1.start() thread1.join() for batch in image_path_batches: # create a new thread to do inference thread1 = inferThread(hrnet_wrapper, batch) thread1.start() thread1.join() finally: # destroy the instance hrnet_wrapper.destroy() ================================================ FILE: hrnet/hrnet-semantic-segmentation/logging.h ================================================ /* * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
//!
//! \class LogStreamConsumerBuffer
//! \brief String buffer that forwards its contents to an output stream,
//!        prefixed with a timestamp and a severity prefix, whenever it is
//!        synchronized (flushed) or destroyed with pending data.
//!
class LogStreamConsumerBuffer : public std::stringbuf
{
public:
    //! \param stream    stream the buffered text is written to
    //! \param prefix    text inserted between the timestamp and the message
    //! \param shouldLog when false, putOutput() writes nothing
    LogStreamConsumerBuffer(std::ostream& stream, const std::string& prefix, bool shouldLog)
        : mOutput(stream)
        , mPrefix(prefix)
        , mShouldLog(shouldLog)
    {
    }

    //! Carries over the target stream, the prefix and the logging flag.
    //! The previous version only copied the stream reference, leaving the
    //! prefix empty and the flag uninitialized in the new object.
    //! Text still buffered in \p other is not transferred.
    LogStreamConsumerBuffer(LogStreamConsumerBuffer&& other)
        : mOutput(other.mOutput)
        , mPrefix(other.mPrefix)
        , mShouldLog(other.mShouldLog)
    {
    }

    ~LogStreamConsumerBuffer()
    {
        // std::streambuf::pbase() gives a pointer to the beginning of the buffered part of the output sequence
        // std::streambuf::pptr() gives a pointer to the current position of the output sequence
        // if the pointer to the beginning is not equal to the pointer to the current position,
        // call putOutput() to log the output to the stream
        if (pbase() != pptr())
        {
            putOutput();
        }
    }

    // synchronizes the stream buffer and returns 0 on success
    // synchronizing the stream buffer consists of inserting the buffer contents into the stream,
    // resetting the buffer and flushing the stream
    int sync() override
    {
        putOutput();
        return 0;
    }

    //! Writes "[MM/DD/YYYY-HH:MM:SS] <prefix><buffered text>" to the target
    //! stream, empties the buffer and flushes the stream. Does nothing when
    //! logging is disabled.
    void putOutput()
    {
        if (mShouldLog)
        {
            // prepend timestamp; it goes to the same stream as the message so
            // that messages routed to std::cerr keep their timestamp
            std::time_t timestamp = std::time(nullptr);
            std::tm* tm_local = std::localtime(&timestamp);
            // std::setfill is sticky: remember the fill character and restore
            // it afterwards so the caller's stream formatting is not altered
            const char previousFill = mOutput.fill('0');
            mOutput << "[";
            mOutput << std::setw(2) << 1 + tm_local->tm_mon << "/";
            mOutput << std::setw(2) << tm_local->tm_mday << "/";
            mOutput << std::setw(4) << 1900 + tm_local->tm_year << "-";
            mOutput << std::setw(2) << tm_local->tm_hour << ":";
            mOutput << std::setw(2) << tm_local->tm_min << ":";
            mOutput << std::setw(2) << tm_local->tm_sec << "] ";
            mOutput.fill(previousFill);
            // std::stringbuf::str() gets the string contents of the buffer
            // insert the buffer contents pre-appended by the appropriate prefix into the stream
            mOutput << mPrefix << str();
            // set the buffer to empty
            str("");
            // flush the stream
            mOutput.flush();
        }
    }

    void setShouldLog(bool shouldLog)
    {
        mShouldLog = shouldLog;
    }

private:
    std::ostream& mOutput; // stream that receives the formatted messages
    std::string mPrefix;   // severity prefix written before every message
    bool mShouldLog;       // false suppresses all output
};
LogStreamConsumer(Severity reportableSeverity, Severity severity) : LogStreamConsumerBase(severityOstream(severity), severityPrefix(severity), severity <= reportableSeverity) , std::ostream(&mBuffer) // links the stream buffer with the stream , mShouldLog(severity <= reportableSeverity) , mSeverity(severity) { } LogStreamConsumer(LogStreamConsumer&& other) : LogStreamConsumerBase(severityOstream(other.mSeverity), severityPrefix(other.mSeverity), other.mShouldLog) , std::ostream(&mBuffer) // links the stream buffer with the stream , mShouldLog(other.mShouldLog) , mSeverity(other.mSeverity) { } void setReportableSeverity(Severity reportableSeverity) { mShouldLog = mSeverity <= reportableSeverity; mBuffer.setShouldLog(mShouldLog); } private: static std::ostream& severityOstream(Severity severity) { return severity >= Severity::kINFO ? std::cout : std::cerr; } static std::string severityPrefix(Severity severity) { switch (severity) { case Severity::kINTERNAL_ERROR: return "[F] "; case Severity::kERROR: return "[E] "; case Severity::kWARNING: return "[W] "; case Severity::kINFO: return "[I] "; case Severity::kVERBOSE: return "[V] "; default: assert(0); return ""; } } bool mShouldLog; Severity mSeverity; }; //! \class Logger //! //! \brief Class which manages logging of TensorRT tools and samples //! //! \details This class provides a common interface for TensorRT tools and samples to log information to the console, //! and supports logging two types of messages: //! //! - Debugging messages with an associated severity (info, warning, error, or internal error/fatal) //! - Test pass/fail messages //! //! The advantage of having all samples use this class for logging as opposed to emitting directly to stdout/stderr is //! that the logic for controlling the verbosity and formatting of sample output is centralized in one location. //! //! In the future, this class could be extended to support dumping test results to a file in some standard format //! 
(for example, JUnit XML), and providing additional metadata (e.g. timing the duration of a test run). //! //! TODO: For backwards compatibility with existing samples, this class inherits directly from the nvinfer1::ILogger //! interface, which is problematic since there isn't a clean separation between messages coming from the TensorRT //! library and messages coming from the sample. //! //! In the future (once all samples are updated to use Logger::getTRTLogger() to access the ILogger) we can refactor the //! class to eliminate the inheritance and instead make the nvinfer1::ILogger implementation a member of the Logger //! object. class Logger : public nvinfer1::ILogger { public: Logger(Severity severity = Severity::kWARNING) : mReportableSeverity(severity) { } //! //! \enum TestResult //! \brief Represents the state of a given test //! enum class TestResult { kRUNNING, //!< The test is running kPASSED, //!< The test passed kFAILED, //!< The test failed kWAIVED //!< The test was waived }; //! //! \brief Forward-compatible method for retrieving the nvinfer::ILogger associated with this Logger //! \return The nvinfer1::ILogger associated with this Logger //! //! TODO Once all samples are updated to use this method to register the logger with TensorRT, //! we can eliminate the inheritance of Logger from ILogger //! nvinfer1::ILogger& getTRTLogger() { return *this; } //! //! \brief Implementation of the nvinfer1::ILogger::log() virtual method //! //! Note samples should not be calling this function directly; it will eventually go away once we eliminate the //! inheritance from nvinfer1::ILogger //! void log(Severity severity, const char* msg) override { LogStreamConsumer(mReportableSeverity, severity) << "[TRT] " << std::string(msg) << std::endl; } //! //! \brief Method for controlling the verbosity of logging output //! //! \param severity The logger will only emit messages that have severity of this level or higher. //! 
void setReportableSeverity(Severity severity) { mReportableSeverity = severity; } //! //! \brief Opaque handle that holds logging information for a particular test //! //! This object is an opaque handle to information used by the Logger to print test results. //! The sample must call Logger::defineTest() in order to obtain a TestAtom that can be used //! with Logger::reportTest{Start,End}(). //! class TestAtom { public: TestAtom(TestAtom&&) = default; private: friend class Logger; TestAtom(bool started, const std::string& name, const std::string& cmdline) : mStarted(started) , mName(name) , mCmdline(cmdline) { } bool mStarted; std::string mName; std::string mCmdline; }; //! //! \brief Define a test for logging //! //! \param[in] name The name of the test. This should be a string starting with //! "TensorRT" and containing dot-separated strings containing //! the characters [A-Za-z0-9_]. //! For example, "TensorRT.sample_googlenet" //! \param[in] cmdline The command line used to reproduce the test // //! \return a TestAtom that can be used in Logger::reportTest{Start,End}(). //! static TestAtom defineTest(const std::string& name, const std::string& cmdline) { return TestAtom(false, name, cmdline); } //! //! \brief A convenience overloaded version of defineTest() that accepts an array of command-line arguments //! as input //! //! \param[in] name The name of the test //! \param[in] argc The number of command-line arguments //! \param[in] argv The array of command-line arguments (given as C strings) //! //! \return a TestAtom that can be used in Logger::reportTest{Start,End}(). static TestAtom defineTest(const std::string& name, int argc, char const* const* argv) { auto cmdline = genCmdlineString(argc, argv); return defineTest(name, cmdline); } //! //! \brief Report that a test has started. //! //! \pre reportTestStart() has not been called yet for the given testAtom //! //! \param[in] testAtom The handle to the test that has started //! 
static void reportTestStart(TestAtom& testAtom) { reportTestResult(testAtom, TestResult::kRUNNING); assert(!testAtom.mStarted); testAtom.mStarted = true; } //! //! \brief Report that a test has ended. //! //! \pre reportTestStart() has been called for the given testAtom //! //! \param[in] testAtom The handle to the test that has ended //! \param[in] result The result of the test. Should be one of TestResult::kPASSED, //! TestResult::kFAILED, TestResult::kWAIVED //! static void reportTestEnd(const TestAtom& testAtom, TestResult result) { assert(result != TestResult::kRUNNING); assert(testAtom.mStarted); reportTestResult(testAtom, result); } static int reportPass(const TestAtom& testAtom) { reportTestEnd(testAtom, TestResult::kPASSED); return EXIT_SUCCESS; } static int reportFail(const TestAtom& testAtom) { reportTestEnd(testAtom, TestResult::kFAILED); return EXIT_FAILURE; } static int reportWaive(const TestAtom& testAtom) { reportTestEnd(testAtom, TestResult::kWAIVED); return EXIT_SUCCESS; } static int reportTest(const TestAtom& testAtom, bool pass) { return pass ? reportPass(testAtom) : reportFail(testAtom); } Severity getReportableSeverity() const { return mReportableSeverity; } private: //! //! \brief returns an appropriate string for prefixing a log message with the given severity //! static const char* severityPrefix(Severity severity) { switch (severity) { case Severity::kINTERNAL_ERROR: return "[F] "; case Severity::kERROR: return "[E] "; case Severity::kWARNING: return "[W] "; case Severity::kINFO: return "[I] "; case Severity::kVERBOSE: return "[V] "; default: assert(0); return ""; } } //! //! \brief returns an appropriate string for prefixing a test result message with the given result //! 
static const char* testResultString(TestResult result) { switch (result) { case TestResult::kRUNNING: return "RUNNING"; case TestResult::kPASSED: return "PASSED"; case TestResult::kFAILED: return "FAILED"; case TestResult::kWAIVED: return "WAIVED"; default: assert(0); return ""; } } //! //! \brief returns an appropriate output stream (cout or cerr) to use with the given severity //! static std::ostream& severityOstream(Severity severity) { return severity >= Severity::kINFO ? std::cout : std::cerr; } //! //! \brief method that implements logging test results //! static void reportTestResult(const TestAtom& testAtom, TestResult result) { severityOstream(Severity::kINFO) << "&&&& " << testResultString(result) << " " << testAtom.mName << " # " << testAtom.mCmdline << std::endl; } //! //! \brief generate a command line string from the given (argc, argv) values //! static std::string genCmdlineString(int argc, char const* const* argv) { std::stringstream ss; for (int i = 0; i < argc; i++) { if (i > 0) ss << " "; ss << argv[i]; } return ss.str(); } Severity mReportableSeverity; }; namespace { //! //! \brief produces a LogStreamConsumer object that can be used to log messages of severity kVERBOSE //! //! Example usage: //! //! LOG_VERBOSE(logger) << "hello world" << std::endl; //! inline LogStreamConsumer LOG_VERBOSE(const Logger& logger) { return LogStreamConsumer(logger.getReportableSeverity(), Severity::kVERBOSE); } //! //! \brief produces a LogStreamConsumer object that can be used to log messages of severity kINFO //! //! Example usage: //! //! LOG_INFO(logger) << "hello world" << std::endl; //! inline LogStreamConsumer LOG_INFO(const Logger& logger) { return LogStreamConsumer(logger.getReportableSeverity(), Severity::kINFO); } //! //! \brief produces a LogStreamConsumer object that can be used to log messages of severity kWARNING //! //! Example usage: //! //! LOG_WARN(logger) << "hello world" << std::endl; //! 
cmake_minimum_required(VERSION 2.6)

project(IBNNet)

add_definitions(-std=c++11)

# option() takes (<variable> "<help text>" [value]); without a help string the
# word OFF was parsed as the help text. The default value stays OFF.
option(CUDA_USE_STATIC_CUDA_RUNTIME "Link against the static CUDA runtime" OFF)
set(CMAKE_CXX_STANDARD 11)
set(CMAKE_BUILD_TYPE Debug)

find_package(CUDA REQUIRED)

include_directories(${PROJECT_SOURCE_DIR}/include)
# include and link dirs of cuda and tensorrt, you need to adapt them if yours are different
# cuda
include_directories(/usr/local/cuda/include)
link_directories(/usr/local/cuda/lib64)
# tensorrt
include_directories(/usr/include/x86_64-linux-gnu/)
link_directories(/usr/lib/x86_64-linux-gnu/)

set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11 -pthread -Wall -Ofast -Wfatal-errors -D_MWAITXINTRIN_H_INCLUDED")

find_package(OpenCV)
include_directories(${OpenCV_INCLUDE_DIRS})

# every header and source file in this directory is part of the single executable
file(GLOB SOURCE_FILES "*.h" "*.cpp")

add_executable(ibnnet ${SOURCE_FILES})
target_link_libraries(ibnnet nvinfer)
target_link_libraries(ibnnet cudart)
target_link_libraries(ibnnet ${OpenCV_LIBS})

add_definitions(-O2 -pthread)
================================================ FILE: ibnnet/InferenceEngine.cpp ================================================ #include "InferenceEngine.h" namespace trt { InferenceEngine::InferenceEngine(const EngineConfig &enginecfg): _engineCfg(enginecfg) { assert(_engineCfg.max_batch_size > 0); CHECK(cudaSetDevice(_engineCfg.device_id)); _runtime = make_holder(nvinfer1::createInferRuntime(gLogger)); assert(_runtime); _engine = make_holder(_runtime->deserializeCudaEngine(_engineCfg.trtModelStream.get(), _engineCfg.stream_size)); assert(_engine); _context = make_holder(_engine->createExecutionContext()); assert(_context); _inputSize = _engineCfg.max_batch_size * 3 * _engineCfg.input_h * _engineCfg.input_w * _depth; _outputSize = _engineCfg.max_batch_size * _engineCfg.output_size * _depth; CHECK(cudaMallocHost((void**)&_data, _inputSize)); CHECK(cudaMallocHost((void**)&_prob, _outputSize)); _streamptr = std::shared_ptr( new cudaStream_t, [](cudaStream_t* ptr){ cudaStreamDestroy(*ptr); if(ptr != nullptr){ delete ptr; } }); CHECK(cudaStreamCreate(&*_streamptr.get())); // Pointers to input and output device buffers to pass to engine. // Engine requires exactly IEngine::getNbBindings() number of buffers. assert(_engine->getNbBindings() == 2); // In order to bind the buffers, we need to know the names of the input and output tensors. 
// Note that indices are guaranteed to be less than IEngine::getNbBindings() _inputIndex = _engine->getBindingIndex(_engineCfg.input_name); _outputIndex = _engine->getBindingIndex(_engineCfg.output_name); // Create GPU buffers on device CHECK(cudaMalloc(&_buffers[_inputIndex], _inputSize)); CHECK(cudaMalloc(&_buffers[_outputIndex], _outputSize)); _inputSize /= _engineCfg.max_batch_size; _outputSize /= _engineCfg.max_batch_size; } bool InferenceEngine::doInference(const int inference_batch_size, std::function preprocessing) { assert(inference_batch_size <= _engineCfg.max_batch_size); preprocessing(_data); CHECK(cudaSetDevice(_engineCfg.device_id)); CHECK(cudaMemcpyAsync(_buffers[_inputIndex], _data, inference_batch_size * _inputSize, cudaMemcpyHostToDevice, *_streamptr)); auto status = _context->enqueue(inference_batch_size, _buffers, *_streamptr, nullptr); CHECK(cudaMemcpyAsync(_prob, _buffers[_outputIndex], inference_batch_size * _outputSize, cudaMemcpyDeviceToHost, *_streamptr)); CHECK(cudaStreamSynchronize(*_streamptr)); return status; } InferenceEngine::InferenceEngine(InferenceEngine &&other) noexcept: _engineCfg(other._engineCfg) , _data(other._data) , _prob(other._prob) , _inputIndex(other._inputIndex) , _outputIndex(other._outputIndex) , _inputSize(other._inputSize) , _outputSize(other._outputSize) , _runtime(std::move(other._runtime)) , _engine(std::move(other._engine)) , _context(std::move(other._context)) , _streamptr(other._streamptr) { _buffers[0] = other._buffers[0]; _buffers[1] = other._buffers[1]; other._streamptr.reset(); other._data = nullptr; other._prob = nullptr; other._buffers[0] = nullptr; other._buffers[1] = nullptr; } InferenceEngine::~InferenceEngine() { CHECK(cudaFreeHost(_data)); CHECK(cudaFreeHost(_prob)); CHECK(cudaFree(_buffers[_inputIndex])); CHECK(cudaFree(_buffers[_outputIndex])); } } ================================================ FILE: ibnnet/InferenceEngine.h ================================================ 
/************************************************************************** * Handle memory pre-alloc * both on host(pinned memory, allow CUDA DMA) & device *************************************************************************/ #pragma once #include #include #include #include #include #include "utils.h" #include "holder.h" #include "logging.h" #include "NvInfer.h" #include "cuda_runtime_api.h" static Logger gLogger; namespace trt { struct EngineConfig { const char* input_name; const char* output_name; std::shared_ptr trtModelStream; int max_batch_size; /* create engine */ int input_h; int input_w; int output_size; int stream_size; int device_id; }; class InferenceEngine { public: InferenceEngine(const EngineConfig &enginecfg); InferenceEngine(InferenceEngine &&other) noexcept; ~InferenceEngine(); InferenceEngine(const InferenceEngine &) = delete; InferenceEngine& operator=(const InferenceEngine &) = delete; InferenceEngine& operator=(InferenceEngine && other) = delete; bool doInference(const int inference_batch_size, std::function preprocessing); float* getOutput() { return _prob; } std::thread::id getThreadID() { return std::this_thread::get_id(); } private: EngineConfig _engineCfg; float* _data{nullptr}; float* _prob{nullptr}; // Pointers to input and output device buffers to pass to engine. // Engine requires exactly IEngine::getNbBindings() number of buffers. void* _buffers[2]; // In order to bind the buffers, we need to know the names of the input and output tensors. 
// Note that indices are guaranteed to be less than IEngine::getNbBindings() int _inputIndex; int _outputIndex; int _inputSize; int _outputSize; static constexpr std::size_t _depth{sizeof(float)}; TensorRTHolder _runtime{nullptr}; TensorRTHolder _engine{nullptr}; TensorRTHolder _context{nullptr}; std::shared_ptr _streamptr; }; } ================================================ FILE: ibnnet/README.md ================================================ # IBN-Net An implementation of IBN-Net, proposed in ["Two at Once: Enhancing Learning and Generalization Capacities via IBN-Net"](https://arxiv.org/abs/1807.09441), ECCV2018 by Xingang Pan, Ping Luo, Jianping Shi, Xiaoou Tang. For the Pytorch implementation, you can refer to [IBN-Net](https://github.com/XingangPan/IBN-Net) ## Features - InstanceNorm2d - bottleneck_ibn - Resnet50-IBNA - Resnet50-IBNB - Multi-thread inference ## How to Run * 1. generate .wts // for ibn-a ``` python gen_wts.py a ``` a file 'resnet50-ibna.wts' will be generated. // for ibn-b ``` python gen_wts.py b ``` a file 'resnet50-ibnb.wts' will be generated. * 2. cmake and make ``` mkdir build cd build cmake .. make ``` * 3. 
build engine and run classification // put resnet50-ibna.wts/resnet50-ibnb.wts into tensorrtx/ibnnet // go to tensorrtx/ibnnet ``` ./ibnnet -s // serialize model to plan file ./ibnnet -d // deserialize plan file and run inference ``` ================================================ FILE: ibnnet/gen_wts.py ================================================ import torch import os import sys import struct assert sys.argv[1] == "a" or sys.argv[1] == "b" model_name = "resnet50_ibn_" + sys.argv[1] net = torch.hub.load('XingangPan/IBN-Net', model_name, pretrained=True).to('cuda:0').eval() #verify #input = torch.ones(1, 3, 224, 224).to('cuda:0') #pixel_mean = torch.tensor([0.485, 0.456, 0.406]).view(1, -1, 1, 1).to('cuda:0') #pixel_std = torch.tensor([0.229, 0.224, 0.225]).view(1, -1, 1, 1).to('cuda:0') #input.sub_(pixel_mean).div_(pixel_std) #out = net(input) #print(out) f = open(model_name + ".wts", 'w') f.write("{}\n".format(len(net.state_dict().keys()))) for k,v in net.state_dict().items(): vr = v.reshape(-1).cpu().numpy() f.write("{} {}".format(k, len(vr))) for vv in vr: f.write(" ") f.write(struct.pack(">f", float(vv)).hex()) f.write("\n") ================================================ FILE: ibnnet/holder.h ================================================ #pragma once template class TensorRTHolder { T* holder; public: explicit TensorRTHolder(T* holder_) : holder(holder_) {} ~TensorRTHolder() { if (holder) holder->destroy(); } TensorRTHolder(const TensorRTHolder&) = delete; TensorRTHolder& operator=(const TensorRTHolder&) = delete; TensorRTHolder(TensorRTHolder && rhs) noexcept{ holder = rhs.holder; rhs.holder = nullptr; } TensorRTHolder& operator=(TensorRTHolder&& rhs) noexcept { if (this == &rhs) { return *this; } if (holder) holder->destroy(); holder = rhs.holder; rhs.holder = nullptr; return *this; } T* operator->() { return holder; } T* get() { return holder; } explicit operator bool() { return holder != nullptr; } T& operator*() noexcept { return *holder; } }; 
template TensorRTHolder make_holder(T* holder) { return TensorRTHolder(holder); } template using TensorRTNonHolder = T*; ================================================ FILE: ibnnet/ibnnet.cpp ================================================ #include "ibnnet.h" //#define USE_FP16 namespace trt { IBNNet::IBNNet(trt::EngineConfig &enginecfg, const IBN ibn) : _engineCfg(enginecfg) { switch(ibn) { case IBN::A: _ibn = "a"; break; case IBN::B: _ibn = "b"; break; case IBN::NONE: default: _ibn = ""; break; } } // create the engine using only the API and not any parser. ICudaEngine *IBNNet::createEngine(IBuilder* builder, IBuilderConfig* config) { // resnet50-ibna, resnet50-ibnb, resnet50 assert(_ibn == "a" or _ibn == "b" or _ibn == ""); INetworkDefinition* network = builder->createNetworkV2(0U); // Create input tensor of shape { 3, INPUT_H, INPUT_W } with name INPUT_BLOB_NAME ITensor* data = network->addInput(_engineCfg.input_name, _dt, Dims3{3, _engineCfg.input_h, _engineCfg.input_w}); assert(data); std::string path; if(_ibn == "") { path = "../resnet50.wts"; } else { path = "../resnet50-ibn" + _ibn + ".wts"; } std::map weightMap = loadWeights(path); Weights emptywts{DataType::kFLOAT, nullptr, 0}; std::map> ibn_layers{ { "a", {"a", "a", "a", "a", "a", "a", "a", "a", "a", "a", "a", "a", "a", "", "", ""}}, { "b", {"", "", "b", "", "", "","b", "", "", "", "", "", "", "", "", "",}}, { "", {16, ""}}}; const float mean[3] = {0.485, 0.456, 0.406}; // rgb const float std[3] = {0.229, 0.224, 0.225}; ITensor* pre_input = MeanStd(network, weightMap, data, "", mean, std, false); IConvolutionLayer* conv1 = network->addConvolutionNd(*pre_input, 64, DimsHW{7, 7}, weightMap["conv1.weight"], emptywts); assert(conv1); conv1->setStrideNd(DimsHW{2, 2}); conv1->setPaddingNd(DimsHW{3, 3}); IActivationLayer* relu1{nullptr}; if (_ibn == "b") { IScaleLayer* bn1 = addInstanceNorm2d(network, weightMap, *conv1->getOutput(0), "bn1", 1e-5); relu1 = network->addActivation(*bn1->getOutput(0), 
ActivationType::kRELU); } else { IScaleLayer* bn1 = addBatchNorm2d(network, weightMap, *conv1->getOutput(0), "bn1", 1e-5); relu1 = network->addActivation(*bn1->getOutput(0), ActivationType::kRELU); } assert(relu1); // Add max pooling layer with stride of 2x2 and kernel size of 2x2. IPoolingLayer* pool1 = network->addPoolingNd(*relu1->getOutput(0), PoolingType::kMAX, DimsHW{3, 3}); assert(pool1); pool1->setStrideNd(DimsHW{2, 2}); pool1->setPaddingNd(DimsHW{1, 1}); IActivationLayer* x = bottleneck_ibn(network, weightMap, *pool1->getOutput(0), 64, 64, 1, "layer1.0.", ibn_layers[_ibn][0]); x = bottleneck_ibn(network, weightMap, *x->getOutput(0), 256, 64, 1, "layer1.1.", ibn_layers[_ibn][1]); x = bottleneck_ibn(network, weightMap, *x->getOutput(0), 256, 64, 1, "layer1.2.", ibn_layers[_ibn][2]); x = bottleneck_ibn(network, weightMap, *x->getOutput(0), 256, 128, 2, "layer2.0.", ibn_layers[_ibn][3]); x = bottleneck_ibn(network, weightMap, *x->getOutput(0), 512, 128, 1, "layer2.1.", ibn_layers[_ibn][4]); x = bottleneck_ibn(network, weightMap, *x->getOutput(0), 512, 128, 1, "layer2.2.", ibn_layers[_ibn][5]); x = bottleneck_ibn(network, weightMap, *x->getOutput(0), 512, 128, 1, "layer2.3.", ibn_layers[_ibn][6]); x = bottleneck_ibn(network, weightMap, *x->getOutput(0), 512, 256, 2, "layer3.0.", ibn_layers[_ibn][7]); x = bottleneck_ibn(network, weightMap, *x->getOutput(0), 1024, 256, 1, "layer3.1.", ibn_layers[_ibn][8]); x = bottleneck_ibn(network, weightMap, *x->getOutput(0), 1024, 256, 1, "layer3.2.", ibn_layers[_ibn][9]); x = bottleneck_ibn(network, weightMap, *x->getOutput(0), 1024, 256, 1, "layer3.3.", ibn_layers[_ibn][10]); x = bottleneck_ibn(network, weightMap, *x->getOutput(0), 1024, 256, 1, "layer3.4.", ibn_layers[_ibn][11]); x = bottleneck_ibn(network, weightMap, *x->getOutput(0), 1024, 256, 1, "layer3.5.", ibn_layers[_ibn][12]); x = bottleneck_ibn(network, weightMap, *x->getOutput(0), 1024, 512, 2, "layer4.0.", ibn_layers[_ibn][13]); x = bottleneck_ibn(network, 
weightMap, *x->getOutput(0), 2048, 512, 1, "layer4.1.", ibn_layers[_ibn][14]); x = bottleneck_ibn(network, weightMap, *x->getOutput(0), 2048, 512, 1, "layer4.2.", ibn_layers[_ibn][15]); IPoolingLayer* pool2 = network->addPoolingNd(*x->getOutput(0), PoolingType::kAVERAGE, DimsHW{7, 7}); assert(pool2); pool2->setStrideNd(DimsHW{1, 1}); IFullyConnectedLayer* fc1 = network->addFullyConnected(*pool2->getOutput(0), 1000, weightMap["fc.weight"], weightMap["fc.bias"]); assert(fc1); fc1->getOutput(0)->setName(_engineCfg.output_name); std::cout << "set name out" << std::endl; network->markOutput(*fc1->getOutput(0)); // Build engine builder->setMaxBatchSize(_engineCfg.max_batch_size); config->setMaxWorkspaceSize(1 << 20); #ifdef USE_FP16 config->setFlag(BuilderFlag::kFP16); #endif ICudaEngine* engine = builder->buildEngineWithConfig(*network, *config); std::cout << "build out" << std::endl; // Don't need the network any more network->destroy(); // Release host memory for (auto& mem : weightMap) { free((void*) (mem.second.values)); } return engine; } bool IBNNet::serializeEngine() { // Create builder auto builder = make_holder(createInferBuilder(gLogger)); auto config = make_holder(builder->createBuilderConfig()); // Create model to populate the network, then set the outputs and create an engine ICudaEngine *engine = createEngine(builder.get(), config.get()); assert(engine); // Serialize the engine TensorRTHolder modelStream = make_holder(engine->serialize()); assert(modelStream); std::ofstream p("./ibnnet.engine", std::ios::binary | std::ios::out); if (!p) { std::cerr << "could not open plan output file" << std::endl; return false; } p.write(reinterpret_cast(modelStream->data()), modelStream->size()); return true; } bool IBNNet::deserializeEngine() { std::ifstream file("./ibnnet.engine", std::ios::binary | std::ios::in); if (file.good()) { file.seekg(0, file.end); _engineCfg.stream_size = file.tellg(); file.seekg(0, file.beg); _engineCfg.trtModelStream = std::shared_ptr( new 
char[_engineCfg.stream_size], []( char* ptr ){ delete [] ptr; } );
            assert(_engineCfg.trtModelStream.get());
            // Read the whole plan file into the shared buffer.
            file.read(_engineCfg.trtModelStream.get(), _engineCfg.stream_size);
            file.close();

            // NOTE(review): `make_unique` is missing its template argument in
            // this copy of the file; the target member is `_inferEngine`.
            _inferEngine = make_unique(_engineCfg);
            return true;
        }
        // The plan file could not be opened.
        return false;
    }

    // Writes one image into `data` as three consecutive planes of `stride`
    // floats: channel 2 first, then channel 1, then channel 0, each scaled by
    // 1/255.
    // NOTE(review): presumably this converts OpenCV's BGR pixel order to
    // planar RGB; `img.at` is missing its element type in this copy. The
    // image is assumed to hold at least `stride` pixels — nothing here checks it.
    void IBNNet::preprocessing(const cv::Mat& img, float* const data, const std::size_t stride) {
        for (std::size_t i = 0; i < stride; ++i) {
            data[i] = img.at(i)[2] / 255.0;
            data[i + stride] = img.at(i)[1] / 255.0;
            data[i + (stride<<1)] = img.at(i)[0] / 255.0;
        }
    }

    // Runs the engine on every image in `input` as one batch (batch size is
    // input.size()). Returns false when no engine has been deserialized yet,
    // otherwise whatever doInference() returns.
    // NOTE(review): `std::vector` is missing its element type in this copy;
    // the elements are passed to preprocessing() as `const cv::Mat&`.
    bool IBNNet::inference(std::vector &input) {
        if(_inferEngine != nullptr) {
            const std::size_t stride = _engineCfg.input_w * _engineCfg.input_h;
            return _inferEngine.get()->doInference(input.size(),
                [&](float* data) {
                    for(const auto &img : input) {
                        preprocessing(img, data, stride);
                        // Advance by one sample: 3 planes of `stride` floats.
                        data += 3 * stride;
                    }
                }
            );
        } else {
            return false;
        }
    }

    // Host pointer to the output of the last inference, or nullptr when no
    // engine has been deserialized.
    float* IBNNet::getOutput() {
        if(_inferEngine != nullptr)
            return _inferEngine.get()->getOutput();
        return nullptr;
    }

    // CUDA device id taken from the engine configuration.
    int IBNNet::getDeviceID() {
        return _engineCfg.device_id;
    }

}

================================================
FILE: ibnnet/ibnnet.h
================================================
#pragma once

#include "utils.h"
#include "holder.h"
#include "layers.h"
#include "InferenceEngine.h"
// NOTE(review): the header names of the next four includes are missing in
// this copy of the file — restore them from the upstream repository.
#include
#include
#include
#include

extern Logger gLogger;
using namespace trtxapi;

namespace trt {

    // Network variant to build.
    enum IBN {
        A, // resnet50-ibna,
        B, // resnet50-ibnb,
        NONE // resnet50
    };

    // Builds, serializes, deserializes and runs a ResNet50(-IBN) engine.
    class IBNNet {
    public:
        IBNNet(trt::EngineConfig &enginecfg, const IBN ibn);

        ~IBNNet() {};

        bool serializeEngine(); /* create & serializeEngine */

        bool deserializeEngine();

        bool inference(std::vector &input); /* support batch inference */

        float* getOutput();

        int getDeviceID(); /* cuda deviceid */

    private:
        ICudaEngine *createEngine(IBuilder *builder, IBuilderConfig *config);
        void preprocessing(const cv::Mat& img, float* const data, const std::size_t stride);

    private:
        trt::EngineConfig _engineCfg;
        // Created by deserializeEngine(); null until then.
        std::unique_ptr _inferEngine{nullptr};
        // "a", "b" or "" — set by the constructor from the IBN enum.
        std::string _ibn;
        DataType
_dt{DataType::kFLOAT}; }; } ================================================ FILE: ibnnet/layers.cpp ================================================ #include "layers.h" namespace trtxapi { ITensor* MeanStd(INetworkDefinition *network, std::map& weightMap, ITensor* input, const std::string lname, const float* mean, const float* std, const bool div255) { if(div255) { Weights Div_225{ DataType::kFLOAT, nullptr, 3 }; float *wgt = reinterpret_cast(malloc(sizeof(float) * 3)); std::fill_n(wgt, 3, 255.0f); Div_225.values = wgt; weightMap[lname + ".div"] = Div_225; IConstantLayer* d = network->addConstant(Dims3{ 3, 1, 1 }, Div_225); input = network->addElementWise(*input, *d->getOutput(0), ElementWiseOperation::kDIV)->getOutput(0); } Weights Mean{ DataType::kFLOAT, nullptr, 3 }; Mean.values = mean; IConstantLayer* m = network->addConstant(Dims3{ 3, 1, 1 }, Mean); IElementWiseLayer* sub_mean = network->addElementWise(*input, *m->getOutput(0), ElementWiseOperation::kSUB); if (std != nullptr) { Weights Std{ DataType::kFLOAT, nullptr, 3 }; Std.values = std; IConstantLayer* s = network->addConstant(Dims3{ 3, 1, 1 }, Std); IElementWiseLayer* std_mean = network->addElementWise(*sub_mean->getOutput(0), *s->getOutput(0), ElementWiseOperation::kDIV); return std_mean->getOutput(0); } else { return sub_mean->getOutput(0); } } IScaleLayer* addBatchNorm2d(INetworkDefinition *network, std::map& weightMap, ITensor& input, const std::string lname, const float eps) { float *gamma = (float*)weightMap[lname + ".weight"].values; float *beta = (float*)weightMap[lname + ".bias"].values; float *mean = (float*)weightMap[lname + ".running_mean"].values; float *var = (float*)weightMap[lname + ".running_var"].values; int len = weightMap[lname + ".running_var"].count; float *scval = reinterpret_cast(malloc(sizeof(float) * len)); for (int i = 0; i < len; i++) { scval[i] = gamma[i] / sqrt(var[i] + eps); } Weights wscale{DataType::kFLOAT, scval, len}; float *shval = reinterpret_cast(malloc(sizeof(float) 
* len)); for (int i = 0; i < len; i++) { shval[i] = beta[i] - mean[i] * gamma[i] / sqrt(var[i] + eps); } Weights wshift{DataType::kFLOAT, shval, len}; float *pval = reinterpret_cast(malloc(sizeof(float) * len)); for (int i = 0; i < len; i++) { pval[i] = 1.0; } Weights wpower{DataType::kFLOAT, pval, len}; weightMap[lname + ".scale"] = wscale; weightMap[lname + ".shift"] = wshift; weightMap[lname + ".power"] = wpower; IScaleLayer* scale_1 = network->addScale(input, ScaleMode::kCHANNEL, wshift, wscale, wpower); assert(scale_1); return scale_1; } IScaleLayer* addInstanceNorm2d(INetworkDefinition *network, std::map& weightMap, ITensor& input, const std::string lname, const float eps) { int len = weightMap[lname + ".weight"].count; IReduceLayer* reduce1 = network->addReduce(input, ReduceOperation::kAVG, 6, true); assert(reduce1); IElementWiseLayer* ew1 = network->addElementWise(input, *reduce1->getOutput(0), ElementWiseOperation::kSUB); assert(ew1); const static float pval1[3]{0.0, 1.0, 2.0}; Weights wshift1{DataType::kFLOAT, pval1, 1}; Weights wscale1{DataType::kFLOAT, pval1+1, 1}; Weights wpower1{DataType::kFLOAT, pval1+2, 1}; IScaleLayer* scale1 = network->addScale( *ew1->getOutput(0), ScaleMode::kUNIFORM, wshift1, wscale1, wpower1); assert(scale1); IReduceLayer* reduce2 = network->addReduce( *scale1->getOutput(0), ReduceOperation::kAVG, 6, true); assert(reduce2); const static float pval2[3]{eps, 1.0, 0.5}; Weights wshift2{DataType::kFLOAT, pval2, 1}; Weights wscale2{DataType::kFLOAT, pval2+1, 1}; Weights wpower2{DataType::kFLOAT, pval2+2, 1}; IScaleLayer* scale2 = network->addScale( *reduce2->getOutput(0), ScaleMode::kUNIFORM, wshift2, wscale2, wpower2); assert(scale2); IElementWiseLayer* ew2 = network->addElementWise(*ew1->getOutput(0), *scale2->getOutput(0), ElementWiseOperation::kDIV); assert(ew2); float* pval3 = reinterpret_cast(malloc(sizeof(float) * len)); std::fill_n(pval3, len, 1.0); Weights wpower3{DataType::kFLOAT, pval3, len}; weightMap[lname + ".power3"] 
= wpower3; IScaleLayer* scale3 = network->addScale( *ew2->getOutput(0), ScaleMode::kCHANNEL, weightMap[lname + ".bias"], weightMap[lname + ".weight"], wpower3); assert(scale3); return scale3; } IConcatenationLayer* addIBN(INetworkDefinition *network, std::map& weightMap, ITensor& input, const std::string lname) { Dims spliteDims = input.getDimensions(); ISliceLayer *split1 = network->addSlice(input, Dims3{0, 0, 0}, Dims3{spliteDims.d[0]/2, spliteDims.d[1], spliteDims.d[2]}, Dims3{1, 1, 1}); assert(split1); ISliceLayer *split2 = network->addSlice(input, Dims3{spliteDims.d[0]/2, 0, 0}, Dims3{spliteDims.d[0]/2, spliteDims.d[1], spliteDims.d[2]}, Dims3{1, 1, 1}); assert(split2); auto in1 = addInstanceNorm2d(network, weightMap, *split1->getOutput(0), lname + "IN", 1e-5); auto bn1 = addBatchNorm2d(network, weightMap, *split2->getOutput(0), lname + "BN", 1e-5); ITensor* tensor1[] = {in1->getOutput(0), bn1->getOutput(0)}; auto cat1 = network->addConcatenation(tensor1, 2); assert(cat1); return cat1; } IActivationLayer* bottleneck_ibn(INetworkDefinition *network, std::map& weightMap, ITensor& input, const int inch, const int outch, const int stride, const std::string lname, const std::string ibn) { Weights emptywts{DataType::kFLOAT, nullptr, 0}; IConvolutionLayer* conv1 = network->addConvolutionNd(input, outch, DimsHW{1, 1}, weightMap[lname + "conv1.weight"], emptywts); assert(conv1); IActivationLayer* relu1{nullptr}; if (ibn == "a") { IConcatenationLayer* bn1 = addIBN(network, weightMap, *conv1->getOutput(0), lname + "bn1."); relu1 = network->addActivation(*bn1->getOutput(0), ActivationType::kRELU); assert(relu1); } else { IScaleLayer* bn1 = addBatchNorm2d(network, weightMap, *conv1->getOutput(0), lname + "bn1", 1e-5); relu1 = network->addActivation(*bn1->getOutput(0), ActivationType::kRELU); assert(relu1); } IConvolutionLayer* conv2 = network->addConvolutionNd(*relu1->getOutput(0), outch, DimsHW{3, 3}, weightMap[lname + "conv2.weight"], emptywts); assert(conv2); 
conv2->setStrideNd(DimsHW{stride, stride}); conv2->setPaddingNd(DimsHW{1, 1}); IScaleLayer* bn2 = addBatchNorm2d(network, weightMap, *conv2->getOutput(0), lname + "bn2", 1e-5); IActivationLayer* relu2 = network->addActivation(*bn2->getOutput(0), ActivationType::kRELU); assert(relu2); IConvolutionLayer* conv3 = network->addConvolutionNd(*relu2->getOutput(0), outch * 4, DimsHW{1, 1}, weightMap[lname + "conv3.weight"], emptywts); assert(conv3); IScaleLayer* bn3 = addBatchNorm2d(network, weightMap, *conv3->getOutput(0), lname + "bn3", 1e-5); IElementWiseLayer* ew1; if (stride != 1 || inch != outch * 4) { IConvolutionLayer* conv4 = network->addConvolutionNd(input, outch * 4, DimsHW{1, 1}, weightMap[lname + "downsample.0.weight"], emptywts); assert(conv4); conv4->setStrideNd(DimsHW{stride, stride}); IScaleLayer* bn4 = addBatchNorm2d(network, weightMap, *conv4->getOutput(0), lname + "downsample.1", 1e-5); ew1 = network->addElementWise(*bn4->getOutput(0), *bn3->getOutput(0), ElementWiseOperation::kSUM); } else { ew1 = network->addElementWise(input, *bn3->getOutput(0), ElementWiseOperation::kSUM); } IActivationLayer* relu3{nullptr}; if (ibn == "b") { IScaleLayer* in1 = addInstanceNorm2d(network, weightMap, *ew1->getOutput(0), lname + "IN", 1e-5); relu3 = network->addActivation(*in1->getOutput(0), ActivationType::kRELU); } else { relu3 = network->addActivation(*ew1->getOutput(0), ActivationType::kRELU); } assert(relu3); return relu3; } } ================================================ FILE: ibnnet/layers.h ================================================ #pragma once #include #include #include #include "NvInfer.h" #include "cuda_runtime_api.h" using namespace nvinfer1; namespace trtxapi { ITensor* MeanStd(INetworkDefinition *network, std::map& weightMap, ITensor* input, const std::string lname, const float* mean, const float* std, const bool div255); IScaleLayer* addBatchNorm2d(INetworkDefinition *network, std::map& weightMap, ITensor& input, const std::string lname, const 
float eps);

    // NOTE(review): `std::map` is missing its template arguments in these
    // declarations in this copy of the file.

    // InstanceNorm built from reduce / element-wise / scale layers; reads
    // `lname + ".weight"` and `lname + ".bias"` from weightMap.
    IScaleLayer* addInstanceNorm2d(INetworkDefinition *network, std::map& weightMap, ITensor& input, const std::string lname, const float eps);

    // Splits the input in two, applies InstanceNorm to the first half and
    // BatchNorm to the second half, and concatenates the results.
    IConcatenationLayer* addIBN(INetworkDefinition *network, std::map& weightMap, ITensor& input, const std::string lname);

    // ResNet bottleneck block; `ibn` is "a", "b" or "" and selects where (if
    // anywhere) instance normalization is inserted.
    IActivationLayer* bottleneck_ibn(INetworkDefinition *network, std::map& weightMap, ITensor& input, const int inch, const int outch, const int stride, const std::string lname, const std::string ibn);

}

================================================
FILE: ibnnet/logging.h
================================================
/*
 * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
*/ #ifndef TENSORRT_LOGGING_H #define TENSORRT_LOGGING_H #include "NvInferRuntimeCommon.h" #include #include #include #include #include #include #include using Severity = nvinfer1::ILogger::Severity; class LogStreamConsumerBuffer : public std::stringbuf { public: LogStreamConsumerBuffer(std::ostream& stream, const std::string& prefix, bool shouldLog) : mOutput(stream) , mPrefix(prefix) , mShouldLog(shouldLog) { } LogStreamConsumerBuffer(LogStreamConsumerBuffer&& other) : mOutput(other.mOutput) { } ~LogStreamConsumerBuffer() { // std::streambuf::pbase() gives a pointer to the beginning of the buffered part of the output sequence // std::streambuf::pptr() gives a pointer to the current position of the output sequence // if the pointer to the beginning is not equal to the pointer to the current position, // call putOutput() to log the output to the stream if (pbase() != pptr()) { putOutput(); } } // synchronizes the stream buffer and returns 0 on success // synchronizing the stream buffer consists of inserting the buffer contents into the stream, // resetting the buffer and flushing the stream virtual int sync() { putOutput(); return 0; } void putOutput() { if (mShouldLog) { // prepend timestamp std::time_t timestamp = std::time(nullptr); tm* tm_local = std::localtime(×tamp); std::cout << "["; std::cout << std::setw(2) << std::setfill('0') << 1 + tm_local->tm_mon << "/"; std::cout << std::setw(2) << std::setfill('0') << tm_local->tm_mday << "/"; std::cout << std::setw(4) << std::setfill('0') << 1900 + tm_local->tm_year << "-"; std::cout << std::setw(2) << std::setfill('0') << tm_local->tm_hour << ":"; std::cout << std::setw(2) << std::setfill('0') << tm_local->tm_min << ":"; std::cout << std::setw(2) << std::setfill('0') << tm_local->tm_sec << "] "; // std::stringbuf::str() gets the string contents of the buffer // insert the buffer contents pre-appended by the appropriate prefix into the stream mOutput << mPrefix << str(); // set the buffer to empty str(""); // flush 
the stream mOutput.flush(); } } void setShouldLog(bool shouldLog) { mShouldLog = shouldLog; } private: std::ostream& mOutput; std::string mPrefix; bool mShouldLog; }; //! //! \class LogStreamConsumerBase //! \brief Convenience object used to initialize LogStreamConsumerBuffer before std::ostream in LogStreamConsumer //! class LogStreamConsumerBase { public: LogStreamConsumerBase(std::ostream& stream, const std::string& prefix, bool shouldLog) : mBuffer(stream, prefix, shouldLog) { } protected: LogStreamConsumerBuffer mBuffer; }; //! //! \class LogStreamConsumer //! \brief Convenience object used to facilitate use of C++ stream syntax when logging messages. //! Order of base classes is LogStreamConsumerBase and then std::ostream. //! This is because the LogStreamConsumerBase class is used to initialize the LogStreamConsumerBuffer member field //! in LogStreamConsumer and then the address of the buffer is passed to std::ostream. //! This is necessary to prevent the address of an uninitialized buffer from being passed to std::ostream. //! Please do not change the order of the parent classes. //! class LogStreamConsumer : protected LogStreamConsumerBase, public std::ostream { public: //! \brief Creates a LogStreamConsumer which logs messages with level severity. //! Reportable severity determines if the messages are severe enough to be logged. 
LogStreamConsumer(Severity reportableSeverity, Severity severity)
        : LogStreamConsumerBase(severityOstream(severity), severityPrefix(severity), severity <= reportableSeverity)
        , std::ostream(&mBuffer) // links the stream buffer with the stream
        , mShouldLog(severity <= reportableSeverity)
        , mSeverity(severity)
    {
    }

    // Builds a fresh buffer from the severity and logging flag of `other`;
    // text already buffered in `other` is not transferred.
    LogStreamConsumer(LogStreamConsumer&& other)
        : LogStreamConsumerBase(severityOstream(other.mSeverity), severityPrefix(other.mSeverity), other.mShouldLog)
        , std::ostream(&mBuffer) // links the stream buffer with the stream
        , mShouldLog(other.mShouldLog)
        , mSeverity(other.mSeverity)
    {
    }

    // Re-evaluates whether this consumer's severity passes the new threshold
    // and forwards the result to the buffer.
    void setReportableSeverity(Severity reportableSeverity)
    {
        mShouldLog = mSeverity <= reportableSeverity;
        mBuffer.setShouldLog(mShouldLog);
    }

private:
    // std::cout for severities that compare >= kINFO, std::cerr otherwise.
    static std::ostream& severityOstream(Severity severity)
    {
        return severity >= Severity::kINFO ? std::cout : std::cerr;
    }

    static std::string severityPrefix(Severity severity)
    {
        switch (severity)
        {
        case Severity::kINTERNAL_ERROR: return "[F] ";
        case Severity::kERROR: return "[E] ";
        case Severity::kWARNING: return "[W] ";
        case Severity::kINFO: return "[I] ";
        case Severity::kVERBOSE: return "[V] ";
        default: assert(0); return "";
        }
    }

    bool mShouldLog;
    Severity mSeverity;
};

//! \class Logger
//!
//! \brief Class which manages logging of TensorRT tools and samples
//!
//! \details This class provides a common interface for TensorRT tools and samples to log information to the console,
//! and supports logging two types of messages:
//!
//! - Debugging messages with an associated severity (info, warning, error, or internal error/fatal)
//! - Test pass/fail messages
//!
//! The advantage of having all samples use this class for logging as opposed to emitting directly to stdout/stderr is
//! that the logic for controlling the verbosity and formatting of sample output is centralized in one location.
//!
//! In the future, this class could be extended to support dumping test results to a file in some standard format
//! (for example, JUnit XML), and providing additional metadata (e.g. timing the duration of a test run).
//!
//! TODO: For backwards compatibility with existing samples, this class inherits directly from the nvinfer1::ILogger
//! interface, which is problematic since there isn't a clean separation between messages coming from the TensorRT
//! library and messages coming from the sample.
//!
//! In the future (once all samples are updated to use Logger::getTRTLogger() to access the ILogger) we can refactor the
//! class to eliminate the inheritance and instead make the nvinfer1::ILogger implementation a member of the Logger
//! object.

class Logger : public nvinfer1::ILogger
{
public:
    Logger(Severity severity = Severity::kWARNING)
        : mReportableSeverity(severity)
    {
    }

    //!
    //! \enum TestResult
    //! \brief Represents the state of a given test
    //!
    enum class TestResult
    {
        kRUNNING, //!< The test is running
        kPASSED,  //!< The test passed
        kFAILED,  //!< The test failed
        kWAIVED   //!< The test was waived
    };

    //!
    //! \brief Forward-compatible method for retrieving the nvinfer::ILogger associated with this Logger
    //! \return The nvinfer1::ILogger associated with this Logger
    //!
    //! TODO Once all samples are updated to use this method to register the logger with TensorRT,
    //! we can eliminate the inheritance of Logger from ILogger
    //!
    nvinfer1::ILogger& getTRTLogger()
    {
        return *this;
    }

    //!
    //! \brief Implementation of the nvinfer1::ILogger::log() virtual method
    //!
    //! Note samples should not be calling this function directly; it will eventually go away once we eliminate the
    //! inheritance from nvinfer1::ILogger
    //!
    void log(Severity severity, const char* msg) override
    {
        LogStreamConsumer(mReportableSeverity, severity) << "[TRT] " << std::string(msg) << std::endl;
    }

    //!
    //! \brief Method for controlling the verbosity of logging output
    //!
    //! \param severity The logger will only emit messages that have severity of this level or higher.
    //!
    void setReportableSeverity(Severity severity)
    {
        mReportableSeverity = severity;
    }

    //!
    //! \brief Opaque handle that holds logging information for a particular test
    //!
    //! This object is an opaque handle to information used by the Logger to print test results.
    //! The sample must call Logger::defineTest() in order to obtain a TestAtom that can be used
    //! with Logger::reportTest{Start,End}().
    //!
    class TestAtom
    {
    public:
        TestAtom(TestAtom&&) = default;

    private:
        friend class Logger;

        TestAtom(bool started, const std::string& name, const std::string& cmdline)
            : mStarted(started)
            , mName(name)
            , mCmdline(cmdline)
        {
        }

        bool mStarted;
        std::string mName;
        std::string mCmdline;
    };

    //!
    //! \brief Define a test for logging
    //!
    //! \param[in] name The name of the test.  This should be a string starting with
    //!                  "TensorRT" and containing dot-separated strings containing
    //!                  the characters [A-Za-z0-9_].
    //!                  For example, "TensorRT.sample_googlenet"
    //! \param[in] cmdline The command line used to reproduce the test
    //
    //! \return a TestAtom that can be used in Logger::reportTest{Start,End}().
    //!
    static TestAtom defineTest(const std::string& name, const std::string& cmdline)
    {
        return TestAtom(false, name, cmdline);
    }

    //!
    //! \brief A convenience overloaded version of defineTest() that accepts an array of command-line arguments
    //!        as input
    //!
    //! \param[in] name The name of the test
    //! \param[in] argc The number of command-line arguments
    //! \param[in] argv The array of command-line arguments (given as C strings)
    //!
    //! \return a TestAtom that can be used in Logger::reportTest{Start,End}().
    static TestAtom defineTest(const std::string& name, int argc, char const* const* argv)
    {
        auto cmdline = genCmdlineString(argc, argv);
        return defineTest(name, cmdline);
    }

    //!
    //! \brief Report that a test has started.
    //!
    //! \pre reportTestStart() has not been called yet for the given testAtom
    //!
    //! \param[in] testAtom The handle to the test that has started
    //!
    static void reportTestStart(TestAtom& testAtom)
    {
        reportTestResult(testAtom, TestResult::kRUNNING);
        assert(!testAtom.mStarted);
        testAtom.mStarted = true;
    }

    //!
    //! \brief Report that a test has ended.
    //!
    //! \pre reportTestStart() has been called for the given testAtom
    //!
    //! \param[in] testAtom The handle to the test that has ended
    //! \param[in] result The result of the test. Should be one of TestResult::kPASSED,
    //!                   TestResult::kFAILED, TestResult::kWAIVED
    //!
    static void reportTestEnd(const TestAtom& testAtom, TestResult result)
    {
        assert(result != TestResult::kRUNNING);
        assert(testAtom.mStarted);
        reportTestResult(testAtom, result);
    }

    // The three helpers below report the end of a test and return the process
    // exit code to use (EXIT_SUCCESS for passed/waived, EXIT_FAILURE for failed).
    static int reportPass(const TestAtom& testAtom)
    {
        reportTestEnd(testAtom, TestResult::kPASSED);
        return EXIT_SUCCESS;
    }

    static int reportFail(const TestAtom& testAtom)
    {
        reportTestEnd(testAtom, TestResult::kFAILED);
        return EXIT_FAILURE;
    }

    static int reportWaive(const TestAtom& testAtom)
    {
        reportTestEnd(testAtom, TestResult::kWAIVED);
        return EXIT_SUCCESS;
    }

    static int reportTest(const TestAtom& testAtom, bool pass)
    {
        return pass ? reportPass(testAtom) : reportFail(testAtom);
    }

    Severity getReportableSeverity() const
    {
        return mReportableSeverity;
    }

private:
    //!
    //! \brief returns an appropriate string for prefixing a log message with the given severity
    //!
    static const char* severityPrefix(Severity severity)
    {
        switch (severity)
        {
        case Severity::kINTERNAL_ERROR: return "[F] ";
        case Severity::kERROR: return "[E] ";
        case Severity::kWARNING: return "[W] ";
        case Severity::kINFO: return "[I] ";
        case Severity::kVERBOSE: return "[V] ";
        default: assert(0); return "";
        }
    }

    //!
    //! \brief returns an appropriate string for prefixing a test result message with the given result
    //!
    static const char* testResultString(TestResult result)
    {
        switch (result)
        {
        case TestResult::kRUNNING: return "RUNNING";
        case TestResult::kPASSED: return "PASSED";
        case TestResult::kFAILED: return "FAILED";
        case TestResult::kWAIVED: return "WAIVED";
        default: assert(0); return "";
        }
    }

    //!
    //! \brief returns an appropriate output stream (cout or cerr) to use with the given severity
    //!
    static std::ostream& severityOstream(Severity severity)
    {
        return severity >= Severity::kINFO ? std::cout : std::cerr;
    }

    //!
    //! \brief method that implements logging test results
    //!
    static void reportTestResult(const TestAtom& testAtom, TestResult result)
    {
        severityOstream(Severity::kINFO) << "&&&& " << testResultString(result) << " " << testAtom.mName << " # "
                                         << testAtom.mCmdline << std::endl;
    }

    //!
    //! \brief generate a command line string from the given (argc, argv) values
    //!
    static std::string genCmdlineString(int argc, char const* const* argv)
    {
        std::stringstream ss;
        for (int i = 0; i < argc; i++)
        {
            if (i > 0)
                ss << " ";
            ss << argv[i];
        }
        return ss.str();
    }

    Severity mReportableSeverity;
};

namespace
{

//!
//! \brief produces a LogStreamConsumer object that can be used to log messages of severity kVERBOSE
//!
//! Example usage:
//!
//!     LOG_VERBOSE(logger) << "hello world" << std::endl;
//!
inline LogStreamConsumer LOG_VERBOSE(const Logger& logger)
{
    return LogStreamConsumer(logger.getReportableSeverity(), Severity::kVERBOSE);
}

//!
//! \brief produces a LogStreamConsumer object that can be used to log messages of severity kINFO
//!
//! Example usage:
//!
//!     LOG_INFO(logger) << "hello world" << std::endl;
//!
inline LogStreamConsumer LOG_INFO(const Logger& logger)
{
    return LogStreamConsumer(logger.getReportableSeverity(), Severity::kINFO);
}

//!
//! \brief produces a LogStreamConsumer object that can be used to log messages of severity kWARNING
//!
//! Example usage:
//!
//!     LOG_WARN(logger) << "hello world" << std::endl;
//!
inline LogStreamConsumer LOG_WARN(const Logger& logger)
{
    return LogStreamConsumer(logger.getReportableSeverity(), Severity::kWARNING);
}

//!
//! \brief produces a LogStreamConsumer object that can be used to log messages of severity kERROR
//!
//! Example usage:
//!
//!     LOG_ERROR(logger) << "hello world" << std::endl;
//!
inline LogStreamConsumer LOG_ERROR(const Logger& logger)
{
    return LogStreamConsumer(logger.getReportableSeverity(), Severity::kERROR);
}

//!
//! \brief produces a LogStreamConsumer object that can be used to log messages of severity kINTERNAL_ERROR
//         ("fatal" severity)
//!
//! Example usage:
//!
//!     LOG_FATAL(logger) << "hello world" << std::endl;
//!
inline LogStreamConsumer LOG_FATAL(const Logger& logger)
{
    return LogStreamConsumer(logger.getReportableSeverity(), Severity::kINTERNAL_ERROR);
}

} // anonymous namespace

#endif // TENSORRT_LOGGING_H

================================================
FILE: ibnnet/main.cpp
================================================
// NOTE(review): the header names of the next three includes are missing in
// this copy of the file — restore them from the upstream repository.
#include
#include
#include
#include "ibnnet.h"
#include "InferenceEngine.h"

// stuff we know about the network and the input/output blobs
static const int MAX_BATCH_SIZE = 4;
static const int INPUT_H = 224;
static const int INPUT_W = 224;
static const int OUTPUT_SIZE = 1000;
static const int DEVICE_ID = 0;

const char* INPUT_BLOB_NAME = "data";
const char* OUTPUT_BLOB_NAME = "prob";

extern Logger gLogger;

void run_infer(std::shared_ptr model) {

    CHECK(cudaSetDevice(model->getDeviceID()));

    if(!model->deserializeEngine()) {
        std::cout << "DeserializeEngine Failed."
<< std::endl; return; } /* support batch input data */ std::vector input; input.emplace_back( cv::Mat(INPUT_H, INPUT_W, CV_8UC3, cv::Scalar(255,255,255)) ) ; /* run inference */ model->inference(input); /* get output data from cudaMalloc */ float* prob = model->getOutput(); /* print output */ std::cout << "\nOutput from thread_id: " << std::this_thread::get_id() << std::endl; if( prob != nullptr ) { for (size_t batch_idx = 0; batch_idx < input.size(); ++batch_idx) { for (int p = 0; p < OUTPUT_SIZE; ++p) { std::cout<< prob[batch_idx+p] << " "; if ((p+1) % 10 == 0) { std::cout << std::endl; } } } } } int main(int argc, char** argv) { trt::EngineConfig engineCfg { INPUT_BLOB_NAME, OUTPUT_BLOB_NAME, nullptr, MAX_BATCH_SIZE, INPUT_H, INPUT_W, OUTPUT_SIZE, 0, DEVICE_ID}; if (argc == 2 && std::string(argv[1]) == "-s") { std::cout << "Serializling Engine" << std::endl; trt::IBNNet ibnnet{engineCfg, trt::IBN::A}; ibnnet.serializeEngine(); return 0; } else if (argc == 2 && std::string(argv[1]) == "-d") { /* * Support multi thread inference (mthreads>1) * Each thread holds their own CudaEngine * They can run on different cuda device through trt::EngineConfig setting */ int mthreads = 1; std::vector workers; std::vector> models; for(int i = 0; i < mthreads; ++i) { models.emplace_back( std::make_shared(engineCfg, trt::IBN::A) ); // For IBNB: trt::IBN::B } for(int i = 0; i < mthreads; ++i) { workers.emplace_back( std::thread(run_infer, models[i]) ); } for(auto & worker : workers) { worker.join(); } return 0; } else { std::cerr << "arguments not right!" << std::endl; std::cerr << "./ibnnet -s // serialize model to plan file" << std::endl; std::cerr << "./ibnnet -d // deserialize plan file and run inference" << std::endl; return -1; } } ================================================ FILE: ibnnet/utils.cpp ================================================ #include "utils.h" // Load weights from files shared with TensorRT samples. 
// TensorRT weight files have a simple space delimited format: // [type] [size] std::map loadWeights(const std::string file) { std::cout << "Loading weights: " << file << std::endl; std::map weightMap; // Open weights file std::ifstream input(file); assert(input.is_open() && "Unable to load weight file."); // Read number of weight blobs int32_t count; input >> count; assert(count > 0 && "Invalid weight map file."); while (count--) { Weights wt{DataType::kFLOAT, nullptr, 0}; uint32_t size; // Read name and type of blob std::string name; input >> name >> std::dec >> size; wt.type = DataType::kFLOAT; // Load blob uint32_t* val = reinterpret_cast(malloc(sizeof(val) * size)); for (uint32_t x = 0, y = size; x < y; ++x) { input >> std::hex >> val[x]; } wt.values = val; wt.count = size; weightMap[name] = wt; } return weightMap; } ================================================ FILE: ibnnet/utils.h ================================================ #pragma once #include #include "NvInfer.h" #include "cuda_runtime_api.h" #include "assert.h" #include #include #include using namespace nvinfer1; #define CHECK(status) \ do \ { \ auto ret = (status); \ if (ret != 0) \ { \ std::cout << "Cuda failure: " << ret; \ abort(); \ } \ } while (0) template std::unique_ptr make_unique(Args&&... 
args) { return std::unique_ptr(new T(std::forward(args)...)); } std::map loadWeights(const std::string file); ================================================ FILE: inception/inceptionv3/CMakeLists.txt ================================================ cmake_minimum_required(VERSION 2.6) project(inception) add_definitions(-std=c++11) option(CUDA_USE_STATIC_CUDA_RUNTIME OFF) set(CMAKE_CXX_STANDARD 11) set(CMAKE_BUILD_TYPE Debug) include_directories(${PROJECT_SOURCE_DIR}/include) # include and link dirs of cuda and tensorrt, you need adapt them if yours are different # cuda include_directories(/usr/local/cuda/include) link_directories(/usr/local/cuda/lib64) # tensorrt include_directories(/usr/include/x86_64-linux-gnu/) link_directories(/usr/lib/x86_64-linux-gnu/) add_executable(inception ${PROJECT_SOURCE_DIR}/inception_v3.cpp) target_link_libraries(inception nvinfer) target_link_libraries(inception cudart) add_definitions(-O2 -pthread) ================================================ FILE: inception/inceptionv3/README.md ================================================ # Inception v3 Inception v3 model architecture from "Rethinking the Inception Architecture for Computer Vision" . For the details, you can refer to [pytorchx/inception](https://github.com/wang-xinyu/pytorchx/tree/master/inception) Following tricks are used in this inception: - For pooling layer with padding, we need pay attention to see if padding is included or excluded while calculating average number. Pytorch includes padding while doing avgPool by default, but Tensorrt doesn't. So for pooling layer with padding, we need `setAverageCountExcludesPadding(false)` in tensorrt. - Batchnorm layer, implemented by scale layer. ``` // 1. generate inception.wts from [pytorchx/inception](https://github.com/wang-xinyu/pytorchx/tree/master/inception) // 2. put inception.wts into tensorrtx/inception // 3. build and run cd tensorrtx/inception mkdir build cd build cmake .. 
make sudo ./inception -s // serialize model to plan file i.e. 'inception.engine' sudo ./inception -d // deserialize plan file and run inference // 4. see if the output is same as pytorchx/inception ``` ================================================ FILE: inception/inceptionv3/inception_v3.cpp ================================================ #include "NvInfer.h" #include "cuda_runtime_api.h" #include "logging.h" #include #include #include #include #include #include #include #define CHECK(status) \ do\ {\ auto ret = (status);\ if (ret != 0)\ {\ std::cerr << "Cuda failure: " << ret << std::endl;\ abort();\ }\ } while (0) // stuff we know about the network and the input/output blobs static const int INPUT_H = 299; static const int INPUT_W = 299; static const int OUTPUT_SIZE = 1000; const char* INPUT_BLOB_NAME = "data"; const char* OUTPUT_BLOB_NAME = "prob"; using namespace nvinfer1; static Logger gLogger; // Load weights from files shared with TensorRT samples. // TensorRT weight files have a simple space delimited format: // [type] [size] std::map loadWeights(const std::string file) { std::cout << "Loading weights: " << file << std::endl; std::map weightMap; // Open weights file std::ifstream input(file); assert(input.is_open() && "Unable to load weight file."); // Read number of weight blobs int32_t count; input >> count; assert(count > 0 && "Invalid weight map file."); while (count--) { Weights wt{DataType::kFLOAT, nullptr, 0}; uint32_t size; // Read name and type of blob std::string name; input >> name >> std::dec >> size; wt.type = DataType::kFLOAT; // Load blob uint32_t* val = reinterpret_cast(malloc(sizeof(val) * size)); for (uint32_t x = 0, y = size; x < y; ++x) { input >> std::hex >> val[x]; } wt.values = val; wt.count = size; weightMap[name] = wt; } return weightMap; } IScaleLayer* addBatchNorm2d(INetworkDefinition *network, std::map& weightMap, ITensor& input, std::string lname, float eps) { float *gamma = (float*)weightMap[lname + ".weight"].values; float 
*beta = (float*)weightMap[lname + ".bias"].values; float *mean = (float*)weightMap[lname + ".running_mean"].values; float *var = (float*)weightMap[lname + ".running_var"].values; int len = weightMap[lname + ".running_var"].count; std::cout << "len " << len << std::endl; float *scval = reinterpret_cast(malloc(sizeof(float) * len)); for (int i = 0; i < len; i++) { scval[i] = gamma[i] / sqrt(var[i] + eps); } Weights scale{DataType::kFLOAT, scval, len}; float *shval = reinterpret_cast(malloc(sizeof(float) * len)); for (int i = 0; i < len; i++) { shval[i] = beta[i] - mean[i] * gamma[i] / sqrt(var[i] + eps); } Weights shift{DataType::kFLOAT, shval, len}; float *pval = reinterpret_cast(malloc(sizeof(float) * len)); for (int i = 0; i < len; i++) { pval[i] = 1.0; } Weights power{DataType::kFLOAT, pval, len}; weightMap[lname + ".scale"] = scale; weightMap[lname + ".shift"] = shift; weightMap[lname + ".power"] = power; IScaleLayer* scale_1 = network->addScale(input, ScaleMode::kCHANNEL, shift, scale, power); assert(scale_1); return scale_1; } IActivationLayer* basicConv2d(INetworkDefinition *network, std::map& weightMap, ITensor& input, int outch, DimsHW ksize, int s, DimsHW p, std::string lname) { Weights emptywts{DataType::kFLOAT, nullptr, 0}; IConvolutionLayer* conv1 = network->addConvolutionNd(input, outch, ksize, weightMap[lname + "conv.weight"], emptywts); assert(conv1); conv1->setStrideNd(DimsHW{s, s}); conv1->setPaddingNd(p); IScaleLayer* bn1 = addBatchNorm2d(network, weightMap, *conv1->getOutput(0), lname + "bn", 1e-3); IActivationLayer* relu1 = network->addActivation(*bn1->getOutput(0), ActivationType::kRELU); assert(relu1); return relu1; } IConcatenationLayer* inceptionA(INetworkDefinition *network, std::map& weightMap, ITensor& input, std::string lname, int pool_proj) { IActivationLayer* relu1 = basicConv2d(network, weightMap, input, 64, DimsHW{1, 1}, 1, DimsHW{0, 0}, lname + "branch1x1."); IActivationLayer* relu2 = basicConv2d(network, weightMap, input, 48, 
DimsHW{1, 1}, 1, DimsHW{0, 0}, lname + "branch5x5_1."); relu2 = basicConv2d(network, weightMap, *relu2->getOutput(0), 64, DimsHW{5, 5}, 1, DimsHW{2, 2}, lname + "branch5x5_2."); IActivationLayer* relu3 = basicConv2d(network, weightMap, input, 64, DimsHW{1, 1}, 1, DimsHW{0, 0}, lname + "branch3x3dbl_1."); relu3 = basicConv2d(network, weightMap, *relu3->getOutput(0), 96, DimsHW{3, 3}, 1, DimsHW{1, 1}, lname + "branch3x3dbl_2."); relu3 = basicConv2d(network, weightMap, *relu3->getOutput(0), 96, DimsHW{3, 3}, 1, DimsHW{1, 1}, lname + "branch3x3dbl_3."); IPoolingLayer* pool1 = network->addPoolingNd(input, PoolingType::kAVERAGE, DimsHW{3, 3}); assert(pool1); pool1->setStrideNd(DimsHW{1, 1}); pool1->setPaddingNd(DimsHW{1, 1}); pool1->setAverageCountExcludesPadding(false); IActivationLayer* relu4 = basicConv2d(network, weightMap, *pool1->getOutput(0), pool_proj, DimsHW{1, 1}, 1, DimsHW{0, 0}, lname + "branch_pool."); ITensor* inputTensors[] = {relu1->getOutput(0), relu2->getOutput(0), relu3->getOutput(0), relu4->getOutput(0)}; IConcatenationLayer* cat1 = network->addConcatenation(inputTensors, 4); assert(cat1); return cat1; } IConcatenationLayer* inceptionB(INetworkDefinition *network, std::map& weightMap, ITensor& input, std::string lname) { IActivationLayer* relu1 = basicConv2d(network, weightMap, input, 384, DimsHW{3, 3}, 2, DimsHW{0, 0}, lname + "branch3x3."); IActivationLayer* relu2 = basicConv2d(network, weightMap, input, 64, DimsHW{1, 1}, 1, DimsHW{0, 0}, lname + "branch3x3dbl_1."); relu2 = basicConv2d(network, weightMap, *relu2->getOutput(0), 96, DimsHW{3, 3}, 1, DimsHW{1, 1}, lname + "branch3x3dbl_2."); relu2 = basicConv2d(network, weightMap, *relu2->getOutput(0), 96, DimsHW{3, 3}, 2, DimsHW{0, 0}, lname + "branch3x3dbl_3."); IPoolingLayer* pool1 = network->addPoolingNd(input, PoolingType::kMAX, DimsHW{3, 3}); assert(pool1); pool1->setStrideNd(DimsHW{2, 2}); ITensor* inputTensors[] = {relu1->getOutput(0), relu2->getOutput(0), pool1->getOutput(0)}; 
IConcatenationLayer* cat1 = network->addConcatenation(inputTensors, 3); assert(cat1); return cat1; } IConcatenationLayer* inceptionC(INetworkDefinition *network, std::map& weightMap, ITensor& input, std::string lname, int c7) { IActivationLayer* relu1 = basicConv2d(network, weightMap, input, 192, DimsHW{1, 1}, 1, DimsHW{0, 0}, lname + "branch1x1."); IActivationLayer* relu2 = basicConv2d(network, weightMap, input, c7, DimsHW{1, 1}, 1, DimsHW{0, 0}, lname + "branch7x7_1."); relu2 = basicConv2d(network, weightMap, *relu2->getOutput(0), c7, DimsHW{1, 7}, 1, DimsHW{0, 3}, lname + "branch7x7_2."); relu2 = basicConv2d(network, weightMap, *relu2->getOutput(0), 192, DimsHW{7, 1}, 1, DimsHW{3, 0}, lname + "branch7x7_3."); IActivationLayer* relu3 = basicConv2d(network, weightMap, input, c7, DimsHW{1, 1}, 1, DimsHW{0, 0}, lname + "branch7x7dbl_1."); relu3 = basicConv2d(network, weightMap, *relu3->getOutput(0), c7, DimsHW{7, 1}, 1, DimsHW{3, 0}, lname + "branch7x7dbl_2."); relu3 = basicConv2d(network, weightMap, *relu3->getOutput(0), c7, DimsHW{1, 7}, 1, DimsHW{0, 3}, lname + "branch7x7dbl_3."); relu3 = basicConv2d(network, weightMap, *relu3->getOutput(0), c7, DimsHW{7, 1}, 1, DimsHW{3, 0}, lname + "branch7x7dbl_4."); relu3 = basicConv2d(network, weightMap, *relu3->getOutput(0), 192, DimsHW{1, 7}, 1, DimsHW{0, 3}, lname + "branch7x7dbl_5."); IPoolingLayer* pool1 = network->addPoolingNd(input, PoolingType::kAVERAGE, DimsHW{3, 3}); assert(pool1); pool1->setStrideNd(DimsHW{1, 1}); pool1->setPaddingNd(DimsHW{1, 1}); pool1->setAverageCountExcludesPadding(false); IActivationLayer* relu4 = basicConv2d(network, weightMap, *pool1->getOutput(0), 192, DimsHW{1, 1}, 1, DimsHW{0, 0}, lname + "branch_pool."); ITensor* inputTensors[] = {relu1->getOutput(0), relu2->getOutput(0), relu3->getOutput(0), relu4->getOutput(0)}; IConcatenationLayer* cat1 = network->addConcatenation(inputTensors, 4); assert(cat1); return cat1; } IConcatenationLayer* inceptionD(INetworkDefinition *network, std::map& 
weightMap, ITensor& input, std::string lname) { IActivationLayer* relu1 = basicConv2d(network, weightMap, input, 192, DimsHW{1, 1}, 1, DimsHW{0, 0}, lname + "branch3x3_1."); relu1 = basicConv2d(network, weightMap, *relu1->getOutput(0), 320, DimsHW{3, 3}, 2, DimsHW{0, 0}, lname + "branch3x3_2."); IActivationLayer* relu2 = basicConv2d(network, weightMap, input, 192, DimsHW{1, 1}, 1, DimsHW{0, 0}, lname + "branch7x7x3_1."); relu2 = basicConv2d(network, weightMap, *relu2->getOutput(0), 192, DimsHW{1, 7}, 1, DimsHW{0, 3}, lname + "branch7x7x3_2."); relu2 = basicConv2d(network, weightMap, *relu2->getOutput(0), 192, DimsHW{7, 1}, 1, DimsHW{3, 0}, lname + "branch7x7x3_3."); relu2 = basicConv2d(network, weightMap, *relu2->getOutput(0), 192, DimsHW{3, 3}, 2, DimsHW{0, 0}, lname + "branch7x7x3_4."); IPoolingLayer* pool1 = network->addPoolingNd(input, PoolingType::kMAX, DimsHW{3, 3}); assert(pool1); pool1->setStrideNd(DimsHW{2, 2}); ITensor* inputTensors[] = {relu1->getOutput(0), relu2->getOutput(0), pool1->getOutput(0)}; IConcatenationLayer* cat1 = network->addConcatenation(inputTensors, 3); assert(cat1); return cat1; } IConcatenationLayer* inceptionE(INetworkDefinition *network, std::map& weightMap, ITensor& input, std::string lname) { IActivationLayer* relu1 = basicConv2d(network, weightMap, input, 320, DimsHW{1, 1}, 1, DimsHW{0, 0}, lname + "branch1x1."); IActivationLayer* relu2 = basicConv2d(network, weightMap, input, 384, DimsHW{1, 1}, 1, DimsHW{0, 0}, lname + "branch3x3_1."); IActivationLayer* relu2a = basicConv2d(network, weightMap, *relu2->getOutput(0), 384, DimsHW{1, 3}, 1, DimsHW{0, 1}, lname + "branch3x3_2a."); IActivationLayer* relu2b = basicConv2d(network, weightMap, *relu2->getOutput(0), 384, DimsHW{3, 1}, 1, DimsHW{1, 0}, lname + "branch3x3_2b."); ITensor* inputTensors[] = {relu2a->getOutput(0), relu2b->getOutput(0)}; IConcatenationLayer* cat1 = network->addConcatenation(inputTensors, 2); assert(cat1); IActivationLayer* relu3 = basicConv2d(network, weightMap, 
input, 448, DimsHW{1, 1}, 1, DimsHW{0, 0}, lname + "branch3x3dbl_1."); relu3 = basicConv2d(network, weightMap, *relu3->getOutput(0), 384, DimsHW{3, 3}, 1, DimsHW{1, 1}, lname + "branch3x3dbl_2."); IActivationLayer* relu3a = basicConv2d(network, weightMap, *relu3->getOutput(0), 384, DimsHW{1, 3}, 1, DimsHW{0, 1}, lname + "branch3x3dbl_3a."); IActivationLayer* relu3b = basicConv2d(network, weightMap, *relu3->getOutput(0), 384, DimsHW{3, 1}, 1, DimsHW{1, 0}, lname + "branch3x3dbl_3b."); ITensor* inputTensors1[] = {relu3a->getOutput(0), relu3b->getOutput(0)}; IConcatenationLayer* cat2 = network->addConcatenation(inputTensors1, 2); assert(cat2); IPoolingLayer* pool1 = network->addPoolingNd(input, PoolingType::kAVERAGE, DimsHW{3, 3}); assert(pool1); pool1->setStrideNd(DimsHW{1, 1}); pool1->setPaddingNd(DimsHW{1, 1}); pool1->setAverageCountExcludesPadding(false); IActivationLayer* relu4 = basicConv2d(network, weightMap, *pool1->getOutput(0), 192, DimsHW{1, 1}, 1, DimsHW{0, 0}, lname + "branch_pool."); ITensor* inputTensors2[] = {relu1->getOutput(0), cat1->getOutput(0), cat2->getOutput(0), relu4->getOutput(0)}; IConcatenationLayer* cat3 = network->addConcatenation(inputTensors2, 4); assert(cat3); return cat3; } // Creat the engine using only the API and not any parser. 
ICudaEngine* createEngine(unsigned int maxBatchSize, IBuilder* builder, IBuilderConfig* config, DataType dt) { INetworkDefinition* network = builder->createNetworkV2(0U); // Create input tensor of shape { 1, 1, 32, 32 } with name INPUT_BLOB_NAME ITensor* data = network->addInput(INPUT_BLOB_NAME, dt, Dims3{3, INPUT_H, INPUT_W}); assert(data); std::map weightMap = loadWeights("../inception.wts"); Weights emptywts{DataType::kFLOAT, nullptr, 0}; float shval[3] = {(0.485 - 0.5) / 0.5, (0.456 - 0.5) / 0.5, (0.406 - 0.5) / 0.5}; float scval[3] = {0.229 / 0.5, 0.224 / 0.5, 0.225 / 0.5}; float pval[3] = {1.0, 1.0, 1.0}; Weights shift{DataType::kFLOAT, shval, 3}; Weights scale{DataType::kFLOAT, scval, 3}; Weights power{DataType::kFLOAT, pval, 3}; IScaleLayer* scale1 = network->addScale(*data, ScaleMode::kCHANNEL, shift, scale, power); assert(scale1); IActivationLayer* relu1 = basicConv2d(network, weightMap, *scale1->getOutput(0), 32, DimsHW{3, 3}, 2, DimsHW{0, 0}, "Conv2d_1a_3x3."); relu1 = basicConv2d(network, weightMap, *relu1->getOutput(0), 32, DimsHW{3, 3}, 1, DimsHW{0, 0}, "Conv2d_2a_3x3."); relu1 = basicConv2d(network, weightMap, *relu1->getOutput(0), 64, DimsHW{3, 3}, 1, DimsHW{1, 1}, "Conv2d_2b_3x3."); IPoolingLayer* pool1 = network->addPoolingNd(*relu1->getOutput(0), PoolingType::kMAX, DimsHW{3, 3}); assert(pool1); pool1->setStrideNd(DimsHW{2, 2}); relu1 = basicConv2d(network, weightMap, *pool1->getOutput(0), 80, DimsHW{1, 1}, 1, DimsHW{0, 0}, "Conv2d_3b_1x1."); relu1 = basicConv2d(network, weightMap, *relu1->getOutput(0), 192, DimsHW{3, 3}, 1, DimsHW{0, 0}, "Conv2d_4a_3x3."); pool1 = network->addPoolingNd(*relu1->getOutput(0), PoolingType::kMAX, DimsHW{3, 3}); pool1->setStrideNd(DimsHW{2, 2}); auto cat1 = inceptionA(network, weightMap, *pool1->getOutput(0), "Mixed_5b.", 32); cat1 = inceptionA(network, weightMap, *cat1->getOutput(0), "Mixed_5c.", 64); cat1 = inceptionA(network, weightMap, *cat1->getOutput(0), "Mixed_5d.", 64); cat1 = inceptionB(network, weightMap, 
*cat1->getOutput(0), "Mixed_6a."); cat1 = inceptionC(network, weightMap, *cat1->getOutput(0), "Mixed_6b.", 128); cat1 = inceptionC(network, weightMap, *cat1->getOutput(0), "Mixed_6c.", 160); cat1 = inceptionC(network, weightMap, *cat1->getOutput(0), "Mixed_6d.", 160); cat1 = inceptionC(network, weightMap, *cat1->getOutput(0), "Mixed_6e.", 192); cat1 = inceptionD(network, weightMap, *cat1->getOutput(0), "Mixed_7a."); cat1 = inceptionE(network, weightMap, *cat1->getOutput(0), "Mixed_7b."); cat1 = inceptionE(network, weightMap, *cat1->getOutput(0), "Mixed_7c."); IPoolingLayer* pool2 = network->addPoolingNd(*cat1->getOutput(0), PoolingType::kAVERAGE, DimsHW{8, 8}); assert(pool2); IFullyConnectedLayer* fc1 = network->addFullyConnected(*pool2->getOutput(0), 1000, weightMap["fc.weight"], weightMap["fc.bias"]); assert(fc1); fc1->getOutput(0)->setName(OUTPUT_BLOB_NAME); std::cout << "set name out" << std::endl; network->markOutput(*fc1->getOutput(0)); // Build engine builder->setMaxBatchSize(maxBatchSize); config->setMaxWorkspaceSize(1 << 20); ICudaEngine* engine = builder->buildEngineWithConfig(*network, *config); std::cout << "build out" << std::endl; // Don't need the network any more network->destroy(); // Release host memory for (auto& mem : weightMap) { free((void*) (mem.second.values)); } return engine; } void APIToModel(unsigned int maxBatchSize, IHostMemory** modelStream) { // Create builder IBuilder* builder = createInferBuilder(gLogger); IBuilderConfig* config = builder->createBuilderConfig(); // Create model to populate the network, then set the outputs and create an engine ICudaEngine* engine = createEngine(maxBatchSize, builder, config, DataType::kFLOAT); assert(engine != nullptr); // Serialize the engine (*modelStream) = engine->serialize(); // Close everything down engine->destroy(); builder->destroy(); config->destroy(); } void doInference(IExecutionContext& context, float* input, float* output, int batchSize) { const ICudaEngine& engine = 
context.getEngine(); // Pointers to input and output device buffers to pass to engine. // Engine requires exactly IEngine::getNbBindings() number of buffers. assert(engine.getNbBindings() == 2); void* buffers[2]; // In order to bind the buffers, we need to know the names of the input and output tensors. // Note that indices are guaranteed to be less than IEngine::getNbBindings() const int inputIndex = engine.getBindingIndex(INPUT_BLOB_NAME); const int outputIndex = engine.getBindingIndex(OUTPUT_BLOB_NAME); // Create GPU buffers on device CHECK(cudaMalloc(&buffers[inputIndex], batchSize * 3 * INPUT_H * INPUT_W * sizeof(float))); CHECK(cudaMalloc(&buffers[outputIndex], batchSize * OUTPUT_SIZE * sizeof(float))); // Create stream cudaStream_t stream; CHECK(cudaStreamCreate(&stream)); // DMA input batch data to device, infer on the batch asynchronously, and DMA output back to host CHECK(cudaMemcpyAsync(buffers[inputIndex], input, batchSize * 3 * INPUT_H * INPUT_W * sizeof(float), cudaMemcpyHostToDevice, stream)); context.enqueue(batchSize, buffers, stream, nullptr); CHECK(cudaMemcpyAsync(output, buffers[outputIndex], batchSize * OUTPUT_SIZE * sizeof(float), cudaMemcpyDeviceToHost, stream)); cudaStreamSynchronize(stream); // Release stream and buffers cudaStreamDestroy(stream); CHECK(cudaFree(buffers[inputIndex])); CHECK(cudaFree(buffers[outputIndex])); } int main(int argc, char** argv) { if (argc != 2) { std::cerr << "arguments not right!" 
<< std::endl; std::cerr << "./inception -s // serialize model to plan file" << std::endl; std::cerr << "./inception -d // deserialize plan file and run inference" << std::endl; return -1; } // create a model using the API directly and serialize it to a stream char *trtModelStream{nullptr}; size_t size{0}; if (std::string(argv[1]) == "-s") { IHostMemory* modelStream{nullptr}; APIToModel(1, &modelStream); assert(modelStream != nullptr); std::ofstream p("inception.engine", std::ios::binary); if (!p) { std::cerr << "could not open plan output file" << std::endl; return -1; } p.write(reinterpret_cast(modelStream->data()), modelStream->size()); modelStream->destroy(); return 1; } else if (std::string(argv[1]) == "-d") { std::ifstream file("inception.engine", std::ios::binary); if (file.good()) { file.seekg(0, file.end); size = file.tellg(); file.seekg(0, file.beg); trtModelStream = new char[size]; assert(trtModelStream); file.read(trtModelStream, size); file.close(); } } else { return -1; } // Subtract mean from image static float data[3 * INPUT_H * INPUT_W]; for (int i = 0; i < 3 * INPUT_H * INPUT_W; i++) data[i] = 1.0; IRuntime* runtime = createInferRuntime(gLogger); assert(runtime != nullptr); ICudaEngine* engine = runtime->deserializeCudaEngine(trtModelStream, size, nullptr); assert(engine != nullptr); IExecutionContext* context = engine->createExecutionContext(); assert(context != nullptr); delete[] trtModelStream; // Run inference static float prob[OUTPUT_SIZE]; for (int i = 0; i < 100; i++) { auto start = std::chrono::system_clock::now(); doInference(*context, data, prob, 1); auto end = std::chrono::system_clock::now(); std::cout << std::chrono::duration_cast(end - start).count() << "ms" << std::endl; } // Destroy the engine context->destroy(); engine->destroy(); runtime->destroy(); // Print histogram of the output distribution std::cout << "\nOutput:\n\n"; for (unsigned int i = 0; i < OUTPUT_SIZE; i++) { std::cout << prob[i] << ", "; if (i % 10 == 0) std::cout << 
i / 10 << std::endl; } std::cout << std::endl; return 0; } ================================================ FILE: inception/inceptionv3/logging.h ================================================ /* * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ #ifndef TENSORRT_LOGGING_H #define TENSORRT_LOGGING_H #include "NvInferRuntimeCommon.h" #include #include #include #include #include #include #include using Severity = nvinfer1::ILogger::Severity; class LogStreamConsumerBuffer : public std::stringbuf { public: LogStreamConsumerBuffer(std::ostream& stream, const std::string& prefix, bool shouldLog) : mOutput(stream) , mPrefix(prefix) , mShouldLog(shouldLog) { } LogStreamConsumerBuffer(LogStreamConsumerBuffer&& other) : mOutput(other.mOutput) { } ~LogStreamConsumerBuffer() { // std::streambuf::pbase() gives a pointer to the beginning of the buffered part of the output sequence // std::streambuf::pptr() gives a pointer to the current position of the output sequence // if the pointer to the beginning is not equal to the pointer to the current position, // call putOutput() to log the output to the stream if (pbase() != pptr()) { putOutput(); } } // synchronizes the stream buffer and returns 0 on success // synchronizing the stream buffer consists of inserting the buffer contents into the stream, // resetting the buffer and flushing the stream virtual int sync() { putOutput(); return 0; } void putOutput() { if (mShouldLog) { // 
prepend timestamp std::time_t timestamp = std::time(nullptr); tm* tm_local = std::localtime(×tamp); std::cout << "["; std::cout << std::setw(2) << std::setfill('0') << 1 + tm_local->tm_mon << "/"; std::cout << std::setw(2) << std::setfill('0') << tm_local->tm_mday << "/"; std::cout << std::setw(4) << std::setfill('0') << 1900 + tm_local->tm_year << "-"; std::cout << std::setw(2) << std::setfill('0') << tm_local->tm_hour << ":"; std::cout << std::setw(2) << std::setfill('0') << tm_local->tm_min << ":"; std::cout << std::setw(2) << std::setfill('0') << tm_local->tm_sec << "] "; // std::stringbuf::str() gets the string contents of the buffer // insert the buffer contents pre-appended by the appropriate prefix into the stream mOutput << mPrefix << str(); // set the buffer to empty str(""); // flush the stream mOutput.flush(); } } void setShouldLog(bool shouldLog) { mShouldLog = shouldLog; } private: std::ostream& mOutput; std::string mPrefix; bool mShouldLog; }; //! //! \class LogStreamConsumerBase //! \brief Convenience object used to initialize LogStreamConsumerBuffer before std::ostream in LogStreamConsumer //! class LogStreamConsumerBase { public: LogStreamConsumerBase(std::ostream& stream, const std::string& prefix, bool shouldLog) : mBuffer(stream, prefix, shouldLog) { } protected: LogStreamConsumerBuffer mBuffer; }; //! //! \class LogStreamConsumer //! \brief Convenience object used to facilitate use of C++ stream syntax when logging messages. //! Order of base classes is LogStreamConsumerBase and then std::ostream. //! This is because the LogStreamConsumerBase class is used to initialize the LogStreamConsumerBuffer member field //! in LogStreamConsumer and then the address of the buffer is passed to std::ostream. //! This is necessary to prevent the address of an uninitialized buffer from being passed to std::ostream. //! Please do not change the order of the parent classes. //! 
class LogStreamConsumer : protected LogStreamConsumerBase, public std::ostream
{
public:
    //! \brief Creates a LogStreamConsumer which logs messages with level severity.
    //!  Reportable severity determines if the messages are severe enough to be logged.
    //!
    //! \param reportableSeverity Threshold; the message is logged when severity <= reportableSeverity.
    //! \param severity Severity of the messages written through this object; it also selects the
    //!  destination stream and the prefix.
    LogStreamConsumer(Severity reportableSeverity, Severity severity)
        : LogStreamConsumerBase(severityOstream(severity), severityPrefix(severity), severity <= reportableSeverity)
        , std::ostream(&mBuffer) // links the stream buffer with the stream
        , mShouldLog(severity <= reportableSeverity)
        , mSeverity(severity)
    {
    }

    //! \brief Move constructor.
    //!
    //! A new buffer is built from the severity and logging flag of \p other, and this stream is attached
    //! to that new buffer rather than to the buffer owned by \p other.
    LogStreamConsumer(LogStreamConsumer&& other)
        : LogStreamConsumerBase(severityOstream(other.mSeverity), severityPrefix(other.mSeverity), other.mShouldLog)
        , std::ostream(&mBuffer) // links the stream buffer with the stream
        , mShouldLog(other.mShouldLog)
        , mSeverity(other.mSeverity)
    {
    }

    //! \brief Re-evaluates whether messages of this object's severity should be logged and forwards the
    //!  result to the underlying buffer.
    void setReportableSeverity(Severity reportableSeverity)
    {
        mShouldLog = mSeverity <= reportableSeverity;
        mBuffer.setShouldLog(mShouldLog);
    }

private:
    //! \brief Selects the destination stream: std::cout when severity >= Severity::kINFO, std::cerr otherwise.
    static std::ostream& severityOstream(Severity severity)
    {
        return severity >= Severity::kINFO ? std::cout : std::cerr;
    }

    //! \brief Returns the bracketed one-letter tag placed in front of messages of the given severity.
    //!  An unknown severity trips the assertion and yields an empty prefix.
    static std::string severityPrefix(Severity severity)
    {
        switch (severity)
        {
        case Severity::kINTERNAL_ERROR: return "[F] ";
        case Severity::kERROR: return "[E] ";
        case Severity::kWARNING: return "[W] ";
        case Severity::kINFO: return "[I] ";
        case Severity::kVERBOSE: return "[V] ";
        default: assert(0); return "";
        }
    }

    bool mShouldLog;    // cached result of mSeverity <= reportable severity
    Severity mSeverity; // severity of the messages written through this object
};

//! \class Logger
//!
//! \brief Class which manages logging of TensorRT tools and samples
//!
//! \details This class provides a common interface for TensorRT tools and samples to log information to the console,
//! and supports logging two types of messages:
//!
//! - Debugging messages with an associated severity (info, warning, error, or internal error/fatal)
//! - Test pass/fail messages
//!
//! The advantage of having all samples use this class for logging as opposed to emitting directly to stdout/stderr is
//!
that the logic for controlling the verbosity and formatting of sample output is centralized in one location. //! //! In the future, this class could be extended to support dumping test results to a file in some standard format //! (for example, JUnit XML), and providing additional metadata (e.g. timing the duration of a test run). //! //! TODO: For backwards compatibility with existing samples, this class inherits directly from the nvinfer1::ILogger //! interface, which is problematic since there isn't a clean separation between messages coming from the TensorRT //! library and messages coming from the sample. //! //! In the future (once all samples are updated to use Logger::getTRTLogger() to access the ILogger) we can refactor the //! class to eliminate the inheritance and instead make the nvinfer1::ILogger implementation a member of the Logger //! object. class Logger : public nvinfer1::ILogger { public: Logger(Severity severity = Severity::kWARNING) : mReportableSeverity(severity) { } //! //! \enum TestResult //! \brief Represents the state of a given test //! enum class TestResult { kRUNNING, //!< The test is running kPASSED, //!< The test passed kFAILED, //!< The test failed kWAIVED //!< The test was waived }; //! //! \brief Forward-compatible method for retrieving the nvinfer::ILogger associated with this Logger //! \return The nvinfer1::ILogger associated with this Logger //! //! TODO Once all samples are updated to use this method to register the logger with TensorRT, //! we can eliminate the inheritance of Logger from ILogger //! nvinfer1::ILogger& getTRTLogger() { return *this; } //! //! \brief Implementation of the nvinfer1::ILogger::log() virtual method //! //! Note samples should not be calling this function directly; it will eventually go away once we eliminate the //! inheritance from nvinfer1::ILogger //! 
void log(Severity severity, const char* msg) override { LogStreamConsumer(mReportableSeverity, severity) << "[TRT] " << std::string(msg) << std::endl; } //! //! \brief Method for controlling the verbosity of logging output //! //! \param severity The logger will only emit messages that have severity of this level or higher. //! void setReportableSeverity(Severity severity) { mReportableSeverity = severity; } //! //! \brief Opaque handle that holds logging information for a particular test //! //! This object is an opaque handle to information used by the Logger to print test results. //! The sample must call Logger::defineTest() in order to obtain a TestAtom that can be used //! with Logger::reportTest{Start,End}(). //! class TestAtom { public: TestAtom(TestAtom&&) = default; private: friend class Logger; TestAtom(bool started, const std::string& name, const std::string& cmdline) : mStarted(started) , mName(name) , mCmdline(cmdline) { } bool mStarted; std::string mName; std::string mCmdline; }; //! //! \brief Define a test for logging //! //! \param[in] name The name of the test. This should be a string starting with //! "TensorRT" and containing dot-separated strings containing //! the characters [A-Za-z0-9_]. //! For example, "TensorRT.sample_googlenet" //! \param[in] cmdline The command line used to reproduce the test // //! \return a TestAtom that can be used in Logger::reportTest{Start,End}(). //! static TestAtom defineTest(const std::string& name, const std::string& cmdline) { return TestAtom(false, name, cmdline); } //! //! \brief A convenience overloaded version of defineTest() that accepts an array of command-line arguments //! as input //! //! \param[in] name The name of the test //! \param[in] argc The number of command-line arguments //! \param[in] argv The array of command-line arguments (given as C strings) //! //! \return a TestAtom that can be used in Logger::reportTest{Start,End}(). 
static TestAtom defineTest(const std::string& name, int argc, char const* const* argv) { auto cmdline = genCmdlineString(argc, argv); return defineTest(name, cmdline); } //! //! \brief Report that a test has started. //! //! \pre reportTestStart() has not been called yet for the given testAtom //! //! \param[in] testAtom The handle to the test that has started //! static void reportTestStart(TestAtom& testAtom) { reportTestResult(testAtom, TestResult::kRUNNING); assert(!testAtom.mStarted); testAtom.mStarted = true; } //! //! \brief Report that a test has ended. //! //! \pre reportTestStart() has been called for the given testAtom //! //! \param[in] testAtom The handle to the test that has ended //! \param[in] result The result of the test. Should be one of TestResult::kPASSED, //! TestResult::kFAILED, TestResult::kWAIVED //! static void reportTestEnd(const TestAtom& testAtom, TestResult result) { assert(result != TestResult::kRUNNING); assert(testAtom.mStarted); reportTestResult(testAtom, result); } static int reportPass(const TestAtom& testAtom) { reportTestEnd(testAtom, TestResult::kPASSED); return EXIT_SUCCESS; } static int reportFail(const TestAtom& testAtom) { reportTestEnd(testAtom, TestResult::kFAILED); return EXIT_FAILURE; } static int reportWaive(const TestAtom& testAtom) { reportTestEnd(testAtom, TestResult::kWAIVED); return EXIT_SUCCESS; } static int reportTest(const TestAtom& testAtom, bool pass) { return pass ? reportPass(testAtom) : reportFail(testAtom); } Severity getReportableSeverity() const { return mReportableSeverity; } private: //! //! \brief returns an appropriate string for prefixing a log message with the given severity //! static const char* severityPrefix(Severity severity) { switch (severity) { case Severity::kINTERNAL_ERROR: return "[F] "; case Severity::kERROR: return "[E] "; case Severity::kWARNING: return "[W] "; case Severity::kINFO: return "[I] "; case Severity::kVERBOSE: return "[V] "; default: assert(0); return ""; } } //! //! 
\brief returns an appropriate string for prefixing a test result message with the given result //! static const char* testResultString(TestResult result) { switch (result) { case TestResult::kRUNNING: return "RUNNING"; case TestResult::kPASSED: return "PASSED"; case TestResult::kFAILED: return "FAILED"; case TestResult::kWAIVED: return "WAIVED"; default: assert(0); return ""; } } //! //! \brief returns an appropriate output stream (cout or cerr) to use with the given severity //! static std::ostream& severityOstream(Severity severity) { return severity >= Severity::kINFO ? std::cout : std::cerr; } //! //! \brief method that implements logging test results //! static void reportTestResult(const TestAtom& testAtom, TestResult result) { severityOstream(Severity::kINFO) << "&&&& " << testResultString(result) << " " << testAtom.mName << " # " << testAtom.mCmdline << std::endl; } //! //! \brief generate a command line string from the given (argc, argv) values //! static std::string genCmdlineString(int argc, char const* const* argv) { std::stringstream ss; for (int i = 0; i < argc; i++) { if (i > 0) ss << " "; ss << argv[i]; } return ss.str(); } Severity mReportableSeverity; }; namespace { //! //! \brief produces a LogStreamConsumer object that can be used to log messages of severity kVERBOSE //! //! Example usage: //! //! LOG_VERBOSE(logger) << "hello world" << std::endl; //! inline LogStreamConsumer LOG_VERBOSE(const Logger& logger) { return LogStreamConsumer(logger.getReportableSeverity(), Severity::kVERBOSE); } //! //! \brief produces a LogStreamConsumer object that can be used to log messages of severity kINFO //! //! Example usage: //! //! LOG_INFO(logger) << "hello world" << std::endl; //! inline LogStreamConsumer LOG_INFO(const Logger& logger) { return LogStreamConsumer(logger.getReportableSeverity(), Severity::kINFO); } //! //! \brief produces a LogStreamConsumer object that can be used to log messages of severity kWARNING //! //! Example usage: //! //! 
LOG_WARN(logger) << "hello world" << std::endl; //! inline LogStreamConsumer LOG_WARN(const Logger& logger) { return LogStreamConsumer(logger.getReportableSeverity(), Severity::kWARNING); } //! //! \brief produces a LogStreamConsumer object that can be used to log messages of severity kERROR //! //! Example usage: //! //! LOG_ERROR(logger) << "hello world" << std::endl; //! inline LogStreamConsumer LOG_ERROR(const Logger& logger) { return LogStreamConsumer(logger.getReportableSeverity(), Severity::kERROR); } //! //! \brief produces a LogStreamConsumer object that can be used to log messages of severity kINTERNAL_ERROR // ("fatal" severity) //! //! Example usage: //! //! LOG_FATAL(logger) << "hello world" << std::endl; //! inline LogStreamConsumer LOG_FATAL(const Logger& logger) { return LogStreamConsumer(logger.getReportableSeverity(), Severity::kINTERNAL_ERROR); } } // anonymous namespace #endif // TENSORRT_LOGGING_H ================================================ FILE: inception/inceptionv4/CMakeLists.txt ================================================ cmake_minimum_required(VERSION 2.6) project(InceptionV4) add_definitions(-std=c++11) option(CUDA_USE_STATIC_CUDA_RUNTIME OFF) set(CMAKE_CXX_STANDARD 11) set(CMAKE_BUILD_TYPE Debug) find_package(CUDA REQUIRED) include_directories(${PROJECT_SOURCE_DIR}/include) # include and link dirs of cuda and tensorrt, you need adapt them if yours are different # cuda include_directories(/usr/local/cuda/include) link_directories(/usr/local/cuda/lib64) # tensorrt include_directories(/usr/include/x86_64-linux-gnu/) link_directories(/usr/lib/x86_64-linux-gnu/) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11 -pthread -Wall -Ofast -Wfatal-errors -D_MWAITXINTRIN_H_INCLUDED") find_package(OpenCV) include_directories(${OpenCV_INCLUDE_DIRS}) file(GLOB SOURCE_FILES "*.h" "*.cpp") add_executable(inceptionv4 ${SOURCE_FILES}) target_link_libraries(inceptionv4 nvinfer) target_link_libraries(inceptionv4 cudart) 
target_link_libraries(inceptionv4 ${OpenCV_LIBS}) add_definitions(-O2 -pthread) ================================================ FILE: inception/inceptionv4/README.md ================================================ # Inception v4 Inception v4 model architecture from "Inception-v4, Inception-ResNet and the Impact of Residual Connections on Learning". For details, refer to [rwightman/pytorch-image-models](https://github.com/rwightman/pytorch-image-models/blob/master/timm/models/inception_v4.py). The following tricks are used in this implementation: - For pooling layers with padding, pay attention to whether the padded elements are counted when the average is computed. PyTorch counts the padding by default in average pooling, but TensorRT does not, so pooling layers with padding need `setAverageCountExcludesPadding(false)` in TensorRT. - Batchnorm layers are implemented with scale layers. ``` // 1. generate inception.wts from [BlueMirrors/torchtrtz](https://github.com/BlueMirrors/torchtrtz/blob/main/generate_weights.py) // 2. put inception.wts into tensorrtx/inception/inceptionv4 // 3. build and run cd tensorrtx/inception/inceptionv4 mkdir build cd build cmake .. make sudo ./inceptionv4 -s // serialize model to plan file i.e. 'inceptionV4.engine' sudo ./inceptionv4 -d // deserialize plan file and run inference // 4. check that the output matches rwightman/pytorch-image-models/inceptionv4 ``` ================================================ FILE: inception/inceptionv4/inception_v4.cpp ================================================ # include "inception_v4.h" namespace trtx { InceptionV4::InceptionV4(const InceptionV4Params ¶ms) : mParams(params) , mContext(nullptr) , mEngine(nullptr) { } /** * Builds the tensorrt engine and serializes it. 
**/ bool InceptionV4::serializeEngine() { // load weights weightMap = loadWeights(mParams.weightsFile); // create builder IBuilder* builder = createInferBuilder(gLogger); assert(builder); // create builder config IBuilderConfig* config = builder -> createBuilderConfig(); assert(config); // create engine bool created = buildEngine(builder, config); if(!created) { std::cerr << "Engine creation failed. Check logs." << std::endl; return false; } // serilaize engine assert(mEngine != nullptr); IHostMemory* modelStream{nullptr}; modelStream = mEngine -> serialize(); assert(modelStream != nullptr); // destroy config -> destroy(); builder -> destroy(); // write serialized engine to file std::ofstream trtFile(mParams.trtEngineFile, std::ios::binary); if(!trtFile){ std::cerr << "Unable to open engine file." << std::endl; return false; } trtFile.write(reinterpret_cast(modelStream -> data()), modelStream -> size()); std::cout << "Engine serialized and saved." << std::endl; // clean modelStream -> destroy(); return true; } bool InceptionV4::buildEngine(IBuilder *builder, IBuilderConfig *config) { INetworkDefinition* network = builder->createNetworkV2(0U); // Create input tensor of shape { 1, 1, 32, 32 } with name INPUT_BLOB_NAME ITensor* data = network->addInput(mParams.inputTensorName, dt, Dims3{3, mParams.inputH, mParams.inputW}); assert(data); Weights emptywts{DataType::kFLOAT, nullptr, 0}; float shval[3] = {(0.485 - 0.5) / 0.5, (0.456 - 0.5) / 0.5, (0.406 - 0.5) / 0.5}; float scval[3] = {0.229 / 0.5, 0.224 / 0.5, 0.225 / 0.5}; float pval[3] = {1.0, 1.0, 1.0}; Weights shift{DataType::kFLOAT, shval, 3}; Weights scale{DataType::kFLOAT, scval, 3}; Weights power{DataType::kFLOAT, pval, 3}; IScaleLayer* scale1 = network->addScale(*data, ScaleMode::kCHANNEL, shift, scale, power); assert(scale1); IActivationLayer* relu0 = basicConv2d(network, weightMap, *scale1 -> getOutput(0), 32, DimsHW{ 3, 3 }, 2, DimsHW{ 0, 0 }, "features.0"); relu0 = basicConv2d(network, weightMap, *relu0 -> 
getOutput(0), 32, DimsHW{ 3, 3 }, 1, DimsHW{ 0, 0 }, "features.1"); relu0 = basicConv2d(network, weightMap, *relu0 -> getOutput(0), 64, DimsHW{ 3, 3 }, 1, DimsHW{ 1, 1 }, "features.2"); auto cat0 = mixed_3a(network, weightMap, *relu0 -> getOutput(0), "features.3"); cat0 = mixed_4a(network, weightMap, *cat0 -> getOutput(0), "features.4"); cat0 = mixed_5a(network, weightMap, *cat0 -> getOutput(0), "features.5"); cat0 = inceptionA(network, weightMap, *cat0 -> getOutput(0), "features.6"); cat0 = inceptionA(network, weightMap, *cat0 -> getOutput(0), "features.7"); cat0 = inceptionA(network, weightMap, *cat0 -> getOutput(0), "features.8"); cat0 = inceptionA(network, weightMap, *cat0 -> getOutput(0), "features.9"); cat0 = reductionA(network, weightMap, *cat0 -> getOutput(0), "features.10"); cat0 = inceptionB(network, weightMap, *cat0 -> getOutput(0), "features.11"); cat0 = inceptionB(network, weightMap, *cat0 -> getOutput(0), "features.12"); cat0 = inceptionB(network, weightMap, *cat0 -> getOutput(0), "features.13"); cat0 = inceptionB(network, weightMap, *cat0 -> getOutput(0), "features.14"); cat0 = inceptionB(network, weightMap, *cat0 -> getOutput(0), "features.15"); cat0 = inceptionB(network, weightMap, *cat0 -> getOutput(0), "features.16"); cat0 = inceptionB(network, weightMap, *cat0 -> getOutput(0), "features.17"); cat0 = reductionB(network, weightMap, *cat0 -> getOutput(0), "features.18"); cat0 = inceptionC(network, weightMap, *cat0 -> getOutput(0), "features.19"); cat0 = inceptionC(network, weightMap, *cat0 -> getOutput(0), "features.20"); cat0 = inceptionC(network, weightMap, *cat0 -> getOutput(0), "features.21"); IPoolingLayer* pool2 = network->addPoolingNd(*cat0->getOutput(0), PoolingType::kAVERAGE, DimsHW{8, 8}); assert(pool2); IFullyConnectedLayer* fc1 = network->addFullyConnected(*pool2->getOutput(0), 1000, weightMap["last_linear.weight"], weightMap["last_linear.bias"]); assert(fc1); fc1->getOutput(0)->setName(mParams.outputTensorName); std::cout << "set name 
out" << std::endl; network->markOutput(*fc1->getOutput(0)); // Build engine builder->setMaxBatchSize(mParams.batchSize); config->setMaxWorkspaceSize(1 << 28); if (mParams.fp16) config->setFlag(BuilderFlag::kFP16); mEngine = builder->buildEngineWithConfig(*network, *config); std::cout << "build out" << std::endl; // Don't need the network any more network->destroy(); // Release host memory for (auto& mem : weightMap) { free((void*) (mem.second.values)); } if (mEngine == nullptr) return false; return true; } bool InceptionV4::deserializeCudaEngine() { if (mContext != nullptr && mEngine != nullptr) { return true; } if (mEngine == nullptr) { char* trtModelStream{nullptr}; size_t size{0}; // open file std::ifstream f(mParams.trtEngineFile, std::ios::binary); if (f.good()) { // get size f.seekg(0, f.end); size = f.tellg(); f.seekg(0, f.beg); trtModelStream = new char[size]; // read data as a block f.read(trtModelStream, size); f.close(); } if (trtModelStream == nullptr) { return false; } // deserialize IRuntime* runtime = createInferRuntime(gLogger); assert(runtime); mEngine = runtime -> deserializeCudaEngine(trtModelStream, size, 0); assert(mEngine != nullptr); // clean up runtime -> destroy(); delete[] trtModelStream; } std::cout << "deserialized engine successfully." << std::endl; // create execution context mContext = mEngine -> createExecutionContext(); assert(mContext != nullptr); return true; } void InceptionV4::doInference(float* input, float* output, int batchSize) { // Pointers to input and output device buffers to pass to engine. // Engine requires exactly IEngine::getNbBindings() number of buffers. assert(mEngine -> getNbBindings() == 2); void* buffers[2]; // In order to bind the buffers, we need to know the names of the input and output tensors. 
// Note that indices are guaranteed to be less than IEngine::getNbBindings() const int inputIndex = mEngine->getBindingIndex(mParams.inputTensorName); const int outputIndex = mEngine->getBindingIndex(mParams.outputTensorName); // Create GPU buffers on device CUDA_CHECK(cudaMalloc(&buffers[inputIndex], batchSize * 3 * mParams.inputH * mParams.inputW * sizeof(float))); CUDA_CHECK(cudaMalloc(&buffers[outputIndex], batchSize * 1000 * sizeof(float))); // Create stream cudaStream_t stream; CUDA_CHECK(cudaStreamCreate(&stream)); // DMA input batch data to device, infer on the batch asynchronously, and DMA output back to host CUDA_CHECK(cudaMemcpyAsync(buffers[inputIndex], input, batchSize * 3 * mParams.inputH * mParams.inputW * sizeof(float), cudaMemcpyHostToDevice, stream)); mContext->enqueue(batchSize, buffers, stream, nullptr); CUDA_CHECK(cudaMemcpyAsync(output, buffers[outputIndex], batchSize * 1000 * sizeof(float), cudaMemcpyDeviceToHost, stream)); cudaStreamSynchronize(stream); // Release stream and buffers cudaStreamDestroy(stream); CUDA_CHECK(cudaFree(buffers[inputIndex])); CUDA_CHECK(cudaFree(buffers[outputIndex])); } /** * Cleans up any state created in the InceptionV4Trt class **/ bool InceptionV4::cleanUp() { if (mContext != nullptr) mContext -> destroy(); if (mEngine != nullptr) mEngine -> destroy(); return true; } } ================================================ FILE: inception/inceptionv4/inception_v4.h ================================================ #ifndef TRTX_INCEPTION_NETWORK_H #define TRTX_INCEPTION_NETWORK_H #include #include #include #include #include "logging.h" #include "utils.h" #include "layers_api.h" static Logger gLogger; using namespace trtxlayers; namespace trtx { struct InceptionV4Params { /* data */ int32_t batchSize{1}; // Number of inputs in a batch bool int8{false}; // Allow runnning the network in Int8 mode. bool fp16{false}; // Allow running the network in FP16 mode. 
const char* inputTensorName = "data"; const char* outputTensorName = "prob"; int inputW; // The input width of the network. int inputH; // The input height of the the network. int outputSize; // THe output size of the network. std::string weightsFile; // Weights file filename. std::string trtEngineFile; // trt engine file name }; class InceptionV4 { public: InceptionV4(const InceptionV4Params &enginecfg); ~InceptionV4() {}; bool serializeEngine(); // create & serialize netowrk Engine bool deserializeCudaEngine(); void doInference(float* input, float* output, int batchSize); bool cleanUp(); private: bool buildEngine(IBuilder *builder, IBuilderConfig *config); // Runs the Tensorrt network inference engine on a sample. private: InceptionV4Params mParams; ICudaEngine* mEngine; // The tensorrt engine used to run the network. std::map weightMap; // The weight value map. IExecutionContext* mContext; // The TensorRT execution context to run inference. std::string inception; DataType dt{DataType::kFLOAT}; }; } #endif ================================================ FILE: inception/inceptionv4/layers_api.cpp ================================================ #include "layers_api.h" namespace trtxlayers { IScaleLayer* addBatchNorm2d( INetworkDefinition *network, std::map& weightMap, ITensor& input, std::string lname, float eps ) { float *gamma = (float*)weightMap[lname + ".weight"].values; float *beta = (float*)weightMap[lname + ".bias"].values; float *mean = (float*)weightMap[lname + ".running_mean"].values; float *var = (float*)weightMap[lname + ".running_var"].values; int len = weightMap[lname + ".running_var"].count; std::cout << "len " << len << std::endl; float *scval = reinterpret_cast(malloc(sizeof(float) * len)); for (int i = 0; i < len; i++) { scval[i] = gamma[i] / sqrt(var[i] + eps); } Weights scale{DataType::kFLOAT, scval, len}; float *shval = reinterpret_cast(malloc(sizeof(float) * len)); for (int i = 0; i < len; i++) { shval[i] = beta[i] - mean[i] * gamma[i] / 
sqrt(var[i] + eps); } Weights shift{DataType::kFLOAT, shval, len}; float *pval = reinterpret_cast(malloc(sizeof(float) * len)); for (int i = 0; i < len; i++) { pval[i] = 1.0; } Weights power{DataType::kFLOAT, pval, len}; weightMap[lname + ".scale"] = scale; weightMap[lname + ".shift"] = shift; weightMap[lname + ".power"] = power; IScaleLayer* scale_1 = network->addScale(input, ScaleMode::kCHANNEL, shift, scale, power); assert(scale_1); return scale_1; } IActivationLayer* basicConv2d( INetworkDefinition *network, std::map& weightMap, ITensor& input, int outch, DimsHW ksize, int s, DimsHW p, std::string lname ) { // empty wts for bias Weights emptywts{DataType::kFLOAT, nullptr, 0}; // add conv -> bn -> relu IConvolutionLayer* conv = network -> addConvolutionNd(input, outch, ksize, weightMap[lname + ".conv.weight"], emptywts); assert(conv); conv -> setStrideNd(DimsHW{s, s}); conv -> setPaddingNd(p); IScaleLayer* bn = addBatchNorm2d(network, weightMap, *conv -> getOutput(0), lname + ".bn", 1e-3); IActivationLayer* relu = network -> addActivation(*bn -> getOutput(0), ActivationType::kRELU); assert(relu); return relu; } IConcatenationLayer* mixed_3a( INetworkDefinition *network, std::map& weightMap, ITensor& input, std::string lname ) { // branch 0 IPoolingLayer* pool = network -> addPoolingNd(input, PoolingType::kMAX, DimsHW{3, 3}); assert(pool); pool -> setStrideNd(DimsHW{2, 2}); // branch 1 IActivationLayer* relu = basicConv2d(network, weightMap, input, 96, DimsHW{ 3, 3 }, 2, DimsHW{ 0, 0 }, lname + ".conv"); // concatenate two branches ITensor* inputTensors[] = { pool -> getOutput(0), relu -> getOutput(0) }; IConcatenationLayer* cat = network -> addConcatenation(inputTensors, 2); assert(cat); return cat; } IConcatenationLayer* mixed_4a( INetworkDefinition *network, std::map& weightMap, ITensor& input, std::string lname ) { // branch 0 IActivationLayer* relu1 = basicConv2d(network, weightMap, input, 64, DimsHW{ 1, 1 }, 1, DimsHW{ 0, 0 }, lname + ".branch0.0"); relu1 = 
basicConv2d(network, weightMap, *relu1 -> getOutput(0), 96, DimsHW{ 3, 3 }, 1, DimsHW{ 0, 0 }, lname + ".branch0.1"); // branch 1 IActivationLayer* relu2 = basicConv2d(network, weightMap, input, 64, DimsHW{ 1, 1 }, 1, DimsHW{ 0, 0 }, lname + ".branch1.0"); relu2 = basicConv2d(network, weightMap, *relu2 -> getOutput(0), 64, DimsHW{ 1, 7 }, 1, DimsHW{ 0, 3 }, lname + ".branch1.1"); relu2 = basicConv2d(network, weightMap, *relu2 -> getOutput(0), 64, DimsHW{ 7, 1 }, 1, DimsHW{ 3, 0 }, lname + ".branch1.2"); relu2 = basicConv2d(network, weightMap, *relu2 -> getOutput(0), 96, DimsHW{ 3, 3 }, 1, DimsHW{ 0, 0 }, lname + ".branch1.3"); // concatenate two branches ITensor* inputTensors[] = { relu1 -> getOutput(0), relu2 -> getOutput(0) }; IConcatenationLayer* cat = network -> addConcatenation(inputTensors, 2); assert(cat); return cat; } IConcatenationLayer* mixed_5a( INetworkDefinition *network, std::map& weightMap, ITensor& input, std::string lname ) { std::cout<<"mixed_5a"< addPoolingNd(input, PoolingType::kMAX, DimsHW{ 3, 3 }); assert(pool1); pool1 -> setStrideNd(DimsHW{ 2, 2 }); // concatenate branches ITensor* inputTensors[] = { relu1 -> getOutput(0), pool1 -> getOutput(0)}; IConcatenationLayer* cat = network -> addConcatenation(inputTensors, 2); assert(cat); std::cout<<"mixed_5a done"<& weightMap, ITensor& input, std::string lname ) { // branch 0 IActivationLayer* relu0 = basicConv2d(network, weightMap, input, 96, DimsHW{ 1, 1 }, 1, DimsHW{ 0, 0 }, lname + ".branch0"); // branch 1 IActivationLayer* relu1 = basicConv2d(network, weightMap, input, 64, DimsHW{ 1, 1 }, 1, DimsHW{ 0, 0 }, lname +".branch1.0"); relu1 = basicConv2d(network, weightMap, *relu1 -> getOutput(0), 96, DimsHW{ 3, 3 }, 1, DimsHW{ 1, 1 }, lname+".branch1.1"); // branch 2 IActivationLayer* relu2 = basicConv2d(network, weightMap, input, 64, DimsHW{ 1, 1 }, 1, DimsHW{ 0, 0 }, lname+".branch2.0"); relu2 = basicConv2d(network, weightMap, *relu2 -> getOutput(0), 96, DimsHW{ 3, 3 }, 1, DimsHW{ 1, 1 }, 
lname+".branch2.1"); relu2 = basicConv2d(network, weightMap, *relu2 -> getOutput(0), 96, DimsHW{ 3, 3 }, 1, DimsHW{ 1, 1 }, lname+".branch2.2"); // branch 3 IPoolingLayer* pool1 = network->addPoolingNd(input, PoolingType::kAVERAGE, DimsHW{3, 3}); assert(pool1); pool1->setStrideNd(DimsHW{1, 1}); pool1->setPaddingNd(DimsHW{1, 1}); pool1->setAverageCountExcludesPadding(false); IActivationLayer* relu3 = basicConv2d(network, weightMap, *pool1 -> getOutput(0), 96, DimsHW{ 1, 1 }, 1, DimsHW{ 0, 0 }, lname+".branch3.1"); // concatenate all branches outputs ITensor* inputTensors[] = { relu0 -> getOutput(0), relu1 -> getOutput(0), relu2 -> getOutput(0), relu3 -> getOutput(0)}; IConcatenationLayer* cat = network -> addConcatenation(inputTensors, 4); assert(cat); return cat; } IConcatenationLayer* reductionA( INetworkDefinition *network, std::map& weightMap, ITensor& input, std::string lname ) { // features 10 branch 0 IActivationLayer* relu0 = basicConv2d(network, weightMap, input, 384, DimsHW{ 3, 3 }, 2, DimsHW{ 0, 0 }, lname + ".branch0"); // branch 1 IActivationLayer* relu1 = basicConv2d(network, weightMap, input, 192, DimsHW{ 1, 1 }, 1, DimsHW{ 0, 0 }, lname + ".branch1.0"); relu1 = basicConv2d(network, weightMap, *relu1 -> getOutput(0), 224, DimsHW{ 3, 3 }, 1, DimsHW{ 1, 1 }, lname + ".branch1.1"); relu1 = basicConv2d(network, weightMap, *relu1 -> getOutput(0), 256, DimsHW{ 3, 3 }, 2, DimsHW{ 0, 0 }, lname + ".branch1.2"); // branch 2 IPoolingLayer* pool1 = network -> addPoolingNd(input, PoolingType::kMAX, DimsHW{ 3, 3 }); assert(pool1); pool1 -> setStrideNd(DimsHW{ 2, 2 }); // concatenate ITensor* inputTensors[] = { relu0 -> getOutput(0), relu1 -> getOutput(0), pool1 -> getOutput(0) }; IConcatenationLayer* cat = network -> addConcatenation(inputTensors, 3); assert(cat); return cat; } IConcatenationLayer* inceptionB( INetworkDefinition *network, std::map& weightMap, ITensor& input, std::string lname ) { // features 11 branch 0 IActivationLayer* relu0 = 
basicConv2d(network, weightMap, input, 384, DimsHW{ 1, 1 }, 1, DimsHW{ 0, 0 }, lname + ".branch0"); // branch 1 IActivationLayer* relu1 = basicConv2d(network, weightMap, input, 192, DimsHW{ 1, 1 }, 1, DimsHW{ 0, 0 }, lname + ".branch1.0"); relu1 = basicConv2d(network, weightMap, *relu1 -> getOutput(0), 224, DimsHW{ 1, 7 }, 1, DimsHW{ 0, 3 }, lname + ".branch1.1"); relu1 = basicConv2d(network, weightMap, *relu1 -> getOutput(0), 256, DimsHW{ 7, 1 }, 1, DimsHW{ 3, 0 }, lname + ".branch1.2"); // branch 2 IActivationLayer* relu2 = basicConv2d(network, weightMap, input, 192, DimsHW{ 1, 1 }, 1, DimsHW{ 0, 0 }, lname + ".branch2.0"); relu2 = basicConv2d(network, weightMap, *relu2 -> getOutput(0), 192, DimsHW{ 7, 1 }, 1, DimsHW{ 3, 0 }, lname + ".branch2.1"); relu2 = basicConv2d(network, weightMap, *relu2 -> getOutput(0), 224, DimsHW{ 1, 7 }, 1, DimsHW{ 0, 3 }, lname + ".branch2.2"); relu2 = basicConv2d(network, weightMap, *relu2 -> getOutput(0), 224, DimsHW{ 7, 1 }, 1, DimsHW{ 3, 0 }, lname + ".branch2.3"); relu2 = basicConv2d(network, weightMap, *relu2 -> getOutput(0), 256, DimsHW{ 1, 7 }, 1, DimsHW{ 0, 3 }, lname + ".branch2.4"); // branch 3 IPoolingLayer* pool0 = network -> addPoolingNd(input, PoolingType::kAVERAGE, DimsHW{ 3, 3 }); assert(pool0); pool0 -> setStrideNd(DimsHW{ 1, 1 }); pool0 -> setPaddingNd(DimsHW{ 1, 1 }); pool0 -> setAverageCountExcludesPadding(false); IActivationLayer* relu3 = basicConv2d(network, weightMap, *pool0 -> getOutput(0), 128, DimsHW{ 1, 1 }, 1, DimsHW{ 0, 0 }, lname + ".branch3.1"); // concatenate branches ITensor* inputTensors[] = { relu0 -> getOutput(0), relu1 -> getOutput(0), relu2 -> getOutput(0), relu3 -> getOutput(0) }; IConcatenationLayer* cat = network -> addConcatenation(inputTensors, 4); assert(cat); return cat; } IConcatenationLayer* reductionB( INetworkDefinition *network, std::map& weightMap, ITensor& input, std::string lname ) { // features 18 branch 0 IActivationLayer* relu0 = basicConv2d(network, weightMap, input, 192, 
DimsHW{ 1, 1 }, 1, DimsHW{ 0, 0 }, lname + ".branch0.0"); relu0 = basicConv2d(network, weightMap, *relu0 -> getOutput(0), 192, DimsHW{ 3, 3 }, 2, DimsHW{ 0, 0 }, lname + ".branch0.1"); // branch 1 IActivationLayer* relu1 = basicConv2d(network, weightMap, input, 256, DimsHW{ 1, 1 }, 1, DimsHW{ 0, 0 }, lname + ".branch1.0"); relu1 = basicConv2d(network, weightMap, *relu1 -> getOutput(0), 256, DimsHW{ 1, 7 }, 1, DimsHW{ 0, 3 }, lname + ".branch1.1"); relu1 = basicConv2d(network, weightMap, *relu1 -> getOutput(0), 320, DimsHW{ 7, 1 }, 1, DimsHW{ 3, 0 }, lname + ".branch1.2"); relu1 = basicConv2d(network, weightMap, *relu1 -> getOutput(0), 320, DimsHW{ 3, 3 }, 2, DimsHW{ 0, 0 }, lname + ".branch1.3"); // branch 2 IPoolingLayer* pool1 = network -> addPoolingNd(input, PoolingType::kMAX, DimsHW{ 3, 3 }); assert(pool1); pool1 -> setStrideNd(DimsHW{ 2, 2 }); // concatenate ITensor* inputTensors[] = { relu0 -> getOutput(0), relu1 -> getOutput(0), pool1 -> getOutput(0) }; IConcatenationLayer* cat = network -> addConcatenation(inputTensors, 3); assert(cat); return cat; } IConcatenationLayer* inceptionC( INetworkDefinition *network, std::map& weightMap, ITensor& input, std::string lname ) { // features 19 branch 0 IActivationLayer* relu0 = basicConv2d(network, weightMap, input, 256, DimsHW{ 1, 1 }, 1, DimsHW{ 0, 0 }, lname + ".branch0"); // branch 1 IActivationLayer* relu1_0 = basicConv2d(network, weightMap, input, 384, DimsHW{ 1, 1 }, 1, DimsHW{ 0, 0 }, lname + ".branch1_0"); IActivationLayer* relu1_1a = basicConv2d(network, weightMap, *relu1_0 -> getOutput(0), 256, DimsHW{ 1, 3 }, 1, DimsHW{ 0, 1 }, lname + ".branch1_1a"); IActivationLayer* relu1_1b = basicConv2d(network, weightMap, *relu1_0 -> getOutput(0), 256, DimsHW{ 3, 1 }, 1, DimsHW{ 1, 0 }, lname + ".branch1_1b"); ITensor* inputTensors1[] = { relu1_1a -> getOutput(0), relu1_1b -> getOutput(0) }; IConcatenationLayer* cat1 = network -> addConcatenation(inputTensors1, 2); assert(cat1); // branch 2 IActivationLayer* relu2_0 
= basicConv2d(network, weightMap, input, 384, DimsHW{ 1, 1 }, 1, DimsHW{ 0, 0 }, lname + ".branch2_0"); IActivationLayer* relu2_1 = basicConv2d(network, weightMap, *relu2_0 -> getOutput(0), 448, DimsHW{ 3, 1 }, 1, DimsHW{ 1, 0 }, lname + ".branch2_1"); IActivationLayer* relu2_2 = basicConv2d(network, weightMap, *relu2_1 -> getOutput(0), 512, DimsHW{ 1, 3 }, 1, DimsHW{ 0, 1 }, lname + ".branch2_2"); IActivationLayer* relu2_3a = basicConv2d(network, weightMap, *relu2_2 -> getOutput(0), 256, DimsHW{ 1, 3 }, 1, DimsHW{ 0, 1 }, lname + ".branch2_3a"); IActivationLayer* relu2_3b = basicConv2d(network, weightMap, *relu2_2 -> getOutput(0), 256, DimsHW{ 3, 1 }, 1, DimsHW{ 1, 0 }, lname + ".branch2_3b"); ITensor* inputTensors2[] = { relu2_3a -> getOutput(0), relu2_3b -> getOutput(0) }; IConcatenationLayer* cat2 = network -> addConcatenation(inputTensors2, 2); assert(cat2); // branch 3 IPoolingLayer* pool3 = network -> addPoolingNd(input, PoolingType::kAVERAGE, DimsHW{ 3, 3 }); assert(pool3); pool3 -> setStrideNd(DimsHW{ 1, 1 }); pool3 -> setPaddingNd(DimsHW{ 1, 1 }); pool3 -> setAverageCountExcludesPadding(false); IActivationLayer* relu3 = basicConv2d(network, weightMap, *pool3 -> getOutput(0), 256, DimsHW{ 1, 1 }, 1, DimsHW{ 0, 0 }, lname + ".branch3.1"); // concatenate ITensor* inputTensors[] = { relu0 -> getOutput(0), cat1 -> getOutput(0), cat2 -> getOutput(0), relu3 -> getOutput(0) }; IConcatenationLayer* cat = network -> addConcatenation(inputTensors, 4); assert(cat); return cat; } } ================================================ FILE: inception/inceptionv4/layers_api.h ================================================ #ifndef TRTX_LAYERS_API_H #define TRTX_LAYERS_API_H #include #include #include #include #include "NvInfer.h" #include "cuda_runtime_api.h" using namespace nvinfer1; namespace trtxlayers { // Declare your layers here IScaleLayer* addBatchNorm2d( INetworkDefinition *network, std::map& weightMap, ITensor& input, std::string lname, float eps ); 
IActivationLayer* basicConv2d( INetworkDefinition *network, std::map& weightMap, ITensor& input, int outch, DimsHW ksize, int s, DimsHW p, std::string lname ); IConcatenationLayer* mixed_3a( INetworkDefinition *network, std::map& weightMap, ITensor& input, std::string lname ); IConcatenationLayer* mixed_4a( INetworkDefinition *network, std::map& weightMap, ITensor& input, std::string lname ); IConcatenationLayer* mixed_5a( INetworkDefinition *network, std::map& weightMap, ITensor& input, std::string lname ); IConcatenationLayer* inceptionA( INetworkDefinition *network, std::map& weightMap, ITensor& input, std::string lname ); IConcatenationLayer* reductionA( INetworkDefinition *network, std::map& weightMap, ITensor& input, std::string lname ); IConcatenationLayer* inceptionB( INetworkDefinition *network, std::map& weightMap, ITensor& input, std::string lname ); IConcatenationLayer* reductionB( INetworkDefinition *network, std::map& weightMap, ITensor& input, std::string lname ); IConcatenationLayer* inceptionC( INetworkDefinition *network, std::map& weightMap, ITensor& input, std::string lname ); } #endif // TRTX_LAYERS_API_H ================================================ FILE: inception/inceptionv4/logging.h ================================================ /* * Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
//!
//! \class LogStreamConsumerBuffer
//! \brief String buffer that forwards its contents, prefixed with a timestamp and a
//!        severity prefix, to a target stream each time it is synchronized.
//!
//! Text accumulates in the std::stringbuf until sync() (e.g. via std::endl / std::flush)
//! or destruction; it is then emitted in one piece by putOutput().
//!
class LogStreamConsumerBuffer : public std::stringbuf
{
public:
    //! \param stream    stream the formatted messages are written to
    //! \param prefix    text inserted between the timestamp and the message
    //! \param shouldLog when false, putOutput() emits nothing
    LogStreamConsumerBuffer(std::ostream& stream, const std::string& prefix, bool shouldLog)
        : mOutput(stream)
        , mPrefix(prefix)
        , mShouldLog(shouldLog)
    {
    }

    //! Copies the target stream, prefix and log flag of \p other.
    //! The buffered text of \p other is not transferred.
    LogStreamConsumerBuffer(LogStreamConsumerBuffer&& other)
        : mOutput(other.mOutput)
        , mPrefix(other.mPrefix)
        , mShouldLog(other.mShouldLog)
    {
    }

    ~LogStreamConsumerBuffer()
    {
        // std::streambuf::pbase() gives a pointer to the beginning of the buffered part of the output sequence
        // std::streambuf::pptr() gives a pointer to the current position of the output sequence
        // if the pointer to the beginning is not equal to the pointer to the current position,
        // call putOutput() to log the output to the stream
        if (pbase() != pptr())
        {
            putOutput();
        }
    }

    // synchronizes the stream buffer and returns 0 on success
    // synchronizing the stream buffer consists of inserting the buffer contents into the stream,
    // resetting the buffer and flushing the stream
    int sync() override
    {
        putOutput();
        return 0;
    }

    //! Writes "[MM/DD/YYYY-HH:MM:SS] <prefix><buffered text>" to the target stream,
    //! clears the buffer and flushes the stream. Does nothing when logging is disabled.
    void putOutput()
    {
        if (mShouldLog)
        {
            // prepend timestamp; it goes to the same stream as the message so the two
            // cannot end up on different streams (e.g. timestamp on stdout, text on stderr)
            std::time_t timestamp = std::time(nullptr);
            const std::tm* tm_local = std::localtime(&timestamp);
            if (tm_local != nullptr)
            {
                // zero-pad the numeric fields, then restore the caller's fill character
                const char previousFill = mOutput.fill('0');
                mOutput << "[";
                mOutput << std::setw(2) << 1 + tm_local->tm_mon << "/";
                mOutput << std::setw(2) << tm_local->tm_mday << "/";
                mOutput << std::setw(4) << 1900 + tm_local->tm_year << "-";
                mOutput << std::setw(2) << tm_local->tm_hour << ":";
                mOutput << std::setw(2) << tm_local->tm_min << ":";
                mOutput << std::setw(2) << tm_local->tm_sec << "] ";
                mOutput.fill(previousFill);
            }
            // std::stringbuf::str() gets the string contents of the buffer
            // insert the buffer contents pre-appended by the appropriate prefix into the stream
            mOutput << mPrefix << str();
            // set the buffer to empty
            str("");
            // flush the stream
            mOutput.flush();
        }
    }

    //! Enables or disables emission for subsequent putOutput() calls.
    void setShouldLog(bool shouldLog)
    {
        mShouldLog = shouldLog;
    }

private:
    std::ostream& mOutput; // target stream (not owned)
    std::string mPrefix;   // severity prefix written before each message
    bool mShouldLog;       // whether putOutput() emits anything
};
LogStreamConsumer(Severity reportableSeverity, Severity severity) : LogStreamConsumerBase(severityOstream(severity), severityPrefix(severity), severity <= reportableSeverity) , std::ostream(&mBuffer) // links the stream buffer with the stream , mShouldLog(severity <= reportableSeverity) , mSeverity(severity) { } LogStreamConsumer(LogStreamConsumer&& other) : LogStreamConsumerBase(severityOstream(other.mSeverity), severityPrefix(other.mSeverity), other.mShouldLog) , std::ostream(&mBuffer) // links the stream buffer with the stream , mShouldLog(other.mShouldLog) , mSeverity(other.mSeverity) { } void setReportableSeverity(Severity reportableSeverity) { mShouldLog = mSeverity <= reportableSeverity; mBuffer.setShouldLog(mShouldLog); } private: static std::ostream& severityOstream(Severity severity) { return severity >= Severity::kINFO ? std::cout : std::cerr; } static std::string severityPrefix(Severity severity) { switch (severity) { case Severity::kINTERNAL_ERROR: return "[F] "; case Severity::kERROR: return "[E] "; case Severity::kWARNING: return "[W] "; case Severity::kINFO: return "[I] "; case Severity::kVERBOSE: return "[V] "; default: assert(0); return ""; } } bool mShouldLog; Severity mSeverity; }; //! \class Logger //! //! \brief Class which manages logging of TensorRT tools and samples //! //! \details This class provides a common interface for TensorRT tools and samples to log information to the console, //! and supports logging two types of messages: //! //! - Debugging messages with an associated severity (info, warning, error, or internal error/fatal) //! - Test pass/fail messages //! //! The advantage of having all samples use this class for logging as opposed to emitting directly to stdout/stderr is //! that the logic for controlling the verbosity and formatting of sample output is centralized in one location. //! //! In the future, this class could be extended to support dumping test results to a file in some standard format //! 
//! (for example, JUnit XML), and providing additional metadata (e.g. timing the duration of a test run).
//!
//! TODO: For backwards compatibility with existing samples, this class inherits directly from the nvinfer1::ILogger
//! interface, which is problematic since there isn't a clean separation between messages coming from the TensorRT
//! library and messages coming from the sample.
//!
//! In the future (once all samples are updated to use Logger::getTRTLogger() to access the ILogger) we can refactor the
//! class to eliminate the inheritance and instead make the nvinfer1::ILogger implementation a member of the Logger
//! object.
class Logger : public nvinfer1::ILogger
{
public:
    //!
    //! \brief Creates a logger with the given reportable severity (kWARNING by default).
    //!
    Logger(Severity severity = Severity::kWARNING)
        : mReportableSeverity(severity)
    {
    }

    //!
    //! \enum TestResult
    //! \brief Represents the state of a given test
    //!
    enum class TestResult
    {
        kRUNNING, //!< The test is running
        kPASSED,  //!< The test passed
        kFAILED,  //!< The test failed
        kWAIVED   //!< The test was waived
    };

    //!
    //! \brief Forward-compatible method for retrieving the nvinfer::ILogger associated with this Logger
    //! \return The nvinfer1::ILogger associated with this Logger
    //!
    //! TODO Once all samples are updated to use this method to register the logger with TensorRT,
    //! we can eliminate the inheritance of Logger from ILogger
    //!
    nvinfer1::ILogger& getTRTLogger()
    {
        return *this;
    }

    //!
    //! \brief Implementation of the nvinfer1::ILogger::log() virtual method
    //!
    //! Note samples should not be calling this function directly; it will eventually go away once we eliminate the
    //! inheritance from nvinfer1::ILogger
    //!
    //! The message is streamed through a temporary LogStreamConsumer with a "[TRT] " tag;
    //! that consumer decides from the two severities whether anything is emitted.
    //!
    void log(Severity severity, const char* msg) override
    {
        LogStreamConsumer(mReportableSeverity, severity) << "[TRT] " << std::string(msg) << std::endl;
    }

    //!
    //! \brief Method for controlling the verbosity of logging output
    //!
    //! \param severity The logger will only emit messages that have severity of this level or higher.
    //!
    void setReportableSeverity(Severity severity)
    {
        mReportableSeverity = severity;
    }

    //!
    //! \brief Opaque handle that holds logging information for a particular test
    //!
    //! This object is an opaque handle to information used by the Logger to print test results.
    //! The sample must call Logger::defineTest() in order to obtain a TestAtom that can be used
    //! with Logger::reportTest{Start,End}().
    //!
    class TestAtom
    {
    public:
        TestAtom(TestAtom&&) = default;

    private:
        // Only Logger may construct a TestAtom or read its fields.
        friend class Logger;

        TestAtom(bool started, const std::string& name, const std::string& cmdline)
            : mStarted(started)
            , mName(name)
            , mCmdline(cmdline)
        {
        }

        bool mStarted;        // set to true by Logger::reportTestStart()
        std::string mName;    // test name printed in result lines
        std::string mCmdline; // command line printed in result lines
    };

    //!
    //! \brief Define a test for logging
    //!
    //! \param[in] name The name of the test. This should be a string starting with
    //!                  "TensorRT" and containing dot-separated strings containing
    //!                  the characters [A-Za-z0-9_].
    //!                  For example, "TensorRT.sample_googlenet"
    //! \param[in] cmdline The command line used to reproduce the test
    //
    //! \return a TestAtom that can be used in Logger::reportTest{Start,End}().
    //!
    static TestAtom defineTest(const std::string& name, const std::string& cmdline)
    {
        return TestAtom(false, name, cmdline);
    }

    //!
    //! \brief A convenience overloaded version of defineTest() that accepts an array of command-line arguments
    //!        as input
    //!
    //! \param[in] name The name of the test
    //! \param[in] argc The number of command-line arguments
    //! \param[in] argv The array of command-line arguments (given as C strings)
    //!
    //! \return a TestAtom that can be used in Logger::reportTest{Start,End}().
    static TestAtom defineTest(const std::string& name, int argc, char const* const* argv)
    {
        auto cmdline = genCmdlineString(argc, argv);
        return defineTest(name, cmdline);
    }

    //!
    //! \brief Report that a test has started.
    //!
    //! \pre reportTestStart() has not been called yet for the given testAtom
    //!
    //! \param[in] testAtom The handle to the test that has started
    //!
    static void reportTestStart(TestAtom& testAtom)
    {
        // Note: the RUNNING line is printed before the precondition is asserted.
        reportTestResult(testAtom, TestResult::kRUNNING);
        assert(!testAtom.mStarted);
        testAtom.mStarted = true;
    }

    //!
    //! \brief Report that a test has ended.
    //!
    //! \pre reportTestStart() has been called for the given testAtom
    //!
    //! \param[in] testAtom The handle to the test that has ended
    //! \param[in] result The result of the test. Should be one of TestResult::kPASSED,
    //!                   TestResult::kFAILED, TestResult::kWAIVED
    //!
    static void reportTestEnd(const TestAtom& testAtom, TestResult result)
    {
        assert(result != TestResult::kRUNNING);
        assert(testAtom.mStarted);
        reportTestResult(testAtom, result);
    }

    //!
    //! \brief Reports the test as passed and returns EXIT_SUCCESS.
    //!
    static int reportPass(const TestAtom& testAtom)
    {
        reportTestEnd(testAtom, TestResult::kPASSED);
        return EXIT_SUCCESS;
    }

    //!
    //! \brief Reports the test as failed and returns EXIT_FAILURE.
    //!
    static int reportFail(const TestAtom& testAtom)
    {
        reportTestEnd(testAtom, TestResult::kFAILED);
        return EXIT_FAILURE;
    }

    //!
    //! \brief Reports the test as waived and returns EXIT_SUCCESS.
    //!
    static int reportWaive(const TestAtom& testAtom)
    {
        reportTestEnd(testAtom, TestResult::kWAIVED);
        return EXIT_SUCCESS;
    }

    //!
    //! \brief Reports pass or fail depending on \p pass and returns the matching exit code.
    //!
    static int reportTest(const TestAtom& testAtom, bool pass)
    {
        return pass ? reportPass(testAtom) : reportFail(testAtom);
    }

    //!
    //! \brief Returns the severity threshold currently stored in this logger.
    //!
    Severity getReportableSeverity() const
    {
        return mReportableSeverity;
    }

private:
    //!
    //! \brief returns an appropriate string for prefixing a log message with the given severity
    //!
    static const char* severityPrefix(Severity severity)
    {
        switch (severity)
        {
        case Severity::kINTERNAL_ERROR: return "[F] ";
        case Severity::kERROR: return "[E] ";
        case Severity::kWARNING: return "[W] ";
        case Severity::kINFO: return "[I] ";
        case Severity::kVERBOSE: return "[V] ";
        default: assert(0); return "";
        }
    }

    //!
    //! \brief returns an appropriate string for prefixing a test result message with the given result
    //!
    static const char* testResultString(TestResult result)
    {
        switch (result)
        {
        case TestResult::kRUNNING: return "RUNNING";
        case TestResult::kPASSED: return "PASSED";
        case TestResult::kFAILED: return "FAILED";
        case TestResult::kWAIVED: return "WAIVED";
        default: assert(0); return "";
        }
    }

    //!
    //! \brief returns an appropriate output stream (cout or cerr) to use with the given severity
    //!
    //! Severities comparing >= kINFO go to std::cout, all others to std::cerr.
    //! NOTE(review): this relies on the numeric ordering of nvinfer1::ILogger::Severity,
    //! which is not visible in this header - confirm against NvInferRuntimeCommon.h.
    //!
    static std::ostream& severityOstream(Severity severity)
    {
        return severity >= Severity::kINFO ? std::cout : std::cerr;
    }

    //!
    //! \brief method that implements logging test results
    //!
    //! Prints one line of the form "&&&& <RESULT> <name> # <cmdline>" to the kINFO stream.
    //!
    static void reportTestResult(const TestAtom& testAtom, TestResult result)
    {
        severityOstream(Severity::kINFO) << "&&&& " << testResultString(result) << " " << testAtom.mName << " # "
                                         << testAtom.mCmdline << std::endl;
    }

    //!
    //! \brief generate a command line string from the given (argc, argv) values
    //!
    //! Arguments are joined with single spaces; no quoting or escaping is applied.
    //!
    static std::string genCmdlineString(int argc, char const* const* argv)
    {
        std::stringstream ss;
        for (int i = 0; i < argc; i++)
        {
            if (i > 0)
            {
                ss << " ";
            }
            ss << argv[i];
        }
        return ss.str();
    }

    Severity mReportableSeverity; // threshold handed to every LogStreamConsumer created by log()
};
inline LogStreamConsumer LOG_WARN(const Logger& logger) { return LogStreamConsumer(logger.getReportableSeverity(), Severity::kWARNING); } //! //! \brief produces a LogStreamConsumer object that can be used to log messages of severity kERROR //! //! Example usage: //! //! LOG_ERROR(logger) << "hello world" << std::endl; //! inline LogStreamConsumer LOG_ERROR(const Logger& logger) { return LogStreamConsumer(logger.getReportableSeverity(), Severity::kERROR); } //! //! \brief produces a LogStreamConsumer object that can be used to log messages of severity kINTERNAL_ERROR //! ("fatal" severity) //! //! Example usage: //! //! LOG_FATAL(logger) << "hello world" << std::endl; //! inline LogStreamConsumer LOG_FATAL(const Logger& logger) { return LogStreamConsumer(logger.getReportableSeverity(), Severity::kINTERNAL_ERROR); } } // anonymous namespace #endif // TENSORRT_LOGGING_H ================================================ FILE: inception/inceptionv4/main.cpp ================================================ #include "inception_v4.h" /** * Initializes Inception class params in the * InceptionV4Params structure. **/ trtx::InceptionV4Params initializeParams() { trtx::InceptionV4Params params; params.batchSize = 1; params.fp16 = false; params.inputH = 299; params.inputW = 299; params.outputSize = 1000; // change weights file name here params.weightsFile = "../inceptionV4.wts"; // change engine file name here params.trtEngineFile = "inceptionV4.engine"; return params; } int main(int argc, char** argv){ if (argc != 2) { std::cerr << "arguments not right!" 
<< std::endl; std::cerr << "./inception -s // serialize model to plan file" << std::endl; std::cerr << "./inception -d // deserialize plan file and run inference" << std::endl; return -1; } trtx::InceptionV4Params params = initializeParams(); trtx::InceptionV4 inceptionV4(params); if (std::string(argv[1]) == "-s") { // check if engine exists already std::ifstream f(params.trtEngineFile, std::ios::binary); // if engine does not exists build, serialize and save if(!f.good()) { std::cout << "Building network ..." << std::endl; f.close(); inceptionV4.serializeEngine(); } return 1; } else if(std::string(argv[1]) == "-d") { // deserialize inceptionV4.deserializeCudaEngine(); } // create data float data[3 * params.inputH * params.inputW]; for(int i=0; i<3*params.inputH*params.inputW; i++) { data[i] = 1.0; } // run inference float prob[params.outputSize]; for(int i=0; i<100; i++) { auto start = std::chrono::system_clock::now(); inceptionV4.doInference(data, prob, 1); auto end = std::chrono::system_clock::now(); std::cout << std::chrono::duration_cast(end - start).count() << "ms" << std::endl; } // cleanup bool cleaned = inceptionV4.cleanUp(); std::cout << "\nOutput:\n\n"; for (unsigned int i = 0; i < params.outputSize; i++) { std::cout << prob[i] << ", "; if (i % 10 == 0) std::cout << i / 10 << std::endl; } std::cout << std::endl; return 0; } ================================================ FILE: inception/inceptionv4/utils.cpp ================================================ # include "utils.h" // Load weights from files. 
// TensorRT weight files have a simple space delimited format: // [type] [size] std::map loadWeights(const std::string file) { std::cout << "Loading weights: " << file << std::endl; std::map weightMap; // Open weights file std::ifstream input(file); assert(input.is_open() && "Unable to load weight file."); // Read number of weight blobs int32_t count; input >> count; assert(count > 0 && "Invalid weight map file."); while (count--) { Weights wt{DataType::kFLOAT, nullptr, 0}; uint32_t size; // Read name and type of blob std::string name; input >> name >> std::dec >> size; wt.type = DataType::kFLOAT; // Load blob uint32_t* val = reinterpret_cast(malloc(sizeof(val) * size)); for (uint32_t x = 0, y = size; x < y; ++x) { input >> std::hex >> val[x]; } wt.values = val; wt.count = size; weightMap[name] = wt; } return weightMap; } ================================================ FILE: inception/inceptionv4/utils.h ================================================ # ifndef TRTX_UTILS_H # define TRTX_UTILS_H #include #include "NvInfer.h" #include "cuda_runtime_api.h" #include "assert.h" #include #include #include #ifndef CUDA_CHECK #define CUDA_CHECK(callstr)\ {\ cudaError_t error_code = callstr;\ if (error_code != cudaSuccess) {\ std::cerr << "CUDA error " << error_code << " at " << __FILE__ << ":" << __LINE__;\ assert(0);\ }\ } #endif // CUDA_CHECK using namespace nvinfer1; std::map loadWeights(const std::string input); #endif // TRTX_UTILS_H ================================================ FILE: lenet/CMakeLists.txt ================================================ cmake_minimum_required(VERSION 3.17.0) project( lenet VERSION 0.1 LANGUAGES C CXX CUDA) if(NOT DEFINED CMAKE_CUDA_ARCHITECTURES) set(CMAKE_CUDA_ARCHITECTURES 75 80 86 89 90 100 120) endif() set(CMAKE_CXX_STANDARD 17) set(CMAKE_CXX_STANDARD_REQUIRED ON) set(CMAKE_CUDA_STANDARD 17) set(CMAKE_CUDA_STANDARD_REQUIRED ON) set(CMAKE_EXPORT_COMPILE_COMMANDS ON) set(CMAKE_INCLUDE_CURRENT_DIR TRUE) 
# _guess_path(<var_name> <required_files> <candidate>...)
#
# Filters the candidate directories down to those that exist and contain every
# entry of <required_files> (a ;-list of paths relative to the candidate).
# All accepted directories, in candidate order, are stored in <var_name> in the
# caller's scope. Stops the configure step with FATAL_ERROR when none qualifies.
function(_guess_path var_name required_files)
  set(_accepted "")

  foreach(candidate IN LISTS ARGN)
    if(EXISTS "${candidate}")
      # Assume the candidate is complete until a required file turns out missing.
      set(_complete TRUE)
      foreach(needed IN LISTS required_files)
        if(NOT EXISTS "${candidate}/${needed}")
          message(DEBUG "'${candidate}' missing '${needed}'")
          set(_complete FALSE)
          break()
        endif()
      endforeach()

      if(NOT _complete)
        message(DEBUG "reject '${candidate}'")
      else()
        list(APPEND _accepted "${candidate}")
        message(DEBUG "accept '${candidate}'")
      endif()
    else()
      message(DEBUG "skip non-existing path '${candidate}'")
    endif()
  endforeach()

  list(LENGTH _accepted _accepted_count)
  if(_accepted_count EQUAL 0)
    message(
      FATAL_ERROR
        "_guess_path(${var_name}) failed: no valid path found. required_files='${required_files}' candidates='${ARGN}'"
    )
  endif()

  set(${var_name}
      "${_accepted}"
      PARENT_SCOPE)
endfunction()
\"8.6.1.6\" or \"8.6.1.6+cuda12.0.1.011\", \"8.6.1.6.Windows10.x86_64.cuda-12.0\" etc" ) if(NOT TRT_VERSION STREQUAL "" AND NOT $ENV{TRT_VERSION} STREQUAL "") message( WARNING "TRT_VERSION defined by cmake and environment variable both, using the later one" ) endif() if(NOT $ENV{TRT_VERSION} STREQUAL "") set(TRT_VERSION $ENV{TRT_VERSION}) endif() string(REGEX MATCH "([0-9]+)" _match ${TRT_VERSION}) set(TRT_MAJOR_VERSION "${_match}") unset(_match) if(WIN32) set(TensorRT_DIR "C:/Program Files/TensorRT-${TRT_VERSION}") if(NOT EXISTS "${TensorRT_DIR}") message( FATAL_ERROR "TensorRT_DIR=${TensorRT_DIR} does not exist!" ) endif() if(${TRT_MAJOR_VERSION} GREATER_EQUAL 10) set(_modules nvinfer_10 nvinfer_plugin_10 nvinfer_vc_plugin_10 nvinfer_dispatch_10 nvinfer_lean_10) message(DEBUG "Using ${_modules}") else() set(_modules nvinfer nvinfer_plugin nvinfer_vc_plugin nvinfer_dispatch nvinfer_lean) endif() set(TensorRT_LIBRARY_DIR "${TensorRT_DIR}/lib") set(TensorRT_INCLUDE_DIR "${TensorRT_DIR}/include") elseif(UNIX) string(TOLOWER "${CMAKE_SYSTEM_PROCESSOR}" _trt_arch) set(_trt_include_candidates) if(_trt_arch MATCHES "^(aarch64|arm64|arch64)$") set(_trt_include_candidates "/usr/include/aarch64-linux-gnu" "/usr/include" "/usr/local/cuda/targets/aarch64-linux/include") set(_trt_library_candidates "/usr/local/tensorrt/targets/aarch64-linux-gnu/lib" "/usr/lib/aarch64-linux-gnu" "/usr/lib/aarch64-linux-gnu/tegra" "/usr/lib") elseif(_trt_arch MATCHES "^(x86_64|amd64)$") set(_trt_include_candidates "/usr/local/tensorrt/targets/x86_64-linux-gnu/include" "/usr/include/x86_64-linux-gnu" "/usr/include") set(_trt_library_candidates "/usr/local/tensorrt/targets/x86_64-linux-gnu/lib" "/usr/lib/x86_64-linux-gnu" "/usr/lib") else() message(FATAL_ERROR "Unknown architecture") endif() set(_modules nvinfer nvinfer_plugin) if(${TRT_MAJOR_VERSION} GREATER_EQUAL 8) list(APPEND _modules nvinfer_vc_plugin nvinfer_dispatch nvinfer_lean) endif() _guess_path(TensorRT_LIBRARY_DIR 
"libnvinfer.so;libnvinfer_plugin.so" ${_trt_library_candidates}) message(STATUS "TensorRT libraries: ${TensorRT_LIBRARY_DIR}") _guess_path(TensorRT_INCLUDE_DIR "NvInfer.h" ${_trt_include_candidates}) message(STATUS "TensorRT includes: ${TensorRT_INCLUDE_DIR}") endif() foreach(lib IN LISTS _modules) find_library( TensorRT_${lib}_LIBRARY NAMES ${lib} HINTS ${TensorRT_LIBRARY_DIR}) list(APPEND TensorRT_LIBRARIES ${TensorRT_${lib}_LIBRARY}) endforeach() target_link_libraries(TensorRT INTERFACE ${TensorRT_LIBRARIES}) message(STATUS "Found TensorRT libs: ${TensorRT_LIBRARIES}") set_target_properties( TensorRT PROPERTIES C_STANDARD 17 CXX_STANDARD 17 POSITION_INDEPENDENT_CODE ON SKIP_BUILD_RPATH TRUE BUILD_WITH_INSTALL_RPATH TRUE INSTALL_RPATH "$ORIGIN" INTERFACE_INCLUDE_DIRECTORIES "${TensorRT_INCLUDE_DIR}") unset(TRT_MAJOR_VERSION) unset(_modules) unset(_trt_include_candidates) unset(_trt_library_candidates) unset(_trt_arch) ================================================ FILE: lenet/README.md ================================================ # lenet5 lenet5 is one of the simplest net in this repo. You can learn the basic procedures of building CNN from TensorRT API. This demo includes 2 major steps: 1. Build engine - define network - set input/output - serialize model to `.engine` file 2. Do inference - load and deserialize model from `.engine` file - run inference ## Usage 1. download pt model from `https://github.com/SunnyHaze/LeNet5-MNIST-Pytorch/blob/main/model.pt` 2. run `gen_wts.py` to generate `.wts` file ```bash python3 gen_wts.py ``` output looks like: ```bash lenet out shape: torch.Size([1, 10]) lenet out: [tensor([0.0725, 0.0730, 0.1056, 0.1201, 0.1059, 0.0741, 0.1328, 0.0953, 0.1230, 0.0975])] inference result: 6 ``` 3. build C++ code ```bash cd tensorrtx/lenet cmake -S . -B build cmake --build build ``` 4. serialize wts model to engine file ```bash ./build/lenet -s ``` 5. run inference ```bash ./build/lenet -d ``` output looks like: ```bash ... 
class LeNet(nn.Module):
    """LeNet-5 style classifier for single-channel 32x32 images.

    Two conv/ReLU/max-pool stages reduce a 1x32x32 input to 16x5x5 (400
    features), which three fully connected layers map to 10 outputs.
    Attribute names double as ``state_dict`` key prefixes.
    """

    def __init__(self):
        super().__init__()
        # Feature extractor: 1x32x32 -> 6x28x28 -> 6x14x14 -> 16x10x10 -> 16x5x5
        self.conv1 = nn.Conv2d(1, 6, kernel_size=5, stride=1, padding=0)
        self.relu1 = nn.ReLU()
        self.pool1 = nn.MaxPool2d(kernel_size=2, stride=2)
        self.conv2 = nn.Conv2d(6, 16, kernel_size=5, stride=1, padding=0)
        self.relu2 = nn.ReLU()
        self.pool2 = nn.MaxPool2d(kernel_size=2, stride=2)
        # Classifier head: 400 -> 120 -> 84 -> 10
        self.fc1 = nn.Linear(400, 120)
        self.relu3 = nn.ReLU()
        self.fc2 = nn.Linear(120, 84)
        self.relu4 = nn.ReLU()
        self.fc3 = nn.Linear(84, 10)

    def forward(self, x):
        """Return the raw 10-way outputs for a batch ``x``."""
        features = x
        for stage in (
            self.conv1,
            self.relu1,
            self.pool1,
            self.conv2,
            self.relu2,
            self.pool2,
        ):
            features = stage(features)

        # Flatten everything except the batch dimension.
        hidden = features.view(features.shape[0], -1)
        for stage in (self.fc1, self.relu3, self.fc2, self.relu4):
            hidden = stage(hidden)
        return self.fc3(hidden)
def main():
    """Run LeNet on the sample digit and export its weights to ``lenet.wts``.

    Loads ``../models/model.pt``, remaps its keys with ``reformat_state_dict``,
    prints the prediction for ``../assets/6.pgm`` and writes every tensor of
    the model's ``state_dict`` to ``../models/lenet.wts``.

    The ``.wts`` layout is: one line with the number of tensors, then one line
    per tensor of the form ``<name> <count>`` followed by ``count`` big-endian
    float32 values in hexadecimal.

    Raises:
        FileNotFoundError: if the sample image cannot be read.
    """
    image_path = "../assets/6.pgm"

    model = LeNet()
    model.eval()

    with torch.inference_mode():
        img = cv2.imread(image_path, cv2.IMREAD_GRAYSCALE)
        if img is None:
            # cv2.imread signals a missing/unreadable file by returning None;
            # fail here rather than with an opaque error inside cv2.resize.
            raise FileNotFoundError(f"cannot read image: {image_path}")
        img = cv2.resize(img, (32, 32), interpolation=cv2.INTER_LINEAR)
        img = (((img / 255.0) - 0.1307) / 0.3081).astype(np.float32)

        # NOTE: weights_only=False unpickles arbitrary objects; only load
        # checkpoints from a source you trust.
        state = torch.load("../models/model.pt", weights_only=False)
        state = reformat_state_dict(state["state_dict"])
        model.load_state_dict(state)

        # Add batch and channel dimensions: (32, 32) -> (1, 1, 32, 32).
        batch = torch.from_numpy(img)[None, None, ...]
        out = model(batch)
        print(f"lenet output shape: {out.shape}")
        print(f"lenet output: {out}")
        print(f"inference result for MNIST data: {int(torch.argmax(out, 1))}")

    # save to wts
    print("Writing into lenet.wts")
    weights = model.state_dict()
    with open("../models/lenet.wts", "w", encoding="utf-8") as f:
        f.write(f"{len(weights)}\n")
        for name, tensor in weights.items():
            values = tensor.reshape(-1).cpu().numpy()
            # Each value is preceded by a single space, as in the original writer.
            hex_words = "".join(
                " " + struct.pack(">f", float(value)).hex() for value in values
            )
            f.write(f"{name} {len(values)} {hex_words}\n")
engine using only the API and not any parser. * * @param N max batch size * @param runtime runtime * @param builder builder * @param config config * @param dt data type * @return ICudaEngine* */ ICudaEngine* createLenetEngine(int32_t N, IRuntime* runtime, IBuilder* builder, IBuilderConfig* config, DataType dt) { #if TRT_VERSION >= 11200 auto flag = 1U << static_cast(NetworkDefinitionCreationFlag::kSTRONGLY_TYPED); #elif TRT_VERSION >= 10000 auto flag = 0U; #else auto flag = 1U << static_cast(NetworkDefinitionCreationFlag::kEXPLICIT_BATCH); #endif auto* network = builder->createNetworkV2(flag); // Create input tensor of shape { 1, 1, 32, 32 } with name INPUT_NAME ITensor* data = network->addInput(NAMES[0], dt, Dims4{N, 1, INPUT_H, INPUT_W}); assert(data); // Add convolution layer with 6 outputs and a 5x5 filter. std::filesystem::path wts_path{WTS_PATH}; wts_path = std::filesystem::absolute(wts_path); std::map weightMap = loadWeights(wts_path.string()); auto* conv1 = network->addConvolutionNd(*data, 6, DimsHW{5, 5}, weightMap["conv1.weight"], weightMap["conv1.bias"]); assert(conv1); conv1->setStrideNd(DimsHW{1, 1}); conv1->setName("conv1"); // Add activation layer using the ReLU algorithm. IActivationLayer* relu1 = network->addActivation(*conv1->getOutput(0), ActivationType::kRELU); assert(relu1); relu1->setName("relu1"); // Add max pooling layer with stride of 2x2 and kernel size of 2x2. IPoolingLayer* pool1 = network->addPoolingNd(*relu1->getOutput(0), PoolingType::kMAX, DimsHW{2, 2}); assert(pool1); pool1->setStrideNd(DimsHW{2, 2}); pool1->setName("pool1"); // Add second convolution layer with 16 outputs and a 5x5 filter. auto* conv2 = network->addConvolutionNd(*pool1->getOutput(0), 16, DimsHW{5, 5}, weightMap["conv2.weight"], weightMap["conv2.bias"]); assert(conv2); conv2->setStrideNd(DimsHW{1, 1}); conv2->setName("conv2"); // Add activation layer using the ReLU algorithm. 
IActivationLayer* relu2 = network->addActivation(*conv2->getOutput(0), ActivationType::kRELU); assert(relu2); // Add second max pooling layer with stride of 2x2 and kernel size of 2x2> IPoolingLayer* pool2 = network->addPoolingNd(*relu2->getOutput(0), PoolingType::kMAX, DimsHW{2, 2}); assert(pool2); pool2->setStrideNd(DimsHW{2, 2}); pool2->setName("pool2"); // Add fully connected layer auto* flatten = network->addShuffle(*pool2->getOutput(0)); flatten->setReshapeDimensions(Dims2{-1, 400}); auto* tensor_fc1w = network->addConstant(Dims2{120, 400}, weightMap["fc1.weight"])->getOutput(0); auto* fc1w = network->addMatrixMultiply(*tensor_fc1w, M::kNONE, *flatten->getOutput(0), M::kTRANSPOSE); assert(tensor_fc1w && fc1w); auto tensor_fc1b = network->addConstant(Dims2{120, 1}, weightMap["fc1.bias"])->getOutput(0); auto* fc1b = network->addElementWise(*fc1w->getOutput(0), *tensor_fc1b, E::kSUM); fc1b->setName("fc1b"); assert(tensor_fc1b && fc1b); // Add activation layer using the ReLU algorithm. 
IActivationLayer* relu3 = network->addActivation(*fc1b->getOutput(0), ActivationType::kRELU); assert(relu3); auto* flatten_relu3 = network->addShuffle(*relu3->getOutput(0)); flatten_relu3->setReshapeDimensions(Dims2{-1, 120}); auto* fc2w = network->addConstant(Dims2{84, 120}, weightMap["fc2.weight"])->getOutput(0); auto* fc2b = network->addConstant(Dims2{84, 1}, weightMap["fc2.bias"])->getOutput(0); auto* fc3w = network->addConstant(Dims2{10, 84}, weightMap["fc3.weight"])->getOutput(0); auto* fc3b = network->addConstant(Dims2{10, 1}, weightMap["fc3.bias"])->getOutput(0); assert(fc2w && fc2b && fc3w && fc3b); // fully connected layer with relu auto* fc2_0 = network->addMatrixMultiply(*fc2w, M::kNONE, *flatten_relu3->getOutput(0), M::kTRANSPOSE); assert(fc2_0); fc2_0->setName("fc2"); auto* fc2_1 = network->addElementWise(*fc2_0->getOutput(0), *fc2b, E::kSUM); assert(fc2_1); IActivationLayer* relu4 = network->addActivation(*fc2_1->getOutput(0), ActivationType::kRELU); assert(relu4); auto* shuffle = network->addShuffle(*relu4->getOutput(0)); shuffle->setReshapeDimensions(Dims2{-1, 84}); auto* fc3_0 = network->addMatrixMultiply(*fc3w, M::kNONE, *shuffle->getOutput(0), M::kTRANSPOSE); assert(fc3_0); auto* fc3_1 = network->addElementWise(*fc3_0->getOutput(0), *fc3b, E::kSUM); assert(fc3_1); // clang-format on // Add softmax layer to determine the probability. 
ISoftMaxLayer* prob = network->addSoftMax(*fc3_1->getOutput(0)); assert(prob); prob->getOutput(0)->setName(NAMES[1]); network->markOutput(*prob->getOutput(0)); #if TRT_VERSION >= 8400 config->setMemoryPoolLimit(nvinfer1::MemoryPoolType::kWORKSPACE, WORKSPACE_SIZE); #else config->setMaxWorkspaceSize(WORKSPACE_SIZE); builder->setMaxBatchSize(N); #endif // Build engine #if TRT_VERSION >= 8000 IHostMemory* serialized_mem = builder->buildSerializedNetwork(*network, *config); ICudaEngine* engine = runtime->deserializeCudaEngine(serialized_mem->data(), serialized_mem->size()); delete network; #else ICudaEngine* engine = builder->buildEngineWithConfig(*network, *config); network->destroy(); #endif // Release host memory for (auto& mem : weightMap) { free((void*)(mem.second.values)); } return engine; } /** * @brief create a model using the API directly and serialize it to a stream * * @param N max batch size * @param runtime runtime * @param modelStream */ void APIToModel(int32_t N, IRuntime* runtime, IHostMemory** modelStream) { // Create builder IBuilder* builder = createInferBuilder(gLogger); IBuilderConfig* config = builder->createBuilderConfig(); // Create model to populate the network, then set the outputs and create an engine ICudaEngine* engine = createLenetEngine(N, runtime, builder, config, DataType::kFLOAT); assert(engine != nullptr); // Serialize the engine (*modelStream) = engine->serialize(); #if TRT_VERSION >= 8000 delete engine; delete config; delete builder; #else engine->destroy(); config->destroy(); builder->destroy(); #endif } std::vector> doInference(IExecutionContext& context, void* input, int64_t batchSize) { const auto& engine = context.getEngine(); cudaStream_t stream; CHECK(cudaStreamCreate(&stream)); std::vector buffers; #if TRT_VERSION >= 8000 const int32_t nIO = engine.getNbIOTensors(); #else const int32_t nIO = engine.getNbBindings(); #endif buffers.resize(nIO); for (auto i = 0; i < nIO; ++i) { std::size_t size = 0; #if TRT_VERSION >= 8000 
auto* tensor_name = engine.getIOTensorName(i); auto s = getSize(engine.getTensorDataType(tensor_name)); size = s * batchSize * SIZES[i]; CHECK(cudaMalloc(&buffers[i], size)); if (i == 0) { CHECK(cudaMemcpyAsync(buffers[i], input, size, cudaMemcpyHostToDevice, stream)); } context.setTensorAddress(tensor_name, buffers[i]); #else const int32_t idx = engine.getBindingIndex(NAMES[i]); auto s = getSize(engine.getBindingDataType(idx)); assert(idx == i); size = s * batchSize * SIZES[i]; CHECK(cudaMalloc(&buffers[i], size)); if (i == 0) { CHECK(cudaMemcpyAsync(buffers[i], input, size, cudaMemcpyHostToDevice, stream)); } #endif } #if TRT_VERSION >= 8000 assert(context.enqueueV3(stream)); #else assert(context.enqueueV2(buffers.data(), stream, nullptr)); #endif std::vector> prob; for (int i = 1; i < nIO; ++i) { std::vector tmp(batchSize * SIZES[i], std::nanf("")); std::size_t size = batchSize * SIZES[i] * sizeof(float); CHECK(cudaMemcpyAsync(tmp.data(), buffers[i], size, cudaMemcpyDeviceToHost, stream)); prob.emplace_back(tmp); } CHECK(cudaStreamSynchronize(stream)); for (auto& buffer : buffers) { CHECK(cudaFree(buffer)); } CHECK(cudaStreamDestroy(stream)); return prob; } int main(int argc, char** argv) { try { if (argc != 2) { std::cerr << "arguments not right!\n"; std::cerr << "./lenet -s // serialize model to plan file\n"; std::cerr << "./lenet -d // deserialize plan file and run inference\n"; return -1; } IRuntime* runtime = createInferRuntime(gLogger); assert(runtime != nullptr); char* trtModelStream{nullptr}; std::streamsize size{0}; if (std::string(argv[1]) == "-s") { IHostMemory* modelStream{nullptr}; APIToModel(1, runtime, &modelStream); assert(modelStream != nullptr); std::ofstream p(ENGINE_PATH, std::ios::binary | std::ios::trunc); if (!p) { std::cerr << "could not open plan output file\n"; return -1; } if (modelStream->size() > static_cast(std::numeric_limits::max())) { std::cerr << "this model is too large to serialize\n"; return -1; } const auto* data_ptr = 
reinterpret_cast(modelStream->data()); auto data_size = static_cast(modelStream->size()); p.write(data_ptr, data_size); #if TRT_VERSION >= 8000 delete modelStream; #else modelStream->destroy(); #endif std::cout << "serialized weights to lenet5.engine\n"; return 0; } else if (std::string(argv[1]) == "-d") { std::ifstream file(ENGINE_PATH, std::ios::binary); if (file.good()) { file.seekg(0, file.end); size = file.tellg(); file.seekg(0, file.beg); trtModelStream = new char[size]; assert(trtModelStream); file.read(trtModelStream, size); file.close(); } } else { return -1; } // prepare input/output data auto img = cv::imread("../assets/6.pgm", cv::IMREAD_GRAYSCALE); cv::resize(img, img, cv::Size(32, 32), 0, 0, cv::INTER_LINEAR); assert(img.channels() == 1); img.convertTo(img, CV_32FC1, 0.00392156f, -0.1307f); img = img / cv::Scalar(0.3081); assert(img.total() * img.elemSize() == SIZES[0] * sizeof(float)); #if TRT_VERSION >= 8000 ICudaEngine* engine = runtime->deserializeCudaEngine(trtModelStream, size); #else ICudaEngine* engine = runtime->deserializeCudaEngine(trtModelStream, size, nullptr); #endif assert(engine != nullptr); IExecutionContext* context = engine->createExecutionContext(); assert(context != nullptr); // Run inference for (int32_t i = 0; i < 100; ++i) { auto _start = std::chrono::system_clock::now(); auto prob = doInference(*context, img.data, 1); auto _end = std::chrono::system_clock::now(); auto _time = std::chrono::duration_cast(_end - _start).count(); std::cout << "Execution time: " << _time << "us\n"; for (const auto& vector : prob) { int idx = 0; for (auto v : vector) { std::cout << std::setprecision(4) << v << ", " << std::flush; if (++idx > 9) { std::cout << "\n====\n"; break; } } } if (i == 99) { std::cout << "prediction result:\n"; int _top = 0; for (auto& [idx, logits] : topk(prob[0], 3)) { std::cout << "Top: " << _top++ << " idx: " << idx << ", logits: " << logits << ", label: " << idx << "\n"; } } } #if TRT_VERSION >= 8000 delete context; 
def load_weights(file):
    """Parse a ``.wts`` weight file into a ``{name: float32 array}`` mapping.

    The first line of the file holds the number of blobs. Each following
    line is ``<name> <count>`` followed by ``<count>`` big-endian float32
    values written as hex strings. Fields may be separated by any run of
    whitespace.

    Args:
        file: path of the weight file.

    Returns:
        dict mapping each blob name to a 1-D ``np.float32`` array.

    Raises:
        FileNotFoundError: if ``file`` does not exist.
        ValueError: if the blob count or a per-blob value count does not
            match the file contents.
    """
    print(f"Loading weights: {file}")

    if not os.path.exists(file):
        raise FileNotFoundError(f"Unable to load weight file: {file}")

    with open(file, "r") as f:
        lines = [line.strip() for line in f]

    if not lines:
        raise ValueError(f"Weight file is empty: {file}")
    count = int(lines[0])
    if count != len(lines) - 1:
        raise ValueError(
            f"Weight file declares {count} blobs but has {len(lines) - 1} lines"
        )

    unpack_float = struct.Struct(">f").unpack
    weight_map = {}
    for line in lines[1:]:
        # split() with no argument tolerates repeated separators, which a
        # writer emitting "name count  hex ..." produces after the count.
        name, declared, *hex_values = line.split()
        if int(declared) != len(hex_values):
            raise ValueError(
                f"Blob '{name}' declares {declared} values but has {len(hex_values)}"
            )
        # hex string -> 4 raw bytes -> float; unpack returns a 1-tuple.
        weight_map[name] = np.array(
            [unpack_float(bytes.fromhex(h))[0] for h in hex_values],
            dtype=np.float32,
        )

    return weight_map
def APIToModel(maxBatchSize):
    """Build the LeNet engine and write its serialized plan to ``engine_path``.

    Args:
        maxBatchSize: forwarded unchanged to ``createLenetEngine``.
    """
    trt_builder = trt.Builder(gLogger)
    builder_config = trt_builder.create_builder_config()

    lenet_engine = createLenetEngine(
        maxBatchSize, trt_builder, builder_config, trt.float32
    )
    assert lenet_engine

    with open(engine_path, "wb") as plan_file:
        serialized_plan = lenet_engine.serialize()
        plan_file.write(serialized_plan)

    # Drop the engine before the builder that produced it.
    del lenet_engine
    del trt_builder
if __name__ == '__main__':
    # Command-line entry point: "-s" builds and serializes the engine,
    # "-d" deserializes it and runs one inference on an all-ones input.
    parser = argparse.ArgumentParser()
    parser.add_argument("-s", action='store_true')
    parser.add_argument("-d", action='store_true')
    args = parser.parse_args()

    # Exactly one of -s / -d must be given.
    if not (args.s ^ args.d):
        print("arguments not right!")
        print("python lenet.py -s   # serialize model to plan file")
        print("python lenet.py -d   # deserialize plan file and run inference")
        # Exit with a non-zero status so callers can detect the usage error.
        sys.exit(1)

    if args.s:
        APIToModel(1)
    else:
        runtime = trt.Runtime(gLogger)
        assert runtime

        with open(engine_path, "rb") as f:
            engine = runtime.deserialize_cuda_engine(f.read())
        assert engine

        context = engine.create_execution_context()
        assert context

        # Page-locked host buffers for the flattened input and the class scores.
        data = np.ones((INPUT_H * INPUT_W), dtype=np.float32)
        host_in = cuda.pagelocked_empty(INPUT_H * INPUT_W, dtype=np.float32)
        np.copyto(host_in, data.ravel())
        host_out = cuda.pagelocked_empty(OUTPUT_SIZE, dtype=np.float32)

        doInference(context, host_in, host_out, 1)

        print(f'Output: {host_out}')
class Lenet5(tp.Module):
    """LeNet-5 expressed with nvtripy modules.

    Two 5x5 convolutions, each followed by ReLU and 2x2 average pooling,
    then three linear layers; the last one feeds a softmax over dim 1.
    """

    def __init__(self):
        super().__init__()
        # Feature extractor.
        self.conv1 = tp.Conv(1, 6, kernel_dims=(5, 5))
        self.conv2 = tp.Conv(6, 16, kernel_dims=(5, 5))
        # Classifier head; fc1 takes the flattened 16 * 5 * 5 feature map.
        self.fc1 = tp.Linear(16 * 5 * 5, 120)
        self.fc2 = tp.Linear(120, 84)
        self.fc3 = tp.Linear(84, 10)

    def forward(self, x):
        """Return the softmax output of the network for input ``x``."""
        stage1 = tp.relu(self.conv1(x))
        stage1 = tp.avgpool(stage1, kernel_dims=(2, 2), stride=(2, 2))

        stage2 = tp.relu(self.conv2(stage1))
        stage2 = tp.avgpool(stage2, kernel_dims=(2, 2), stride=(2, 2))

        features = tp.flatten(stage2, 1)
        hidden = tp.relu(self.fc1(features))
        hidden = tp.relu(self.fc2(hidden))
        logits = self.fc3(hidden)
        return tp.softmax(logits, dim=1)
//!
//! \class LogStreamConsumerBuffer
//! \brief String buffer that forwards its contents to an output stream, prefixed with a
//!        local-time timestamp and a fixed prefix, whenever it is synchronized or destroyed.
//!
class LogStreamConsumerBuffer : public std::stringbuf {
   public:
    //! \param stream    destination stream for flushed messages
    //! \param prefix    text inserted before every flushed message
    //! \param shouldLog when false, putOutput() emits nothing
    LogStreamConsumerBuffer(std::ostream& stream, std::string prefix, bool shouldLog)
        : mOutput(stream), mPrefix(std::move(prefix)), mShouldLog(shouldLog) {}

    //! Takes over the destination, prefix and logging flag of \p other.
    LogStreamConsumerBuffer(LogStreamConsumerBuffer&& other) noexcept
        : mOutput(other.mOutput), mPrefix(std::move(other.mPrefix)), mShouldLog(other.mShouldLog) {}

    ~LogStreamConsumerBuffer() override {
        // std::streambuf::pbase() gives a pointer to the beginning of the buffered part of the output sequence
        // std::streambuf::pptr() gives a pointer to the current position of the output sequence
        // if the pointer to the beginning is not equal to the pointer to the current position,
        // call putOutput() to log the output to the stream
        if (pbase() != pptr()) {
            putOutput();
        }
    }

    // synchronizes the stream buffer and returns 0 on success
    // synchronizing the stream buffer consists of inserting the buffer contents into the stream,
    // resetting the buffer and flushing the stream
    int sync() override {
        putOutput();
        return 0;
    }

    //! Writes "[MM/DD/YYYY-HH:MM:SS] <prefix><buffered text>" to the destination stream,
    //! clears the buffer and flushes the stream. Does nothing when logging is disabled.
    void putOutput() {
        if (mShouldLog) {
            // prepend timestamp; it goes to the same stream as the message so that
            // a message routed to std::cerr does not leave its timestamp on std::cout
            std::time_t timestamp = std::time(nullptr);
            std::tm* tm_local = std::localtime(&timestamp);
            if (tm_local != nullptr) {
                // setfill() is sticky, so remember the previous fill character and restore it
                const char previousFill = mOutput.fill('0');
                mOutput << "[";
                mOutput << std::setw(2) << 1 + tm_local->tm_mon << "/";
                mOutput << std::setw(2) << tm_local->tm_mday << "/";
                mOutput << std::setw(4) << 1900 + tm_local->tm_year << "-";
                mOutput << std::setw(2) << tm_local->tm_hour << ":";
                mOutput << std::setw(2) << tm_local->tm_min << ":";
                mOutput << std::setw(2) << tm_local->tm_sec << "] ";
                mOutput.fill(previousFill);
            }
            // std::stringbuf::str() gets the string contents of the buffer
            // insert the buffer contents pre-appended by the appropriate prefix into the stream
            mOutput << mPrefix << str();
            // set the buffer to empty
            str("");
            // flush the stream
            mOutput.flush();
        }
    }

    void setShouldLog(bool shouldLog) { mShouldLog = shouldLog; }

   private:
    std::ostream& mOutput;  // destination stream
    std::string mPrefix;    // text emitted before every message
    bool mShouldLog;        // gate checked by putOutput()
};
LogStreamConsumer(Severity reportableSeverity, Severity severity) : LogStreamConsumerBase(severityOstream(severity), severityPrefix(severity), severity <= reportableSeverity), std::ostream(&mBuffer) // links the stream buffer with the stream , mShouldLog(severity <= reportableSeverity), mSeverity(severity) {} LogStreamConsumer(LogStreamConsumer&& other) noexcept : LogStreamConsumerBase(severityOstream(other.mSeverity), severityPrefix(other.mSeverity), other.mShouldLog), std::ostream(&mBuffer) // links the stream buffer with the stream , mShouldLog(other.mShouldLog), mSeverity(other.mSeverity) {} void setReportableSeverity(Severity reportableSeverity) { mShouldLog = mSeverity <= reportableSeverity; mBuffer.setShouldLog(mShouldLog); } private: static std::ostream& severityOstream(Severity severity) { return severity >= Severity::kINFO ? std::cout : std::cerr; } static std::string severityPrefix(Severity severity) { switch (severity) { case Severity::kINTERNAL_ERROR: return "[F] "; case Severity::kERROR: return "[E] "; case Severity::kWARNING: return "[W] "; case Severity::kINFO: return "[I] "; case Severity::kVERBOSE: return "[V] "; default: assert(0); return ""; } } bool mShouldLog; Severity mSeverity; }; //! \class Logger //! //! \brief Class which manages logging of TensorRT tools and samples //! //! \details This class provides a common interface for TensorRT tools and samples to log information to the console, //! and supports logging two types of messages: //! //! - Debugging messages with an associated severity (info, warning, error, or internal error/fatal) //! - Test pass/fail messages //! //! The advantage of having all samples use this class for logging as opposed to emitting directly to stdout/stderr is //! that the logic for controlling the verbosity and formatting of sample output is centralized in one location. //! //! In the future, this class could be extended to support dumping test results to a file in some standard format //! 
(for example, JUnit XML), and providing additional metadata (e.g. timing the duration of a test run). //! //! TODO: For backwards compatibility with existing samples, this class inherits directly from the nvinfer1::ILogger //! interface, which is problematic since there isn't a clean separation between messages coming from the TensorRT //! library and messages coming from the sample. //! //! In the future (once all samples are updated to use Logger::getTRTLogger() to access the ILogger) we can refactor the //! class to eliminate the inheritance and instead make the nvinfer1::ILogger implementation a member of the Logger //! object. class Logger : public nvinfer1::ILogger { private: struct TestInfo; public: Logger(Severity severity = Severity::kWARNING) : mReportableSeverity(severity) {} //! //! \enum TestResult //! \brief Represents the state of a given test //! enum class TestResult : std::uint8_t { kRUNNING, //!< The test is running kPASSED, //!< The test passed kFAILED, //!< The test failed kWAIVED //!< The test was waived }; //! //! \brief Forward-compatible method for retrieving the nvinfer::ILogger associated with this Logger //! \return The nvinfer1::ILogger associated with this Logger //! //! TODO Once all samples are updated to use this method to register the logger with TensorRT, //! we can eliminate the inheritance of Logger from ILogger //! nvinfer1::ILogger& getTRTLogger() { return *this; } //! //! \brief Implementation of the nvinfer1::ILogger::log() virtual method //! //! Note samples should not be calling this function directly; it will eventually go away once we eliminate the //! inheritance from nvinfer1::ILogger //! void log(Severity severity, const char* msg) TRT_NOEXCEPT override { LogStreamConsumer(mReportableSeverity, severity) << "[TRT] " << std::string(msg) << '\n'; } //! //! \brief Method for controlling the verbosity of logging output //! //! \param severity The logger will only emit messages that have severity of this level or higher. 
//! void setReportableSeverity(Severity severity) { mReportableSeverity = severity; } //! //! \brief Opaque handle that holds logging information for a particular test //! //! This object is an opaque handle to information used by the Logger to print test results. //! The sample must call Logger::defineTest() in order to obtain a TestAtom that can be used //! with Logger::reportTest{Start,End}(). //! class TestAtom { public: TestAtom(TestAtom&&) = default; private: friend class Logger; TestAtom(bool started, TestInfo info) : mStarted(started), mName(std::move(info.name)), mCmdline(std::move(info.cmdline)) {} bool mStarted; std::string mName; std::string mCmdline; }; //! //! \brief Define a test for logging //! //! \param[in] name The name of the test. This should be a string starting with //! "TensorRT" and containing dot-separated strings containing //! the characters [A-Za-z0-9_]. //! For example, "TensorRT.sample_googlenet" //! \param[in] cmdline The command line used to reproduce the test // //! \return a TestAtom that can be used in Logger::reportTest{Start,End}(). //! static TestAtom defineTest(const std::string& name, const std::string& cmdline) { return TestAtom{false, TestInfo{name, cmdline}}; } //! //! \brief A convenience overloaded version of defineTest() that accepts an array of command-line arguments //! as input //! //! \param[in] name The name of the test //! \param[in] argc The number of command-line arguments //! \param[in] argv The array of command-line arguments (given as C strings) //! //! \return a TestAtom that can be used in Logger::reportTest{Start,End}(). static TestAtom defineTest(const std::string& name, int argc, char const* const* argv) { auto cmdline = genCmdlineString(argc, argv); return defineTest(name, cmdline); } //! //! \brief Report that a test has started. //! //! \pre reportTestStart() has not been called yet for the given testAtom //! //! \param[in] testAtom The handle to the test that has started //! 
static void reportTestStart(TestAtom& testAtom) { reportTestResult(testAtom, TestResult::kRUNNING); assert(!testAtom.mStarted); testAtom.mStarted = true; } //! //! \brief Report that a test has ended. //! //! \pre reportTestStart() has been called for the given testAtom //! //! \param[in] testAtom The handle to the test that has ended //! \param[in] result The result of the test. Should be one of TestResult::kPASSED, //! TestResult::kFAILED, TestResult::kWAIVED //! static void reportTestEnd(const TestAtom& testAtom, TestResult result) { assert(result != TestResult::kRUNNING); assert(testAtom.mStarted); reportTestResult(testAtom, result); } static int reportPass(const TestAtom& testAtom) { reportTestEnd(testAtom, TestResult::kPASSED); return EXIT_SUCCESS; } static int reportFail(const TestAtom& testAtom) { reportTestEnd(testAtom, TestResult::kFAILED); return EXIT_FAILURE; } static int reportWaive(const TestAtom& testAtom) { reportTestEnd(testAtom, TestResult::kWAIVED); return EXIT_SUCCESS; } static int reportTest(const TestAtom& testAtom, bool pass) { return pass ? reportPass(testAtom) : reportFail(testAtom); } [[nodiscard]] Severity getReportableSeverity() const { return mReportableSeverity; } private: struct TestInfo { std::string name; std::string cmdline; }; //! //! \brief returns an appropriate string for prefixing a log message with the given severity //! static const char* severityPrefix(Severity severity) { switch (severity) { case Severity::kINTERNAL_ERROR: return "[F] "; case Severity::kERROR: return "[E] "; case Severity::kWARNING: return "[W] "; case Severity::kINFO: return "[I] "; case Severity::kVERBOSE: return "[V] "; default: assert(0); return ""; } } //! //! \brief returns an appropriate string for prefixing a test result message with the given result //! 
static const char* testResultString(TestResult result) { switch (result) { case TestResult::kRUNNING: return "RUNNING"; case TestResult::kPASSED: return "PASSED"; case TestResult::kFAILED: return "FAILED"; case TestResult::kWAIVED: return "WAIVED"; default: assert(0); return ""; } } //! //! \brief returns an appropriate output stream (cout or cerr) to use with the given severity //! static std::ostream& severityOstream(Severity severity) { return severity >= Severity::kINFO ? std::cout : std::cerr; } //! //! \brief method that implements logging test results //! static void reportTestResult(const TestAtom& testAtom, TestResult result) { severityOstream(Severity::kINFO) << "&&&& " << testResultString(result) << " " << testAtom.mName << " # " << testAtom.mCmdline << '\n'; } //! //! \brief generate a command line string from the given (argc, argv) values //! static std::string genCmdlineString(int argc, char const* const* argv) { std::stringstream ss; for (int i = 0; i < argc; i++) { if (i > 0) ss << " "; ss << argv[i]; } return ss.str(); } Severity mReportableSeverity; }; namespace { //! //! \brief produces a LogStreamConsumer object that can be used to log messages of severity kVERBOSE //! //! Example usage: //! //! LOG_VERBOSE(logger) << "hello world" << std::endl; //! inline LogStreamConsumer LOG_VERBOSE(const Logger& logger) { return LogStreamConsumer{logger.getReportableSeverity(), Severity::kVERBOSE}; } //! //! \brief produces a LogStreamConsumer object that can be used to log messages of severity kINFO //! //! Example usage: //! //! LOG_INFO(logger) << "hello world" << std::endl; //! inline LogStreamConsumer LOG_INFO(const Logger& logger) { return LogStreamConsumer{logger.getReportableSeverity(), Severity::kINFO}; } //! //! \brief produces a LogStreamConsumer object that can be used to log messages of severity kWARNING //! //! Example usage: //! //! LOG_WARN(logger) << "hello world" << std::endl; //! 
inline LogStreamConsumer LOG_WARN(const Logger& logger) { return LogStreamConsumer{logger.getReportableSeverity(), Severity::kWARNING}; } //! //! \brief produces a LogStreamConsumer object that can be used to log messages of severity kERROR //! //! Example usage: //! //! LOG_ERROR(logger) << "hello world" << std::endl; //! inline LogStreamConsumer LOG_ERROR(const Logger& logger) { return LogStreamConsumer{logger.getReportableSeverity(), Severity::kERROR}; } //! //! \brief produces a LogStreamConsumer object that can be used to log messages of severity kINTERNAL_ERROR // ("fatal" severity) //! //! Example usage: //! //! LOG_FATAL(logger) << "hello world" << std::endl; //! inline LogStreamConsumer LOG_FATAL(const Logger& logger) { return LogStreamConsumer{logger.getReportableSeverity(), Severity::kINTERNAL_ERROR}; } } // anonymous namespace #endif // TENSORRT_LOGGING_H ================================================ FILE: lenet/macros.h ================================================ #pragma once #include #ifdef API_EXPORTS #if defined(_MSC_VER) #define API __declspec(dllexport) #else #define API __attribute__((visibility("default"))) #endif #else #if defined(_MSC_VER) #define API __declspec(dllimport) #else #define API #endif #endif // API_EXPORTS #define TRT_VERSION \ ((NV_TENSORRT_MAJOR * 1000) + (NV_TENSORRT_MINOR * 100) + (NV_TENSORRT_PATCH * 10) + NV_TENSORRT_BUILD) #if TRT_VERSION < 7220 #error "TensorRT >= 7.2.2 is required for this demo." 
/**
 * @brief Return the k largest entries of v as (index, value) pairs, largest first.
 *
 * When k exceeds v.size() the result is clamped to v.size() entries instead of
 * reading past the end of the index buffer.
 *
 * @param v input scores
 * @param k number of entries requested; values <= 0 yield an empty result
 * @return std::vector<std::pair<int, float>> at most min(k, v.size()) pairs ordered by descending value
 */
static std::vector<std::pair<int, float>> topk(const std::vector<float>& v, int64_t k) {
    if (k <= 0)
        return {};

    // Number of entries that can actually be produced.
    const auto s = std::min(k, static_cast<int64_t>(v.size()));

    std::vector<int> idx(v.size());
    std::iota(idx.begin(), idx.end(), 0);
    // Only the first s positions need to be ordered.
    std::partial_sort(idx.begin(), std::next(idx.begin(), s), idx.end(), [&](int a, int b) { return v[a] > v[b]; });

    std::vector<std::pair<int, float>> out;
    out.reserve(s);
    // Iterate up to s (not k): idx holds only v.size() elements.
    for (int64_t i = 0; i < s; ++i)
        out.emplace_back(idx[i], v[idx[i]]);
    return out;
}
# add library
add_library(TensorRT IMPORTED INTERFACE)
add_library(TensorRT::TensorRT ALIAS TensorRT)

# Cache entry so the version can be given with -DTRT_VERSION=...; empty by default.
set(TRT_VERSION
    ""
    CACHE
      STRING
      "TensorRT version, e.g. \"8.6.1.6\" or \"8.6.1.6+cuda12.0.1.011\", \"8.6.1.6.Windows10.x86_64.cuda-12.0\" etc"
)

# Every expansion below is quoted: an unquoted empty expansion disappears from
# the argument list, which changes how if() parses the condition and makes
# string(REGEX MATCH ...) fail with an argument-count error.
if(NOT "${TRT_VERSION}" STREQUAL "" AND NOT "$ENV{TRT_VERSION}" STREQUAL "")
  message(
    WARNING
      "TRT_VERSION defined by cmake and environment variable both, using the later one"
  )
endif()

# The environment variable, when set, takes precedence over the cache entry.
if(NOT "$ENV{TRT_VERSION}" STREQUAL "")
  set(TRT_VERSION "$ENV{TRT_VERSION}")
endif()

# The major version is the first run of digits in TRT_VERSION.
string(REGEX MATCH "([0-9]+)" _match "${TRT_VERSION}")
if("${_match}" STREQUAL "")
  message(
    FATAL_ERROR
      "Cannot determine the TensorRT major version from TRT_VERSION='${TRT_VERSION}'; pass -DTRT_VERSION=<version> or export TRT_VERSION in the environment"
  )
endif()
set(TRT_MAJOR_VERSION "${_match}")
unset(_match)
# Resolve every requested TensorRT module to a library path. A module that
# cannot be found must not be appended: find_library() leaves the variable at
# "<var>-NOTFOUND", and linking against that value fails at generate time.
foreach(lib IN LISTS _modules)
  find_library(
    TensorRT_${lib}_LIBRARY
    NAMES ${lib}
    HINTS ${TensorRT_LIBRARY_DIR})
  if(NOT TensorRT_${lib}_LIBRARY)
    # nvinfer / nvinfer_plugin (optionally with a numeric suffix such as _10)
    # are mandatory; the remaining modules are skipped when absent.
    if(lib MATCHES "^nvinfer(_plugin)?(_[0-9]+)?$")
      message(
        FATAL_ERROR
          "Required TensorRT library '${lib}' not found in '${TensorRT_LIBRARY_DIR}'"
      )
    endif()
    message(
      WARNING
        "TensorRT library '${lib}' not found in '${TensorRT_LIBRARY_DIR}', skipping it"
    )
    continue()
  endif()
  list(APPEND TensorRT_LIBRARIES ${TensorRT_${lib}_LIBRARY})
endforeach()

target_link_libraries(TensorRT INTERFACE ${TensorRT_LIBRARIES})

message(STATUS "Found TensorRT libs: ${TensorRT_LIBRARIES}")
CXX_STANDARD 17 POSITION_INDEPENDENT_CODE ON SKIP_BUILD_RPATH TRUE BUILD_WITH_INSTALL_RPATH TRUE INSTALL_RPATH "$ORIGIN" INTERFACE_INCLUDE_DIRECTORIES "${TensorRT_INCLUDE_DIR}") unset(TRT_MAJOR_VERSION) unset(_modules) unset(_trt_include_candidates) unset(_trt_library_candidates) unset(_trt_arch) ================================================ FILE: lprnet/README.md ================================================ # LPRNet The PyTorch implementation is [xuexingyu24/License_Plate_Detection_Pytorch](https://github.com/xuexingyu24/License_Plate_Detection_Pytorch). ## Usage 1. download the model from [HERE](https://github.com/xuexingyu24/License_Plate_Detection_Pytorch/blob/master/LPRNet/weights/Final_LPRNet_model.pth) and put it into the `models` folder 2. use `gen_wts.py` to generate the wts file ```bash python3 gen_wts.py ``` 3. build the C++ code ```bash pushd tensorrtx/lprnet cmake -S . -B build -G Ninja --fresh cmake --build build ``` 4. serialize the wts model to an engine file ```bash ./build/lprnet -s ``` now you should see `LPRNet.engine` under `models` 5. run the inference sample; it uses the image under `assets` by default: ![sample](../assets/car_plate.jpg) ```bash ./build/lprnet -d ``` the output looks like: ```bash ... Execution time: 205us -65.58, -28.74, -52.1, -70.79, -53.36, -57.58, -70.97, -60.66, -48.18, -57.38, -54.07, -58.56, -49.04, -52.39, -51.94, -53.4, -49.04, -45.89, -49.42, -7.863, -42.12, ==== Execution time: 202us -65.58, -28.74, -52.1, -70.79, -53.36, -57.58, -70.97, -60.66, -48.18, -57.38, -54.07, -58.56, -49.04, -52.39, -51.94, -53.4, -49.04, -45.89, -49.42, -7.863, -42.12, ==== result: 沪BKB770 ``` ## Note if you are running this demo on Windows, you may need to check the code page, e.g., for Windows PowerShell, run: ```ps1 chcp ``` if the output is not **65001**, then use ```ps1 chcp 65001 ``` to set the code page to UTF-8, so that the recognized plate text is displayed correctly. 
class small_basic_block(nn.Module):
    """Four-convolution block: 1x1 -> 3x1 -> 1x3 -> 1x1 with ReLU in between.

    The three inner stages work on ``ch_out // 4`` channels; the last 1x1
    convolution expands to ``ch_out``. Padding on the 3x1 and 1x3 convolutions
    keeps the spatial size unchanged. The layers live in ``self.block`` at
    indices 0..6, so the convolutions are ``block.0``, ``block.2``,
    ``block.4`` and ``block.6`` in the state dict.
    """

    def __init__(self, ch_in, ch_out):
        super().__init__()
        mid = ch_out // 4
        stages = [
            nn.Conv2d(ch_in, mid, kernel_size=1),
            nn.ReLU(),
            nn.Conv2d(mid, mid, kernel_size=(3, 1), padding=(1, 0)),
            nn.ReLU(),
            nn.Conv2d(mid, mid, kernel_size=(1, 3), padding=(0, 1)),
            nn.ReLU(),
            nn.Conv2d(mid, ch_out, kernel_size=1),
        ]
        self.block = nn.Sequential(*stages)

    def forward(self, x):
        """Run ``x`` through the sequential block and return the result."""
        out = self.block(x)
        return out
def forward(self, x):
    """Run the backbone, fuse four tapped feature maps and return the logits.

    The outputs of backbone layers 2, 6, 13 and 22 are collected (each tapped
    layer is also printed). The first two taps are average-pooled 5x5 with
    stride 5, the third with a (4, 10) kernel and (4, 2) stride, the fourth is
    used as is. Every tap is divided by the mean of its squared values, the
    results are concatenated along dim 1, passed through ``self.container``
    and averaged over dim 2.
    """
    tap_layers = (2, 6, 13, 22)  # [2, 4, 8, 11, 22]
    taps = []
    for idx, module in enumerate(self.backbone.children()):
        x = module(x)
        if idx not in tap_layers:
            continue
        print(self.backbone[idx])
        taps.append(x)

    normalized = []
    for pos, feat in enumerate(taps):
        if pos < 2:
            feat = nn.AvgPool2d(kernel_size=5, stride=5)(feat)
        elif pos == 2:
            feat = nn.AvgPool2d(kernel_size=(4, 10), stride=(4, 2))(feat)
        # Scale by the mean of the squared activations of the whole tensor.
        energy = torch.mean(torch.pow(feat, 2))
        normalized.append(torch.div(feat, energy))

    fused = torch.cat(normalized, 1)
    return torch.mean(self.container(fused), dim=2)
//!
//! \class LogStreamConsumerBuffer
//! \brief String buffer that forwards its contents to an output stream, preceded by a timestamp and a prefix
//!
class LogStreamConsumerBuffer : public std::stringbuf {
   public:
    //!
    //! \param stream    Stream that receives the formatted output
    //! \param prefix    Text inserted between the timestamp and the buffered message
    //! \param shouldLog When false, putOutput() writes nothing
    //!
    LogStreamConsumerBuffer(std::ostream& stream, std::string prefix, bool shouldLog)
        : mOutput(stream), mPrefix(std::move(prefix)), mShouldLog(shouldLog) {}

    //! Takes over the target stream, prefix and logging flag of \p other; buffered text is not transferred.
    LogStreamConsumerBuffer(LogStreamConsumerBuffer&& other) noexcept
        : mOutput(other.mOutput), mPrefix(std::move(other.mPrefix)), mShouldLog(other.mShouldLog) {}

    ~LogStreamConsumerBuffer() override {
        // std::streambuf::pbase() gives a pointer to the beginning of the buffered part of the output sequence
        // std::streambuf::pptr() gives a pointer to the current position of the output sequence
        // if the pointer to the beginning is not equal to the pointer to the current position,
        // call putOutput() to log the output to the stream
        if (pbase() != pptr()) {
            putOutput();
        }
    }

    // synchronizes the stream buffer and returns 0 on success
    // synchronizing the stream buffer consists of inserting the buffer contents into the stream,
    // resetting the buffer and flushing the stream
    int sync() override {
        putOutput();
        return 0;
    }

    //! Writes "[MM/DD/YYYY-HH:MM:SS] <prefix><buffered text>" to the target stream, then clears the buffer and
    //! flushes the stream. Does nothing when logging is disabled.
    void putOutput() {
        if (mShouldLog) {
            // prepend timestamp; it goes to the same stream as the message so both stay together
            std::time_t timestamp = std::time(nullptr);
            const std::tm* tm_local = std::localtime(&timestamp);
            // zero-pad the date/time fields, remembering the stream's fill character
            const char previousFill = mOutput.fill('0');
            mOutput << "[";
            mOutput << std::setw(2) << 1 + tm_local->tm_mon << "/";
            mOutput << std::setw(2) << tm_local->tm_mday << "/";
            mOutput << std::setw(4) << 1900 + tm_local->tm_year << "-";
            mOutput << std::setw(2) << tm_local->tm_hour << ":";
            mOutput << std::setw(2) << tm_local->tm_min << ":";
            mOutput << std::setw(2) << tm_local->tm_sec << "] ";
            // restore the fill character so later output on the stream is not zero-padded
            mOutput.fill(previousFill);
            // std::stringbuf::str() gets the string contents of the buffer
            // insert the buffer contents pre-appended by the appropriate prefix into the stream
            mOutput << mPrefix << str();
            // set the buffer to empty
            str("");
            // flush the stream
            mOutput.flush();
        }
    }

    //! Enables or disables output for subsequent putOutput() calls.
    void setShouldLog(bool shouldLog) { mShouldLog = shouldLog; }

   private:
    std::ostream& mOutput;  // stream that receives timestamp, prefix and message
    std::string mPrefix;    // text written in front of every message
    bool mShouldLog;        // when false, putOutput() is a no-op
};
//!
//! \class LogStreamConsumer
//! \brief Convenience object used to facilitate use of C++ stream syntax when logging messages.
//!  Order of base classes is LogStreamConsumerBase and then std::ostream.
//!  This is because the LogStreamConsumerBase class is used to initialize the LogStreamConsumerBuffer member field
//!  in LogStreamConsumer and then the address of the buffer is passed to std::ostream.
//!  This is necessary to prevent the address of an uninitialized buffer from being passed to std::ostream.
//!  Please do not change the order of the parent classes.
//!
class LogStreamConsumer : protected LogStreamConsumerBase, public std::ostream {
   public:
    //! \brief Creates a LogStreamConsumer which logs messages with level severity.
    //!  Reportable severity determines if the messages are severe enough to be logged:
    //!  output is enabled when severity <= reportableSeverity.
    //!
    //! \param reportableSeverity Threshold compared against \p severity
    //! \param severity           Severity of the messages written through this object; it selects the
    //!                           target stream and the "[X] " prefix
    LogStreamConsumer(Severity reportableSeverity, Severity severity)
        : LogStreamConsumerBase(severityOstream(severity), severityPrefix(severity), severity <= reportableSeverity),
          std::ostream(&mBuffer)  // links the stream buffer with the stream
          ,
          mShouldLog(severity <= reportableSeverity),
          mSeverity(severity) {}

    //! \brief Move constructor.
    //!  A new buffer is built from the severity and logging flag of \p other; text already buffered in
    //!  \p other is not carried over.
    LogStreamConsumer(LogStreamConsumer&& other) noexcept
        : LogStreamConsumerBase(severityOstream(other.mSeverity), severityPrefix(other.mSeverity), other.mShouldLog),
          std::ostream(&mBuffer)  // links the stream buffer with the stream
          ,
          mShouldLog(other.mShouldLog),
          mSeverity(other.mSeverity) {}

    //! \brief Re-evaluates whether this consumer emits output against a new threshold and
    //!  forwards the result to the underlying buffer.
    void setReportableSeverity(Severity reportableSeverity) {
        mShouldLog = mSeverity <= reportableSeverity;
        mBuffer.setShouldLog(mShouldLog);
    }

   private:
    //! \brief Selects std::cout for severities that compare >= Severity::kINFO and std::cerr otherwise.
    //!  NOTE(review): presumably kINFO and kVERBOSE are the only values >= kINFO - confirm against the
    //!  nvinfer1::ILogger::Severity declaration.
    static std::ostream& severityOstream(Severity severity) {
        return severity >= Severity::kINFO ? std::cout : std::cerr;
    }

    //! \brief Returns the bracketed one-letter tag for \p severity ("[F] ", "[E] ", "[W] ", "[I] ", "[V] ").
    //!  Any other value trips the assertion and yields an empty string when assertions are disabled.
    static std::string severityPrefix(Severity severity) {
        switch (severity) {
            case Severity::kINTERNAL_ERROR:
                return "[F] ";
            case Severity::kERROR:
                return "[E] ";
            case Severity::kWARNING:
                return "[W] ";
            case Severity::kINFO:
                return "[I] ";
            case Severity::kVERBOSE:
                return "[V] ";
            default:
                assert(0);
                return "";
        }
    }

    bool mShouldLog;     // true when messages of mSeverity pass the reportable threshold
    Severity mSeverity;  // severity assigned to everything written through this consumer
};
//! \class Logger
//!
//! \brief Class which manages logging of TensorRT tools and samples
//!
//! \details This class provides a common interface for TensorRT tools and samples to log information to the console,
//! and supports logging two types of messages:
//!
//! - Debugging messages with an associated severity (info, warning, error, or internal error/fatal)
//! - Test pass/fail messages
//!
//! The advantage of having all samples use this class for logging as opposed to emitting directly to stdout/stderr is
//! that the logic for controlling the verbosity and formatting of sample output is centralized in one location.
//!
//! In the future, this class could be extended to support dumping test results to a file in some standard format
//! (for example, JUnit XML), and providing additional metadata (e.g. timing the duration of a test run).
//!
//! TODO: For backwards compatibility with existing samples, this class inherits directly from the nvinfer1::ILogger
//! interface, which is problematic since there isn't a clean separation between messages coming from the TensorRT
//! library and messages coming from the sample.
//!
//! In the future (once all samples are updated to use Logger::getTRTLogger() to access the ILogger) we can refactor the
//! class to eliminate the inheritance and instead make the nvinfer1::ILogger implementation a member of the Logger
//! object.
class Logger : public nvinfer1::ILogger {
   private:
    // Forward declaration: TestAtom's constructor takes a TestInfo, which is defined in the
    // private section at the end of the class.
    struct TestInfo;

   public:
    //!
    //! \brief Creates a logger with the given reportable severity (Severity::kWARNING by default)
    //!
    Logger(Severity severity = Severity::kWARNING) : mReportableSeverity(severity) {}

    //!
    //! \enum TestResult
    //! \brief Represents the state of a given test
    //!
    enum class TestResult : std::uint8_t {
        kRUNNING,  //!< The test is running
        kPASSED,   //!< The test passed
        kFAILED,   //!< The test failed
        kWAIVED    //!< The test was waived
    };

    //!
    //! \brief Forward-compatible method for retrieving the nvinfer::ILogger associated with this Logger
    //! \return The nvinfer1::ILogger associated with this Logger
    //!
    //! TODO Once all samples are updated to use this method to register the logger with TensorRT,
    //! we can eliminate the inheritance of Logger from ILogger
    //!
    nvinfer1::ILogger& getTRTLogger() { return *this; }

    //!
    //! \brief Implementation of the nvinfer1::ILogger::log() virtual method
    //!
    //! Writes "[TRT] " followed by \p msg and a newline through a temporary LogStreamConsumer built from the
    //! current reportable severity and \p severity.
    //!
    //! Note samples should not be calling this function directly; it will eventually go away once we eliminate the
    //! inheritance from nvinfer1::ILogger
    //!
    void log(Severity severity, const char* msg) TRT_NOEXCEPT override {
        LogStreamConsumer(mReportableSeverity, severity) << "[TRT] " << std::string(msg) << '\n';
    }

    //!
    //! \brief Method for controlling the verbosity of logging output
    //!
    //! \param severity The logger will only emit messages that have severity of this level or higher.
    //!
    void setReportableSeverity(Severity severity) { mReportableSeverity = severity; }

    //!
    //! \brief Opaque handle that holds logging information for a particular test
    //!
    //! This object is an opaque handle to information used by the Logger to print test results.
    //! The sample must call Logger::defineTest() in order to obtain a TestAtom that can be used
    //! with Logger::reportTest{Start,End}().
    //!
    class TestAtom {
       public:
        TestAtom(TestAtom&&) = default;

       private:
        // Only Logger can construct a TestAtom and read its fields.
        friend class Logger;

        TestAtom(bool started, TestInfo info)
            : mStarted(started), mName(std::move(info.name)), mCmdline(std::move(info.cmdline)) {}

        bool mStarted;         // set to true by Logger::reportTestStart()
        std::string mName;     // test name printed in result lines
        std::string mCmdline;  // command line printed in result lines
    };

    //!
    //! \brief Define a test for logging
    //!
    //! \param[in] name The name of the test.  This should be a string starting with
    //!                  "TensorRT" and containing dot-separated strings containing
    //!                  the characters [A-Za-z0-9_].
    //!                  For example, "TensorRT.sample_googlenet"
    //! \param[in] cmdline The command line used to reproduce the test
    //!
    //! \return a TestAtom that can be used in Logger::reportTest{Start,End}().
    //!
    static TestAtom defineTest(const std::string& name, const std::string& cmdline) {
        return TestAtom{false, TestInfo{name, cmdline}};
    }

    //!
    //! \brief A convenience overloaded version of defineTest() that accepts an array of command-line arguments
    //!        as input
    //!
    //! \param[in] name The name of the test
    //! \param[in] argc The number of command-line arguments
    //! \param[in] argv The array of command-line arguments (given as C strings)
    //!
    //! \return a TestAtom that can be used in Logger::reportTest{Start,End}().
    //!
    static TestAtom defineTest(const std::string& name, int argc, char const* const* argv) {
        auto cmdline = genCmdlineString(argc, argv);
        return defineTest(name, cmdline);
    }

    //!
    //! \brief Report that a test has started.
    //!
    //! \pre reportTestStart() has not been called yet for the given testAtom
    //!
    //! \param[in] testAtom The handle to the test that has started
    //!
    static void reportTestStart(TestAtom& testAtom) {
        // The RUNNING line is printed before the precondition is asserted.
        reportTestResult(testAtom, TestResult::kRUNNING);
        assert(!testAtom.mStarted);
        testAtom.mStarted = true;
    }

    //!
    //! \brief Report that a test has ended.
    //!
    //! \pre reportTestStart() has been called for the given testAtom
    //!
    //! \param[in] testAtom The handle to the test that has ended
    //! \param[in] result The result of the test. Should be one of TestResult::kPASSED,
    //!                   TestResult::kFAILED, TestResult::kWAIVED
    //!
    static void reportTestEnd(const TestAtom& testAtom, TestResult result) {
        assert(result != TestResult::kRUNNING);
        assert(testAtom.mStarted);
        reportTestResult(testAtom, result);
    }

    //!
    //! \brief Report the test as passed
    //! \return EXIT_SUCCESS
    //!
    static int reportPass(const TestAtom& testAtom) {
        reportTestEnd(testAtom, TestResult::kPASSED);
        return EXIT_SUCCESS;
    }

    //!
    //! \brief Report the test as failed
    //! \return EXIT_FAILURE
    //!
    static int reportFail(const TestAtom& testAtom) {
        reportTestEnd(testAtom, TestResult::kFAILED);
        return EXIT_FAILURE;
    }

    //!
    //! \brief Report the test as waived
    //! \return EXIT_SUCCESS
    //!
    static int reportWaive(const TestAtom& testAtom) {
        reportTestEnd(testAtom, TestResult::kWAIVED);
        return EXIT_SUCCESS;
    }

    //!
    //! \brief Report the test as passed or failed depending on \p pass
    //! \return The value returned by reportPass() or reportFail()
    //!
    static int reportTest(const TestAtom& testAtom, bool pass) {
        return pass ? reportPass(testAtom) : reportFail(testAtom);
    }

    //!
    //! \brief Returns the severity threshold currently stored in this logger
    //!
    [[nodiscard]] Severity getReportableSeverity() const { return mReportableSeverity; }

   private:
    //!
    //! \brief Name and command line of a test, consumed by the TestAtom constructor
    //!
    struct TestInfo {
        std::string name;
        std::string cmdline;
    };

    //!
    //! \brief returns an appropriate string for prefixing a log message with the given severity
    //!
    static const char* severityPrefix(Severity severity) {
        switch (severity) {
            case Severity::kINTERNAL_ERROR:
                return "[F] ";
            case Severity::kERROR:
                return "[E] ";
            case Severity::kWARNING:
                return "[W] ";
            case Severity::kINFO:
                return "[I] ";
            case Severity::kVERBOSE:
                return "[V] ";
            default:
                assert(0);
                return "";
        }
    }

    //!
    //! \brief returns an appropriate string for prefixing a test result message with the given result
    //!
    static const char* testResultString(TestResult result) {
        switch (result) {
            case TestResult::kRUNNING:
                return "RUNNING";
            case TestResult::kPASSED:
                return "PASSED";
            case TestResult::kFAILED:
                return "FAILED";
            case TestResult::kWAIVED:
                return "WAIVED";
            default:
                assert(0);
                return "";
        }
    }

    //!
    //! \brief returns an appropriate output stream (cout or cerr) to use with the given severity
    //!
    static std::ostream& severityOstream(Severity severity) {
        return severity >= Severity::kINFO ? std::cout : std::cerr;
    }

    //!
    //! \brief method that implements logging test results
    //!
    //! Prints "&&&& <RESULT> <name> # <cmdline>" followed by a newline. The stream is selected with
    //! Severity::kINFO, which severityOstream() maps to std::cout.
    //!
    static void reportTestResult(const TestAtom& testAtom, TestResult result) {
        severityOstream(Severity::kINFO) << "&&&& " << testResultString(result) << " " << testAtom.mName << " # "
                                         << testAtom.mCmdline << '\n';
    }

    //!
    //! \brief generate a command line string from the given (argc, argv) values
    //!
    //! The arguments are joined with single spaces.
    //!
    static std::string genCmdlineString(int argc, char const* const* argv) {
        std::stringstream ss;
        for (int i = 0; i < argc; i++) {
            if (i > 0)
                ss << " ";
            ss << argv[i];
        }
        return ss.str();
    }

    Severity mReportableSeverity;  // threshold passed to every LogStreamConsumer created for this logger
};
LOG_INFO(logger) << "hello world" << std::endl; //! inline LogStreamConsumer LOG_INFO(const Logger& logger) { return LogStreamConsumer{logger.getReportableSeverity(), Severity::kINFO}; } //! //! \brief produces a LogStreamConsumer object that can be used to log messages of severity kWARNING //! //! Example usage: //! //! LOG_WARN(logger) << "hello world" << std::endl; //! inline LogStreamConsumer LOG_WARN(const Logger& logger) { return LogStreamConsumer{logger.getReportableSeverity(), Severity::kWARNING}; } //! //! \brief produces a LogStreamConsumer object that can be used to log messages of severity kERROR //! //! Example usage: //! //! LOG_ERROR(logger) << "hello world" << std::endl; //! inline LogStreamConsumer LOG_ERROR(const Logger& logger) { return LogStreamConsumer{logger.getReportableSeverity(), Severity::kERROR}; } //! //! \brief produces a LogStreamConsumer object that can be used to log messages of severity kINTERNAL_ERROR // ("fatal" severity) //! //! Example usage: //! //! LOG_FATAL(logger) << "hello world" << std::endl; //! 
//!
//! \brief produces a LogStreamConsumer object that can be used to log messages of severity kINTERNAL_ERROR
//!        ("fatal" severity)
//!
//! The consumer is built from the value returned by logger.getReportableSeverity() at the time of the call.
//!
//! Example usage:
//!
//!     LOG_FATAL(logger) << "hello world" << std::endl;
//!
inline LogStreamConsumer LOG_FATAL(const Logger& logger) {
    return LogStreamConsumer{logger.getReportableSeverity(), Severity::kINTERNAL_ERROR};
}
/**
 * @brief Add a per-channel scale layer that applies the batch-norm parameters stored under lname.
 *
 * Reads "<lname>.weight", ".bias", ".running_mean" and ".running_var" from weightMap and computes, per element,
 *   scale = gamma / sqrt(var + eps)
 *   shift = beta - mean * gamma / sqrt(var + eps)
 *   power = 1
 * The three arrays are allocated with malloc and stored back into weightMap under "<lname>.scale",
 * ".shift" and ".power"; this function does not free them.
 *
 * @param network   network definition the layer is added to
 * @param weightMap weights keyed by name; receives the three derived entries
 * @param input     tensor fed to the scale layer
 * @param lname     key prefix of the batch-norm weights, also used as the layer name
 * @param eps       value added to the variance before the square root
 * @return the created scale layer
 */
IScaleLayer* addBatchNorm2d(INetworkDefinition* network, WeightMap& weightMap, ITensor& input, const std::string& lname,
                            float eps = 1e-5) {
    const auto* bnGamma = reinterpret_cast<const float*>(weightMap[lname + ".weight"].values);
    const auto* bnBeta = reinterpret_cast<const float*>(weightMap[lname + ".bias"].values);
    const auto* bnMean = reinterpret_cast<const float*>(weightMap[lname + ".running_mean"].values);
    const auto* bnVar = reinterpret_cast<const float*>(weightMap[lname + ".running_var"].values);
    const int64_t len = weightMap[lname + ".running_var"].count;

    auto* scaleVals = reinterpret_cast<float*>(malloc(sizeof(float) * len));
    auto* shiftVals = reinterpret_cast<float*>(malloc(sizeof(float) * len));
    auto* powerVals = reinterpret_cast<float*>(malloc(sizeof(float) * len));

    // One pass fills all three arrays; each expression is evaluated exactly as written per element.
    for (int c = 0; c < len; c++) {
        scaleVals[c] = bnGamma[c] / sqrt(bnVar[c] + eps);
        shiftVals[c] = bnBeta[c] - bnMean[c] * bnGamma[c] / sqrt(bnVar[c] + eps);
        powerVals[c] = 1.0f;
    }

    Weights scale{DataType::kFLOAT, scaleVals, len};
    Weights shift{DataType::kFLOAT, shiftVals, len};
    Weights power{DataType::kFLOAT, powerVals, len};

    // Keep the derived buffers reachable through the weight map.
    weightMap[lname + ".scale"] = scale;
    weightMap[lname + ".shift"] = shift;
    weightMap[lname + ".power"] = power;

    IScaleLayer* bnLayer = network->addScale(input, ScaleMode::kCHANNEL, shift, scale, power);
    assert(bnLayer);
    bnLayer->setName(lname.c_str());
    return bnLayer;
}
&input; IConvolutionLayer* ret{nullptr}; struct ConvParams { DimsHW k_dim, p_dim; int ch_out; std::string w_name, b_name; }; const std::array conv_params = {{ {DimsHW{1, 1}, DimsHW{0, 0}, o, lname + ".block.0.weight", lname + ".block.0.bias"}, {DimsHW{3, 1}, DimsHW{1, 0}, o, lname + ".block.2.weight", lname + ".block.2.bias"}, {DimsHW{1, 3}, DimsHW{0, 1}, o, lname + ".block.4.weight", lname + ".block.4.bias"}, {DimsHW{1, 1}, DimsHW{0, 0}, ch_out, lname + ".block.6.weight", lname + ".block.6.bias"}, }}; for (const auto& param : conv_params) { ret = network->addConvolutionNd(*cur_input, param.ch_out, param.k_dim, w[param.w_name], w[param.b_name]); assert(ret); ret->setPaddingNd(param.p_dim); ret->setName((lname + ".block." + std::to_string(i++)).c_str()); if (i != 4) { auto* relu = network->addActivation(*ret->getOutput(0), ActivationType::kRELU); assert(relu); cur_input = relu->getOutput(0); } else { cur_input = ret->getOutput(0); } } return ret; } ICudaEngine* createEngine(int32_t N, IRuntime* runtime, IBuilder* builder, IBuilderConfig* config, DataType dt) { const int nc = 68; WeightMap w = loadWeights(WTS_PATH); #if TRT_VERSION >= 11200 auto flag = 1U << static_cast(NDCF::kSTRONGLY_TYPED); #elif TRT_VERSION >= 10000 auto flag = 0U; #else auto flag = 1U << static_cast(NDCF::kEXPLICIT_BATCH); #endif auto* network = builder->createNetworkV2(flag); ITensor* data{nullptr}; if constexpr (TRT_PREPROCESS) { // for simplicity, resize image on cpu side dt = DataType::kUINT8; auto* input = network->addInput(NAMES[0], dt, Dims4{N, INPUT_H, INPUT_W, 3}); auto* trans = addTransformLayer(network, *input, false, mean, stdv); data = trans->getOutput(0); } else { data = network->addInput(NAMES[0], dt, Dims4{N, 3, INPUT_H, INPUT_W}); } assert(data); // CBR (Conv-BatchNorm-ReLU) auto* c0 = network->addConvolutionNd(*data, 64, DimsHW{3, 3}, w["backbone.0.weight"], w["backbone.0.bias"]); auto* bn0 = addBatchNorm2d(network, w, *c0->getOutput(0), "backbone.1"); auto* relu0 = 
network->addActivation(*bn0->getOutput(0), ActivationType::kRELU); auto* f0 = network->addPoolingNd(*relu0->getOutput(0), PoolingType::kMAX, DimsHW{3, 3}); f0->setStrideNd(DimsHW{1, 1}); assert(c0 && bn0 && relu0); auto* sm0 = smallBasicBlock(network, w, *f0->getOutput(0), 128, "backbone.4"); auto* bn1 = addBatchNorm2d(network, w, *sm0->getOutput(0), "backbone.5"); auto* relu1 = network->addActivation(*bn1->getOutput(0), ActivationType::kRELU); assert(sm0 && bn1 && relu1); // need to unsqueeze to 5D tensor for 3D pooling auto* to5d0 = network->addShuffle(*relu1->getOutput(0)); to5d0->setReshapeDimensions({5, {BATCH_SIZE, 1, 128, 20, 90}}); auto* f1 = network->addPoolingNd(*to5d0->getOutput(0), PoolingType::kMAX, Dims3{1, 3, 3}); f1->setStrideNd(Dims3{2, 1, 2}); f1->setName("MaxPool3d_1"); auto* to5d1 = network->addShuffle(*f1->getOutput(0)); to5d1->setReshapeDimensions(Dims4{BATCH_SIZE, 64, 18, 44}); auto* sm1 = smallBasicBlock(network, w, *to5d1->getOutput(0), 256, "backbone.8"); auto* bn2 = addBatchNorm2d(network, w, *sm1->getOutput(0), "backbone.9"); auto* relu2 = network->addActivation(*bn2->getOutput(0), ActivationType::kRELU); auto* sm2 = smallBasicBlock(network, w, *relu2->getOutput(0), 256, "backbone.11"); auto* bn3 = addBatchNorm2d(network, w, *sm2->getOutput(0), "backbone.12"); auto* relu3 = network->addActivation(*bn3->getOutput(0), ActivationType::kRELU); // need to unsqueeze to 5D tensor for 3D pooling auto* to5d2 = network->addShuffle(*relu3->getOutput(0)); to5d2->setReshapeDimensions({5, {BATCH_SIZE, 1, 256, 18, 44}}); auto* f2 = network->addPoolingNd(*to5d2->getOutput(0), PoolingType::kMAX, Dims3{1, 3, 3}); f2->setStrideNd(Dims3{4, 1, 2}); f2->setName("MaxPool3d_2"); auto* to5d3 = network->addShuffle(*f2->getOutput(0)); to5d3->setReshapeDimensions(Dims4{BATCH_SIZE, 64, 16, 21}); // CBR (Conv-BatchNorm-ReLU) c0 = network->addConvolutionNd(*to5d3->getOutput(0), 256, DimsHW{1, 4}, w["backbone.16.weight"], w["backbone.16.bias"]); auto* bn4 = 
addBatchNorm2d(network, w, *c0->getOutput(0), "backbone.17"); auto* relu5 = network->addActivation(*bn4->getOutput(0), ActivationType::kRELU); // CBR (Conv-BatchNorm-ReLU) c0 = network->addConvolutionNd(*relu5->getOutput(0), nc, DimsHW{13, 1}, w["backbone.20.weight"], w["backbone.20.bias"]); auto* bn5 = addBatchNorm2d(network, w, *c0->getOutput(0), "backbone.21"); auto* backbone = network->addActivation(*bn5->getOutput(0), ActivationType::kRELU); auto makeGlobalContext = [&](ITensor* feat, bool pool5, bool pool4x10) -> ITensor* { static int j = 0; ITensor* t = feat; if (pool5) { auto* pool = network->addPoolingNd(*t, PoolingType::kAVERAGE, DimsHW{5, 5}); assert(pool); pool->setStrideNd(DimsHW{5, 5}); auto _name = "global5." + std::to_string(j); pool->setName(_name.c_str()); t = pool->getOutput(0); } if (pool4x10) { auto* pool = network->addPoolingNd(*t, PoolingType::kAVERAGE, DimsHW{4, 10}); assert(pool); pool->setStrideNd(DimsHW{4, 2}); auto _name = "global4x10." + std::to_string(j); pool->setName(_name.c_str()); t = pool->getOutput(0); } // pow Dims dims = t->getDimensions(); int64_t size = dims.d[0] * dims.d[1] * dims.d[2] * dims.d[3]; void* data = malloc(sizeof(float) * size); for (int i = 0; i < size; ++i) { reinterpret_cast(data)[i] = 2.0f; } auto name = "pow." + std::to_string(j); w[name] = {DataType::kFLOAT, data, size}; auto* pow_const = network->addConstant(dims, w[name]); auto* pow = network->addElementWise(*t, *pow_const->getOutput(0), ElementWiseOperation::kPOW); assert(pow); pow->setName(name.c_str()); // mean int32_t mask = (1 << dims.nbDims) - 1; auto* mean = network->addReduce(*pow->getOutput(0), ReduceOperation::kAVG, mask, true); auto _mean_name = "mean." + std::to_string(j); mean->setName(_mean_name.c_str()); // div auto* div = network->addElementWise(*t, *mean->getOutput(0), ElementWiseOperation::kDIV); auto _div_name = "div." 
+ std::to_string(j); div->setName(_div_name.c_str()); ++j; return div->getOutput(0); }; auto* gc0 = makeGlobalContext(relu0->getOutput(0), true, false); auto* gc1 = makeGlobalContext(relu1->getOutput(0), true, false); auto* gc2 = makeGlobalContext(relu3->getOutput(0), false, true); auto* gc3 = makeGlobalContext(backbone->getOutput(0), false, false); const std::array gcs = {gc0, gc1, gc2, gc3}; auto* cat = network->addConcatenation(gcs.data(), 4); assert(cat); cat->setAxis(1); auto* c = network->addConvolutionNd(*cat->getOutput(0), nc, DimsHW{1, 1}, w["container.0.weight"], w["container.0.bias"]); auto* logits = network->addReduce(*c->getOutput(0), ReduceOperation::kAVG, 0x04, false); logits->getOutput(0)->setName(NAMES[1]); network->markOutput(*logits->getOutput(0)); #if TRT_VERSION >= 8000 config->setMemoryPoolLimit(MemoryPoolType::kWORKSPACE, WORKSPACE_SIZE); IHostMemory* mem = builder->buildSerializedNetwork(*network, *config); ICudaEngine* engine = runtime->deserializeCudaEngine(mem->data(), mem->size()); delete network; #else builder->setMaxBatchSize(N); config->setMaxWorkspaceSize(WORKSPACE_SIZE); ICudaEngine* engine = builder->buildEngineWithConfig(*network, *config); network->destroy(); #endif std::cout << "build finished\n"; // Release host memory for (auto& mem : w) { free((void*)mem.second.values); } return engine; } void APIToModel(int32_t N, IRuntime* runtime, IHostMemory** modelStream) { IBuilder* builder = createInferBuilder(gLogger); IBuilderConfig* config = builder->createBuilderConfig(); ICudaEngine* engine = createEngine(N, runtime, builder, config, DataType::kFLOAT); assert(engine != nullptr); (*modelStream) = engine->serialize(); #if TRT_VERSION >= 8000 delete engine; delete config; delete builder; #else engine->destroy(); config->destroy(); builder->destroy(); #endif } auto doInference(IExecutionContext& context, void* input, int64_t batchSize) -> std::vector> { const auto& engine = context.getEngine(); cudaStream_t stream; 
CHECK(cudaStreamCreate(&stream)); std::vector buffers; #if TRT_VERSION >= 8000 const int32_t nIO = engine.getNbIOTensors(); #else const int32_t nIO = engine.getNbBindings(); #endif buffers.resize(nIO); for (auto i = 0; i < nIO; ++i) { std::size_t size = 0; #if TRT_VERSION >= 8000 auto* tensor_name = engine.getIOTensorName(i); auto s = getSize(engine.getTensorDataType(tensor_name)); size = s * batchSize * SIZES[i]; CHECK(cudaMalloc(&buffers[i], size)); if (i == 0) { CHECK(cudaMemcpyAsync(buffers[i], input, size, cudaMemcpyHostToDevice, stream)); } context.setTensorAddress(tensor_name, buffers[i]); #else const int32_t idx = engine.getBindingIndex(NAMES[i]); auto s = getSize(engine.getBindingDataType(idx)); assert(idx == i); size = s * batchSize * SIZES[i]; CHECK(cudaMalloc(&buffers[i], size)); if (i == 0) { CHECK(cudaMemcpyAsync(buffers[i], input, size, cudaMemcpyHostToDevice, stream)); } #endif } #if TRT_VERSION >= 8000 assert(context.enqueueV3(stream)); #else assert(context.enqueueV2(buffers.data(), stream, nullptr)); #endif std::vector> prob; for (int i = 1; i < nIO; ++i) { std::vector tmp(batchSize * SIZES[i], std::nanf("")); std::size_t size = batchSize * SIZES[i] * sizeof(float); CHECK(cudaMemcpyAsync(tmp.data(), buffers[i], size, cudaMemcpyDeviceToHost, stream)); prob.emplace_back(tmp); } CHECK(cudaStreamSynchronize(stream)); for (auto& buffer : buffers) { CHECK(cudaFree(buffer)); } CHECK(cudaStreamDestroy(stream)); return prob; } int main(int argc, char** argv) { #if _WIN32 SetConsoleOutputCP(CP_UTF8); #endif cudaSetDevice(DEVICE); checkTrtEnv(DEVICE); if (argc != 2) { std::cerr << "arguments not right!\n"; std::cerr << "./LPRnet -s // serialize model to plan file\n"; std::cerr << "./LPRnet -d // deserialize plan file and run inference\n"; return -1; } IRuntime* runtime = createInferRuntime(gLogger); assert(runtime != nullptr); char* trtModelStream{nullptr}; std::streamsize size{0}; if (std::string(argv[1]) == "-s") { IHostMemory* modelStream{nullptr}; 
APIToModel(BATCH_SIZE, runtime, &modelStream); assert(modelStream != nullptr); std::ofstream p(ENGINE_PATH, std::ios::binary | std::ios::trunc); if (!p) { std::cerr << "could not open plan output file\n"; return -1; } if (modelStream->size() > static_cast(std::numeric_limits::max())) { std::cerr << "this model is too large to serialize\n"; return -1; } const auto* data_ptr = reinterpret_cast(modelStream->data()); auto data_size = static_cast(modelStream->size()); p.write(data_ptr, data_size); #if TRT_VERSION >= 8000 delete modelStream; #else modelStream->destroy(); #endif return 0; } else if (std::string(argv[1]) == "-d") { std::ifstream file(ENGINE_PATH, std::ios::binary); if (file.good()) { file.seekg(0, file.end); size = file.tellg(); file.seekg(0, file.beg); trtModelStream = new char[size]; assert(trtModelStream); file.read(trtModelStream, size); file.close(); } } else { return 1; } void* input = nullptr; std::vector data; cv::Mat img = cv::imread("../assets/car_plate.jpg"); if constexpr (TRT_PREPROCESS) { // for simplicity, resize image on cpu side cv::resize(img, img, cv::Size(INPUT_W, INPUT_H), 0, 0, cv::INTER_CUBIC); input = static_cast(img.data); } else { data = preprocess_img(img, false, mean, stdv, BATCH_SIZE, INPUT_H, INPUT_W); input = data.data(); } #if TRT_VERSION >= 8000 ICudaEngine* engine = runtime->deserializeCudaEngine(trtModelStream, size); #else ICudaEngine* engine = runtime->deserializeCudaEngine(trtModelStream, size, nullptr); #endif assert(engine != nullptr); IExecutionContext* context = engine->createExecutionContext(); assert(context != nullptr); for (int32_t i = 0; i < 100; ++i) { auto _start = std::chrono::system_clock::now(); auto prob = doInference(*context, input, 1); auto _end = std::chrono::system_clock::now(); auto _time = std::chrono::duration_cast(_end - _start).count(); std::cout << "Execution time: " << _time << "us\n"; for (const auto& vector : prob) { int idx = 0; for (auto v : vector) { std::cout << std::setprecision(4) << v 
<< ", " << std::flush; if (++idx > 20) { std::cout << "\n====\n"; break; } } } if (i == 99) { int prev = 67; std::string str; for (int t = 0; t < 18; ++t) { std::array scores{}; for (int c = 0; c < 68; ++c) { scores[c] = prob[0][t + 18 * c]; } int best = static_cast(std::distance(scores.begin(), std::max_element(scores.begin(), scores.end()))); if (best != prev && best != 67) str += alphabet[best]; prev = best; } std::cout << "result: " << str << "\n"; } } delete[] trtModelStream; #if TRT_VERSION >= 8000 delete context; delete engine; delete runtime; #else context->destroy(); engine->destroy(); runtime->destroy(); #endif return 0; } ================================================ FILE: lprnet/macros.h ================================================ #pragma once #include #ifdef API_EXPORTS #if defined(_MSC_VER) #define API __declspec(dllexport) #else #define API __attribute__((visibility("default"))) #endif #else #if defined(_MSC_VER) #define API __declspec(dllimport) #else #define API #endif #endif // API_EXPORTS #define TRT_VERSION \ ((NV_TENSORRT_MAJOR * 1000) + (NV_TENSORRT_MINOR * 100) + (NV_TENSORRT_PATCH * 10) + NV_TENSORRT_BUILD) #if TRT_VERSION < 7220 #error "TensorRT >= 7.2.2 is required for this demo." 
#endif #if TRT_VERSION >= 8000 #define TRT_NOEXCEPT noexcept #define TRT_CONST_ENQUEUE const #else #define TRT_NOEXCEPT #define TRT_CONST_ENQUEUE #endif ================================================ FILE: lprnet/utils.h ================================================ #pragma once #include #include #include #include #include #include #include #include #include #include #include "macros.h" using namespace nvinfer1; #define CHECK(status) \ do { \ auto ret = (status); \ if (ret != cudaSuccess) { \ std::cerr << "Cuda failure: " << ret << "\n"; \ std::abort(); \ } \ } while (0) static inline void checkTrtEnv(int device = 0) { #if TRT_VERSION < 8000 CHECK(cudaGetDevice(&device)); cudaDeviceProp prop{}; CHECK(cudaGetDeviceProperties(&prop, device)); const int sm = prop.major * 10 + prop.minor; if (sm > 86) { std::cerr << "TensorRT < 8 does not support SM > 86 on this GPU."; std::abort(); } #endif } /** * @brief TensorRT weight files have a simple space delimited format: * [type] [size] * * @param file input weight file path * @return std::map */ static inline auto loadWeights(const std::string& file) { std::cout << "Loading weights: " << file << "\n"; std::map weightMap; // Open weights file std::ifstream input(file); assert(input.is_open() && "Unable to load weight file."); // Read number of weight blobs int32_t count; input >> count; assert(count > 0 && "Invalid weight map file."); while (count--) { nvinfer1::Weights wt{nvinfer1::DataType::kFLOAT, nullptr, 0}; // Read name and type of blob std::string name; input >> name >> std::dec >> wt.count; // Load blob auto* val = new uint32_t[wt.count]; input >> std::hex; for (auto x = 0ll; x < wt.count; ++x) { input >> val[x]; } wt.values = val; weightMap[name] = wt; } return weightMap; } /** * @brief a preprocess function aligning with ImageNet preprocess in torchvision, only support 3-channel image * * @param img opencv image with BGR layout * @param bgr2rgb whether to convert BGR to RGB * @param mean subtract mean * @param 
std divide std * @param n batch size * @param h resize height * @param w resize width * @return std::vector contiguous flatten image data in float32 type */ static inline std::vector preprocess_img(cv::Mat& img, bool bgr2rgb, const std::array& mean, const std::array& std, int n, int h, int w) { const auto c = img.channels(); const auto size = c * h * w; if (c != 3) { std::cerr << "this demo only supports 3 channel input image.\n"; std::abort(); } if (bgr2rgb) { cv::cvtColor(img, img, cv::COLOR_BGR2RGB); } cv::resize(img, img, cv::Size(w, h), 0, 0, cv::INTER_LINEAR); img.convertTo(img, CV_32FC3, 1.f / 255); img = (img - cv::Scalar(mean[0], mean[1], mean[2])) / cv::Scalar(std[0], std[1], std[2]); std::vector chw(static_cast(n) * c * h * w, 0.f); // fill all batch with the same input image for (int i = 0; i < n; ++i) { for (int y = 0; y < h; ++y) { for (int x = 0; x < w; ++x) { const cv::Vec3f v = img.at(y, x); chw[i * size + 0 * h * w + y * w + x] = v[0]; chw[i * size + 1 * h * w + y * w + x] = v[1]; chw[i * size + 2 * h * w + y * w + x] = v[2]; } } } return chw; } static inline std::vector> topk(const std::vector& v, int64_t k) { if (k <= 0) return {}; auto s = std::min(k, static_cast(v.size())); std::vector idx(v.size()); std::iota(idx.begin(), idx.end(), 0); std::partial_sort(idx.begin(), std::next(idx.begin(), s), idx.end(), [&](int a, int b) { return v[a] > v[b]; }); std::vector> out; out.reserve(k); for (int i = 0; i < k; ++i) out.emplace_back(idx[i], v[idx[i]]); return out; } static inline std::map loadImagenetLabelMap(const std::string& path) { std::map labels; std::ifstream in(path); if (!in.is_open()) { return labels; } std::string line; while (std::getline(in, line)) { auto colon = line.find(':'); if (colon == std::string::npos) { continue; } auto first_quote = line.find('\'', colon); if (first_quote == std::string::npos) { continue; } auto second_quote = line.find('\'', first_quote + 1); if (second_quote == std::string::npos) { continue; } int idx = 
std::stoi(line.substr(0, colon)); labels[idx] = line.substr(first_quote + 1, second_quote - first_quote - 1); } return labels; } static inline ILayer* addTransformLayer(INetworkDefinition* network, ITensor& input, bool bgr2rgb, const std::array& mean, const std::array& std) { struct ScaleParams { std::array shift; std::array scale; }; static std::vector> gScaleParams; auto params = std::make_unique(); params->shift = {-mean[0] / std[0], -mean[1] / std[1], -mean[2] / std[2]}; params->scale = {1.f / (std[0] * 255.f), 1.f / (std[1] * 255.f), 1.f / (std[2] * 255.f)}; static const Weights empty{DataType::kFLOAT, nullptr, 0ll}; const Weights shift{DataType::kFLOAT, params->shift.data(), 3ll}; const Weights scale{DataType::kFLOAT, params->scale.data(), 3ll}; gScaleParams.emplace_back(std::move(params)); ITensor* in = &input; if (input.getType() != DataType::kFLOAT) { #if TRT_VERSION >= 8000 auto* cast = network->addCast(input, DataType::kFLOAT); assert(cast); cast->setName("Cast to FP32"); in = cast->getOutput(0); #else auto* identity = network->addIdentity(input); assert(identity); identity->setName("Convert to FP32"); identity->setOutputType(0, DataType::kFLOAT); in = identity->getOutput(0); #endif } // Convert from NHWC to NCHW auto* perm = network->addShuffle(*in); assert(perm); perm->setName("NHWC -> NCHW"); perm->setFirstTranspose(Permutation{0, 3, 1, 2}); // Convert from BGR to RGB (optional) ITensor* data{nullptr}; if (bgr2rgb) { auto add_slice = [&](int c, const char* name) -> ITensor* { auto dims = perm->getOutput(0)->getDimensions(); Dims4 start = {0, c, 0, 0}, stride = {1, 1, 1, 1}; Dims4 size = {dims.d[0], 1, dims.d[2], dims.d[3]}; auto* _slice = network->addSlice(*perm->getOutput(0), start, size, stride); _slice->setName(name); assert(_slice && _slice->getNbOutputs() == 1); return _slice->getOutput(0); }; std::array channels = {add_slice(2, "R"), add_slice(1, "G"), add_slice(0, "B")}; auto* cat = network->addConcatenation(channels.data(), 3); assert(cat); 
cat->setName("RGB"); cat->setAxis(1); data = cat->getOutput(0); } else { data = perm->getOutput(0); } // Normalize auto* trans = network->addScale(*data, ScaleMode::kCHANNEL, shift, scale, empty); assert(trans); trans->setName("mean & std"); #if TRT_VERSION >= 8000 trans->setChannelAxis(1); #endif return trans; } static inline size_t getSize(DataType dt) { switch (dt) { #if TRT_VERSION >= 8510 case DataType::kUINT8: #endif case DataType::kINT8: return sizeof(int8_t); case DataType::kFLOAT: return sizeof(float); case DataType::kHALF: return sizeof(int16_t); case DataType::kINT32: return sizeof(int32_t); default: { std::cerr << "Unsupported data type\n"; std::abort(); } } } ================================================ FILE: mlp/CMakeLists.txt ================================================ cmake_minimum_required(VERSION 3.17.0) project( mlp VERSION 0.1 LANGUAGES C CXX CUDA) if(NOT DEFINED CMAKE_CUDA_ARCHITECTURES) set(CMAKE_CUDA_ARCHITECTURES 60 70 72 75 80 86 89) endif() set(CMAKE_CXX_STANDARD 17) set(CMAKE_CXX_STANDARD_REQUIRED ON) set(CMAKE_CUDA_STANDARD 17) set(CMAKE_CUDA_STANDARD_REQUIRED ON) set(CMAKE_EXPORT_COMPILE_COMMANDS ON) set(CMAKE_INCLUDE_CURRENT_DIR TRUE) option(CUDA_USE_STATIC_CUDA_RUNTIME "Use static cudaruntime library" OFF) find_package(Threads REQUIRED) find_package(CUDAToolkit REQUIRED) if(NOT TARGET TensorRT::TensorRT) include(FindTensorRT.cmake) else() message("TensorRT has been found, skipping for ${PROJECT_NAME}") endif() add_executable(${PROJECT_NAME} mlp.cpp) target_include_directories(${PROJECT_NAME} PUBLIC ${CMAKE_CURRENT_LIST_DIR}) target_link_libraries(${PROJECT_NAME} PUBLIC Threads::Threads CUDA::cudart TensorRT::TensorRT) ================================================ FILE: mlp/FindTensorRT.cmake ================================================ cmake_minimum_required(VERSION 3.17.0) function(_guess_path var_name required_files) set(_result "") foreach(path_entry IN LISTS ARGN) if(NOT EXISTS "${path_entry}") message(DEBUG "skip 
non-existing path '${path_entry}'") continue() endif() set(_ok TRUE) foreach(required_file IN LISTS required_files) if(NOT EXISTS "${path_entry}/${required_file}") set(_ok FALSE) message(DEBUG "'${path_entry}' missing '${required_file}'") break() endif() endforeach() if(_ok) list(APPEND _result "${path_entry}") message(DEBUG "accept '${path_entry}'") else() message(DEBUG "reject '${path_entry}'") endif() endforeach() if(_result STREQUAL "") message( FATAL_ERROR "_guess_path(${var_name}) failed: no valid path found. required_files='${required_files}' candidates='${ARGN}'" ) endif() set(${var_name} "${_result}" PARENT_SCOPE) endfunction() # add library add_library(TensorRT IMPORTED INTERFACE) add_library(TensorRT::TensorRT ALIAS TensorRT) set(TRT_VERSION CACHE STRING "TensorRT version, e.g. \"8.6.1.6\" or \"8.6.1.6+cuda12.0.1.011\", \"8.6.1.6.Windows10.x86_64.cuda-12.0\" etc" ) if(NOT TRT_VERSION STREQUAL "" AND NOT $ENV{TRT_VERSION} STREQUAL "") message( WARNING "TRT_VERSION defined by cmake and environment variable both, using the later one" ) endif() if(NOT $ENV{TRT_VERSION} STREQUAL "") set(TRT_VERSION $ENV{TRT_VERSION}) endif() string(REGEX MATCH "([0-9]+)" _match ${TRT_VERSION}) set(TRT_MAJOR_VERSION "${_match}") unset(_match) if(WIN32) set(TensorRT_DIR "C:/Program Files/TensorRT-${TRT_VERSION}") if(NOT EXISTS "${TensorRT_DIR}") message( FATAL_ERROR "TensorRT_DIR=${TensorRT_DIR} does not exist!" 
) endif() if(${TRT_MAJOR_VERSION} GREATER_EQUAL 10) set(_modules nvinfer_10 nvinfer_plugin_10 nvinfer_vc_plugin_10 nvinfer_dispatch_10 nvinfer_lean_10) message(DEBUG "Using ${_modules}") else() set(_modules nvinfer nvinfer_plugin nvinfer_vc_plugin nvinfer_dispatch nvinfer_lean) endif() set(TensorRT_LIBRARY_DIR "${TensorRT_DIR}/lib") set(TensorRT_INCLUDE_DIR "${TensorRT_DIR}/include") elseif(UNIX) string(TOLOWER "${CMAKE_SYSTEM_PROCESSOR}" _trt_arch) set(_trt_include_candidates) if(_trt_arch MATCHES "^(aarch64|arm64|arch64)$") set(_trt_include_candidates "/usr/include/aarch64-linux-gnu" "/usr/include" "/usr/local/cuda/targets/aarch64-linux/include") set(_trt_library_candidates "/usr/local/tensorrt/targets/aarch64-linux-gnu/lib" "/usr/lib/aarch64-linux-gnu" "/usr/lib/aarch64-linux-gnu/tegra" "/usr/lib") elseif(_trt_arch MATCHES "^(x86_64|amd64)$") set(_trt_include_candidates "/usr/local/tensorrt/targets/x86_64-linux-gnu/include" "/usr/include/x86_64-linux-gnu" "/usr/include") set(_trt_library_candidates "/usr/local/tensorrt/targets/x86_64-linux-gnu/lib" "/usr/lib/x86_64-linux-gnu" "/usr/lib") else() message(FATAL_ERROR "Unknown architecture") endif() set(_modules nvinfer nvinfer_plugin) if(${TRT_MAJOR_VERSION} GREATER_EQUAL 8) list(APPEND _modules nvinfer_vc_plugin nvinfer_dispatch nvinfer_lean) endif() _guess_path(TensorRT_LIBRARY_DIR "libnvinfer.so;libnvinfer_plugin.so" ${_trt_library_candidates}) message(STATUS "TensorRT libraries: ${TensorRT_LIBRARY_DIR}") _guess_path(TensorRT_INCLUDE_DIR "NvInfer.h" ${_trt_include_candidates}) message(STATUS "TensorRT includes: ${TensorRT_INCLUDE_DIR}") endif() foreach(lib IN LISTS _modules) find_library( TensorRT_${lib}_LIBRARY NAMES ${lib} HINTS ${TensorRT_LIBRARY_DIR}) list(APPEND TensorRT_LIBRARIES ${TensorRT_${lib}_LIBRARY}) endforeach() target_link_libraries(TensorRT INTERFACE ${TensorRT_LIBRARIES}) message(STATUS "Found TensorRT libs: ${TensorRT_LIBRARIES}") set_target_properties( TensorRT PROPERTIES C_STANDARD 17 
CXX_STANDARD 17 POSITION_INDEPENDENT_CODE ON SKIP_BUILD_RPATH TRUE BUILD_WITH_INSTALL_RPATH TRUE INSTALL_RPATH "$ORIGIN" INTERFACE_INCLUDE_DIRECTORIES "${TensorRT_INCLUDE_DIR}") unset(TRT_MAJOR_VERSION) unset(_modules) unset(_trt_include_candidates) unset(_trt_library_candidates) unset(_trt_arch) ================================================ FILE: mlp/README.md ================================================ # mlp MLP is the most basic network in this tensorrtx project and is intended for beginners. You can learn the basic procedure of building a TensorRT app from the provided APIs. The process of building a TensorRT engine is explained in the chart below. ![TensorRT Image](https://user-images.githubusercontent.com/33795294/148565279-795b12da-5243-4e7e-881b-263eb7658683.jpg) This demo creates a single-layer MLP and supports `TensorRT >= 7.2.x`. ## Helper Files `logging.h` : A logger file for using the NVIDIA TensorRT API (mostly the same for all models) `mlp.wts` : Converted weight file, which can be generated from [pytorchx/mlp](https://github.com/wang-xinyu/pytorchx/tree/master/mlp); for mlp, it looks like: ```bash 2 linear.weight 1 3fff7e32 linear.bias 1 3c138a5a ``` (you can create `mlp.wts` and copy this content into it directly) ## TensorRT C++ API see [HERE](../README.md#how-to-run) ## TensorRT Python API 1. Generate mlp.wts (from `pytorchx` or create it on your own) 2. Put mlp.wts into tensorrtx/mlp (if using the generated weights) 3. Run ```bash cd tensorrtx/mlp python mlp.py -s # serialize model to plan file, i.e. 'mlp.engine' python mlp.py -d # deserialize plan file and run inference ``` ================================================ FILE: mlp/logging.h ================================================ /* * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
* You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ #ifndef TENSORRT_LOGGING_H #define TENSORRT_LOGGING_H #include #include #include #include #include #include #include #include #include #include "NvInferRuntime.h" #include "macros.h" using Severity = nvinfer1::ILogger::Severity; class LogStreamConsumerBuffer : public std::stringbuf { public: LogStreamConsumerBuffer(std::ostream& stream, std::string prefix, bool shouldLog) : mOutput(stream), mPrefix(std::move(prefix)), mShouldLog(shouldLog) {} LogStreamConsumerBuffer(LogStreamConsumerBuffer&& other) noexcept : mOutput(other.mOutput), mPrefix(std::move(other.mPrefix)), mShouldLog(other.mShouldLog) {} ~LogStreamConsumerBuffer() override { // std::streambuf::pbase() gives a pointer to the beginning of the buffered part of the output sequence // std::streambuf::pptr() gives a pointer to the current position of the output sequence // if the pointer to the beginning is not equal to the pointer to the current position, // call putOutput() to log the output to the stream if (pbase() != pptr()) { putOutput(); } } // synchronizes the stream buffer and returns 0 on success // synchronizing the stream buffer consists of inserting the buffer contents into the stream, // resetting the buffer and flushing the stream int sync() override { putOutput(); return 0; } void putOutput() { if (mShouldLog) { // prepend timestamp std::time_t timestamp = std::time(nullptr); tm* tm_local = std::localtime(×tamp); std::cout << "["; std::cout << std::setw(2) << std::setfill('0') << 1 + tm_local->tm_mon << "/"; std::cout << std::setw(2) << std::setfill('0') << tm_local->tm_mday << "/"; std::cout 
<< std::setw(4) << std::setfill('0') << 1900 + tm_local->tm_year << "-"; std::cout << std::setw(2) << std::setfill('0') << tm_local->tm_hour << ":"; std::cout << std::setw(2) << std::setfill('0') << tm_local->tm_min << ":"; std::cout << std::setw(2) << std::setfill('0') << tm_local->tm_sec << "] "; // std::stringbuf::str() gets the string contents of the buffer // insert the buffer contents pre-appended by the appropriate prefix into the stream mOutput << mPrefix << str(); // set the buffer to empty str(""); // flush the stream mOutput.flush(); } } void setShouldLog(bool shouldLog) { mShouldLog = shouldLog; } private: std::ostream& mOutput; std::string mPrefix; bool mShouldLog; }; //! //! \class LogStreamConsumerBase //! \brief Convenience object used to initialize LogStreamConsumerBuffer before std::ostream in LogStreamConsumer //! class LogStreamConsumerBase { public: LogStreamConsumerBase(std::ostream& stream, std::string prefix, bool shouldLog) : mBuffer(stream, std::move(prefix), shouldLog) {} protected: LogStreamConsumerBuffer mBuffer; }; //! //! \class LogStreamConsumer //! \brief Convenience object used to facilitate use of C++ stream syntax when logging messages. //! Order of base classes is LogStreamConsumerBase and then std::ostream. //! This is because the LogStreamConsumerBase class is used to initialize the LogStreamConsumerBuffer member field //! in LogStreamConsumer and then the address of the buffer is passed to std::ostream. //! This is necessary to prevent the address of an uninitialized buffer from being passed to std::ostream. //! Please do not change the order of the parent classes. //! class LogStreamConsumer : protected LogStreamConsumerBase, public std::ostream { public: //! \brief Creates a LogStreamConsumer which logs messages with level severity. //! Reportable severity determines if the messages are severe enough to be logged. 
LogStreamConsumer(Severity reportableSeverity, Severity severity) : LogStreamConsumerBase(severityOstream(severity), severityPrefix(severity), severity <= reportableSeverity), std::ostream(&mBuffer) // links the stream buffer with the stream , mShouldLog(severity <= reportableSeverity), mSeverity(severity) {} LogStreamConsumer(LogStreamConsumer&& other) noexcept : LogStreamConsumerBase(severityOstream(other.mSeverity), severityPrefix(other.mSeverity), other.mShouldLog), std::ostream(&mBuffer) // links the stream buffer with the stream , mShouldLog(other.mShouldLog), mSeverity(other.mSeverity) {} void setReportableSeverity(Severity reportableSeverity) { mShouldLog = mSeverity <= reportableSeverity; mBuffer.setShouldLog(mShouldLog); } private: static std::ostream& severityOstream(Severity severity) { return severity >= Severity::kINFO ? std::cout : std::cerr; } static std::string severityPrefix(Severity severity) { switch (severity) { case Severity::kINTERNAL_ERROR: return "[F] "; case Severity::kERROR: return "[E] "; case Severity::kWARNING: return "[W] "; case Severity::kINFO: return "[I] "; case Severity::kVERBOSE: return "[V] "; default: assert(0); return ""; } } bool mShouldLog; Severity mSeverity; }; //! \class Logger //! //! \brief Class which manages logging of TensorRT tools and samples //! //! \details This class provides a common interface for TensorRT tools and samples to log information to the console, //! and supports logging two types of messages: //! //! - Debugging messages with an associated severity (info, warning, error, or internal error/fatal) //! - Test pass/fail messages //! //! The advantage of having all samples use this class for logging as opposed to emitting directly to stdout/stderr is //! that the logic for controlling the verbosity and formatting of sample output is centralized in one location. //! //! In the future, this class could be extended to support dumping test results to a file in some standard format //! 
//! \class Logger
//!
//! \brief Class which manages logging of TensorRT tools and samples
//!
//! \details This class provides a common interface for TensorRT tools and samples to log information to the console,
//! and supports logging two types of messages:
//!
//! - Debugging messages with an associated severity (info, warning, error, or internal error/fatal)
//! - Test pass/fail messages
//!
//! The advantage of having all samples use this class for logging as opposed to emitting directly to stdout/stderr is
//! that the logic for controlling the verbosity and formatting of sample output is centralized in one location.
//!
//! In the future, this class could be extended to support dumping test results to a file in some standard format
//! (for example, JUnit XML), and providing additional metadata (e.g. timing the duration of a test run).
//!
//! TODO: For backwards compatibility with existing samples, this class inherits directly from the nvinfer1::ILogger
//! interface, which is problematic since there isn't a clean separation between messages coming from the TensorRT
//! library and messages coming from the sample.
//!
//! In the future (once all samples are updated to use Logger::getTRTLogger() to access the ILogger) we can refactor the
//! class to eliminate the inheritance and instead make the nvinfer1::ILogger implementation a member of the Logger
//! object.
class Logger : public nvinfer1::ILogger {
   private:
    // Forward declaration only: TestAtom's constructor takes a TestInfo, which is defined at the
    // bottom of the class.
    struct TestInfo;

   public:
    //!
    //! \brief Construct a logger that reports messages of the given severity (default: kWARNING).
    //!
    Logger(Severity severity = Severity::kWARNING) : mReportableSeverity(severity) {}

    //!
    //! \enum TestResult
    //! \brief Represents the state of a given test
    //!
    enum class TestResult : std::uint8_t {
        kRUNNING,  //!< The test is running
        kPASSED,   //!< The test passed
        kFAILED,   //!< The test failed
        kWAIVED    //!< The test was waived
    };

    //!
    //! \brief Forward-compatible method for retrieving the nvinfer1::ILogger associated with this Logger
    //! \return The nvinfer1::ILogger associated with this Logger (currently this object itself)
    //!
    //! TODO Once all samples are updated to use this method to register the logger with TensorRT,
    //! we can eliminate the inheritance of Logger from ILogger
    //!
    nvinfer1::ILogger& getTRTLogger() { return *this; }

    //!
    //! \brief Implementation of the nvinfer1::ILogger::log() virtual method
    //!
    //! Streams "[TRT] " followed by the message and a newline into a temporary LogStreamConsumer
    //! built from the current reportable severity and the message severity.
    //!
    //! Note samples should not be calling this function directly; it will eventually go away once we eliminate the
    //! inheritance from nvinfer1::ILogger
    //!
    //! NOTE(review): msg is passed to std::string unchecked, so it is assumed to be non-null.
    //!
    void log(Severity severity, const char* msg) TRT_NOEXCEPT override {
        LogStreamConsumer(mReportableSeverity, severity) << "[TRT] " << std::string(msg) << '\n';
    }

    //!
    //! \brief Method for controlling the verbosity of logging output
    //!
    //! \param severity The logger will only emit messages that have severity of this level or higher.
    //!
    void setReportableSeverity(Severity severity) { mReportableSeverity = severity; }

    //!
    //! \brief Opaque handle that holds logging information for a particular test
    //!
    //! This object is an opaque handle to information used by the Logger to print test results.
    //! The sample must call Logger::defineTest() in order to obtain a TestAtom that can be used
    //! with Logger::reportTest{Start,End}().
    //!
    class TestAtom {
       public:
        // Move-only from the outside: the only other constructor is private.
        TestAtom(TestAtom&&) = default;

       private:
        // Logger is the sole factory (defineTest) and reads/writes the fields directly.
        friend class Logger;

        TestAtom(bool started, TestInfo info)
            : mStarted(started), mName(std::move(info.name)), mCmdline(std::move(info.cmdline)) {}

        bool mStarted;         // set by reportTestStart(), asserted by reportTestEnd()
        std::string mName;     // test name printed in result lines
        std::string mCmdline;  // command line printed after " # " in result lines
    };

    //!
    //! \brief Define a test for logging
    //!
    //! \param[in] name The name of the test.  This should be a string starting with
    //!                  "TensorRT" and containing dot-separated strings containing
    //!                  the characters [A-Za-z0-9_].
    //!                  For example, "TensorRT.sample_googlenet"
    //! \param[in] cmdline The command line used to reproduce the test
    //!
    //! \return a TestAtom (not yet started) that can be used in Logger::reportTest{Start,End}().
    //!
    static TestAtom defineTest(const std::string& name, const std::string& cmdline) {
        return TestAtom{false, TestInfo{name, cmdline}};
    }

    //!
    //! \brief A convenience overloaded version of defineTest() that accepts an array of command-line arguments
    //!        as input
    //!
    //! \param[in] name The name of the test
    //! \param[in] argc The number of command-line arguments
    //! \param[in] argv The array of command-line arguments (given as C strings)
    //!
    //! \return a TestAtom that can be used in Logger::reportTest{Start,End}().
    //!
    static TestAtom defineTest(const std::string& name, int argc, char const* const* argv) {
        auto cmdline = genCmdlineString(argc, argv);
        return defineTest(name, cmdline);
    }

    //!
    //! \brief Report that a test has started.
    //!
    //! \pre reportTestStart() has not been called yet for the given testAtom
    //!
    //! \param[in] testAtom The handle to the test that has started
    //!
    //! NOTE(review): the RUNNING line is printed before the precondition is asserted, so a
    //! double start still emits output before the assertion fires.
    //!
    static void reportTestStart(TestAtom& testAtom) {
        reportTestResult(testAtom, TestResult::kRUNNING);
        assert(!testAtom.mStarted);
        testAtom.mStarted = true;
    }

    //!
    //! \brief Report that a test has ended.
    //!
    //! \pre reportTestStart() has been called for the given testAtom
    //!
    //! \param[in] testAtom The handle to the test that has ended
    //! \param[in] result The result of the test. Should be one of TestResult::kPASSED,
    //!                   TestResult::kFAILED, TestResult::kWAIVED
    //!
    static void reportTestEnd(const TestAtom& testAtom, TestResult result) {
        assert(result != TestResult::kRUNNING);
        assert(testAtom.mStarted);
        reportTestResult(testAtom, result);
    }

    //! \brief Report kPASSED for the test and return EXIT_SUCCESS.
    static int reportPass(const TestAtom& testAtom) {
        reportTestEnd(testAtom, TestResult::kPASSED);
        return EXIT_SUCCESS;
    }

    //! \brief Report kFAILED for the test and return EXIT_FAILURE.
    static int reportFail(const TestAtom& testAtom) {
        reportTestEnd(testAtom, TestResult::kFAILED);
        return EXIT_FAILURE;
    }

    //! \brief Report kWAIVED for the test and return EXIT_SUCCESS.
    static int reportWaive(const TestAtom& testAtom) {
        reportTestEnd(testAtom, TestResult::kWAIVED);
        return EXIT_SUCCESS;
    }

    //! \brief Report pass or fail depending on \p pass and return the matching exit code.
    static int reportTest(const TestAtom& testAtom, bool pass) {
        return pass ? reportPass(testAtom) : reportFail(testAtom);
    }

    //! \brief Return the severity threshold currently used to filter messages.
    [[nodiscard]] Severity getReportableSeverity() const { return mReportableSeverity; }

   private:
    //!
    //! \brief Name / command-line pair used to construct a TestAtom
    //!
    struct TestInfo {
        std::string name;
        std::string cmdline;
    };

    //!
    //! \brief returns an appropriate string for prefixing a log message with the given severity
    //!
    static const char* severityPrefix(Severity severity) {
        switch (severity) {
            case Severity::kINTERNAL_ERROR:
                return "[F] ";
            case Severity::kERROR:
                return "[E] ";
            case Severity::kWARNING:
                return "[W] ";
            case Severity::kINFO:
                return "[I] ";
            case Severity::kVERBOSE:
                return "[V] ";
            default:
                // Unknown severity: trips the assert in debug builds, empty prefix otherwise.
                assert(0);
                return "";
        }
    }

    //!
    //! \brief returns an appropriate string for prefixing a test result message with the given result
    //!
    static const char* testResultString(TestResult result) {
        switch (result) {
            case TestResult::kRUNNING:
                return "RUNNING";
            case TestResult::kPASSED:
                return "PASSED";
            case TestResult::kFAILED:
                return "FAILED";
            case TestResult::kWAIVED:
                return "WAIVED";
            default:
                // Unknown result: trips the assert in debug builds, empty string otherwise.
                assert(0);
                return "";
        }
    }

    //!
    //! \brief returns an appropriate output stream (cout or cerr) to use with the given severity
    //!
    //! Severities comparing >= kINFO go to std::cout; everything else goes to std::cerr.
    //!
    static std::ostream& severityOstream(Severity severity) {
        return severity >= Severity::kINFO ? std::cout : std::cerr;
    }

    //!
    //! \brief method that implements logging test results
    //!
    //! Emits one line of the form "&&&& <RESULT> <name> # <cmdline>" on the kINFO stream.
    //!
    static void reportTestResult(const TestAtom& testAtom, TestResult result) {
        severityOstream(Severity::kINFO) << "&&&& " << testResultString(result) << " " << testAtom.mName << " # "
                                         << testAtom.mCmdline << '\n';
    }

    //!
    //! \brief generate a command line string from the given (argc, argv) values
    //!
    //! The arguments are joined with single spaces; no quoting or escaping is applied.
    //!
    static std::string genCmdlineString(int argc, char const* const* argv) {
        std::stringstream ss;
        for (int i = 0; i < argc; i++) {
            if (i > 0)
                ss << " ";
            ss << argv[i];
        }
        return ss.str();
    }

    // Threshold handed to every LogStreamConsumer created by log() and the LOG_* helpers.
    Severity mReportableSeverity;
};
inline LogStreamConsumer LOG_WARN(const Logger& logger) { return LogStreamConsumer{logger.getReportableSeverity(), Severity::kWARNING}; } //! //! \brief produces a LogStreamConsumer object that can be used to log messages of severity kERROR //! //! Example usage: //! //! LOG_ERROR(logger) << "hello world" << std::endl; //! inline LogStreamConsumer LOG_ERROR(const Logger& logger) { return LogStreamConsumer{logger.getReportableSeverity(), Severity::kERROR}; } //! //! \brief produces a LogStreamConsumer object that can be used to log messages of severity kINTERNAL_ERROR // ("fatal" severity) //! //! Example usage: //! //! LOG_FATAL(logger) << "hello world" << std::endl; //! inline LogStreamConsumer LOG_FATAL(const Logger& logger) { return LogStreamConsumer{logger.getReportableSeverity(), Severity::kINTERNAL_ERROR}; } } // anonymous namespace #endif // TENSORRT_LOGGING_H ================================================ FILE: mlp/macros.h ================================================ #pragma once #include #ifdef API_EXPORTS #if defined(_MSC_VER) #define API __declspec(dllexport) #else #define API __attribute__((visibility("default"))) #endif #else #if defined(_MSC_VER) #define API __declspec(dllimport) #else #define API #endif #endif // API_EXPORTS #define TRT_VERSION \ ((NV_TENSORRT_MAJOR * 1000) + (NV_TENSORRT_MINOR * 100) + (NV_TENSORRT_PATCH * 10) + NV_TENSORRT_BUILD) #if TRT_VERSION < 7220 #error "TensorRT >= 7.2.2 is required for this demo." 
#endif #if TRT_VERSION >= 8000 #define TRT_NOEXCEPT noexcept #define TRT_CONST_ENQUEUE const #else #define TRT_NOEXCEPT #define TRT_CONST_ENQUEUE #endif ================================================ FILE: mlp/mlp.cpp ================================================ #include #include #include #include #include #include "logging.h" #include "utils.h" using namespace nvinfer1; constexpr static const int64_t INPUT_SIZE = 1; constexpr static const int64_t OUTPUT_SIZE = 1; constexpr static const char* INPUT_NAME = "data"; constexpr static const char* OUTPUT_NAME = "out"; constexpr static const char* WTS_PATH = "../models/mlp.wts"; constexpr static const char* ENGINE_PATH = "../models/mlp.engine"; // Logger from TRT API static Logger gLogger; /** * Create a single-layer "MLP" using the TRT Builder and Configurations * * @param N: max batch size for built TRT model * @param builder: to build engine and networks * @param config: configuration related to Hardware * @param dt: datatype for model layers * @return engine: TRT model */ ICudaEngine* createMLPEngine(int32_t N, IRuntime* runtime, IBuilder* builder, IBuilderConfig* config, DataType dt) { std::cout << "[INFO]: Creating MLP using TensorRT...\n"; // Load Weights from relevant file std::map weightMap = loadWeights(WTS_PATH); // Create an empty network #if TRT_VERSION >= 10000 auto* network = builder->createNetworkV2(0); #else auto* network = builder->createNetworkV2(1u << static_cast(NetworkDefinitionCreationFlag::kEXPLICIT_BATCH)); #endif // Create an input with proper name ITensor* data = network->addInput(INPUT_NAME, dt, Dims4{N, 1, 1, 1}); assert(data); // all tensors auto* fc1w = network->addConstant(Dims4{1, 1, 1, 1}, weightMap["linear.weight"])->getOutput(0); auto* fc1b = network->addConstant(Dims4{1, 1, 1, 1}, weightMap["linear.bias"])->getOutput(0); assert(fc1w && fc1b); // fc layer auto* fc1_0 = network->addMatrixMultiply(*data, MatrixOperation::kNONE, *fc1w, MatrixOperation::kTRANSPOSE); auto* fc1_1 = 
network->addElementWise(*fc1_0->getOutput(0), *fc1b, ElementWiseOperation::kSUM); assert(fc1_0 && fc1_1); fc1_0->setName("fc1_0"); // set output with name auto* output = fc1_1->getOutput(0); output->setName(OUTPUT_NAME); // mark the output network->markOutput(*output); #if TRT_VERSION >= 8000 IHostMemory* serialized_mem = builder->buildSerializedNetwork(*network, *config); ICudaEngine* engine = runtime->deserializeCudaEngine(serialized_mem->data(), serialized_mem->size()); delete network; #else builder->setMaxBatchSize(N); config->setMaxWorkspaceSize(WORKSPACE_SIZE); ICudaEngine* engine = builder->buildEngineWithConfig(*network, *config); network->destroy(); #endif assert(engine != nullptr); // Release host memory for (auto& mem : weightMap) { free((void*)(mem.second.values)); } return engine; } void APIToModel(int32_t maxBatchSize, IRuntime* runtime, IHostMemory** modelStream) { /** * Create engine using TensorRT APIs * * @param maxBatchSize: for the deployed model configs * @param modelStream: shared memory to store serialized model */ // Create builder with the logger IBuilder* builder = createInferBuilder(gLogger); IBuilderConfig* config = builder->createBuilderConfig(); // Build an engine ICudaEngine* engine = createMLPEngine(maxBatchSize, runtime, builder, config, DataType::kFLOAT); assert(engine != nullptr); // serialize the engine into binary stream (*modelStream) = engine->serialize(); #if TRT_VERSION >= 8000 delete engine; delete config; delete builder; #else engine->destroy(); config->destroy(); builder->destroy(); #endif } void doInference(IExecutionContext& ctx, void* input, float* output, int64_t batchSize = 1) { /** * Perform inference using the CUDA ctx * * @param ctx: context created by engine * @param input: input from the host * @param output: output to save on host * @param batchSize: batch size for TRT model */ // Get engine from the ctx const ICudaEngine& engine = ctx.getEngine(); #if TRT_VERSION >= 8000 int32_t nIO = engine.getNbIOTensors(); 
const int inputIndex = 0; const int outputIndex = engine.getNbIOTensors() - 1; #else int32_t nIO = engine.getNbBindings(); const int inputIndex = engine.getBindingIndex(INPUT_NAME); const int outputIndex = engine.getBindingIndex(OUTPUT_NAME); #endif assert(nIO == 2); // mlp contains 1 input and 1 output // create cuda stream for aync cuda operations cudaStream_t stream; CHECK(cudaStreamCreate(&stream)); // create GPU buffers on cuda device and copy input data from host std::vector buffers(nIO, nullptr); size_t inputSize = 0; size_t outputSize = batchSize * OUTPUT_SIZE * sizeof(float); #if TRT_VERSION >= 8000 auto* input_name = engine.getIOTensorName(inputIndex); inputSize = batchSize * INPUT_SIZE * getSize(engine.getTensorDataType(input_name)); #else inputSize = batchSize * INPUT_SIZE * getSize(engine.getBindingDataType(inputIndex)); #endif CHECK(cudaMalloc(&buffers[inputIndex], inputSize)); CHECK(cudaMalloc(&buffers[outputIndex], outputSize)); CHECK(cudaMemcpyAsync(buffers[inputIndex], input, inputSize, cudaMemcpyHostToDevice, stream)); // execute inference using ctx provided by engine #if TRT_VERSION >= 8000 for (int32_t i = 0; i < engine.getNbIOTensors(); i++) { auto const name = engine.getIOTensorName(i); auto dims = ctx.getTensorShape(name); auto total = std::accumulate(dims.d, dims.d + dims.nbDims, 1ll, std::multiplies<>()); std::cout << name << "\t" << total << "\n"; ctx.setTensorAddress(name, buffers[i]); } assert(ctx.enqueueV3(stream)); #else assert(ctx.enqueueV2(buffers.data(), stream, nullptr)); #endif CHECK(cudaMemcpyAsync(output, buffers[outputIndex], outputSize, cudaMemcpyDeviceToHost, stream)); CHECK(cudaStreamSynchronize(stream)); for (auto& buffer : buffers) { CHECK(cudaFree(buffer)); } CHECK(cudaStreamDestroy(stream)); } int main(int argc, char** argv) { checkTrtEnv(); if (argc != 2) { std::cerr << "[ERROR]: Arguments not right!\n"; std::cerr << "./mlp -s // serialize model to plan file\n"; std::cerr << "./mlp -d // deserialize plan file and run 
inference\n"; return 1; } IRuntime* runtime = createInferRuntime(gLogger); assert(runtime != nullptr); char* trtModelStream{nullptr}; std::streamsize size{0}; if (std::string(argv[1]) == "-s") { IHostMemory* modelStream{nullptr}; APIToModel(1, runtime, &modelStream); assert(modelStream != nullptr); std::ofstream p(ENGINE_PATH, std::ios::binary | std::ios::trunc); if (!p.good()) { std::cerr << "could not open plan output file\n"; return 1; } if (modelStream->size() > static_cast(std::numeric_limits::max())) { std::cerr << "this model is too large to serialize\n"; return -1; } const auto* data_ptr = reinterpret_cast(modelStream->data()); auto data_size = static_cast(modelStream->size()); p.write(data_ptr, data_size); #if TRT_VERSION >= 8000 delete modelStream; #else modelStream->destroy(); #endif std::cout << "[INFO]: Successfully created TensorRT engine.\n"; return 0; } else if (std::string(argv[1]) == "-d") { std::ifstream file(ENGINE_PATH, std::ios::binary); if (file.good()) { file.seekg(0, file.end); size = file.tellg(); file.seekg(0, file.beg); trtModelStream = new char[size]; assert(trtModelStream); file.read(trtModelStream, size); file.close(); } } #if TRT_VERSION >= 8000 ICudaEngine* engine = runtime->deserializeCudaEngine(trtModelStream, size); #else ICudaEngine* engine = runtime->deserializeCudaEngine(trtModelStream, size, nullptr); #endif assert(engine != nullptr); delete[] trtModelStream; IExecutionContext* ctx = engine->createExecutionContext(); assert(ctx != nullptr); std::array output = {-1.f}; std::array input = {12.0f}; for (int i = 0; i < 100; i++) { auto start = std::chrono::high_resolution_clock::now(); doInference(*ctx, input.data(), output.data()); auto end = std::chrono::high_resolution_clock::now(); auto time = std::chrono::duration_cast(end - start).count(); std::cout << "Execution time: " << time << "us\n" << "output: " << output[0] << "\n"; } #if TRT_VERSION >= 8000 delete ctx; delete engine; delete runtime; #else ctx->destroy(); 
def load_weights(file_path):
    """
    Parse the .wts file and store weights in dict format

    Expected layout: the first non-empty line holds the number of weight
    records; every following non-empty line is
    ``<name> <count> <hex> <hex> ...`` where each ``<hex>`` is one big-endian
    float32 written as 8 hex digits. Blank lines are ignored.

    :param file_path: path of the .wts file
    :return weight_map: dictionary mapping each weight name to a float32
        array with one row per value (shape ``(count, 1)``)
    :raises FileNotFoundError: if ``file_path`` does not exist
    :raises ValueError: if the file is empty or a record is malformed
    """
    print(f"[INFO]: Loading weights: {file_path}")
    # Explicit exceptions instead of `assert`: asserts are stripped under `python -O`.
    if not os.path.exists(file_path):
        raise FileNotFoundError(f"[ERROR]: Unable to load weight file: {file_path}")

    with open(file_path, "r", encoding="utf-8") as f:
        lines = [stripped for stripped in (line.strip() for line in f) if stripped]

    if not lines:
        raise ValueError(f"[ERROR]: Weight file is empty: {file_path}")

    # count for total # of weights
    try:
        count = int(lines[0])
    except ValueError as err:
        raise ValueError(f"[ERROR]: Invalid weight count {lines[0]!r} in {file_path}") from err
    if count != len(lines) - 1:
        raise ValueError(
            f"[ERROR]: Header announces {count} weights but {len(lines) - 1} records found in {file_path}"
        )

    weight_map = {}
    for record in lines[1:]:
        splits = record.split()
        if len(splits) < 2:
            raise ValueError(f"[ERROR]: Malformed weight record {record!r} in {file_path}")
        name = splits[0]
        try:
            cur_count = int(splits[1])
            # hex string to bytes to float; unpack yields a 1-tuple per value,
            # which is what gives the resulting array its (count, 1) shape
            values = [struct.unpack(">f", bytes.fromhex(token)) for token in splits[2:]]
        except (ValueError, struct.error) as err:
            raise ValueError(f"[ERROR]: Malformed values for weight {name!r} in {file_path}") from err
        if cur_count != len(values):
            raise ValueError(
                f"[ERROR]: Weight {name!r} announces {cur_count} values but has {len(values)} in {file_path}"
            )
        weight_map[name] = np.array(values, dtype=np.float32)

    return weight_map
################################ # INFERENCE RELATED ############ ################################ def perform_inference(input_val): """ Get inference using the pre-trained model :param input_val: a number as an input :return: """ def do_inference(inf_context, inf_host_in, inf_host_out): """ Perform inference using the CUDA context :param inf_context: context created by engine :param inf_host_in: input from the host :param inf_host_out: output to save on host :return: """ inference_engine = inf_context.engine # Input and output bindings are required for inference assert inference_engine.num_bindings == 2 # allocate memory in GPU using CUDA bindings device_in = cuda.mem_alloc(inf_host_in.nbytes) device_out = cuda.mem_alloc(inf_host_out.nbytes) # create bindings for input and output bindings = [int(device_in), int(device_out)] # create CUDA stream for simultaneous CUDA operations stream = cuda.Stream() # copy input from host (CPU) to device (GPU) in stream cuda.memcpy_htod_async(device_in, inf_host_in, stream) # execute inference using context provided by engine inf_context.execute_async(bindings=bindings, stream_handle=stream.handle) # copy output back from device (GPU) to host (CPU) cuda.memcpy_dtoh_async(inf_host_out, device_out, stream) # synchronize the stream to prevent issues # (block CUDA and wait for CUDA operations to be completed) stream.synchronize() # create a runtime (required for deserialization of model) with NVIDIA's logger runtime = trt.Runtime(gLogger) assert runtime # read and deserialize engine for inference with open(ENGINE_PATH, "rb") as f: engine = runtime.deserialize_cuda_engine(f.read()) assert engine # create execution context -- required for inference executions context = engine.create_execution_context() assert context # create input as array data = np.array([input_val], dtype=np.float32) # capture free memory for input in GPU host_in = cuda.pagelocked_empty((INPUT_SIZE), dtype=np.float32) # copy input-array from CPU to Flatten array in 
def get_args(argv=None):
    """
    Parse command line arguments

    Exactly one of ``-s`` (serialize) or ``-d`` (deserialize and run
    inference) must be given; otherwise usage is printed and the process
    exits with status 1.

    :param argv: optional list of argument strings; ``None`` (the default)
        makes argparse read ``sys.argv[1:]``
    :return arguments: parsed arguments as a dict with boolean keys 's' and 'd'
    :raises SystemExit: when neither or both flags are supplied
    """
    # Local import: `sys` is not among this module's top-level imports.
    import sys

    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument('-s', action='store_true')
    arg_parser.add_argument('-d', action='store_true')
    arguments = vars(arg_parser.parse_args(argv))

    # exactly one of the two flags must be set
    if arguments['s'] == arguments['d']:
        print("[ERROR]: Arguments not right!\n")
        print("\tpython mlp.py -s # serialize model to engine file")
        print("\tpython mlp.py -d # deserialize engine file and run inference")
        # sys.exit instead of the site-provided exit() helper, and a non-zero
        # status so callers/scripts can detect the usage error
        sys.exit(1)

    return arguments
weight file path * @return std::map */ static auto loadWeights(const std::string& file) { std::cout << "Loading weights: " << file << "\n"; std::map weightMap; // Open weights file std::ifstream input(file); assert(input.is_open() && "Unable to load weight file."); // Read number of weight blobs int32_t count; input >> count; assert(count > 0 && "Invalid weight map file."); while (count--) { nvinfer1::Weights wt{nvinfer1::DataType::kFLOAT, nullptr, 0}; // Read name and type of blob std::string name; input >> name >> std::dec >> wt.count; // Load blob auto* val = new uint32_t[wt.count]; input >> std::hex; for (auto x = 0ll; x < wt.count; ++x) { input >> val[x]; } wt.values = val; weightMap[name] = wt; } return weightMap; } static size_t getSize(DataType dt) { switch (dt) { #if TRT_VERSION >= 8510 case DataType::kUINT8: #endif case DataType::kINT8: return sizeof(int8_t); case DataType::kFLOAT: return sizeof(float); case DataType::kHALF: return sizeof(int16_t); case DataType::kINT32: return sizeof(int32_t); default: { std::cerr << "Unsupported data type\n"; std::abort(); } } } ================================================ FILE: mnasnet/CMakeLists.txt ================================================ cmake_minimum_required(VERSION 3.14) project( mnasnet VERSION 0.1 LANGUAGES C CXX CUDA) if(NOT DEFINED CMAKE_CUDA_ARCHITECTURES) set(CMAKE_CUDA_ARCHITECTURES 60 70 72 75 80 86 89) endif() set(CMAKE_CXX_STANDARD 17) set(CMAKE_CXX_STANDARD_REQUIRED ON) set(CMAKE_CUDA_STANDARD 17) set(CMAKE_CUDA_STANDARD_REQUIRED ON) set(CMAKE_EXPORT_COMPILE_COMMANDS ON) set(CMAKE_INCLUDE_CURRENT_DIR TRUE) option(CUDA_USE_STATIC_CUDA_RUNTIME "Use static cudaruntime library" OFF) find_package(Threads REQUIRED) find_package(CUDAToolkit REQUIRED) find_package(OpenCV REQUIRED) if(NOT TARGET TensorRT::TensorRT) include(FindTensorRT.cmake) else() message("TensorRT has been found, skipping for ${PROJECT_NAME}") endif() add_executable(${PROJECT_NAME} mnasnet.cpp) 
# _guess_path(<var_name> <required_files> [<candidate_dir>...])
#
# Filters the candidate directories (everything after the two named arguments, i.e. ARGN) down to
# those that exist and contain every entry of the list <required_files>, and stores the result in
# <var_name> in the caller's scope.
#
# NOTE: every accepted candidate is appended, so <var_name> may receive a list of several
# directories (in candidate order), not just the first match.
#
# Stops the configure step with FATAL_ERROR when no candidate qualifies. Per-candidate decisions
# are logged at DEBUG level.
function(_guess_path var_name required_files)
  set(_result "")

  foreach(path_entry IN LISTS ARGN)
    # Candidate directory itself must exist.
    if(NOT EXISTS "${path_entry}")
      message(DEBUG "skip non-existing path '${path_entry}'")
      continue()
    endif()

    # Candidate must contain all required files; stop at the first missing one.
    set(_ok TRUE)
    foreach(required_file IN LISTS required_files)
      if(NOT EXISTS "${path_entry}/${required_file}")
        set(_ok FALSE)
        message(DEBUG "'${path_entry}' missing '${required_file}'")
        break()
      endif()
    endforeach()

    if(_ok)
      list(APPEND _result "${path_entry}")
      message(DEBUG "accept '${path_entry}'")
    else()
      message(DEBUG "reject '${path_entry}'")
    endif()
  endforeach()

  # Nothing accepted: fail loudly and show what was searched for and where.
  if(_result STREQUAL "")
    message(
      FATAL_ERROR
        "_guess_path(${var_name}) failed: no valid path found. required_files='${required_files}' candidates='${ARGN}'"
    )
  endif()

  # Hand the result back to the caller's scope.
  set(${var_name}
      "${_result}"
      PARENT_SCOPE)
endfunction()
) endif() if(${TRT_MAJOR_VERSION} GREATER_EQUAL 10) set(_modules nvinfer_10 nvinfer_plugin_10 nvinfer_vc_plugin_10 nvinfer_dispatch_10 nvinfer_lean_10) message(DEBUG "Using ${_modules}") else() set(_modules nvinfer nvinfer_plugin nvinfer_vc_plugin nvinfer_dispatch nvinfer_lean) endif() set(TensorRT_LIBRARY_DIR "${TensorRT_DIR}/lib") set(TensorRT_INCLUDE_DIR "${TensorRT_DIR}/include") elseif(UNIX) string(TOLOWER "${CMAKE_SYSTEM_PROCESSOR}" _trt_arch) set(_trt_include_candidates) if(_trt_arch MATCHES "^(aarch64|arm64|arch64)$") set(_trt_include_candidates "/usr/include/aarch64-linux-gnu" "/usr/include" "/usr/local/cuda/targets/aarch64-linux/include") set(_trt_library_candidates "/usr/local/tensorrt/targets/aarch64-linux-gnu/lib" "/usr/lib/aarch64-linux-gnu" "/usr/lib/aarch64-linux-gnu/tegra" "/usr/lib") elseif(_trt_arch MATCHES "^(x86_64|amd64)$") set(_trt_include_candidates "/usr/local/tensorrt/targets/x86_64-linux-gnu/include" "/usr/include/x86_64-linux-gnu" "/usr/include") set(_trt_library_candidates "/usr/local/tensorrt/targets/x86_64-linux-gnu/lib" "/usr/lib/x86_64-linux-gnu" "/usr/lib") else() message(FATAL_ERROR "Unknown architecture") endif() set(_modules nvinfer nvinfer_plugin) if(${TRT_MAJOR_VERSION} GREATER_EQUAL 8) list(APPEND _modules nvinfer_vc_plugin nvinfer_dispatch nvinfer_lean) endif() _guess_path(TensorRT_LIBRARY_DIR "libnvinfer.so;libnvinfer_plugin.so" ${_trt_library_candidates}) message(STATUS "TensorRT libraries: ${TensorRT_LIBRARY_DIR}") _guess_path(TensorRT_INCLUDE_DIR "NvInfer.h" ${_trt_include_candidates}) message(STATUS "TensorRT includes: ${TensorRT_INCLUDE_DIR}") endif() foreach(lib IN LISTS _modules) find_library( TensorRT_${lib}_LIBRARY NAMES ${lib} HINTS ${TensorRT_LIBRARY_DIR}) list(APPEND TensorRT_LIBRARIES ${TensorRT_${lib}_LIBRARY}) endforeach() target_link_libraries(TensorRT INTERFACE ${TensorRT_LIBRARIES}) message(STATUS "Found TensorRT libs: ${TensorRT_LIBRARIES}") set_target_properties( TensorRT PROPERTIES C_STANDARD 17 
CXX_STANDARD 17 POSITION_INDEPENDENT_CODE ON SKIP_BUILD_RPATH TRUE BUILD_WITH_INSTALL_RPATH TRUE INSTALL_RPATH "$ORIGIN" INTERFACE_INCLUDE_DIRECTORIES "${TensorRT_INCLUDE_DIR}") unset(TRT_MAJOR_VERSION) unset(_modules) unset(_trt_include_candidates) unset(_trt_library_candidates) unset(_trt_arch) ================================================ FILE: mnasnet/README.md ================================================ # mnasnet MNASNet with depth multiplier of 0.5 from "MnasNet: Platform-Aware Neural Architecture Search for Mobile" For the Pytorch implementation, you can refer to [pytorchx/mnasnet](https://github.com/wang-xinyu/pytorchx/tree/master/mnasnet) Following tricks are used in this mnasnet, nothing special, group conv and batchnorm are used. - Batchnorm layer, implemented by scale layer. ## Usage 1. use `gen_wts.py` to generate wts file ```bash python gen_wts.py ``` 2. build C++ code ```bash pushd tensorrtx/mnasnet cmake -S . -B build -G Ninja --fresh cmake --build build ``` 3. serialize wts model to engine file ```bash ./build/mnasnet -s ``` 4. run inference ```bash ./build/mnasnet -d ``` The output looks like: ```bash ... 
def read_imagenet_labels(
    label_path: str = "../assets/imagenet1000_clsidx_to_labels.txt",
) -> dict[int, str]:
    """
    read ImageNet 1000 labels

    Each non-blank line is split at the first ``": "`` into a class id and the
    rest of the line. The label is the rest with its first character and its
    last two characters (after removing the line break) dropped -- presumably
    the opening quote and the closing quote plus comma of ``'label',``; verify
    against the label file. When a class id occurs more than once, the first
    occurrence wins.

    Args:
        label_path (str): path of the label file; defaults to the file shipped
            in the repository's ``assets`` directory

    Returns:
        dict[int, str]: labels dict
    """
    clsid2label: dict[int, str] = {}
    with open(label_path, "r", encoding="utf-8") as f:
        for line in f:
            if not line.strip():
                continue
            # maxsplit=1: a label may itself contain ": "
            k, v = line.split(": ", 1)
            # Strip the line break first so that a final line without a trailing
            # newline is not cut one character short.
            clsid2label.setdefault(int(k), v.rstrip("\n")[1:-2])
    return clsid2label
def main():
    """
    Run every model in ``MODELS`` on the sample image and export its weights.

    For each ``(name, model)`` pair the top-3 predictions for
    ``../assets/cats.jpg`` are printed, then the model's ``state_dict`` is
    written to ``../models/<name>.wts``: a first line with the number of
    tensors, followed by one line per tensor of the form
    ``<key> <num_values> <hex> <hex> ...`` where every value is a big-endian
    float32 rendered as hex.
    """
    labels = read_imagenet_labels()
    img = preprocess(cv2.imread("../assets/cats.jpg", cv2.IMREAD_COLOR))

    for name, model in MODELS:
        # switch to evaluation mode before running the reference inference
        model.eval()
        with torch.inference_mode():
            output = model(img)

        # print the top-3 classes per batch entry as a sanity check
        for i, batch in enumerate(torch.topk(output, k=3).indices):
            for j, idx in enumerate(batch):
                print(f"\tBatch: {i}, Top: {j}, logits: {output[i][idx]:.4f}, label: {labels[int(idx)]}")
        print(f"{'=' * 32}")

        with open(f"../models/{name}.wts", "w") as f:
            # header line: number of tensors that follow
            f.write("{}\n".format(len(model.state_dict().keys())))
            for k, v in model.state_dict().items():
                print("key: ", k)
                print("value: ", v.shape)
                # flatten to 1-D so the tensor becomes one line of values
                vr = v.reshape(-1).cpu().numpy()
                f.write("{} {}".format(k, len(vr)))
                for vv in vr:
                    f.write(" ")
                    # big-endian float32 -> 8 hex digits
                    f.write(struct.pack(">f", float(vv)).hex())
                f.write("\n")
//!
//! \class LogStreamConsumerBuffer
//! \brief String buffer that, when synchronized or destroyed, forwards its accumulated text to a
//!        target stream, preceded by a local-time stamp and a fixed prefix.
//!
class LogStreamConsumerBuffer : public std::stringbuf {
   public:
    //! \param stream    stream that receives the formatted output
    //! \param prefix    text inserted between the timestamp and the buffered message
    //! \param shouldLog when false, putOutput() emits nothing
    LogStreamConsumerBuffer(std::ostream& stream, std::string prefix, bool shouldLog)
        : mOutput(stream), mPrefix(std::move(prefix)), mShouldLog(shouldLog) {}

    //! Takes over the target stream, prefix and enable flag of \p other.
    LogStreamConsumerBuffer(LogStreamConsumerBuffer&& other) noexcept
        : mOutput(other.mOutput), mPrefix(std::move(other.mPrefix)), mShouldLog(other.mShouldLog) {}

    ~LogStreamConsumerBuffer() override {
        // std::streambuf::pbase() gives a pointer to the beginning of the buffered part of the output sequence
        // std::streambuf::pptr() gives a pointer to the current position of the output sequence
        // if they differ there is pending text, so log it before the buffer goes away
        if (pbase() != pptr()) {
            putOutput();
        }
    }

    //! Synchronizes the stream buffer: inserts the buffer contents into the target stream,
    //! resets the buffer and flushes the stream.
    //! \return always 0 (success)
    int sync() override {
        putOutput();
        return 0;
    }

    //! Writes "[MM/DD/YYYY-HH:MM:SS] <prefix><buffered text>" to the target stream, then clears the
    //! buffer and flushes. Does nothing when logging is disabled.
    void putOutput() {
        if (mShouldLog) {
            // Prepend the timestamp on the SAME stream as the message so the two cannot be separated
            // when stdout and stderr are redirected independently.
            std::time_t timestamp = std::time(nullptr);
            const std::tm* tm_local = std::localtime(&timestamp);
            if (tm_local != nullptr) {
                // std::put_time zero-pads each field itself and leaves the stream's fill character alone.
                mOutput << "[" << std::put_time(tm_local, "%m/%d/%Y-%H:%M:%S") << "] ";
            }
            // std::stringbuf::str() gets the string contents of the buffer
            mOutput << mPrefix << str();
            // set the buffer to empty
            str("");
            // flush the stream
            mOutput.flush();
        }
    }

    //! Enables or disables output for subsequent putOutput() calls.
    void setShouldLog(bool shouldLog) { mShouldLog = shouldLog; }

   private:
    std::ostream& mOutput;
    std::string mPrefix;
    bool mShouldLog;
};
LogStreamConsumer(Severity reportableSeverity, Severity severity) : LogStreamConsumerBase(severityOstream(severity), severityPrefix(severity), severity <= reportableSeverity), std::ostream(&mBuffer) // links the stream buffer with the stream , mShouldLog(severity <= reportableSeverity), mSeverity(severity) {} LogStreamConsumer(LogStreamConsumer&& other) noexcept : LogStreamConsumerBase(severityOstream(other.mSeverity), severityPrefix(other.mSeverity), other.mShouldLog), std::ostream(&mBuffer) // links the stream buffer with the stream , mShouldLog(other.mShouldLog), mSeverity(other.mSeverity) {} void setReportableSeverity(Severity reportableSeverity) { mShouldLog = mSeverity <= reportableSeverity; mBuffer.setShouldLog(mShouldLog); } private: static std::ostream& severityOstream(Severity severity) { return severity >= Severity::kINFO ? std::cout : std::cerr; } static std::string severityPrefix(Severity severity) { switch (severity) { case Severity::kINTERNAL_ERROR: return "[F] "; case Severity::kERROR: return "[E] "; case Severity::kWARNING: return "[W] "; case Severity::kINFO: return "[I] "; case Severity::kVERBOSE: return "[V] "; default: assert(0); return ""; } } bool mShouldLog; Severity mSeverity; }; //! \class Logger //! //! \brief Class which manages logging of TensorRT tools and samples //! //! \details This class provides a common interface for TensorRT tools and samples to log information to the console, //! and supports logging two types of messages: //! //! - Debugging messages with an associated severity (info, warning, error, or internal error/fatal) //! - Test pass/fail messages //! //! The advantage of having all samples use this class for logging as opposed to emitting directly to stdout/stderr is //! that the logic for controlling the verbosity and formatting of sample output is centralized in one location. //! //! In the future, this class could be extended to support dumping test results to a file in some standard format //! 
(for example, JUnit XML), and providing additional metadata (e.g. timing the duration of a test run). //! //! TODO: For backwards compatibility with existing samples, this class inherits directly from the nvinfer1::ILogger //! interface, which is problematic since there isn't a clean separation between messages coming from the TensorRT //! library and messages coming from the sample. //! //! In the future (once all samples are updated to use Logger::getTRTLogger() to access the ILogger) we can refactor the //! class to eliminate the inheritance and instead make the nvinfer1::ILogger implementation a member of the Logger //! object. class Logger : public nvinfer1::ILogger { private: struct TestInfo; public: Logger(Severity severity = Severity::kWARNING) : mReportableSeverity(severity) {} //! //! \enum TestResult //! \brief Represents the state of a given test //! enum class TestResult : std::uint8_t { kRUNNING, //!< The test is running kPASSED, //!< The test passed kFAILED, //!< The test failed kWAIVED //!< The test was waived }; //! //! \brief Forward-compatible method for retrieving the nvinfer::ILogger associated with this Logger //! \return The nvinfer1::ILogger associated with this Logger //! //! TODO Once all samples are updated to use this method to register the logger with TensorRT, //! we can eliminate the inheritance of Logger from ILogger //! nvinfer1::ILogger& getTRTLogger() { return *this; } //! //! \brief Implementation of the nvinfer1::ILogger::log() virtual method //! //! Note samples should not be calling this function directly; it will eventually go away once we eliminate the //! inheritance from nvinfer1::ILogger //! void log(Severity severity, const char* msg) TRT_NOEXCEPT override { LogStreamConsumer(mReportableSeverity, severity) << "[TRT] " << std::string(msg) << '\n'; } //! //! \brief Method for controlling the verbosity of logging output //! //! \param severity The logger will only emit messages that have severity of this level or higher. 
//! void setReportableSeverity(Severity severity) { mReportableSeverity = severity; } //! //! \brief Opaque handle that holds logging information for a particular test //! //! This object is an opaque handle to information used by the Logger to print test results. //! The sample must call Logger::defineTest() in order to obtain a TestAtom that can be used //! with Logger::reportTest{Start,End}(). //! class TestAtom { public: TestAtom(TestAtom&&) = default; private: friend class Logger; TestAtom(bool started, TestInfo info) : mStarted(started), mName(std::move(info.name)), mCmdline(std::move(info.cmdline)) {} bool mStarted; std::string mName; std::string mCmdline; }; //! //! \brief Define a test for logging //! //! \param[in] name The name of the test. This should be a string starting with //! "TensorRT" and containing dot-separated strings containing //! the characters [A-Za-z0-9_]. //! For example, "TensorRT.sample_googlenet" //! \param[in] cmdline The command line used to reproduce the test // //! \return a TestAtom that can be used in Logger::reportTest{Start,End}(). //! static TestAtom defineTest(const std::string& name, const std::string& cmdline) { return TestAtom{false, TestInfo{name, cmdline}}; } //! //! \brief A convenience overloaded version of defineTest() that accepts an array of command-line arguments //! as input //! //! \param[in] name The name of the test //! \param[in] argc The number of command-line arguments //! \param[in] argv The array of command-line arguments (given as C strings) //! //! \return a TestAtom that can be used in Logger::reportTest{Start,End}(). static TestAtom defineTest(const std::string& name, int argc, char const* const* argv) { auto cmdline = genCmdlineString(argc, argv); return defineTest(name, cmdline); } //! //! \brief Report that a test has started. //! //! \pre reportTestStart() has not been called yet for the given testAtom //! //! \param[in] testAtom The handle to the test that has started //! 
static void reportTestStart(TestAtom& testAtom) { reportTestResult(testAtom, TestResult::kRUNNING); assert(!testAtom.mStarted); testAtom.mStarted = true; } //! //! \brief Report that a test has ended. //! //! \pre reportTestStart() has been called for the given testAtom //! //! \param[in] testAtom The handle to the test that has ended //! \param[in] result The result of the test. Should be one of TestResult::kPASSED, //! TestResult::kFAILED, TestResult::kWAIVED //! static void reportTestEnd(const TestAtom& testAtom, TestResult result) { assert(result != TestResult::kRUNNING); assert(testAtom.mStarted); reportTestResult(testAtom, result); } static int reportPass(const TestAtom& testAtom) { reportTestEnd(testAtom, TestResult::kPASSED); return EXIT_SUCCESS; } static int reportFail(const TestAtom& testAtom) { reportTestEnd(testAtom, TestResult::kFAILED); return EXIT_FAILURE; } static int reportWaive(const TestAtom& testAtom) { reportTestEnd(testAtom, TestResult::kWAIVED); return EXIT_SUCCESS; } static int reportTest(const TestAtom& testAtom, bool pass) { return pass ? reportPass(testAtom) : reportFail(testAtom); } [[nodiscard]] Severity getReportableSeverity() const { return mReportableSeverity; } private: struct TestInfo { std::string name; std::string cmdline; }; //! //! \brief returns an appropriate string for prefixing a log message with the given severity //! static const char* severityPrefix(Severity severity) { switch (severity) { case Severity::kINTERNAL_ERROR: return "[F] "; case Severity::kERROR: return "[E] "; case Severity::kWARNING: return "[W] "; case Severity::kINFO: return "[I] "; case Severity::kVERBOSE: return "[V] "; default: assert(0); return ""; } } //! //! \brief returns an appropriate string for prefixing a test result message with the given result //! 
namespace {

//!
//! \brief Creates a stream-style sink for kVERBOSE messages, gated by the logger's reportable severity.
//!
//! Example usage:
//!
//!     LOG_VERBOSE(logger) << "hello world" << std::endl;
//!
inline LogStreamConsumer LOG_VERBOSE(const Logger& logger) {
    const Severity threshold = logger.getReportableSeverity();
    return LogStreamConsumer(threshold, Severity::kVERBOSE);
}

//!
//! \brief Creates a stream-style sink for kINFO messages, gated by the logger's reportable severity.
//!
//! Example usage:
//!
//!     LOG_INFO(logger) << "hello world" << std::endl;
//!
inline LogStreamConsumer LOG_INFO(const Logger& logger) {
    const Severity threshold = logger.getReportableSeverity();
    return LogStreamConsumer(threshold, Severity::kINFO);
}

//!
//! \brief Creates a stream-style sink for kWARNING messages, gated by the logger's reportable severity.
//!
//! Example usage:
//!
//!     LOG_WARN(logger) << "hello world" << std::endl;
//!
inline LogStreamConsumer LOG_WARN(const Logger& logger) {
    const Severity threshold = logger.getReportableSeverity();
    return LogStreamConsumer(threshold, Severity::kWARNING);
}

//!
//! \brief Creates a stream-style sink for kERROR messages, gated by the logger's reportable severity.
//!
//! Example usage:
//!
//!     LOG_ERROR(logger) << "hello world" << std::endl;
//!
inline LogStreamConsumer LOG_ERROR(const Logger& logger) {
    const Severity threshold = logger.getReportableSeverity();
    return LogStreamConsumer(threshold, Severity::kERROR);
}

//!
//! \brief Creates a stream-style sink for kINTERNAL_ERROR ("fatal") messages, gated by the logger's
//!        reportable severity.
//!
//! Example usage:
//!
//!     LOG_FATAL(logger) << "hello world" << std::endl;
//!
inline LogStreamConsumer LOG_FATAL(const Logger& logger) {
    const Severity threshold = logger.getReportableSeverity();
    return LogStreamConsumer(threshold, Severity::kINTERNAL_ERROR);
}

}  // anonymous namespace
#endif #if TRT_VERSION >= 8000 #define TRT_NOEXCEPT noexcept #define TRT_CONST_ENQUEUE const #else #define TRT_NOEXCEPT #define TRT_CONST_ENQUEUE #endif ================================================ FILE: mnasnet/mnasnet.cpp ================================================ #include #include #include #include #include #include #include #include #include #include "logging.h" #include "utils.h" // stuff we know about mnasnet and the input/output blobs static constexpr const int INPUT_H = 224; static constexpr const int INPUT_W = 224; static constexpr const int OUTPUT_SIZE = 1000; static constexpr int N = 1; static constexpr const std::array NAMES = {"data", "prob"}; static constexpr const std::array SIZES = {3 * INPUT_H * INPUT_W, OUTPUT_SIZE}; static const std::string WTS_PATH = "../models/mnasnet0_5.wts"; static const std::string ENGINE_PATH = "../models/mnasnet0_5.engine"; static constexpr const char* LABELS_PATH = "../assets/imagenet1000_clsidx_to_labels.txt"; static constexpr const bool TRT_PREPROCESS = TRT_VERSION >= 8510 ? 
true : false; static constexpr const std::array mean = {0.485f, 0.456f, 0.406f}; static constexpr const std::array stdv = {0.229f, 0.224f, 0.225f}; using namespace nvinfer1; using WeightMap = std::map; using M = nvinfer1::MatrixOperation; using E = nvinfer1::ElementWiseOperation; using NDCF = nvinfer1::NetworkDefinitionCreationFlag; static Logger gLogger; struct ConvParams { int o; int k; int s; int p; int d; int g; float eps = 1e-5f; }; struct InvertedResParams { int inch; int o; int k; int s; int exp; }; ILayer* addBatchNorm2d(INetworkDefinition* network, WeightMap& weightMap, ITensor& input, const std::string& lname, float eps) { float* gamma = (float*)weightMap[lname + ".weight"].values; float* beta = (float*)weightMap[lname + ".bias"].values; float* mean = (float*)weightMap[lname + ".running_mean"].values; float* var = (float*)weightMap[lname + ".running_var"].values; auto len = weightMap[lname + ".running_var"].count; std::cout << lname << " running_var's len: " << len << "\n"; auto* scval = reinterpret_cast(malloc(sizeof(float) * len)); for (int i = 0; i < len; i++) { scval[i] = gamma[i] / sqrt(var[i] + eps); } Weights scale{DataType::kFLOAT, scval, len}; auto* shval = reinterpret_cast(malloc(sizeof(float) * len)); for (int i = 0; i < len; i++) { shval[i] = beta[i] - mean[i] * gamma[i] / sqrt(var[i] + eps); } Weights shift{DataType::kFLOAT, shval, len}; auto* pval = reinterpret_cast(malloc(sizeof(float) * len)); for (int i = 0; i < len; i++) { pval[i] = 1.0; } Weights power{DataType::kFLOAT, pval, len}; weightMap[lname + ".scale"] = scale; weightMap[lname + ".shift"] = shift; weightMap[lname + ".power"] = power; IScaleLayer* scale_1 = network->addScale(input, ScaleMode::kCHANNEL, shift, scale, power); assert(scale_1); return scale_1; } ILayer* CBR(INetworkDefinition* net, WeightMap& map, const std::string& name, ITensor& input, const ConvParams& cp, int start_index = 0, bool has_relu = true) { Weights bias{DataType::kFLOAT, nullptr, 0}; // conv -> bn -> relu 
auto conv_name = name + "." + std::to_string(start_index++) + ".weight"; if (map.find(conv_name) == map.end()) { std::cerr << "KeyError: " << name << "is not in weight map"; std::abort(); } auto* conv = net->addConvolutionNd(input, cp.o, DimsHW{cp.k, cp.k}, map[conv_name], bias); if (conv == nullptr) { std::cerr << "build conv layer failed in " << name; std::abort(); } conv->setStrideNd(DimsHW{cp.s, cp.s}); conv->setPaddingNd(DimsHW{cp.p, cp.p}); conv->setDilationNd(DimsHW{cp.d, cp.d}); conv->setNbGroups(cp.g); conv->setName(conv_name.c_str()); std::string bn_name = name + "." + std::to_string(start_index); auto* bn = addBatchNorm2d(net, map, *conv->getOutput(0), bn_name, cp.eps); if (has_relu) { auto* relu = net->addActivation(*bn->getOutput(0), ActivationType::kRELU); if (relu == nullptr) { std::cerr << "build relu layer failed in " << name; std::abort(); } return relu; } else { return bn; } } ILayer* invertedRes(INetworkDefinition* network, WeightMap& w, ITensor& input, const std::string& lname, const InvertedResParams& irp) { std::cout << "Building layer: " << lname << "\n"; static const Weights emptywts{DataType::kFLOAT, nullptr, 0}; int midch = irp.inch * irp.exp; auto* conv1 = network->addConvolutionNd(input, midch, DimsHW{1, 1}, w[lname + "layers.0.weight"], emptywts); assert(conv1); auto* bn1 = addBatchNorm2d(network, w, *conv1->getOutput(0), lname + "layers.1", 1e-5f); auto* relu1 = network->addActivation(*bn1->getOutput(0), ActivationType::kRELU); assert(relu1); auto* conv2 = network->addConvolutionNd(*relu1->getOutput(0), midch, DimsHW{irp.k, irp.k}, w[lname + "layers.3.weight"], emptywts); assert(conv2); conv2->setStrideNd(DimsHW{irp.s, irp.s}); conv2->setPaddingNd(DimsHW{irp.k / 2, irp.k / 2}); conv2->setNbGroups(midch); auto* bn2 = addBatchNorm2d(network, w, *conv2->getOutput(0), lname + "layers.4", 1e-5f); auto* relu2 = network->addActivation(*bn2->getOutput(0), ActivationType::kRELU); assert(relu2); auto* conv3 = 
network->addConvolutionNd(*relu2->getOutput(0), irp.o, DimsHW{1, 1}, w[lname + "layers.6.weight"], emptywts); assert(conv3); auto* bn3 = addBatchNorm2d(network, w, *conv3->getOutput(0), lname + "layers.7", 1e-5f); if (irp.inch == irp.o && irp.s == 1) { auto* ew1 = network->addElementWise(*bn3->getOutput(0), input, ElementWiseOperation::kSUM); assert(ew1); return ew1; } return bn3; } // Creat the engine using only the API and not any parser. ICudaEngine* createEngine(unsigned int maxBatchSize, IRuntime* runtime, IBuilder* builder, IBuilderConfig* config, DataType dt) { auto weightMap = loadWeights(WTS_PATH); #if TRT_VERSION >= 11200 auto flag = 1U << static_cast(NDCF::kSTRONGLY_TYPED); #elif TRT_VERSION >= 10000 auto flag = 0U; #else auto flag = 1U << static_cast(NDCF::kEXPLICIT_BATCH); #endif auto* network = builder->createNetworkV2(flag); ITensor* data{nullptr}; if constexpr (TRT_PREPROCESS) { dt = DataType::kUINT8; data = network->addInput(NAMES[0], dt, Dims4{N, INPUT_H, INPUT_W, 3}); auto* trans = addTransformLayer(network, *data, true, mean, stdv); data = trans->getOutput(0); } else { data = network->addInput(NAMES[0], dt, Dims4{N, 3, INPUT_H, INPUT_W}); } assert(data); int start_idx = 0; auto* cbr_0 = CBR(network, weightMap, "layers", *data, {16, 3, 2, 1, 1, 1}, start_idx, true); start_idx += 3; auto* cbr_1 = CBR(network, weightMap, "layers", *cbr_0->getOutput(0), {16, 3, 1, 1, 1, 16}, start_idx, true); start_idx += 3; auto* cbr_2 = CBR(network, weightMap, "layers", *cbr_1->getOutput(0), {8, 1, 1, 1, 1, 1}, start_idx, false); ILayer* ir1 = invertedRes(network, weightMap, *cbr_2->getOutput(0), "layers.8.0.", {8, 16, 3, 2, 3}); ir1 = invertedRes(network, weightMap, *ir1->getOutput(0), "layers.8.1.", {16, 16, 3, 1, 3}); ir1 = invertedRes(network, weightMap, *ir1->getOutput(0), "layers.8.2.", {16, 16, 3, 1, 3}); ir1 = invertedRes(network, weightMap, *ir1->getOutput(0), "layers.9.0.", {16, 24, 5, 2, 3}); ir1 = invertedRes(network, weightMap, *ir1->getOutput(0), 
"layers.9.1.", {24, 24, 5, 1, 3}); ir1 = invertedRes(network, weightMap, *ir1->getOutput(0), "layers.9.2.", {24, 24, 5, 1, 3}); ir1 = invertedRes(network, weightMap, *ir1->getOutput(0), "layers.10.0.", {24, 40, 5, 2, 6}); ir1 = invertedRes(network, weightMap, *ir1->getOutput(0), "layers.10.1.", {40, 40, 5, 1, 6}); ir1 = invertedRes(network, weightMap, *ir1->getOutput(0), "layers.10.2.", {40, 40, 5, 1, 6}); ir1 = invertedRes(network, weightMap, *ir1->getOutput(0), "layers.11.0.", {40, 48, 3, 1, 6}); ir1 = invertedRes(network, weightMap, *ir1->getOutput(0), "layers.11.1.", {48, 48, 3, 1, 6}); ir1 = invertedRes(network, weightMap, *ir1->getOutput(0), "layers.12.0.", {48, 96, 5, 2, 6}); ir1 = invertedRes(network, weightMap, *ir1->getOutput(0), "layers.12.1.", {96, 96, 5, 1, 6}); ir1 = invertedRes(network, weightMap, *ir1->getOutput(0), "layers.12.2.", {96, 96, 5, 1, 6}); ir1 = invertedRes(network, weightMap, *ir1->getOutput(0), "layers.12.3.", {96, 96, 5, 1, 6}); ir1 = invertedRes(network, weightMap, *ir1->getOutput(0), "layers.13.0.", {96, 160, 3, 1, 6}); auto* cbr_3 = CBR(network, weightMap, "layers", *ir1->getOutput(0), {1280, 1, 1, 0, 1, 1}, 14, true); auto* avg = network->addReduce(*cbr_3->getOutput(0), ReduceOperation::kAVG, 0xc, false); auto* _fcw = network->addConstant(DimsHW{1000, 1280}, weightMap["classifier.1.weight"]); auto* _fcb = network->addConstant(DimsHW{1, 1000}, weightMap["classifier.1.bias"]); auto* _fc1 = network->addMatrixMultiply(*avg->getOutput(0), M::kNONE, *_fcw->getOutput(0), M::kTRANSPOSE); auto* fc1 = network->addElementWise(*_fc1->getOutput(0), *_fcb->getOutput(0), E::kSUM); assert(fc1); fc1->getOutput(0)->setName(NAMES[1]); network->markOutput(*fc1->getOutput(0)); // Build engine #if TRT_VERSION >= 8000 config->setMemoryPoolLimit(MemoryPoolType::kWORKSPACE, WORKSPACE_SIZE); auto* _serialized = builder->buildSerializedNetwork(*network, *config); auto* engine = runtime->deserializeCudaEngine(_serialized->data(), _serialized->size()); delete 
_serialized; delete network; #else builder->setMaxBatchSize(N); config->setMaxWorkspaceSize(WORKSPACE_SIZE); auto* engine = builder->buildEngineWithConfig(*network, *config); network->destroy(); #endif std::cout << "build out\n"; // Release host memory for (auto& mem : weightMap) { free((void*)(mem.second.values)); } return engine; } void APIToModel(unsigned int maxBatchSize, IRuntime* runtime, IHostMemory** modelStream) { // Create builder IBuilder* builder = createInferBuilder(gLogger); IBuilderConfig* config = builder->createBuilderConfig(); // Create model to populate the network, then set the outputs and create an engine ICudaEngine* engine = createEngine(maxBatchSize, runtime, builder, config, DataType::kFLOAT); assert(engine != nullptr); // Serialize the engine (*modelStream) = engine->serialize(); // Close everything down #if TRT_VERSION >= 8000 delete engine; delete config; delete builder; #else engine->destroy(); config->destroy(); builder->destroy(); #endif } std::vector> do_inference(IExecutionContext& context, void* input, std::size_t batch_size) { const ICudaEngine& engine = context.getEngine(); cudaStream_t stream; CHECK(cudaStreamCreate(&stream)); std::vector buffers; #if TRT_VERSION >= 8000 const int32_t nIO = engine.getNbIOTensors(); #else const int32_t nIO = engine.getNbBindings(); #endif buffers.resize(nIO); for (auto i = 0; i < nIO; ++i) { std::size_t size = 0; #if TRT_VERSION >= 8000 auto* tensor_name = engine.getIOTensorName(i); auto s = getSize(engine.getTensorDataType(tensor_name)); size = s * batch_size * SIZES[i]; CHECK(cudaMalloc(&buffers[i], size)); if (i == 0) { CHECK(cudaMemcpyAsync(buffers[i], input, size, cudaMemcpyHostToDevice, stream)); } context.setTensorAddress(tensor_name, buffers[i]); #else const int32_t idx = engine.getBindingIndex(NAMES[i]); auto s = getSize(engine.getBindingDataType(idx)); assert(idx == i); size = s * batch_size * SIZES[i]; CHECK(cudaMalloc(&buffers[i], size)); if (i == 0) { 
CHECK(cudaMemcpyAsync(buffers[i], input, size, cudaMemcpyHostToDevice, stream)); } #endif } #if TRT_VERSION >= 8000 assert(context.enqueueV3(stream)); #else assert(context.enqueueV2(buffers.data(), stream, nullptr)); #endif std::vector> prob; for (int i = 1; i < nIO; ++i) { std::vector tmp(batch_size * SIZES[i], std::nanf("")); std::size_t size = batch_size * SIZES[i] * sizeof(float); CHECK(cudaMemcpyAsync(tmp.data(), buffers[i], size, cudaMemcpyDeviceToHost, stream)); prob.emplace_back(tmp); } CHECK(cudaStreamSynchronize(stream)); cudaStreamDestroy(stream); for (auto i = 0; i < nIO; ++i) { CHECK(cudaFree(buffers[i])); } return prob; } int main(int argc, char** argv) { checkTrtEnv(); if (argc != 2) { std::cerr << "arguments not right!\n"; std::cerr << "./mnasnet -s // serialize model to plan file\n"; std::cerr << "./mnasnet -d // deserialize plan file and run inference\n"; return -1; } auto* runtime = createInferRuntime(gLogger); assert(runtime != nullptr); // create a model using the API directly and serialize it to a stream char* trt_model_stream{nullptr}; std::streamsize size{0}; if (std::string(argv[1]) == "-s") { IHostMemory* modelStream{nullptr}; APIToModel(N, runtime, &modelStream); assert(modelStream != nullptr); std::ofstream p(ENGINE_PATH, std::ios::binary); if (!p) { std::cerr << "could not open plan output file\n"; return -1; } if (modelStream->size() > static_cast(std::numeric_limits::max())) { std::cerr << "this model is too large to serialize\n"; return -1; } const auto* data_ptr = reinterpret_cast(modelStream->data()); auto data_size = static_cast(modelStream->size()); p.write(data_ptr, data_size); #if TRT_VERSION >= 8000 delete modelStream; #else modelStream->destroy(); #endif return 0; } else if (std::string(argv[1]) == "-d") { std::ifstream file(ENGINE_PATH, std::ios::binary); if (file.good()) { file.seekg(0, file.end); size = file.tellg(); file.seekg(0, file.beg); trt_model_stream = new char[size]; assert(trt_model_stream); 
file.read(trt_model_stream, size); file.close(); } } else { return -1; } #if TRT_VERSION >= 8000 auto* engine = runtime->deserializeCudaEngine(trt_model_stream, size); #else auto* engine = runtime->deserializeCudaEngine(trt_model_stream, size, nullptr); #endif assert(engine != nullptr); auto* context = engine->createExecutionContext(); assert(context != nullptr); void* input = nullptr; std::vector flat_img; cv::Mat img; if constexpr (TRT_PREPROCESS) { // for simplicity, resize image on cpu side img = cv::imread("../assets/cats.jpg", cv::IMREAD_COLOR); cv::resize(img, img, cv::Size(INPUT_W, INPUT_H), 0, 0, cv::INTER_LINEAR); input = static_cast(img.data); } else { img = cv::imread("../assets/cats.jpg", cv::IMREAD_COLOR); flat_img = preprocess_img(img, true, mean, stdv, N, INPUT_H, INPUT_W); input = flat_img.data(); } for (int32_t i = 0; i < 100; ++i) { auto _start = std::chrono::system_clock::now(); auto prob = do_inference(*context, input, 1); auto _end = std::chrono::system_clock::now(); auto _time = std::chrono::duration_cast(_end - _start).count(); std::cout << "Execution time: " << _time << "ms\n"; for (const auto& vector : prob) { int idx = 0; for (auto v : vector) { std::cout << std::setprecision(4) << v << ", " << std::flush; if (++idx > 20) { std::cout << "\n====\n"; break; } } } if (i == 99) { std::cout << "prediction result:\n"; auto labels = loadImagenetLabelMap(LABELS_PATH); int _top = 0; for (auto& [idx, logits] : topk(prob[0], 3)) { std::cout << "Top: " << _top++ << " idx: " << idx << ", logits: " << logits << ", label: " << labels[idx] << "\n"; } } } delete[] trt_model_stream; return 0; } ================================================ FILE: mnasnet/utils.h ================================================ #pragma once #include #include #include #include #include #include #include #include #include #include #include "macros.h" using namespace nvinfer1; constexpr const std::size_t WORKSPACE_SIZE = 16 << 20; #define CHECK(status) \ do { \ auto ret = 
(status); \ if (ret != cudaSuccess) { \ std::cerr << "Cuda failure: " << ret << "\n"; \ std::abort(); \ } \ } while (0) static void checkTrtEnv(int device = 0) { #if TRT_VERSION < 8000 CHECK(cudaGetDevice(&device)); cudaDeviceProp prop{}; CHECK(cudaGetDeviceProperties(&prop, device)); const int sm = prop.major * 10 + prop.minor; if (sm > 86) { std::cerr << "TensorRT < 8 does not support SM > 86 on this GPU."; std::abort(); } #endif } /** * @brief TensorRT weight files have a simple space delimited format: * [type] [size] * * @param file input weight file path * @return std::map */ static std::map loadWeights(const std::string& file) { std::cout << "Loading weights: " << file << "\n"; std::map weightMap; // Open weights file std::ifstream input(file); assert(input.is_open() && "Unable to load weight file."); // Read number of weight blobs int32_t count; input >> count; assert(count > 0 && "Invalid weight map file."); while (count--) { nvinfer1::Weights wt{nvinfer1::DataType::kFLOAT, nullptr, 0}; // Read name and type of blob std::string name; input >> name >> std::dec >> wt.count; // Load blob auto* val = new uint32_t[wt.count]; input >> std::hex; for (auto x = 0ll; x < wt.count; ++x) { input >> val[x]; } wt.values = val; weightMap[name] = wt; } return weightMap; } /** * @brief a preprocess function aligning with ImageNet preprocess in torchvision, only support 3-channel image * * @param img opencv image with BGR layout * @param bgr2rgb whether to convert BGR to RGB * @param mean subtract mean * @param std divide std * @param n batch size * @param h resize height * @param w resize width * @return std::vector contiguous flatten image data in float32 type */ static std::vector preprocess_img(cv::Mat& img, bool bgr2rgb, const std::array& mean, const std::array& std, int n, int h, int w) { const auto c = img.channels(); const auto size = c * h * w; if (c != 3) { std::cerr << "this demo only supports 3 channel input image.\n"; std::abort(); } if (bgr2rgb) { 
/**
 * @brief Return the `k` largest values of `v` together with their indices, largest first.
 *
 * @param v values to rank
 * @param k number of entries wanted; clamped to `v.size()`
 * @return (index, value) pairs in descending value order; empty when `k <= 0` or `v` is empty
 */
static auto topk(const std::vector<float>& v, int k) -> std::vector<std::pair<int, float>> {
    if (k <= 0)
        return {};

    // Never ask for more entries than exist.
    const auto stride = std::min<int>(k, static_cast<int>(v.size()));

    std::vector<int> idx(v.size());
    std::iota(idx.begin(), idx.end(), 0);
    // The middle iterator must stay inside [begin, end], so use the clamped length, not k.
    std::partial_sort(idx.begin(), idx.begin() + stride, idx.end(), [&](int a, int b) { return v[a] > v[b]; });

    std::vector<std::pair<int, float>> out;
    out.reserve(stride);
    for (auto i = 0; i < stride; ++i)
        out.emplace_back(idx[i], v[idx[i]]);
    return out;
}
params->scale = {1.f / (std[0] * 255.f), 1.f / (std[1] * 255.f), 1.f / (std[2] * 255.f)}; static const Weights empty{DataType::kFLOAT, nullptr, 0ll}; const Weights shift{DataType::kFLOAT, params->shift.data(), 3ll}; const Weights scale{DataType::kFLOAT, params->scale.data(), 3ll}; gScaleParams.emplace_back(std::move(params)); ITensor* in = &input; if (input.getType() != DataType::kFLOAT) { #if TRT_VERSION >= 8000 auto* cast = network->addCast(input, DataType::kFLOAT); assert(cast); cast->setName("Cast to FP32"); in = cast->getOutput(0); #else auto* identity = network->addIdentity(input); assert(identity); identity->setName("Convert to FP32"); identity->setOutputType(0, DataType::kFLOAT); in = identity->getOutput(0); #endif } // Convert from NHWC to NCHW auto* perm = network->addShuffle(*in); assert(perm); perm->setName("NHWC -> NCHW"); perm->setFirstTranspose(Permutation{0, 3, 1, 2}); // Convert from BGR to RGB (optional) ITensor* data{nullptr}; if (bgr2rgb) { auto add_slice = [&](int c, const char* name) -> ITensor* { auto dims = perm->getOutput(0)->getDimensions(); Dims4 start = {0, c, 0, 0}, stride = {1, 1, 1, 1}; Dims4 size = {dims.d[0], 1, dims.d[2], dims.d[3]}; auto* _slice = network->addSlice(*perm->getOutput(0), start, size, stride); _slice->setName(name); assert(_slice && _slice->getNbOutputs() == 1); return _slice->getOutput(0); }; std::array channels = {add_slice(2, "R"), add_slice(1, "G"), add_slice(0, "B")}; auto* cat = network->addConcatenation(channels.data(), 3); assert(cat); cat->setName("RGB"); cat->setAxis(1); data = cat->getOutput(0); } else { data = perm->getOutput(0); } // Normalize auto* trans = network->addScale(*data, ScaleMode::kCHANNEL, shift, scale, empty); assert(trans); trans->setName("mean & std"); #if TRT_VERSION >= 8000 trans->setChannelAxis(1); #endif return trans; } static size_t getSize(DataType dt) { switch (dt) { #if TRT_VERSION >= 8510 case DataType::kUINT8: #endif case DataType::kINT8: return sizeof(int8_t); case 
DataType::kFLOAT: return sizeof(float); case DataType::kHALF: return sizeof(int16_t); case DataType::kINT32: return sizeof(int32_t); default: { std::cerr << "Unsupported data type\n"; std::abort(); } } } ================================================ FILE: mobilenet/mobilenetv2/CMakeLists.txt ================================================ cmake_minimum_required(VERSION 2.6) project(mobilenet) add_definitions(-std=c++11) option(CUDA_USE_STATIC_CUDA_RUNTIME OFF) set(CMAKE_CXX_STANDARD 11) set(CMAKE_BUILD_TYPE Debug) include_directories(${PROJECT_SOURCE_DIR}/include) # include and link dirs of cuda and tensorrt, you need adapt them if yours are different # cuda include_directories(/usr/local/cuda/include) link_directories(/usr/local/cuda/lib64) # tensorrt include_directories(/usr/include/x86_64-linux-gnu/) link_directories(/usr/lib/x86_64-linux-gnu/) add_executable(mobilenet ${PROJECT_SOURCE_DIR}/mobilenet_v2.cpp) target_link_libraries(mobilenet nvinfer) target_link_libraries(mobilenet cudart) add_definitions(-O2 -pthread) ================================================ FILE: mobilenet/mobilenetv2/README.md ================================================ # mobilenet v2 MobileNetV2 architecture from "MobileNetV2: Inverted Residuals and Linear Bottlenecks" . For the Pytorch implementation, you can refer to [pytorchx/mobilenet](https://github.com/wang-xinyu/pytorchx/tree/master/mobilenet) Following tricks are used in this mobilenet, - Relu6 is used in mobilenet v2. We use `Relu6(x) = Relu(x) - Relu(x-6)` in tensorrt. - Batchnorm layer, implemented by scale layer. ``` // 1. generate mobilenet.wts from [pytorchx/mobilenet](https://github.com/wang-xinyu/pytorchx/tree/master/mobilenet) // 2. put mobilenet.wts into tensorrtx/mobilenet // 3. build and run cd tensorrtx/mobilenet/mobilenetv2 mkdir build cd build cmake .. make sudo ./mobilenet -s // serialize model to plan file i.e. 'mobilenet.engine' sudo ./mobilenet -d // deserialize plan file and run inference // 4. 
see if the output is the same as pytorchx/mobilenet
```

### TensorRT Python API

```
# 1. generate mobilenetv2.wts from [pytorchx/mobilenet](https://github.com/wang-xinyu/pytorchx/tree/master/mobilenet)

# 2. put mobilenetv2.wts into tensorrtx/mobilenet/mobilenetv2

# 3. install Python dependencies (tensorrt/pycuda/numpy)

cd tensorrtx/mobilenet/mobilenetv2

python mobilenet_v2.py -s   # serialize model to plan file i.e. 'mobilenetv2.engine'

python mobilenet_v2.py -d   # deserialize plan file and run inference

# 4. see if the output is the same as pytorchx/mobilenet
```

================================================
FILE: mobilenet/mobilenetv2/logging.h
================================================
/*
 * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
*/ #ifndef TENSORRT_LOGGING_H #define TENSORRT_LOGGING_H #include #include #include #include #include #include #include #include "NvInferRuntimeCommon.h" using Severity = nvinfer1::ILogger::Severity; class LogStreamConsumerBuffer : public std::stringbuf { public: LogStreamConsumerBuffer(std::ostream& stream, const std::string& prefix, bool shouldLog) : mOutput(stream), mPrefix(prefix), mShouldLog(shouldLog) {} LogStreamConsumerBuffer(LogStreamConsumerBuffer&& other) : mOutput(other.mOutput) {} ~LogStreamConsumerBuffer() { // std::streambuf::pbase() gives a pointer to the beginning of the buffered part of the output sequence // std::streambuf::pptr() gives a pointer to the current position of the output sequence // if the pointer to the beginning is not equal to the pointer to the current position, // call putOutput() to log the output to the stream if (pbase() != pptr()) { putOutput(); } } // synchronizes the stream buffer and returns 0 on success // synchronizing the stream buffer consists of inserting the buffer contents into the stream, // resetting the buffer and flushing the stream virtual int sync() { putOutput(); return 0; } void putOutput() { if (mShouldLog) { // prepend timestamp std::time_t timestamp = std::time(nullptr); tm* tm_local = std::localtime(×tamp); std::cout << "["; std::cout << std::setw(2) << std::setfill('0') << 1 + tm_local->tm_mon << "/"; std::cout << std::setw(2) << std::setfill('0') << tm_local->tm_mday << "/"; std::cout << std::setw(4) << std::setfill('0') << 1900 + tm_local->tm_year << "-"; std::cout << std::setw(2) << std::setfill('0') << tm_local->tm_hour << ":"; std::cout << std::setw(2) << std::setfill('0') << tm_local->tm_min << ":"; std::cout << std::setw(2) << std::setfill('0') << tm_local->tm_sec << "] "; // std::stringbuf::str() gets the string contents of the buffer // insert the buffer contents pre-appended by the appropriate prefix into the stream mOutput << mPrefix << str(); // set the buffer to empty str(""); // flush the 
stream mOutput.flush(); } } void setShouldLog(bool shouldLog) { mShouldLog = shouldLog; } private: std::ostream& mOutput; std::string mPrefix; bool mShouldLog; }; //! //! \class LogStreamConsumerBase //! \brief Convenience object used to initialize LogStreamConsumerBuffer before std::ostream in LogStreamConsumer //! class LogStreamConsumerBase { public: LogStreamConsumerBase(std::ostream& stream, const std::string& prefix, bool shouldLog) : mBuffer(stream, prefix, shouldLog) {} protected: LogStreamConsumerBuffer mBuffer; }; //! //! \class LogStreamConsumer //! \brief Convenience object used to facilitate use of C++ stream syntax when logging messages. //! Order of base classes is LogStreamConsumerBase and then std::ostream. //! This is because the LogStreamConsumerBase class is used to initialize the LogStreamConsumerBuffer member field //! in LogStreamConsumer and then the address of the buffer is passed to std::ostream. //! This is necessary to prevent the address of an uninitialized buffer from being passed to std::ostream. //! Please do not change the order of the parent classes. //! class LogStreamConsumer : protected LogStreamConsumerBase, public std::ostream { public: //! \brief Creates a LogStreamConsumer which logs messages with level severity. //! Reportable severity determines if the messages are severe enough to be logged. 
LogStreamConsumer(Severity reportableSeverity, Severity severity) : LogStreamConsumerBase(severityOstream(severity), severityPrefix(severity), severity <= reportableSeverity), std::ostream(&mBuffer) // links the stream buffer with the stream , mShouldLog(severity <= reportableSeverity), mSeverity(severity) {} LogStreamConsumer(LogStreamConsumer&& other) : LogStreamConsumerBase(severityOstream(other.mSeverity), severityPrefix(other.mSeverity), other.mShouldLog), std::ostream(&mBuffer) // links the stream buffer with the stream , mShouldLog(other.mShouldLog), mSeverity(other.mSeverity) {} void setReportableSeverity(Severity reportableSeverity) { mShouldLog = mSeverity <= reportableSeverity; mBuffer.setShouldLog(mShouldLog); } private: static std::ostream& severityOstream(Severity severity) { return severity >= Severity::kINFO ? std::cout : std::cerr; } static std::string severityPrefix(Severity severity) { switch (severity) { case Severity::kINTERNAL_ERROR: return "[F] "; case Severity::kERROR: return "[E] "; case Severity::kWARNING: return "[W] "; case Severity::kINFO: return "[I] "; case Severity::kVERBOSE: return "[V] "; default: assert(0); return ""; } } bool mShouldLog; Severity mSeverity; }; //! \class Logger //! //! \brief Class which manages logging of TensorRT tools and samples //! //! \details This class provides a common interface for TensorRT tools and samples to log information to the console, //! and supports logging two types of messages: //! //! - Debugging messages with an associated severity (info, warning, error, or internal error/fatal) //! - Test pass/fail messages //! //! The advantage of having all samples use this class for logging as opposed to emitting directly to stdout/stderr is //! that the logic for controlling the verbosity and formatting of sample output is centralized in one location. //! //! In the future, this class could be extended to support dumping test results to a file in some standard format //! 
(for example, JUnit XML), and providing additional metadata (e.g. timing the duration of a test run). //! //! TODO: For backwards compatibility with existing samples, this class inherits directly from the nvinfer1::ILogger //! interface, which is problematic since there isn't a clean separation between messages coming from the TensorRT //! library and messages coming from the sample. //! //! In the future (once all samples are updated to use Logger::getTRTLogger() to access the ILogger) we can refactor the //! class to eliminate the inheritance and instead make the nvinfer1::ILogger implementation a member of the Logger //! object. class Logger : public nvinfer1::ILogger { public: Logger(Severity severity = Severity::kWARNING) : mReportableSeverity(severity) {} //! //! \enum TestResult //! \brief Represents the state of a given test //! enum class TestResult { kRUNNING, //!< The test is running kPASSED, //!< The test passed kFAILED, //!< The test failed kWAIVED //!< The test was waived }; //! //! \brief Forward-compatible method for retrieving the nvinfer::ILogger associated with this Logger //! \return The nvinfer1::ILogger associated with this Logger //! //! TODO Once all samples are updated to use this method to register the logger with TensorRT, //! we can eliminate the inheritance of Logger from ILogger //! nvinfer1::ILogger& getTRTLogger() { return *this; } //! //! \brief Implementation of the nvinfer1::ILogger::log() virtual method //! //! Note samples should not be calling this function directly; it will eventually go away once we eliminate the //! inheritance from nvinfer1::ILogger //! void log(Severity severity, const char* msg) override { LogStreamConsumer(mReportableSeverity, severity) << "[TRT] " << std::string(msg) << std::endl; } //! //! \brief Method for controlling the verbosity of logging output //! //! \param severity The logger will only emit messages that have severity of this level or higher. //! 
void setReportableSeverity(Severity severity) { mReportableSeverity = severity; } //! //! \brief Opaque handle that holds logging information for a particular test //! //! This object is an opaque handle to information used by the Logger to print test results. //! The sample must call Logger::defineTest() in order to obtain a TestAtom that can be used //! with Logger::reportTest{Start,End}(). //! class TestAtom { public: TestAtom(TestAtom&&) = default; private: friend class Logger; TestAtom(bool started, const std::string& name, const std::string& cmdline) : mStarted(started), mName(name), mCmdline(cmdline) {} bool mStarted; std::string mName; std::string mCmdline; }; //! //! \brief Define a test for logging //! //! \param[in] name The name of the test. This should be a string starting with //! "TensorRT" and containing dot-separated strings containing //! the characters [A-Za-z0-9_]. //! For example, "TensorRT.sample_googlenet" //! \param[in] cmdline The command line used to reproduce the test // //! \return a TestAtom that can be used in Logger::reportTest{Start,End}(). //! static TestAtom defineTest(const std::string& name, const std::string& cmdline) { return TestAtom(false, name, cmdline); } //! //! \brief A convenience overloaded version of defineTest() that accepts an array of command-line arguments //! as input //! //! \param[in] name The name of the test //! \param[in] argc The number of command-line arguments //! \param[in] argv The array of command-line arguments (given as C strings) //! //! \return a TestAtom that can be used in Logger::reportTest{Start,End}(). static TestAtom defineTest(const std::string& name, int argc, char const* const* argv) { auto cmdline = genCmdlineString(argc, argv); return defineTest(name, cmdline); } //! //! \brief Report that a test has started. //! //! \pre reportTestStart() has not been called yet for the given testAtom //! //! \param[in] testAtom The handle to the test that has started //! 
static void reportTestStart(TestAtom& testAtom) { reportTestResult(testAtom, TestResult::kRUNNING); assert(!testAtom.mStarted); testAtom.mStarted = true; } //! //! \brief Report that a test has ended. //! //! \pre reportTestStart() has been called for the given testAtom //! //! \param[in] testAtom The handle to the test that has ended //! \param[in] result The result of the test. Should be one of TestResult::kPASSED, //! TestResult::kFAILED, TestResult::kWAIVED //! static void reportTestEnd(const TestAtom& testAtom, TestResult result) { assert(result != TestResult::kRUNNING); assert(testAtom.mStarted); reportTestResult(testAtom, result); } static int reportPass(const TestAtom& testAtom) { reportTestEnd(testAtom, TestResult::kPASSED); return EXIT_SUCCESS; } static int reportFail(const TestAtom& testAtom) { reportTestEnd(testAtom, TestResult::kFAILED); return EXIT_FAILURE; } static int reportWaive(const TestAtom& testAtom) { reportTestEnd(testAtom, TestResult::kWAIVED); return EXIT_SUCCESS; } static int reportTest(const TestAtom& testAtom, bool pass) { return pass ? reportPass(testAtom) : reportFail(testAtom); } Severity getReportableSeverity() const { return mReportableSeverity; } private: //! //! \brief returns an appropriate string for prefixing a log message with the given severity //! static const char* severityPrefix(Severity severity) { switch (severity) { case Severity::kINTERNAL_ERROR: return "[F] "; case Severity::kERROR: return "[E] "; case Severity::kWARNING: return "[W] "; case Severity::kINFO: return "[I] "; case Severity::kVERBOSE: return "[V] "; default: assert(0); return ""; } } //! //! \brief returns an appropriate string for prefixing a test result message with the given result //! 
static const char* testResultString(TestResult result) { switch (result) { case TestResult::kRUNNING: return "RUNNING"; case TestResult::kPASSED: return "PASSED"; case TestResult::kFAILED: return "FAILED"; case TestResult::kWAIVED: return "WAIVED"; default: assert(0); return ""; } } //! //! \brief returns an appropriate output stream (cout or cerr) to use with the given severity //! static std::ostream& severityOstream(Severity severity) { return severity >= Severity::kINFO ? std::cout : std::cerr; } //! //! \brief method that implements logging test results //! static void reportTestResult(const TestAtom& testAtom, TestResult result) { severityOstream(Severity::kINFO) << "&&&& " << testResultString(result) << " " << testAtom.mName << " # " << testAtom.mCmdline << std::endl; } //! //! \brief generate a command line string from the given (argc, argv) values //! static std::string genCmdlineString(int argc, char const* const* argv) { std::stringstream ss; for (int i = 0; i < argc; i++) { if (i > 0) ss << " "; ss << argv[i]; } return ss.str(); } Severity mReportableSeverity; }; namespace { //! //! \brief produces a LogStreamConsumer object that can be used to log messages of severity kVERBOSE //! //! Example usage: //! //! LOG_VERBOSE(logger) << "hello world" << std::endl; //! inline LogStreamConsumer LOG_VERBOSE(const Logger& logger) { return LogStreamConsumer(logger.getReportableSeverity(), Severity::kVERBOSE); } //! //! \brief produces a LogStreamConsumer object that can be used to log messages of severity kINFO //! //! Example usage: //! //! LOG_INFO(logger) << "hello world" << std::endl; //! inline LogStreamConsumer LOG_INFO(const Logger& logger) { return LogStreamConsumer(logger.getReportableSeverity(), Severity::kINFO); } //! //! \brief produces a LogStreamConsumer object that can be used to log messages of severity kWARNING //! //! Example usage: //! //! LOG_WARN(logger) << "hello world" << std::endl; //! 
inline LogStreamConsumer LOG_WARN(const Logger& logger) { return LogStreamConsumer(logger.getReportableSeverity(), Severity::kWARNING); } //! //! \brief produces a LogStreamConsumer object that can be used to log messages of severity kERROR //! //! Example usage: //! //! LOG_ERROR(logger) << "hello world" << std::endl; //! inline LogStreamConsumer LOG_ERROR(const Logger& logger) { return LogStreamConsumer(logger.getReportableSeverity(), Severity::kERROR); } //! //! \brief produces a LogStreamConsumer object that can be used to log messages of severity kINTERNAL_ERROR // ("fatal" severity) //! //! Example usage: //! //! LOG_FATAL(logger) << "hello world" << std::endl; //! inline LogStreamConsumer LOG_FATAL(const Logger& logger) { return LogStreamConsumer(logger.getReportableSeverity(), Severity::kINTERNAL_ERROR); } } // anonymous namespace #endif // TENSORRT_LOGGING_H ================================================ FILE: mobilenet/mobilenetv2/mobilenet_v2.cpp ================================================ #include #include #include #include #include #include #include #include "NvInfer.h" #include "cuda_runtime_api.h" #include "logging.h" #define CHECK(status) \ do { \ auto ret = (status); \ if (ret != 0) { \ std::cerr << "Cuda failure: " << ret << std::endl; \ abort(); \ } \ } while (0) // stuff we know about the network and the input/output blobs static const int INPUT_H = 224; static const int INPUT_W = 224; static const int OUTPUT_SIZE = 1000; const char* INPUT_BLOB_NAME = "data"; const char* OUTPUT_BLOB_NAME = "prob"; using namespace nvinfer1; static Logger gLogger; // Load weights from files shared with TensorRT samples. 
// TensorRT weight files have a simple space delimited format: // [type] [size] std::map loadWeights(const std::string file) { std::cout << "Loading weights: " << file << std::endl; std::map weightMap; // Open weights file std::ifstream input(file); assert(input.is_open() && "Unable to load weight file."); // Read number of weight blobs int32_t count; input >> count; assert(count > 0 && "Invalid weight map file."); while (count--) { Weights wt{DataType::kFLOAT, nullptr, 0}; uint32_t size; // Read name and type of blob std::string name; input >> name >> std::dec >> size; wt.type = DataType::kFLOAT; // Load blob uint32_t* val = reinterpret_cast(malloc(sizeof(val) * size)); for (uint32_t x = 0, y = size; x < y; ++x) { input >> std::hex >> val[x]; } wt.values = val; wt.count = size; weightMap[name] = wt; } return weightMap; } IScaleLayer* addBatchNorm2d(INetworkDefinition* network, std::map& weightMap, ITensor& input, std::string lname, float eps) { float* gamma = (float*)weightMap[lname + ".weight"].values; float* beta = (float*)weightMap[lname + ".bias"].values; float* mean = (float*)weightMap[lname + ".running_mean"].values; float* var = (float*)weightMap[lname + ".running_var"].values; int len = weightMap[lname + ".running_var"].count; std::cout << "len " << len << std::endl; float* scval = reinterpret_cast(malloc(sizeof(float) * len)); for (int i = 0; i < len; i++) { scval[i] = gamma[i] / sqrt(var[i] + eps); } Weights scale{DataType::kFLOAT, scval, len}; float* shval = reinterpret_cast(malloc(sizeof(float) * len)); for (int i = 0; i < len; i++) { shval[i] = beta[i] - mean[i] * gamma[i] / sqrt(var[i] + eps); } Weights shift{DataType::kFLOAT, shval, len}; float* pval = reinterpret_cast(malloc(sizeof(float) * len)); for (int i = 0; i < len; i++) { pval[i] = 1.0; } Weights power{DataType::kFLOAT, pval, len}; weightMap[lname + ".scale"] = scale; weightMap[lname + ".shift"] = shift; weightMap[lname + ".power"] = power; IScaleLayer* scale_1 = network->addScale(input, 
ScaleMode::kCHANNEL, shift, scale, power); assert(scale_1); return scale_1; } IElementWiseLayer* convBnRelu(INetworkDefinition* network, std::map& weightMap, ITensor& input, int outch, int ksize, int s, int g, std::string lname) { Weights emptywts{DataType::kFLOAT, nullptr, 0}; int p = (ksize - 1) / 2; IConvolutionLayer* conv1 = network->addConvolutionNd(input, outch, DimsHW{ksize, ksize}, weightMap[lname + "0.weight"], emptywts); assert(conv1); conv1->setStrideNd(DimsHW{s, s}); conv1->setPaddingNd(DimsHW{p, p}); conv1->setNbGroups(g); IScaleLayer* bn1 = addBatchNorm2d(network, weightMap, *conv1->getOutput(0), lname + "1", 1e-5); IActivationLayer* relu1 = network->addActivation(*bn1->getOutput(0), ActivationType::kRELU); assert(relu1); float* shval = reinterpret_cast(malloc(sizeof(float) * 1)); float* scval = reinterpret_cast(malloc(sizeof(float) * 1)); float* pval = reinterpret_cast(malloc(sizeof(float) * 1)); shval[0] = -6.0; scval[0] = 1.0; pval[0] = 1.0; Weights shift{DataType::kFLOAT, shval, 1}; Weights scale{DataType::kFLOAT, scval, 1}; Weights power{DataType::kFLOAT, pval, 1}; weightMap[lname + "cbr.scale"] = scale; weightMap[lname + "cbr.shift"] = shift; weightMap[lname + "cbr.power"] = power; IScaleLayer* scale1 = network->addScale(*bn1->getOutput(0), ScaleMode::kUNIFORM, shift, scale, power); assert(scale1); IActivationLayer* relu2 = network->addActivation(*scale1->getOutput(0), ActivationType::kRELU); assert(relu2); IElementWiseLayer* ew1 = network->addElementWise(*relu1->getOutput(0), *relu2->getOutput(0), ElementWiseOperation::kSUB); assert(ew1); return ew1; } ILayer* invertedRes(INetworkDefinition* network, std::map& weightMap, ITensor& input, std::string lname, int inch, int outch, int s, int exp) { Weights emptywts{DataType::kFLOAT, nullptr, 0}; int hidden = inch * exp; bool use_res_connect = (s == 1 && inch == outch); IScaleLayer* bn1 = nullptr; if (exp != 1) { IElementWiseLayer* ew1 = convBnRelu(network, weightMap, input, hidden, 1, 1, 1, lname + 
"conv.0."); IElementWiseLayer* ew2 = convBnRelu(network, weightMap, *ew1->getOutput(0), hidden, 3, s, hidden, lname + "conv.1."); IConvolutionLayer* conv1 = network->addConvolutionNd(*ew2->getOutput(0), outch, DimsHW{1, 1}, weightMap[lname + "conv.2.weight"], emptywts); assert(conv1); bn1 = addBatchNorm2d(network, weightMap, *conv1->getOutput(0), lname + "conv.3", 1e-5); } else { IElementWiseLayer* ew1 = convBnRelu(network, weightMap, input, hidden, 3, s, hidden, lname + "conv.0."); IConvolutionLayer* conv1 = network->addConvolutionNd(*ew1->getOutput(0), outch, DimsHW{1, 1}, weightMap[lname + "conv.1.weight"], emptywts); assert(conv1); bn1 = addBatchNorm2d(network, weightMap, *conv1->getOutput(0), lname + "conv.2", 1e-5); } if (!use_res_connect) return bn1; IElementWiseLayer* ew3 = network->addElementWise(input, *bn1->getOutput(0), ElementWiseOperation::kSUM); assert(ew3); return ew3; } // Creat the engine using only the API and not any parser. ICudaEngine* createEngine(unsigned int maxBatchSize, IBuilder* builder, IBuilderConfig* config, DataType dt) { INetworkDefinition* network = builder->createNetworkV2(0U); // Create input tensor of shape { 3, INPUT_H, INPUT_W } with name INPUT_BLOB_NAME ITensor* data = network->addInput(INPUT_BLOB_NAME, dt, Dims3{3, INPUT_H, INPUT_W}); assert(data); std::map weightMap = loadWeights("../mobilenet.wts"); Weights emptywts{DataType::kFLOAT, nullptr, 0}; auto ew1 = convBnRelu(network, weightMap, *data, 32, 3, 2, 1, "features.0."); ILayer* ir1 = invertedRes(network, weightMap, *ew1->getOutput(0), "features.1.", 32, 16, 1, 1); ir1 = invertedRes(network, weightMap, *ir1->getOutput(0), "features.2.", 16, 24, 2, 6); ir1 = invertedRes(network, weightMap, *ir1->getOutput(0), "features.3.", 24, 24, 1, 6); ir1 = invertedRes(network, weightMap, *ir1->getOutput(0), "features.4.", 24, 32, 2, 6); ir1 = invertedRes(network, weightMap, *ir1->getOutput(0), "features.5.", 32, 32, 1, 6); ir1 = invertedRes(network, weightMap, *ir1->getOutput(0), 
"features.6.", 32, 32, 1, 6); ir1 = invertedRes(network, weightMap, *ir1->getOutput(0), "features.7.", 32, 64, 2, 6); ir1 = invertedRes(network, weightMap, *ir1->getOutput(0), "features.8.", 64, 64, 1, 6); ir1 = invertedRes(network, weightMap, *ir1->getOutput(0), "features.9.", 64, 64, 1, 6); ir1 = invertedRes(network, weightMap, *ir1->getOutput(0), "features.10.", 64, 64, 1, 6); ir1 = invertedRes(network, weightMap, *ir1->getOutput(0), "features.11.", 64, 96, 1, 6); ir1 = invertedRes(network, weightMap, *ir1->getOutput(0), "features.12.", 96, 96, 1, 6); ir1 = invertedRes(network, weightMap, *ir1->getOutput(0), "features.13.", 96, 96, 1, 6); ir1 = invertedRes(network, weightMap, *ir1->getOutput(0), "features.14.", 96, 160, 2, 6); ir1 = invertedRes(network, weightMap, *ir1->getOutput(0), "features.15.", 160, 160, 1, 6); ir1 = invertedRes(network, weightMap, *ir1->getOutput(0), "features.16.", 160, 160, 1, 6); ir1 = invertedRes(network, weightMap, *ir1->getOutput(0), "features.17.", 160, 320, 1, 6); IElementWiseLayer* ew2 = convBnRelu(network, weightMap, *ir1->getOutput(0), 1280, 1, 1, 1, "features.18."); IPoolingLayer* pool1 = network->addPoolingNd(*ew2->getOutput(0), PoolingType::kAVERAGE, DimsHW{7, 7}); assert(pool1); IFullyConnectedLayer* fc1 = network->addFullyConnected(*pool1->getOutput(0), 1000, weightMap["classifier.1.weight"], weightMap["classifier.1.bias"]); assert(fc1); fc1->getOutput(0)->setName(OUTPUT_BLOB_NAME); std::cout << "set name out" << std::endl; network->markOutput(*fc1->getOutput(0)); // Build engine builder->setMaxBatchSize(maxBatchSize); config->setMaxWorkspaceSize(1 << 20); ICudaEngine* engine = builder->buildEngineWithConfig(*network, *config); std::cout << "build out" << std::endl; // Don't need the network any more network->destroy(); // Release host memory for (auto& mem : weightMap) { free((void*)(mem.second.values)); } return engine; } void APIToModel(unsigned int maxBatchSize, IHostMemory** modelStream) { // Create builder IBuilder* 
builder = createInferBuilder(gLogger); IBuilderConfig* config = builder->createBuilderConfig(); // Create model to populate the network, then set the outputs and create an engine ICudaEngine* engine = createEngine(maxBatchSize, builder, config, DataType::kFLOAT); assert(engine != nullptr); // Serialize the engine (*modelStream) = engine->serialize(); // Close everything down engine->destroy(); config->destroy(); builder->destroy(); } void doInference(IExecutionContext& context, float* input, float* output, int batchSize) { const ICudaEngine& engine = context.getEngine(); // Pointers to input and output device buffers to pass to engine. // Engine requires exactly IEngine::getNbBindings() number of buffers. assert(engine.getNbBindings() == 2); void* buffers[2]; // In order to bind the buffers, we need to know the names of the input and output tensors. // Note that indices are guaranteed to be less than IEngine::getNbBindings() const int inputIndex = engine.getBindingIndex(INPUT_BLOB_NAME); const int outputIndex = engine.getBindingIndex(OUTPUT_BLOB_NAME); // Create GPU buffers on device CHECK(cudaMalloc(&buffers[inputIndex], batchSize * 3 * INPUT_H * INPUT_W * sizeof(float))); CHECK(cudaMalloc(&buffers[outputIndex], batchSize * OUTPUT_SIZE * sizeof(float))); // Create stream cudaStream_t stream; CHECK(cudaStreamCreate(&stream)); // DMA input batch data to device, infer on the batch asynchronously, and DMA output back to host CHECK(cudaMemcpyAsync(buffers[inputIndex], input, batchSize * 3 * INPUT_H * INPUT_W * sizeof(float), cudaMemcpyHostToDevice, stream)); context.enqueue(batchSize, buffers, stream, nullptr); CHECK(cudaMemcpyAsync(output, buffers[outputIndex], batchSize * OUTPUT_SIZE * sizeof(float), cudaMemcpyDeviceToHost, stream)); cudaStreamSynchronize(stream); // Release stream and buffers cudaStreamDestroy(stream); CHECK(cudaFree(buffers[inputIndex])); CHECK(cudaFree(buffers[outputIndex])); } int main(int argc, char** argv) { if (argc != 2) { std::cerr << 
"arguments not right!" << std::endl; std::cerr << "./mobilenet -s // serialize model to plan file" << std::endl; std::cerr << "./mobilenet -d // deserialize plan file and run inference" << std::endl; return -1; } // create a model using the API directly and serialize it to a stream char* trtModelStream{nullptr}; size_t size{0}; if (std::string(argv[1]) == "-s") { IHostMemory* modelStream{nullptr}; APIToModel(1, &modelStream); assert(modelStream != nullptr); std::ofstream p("mobilenet.engine", std::ios::binary); if (!p) { std::cerr << "could not open plan output file" << std::endl; return -1; } p.write(reinterpret_cast(modelStream->data()), modelStream->size()); modelStream->destroy(); return 1; } else if (std::string(argv[1]) == "-d") { std::ifstream file("mobilenet.engine", std::ios::binary); if (file.good()) { file.seekg(0, file.end); size = file.tellg(); file.seekg(0, file.beg); trtModelStream = new char[size]; assert(trtModelStream); file.read(trtModelStream, size); file.close(); } } else { return -1; } // Subtract mean from image static float data[3 * INPUT_H * INPUT_W]; for (int i = 0; i < 3 * INPUT_H * INPUT_W; i++) data[i] = 1.0; IRuntime* runtime = createInferRuntime(gLogger); assert(runtime != nullptr); ICudaEngine* engine = runtime->deserializeCudaEngine(trtModelStream, size, nullptr); assert(engine != nullptr); IExecutionContext* context = engine->createExecutionContext(); assert(context != nullptr); delete[] trtModelStream; // Run inference static float prob[OUTPUT_SIZE]; for (int i = 0; i < 100; i++) { auto start = std::chrono::system_clock::now(); doInference(*context, data, prob, 1); auto end = std::chrono::system_clock::now(); std::cout << std::chrono::duration_cast(end - start).count() << "ms" << std::endl; } // Destroy the engine context->destroy(); engine->destroy(); runtime->destroy(); // Print histogram of the output distribution std::cout << "\nOutput:\n\n"; for (unsigned int i = 0; i < OUTPUT_SIZE; i++) { std::cout << prob[i] << ", "; if (i 
def load_weights(file):
    """Load a tensorrtx ``.wts`` text file into a name -> weights mapping.

    The first line holds the number of blobs. Every following line has the
    form ``<name> <count> <hex> <hex> ...`` where each hex token is decoded
    as one big-endian float32 (8 hex digits).

    Args:
        file: Path of the ``.wts`` file.

    Returns:
        dict mapping each blob name to a 1-D native-endian ``np.float32``
        array holding ``<count>`` values.

    Raises:
        FileNotFoundError: if ``file`` does not exist.
        ValueError: if the file is empty, or the header count, a blob's value
            count or a hex token does not match the layout described above.
    """
    print(f"Loading weights: {file}")
    if not os.path.exists(file):
        raise FileNotFoundError(f"Unable to load weight file: {file}")

    with open(file, "r") as f:
        lines = [line.strip() for line in f]
    if not lines:
        raise ValueError(f"Weight file is empty: {file}")

    count = int(lines[0])
    if count != len(lines) - 1:
        raise ValueError(
            f"Weight file declares {count} blobs but contains {len(lines) - 1} lines")

    weight_map = {}
    for line in lines[1:]:
        name, declared, *tokens = line.split(" ")
        cur_count = int(declared)
        if cur_count != len(tokens):
            raise ValueError(
                f"Blob '{name}' declares {cur_count} values but has {len(tokens)}")
        if any(len(token) != 8 for token in tokens):
            raise ValueError(f"Blob '{name}' contains a token that is not 8 hex digits")
        # Decode the whole blob in one pass instead of one struct.unpack call
        # per value; astype() converts to native byte order and gives the
        # array its own writable memory.
        raw = bytes.fromhex("".join(tokens))
        weight_map[name] = np.frombuffer(raw, dtype=">f4").astype(np.float32)

    return weight_map


def add_batch_norm_2d(network, weight_map, input, layer_name, eps):
    """Add an inference-time batch norm as a per-channel scale layer.

    Folds the stored statistics into ``y = x * scale + shift`` with
    ``scale = gamma / sqrt(var + eps)`` and
    ``shift = beta - mean * gamma / sqrt(var + eps)``.

    Args:
        network: Network definition the layer is added to.
        weight_map: Mapping returned by ``load_weights``.
        input: Input tensor of the layer.
        layer_name: Key prefix; ``.weight``, ``.bias``, ``.running_mean`` and
            ``.running_var`` are looked up under it.
        eps: Value added to the variance before the square root.

    Returns:
        The object returned by ``network.add_scale``.
    """
    gamma = weight_map[layer_name + ".weight"]
    beta = weight_map[layer_name + ".bias"]
    mean = weight_map[layer_name + ".running_mean"]
    std = np.sqrt(weight_map[layer_name + ".running_var"] + eps)

    scale = gamma / std
    shift = -mean / std * gamma + beta
    return network.add_scale(input=input,
                             mode=trt.ScaleMode.CHANNEL,
                             shift=shift,
                             scale=scale)


def conv_bn_relu(network, weight_map, input, outch, ksize, s, g, lname):
    """Add convolution -> batch norm -> ReLU6.

    ReLU6 is built from existing layers as ``relu(x) - relu(x - 6)``.

    Args:
        network: Network definition the layers are added to.
        weight_map: Mapping returned by ``load_weights``.
        input: Input tensor.
        outch: Number of output feature maps.
        ksize: Square kernel size; padding is ``(ksize - 1) // 2``.
        s: Stride, used for both spatial dimensions.
        g: Number of convolution groups.
        lname: Key prefix; ``0.weight`` is the convolution kernel and ``1``
            the batch-norm entry.

    Returns:
        The elementwise SUB layer that yields the ReLU6 result.
    """
    p = (ksize - 1) // 2
    conv1 = network.add_convolution(input=input,
                                    num_output_maps=outch,
                                    kernel_shape=(ksize, ksize),
                                    kernel=weight_map[lname + "0.weight"],
                                    bias=trt.Weights())
    assert conv1
    conv1.stride = (s, s)
    conv1.padding = (p, p)
    conv1.num_groups = g

    bn1 = add_batch_norm_2d(network, weight_map, conv1.get_output(0), lname + "1", EPS)
    assert bn1

    # relu(x)
    relu1 = network.add_activation(bn1.get_output(0), type=trt.ActivationType.RELU)
    assert relu1

    # relu(x - 6): a uniform scale layer with shift -6 feeds the second ReLU.
    shift = np.array(-6.0, dtype=np.float32)
    scale = np.array(1.0, dtype=np.float32)
    power = np.array(1.0, dtype=np.float32)
    scale1 = network.add_scale(input=bn1.get_output(0),
                               mode=trt.ScaleMode.UNIFORM,
                               shift=shift,
                               scale=scale,
                               power=power)
    assert scale1
    relu2 = network.add_activation(scale1.get_output(0), type=trt.ActivationType.RELU)
    assert relu2

    ew1 = network.add_elementwise(relu1.get_output(0),
                                  relu2.get_output(0),
                                  trt.ElementWiseOperation.SUB)
    assert ew1
    return ew1


def inverted_res(network, weight_map, input, lname, inch, outch, s, exp):
    """Add one inverted-residual block.

    With ``exp != 1`` the block is 1x1 expansion -> 3x3 depthwise -> 1x1
    projection -> batch norm; with ``exp == 1`` the expansion is skipped.
    The input is added back when ``s == 1`` and ``inch == outch``.

    Args:
        network: Network definition the layers are added to.
        weight_map: Mapping returned by ``load_weights``.
        input: Input tensor.
        lname: Key prefix of the block (for example ``"features.3."``).
        inch: Number of input channels.
        outch: Number of output channels.
        s: Stride of the depthwise convolution.
        exp: Expansion factor; the hidden width is ``inch * exp``.

    Returns:
        The last layer of the block: the elementwise SUM layer when the
        residual connection is used, otherwise the final batch-norm layer.
    """
    hidden = inch * exp
    use_res_connect = (s == 1 and inch == outch)

    if exp != 1:
        ew1 = conv_bn_relu(network, weight_map, input, hidden, 1, 1, 1, lname + "conv.0.")
        ew2 = conv_bn_relu(network, weight_map, ew1.get_output(0), hidden, 3, s, hidden,
                           lname + "conv.1.")
        conv1 = network.add_convolution(input=ew2.get_output(0),
                                        num_output_maps=outch,
                                        kernel_shape=(1, 1),
                                        kernel=weight_map[lname + "conv.2.weight"],
                                        bias=trt.Weights())
        assert conv1
        bn1 = add_batch_norm_2d(network, weight_map, conv1.get_output(0),
                                lname + "conv.3", EPS)
    else:
        ew1 = conv_bn_relu(network, weight_map, input, hidden, 3, s, hidden,
                           lname + "conv.0.")
        conv1 = network.add_convolution(input=ew1.get_output(0),
                                        num_output_maps=outch,
                                        kernel_shape=(1, 1),
                                        kernel=weight_map[lname + "conv.1.weight"],
                                        bias=trt.Weights())
        assert conv1
        bn1 = add_batch_norm_2d(network, weight_map, conv1.get_output(0),
                                lname + "conv.2", EPS)

    if not use_res_connect:
        return bn1

    ew3 = network.add_elementwise(input, bn1.get_output(0), trt.ElementWiseOperation.SUM)
    assert ew3
    return ew3


def create_engine(max_batch_size, builder, config, dt):
    """Build the MobileNetV2 network layer by layer and compile it.

    Args:
        max_batch_size: Value assigned to ``builder.max_batch_size``.
        builder: TensorRT builder used to create the network and the engine.
        config: Builder configuration passed to ``builder.build_engine``.
        dt: Data type of the network input.

    Returns:
        The object returned by ``builder.build_engine``.
    """
    weight_map = load_weights(WEIGHT_PATH)
    network = builder.create_network()

    data = network.add_input(INPUT_BLOB_NAME, dt, (3, INPUT_H, INPUT_W))
    assert data

    layer = conv_bn_relu(network, weight_map, data, 32, 3, 2, 1, "features.0.")

    # (weight prefix, input channels, output channels, stride, expansion)
    block_specs = (
        ("features.1.", 32, 16, 1, 1),
        ("features.2.", 16, 24, 2, 6),
        ("features.3.", 24, 24, 1, 6),
        ("features.4.", 24, 32, 2, 6),
        ("features.5.", 32, 32, 1, 6),
        ("features.6.", 32, 32, 1, 6),
        ("features.7.", 32, 64, 2, 6),
        ("features.8.", 64, 64, 1, 6),
        ("features.9.", 64, 64, 1, 6),
        ("features.10.", 64, 64, 1, 6),
        ("features.11.", 64, 96, 1, 6),
        ("features.12.", 96, 96, 1, 6),
        ("features.13.", 96, 96, 1, 6),
        ("features.14.", 96, 160, 2, 6),
        ("features.15.", 160, 160, 1, 6),
        ("features.16.", 160, 160, 1, 6),
        ("features.17.", 160, 320, 1, 6),
    )
    for lname, inch, outch, stride, exp in block_specs:
        layer = inverted_res(network, weight_map, layer.get_output(0),
                             lname, inch, outch, stride, exp)

    ew2 = conv_bn_relu(network, weight_map, layer.get_output(0), 1280, 1, 1, 1,
                       "features.18.")

    pool1 = network.add_pooling(input=ew2.get_output(0),
                                type=trt.PoolingType.AVERAGE,
                                window_size=trt.DimsHW(7, 7))
    assert pool1

    fc1 = network.add_fully_connected(input=pool1.get_output(0),
                                      num_outputs=OUTPUT_SIZE,
                                      kernel=weight_map["classifier.1.weight"],
                                      bias=weight_map["classifier.1.bias"])
    assert fc1

    fc1.get_output(0).name = OUTPUT_BLOB_NAME
    network.mark_output(fc1.get_output(0))

    # Build Engine
    builder.max_batch_size = max_batch_size
    # NOTE(review): the workspace limit is set on the builder while the build
    # itself goes through `config`; confirm the TensorRT version in use
    # honours this setting for build_engine(network, config).
    builder.max_workspace_size = 1 << 32
    engine = builder.build_engine(network, config)

    del network
    del weight_map

    return engine


def API_to_model(max_batch_size):
    """Build the engine and write its serialized form to ``ENGINE_PATH``.

    Args:
        max_batch_size: Forwarded to ``create_engine``.
    """
    builder = trt.Builder(TRT_LOGGER)
    config = builder.create_builder_config()
    engine = create_engine(max_batch_size, builder, config, trt.float32)
    assert engine
    with open(ENGINE_PATH, "wb") as f:
        f.write(engine.serialize())

    del engine
    del builder
    del config


class HostDeviceMem(object):
    """Pair of a host buffer and the device allocation that mirrors it."""

    def __init__(self, host_mem, device_mem):
        self.host = host_mem      # host-side buffer
        self.device = device_mem  # device-side allocation

    def __str__(self):
        return "Host:\n" + str(self.host) + "\nDevice:\n" + str(self.device)

    def __repr__(self):
        return self.__str__()


def allocate_buffers(engine):
    """Allocate a host/device buffer pair for every binding of ``engine``.

    Each buffer holds ``volume(binding shape) * engine.max_batch_size``
    elements of the binding's dtype.

    Args:
        engine: Deserialized engine whose bindings are iterated.

    Returns:
        Tuple ``(inputs, outputs, bindings, stream)``: lists of
        ``HostDeviceMem`` for input and output bindings, the list of device
        addresses as ints in binding order, and a new CUDA stream.
    """
    inputs = []
    outputs = []
    bindings = []
    stream = cuda.Stream()
    for binding in engine:
        size = trt.volume(engine.get_binding_shape(binding)) * engine.max_batch_size
        dtype = trt.nptype(engine.get_binding_dtype(binding))
        # Allocate host and device buffers
        host_mem = cuda.pagelocked_empty(size, dtype)
        device_mem = cuda.mem_alloc(host_mem.nbytes)
        # Append the device buffer to device bindings.
        bindings.append(int(device_mem))
        # Append to the appropriate list.
        if engine.binding_is_input(binding):
            inputs.append(HostDeviceMem(host_mem, device_mem))
        else:
            outputs.append(HostDeviceMem(host_mem, device_mem))
    return inputs, outputs, bindings, stream


def do_inference(context, bindings, inputs, outputs, stream, batch_size=1):
    """Copy inputs to the device, run the engine and copy outputs back.

    Args:
        context: Execution context used to launch inference.
        bindings: Device addresses in binding order.
        inputs: ``HostDeviceMem`` objects for the input bindings.
        outputs: ``HostDeviceMem`` objects for the output bindings.
        stream: CUDA stream all copies and the execution are queued on.
        batch_size: Batch size passed to ``execute_async``.

    Returns:
        List with the host buffer of every output, valid after the stream
        has been synchronized.
    """
    # Transfer input data to the GPU.
    for inp in inputs:
        cuda.memcpy_htod_async(inp.device, inp.host, stream)
    # Run inference.
    context.execute_async(batch_size=batch_size, bindings=bindings,
                          stream_handle=stream.handle)
    # Transfer predictions back from the GPU.
    for out in outputs:
        cuda.memcpy_dtoh_async(out.host, out.device, stream)
    # Synchronize the stream
    stream.synchronize()
    # Return only the host outputs.
    return [out.host for out in outputs]


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("-s", action='store_true')
    parser.add_argument("-d", action='store_true')
    args = parser.parse_args()

    # Exactly one of -s / -d must be given.
    if not (args.s ^ args.d):
        print(
            "arguments not right!\n"
            "python mobilenet_v2.py -s   # serialize model to plan file\n"
            "python mobilenet_v2.py -d   # deserialize plan file and run inference"
        )
        # A usage error must not look like success to the calling shell.
        sys.exit(1)

    if args.s:
        API_to_model(BATCH_SIZE)
    else:
        runtime = trt.Runtime(TRT_LOGGER)
        assert runtime

        with open(ENGINE_PATH, "rb") as f:
            engine = runtime.deserialize_cuda_engine(f.read())
        assert engine

        context = engine.create_execution_context()
        assert context

        data = np.ones((BATCH_SIZE * 3 * INPUT_H * INPUT_W), dtype=np.float32)
        inputs, outputs, bindings, stream = allocate_buffers(engine)
        inputs[0].host = data

        trt_outputs = do_inference(context, bindings=bindings, inputs=inputs,
                                   outputs=outputs, stream=stream)

        print(f'Output: \n{trt_outputs[0][:10]}\n{trt_outputs[0][-10:]}')
================================================ # mobilenet v3 MobileNetV3 architecture from ["Searching for MobileNetV3"](https://arxiv.org/abs/1905.02244). For the PyTorch implementation, you can refer to [mobilenetv3.pytorch](https://github.com/chufei1995/mobilenetv3.pytorch) ## Run 1. generate mbv3_small.wts/mbv3_large.wts from the PyTorch implementation 2. put mbv3_small.wts/mbv3_large.wts into tensorrtx/mobilenet/mobilenetv3 3. build and run ``` cd tensorrtx/mobilenet/mobilenetv3 mkdir build cd build cmake .. make sudo ./mobilenetv3 -s small(or large) // serialize model to plan file i.e. 'mobilenetv3_small.engine' sudo ./mobilenetv3 -d small(or large) // deserialize plan file and run inference ``` 4. check that the output matches the output of the PyTorch implementation ### TensorRT Python API ``` # 1. generate mobilenetv3.wts from [mobilenetv3.pytorch](https://github.com/chufei1995/mobilenetv3.pytorch) # 2. put mobilenetv3.wts into tensorrtx/mobilenet/mobilenetv3 # 3. install Python dependencies (tensorrt/pycuda/numpy) cd tensorrtx/mobilenet/mobilenetv3 python mobilenet_v3.py -s small(or large) // serialize model to plan file i.e. 'mobilenetv3.engine' python mobilenet_v3.py -d small(or large) // deserialize plan file and run inference ``` ================================================ FILE: mobilenet/mobilenetv3/logging.h ================================================ /* * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License.
*/ #ifndef TENSORRT_LOGGING_H #define TENSORRT_LOGGING_H #include #include #include #include #include #include #include #include "NvInferRuntimeCommon.h" using Severity = nvinfer1::ILogger::Severity; class LogStreamConsumerBuffer : public std::stringbuf { public: LogStreamConsumerBuffer(std::ostream& stream, const std::string& prefix, bool shouldLog) : mOutput(stream), mPrefix(prefix), mShouldLog(shouldLog) {} LogStreamConsumerBuffer(LogStreamConsumerBuffer&& other) : mOutput(other.mOutput) {} ~LogStreamConsumerBuffer() { // std::streambuf::pbase() gives a pointer to the beginning of the buffered part of the output sequence // std::streambuf::pptr() gives a pointer to the current position of the output sequence // if the pointer to the beginning is not equal to the pointer to the current position, // call putOutput() to log the output to the stream if (pbase() != pptr()) { putOutput(); } } // synchronizes the stream buffer and returns 0 on success // synchronizing the stream buffer consists of inserting the buffer contents into the stream, // resetting the buffer and flushing the stream virtual int sync() { putOutput(); return 0; } void putOutput() { if (mShouldLog) { // prepend timestamp std::time_t timestamp = std::time(nullptr); tm* tm_local = std::localtime(×tamp); std::cout << "["; std::cout << std::setw(2) << std::setfill('0') << 1 + tm_local->tm_mon << "/"; std::cout << std::setw(2) << std::setfill('0') << tm_local->tm_mday << "/"; std::cout << std::setw(4) << std::setfill('0') << 1900 + tm_local->tm_year << "-"; std::cout << std::setw(2) << std::setfill('0') << tm_local->tm_hour << ":"; std::cout << std::setw(2) << std::setfill('0') << tm_local->tm_min << ":"; std::cout << std::setw(2) << std::setfill('0') << tm_local->tm_sec << "] "; // std::stringbuf::str() gets the string contents of the buffer // insert the buffer contents pre-appended by the appropriate prefix into the stream mOutput << mPrefix << str(); // set the buffer to empty str(""); // flush the 
stream mOutput.flush(); } } void setShouldLog(bool shouldLog) { mShouldLog = shouldLog; } private: std::ostream& mOutput; std::string mPrefix; bool mShouldLog; }; //! //! \class LogStreamConsumerBase //! \brief Convenience object used to initialize LogStreamConsumerBuffer before std::ostream in LogStreamConsumer //! class LogStreamConsumerBase { public: LogStreamConsumerBase(std::ostream& stream, const std::string& prefix, bool shouldLog) : mBuffer(stream, prefix, shouldLog) {} protected: LogStreamConsumerBuffer mBuffer; }; //! //! \class LogStreamConsumer //! \brief Convenience object used to facilitate use of C++ stream syntax when logging messages. //! Order of base classes is LogStreamConsumerBase and then std::ostream. //! This is because the LogStreamConsumerBase class is used to initialize the LogStreamConsumerBuffer member field //! in LogStreamConsumer and then the address of the buffer is passed to std::ostream. //! This is necessary to prevent the address of an uninitialized buffer from being passed to std::ostream. //! Please do not change the order of the parent classes. //! class LogStreamConsumer : protected LogStreamConsumerBase, public std::ostream { public: //! \brief Creates a LogStreamConsumer which logs messages with level severity. //! Reportable severity determines if the messages are severe enough to be logged. 
LogStreamConsumer(Severity reportableSeverity, Severity severity) : LogStreamConsumerBase(severityOstream(severity), severityPrefix(severity), severity <= reportableSeverity), std::ostream(&mBuffer) // links the stream buffer with the stream , mShouldLog(severity <= reportableSeverity), mSeverity(severity) {} LogStreamConsumer(LogStreamConsumer&& other) : LogStreamConsumerBase(severityOstream(other.mSeverity), severityPrefix(other.mSeverity), other.mShouldLog), std::ostream(&mBuffer) // links the stream buffer with the stream , mShouldLog(other.mShouldLog), mSeverity(other.mSeverity) {} void setReportableSeverity(Severity reportableSeverity) { mShouldLog = mSeverity <= reportableSeverity; mBuffer.setShouldLog(mShouldLog); } private: static std::ostream& severityOstream(Severity severity) { return severity >= Severity::kINFO ? std::cout : std::cerr; } static std::string severityPrefix(Severity severity) { switch (severity) { case Severity::kINTERNAL_ERROR: return "[F] "; case Severity::kERROR: return "[E] "; case Severity::kWARNING: return "[W] "; case Severity::kINFO: return "[I] "; case Severity::kVERBOSE: return "[V] "; default: assert(0); return ""; } } bool mShouldLog; Severity mSeverity; }; //! \class Logger //! //! \brief Class which manages logging of TensorRT tools and samples //! //! \details This class provides a common interface for TensorRT tools and samples to log information to the console, //! and supports logging two types of messages: //! //! - Debugging messages with an associated severity (info, warning, error, or internal error/fatal) //! - Test pass/fail messages //! //! The advantage of having all samples use this class for logging as opposed to emitting directly to stdout/stderr is //! that the logic for controlling the verbosity and formatting of sample output is centralized in one location. //! //! In the future, this class could be extended to support dumping test results to a file in some standard format //! 
(for example, JUnit XML), and providing additional metadata (e.g. timing the duration of a test run). //! //! TODO: For backwards compatibility with existing samples, this class inherits directly from the nvinfer1::ILogger //! interface, which is problematic since there isn't a clean separation between messages coming from the TensorRT //! library and messages coming from the sample. //! //! In the future (once all samples are updated to use Logger::getTRTLogger() to access the ILogger) we can refactor the //! class to eliminate the inheritance and instead make the nvinfer1::ILogger implementation a member of the Logger //! object. class Logger : public nvinfer1::ILogger { public: Logger(Severity severity = Severity::kWARNING) : mReportableSeverity(severity) {} //! //! \enum TestResult //! \brief Represents the state of a given test //! enum class TestResult { kRUNNING, //!< The test is running kPASSED, //!< The test passed kFAILED, //!< The test failed kWAIVED //!< The test was waived }; //! //! \brief Forward-compatible method for retrieving the nvinfer::ILogger associated with this Logger //! \return The nvinfer1::ILogger associated with this Logger //! //! TODO Once all samples are updated to use this method to register the logger with TensorRT, //! we can eliminate the inheritance of Logger from ILogger //! nvinfer1::ILogger& getTRTLogger() { return *this; } //! //! \brief Implementation of the nvinfer1::ILogger::log() virtual method //! //! Note samples should not be calling this function directly; it will eventually go away once we eliminate the //! inheritance from nvinfer1::ILogger //! void log(Severity severity, const char* msg) override { LogStreamConsumer(mReportableSeverity, severity) << "[TRT] " << std::string(msg) << std::endl; } //! //! \brief Method for controlling the verbosity of logging output //! //! \param severity The logger will only emit messages that have severity of this level or higher. //! 
void setReportableSeverity(Severity severity) { mReportableSeverity = severity; } //! //! \brief Opaque handle that holds logging information for a particular test //! //! This object is an opaque handle to information used by the Logger to print test results. //! The sample must call Logger::defineTest() in order to obtain a TestAtom that can be used //! with Logger::reportTest{Start,End}(). //! class TestAtom { public: TestAtom(TestAtom&&) = default; private: friend class Logger; TestAtom(bool started, const std::string& name, const std::string& cmdline) : mStarted(started), mName(name), mCmdline(cmdline) {} bool mStarted; std::string mName; std::string mCmdline; }; //! //! \brief Define a test for logging //! //! \param[in] name The name of the test. This should be a string starting with //! "TensorRT" and containing dot-separated strings containing //! the characters [A-Za-z0-9_]. //! For example, "TensorRT.sample_googlenet" //! \param[in] cmdline The command line used to reproduce the test // //! \return a TestAtom that can be used in Logger::reportTest{Start,End}(). //! static TestAtom defineTest(const std::string& name, const std::string& cmdline) { return TestAtom(false, name, cmdline); } //! //! \brief A convenience overloaded version of defineTest() that accepts an array of command-line arguments //! as input //! //! \param[in] name The name of the test //! \param[in] argc The number of command-line arguments //! \param[in] argv The array of command-line arguments (given as C strings) //! //! \return a TestAtom that can be used in Logger::reportTest{Start,End}(). static TestAtom defineTest(const std::string& name, int argc, char const* const* argv) { auto cmdline = genCmdlineString(argc, argv); return defineTest(name, cmdline); } //! //! \brief Report that a test has started. //! //! \pre reportTestStart() has not been called yet for the given testAtom //! //! \param[in] testAtom The handle to the test that has started //! 
static void reportTestStart(TestAtom& testAtom) { reportTestResult(testAtom, TestResult::kRUNNING); assert(!testAtom.mStarted); testAtom.mStarted = true; } //! //! \brief Report that a test has ended. //! //! \pre reportTestStart() has been called for the given testAtom //! //! \param[in] testAtom The handle to the test that has ended //! \param[in] result The result of the test. Should be one of TestResult::kPASSED, //! TestResult::kFAILED, TestResult::kWAIVED //! static void reportTestEnd(const TestAtom& testAtom, TestResult result) { assert(result != TestResult::kRUNNING); assert(testAtom.mStarted); reportTestResult(testAtom, result); } static int reportPass(const TestAtom& testAtom) { reportTestEnd(testAtom, TestResult::kPASSED); return EXIT_SUCCESS; } static int reportFail(const TestAtom& testAtom) { reportTestEnd(testAtom, TestResult::kFAILED); return EXIT_FAILURE; } static int reportWaive(const TestAtom& testAtom) { reportTestEnd(testAtom, TestResult::kWAIVED); return EXIT_SUCCESS; } static int reportTest(const TestAtom& testAtom, bool pass) { return pass ? reportPass(testAtom) : reportFail(testAtom); } Severity getReportableSeverity() const { return mReportableSeverity; } private: //! //! \brief returns an appropriate string for prefixing a log message with the given severity //! static const char* severityPrefix(Severity severity) { switch (severity) { case Severity::kINTERNAL_ERROR: return "[F] "; case Severity::kERROR: return "[E] "; case Severity::kWARNING: return "[W] "; case Severity::kINFO: return "[I] "; case Severity::kVERBOSE: return "[V] "; default: assert(0); return ""; } } //! //! \brief returns an appropriate string for prefixing a test result message with the given result //! 
static const char* testResultString(TestResult result) { switch (result) { case TestResult::kRUNNING: return "RUNNING"; case TestResult::kPASSED: return "PASSED"; case TestResult::kFAILED: return "FAILED"; case TestResult::kWAIVED: return "WAIVED"; default: assert(0); return ""; } } //! //! \brief returns an appropriate output stream (cout or cerr) to use with the given severity //! static std::ostream& severityOstream(Severity severity) { return severity >= Severity::kINFO ? std::cout : std::cerr; } //! //! \brief method that implements logging test results //! static void reportTestResult(const TestAtom& testAtom, TestResult result) { severityOstream(Severity::kINFO) << "&&&& " << testResultString(result) << " " << testAtom.mName << " # " << testAtom.mCmdline << std::endl; } //! //! \brief generate a command line string from the given (argc, argv) values //! static std::string genCmdlineString(int argc, char const* const* argv) { std::stringstream ss; for (int i = 0; i < argc; i++) { if (i > 0) ss << " "; ss << argv[i]; } return ss.str(); } Severity mReportableSeverity; }; namespace { //! //! \brief produces a LogStreamConsumer object that can be used to log messages of severity kVERBOSE //! //! Example usage: //! //! LOG_VERBOSE(logger) << "hello world" << std::endl; //! inline LogStreamConsumer LOG_VERBOSE(const Logger& logger) { return LogStreamConsumer(logger.getReportableSeverity(), Severity::kVERBOSE); } //! //! \brief produces a LogStreamConsumer object that can be used to log messages of severity kINFO //! //! Example usage: //! //! LOG_INFO(logger) << "hello world" << std::endl; //! inline LogStreamConsumer LOG_INFO(const Logger& logger) { return LogStreamConsumer(logger.getReportableSeverity(), Severity::kINFO); } //! //! \brief produces a LogStreamConsumer object that can be used to log messages of severity kWARNING //! //! Example usage: //! //! LOG_WARN(logger) << "hello world" << std::endl; //! 
inline LogStreamConsumer LOG_WARN(const Logger& logger) { return LogStreamConsumer(logger.getReportableSeverity(), Severity::kWARNING); } //! //! \brief produces a LogStreamConsumer object that can be used to log messages of severity kERROR //! //! Example usage: //! //! LOG_ERROR(logger) << "hello world" << std::endl; //! inline LogStreamConsumer LOG_ERROR(const Logger& logger) { return LogStreamConsumer(logger.getReportableSeverity(), Severity::kERROR); } //! //! \brief produces a LogStreamConsumer object that can be used to log messages of severity kINTERNAL_ERROR // ("fatal" severity) //! //! Example usage: //! //! LOG_FATAL(logger) << "hello world" << std::endl; //! inline LogStreamConsumer LOG_FATAL(const Logger& logger) { return LogStreamConsumer(logger.getReportableSeverity(), Severity::kINTERNAL_ERROR); } } // anonymous namespace #endif // TENSORRT_LOGGING_H ================================================ FILE: mobilenet/mobilenetv3/mobilenet_v3.cpp ================================================ #include #include #include #include #include #include #include #include "NvInfer.h" #include "cuda_runtime_api.h" #include "logging.h" #define CHECK(status) \ do { \ auto ret = (status); \ if (ret != 0) { \ std::cerr << "Cuda failure: " << ret << std::endl; \ abort(); \ } \ } while (0) // stuff we know about the network and the input/output blobs static const int INPUT_H = 224; static const int INPUT_W = 224; static const int OUTPUT_SIZE = 1000; static const int BS = 1; const char* INPUT_BLOB_NAME = "data"; const char* OUTPUT_BLOB_NAME = "prob"; using namespace nvinfer1; static Logger gLogger; // Load weights from files shared with TensorRT samples. 
// TensorRT weight files have a simple space delimited format: // [type] [size] std::map loadWeights(const std::string file) { std::cout << "Loading weights: " << file << std::endl; std::map weightMap; // Open weights file std::ifstream input(file); assert(input.is_open() && "Unable to load weight file."); // Read number of weight blobs int32_t count; input >> count; assert(count > 0 && "Invalid weight map file."); while (count--) { Weights wt{DataType::kFLOAT, nullptr, 0}; uint32_t size; // Read name and type of blob std::string name; input >> name >> std::dec >> size; wt.type = DataType::kFLOAT; // Load blob uint32_t* val = reinterpret_cast(malloc(sizeof(val) * size)); for (uint32_t x = 0, y = size; x < y; ++x) { input >> std::hex >> val[x]; } wt.values = val; wt.count = size; weightMap[name] = wt; } return weightMap; } IScaleLayer* addBatchNorm(INetworkDefinition* network, std::map& weightMap, ITensor& input, std::string lname, float eps) { float* gamma = (float*)weightMap[lname + ".weight"].values; float* beta = (float*)weightMap[lname + ".bias"].values; float* mean = (float*)weightMap[lname + ".running_mean"].values; float* var = (float*)weightMap[lname + ".running_var"].values; int len = weightMap[lname + ".running_var"].count; std::cout << "len " << len << std::endl; float* scval = reinterpret_cast(malloc(sizeof(float) * len)); for (int i = 0; i < len; i++) { scval[i] = gamma[i] / sqrt(var[i] + eps); } Weights scale{DataType::kFLOAT, scval, len}; float* shval = reinterpret_cast(malloc(sizeof(float) * len)); for (int i = 0; i < len; i++) { shval[i] = beta[i] - mean[i] * gamma[i] / sqrt(var[i] + eps); } Weights shift{DataType::kFLOAT, shval, len}; float* pval = reinterpret_cast(malloc(sizeof(float) * len)); for (int i = 0; i < len; i++) { pval[i] = 1.0; } Weights power{DataType::kFLOAT, pval, len}; weightMap[lname + ".scale"] = scale; weightMap[lname + ".shift"] = shift; weightMap[lname + ".power"] = power; IScaleLayer* scale_1 = network->addScale(input, 
ScaleMode::kCHANNEL, shift, scale, power); assert(scale_1); return scale_1; } ILayer* hSwish(INetworkDefinition* network, ITensor& input, std::string name) { auto hsig = network->addActivation(input, ActivationType::kHARD_SIGMOID); assert(hsig); hsig->setAlpha(1.0 / 6.0); hsig->setBeta(0.5); ILayer* hsw = network->addElementWise(input, *hsig->getOutput(0), ElementWiseOperation::kPROD); assert(hsw); return hsw; } ILayer* convBnHswish(INetworkDefinition* network, std::map& weightMap, ITensor& input, int outch, int ksize, int s, int g, std::string lname) { Weights emptywts{DataType::kFLOAT, nullptr, 0}; int p = (ksize - 1) / 2; IConvolutionLayer* conv1 = network->addConvolutionNd(input, outch, DimsHW{ksize, ksize}, weightMap[lname + "0.weight"], emptywts); assert(conv1); conv1->setStrideNd(DimsHW{s, s}); conv1->setPaddingNd(DimsHW{p, p}); conv1->setNbGroups(g); IScaleLayer* bn1 = addBatchNorm(network, weightMap, *conv1->getOutput(0), lname + "1", 1e-5); ILayer* hsw = hSwish(network, *bn1->getOutput(0), lname + "2"); assert(hsw); return hsw; } ILayer* seLayer(INetworkDefinition* network, std::map& weightMap, ITensor& input, int c, int w, std::string lname) { int h = w; IPoolingLayer* l1 = network->addPoolingNd(input, PoolingType::kAVERAGE, DimsHW(w, h)); assert(l1); l1->setStrideNd(DimsHW{w, h}); IFullyConnectedLayer* l2 = network->addFullyConnected( *l1->getOutput(0), BS * c / 4, weightMap[lname + "fc.0.weight"], weightMap[lname + "fc.0.bias"]); IActivationLayer* relu1 = network->addActivation(*l2->getOutput(0), ActivationType::kRELU); IFullyConnectedLayer* l4 = network->addFullyConnected( *relu1->getOutput(0), BS * c, weightMap[lname + "fc.2.weight"], weightMap[lname + "fc.2.bias"]); auto hsig = network->addActivation(*l4->getOutput(0), ActivationType::kHARD_SIGMOID); assert(hsig); hsig->setAlpha(1.0 / 6.0); hsig->setBeta(0.5); ILayer* se = network->addElementWise(input, *hsig->getOutput(0), ElementWiseOperation::kPROD); assert(se); return se; } ILayer* 
convSeq1(INetworkDefinition* network, std::map& weightMap, ITensor& input, int output, int hdim, int k, int s, bool use_se, bool use_hs, int w, std::string lname) { Weights emptywts{DataType::kFLOAT, nullptr, 0}; int p = (k - 1) / 2; IConvolutionLayer* conv1 = network->addConvolutionNd(input, hdim, DimsHW{k, k}, weightMap[lname + "0.weight"], emptywts); conv1->setStrideNd(DimsHW{s, s}); conv1->setPaddingNd(DimsHW{p, p}); conv1->setNbGroups(hdim); IScaleLayer* bn1 = addBatchNorm(network, weightMap, *conv1->getOutput(0), lname + "1", 1e-5); ITensor *tensor3, *tensor4; tensor3 = nullptr; tensor4 = nullptr; if (use_hs) { ILayer* hsw = hSwish(network, *bn1->getOutput(0), lname + "2"); tensor3 = hsw->getOutput(0); } else { IActivationLayer* relu1 = network->addActivation(*bn1->getOutput(0), ActivationType::kRELU); tensor3 = relu1->getOutput(0); } if (use_se) { ILayer* se1 = seLayer(network, weightMap, *tensor3, hdim, w, lname + "3."); tensor4 = se1->getOutput(0); } else { tensor4 = tensor3; } IConvolutionLayer* conv2 = network->addConvolutionNd(*tensor4, output, DimsHW{1, 1}, weightMap[lname + "4.weight"], emptywts); IScaleLayer* bn2 = addBatchNorm(network, weightMap, *conv2->getOutput(0), lname + "5", 1e-5); assert(bn2); return bn2; } ILayer* convSeq2(INetworkDefinition* network, std::map& weightMap, ITensor& input, int output, int hdim, int k, int s, bool use_se, bool use_hs, int w, std::string lname) { Weights emptywts{DataType::kFLOAT, nullptr, 0}; int p = (k - 1) / 2; IConvolutionLayer* conv1 = network->addConvolutionNd(input, hdim, DimsHW{1, 1}, weightMap[lname + "0.weight"], emptywts); IScaleLayer* bn1 = addBatchNorm(network, weightMap, *conv1->getOutput(0), lname + "1", 1e-5); ITensor *tensor3, *tensor6, *tensor7; tensor3 = nullptr; tensor6 = nullptr; tensor7 = nullptr; if (use_hs) { ILayer* hsw1 = hSwish(network, *bn1->getOutput(0), lname + "2"); tensor3 = hsw1->getOutput(0); } else { IActivationLayer* relu1 = network->addActivation(*bn1->getOutput(0), 
ActivationType::kRELU); tensor3 = relu1->getOutput(0); } IConvolutionLayer* conv2 = network->addConvolutionNd(*tensor3, hdim, DimsHW{k, k}, weightMap[lname + "3.weight"], emptywts); conv2->setStrideNd(DimsHW{s, s}); conv2->setPaddingNd(DimsHW{p, p}); conv2->setNbGroups(hdim); IScaleLayer* bn2 = addBatchNorm(network, weightMap, *conv2->getOutput(0), lname + "4", 1e-5); if (use_se) { ILayer* se1 = seLayer(network, weightMap, *bn2->getOutput(0), hdim, w, lname + "5."); tensor6 = se1->getOutput(0); } else { tensor6 = bn2->getOutput(0); } if (use_hs) { ILayer* hsw2 = hSwish(network, *tensor6, lname + "6"); tensor7 = hsw2->getOutput(0); } else { IActivationLayer* relu2 = network->addActivation(*tensor6, ActivationType::kRELU); tensor7 = relu2->getOutput(0); } IConvolutionLayer* conv3 = network->addConvolutionNd(*tensor7, output, DimsHW{1, 1}, weightMap[lname + "7.weight"], emptywts); IScaleLayer* bn3 = addBatchNorm(network, weightMap, *conv3->getOutput(0), lname + "8", 1e-5); assert(bn3); return bn3; } ILayer* invertedRes(INetworkDefinition* network, std::map& weightMap, ITensor& input, std::string lname, int inch, int outch, int s, int hidden, int k, bool use_se, bool use_hs, int w) { bool use_res_connect = (s == 1 && inch == outch); ILayer* conv = nullptr; if (inch == hidden) { conv = convSeq1(network, weightMap, input, outch, hidden, k, s, use_se, use_hs, w, lname + "conv."); } else { conv = convSeq2(network, weightMap, input, outch, hidden, k, s, use_se, use_hs, w, lname + "conv."); } if (!use_res_connect) return conv; IElementWiseLayer* ew3 = network->addElementWise(input, *conv->getOutput(0), ElementWiseOperation::kSUM); assert(ew3); return ew3; } // Creat the engine using only the API and not any parser. 
ICudaEngine* createEngineSmall(unsigned int maxBatchSize, IBuilder* builder, IBuilderConfig* config, DataType dt) { INetworkDefinition* network = builder->createNetworkV2(0U); ITensor* data = network->addInput(INPUT_BLOB_NAME, dt, Dims3{3, INPUT_H, INPUT_W}); assert(data); std::map weightMap = loadWeights("../mbv3_small.wts"); Weights emptywts{DataType::kFLOAT, nullptr, 0}; //auto test1 = network->addActivation(*data, ActivationType::kRELU); auto ew1 = convBnHswish(network, weightMap, *data, 16, 3, 2, 1, "features.0."); auto ir1 = invertedRes(network, weightMap, *ew1->getOutput(0), "features.1.", 16, 16, 2, 16, 3, 1, 0, 56); auto ir2 = invertedRes(network, weightMap, *ir1->getOutput(0), "features.2.", 16, 24, 2, 72, 3, 0, 0, 28); auto ir3 = invertedRes(network, weightMap, *ir2->getOutput(0), "features.3.", 24, 24, 1, 88, 3, 0, 0, 28); auto ir4 = invertedRes(network, weightMap, *ir3->getOutput(0), "features.4.", 24, 40, 2, 96, 5, 1, 1, 14); auto ir5 = invertedRes(network, weightMap, *ir4->getOutput(0), "features.5.", 40, 40, 1, 240, 5, 1, 1, 14); auto ir6 = invertedRes(network, weightMap, *ir5->getOutput(0), "features.6.", 40, 40, 1, 240, 5, 1, 1, 14); auto ir7 = invertedRes(network, weightMap, *ir6->getOutput(0), "features.7.", 40, 48, 1, 120, 5, 1, 1, 14); auto ir8 = invertedRes(network, weightMap, *ir7->getOutput(0), "features.8.", 48, 48, 1, 144, 5, 1, 1, 14); auto ir9 = invertedRes(network, weightMap, *ir8->getOutput(0), "features.9.", 48, 96, 2, 288, 5, 1, 1, 7); auto ir10 = invertedRes(network, weightMap, *ir9->getOutput(0), "features.10.", 96, 96, 1, 576, 5, 1, 1, 7); auto ir11 = invertedRes(network, weightMap, *ir10->getOutput(0), "features.11.", 96, 96, 1, 576, 5, 1, 1, 7); ILayer* ew2 = convBnHswish(network, weightMap, *ir11->getOutput(0), 576, 1, 1, 1, "conv.0."); ILayer* se1 = seLayer(network, weightMap, *ew2->getOutput(0), 576, 7, "conv.1."); IPoolingLayer* pool1 = network->addPoolingNd(*se1->getOutput(0), PoolingType::kAVERAGE, DimsHW{7, 7}); 
assert(pool1); pool1->setStrideNd(DimsHW{7, 7}); ILayer* sw1 = hSwish(network, *pool1->getOutput(0), "hSwish.0"); IFullyConnectedLayer* fc1 = network->addFullyConnected(*sw1->getOutput(0), 1280, weightMap["classifier.0.weight"], weightMap["classifier.0.bias"]); assert(fc1); ILayer* bn1 = addBatchNorm(network, weightMap, *fc1->getOutput(0), "classifier.1", 1e-5); ILayer* sw2 = hSwish(network, *bn1->getOutput(0), "hSwish.1"); IFullyConnectedLayer* fc2 = network->addFullyConnected(*sw2->getOutput(0), 1000, weightMap["classifier.3.weight"], weightMap["classifier.3.bias"]); ILayer* bn2 = addBatchNorm(network, weightMap, *fc2->getOutput(0), "classifier.4", 1e-5); ILayer* sw3 = hSwish(network, *bn2->getOutput(0), "hSwish.2"); sw3->getOutput(0)->setName(OUTPUT_BLOB_NAME); std::cout << "set name out" << std::endl; network->markOutput(*sw3->getOutput(0)); // Build engine builder->setMaxBatchSize(maxBatchSize); config->setMaxWorkspaceSize(1 << 20); ICudaEngine* engine = builder->buildEngineWithConfig(*network, *config); std::cout << "build out" << std::endl; // Don't need the network any more network->destroy(); // Release host memory for (auto& mem : weightMap) { free((void*)(mem.second.values)); } return engine; } ICudaEngine* createEngineLarge(unsigned int maxBatchSize, IBuilder* builder, IBuilderConfig* config, DataType dt) { INetworkDefinition* network = builder->createNetworkV2(0U); ITensor* data = network->addInput(INPUT_BLOB_NAME, dt, Dims3{3, INPUT_H, INPUT_W}); assert(data); std::map weightMap = loadWeights("../mbv3_large.wts"); Weights emptywts{DataType::kFLOAT, nullptr, 0}; //auto test1 = network->addActivation(*data, ActivationType::kRELU); auto ew1 = convBnHswish(network, weightMap, *data, 16, 3, 2, 1, "features.0."); auto ir1 = invertedRes(network, weightMap, *ew1->getOutput(0), "features.1.", 16, 16, 1, 16, 3, 0, 0, 112); auto ir2 = invertedRes(network, weightMap, *ir1->getOutput(0), "features.2.", 16, 24, 2, 64, 3, 0, 0, 56); auto ir3 = invertedRes(network, 
weightMap, *ir2->getOutput(0), "features.3.", 24, 24, 1, 72, 3, 0, 0, 56); auto ir4 = invertedRes(network, weightMap, *ir3->getOutput(0), "features.4.", 24, 40, 2, 72, 5, 1, 0, 28); auto ir5 = invertedRes(network, weightMap, *ir4->getOutput(0), "features.5.", 40, 40, 1, 120, 5, 1, 0, 28); auto ir6 = invertedRes(network, weightMap, *ir5->getOutput(0), "features.6.", 40, 40, 1, 120, 5, 1, 0, 28); auto ir7 = invertedRes(network, weightMap, *ir6->getOutput(0), "features.7.", 40, 80, 2, 240, 3, 0, 1, 14); auto ir8 = invertedRes(network, weightMap, *ir7->getOutput(0), "features.8.", 80, 80, 1, 200, 3, 0, 1, 14); auto ir9 = invertedRes(network, weightMap, *ir8->getOutput(0), "features.9.", 80, 80, 1, 184, 3, 0, 1, 14); auto ir10 = invertedRes(network, weightMap, *ir9->getOutput(0), "features.10.", 80, 80, 1, 184, 3, 0, 1, 14); auto ir11 = invertedRes(network, weightMap, *ir10->getOutput(0), "features.11.", 80, 112, 1, 480, 3, 1, 1, 14); auto ir12 = invertedRes(network, weightMap, *ir11->getOutput(0), "features.12.", 112, 112, 1, 672, 3, 1, 1, 14); auto ir13 = invertedRes(network, weightMap, *ir12->getOutput(0), "features.13.", 112, 160, 1, 672, 5, 1, 1, 14); auto ir14 = invertedRes(network, weightMap, *ir13->getOutput(0), "features.14.", 160, 160, 2, 672, 5, 1, 1, 7); auto ir15 = invertedRes(network, weightMap, *ir14->getOutput(0), "features.15.", 160, 160, 1, 960, 5, 1, 1, 7); ILayer* ew2 = convBnHswish(network, weightMap, *ir15->getOutput(0), 960, 1, 1, 1, "conv.0."); IPoolingLayer* pool1 = network->addPoolingNd(*ew2->getOutput(0), PoolingType::kAVERAGE, DimsHW{7, 7}); assert(pool1); pool1->setStrideNd(DimsHW{7, 7}); ILayer* sw1 = hSwish(network, *pool1->getOutput(0), "hSwish.0"); IFullyConnectedLayer* fc1 = network->addFullyConnected(*sw1->getOutput(0), 1280, weightMap["classifier.0.weight"], weightMap["classifier.0.bias"]); assert(fc1); ILayer* sw2 = hSwish(network, *fc1->getOutput(0), "hSwish.1"); IFullyConnectedLayer* fc2 = 
network->addFullyConnected(*sw2->getOutput(0), 1000, weightMap["classifier.3.weight"], weightMap["classifier.3.bias"]); fc2->getOutput(0)->setName(OUTPUT_BLOB_NAME); std::cout << "set name out" << std::endl; network->markOutput(*fc2->getOutput(0)); // Build engine builder->setMaxBatchSize(maxBatchSize); config->setMaxWorkspaceSize(1 << 20); ICudaEngine* engine = builder->buildEngineWithConfig(*network, *config); std::cout << "build out" << std::endl; // Don't need the network any more network->destroy(); // Release host memory for (auto& mem : weightMap) { free((void*)(mem.second.values)); } return engine; } void APIToModel(unsigned int maxBatchSize, IHostMemory** modelStream, std::string mode) { // Create builder IBuilder* builder = createInferBuilder(gLogger); IBuilderConfig* config = builder->createBuilderConfig(); // Create model to populate the network, then set the outputs and create an engine ICudaEngine* engine; if (mode == "small") { std::cout << "create engine small" << std::endl; engine = createEngineSmall(maxBatchSize, builder, config, DataType::kFLOAT); } else if (mode == "large") { engine = createEngineLarge(maxBatchSize, builder, config, DataType::kFLOAT); } assert(engine != nullptr); // Serialize the engine (*modelStream) = engine->serialize(); // Close everything down engine->destroy(); config->destroy(); builder->destroy(); } void doInference(IExecutionContext& context, float* input, float* output, int batchSize) { const ICudaEngine& engine = context.getEngine(); // Pointers to input and output device buffers to pass to engine. // Engine requires exactly IEngine::getNbBindings() number of buffers. assert(engine.getNbBindings() == 2); void* buffers[2]; // In order to bind the buffers, we need to know the names of the input and output tensors. 
// Note that indices are guaranteed to be less than IEngine::getNbBindings() const int inputIndex = engine.getBindingIndex(INPUT_BLOB_NAME); const int outputIndex = engine.getBindingIndex(OUTPUT_BLOB_NAME); // Create GPU buffers on device CHECK(cudaMalloc(&buffers[inputIndex], batchSize * 3 * INPUT_H * INPUT_W * sizeof(float))); CHECK(cudaMalloc(&buffers[outputIndex], batchSize * OUTPUT_SIZE * sizeof(float))); // Create stream cudaStream_t stream; CHECK(cudaStreamCreate(&stream)); // DMA input batch data to device, infer on the batch asynchronously, and DMA output back to host CHECK(cudaMemcpyAsync(buffers[inputIndex], input, batchSize * 3 * INPUT_H * INPUT_W * sizeof(float), cudaMemcpyHostToDevice, stream)); context.enqueue(batchSize, buffers, stream, nullptr); CHECK(cudaMemcpyAsync(output, buffers[outputIndex], batchSize * OUTPUT_SIZE * sizeof(float), cudaMemcpyDeviceToHost, stream)); cudaStreamSynchronize(stream); // Release stream and buffers cudaStreamDestroy(stream); CHECK(cudaFree(buffers[inputIndex])); CHECK(cudaFree(buffers[outputIndex])); } int main(int argc, char** argv) { if (argc != 3) { std::cerr << "arguments not right!" 
<< std::endl; std::cerr << "./mobilenet -s small // serialize small model to plan file" << std::endl; std::cerr << "./mobilenet -s large // serialize large model to plan file" << std::endl; std::cerr << "./mobilenet -d small // deserialize small model plan file and run inference" << std::endl; std::cerr << "./mobilenet -d large // deserialize large model plan file and run inference" << std::endl; return -1; } // create a model using the API directly and serialize it to a stream char* trtModelStream{nullptr}; size_t size{0}; std::string mode = std::string(argv[2]); std::cout << mode << std::endl; if (std::string(argv[1]) == "-s") { IHostMemory* modelStream{nullptr}; APIToModel(1, &modelStream, mode); assert(modelStream != nullptr); std::ofstream p("mobilenetv3_" + mode + ".engine", std::ios::binary); if (!p) { std::cerr << "could not open plan output file" << std::endl; return -1; } p.write(reinterpret_cast(modelStream->data()), modelStream->size()); modelStream->destroy(); return 1; } else if (std::string(argv[1]) == "-d") { std::ifstream file("mobilenetv3_" + mode + ".engine", std::ios::binary); if (file.good()) { file.seekg(0, file.end); size = file.tellg(); file.seekg(0, file.beg); trtModelStream = new char[size]; assert(trtModelStream); file.read(trtModelStream, size); file.close(); } } else { return -1; } // Subtract mean from image static float data[3 * INPUT_H * INPUT_W]; for (int i = 0; i < 3 * INPUT_H * INPUT_W; i++) data[i] = 1.0; IRuntime* runtime = createInferRuntime(gLogger); assert(runtime != nullptr); ICudaEngine* engine = runtime->deserializeCudaEngine(trtModelStream, size); assert(engine != nullptr); IExecutionContext* context = engine->createExecutionContext(); assert(context != nullptr); delete[] trtModelStream; // Run inference static float prob[OUTPUT_SIZE]; for (int i = 0; i < 10; i++) { auto start = std::chrono::system_clock::now(); doInference(*context, data, prob, 1); auto end = std::chrono::system_clock::now(); std::cout << 
std::chrono::duration_cast(end - start).count() << "ms" << std::endl; } // Destroy the engine context->destroy(); engine->destroy(); runtime->destroy(); // Print histogram of the output distribution std::cout << "\nOutput:\n\n"; for (unsigned int i = 0; i < OUTPUT_SIZE; i++) { std::cout << prob[i] << ", "; //if (i % 10 == 0) std::cout << i / 10 << std::endl; } std::cout << std::endl; return 0; } ================================================ FILE: mobilenet/mobilenetv3/mobilenet_v3.py ================================================ import os import sys import struct import argparse import numpy as np import pycuda.driver as cuda import pycuda.autoinit # noqa: F401 import tensorrt as trt BATCH_SIZE = 1 INPUT_H = 224 INPUT_W = 224 OUTPUT_SIZE = 1000 BS = 1 INPUT_BLOB_NAME = "data" OUTPUT_BLOB_NAME = "prob" EPS = 1e-5 WEIGHT_PATH_SMALL = "./mobilenetv3.wts" ENGINE_PATH = "./mobilenetv3.engine" TRT_LOGGER = trt.Logger(trt.Logger.INFO) def load_weights(file): print(f"Loading weights: {file}") assert os.path.exists(file), 'Unable to load weight file.' 
weight_map = {} with open(file, "r") as f: lines = [line.strip() for line in f] count = int(lines[0]) assert count == len(lines) - 1 for i in range(1, count + 1): splits = lines[i].split(" ") name = splits[0] cur_count = int(splits[1]) assert cur_count + 2 == len(splits) values = [] for j in range(2, len(splits)): # hex string to bytes to float values.append(struct.unpack(">f", bytes.fromhex(splits[j]))) weight_map[name] = np.array(values, dtype=np.float32) return weight_map def add_batch_norm_2d(network, weight_map, input, layer_name, eps): gamma = weight_map[layer_name + ".weight"] beta = weight_map[layer_name + ".bias"] mean = weight_map[layer_name + ".running_mean"] var = weight_map[layer_name + ".running_var"] var = np.sqrt(var + eps) scale = gamma / var shift = -mean / var * gamma + beta return network.add_scale(input=input, mode=trt.ScaleMode.CHANNEL, shift=shift, scale=scale) def add_h_swish(network, input): h_sig = network.add_activation(input, type=trt.ActivationType.HARD_SIGMOID) assert h_sig h_sig.alpha = 1.0 / 6.0 h_sig.beta = 0.5 hsw = network.add_elementwise(input, h_sig.get_output(0), trt.ElementWiseOperation.PROD) assert hsw return hsw def conv_bn_h_swish(network, weight_map, input, outch, ksize, s, g, lname): p = (ksize - 1) // 2 conv1 = network.add_convolution(input=input, num_output_maps=outch, kernel_shape=(ksize, ksize), kernel=weight_map[lname + "0.weight"], bias=trt.Weights() ) assert conv1 conv1.stride = (s, s) conv1.padding = (p, p) conv1.num_groups = g bn1 = add_batch_norm_2d(network, weight_map, conv1.get_output(0), lname + "1", EPS) hsw = add_h_swish(network, bn1.get_output(0)) assert hsw return hsw def add_se_layer(network, weight_map, input, c, w, lname): h = w l1 = network.add_pooling(input=input, type=trt.PoolingType.AVERAGE, window_size=trt.DimsHW(w, h)) assert l1 l1.stride_nd = (w, h) l2 = network.add_fully_connected(input=l1.get_output(0), num_outputs=BS * c // 4, kernel=weight_map[lname + "fc.0.weight"], bias=weight_map[lname + 
"fc.0.bias"]) relu1 = network.add_activation(l2.get_output(0), type=trt.ActivationType.RELU) l4 = network.add_fully_connected(input=relu1.get_output(0), num_outputs=BS * c, kernel=weight_map[lname + "fc.2.weight"], bias=weight_map[lname + "fc.2.bias"]) se = add_h_swish(network, l4.get_output(0)) return se def conv_seq_1(network, weight_map, input, output, hdim, k, s, use_se, use_hs, w, lname): p = (k - 1) // 2 conv1 = network.add_convolution(input=input, num_output_maps=hdim, kernel_shape=(k, k), kernel=weight_map[lname + "0.weight"], bias=trt.Weights()) assert conv1 conv1.stride = (s, s) conv1.padding = (p, p) conv1.num_groups = hdim bn1 = add_batch_norm_2d(network, weight_map, conv1.get_output(0), lname + "1", EPS) if use_hs: hsw = add_h_swish(network, bn1.get_output(0)) tensor3 = hsw.get_output(0) else: relu1 = network.add_activation(bn1.get_output(0), type=trt.ActivationType.RELU) tensor3 = relu1.get_output(0) if use_se: se1 = add_se_layer(network, weight_map, tensor3, hdim, w, lname + "3.") tensor4 = se1.get_output(0) else: tensor4 = tensor3 conv2 = network.add_convolution(input=tensor4, num_output_maps=output, kernel_shape=(1, 1), kernel=weight_map[lname + "4.weight"], bias=trt.Weights()) bn2 = add_batch_norm_2d(network, weight_map, conv2.get_output(0), lname + "5", EPS) assert bn2 return bn2 def conv_seq_2(network, weight_map, input, output, hdim, k, s, use_se, use_hs, w, lname): p = (k - 1) // 2 conv1 = network.add_convolution(input=input, num_output_maps=hdim, kernel_shape=(1, 1), kernel=weight_map[lname + "0.weight"], bias=trt.Weights()) bn1 = add_batch_norm_2d(network, weight_map, conv1.get_output(0), lname + "1", EPS) if use_hs: hsw1 = add_h_swish(network, bn1.get_output(0)) tensor3 = hsw1.get_output(0) else: relu1 = network.add_activation(bn1.get_output(0), type=trt.ActivationType.RELU) tensor3 = relu1.get_output(0) conv2 = network.add_convolution(input=tensor3, num_output_maps=hdim, kernel_shape=(k, k), kernel=weight_map[lname + "3.weight"], 
bias=trt.Weights()) conv2.stride = (s, s) conv2.padding = (p, p) conv2.num_groups = hdim bn2 = add_batch_norm_2d(network, weight_map, conv2.get_output(0), lname + "4", EPS) if use_se: se1 = add_se_layer(network, weight_map, bn2.get_output(0), hdim, w, lname + "5.") tensor6 = se1.get_output(0) else: tensor6 = bn2.get_output(0) if use_hs: hsw2 = add_h_swish(network, tensor6) tensor7 = hsw2.get_output(0) else: relu2 = network.add_activation(tensor6, type=trt.ActivationType.RELU) tensor7 = relu2.get_output(0) conv3 = network.add_convolution(input=tensor7, num_output_maps=output, kernel_shape=(1, 1), kernel=weight_map[lname + "7.weight"], bias=trt.Weights()) bn3 = add_batch_norm_2d(network, weight_map, conv3.get_output(0), lname + "8", EPS) assert bn3 return bn3 def inverted_res(network, weight_map, input, lname, inch, outch, s, hidden, k, use_se, use_hs, w): use_res_connect = (s == 1 and inch == outch) if inch == hidden: conv = conv_seq_1(network, weight_map, input, outch, hidden, k, s, use_se, use_hs, w, lname + "conv.") else: conv = conv_seq_2(network, weight_map, input, outch, hidden, k, s, use_se, use_hs, w, lname + "conv.") if not use_res_connect: return conv ew3 = network.add_elementwise(input, conv.get_output(0), trt.ElementWiseOperation.SUM) assert ew3 return ew3 def create_engine_small(max_batch_size, builder, config, dt): weight_map = load_weights(WEIGHT_PATH_SMALL) network = builder.create_network() data = network.add_input(INPUT_BLOB_NAME, dt, (3, INPUT_H, INPUT_W)) assert data ew1 = conv_bn_h_swish(network, weight_map, data, 16, 3, 2, 1, "features.0.") ir1 = inverted_res(network, weight_map, ew1.get_output(0), "features.1.", 16, 16, 2, 16, 3, 1, 0, 56) ir2 = inverted_res(network, weight_map, ir1.get_output(0), "features.2.", 16, 24, 2, 72, 3, 0, 0, 28) ir3 = inverted_res(network, weight_map, ir2.get_output(0), "features.3.", 24, 24, 1, 88, 3, 0, 0, 28) ir4 = inverted_res(network, weight_map, ir3.get_output(0), "features.4.", 24, 40, 2, 96, 5, 1, 1, 14) ir5 
= inverted_res(network, weight_map, ir4.get_output(0), "features.5.", 40, 40, 1, 240, 5, 1, 1, 14) ir6 = inverted_res(network, weight_map, ir5.get_output(0), "features.6.", 40, 40, 1, 240, 5, 1, 1, 14) ir7 = inverted_res(network, weight_map, ir6.get_output(0), "features.7.", 40, 48, 1, 120, 5, 1, 1, 14) ir8 = inverted_res(network, weight_map, ir7.get_output(0), "features.8.", 48, 48, 1, 144, 5, 1, 1, 14) ir9 = inverted_res(network, weight_map, ir8.get_output(0), "features.9.", 48, 96, 2, 288, 5, 1, 1, 7) ir10 = inverted_res(network, weight_map, ir9.get_output(0), "features.10.", 96, 96, 1, 576, 5, 1, 1, 7) ir11 = inverted_res(network, weight_map, ir10.get_output(0), "features.11.", 96, 96, 1, 576, 5, 1, 1, 7) ew2 = conv_bn_h_swish(network, weight_map, ir11.get_output(0), 576, 1, 1, 1, "conv.0.") se1 = add_se_layer(network, weight_map, ew2.get_output(0), 576, 7, "conv.1.") pool1 = network.add_pooling(input=se1.get_output(0), type=trt.PoolingType.AVERAGE, window_size=trt.DimsHW(7, 7)) assert pool1 pool1.stride_nd = (7, 7) sw1 = add_h_swish(network, pool1.get_output(0)) fc1 = network.add_fully_connected(input=sw1.get_output(0), num_outputs=1280, kernel=weight_map["classifier.0.weight"], bias=weight_map["classifier.0.bias"]) assert fc1 bn1 = add_batch_norm_2d(network, weight_map, fc1.get_output(0), "classifier.1", EPS) sw2 = add_h_swish(network, bn1.get_output(0)) fc2 = network.add_fully_connected(input=sw2.get_output(0), num_outputs=OUTPUT_SIZE, kernel=weight_map["classifier.3.weight"], bias=weight_map["classifier.3.bias"]) bn2 = add_batch_norm_2d(network, weight_map, fc2.get_output(0), "classifier.4", EPS) sw3 = add_h_swish(network, bn2.get_output(0)) sw3.get_output(0).name = OUTPUT_BLOB_NAME network.mark_output(sw3.get_output(0)) # Build Engine builder.max_batch_size = max_batch_size builder.max_workspace_size = 1 << 20 engine = builder.build_engine(network, config) del network del weight_map return engine def create_engine_large(max_batch_size, builder, config, 
dt): weight_map = load_weights(WEIGHT_PATH_SMALL) network = builder.create_network() data = network.add_input(INPUT_BLOB_NAME, dt, (3, INPUT_H, INPUT_W)) assert data ew1 = conv_bn_h_swish(network, weight_map, data, 16, 3, 2, 1, "features.0.") ir1 = inverted_res(network, weight_map, ew1.get_output(0), "features.1.", 16, 16, 1, 16, 3, 0, 0, 112) ir2 = inverted_res(network, weight_map, ir1.get_output(0), "features.2.", 16, 24, 2, 64, 3, 0, 0, 56) ir3 = inverted_res(network, weight_map, ir2.get_output(0), "features.3.", 24, 24, 1, 72, 3, 0, 0, 56) ir4 = inverted_res(network, weight_map, ir3.get_output(0), "features.4.", 24, 40, 2, 72, 5, 1, 0, 28) ir5 = inverted_res(network, weight_map, ir4.get_output(0), "features.5.", 40, 40, 1, 120, 5, 1, 0, 28) ir6 = inverted_res(network, weight_map, ir5.get_output(0), "features.6.", 40, 40, 1, 120, 5, 1, 0, 28) ir7 = inverted_res(network, weight_map, ir6.get_output(0), "features.7.", 40, 80, 2, 240, 3, 0, 1, 14) ir8 = inverted_res(network, weight_map, ir7.get_output(0), "features.8.", 80, 80, 1, 200, 3, 0, 1, 14) ir9 = inverted_res(network, weight_map, ir8.get_output(0), "features.9.", 80, 80, 1, 184, 3, 0, 1, 14) ir10 = inverted_res(network, weight_map, ir9.get_output(0), "features.10.", 80, 80, 1, 184, 3, 0, 1, 14) ir11 = inverted_res(network, weight_map, ir10.get_output(0), "features.11.", 80, 112, 1, 480, 3, 1, 1, 14) ir12 = inverted_res(network, weight_map, ir11.get_output(0), "features.12.", 112, 112, 1, 672, 3, 1, 1, 14) ir13 = inverted_res(network, weight_map, ir12.get_output(0), "features.13.", 112, 160, 1, 672, 5, 1, 1, 14) ir14 = inverted_res(network, weight_map, ir13.get_output(0), "features.14.", 160, 160, 2, 672, 5, 1, 1, 7) ir15 = inverted_res(network, weight_map, ir14.get_output(0), "features.15.", 160, 160, 1, 960, 5, 1, 1, 7) ew2 = conv_bn_h_swish(network, weight_map, ir15.get_output(0), 960, 1, 1, 1, "conv.0.") pool1 = network.add_pooling(input=ew2.get_output(0), type=trt.PoolingType.AVERAGE, 
window_size=trt.DimsHW(7, 7)) assert pool1 pool1.stride_nd = (7, 7) sw1 = add_h_swish(network, pool1.get_output(0)) fc1 = network.add_fully_connected(input=sw1.get_output(0), num_outputs=1280, kernel=weight_map["classifier.0.weight"], bias=weight_map["classifier.0.bias"]) assert fc1 sw2 = add_h_swish(network, fc1.get_output(0)) fc2 = network.add_fully_connected(input=sw2.get_output(0), num_outputs=OUTPUT_SIZE, kernel=weight_map["classifier.3.weight"], bias=weight_map["classifier.3.bias"]) fc2.get_output(0).name = OUTPUT_BLOB_NAME network.mark_output(fc2.get_output(0)) # Build Engine builder.max_batch_size = max_batch_size builder.max_workspace_size = 1 << 20 engine = builder.build_engine(network, config) del network del weight_map return engine def API_to_model(max_batch_size, model_type): builder = trt.Builder(TRT_LOGGER) config = builder.create_builder_config() if model_type == "small": engine = create_engine_small(max_batch_size, builder, config, trt.float32) assert engine else: engine = create_engine_large(max_batch_size, builder, config, trt.float32) assert engine with open(ENGINE_PATH, "wb") as f: f.write(engine.serialize()) del engine del builder del config class HostDeviceMem(object): def __init__(self, host_mem, device_mem): self.host = host_mem self.device = device_mem def __str__(self): return "Host:\n" + str(self.host) + "\nDevice:\n" + str(self.device) def __repr__(self): return self.__str__() def allocate_buffers(engine): inputs = [] outputs = [] bindings = [] stream = cuda.Stream() for binding in engine: size = trt.volume(engine.get_binding_shape(binding)) * engine.max_batch_size dtype = trt.nptype(engine.get_binding_dtype(binding)) # Allocate host and device buffers host_mem = cuda.pagelocked_empty(size, dtype) device_mem = cuda.mem_alloc(host_mem.nbytes) # Append the device buffer to device bindings. bindings.append(int(device_mem)) # Append to the appropriate list. 
if engine.binding_is_input(binding): inputs.append(HostDeviceMem(host_mem, device_mem)) else: outputs.append(HostDeviceMem(host_mem, device_mem)) return inputs, outputs, bindings, stream def do_inference(context, bindings, inputs, outputs, stream, batch_size=1): # Transfer input data to the GPU. [cuda.memcpy_htod_async(inp.device, inp.host, stream) for inp in inputs] # Run inference. context.execute_async(batch_size=batch_size, bindings=bindings, stream_handle=stream.handle) # Transfer predictions back from the GPU. [cuda.memcpy_dtoh_async(out.host, out.device, stream) for out in outputs] # Synchronize the stream stream.synchronize() # Return only the host outputs. return [out.host for out in outputs] if __name__ == "__main__": parser = argparse.ArgumentParser() parser.add_argument("-s", action='store_true') parser.add_argument("-d", action='store_true') parser.add_argument("-t", help='indicate small or large model') args = parser.parse_args() if not (args.s ^ args.d): print( "arguments not right!\n" "python mobilenet_v2.py -s # serialize model to plan file\n" "python mobilenet_v2.py -d # deserialize plan file and run inference" ) sys.exit() if args.s: API_to_model(BATCH_SIZE, args.t) else: runtime = trt.Runtime(TRT_LOGGER) assert runtime with open(ENGINE_PATH, "rb") as f: engine = runtime.deserialize_cuda_engine(f.read()) assert engine context = engine.create_execution_context() assert context data = np.ones((BATCH_SIZE * 3 * INPUT_H * INPUT_W), dtype=np.float32) inputs, outputs, bindings, stream = allocate_buffers(engine) inputs[0].host = data trt_outputs = do_inference(context, bindings=bindings, inputs=inputs, outputs=outputs, stream=stream) print(f'Output: \n{trt_outputs[0][:10]}\n{trt_outputs[0][-10:]}') ================================================ FILE: psenet/CMakeLists.txt ================================================ cmake_minimum_required(VERSION 2.6) project(PSENet) add_definitions(-std=c++11) option(CUDA_USE_STATIC_CUDA_RUNTIME OFF) 
# Language standard and build type for the psenet executable.
set(CMAKE_CXX_STANDARD 11)
set(CMAKE_BUILD_TYPE Debug)

find_package(CUDA REQUIRED)

# Project-local headers.
include_directories(${PROJECT_SOURCE_DIR}/include)
# Include and link directories of CUDA and TensorRT; adapt them if yours are different.
# cuda
include_directories(/usr/local/cuda/include)
link_directories(/usr/local/cuda/lib64)
# tensorrt
include_directories(/usr/include/x86_64-linux-gnu/)
link_directories(/usr/lib/x86_64-linux-gnu/)

# NOTE(review): -D_MWAITXINTRIN_H_INCLUDED presumably keeps a compiler intrinsic
# header from being included - confirm why it is needed before removing it.
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11 -Wall -Ofast -Wfatal-errors -D_MWAITXINTRIN_H_INCLUDED")

# OpenCV is looked up without REQUIRED, so configuration continues if it is missing.
find_package(OpenCV)
include_directories(${OpenCV_INCLUDE_DIRS})

# Every header and source file in this directory is part of the executable.
file(GLOB SOURCE_FILES "*.h" "*.cpp")
add_executable(psenet ${SOURCE_FILES})
target_link_libraries(psenet nvinfer)
target_link_libraries(psenet cudart)
target_link_libraries(psenet ${OpenCV_LIBS})

add_definitions(-O2 -pthread)

================================================
FILE: psenet/README.md
================================================
# PSENet

**preprocessing + inference + postprocessing = 30ms** with fp32 on Tesla P40.

The original Tensorflow implementation is [tensorflow_PSENet](https://github.com/liuheng92/tensorflow_PSENet).

A TensorRT Python api implementation is [TensorRT-Python-PSENet](https://github.com/upczww/TensorRT-Python-PSENet).

## Key Features

- Generating `.wts` from `Tensorflow`.
- Dynamic batch and dynamic shape input.
- Object-Oriented Programming.
- Practice with C++ 11.

## How to Run * 1. generate .wts Download pretrained model from https://github.com/liuheng92/tensorflow_PSENet and put `model.ckpt.*` to `model` dir. Add a file `model/checkpoint` with content ``` model_checkpoint_path: "model.ckpt" all_model_checkpoint_paths: "model.ckpt" ``` Then run ``` python gen_tf_wts.py ``` which will gengerate a `psenet.wts`. * 2. cmake and make ``` mkdir build cd build cmake .. make ``` * 3. build engine and run detection ``` cp ../psenet.wts ./ cp ../test.jpg ./ ./psenet -s // serialize model to plan file ./psenet -d // deserialize plan file and run inference ``` ## Known Issues None ## Todo * use `ExponentialMovingAverage` weight. ================================================ FILE: psenet/gen_tf_wts.py ================================================ from sys import prefix import tensorflow as tf from tensorflow.python import pywrap_tensorflow import numpy as np import struct model_dir = "model" ckpt = tf.train.get_checkpoint_state(model_dir) ckpt_path = ckpt.model_checkpoint_path reader = pywrap_tensorflow.NewCheckpointReader(ckpt_path) param_dict = reader.get_variable_to_shape_map() f = open(r"psenet.wts", "w") keys = param_dict.keys() f.write("{}\n".format(len(keys))) for key in keys: weight = reader.get_tensor(key) print(key, weight.shape) if len(weight.shape) == 4: weight = np.transpose(weight, (3, 2, 0, 1)) print(weight.shape) weight = np.reshape(weight, -1) f.write("{} {} ".format(key, len(weight))) for w in weight: f.write(" ") f.write(struct.pack(">f", float(w)).hex()) f.write("\n") ================================================ FILE: psenet/layers.cpp ================================================ #include "layers.h" IScaleLayer* addBatchNorm2d(INetworkDefinition* network, std::map& weightMap, ITensor& input, std::string lname, float eps) { float* gamma = (float*)weightMap[lname + "gamma"].values; // scale float* beta = (float*)weightMap[lname + "beta"].values; // offset float* mean = (float*)weightMap[lname + 
"moving_mean"].values; float* var = (float*)weightMap[lname + "moving_variance"].values; int len = weightMap[lname + "moving_variance"].count; float* scval = reinterpret_cast(malloc(sizeof(float) * len)); for (auto i = 0; i < len; i++) { scval[i] = gamma[i] / sqrt(var[i] + eps); } Weights scale{ DataType::kFLOAT, scval, len }; float* shval = reinterpret_cast(malloc(sizeof(float) * len)); for (auto i = 0; i < len; i++) { shval[i] = beta[i] - mean[i] * gamma[i] / sqrt(var[i] + eps); } Weights shift{ DataType::kFLOAT, shval, len }; float* pval = reinterpret_cast(malloc(sizeof(float) * len)); for (auto i = 0; i < len; i++) { pval[i] = 1.0; } Weights power{ DataType::kFLOAT, pval, len }; IScaleLayer* scale_1 = network->addScale(input, ScaleMode::kCHANNEL, shift, scale, power); assert(scale_1); return scale_1; } IActivationLayer* bottleneck(INetworkDefinition* network, std::map& weightMap, ITensor& input, int ch, int stride, std::string lname, int branch_type) { Weights emptywts{ DataType::kFLOAT, nullptr, 0 }; IConvolutionLayer* conv1 = network->addConvolutionNd(input, ch, DimsHW{ 1, 1 }, weightMap[lname + "conv1/weights"], emptywts); assert(conv1); IScaleLayer* bn1 = addBatchNorm2d(network, weightMap, *conv1->getOutput(0), lname + "conv1/BatchNorm/", 1e-5); assert(bn1); IActivationLayer* relu1 = network->addActivation(*bn1->getOutput(0), ActivationType::kRELU); assert(relu1); IConvolutionLayer* conv2 = network->addConvolutionNd(*relu1->getOutput(0), ch, DimsHW{ 3, 3 }, weightMap[lname + "conv2/weights"], emptywts); conv2->setStrideNd(DimsHW{ stride, stride }); conv2->setPaddingNd(DimsHW{ 1, 1 }); assert(conv2); IScaleLayer* bn2 = addBatchNorm2d(network, weightMap, *conv2->getOutput(0), lname + "conv2/BatchNorm/", 1e-5); assert(bn2); IActivationLayer* relu2 = network->addActivation(*bn2->getOutput(0), ActivationType::kRELU); assert(relu2); IConvolutionLayer* conv3 = network->addConvolutionNd(*relu2->getOutput(0), ch * 4, DimsHW{ 1, 1 }, weightMap[lname + 
"conv3/weights"], emptywts); assert(conv3); IScaleLayer* bn3 = addBatchNorm2d(network, weightMap, *conv3->getOutput(0), lname + "conv3/BatchNorm/", 1e-5); assert(bn3); IElementWiseLayer* ew1; // branch_type 0:shortcut,1:conv+bn+shortcut,2:maxpool+shortcut if (branch_type == 0) { ew1 = network->addElementWise(input, *bn3->getOutput(0), ElementWiseOperation::kSUM); assert(ew1); } else if (branch_type == 1) { IConvolutionLayer* conv4 = network->addConvolutionNd(input, ch * 4, DimsHW{ 1, 1 }, weightMap[lname + "shortcut/weights"], emptywts); conv4->setStrideNd(DimsHW{ stride, stride }); assert(conv4); IScaleLayer* bn4 = addBatchNorm2d(network, weightMap, *conv4->getOutput(0), lname + "shortcut/BatchNorm/", 1e-5); assert(bn4); ew1 = network->addElementWise(*bn4->getOutput(0), *bn3->getOutput(0), ElementWiseOperation::kSUM); assert(ew1); } else { IPoolingLayer* pool = network->addPoolingNd(input, PoolingType::kMAX, DimsHW{ 1, 1 }); pool->setStrideNd(DimsHW{ 2, 2 }); assert(pool); ew1 = network->addElementWise(*pool->getOutput(0), *bn3->getOutput(0), ElementWiseOperation::kSUM); assert(ew1); } IActivationLayer* relu3 = network->addActivation(*ew1->getOutput(0), ActivationType::kRELU); assert(relu3); return relu3; } IActivationLayer* addConvRelu(INetworkDefinition* network, std::map& weightMap, ITensor& input, int outch, int kernel, int stride, std::string lname) { IConvolutionLayer* conv = network->addConvolutionNd(input, 256, DimsHW{ kernel, kernel }, weightMap[lname + "weights"], weightMap[lname + "biases"]); conv->setStrideNd(DimsHW{ stride, stride }); if (kernel == 3) { conv->setPaddingNd(DimsHW{ 1, 1 }); } assert(conv); IActivationLayer* ac = network->addActivation(*conv->getOutput(0), ActivationType::kRELU); assert(ac); return ac; } ================================================ FILE: psenet/layers.h ================================================ #ifndef TENSORRTX_LAYERS_H #define TENSORRTX_LAYERS_H #include #include #include #include "NvInfer.h" #include 
"cuda_runtime_api.h" using namespace nvinfer1; IScaleLayer *addBatchNorm2d(INetworkDefinition *network, std::map &weightMap, ITensor &input, std::string lname, float eps); IActivationLayer *bottleneck(INetworkDefinition *network, std::map &weightMap, ITensor &input, int ch, int stride, std::string lname, int branch_type); IActivationLayer *addConvRelu(INetworkDefinition *network, std::map &weightMap, ITensor &input, int outch, int kernel, int stride, std::string lname); #endif ================================================ FILE: psenet/main.cpp ================================================ #include "psenet.h" int main(int argc, char** argv) { PSENet psenet(1200, 640, 0.90, 6, 4); if (argc == 2 && std::string(argv[1]) == "-s") { std::cout << "Serializling Engine" << std::endl; psenet.serializeEngine(); return 0; } else if (argc == 2 && std::string(argv[1]) == "-d") { psenet.init(); std::vector files; for (int i = 0; i < 10; i++) files.emplace_back("test.jpg"); for (auto file : files) { std::cout << "Detect " << file << std::endl; psenet.detect(file); } return 0; } else { std::cerr << "arguments not right!" << std::endl; std::cerr << "./psenet -s // serialize model to plan file" << std::endl; std::cerr << "./psenet -d // deserialize plan file and run inference" << std::endl; return -1; } } ================================================ FILE: psenet/psenet.cpp ================================================ #include "psenet.h" #include #include #define MAX_INPUT_SIZE 1200 #define MIN_INPUT_SIZE 128 #define OPT_INPUT_W 640 #define OPT_INPUT_H 640 PSENet::PSENet(int max_side_len, int min_side_len, float threshold, int num_kernel, int stride) : max_side_len_(max_side_len), min_side_len_(min_side_len), post_threshold_(threshold), num_kernels_(num_kernel), stride_(stride) { } PSENet::~PSENet() { } // create the engine using only the API and not any parser. 
ICudaEngine* PSENet::createEngine(IBuilder* builder, IBuilderConfig* config) { std::map weightMap = loadWeights("./psenet.wts"); Weights emptywts{ DataType::kFLOAT, nullptr, 0 }; const auto explicitBatch = 1U << static_cast(NetworkDefinitionCreationFlag::kEXPLICIT_BATCH); INetworkDefinition* network = builder->createNetworkV2(explicitBatch); ITensor* data = network->addInput(input_name_, dt, Dims4{ -1, 3, -1, -1 }); assert(data); IConvolutionLayer* conv1 = network->addConvolutionNd(*data, 64, DimsHW{ 7, 7 }, weightMap["resnet_v1_50/conv1/weights"], emptywts); conv1->setStrideNd(DimsHW{ 2, 2 }); conv1->setPaddingNd(DimsHW{ 3, 3 }); assert(conv1); IScaleLayer* bn1 = addBatchNorm2d(network, weightMap, *conv1->getOutput(0), "resnet_v1_50/conv1/BatchNorm/", 1e-5); assert(bn1); IActivationLayer* relu1 = network->addActivation(*bn1->getOutput(0), ActivationType::kRELU); assert(relu1); // C2 IPoolingLayer* pool1 = network->addPoolingNd(*relu1->getOutput(0), PoolingType::kMAX, DimsHW{ 3, 3 }); pool1->setStrideNd(DimsHW{ 2, 2 }); pool1->setPrePadding(DimsHW{ 0, 0 }); pool1->setPostPadding(DimsHW{ 1, 1 }); assert(pool1); IActivationLayer* x; x = bottleneck(network, weightMap, *pool1->getOutput(0), 64, 1, "resnet_v1_50/block1/unit_1/bottleneck_v1/", 1); x = bottleneck(network, weightMap, *x->getOutput(0), 64, 1, "resnet_v1_50/block1/unit_2/bottleneck_v1/", 0); // C3 IActivationLayer* block1 = bottleneck(network, weightMap, *x->getOutput(0), 64, 2, "resnet_v1_50/block1/unit_3/bottleneck_v1/", 2); x = bottleneck(network, weightMap, *block1->getOutput(0), 128, 1, "resnet_v1_50/block2/unit_1/bottleneck_v1/", 1); x = bottleneck(network, weightMap, *x->getOutput(0), 128, 1, "resnet_v1_50/block2/unit_2/bottleneck_v1/", 0); x = bottleneck(network, weightMap, *x->getOutput(0), 128, 1, "resnet_v1_50/block2/unit_3/bottleneck_v1/", 0); // C4 IActivationLayer* block2 = bottleneck(network, weightMap, *x->getOutput(0), 128, 2, "resnet_v1_50/block2/unit_4/bottleneck_v1/", 2); x = 
bottleneck(network, weightMap, *block2->getOutput(0), 256, 1, "resnet_v1_50/block3/unit_1/bottleneck_v1/", 1); x = bottleneck(network, weightMap, *x->getOutput(0), 256, 1, "resnet_v1_50/block3/unit_2/bottleneck_v1/", 0); x = bottleneck(network, weightMap, *x->getOutput(0), 256, 1, "resnet_v1_50/block3/unit_3/bottleneck_v1/", 0); x = bottleneck(network, weightMap, *x->getOutput(0), 256, 1, "resnet_v1_50/block3/unit_4/bottleneck_v1/", 0); x = bottleneck(network, weightMap, *x->getOutput(0), 256, 1, "resnet_v1_50/block3/unit_5/bottleneck_v1/", 0); IActivationLayer* block3 = bottleneck(network, weightMap, *x->getOutput(0), 256, 2, "resnet_v1_50/block3/unit_6/bottleneck_v1/", 2); x = bottleneck(network, weightMap, *block3->getOutput(0), 512, 1, "resnet_v1_50/block4/unit_1/bottleneck_v1/", 1); x = bottleneck(network, weightMap, *x->getOutput(0), 512, 1, "resnet_v1_50/block4/unit_2/bottleneck_v1/", 0); // C5 IActivationLayer* block4 = bottleneck(network, weightMap, *x->getOutput(0), 512, 1, "resnet_v1_50/block4/unit_3/bottleneck_v1/", 0); IActivationLayer* build_p5_r1 = addConvRelu(network, weightMap, *block4->getOutput(0), 256, 1, 1, "build_feature_pyramid/build_P5/"); assert(build_p5_r1); IActivationLayer* build_p4_r1 = addConvRelu(network, weightMap, *block2->getOutput(0), 256, 1, 1, "build_feature_pyramid/build_P4/reduce_dimension/"); assert(build_p4_r1); IResizeLayer* bfp_layer4_resize = network->addResize(*build_p5_r1->getOutput(0)); auto build_p4_r1_shape = network->addShape(*build_p4_r1->getOutput(0))->getOutput(0); bfp_layer4_resize->setInput(1, *build_p4_r1_shape); bfp_layer4_resize->setResizeMode(ResizeMode::kNEAREST); bfp_layer4_resize->setAlignCorners(false); assert(bfp_layer4_resize); IElementWiseLayer* bfp_add = network->addElementWise(*bfp_layer4_resize->getOutput(0), *build_p4_r1->getOutput(0), ElementWiseOperation::kSUM); assert(bfp_add); IActivationLayer* build_p4_r2 = addConvRelu(network, weightMap, *bfp_add->getOutput(0), 256, 3, 1, 
"build_feature_pyramid/build_P4/avoid_aliasing/"); assert(build_p4_r2); IActivationLayer* build_p3_r1 = addConvRelu(network, weightMap, *block1->getOutput(0), 256, 1, 1, "build_feature_pyramid/build_P3/reduce_dimension/"); assert(build_p3_r1); IResizeLayer* bfp_layer3_resize = network->addResize(*build_p4_r2->getOutput(0)); bfp_layer3_resize->setResizeMode(ResizeMode::kNEAREST); auto build_p3_r1_shape = network->addShape(*build_p3_r1->getOutput(0))->getOutput(0); bfp_layer3_resize->setInput(1, *build_p3_r1_shape); bfp_layer3_resize->setAlignCorners(false); assert(bfp_layer3_resize); IElementWiseLayer* bfp_add1 = network->addElementWise(*bfp_layer3_resize->getOutput(0), *build_p3_r1->getOutput(0), ElementWiseOperation::kSUM); assert(bfp_add1); IActivationLayer* build_p3_r2 = addConvRelu(network, weightMap, *bfp_add1->getOutput(0), 256, 3, 1, "build_feature_pyramid/build_P3/avoid_aliasing/"); assert(build_p3_r2); IActivationLayer* build_p2_r1 = addConvRelu(network, weightMap, *pool1->getOutput(0), 256, 1, 1, "build_feature_pyramid/build_P2/reduce_dimension/"); assert(build_p2_r1); IResizeLayer* bfp_layer2_resize = network->addResize(*build_p3_r2->getOutput(0)); bfp_layer2_resize->setResizeMode(ResizeMode::kNEAREST); auto build_p2_r1_shape = network->addShape(*build_p2_r1->getOutput(0))->getOutput(0); bfp_layer2_resize->setInput(1, *build_p2_r1_shape); bfp_layer2_resize->setAlignCorners(false); assert(bfp_layer2_resize); IElementWiseLayer* bfp_add2 = network->addElementWise(*bfp_layer2_resize->getOutput(0), *build_p2_r1->getOutput(0), ElementWiseOperation::kSUM); assert(bfp_add2); // P2 IActivationLayer* build_p2_r2 = addConvRelu(network, weightMap, *bfp_add2->getOutput(0), 256, 3, 1, "build_feature_pyramid/build_P2/avoid_aliasing/"); assert(build_p2_r2); auto build_p2_r2_shape = network->addShape(*build_p2_r2->getOutput(0))->getOutput(0); // P3 x2 IResizeLayer* layer1_resize = network->addResize(*build_p3_r2->getOutput(0)); 
layer1_resize->setResizeMode(ResizeMode::kLINEAR); layer1_resize->setInput(1, *build_p2_r2_shape); layer1_resize->setAlignCorners(false); assert(layer1_resize); // P4 x4 IResizeLayer* layer2_resize = network->addResize(*build_p4_r2->getOutput(0)); layer2_resize->setResizeMode(ResizeMode::kLINEAR); layer2_resize->setInput(1, *build_p2_r2_shape); layer2_resize->setAlignCorners(false); assert(layer2_resize); // P5 x8 IResizeLayer* layer3_resize = network->addResize(*build_p5_r1->getOutput(0)); layer3_resize->setResizeMode(ResizeMode::kLINEAR); layer3_resize->setInput(1, *build_p2_r2_shape); layer3_resize->setAlignCorners(false); assert(layer3_resize); // C(P5,P4,P3,P2) ITensor* inputTensors[] = { layer3_resize->getOutput(0), layer2_resize->getOutput(0), layer1_resize->getOutput(0), build_p2_r2->getOutput(0) }; IConcatenationLayer* concat = network->addConcatenation(inputTensors, 4); assert(concat); IConvolutionLayer* feature_result_conv = network->addConvolutionNd(*concat->getOutput(0), 256, DimsHW{ 3, 3 }, weightMap["feature_results/Conv/weights"], emptywts); feature_result_conv->setPaddingNd(DimsHW{ 1, 1 }); assert(feature_result_conv); IScaleLayer* feature_result_bn = addBatchNorm2d(network, weightMap, *feature_result_conv->getOutput(0), "feature_results/Conv/BatchNorm/", 1e-5); assert(feature_result_bn); IActivationLayer* feature_result_relu = network->addActivation(*feature_result_bn->getOutput(0), ActivationType::kRELU); assert(feature_result_relu); IConvolutionLayer* feature_result_conv_1 = network->addConvolutionNd(*feature_result_relu->getOutput(0), 6, DimsHW{ 1, 1 }, weightMap["feature_results/Conv_1/weights"], weightMap["feature_results/Conv_1/biases"]); assert(feature_result_conv_1); IActivationLayer* sigmoid = network->addActivation(*feature_result_conv_1->getOutput(0), ActivationType::kSIGMOID); assert(sigmoid); sigmoid->getOutput(0)->setName(output_name_); std::cout << "Set name out" << std::endl; network->markOutput(*sigmoid->getOutput(0)); // Set 
profile IOptimizationProfile* profile = builder->createOptimizationProfile(); profile->setDimensions(input_name_, OptProfileSelector::kMIN, Dims4(1, 3, MIN_INPUT_SIZE, MIN_INPUT_SIZE)); profile->setDimensions(input_name_, OptProfileSelector::kOPT, Dims4(1, 3, OPT_INPUT_H, OPT_INPUT_W)); profile->setDimensions(input_name_, OptProfileSelector::kMAX, Dims4(1, 3, MAX_INPUT_SIZE, MAX_INPUT_SIZE)); config->addOptimizationProfile(profile); // Build engine config->setMaxWorkspaceSize(1 << 30); // 1G #ifdef USE_FP16 config->setFlag(BuilderFlag::kFP16); #endif ICudaEngine* engine = builder->buildEngineWithConfig(*network, *config); ; std::cout << "Build out" << std::endl; // Don't need the network any more network->destroy(); // Release host memory for (auto& mem : weightMap) { free((void*)(mem.second.values)); } return engine; } void PSENet::serializeEngine() { // Create builder IBuilder* builder = createInferBuilder(gLogger); IBuilderConfig* config = builder->createBuilderConfig(); // Create model to populate the network, then set the outputs and create an engine ICudaEngine* engine = createEngine(builder, config); assert(engine != nullptr); // Serialize the engine IHostMemory* modelStream{ nullptr }; modelStream = engine->serialize(); assert(modelStream != nullptr); std::ofstream p("./psenet.engine", std::ios::binary | std::ios::out); if (!p) { std::cerr << "Could not open plan output file" << std::endl; return; } p.write(reinterpret_cast(modelStream->data()), modelStream->size()); return; } void PSENet::deserializeEngine() { std::ifstream file("./psenet.engine", std::ios::binary | std::ios::in); if (file.good()) { file.seekg(0, file.end); size_t size = file.tellg(); file.seekg(0, file.beg); char* trtModelStream = new char[size]; assert(trtModelStream); file.read(trtModelStream, size); file.close(); mCudaEngine = std::shared_ptr(mRuntime->deserializeCudaEngine(trtModelStream, size), InferDeleter()); assert(mCudaEngine != nullptr); } } void 
PSENet::inferenceOnce(IExecutionContext& context, float* input, float* output, int input_h, int input_w) { const ICudaEngine& engine = context.getEngine(); // Pointers to input and output device buffers to pass to engine. // Engine requires exactly IEngine::getNbBindings() number of buffers. assert(engine.getNbBindings() == 2); void* buffers[2]; // In order to bind the buffers, we need to know the names of the input and output tensors. // Note that indices are guaranteed to be less than IEngine::getNbBindings() const int inputIndex = engine.getBindingIndex(input_name_); const int outputIndex = engine.getBindingIndex(output_name_); context.setBindingDimensions(inputIndex, Dims4(1, 3, input_h, input_w)); int input_size = 3 * input_h * input_w * sizeof(float); int output_size = input_h * input_w * 6 / 16 * sizeof(float); // Create GPU buffers on device CHECK(cudaMalloc(&buffers[inputIndex], input_size)); CHECK(cudaMalloc(&buffers[outputIndex], output_size)); // Create stream cudaStream_t stream; CHECK(cudaStreamCreate(&stream)); // DMA input batch data to device, infer on the batch asynchronously, and DMA output back to host CHECK(cudaMemcpyAsync(buffers[inputIndex], input, input_size, cudaMemcpyHostToDevice, stream)); context.enqueueV2(buffers, stream, nullptr); CHECK(cudaMemcpyAsync(output, buffers[outputIndex], output_size, cudaMemcpyDeviceToHost, stream)); cudaStreamSynchronize(stream); // Release stream and buffers cudaStreamDestroy(stream); CHECK(cudaFree(buffers[inputIndex])); CHECK(cudaFree(buffers[outputIndex])); } void PSENet::init() { mRuntime = std::shared_ptr(createInferRuntime(gLogger), InferDeleter()); assert(mRuntime != nullptr); std::cout << "Deserialize Engine" << std::endl; deserializeEngine(); mContext = std::shared_ptr(mCudaEngine->createExecutionContext(), InferDeleter()); assert(mContext != nullptr); mContext->setOptimizationProfile(0); std::cout << "Finished init" << std::endl; } void PSENet::detect(std::string image_path) { // Run inference 
cv::Mat image = cv::imread(image_path); int resize_h, resize_w; float ratio_h, ratio_w; auto start = std::chrono::system_clock::now(); float* input = preProcess(image, resize_h, resize_w, ratio_h, ratio_w); float* output = new float[resize_h * resize_w * 6 / 16]; inferenceOnce(*mContext, input, output, resize_h, resize_w); std::vector boxes = postProcess(output, resize_h, resize_w); drawRects(image, boxes, stride_, ratio_h, ratio_w, 1.0); auto end = std::chrono::system_clock::now(); cv::imwrite("result_" + image_path, image); std::cout << std::chrono::duration_cast(end - start).count() << "ms" << std::endl; delete input; delete output; } float* PSENet::preProcess(cv::Mat image, int& resize_h, int& resize_w, float& ratio_h, float& ratio_w) { cv::Mat imageRGB; cv::cvtColor(image, imageRGB, cv::COLOR_BGR2RGB); cv::Mat imageProcessed; int h = imageRGB.size().height; int w = imageRGB.size().width; resize_w = w; resize_h = h; float ratio = 1.0; // limit the max side and min side if (resize_h > max_side_len_ || resize_w > max_side_len_) { if (resize_h > resize_w) ratio = float(max_side_len_) / float(resize_h); else ratio = float(max_side_len_) / float(resize_w); } if (resize_h < min_side_len_ || resize_w < min_side_len_) { if (resize_h < resize_w) ratio = float(min_side_len_) / float(resize_h); else ratio = float(min_side_len_) / float(resize_w); } resize_h = int(resize_h * ratio); resize_w = int(resize_w * ratio); if (resize_h % 32 != 0) resize_h = (resize_h / 32 + 1) * 32; if (resize_w % 32 != 0) resize_w = (resize_w / 32 + 1) * 32; ratio_h = resize_h / float(h); ratio_w = resize_w / float(w); cv::resize(imageRGB, imageProcessed, cv::Size(resize_w, resize_h)); float* input = new float[3 * resize_h * resize_w]; cv::Mat imgFloat; imageProcessed.convertTo(imgFloat, CV_32FC3); cv::subtract(imgFloat, cv::Scalar(123.68, 116.78, 103.94), imgFloat, cv::noArray(), -1); std::vector chw; for (auto i = 0; i < 3; ++i) chw.emplace_back(cv::Mat(cv::Size(resize_w, resize_h), CV_32FC1, 
input + i * resize_w * resize_h)); cv::split(imgFloat, chw); return input; } std::vector PSENet::postProcess(float* origin_output, int resize_h, int resize_w) { // BxCxHxW S0 ===> S5 small ===> large const int h = resize_h / stride_; const int w = resize_w / stride_; const int length = h * w; // get kernels, sequence: 0->n, max -> min std::vector kernels(num_kernels_); for (auto i = num_kernels_ - 1; i >= 0; --i) { cv::Mat tmp_kernel(h, w, CV_32FC1, (void*)(origin_output + i * length), 0); cv::threshold(tmp_kernel, tmp_kernel, post_threshold_, 255, cv::THRESH_BINARY); tmp_kernel.convertTo(tmp_kernel, CV_8UC1); assert(tmp_kernel.rows == h && tmp_kernel.cols == w); kernels[num_kernels_ - 1 - i] = tmp_kernel; } cv::Mat stats, centroids, label_image; int label_num = cv::connectedComponents(kernels[num_kernels_ - 1], label_image, 4); label_image.convertTo(label_image, CV_8U); assert(label_image.rows == h && label_image.cols == w); cv::Mat out = cv::Mat::zeros(h, w, CV_8UC1); std::queue> q; std::queue> next_q; for (int i = 0; i < h; i++) { for (int j = 0; j < w; j++) { auto label = *label_image.ptr(i, j); if (label > 0) { q.push(std::make_tuple(i, j, label)); *out.ptr(i, j) = label; } } } int dx[4] = { -1, 1, 0, 0 }; int dy[4] = { 0, 0, -1, 1 }; for (int i = num_kernels_ - 2; i >= 0; i--) { //get each kernels auto kernel = kernels[i]; while (!q.empty()) { //get each queue menber in q auto q_n = q.front(); q.pop(); int y = std::get<0>(q_n); //i int x = std::get<1>(q_n); //j int l = std::get<2>(q_n); //label //store the edge pixel after one expansion bool is_edge = true; for (int idx = 0; idx < 4; idx++) { int index_y = y + dy[idx]; int index_x = x + dx[idx]; if (index_y < 0 || index_y >= h || index_x < 0 || index_x >= w) continue; if (!*kernel.ptr(index_y, index_x) || *out.ptr(index_y, index_x) > 0) continue; q.push(std::make_tuple(index_y, index_x, l)); *out.ptr(index_y, index_x) = l; is_edge = false; } if (is_edge) { next_q.push(std::make_tuple(y, x, l)); } } 
std::swap(q, next_q); } std::vector boxes; for (auto n = 1; n < label_num; ++n) { std::vector points; cv::findNonZero(out == n, points); cv::Mat fuck = out == n; cv::RotatedRect rect = cv::minAreaRect(points); boxes.emplace_back(rect); } return boxes; } ================================================ FILE: psenet/psenet.h ================================================ #ifndef TENSORRTX_PSENET_H #define TENSORRTX_PSENET_H #include #include #include #include #include "utils.h" #include "layers.h" class PSENet { public: PSENet(int max_side_len, int min_side_len, float threshold, int num_kernel, int stride); ~PSENet(); ICudaEngine* createEngine(IBuilder* builder, IBuilderConfig* config); void serializeEngine(); void deserializeEngine(); void init(); void inferenceOnce(IExecutionContext& context, float* input, float* output, int input_h, int input_w); void detect(std::string image_path); float* preProcess(cv::Mat image, int& resize_h, int& resize_w, float& ratio_h, float& ratio_w); std::vector postProcess(float* origin_output, int resize_h, int resize_w); private: Logger gLogger; std::shared_ptr mRuntime; std::shared_ptr mCudaEngine; std::shared_ptr mContext; DataType dt = DataType::kFLOAT; const char* input_name_ = "input"; const char* output_name_ = "output"; int max_side_len_ = 1024; int min_side_len_ = 640; float post_threshold_ = 0.9; int num_kernels_ = 6; int stride_ = 4; }; #endif // TENSORRTX_PSENET_H ================================================ FILE: psenet/utils.cpp ================================================ #include "utils.h" // Load weights from files shared with TensorRT samples. // TensorRT weight files have a simple space delimited format: // [type] [size] std::map loadWeights(const std::string file) { std::cout << "Loading weights: " << file << std::endl; std::cout << "Model weight is large, it will take some time." 
<< std::endl; std::map weightMap; // Open weights file std::ifstream input(file); assert(input.is_open() && "Unable to load weight file."); // Read number of weight blobs int32_t count; input >> count; assert(count > 0 && "Invalid weight map file."); while (count--) { Weights wt{ DataType::kFLOAT, nullptr, 0 }; uint32_t size; // Read name and type of blob std::string name; input >> name >> std::dec >> size; wt.type = DataType::kFLOAT; // Load blob uint32_t* val = reinterpret_cast(malloc(sizeof(val) * size)); for (uint32_t x = 0, y = size; x < y; ++x) { input >> std::hex >> val[x]; } wt.values = val; wt.count = size; weightMap[name] = wt; } std::cout << "Finish load weight" << std::endl; return weightMap; } cv::RotatedRect expandBox(const cv::RotatedRect& inBox, float ratio) { cv::Size size = inBox.size; int neww = int(size.width * ratio); int newh = int(size.height * ratio); return cv::RotatedRect(inBox.center, cv::Size(neww, newh), inBox.angle); } void drawRects(cv::Mat& image, std::vector boxes, float stride, float ratio_h, float ratio_w, float expand_ratio) { cv::Point2f rect[4]; for (unsigned int i = 0; i < boxes.size(); i++) { cv::RotatedRect box = boxes[i]; cv::RotatedRect expandbox = expandBox(box, expand_ratio); expandbox.points(rect); for (auto j = 0; j < 4; j++) { cv::line(image, cv::Point{ int(rect[j].x / ratio_w * stride), int(rect[j].y / ratio_h * stride) }, cv::Point{ int(rect[(j + 1) % 4].x / ratio_w * stride), int(rect[(j + 1) % 4].y / ratio_h * stride) }, cv::Scalar(0, 0, 255), 2, 8); } } } ================================================ FILE: psenet/utils.h ================================================ #ifndef TENSORRTX_UTILS_H #define TENSORRTX_UTILS_H #include #include #include "NvInfer.h" #include "cuda_runtime_api.h" #include "assert.h" #include using namespace nvinfer1; std::map loadWeights(const std::string file); cv::RotatedRect expandBox(const cv::RotatedRect& inBox, float ratio = 1.0); void drawRects(cv::Mat& image, std::vector boxes, 
float stride, float ratio_h, float ratio_w, float expand_ratio); cv::Mat renderSegment(cv::Mat image, const cv::Mat& mask); // <============== Operator =============> struct InferDeleter { template void operator()(T* obj) const { if (obj) { obj->destroy(); } } }; #define CHECK(status) \ do \ { \ auto ret = (status); \ if (ret != 0) \ { \ std::cout << "Cuda failure: " << ret; \ abort(); \ } \ } while (0) // Logger for TensorRT info/warning/errors class Logger : public nvinfer1::ILogger { public: Logger() : Logger(Severity::kWARNING) {} Logger(Severity severity) : reportableSeverity(severity) {} void log(Severity severity, const char* msg) override { // suppress messages with severity enum value greater than the reportable if (severity > reportableSeverity) return; switch (severity) { case Severity::kINTERNAL_ERROR: std::cerr << "INTERNAL_ERROR: "; break; case Severity::kERROR: std::cerr << "ERROR: "; break; case Severity::kWARNING: std::cerr << "WARNING: "; break; case Severity::kINFO: std::cerr << "INFO: "; break; default: std::cerr << "UNKNOWN: "; break; } std::cerr << msg << std::endl; } Severity reportableSeverity{ Severity::kWARNING }; }; #endif ================================================ FILE: rcnn/BatchedNms.cu ================================================ #include #include #include #include #include #include #include #include #include #include #include #include "BatchedNmsPlugin.h" #include "./cuda_utils.h" #include "macros.h" #ifdef CUDA_11 #include #include #else #include #include namespace cub = thrust::cuda_cub::cub; #endif namespace nvinfer1 { __global__ void batched_nms_kernel( const int nms_method, const float threshold, const int num_detections, const int *indices, float *scores, const float *classes, const float4 *boxes) { // Go through detections by descending score for (int m = 0; m < num_detections; m++) { int i = blockIdx.x * blockDim.x + threadIdx.x; if (i < num_detections && m < i && scores[m] > 0.0f) { int idx = indices[i]; int 
max_idx = indices[m]; int icls = classes[idx]; int mcls = classes[max_idx]; if (mcls == icls) { float4 ibox = boxes[idx]; float4 mbox = boxes[max_idx]; float x1 = max(ibox.x, mbox.x); float y1 = max(ibox.y, mbox.y); float x2 = min(ibox.z, mbox.z); float y2 = min(ibox.w, mbox.w); float w = max(0.0f, x2 - x1); float h = max(0.0f, y2 - y1); float iarea = (ibox.z - ibox.x) * (ibox.w - ibox.y); float marea = (mbox.z - mbox.x) * (mbox.w - mbox.y); float inter = w * h; float overlap = inter / (iarea + marea - inter); float sigma = 0.5; // this is an empirical value // printf("nms_method: %d", nms_method); //nms methods selection in the second stage // 0: original nms // 1: soft-nms (linear) // 2: soft-nms (gaussian) // printf("nms_method: ", nms_method); switch (nms_method) { case 0: if (overlap > threshold) { scores[i] = 0.0f; } break; case 1: if (overlap > threshold) { scores[i] = (1 - overlap) * scores[i]; } break; case 2: if (overlap > threshold) { scores[i] = std::exp(-(overlap * overlap) / sigma) * scores[i]; } break; default: if (overlap > threshold) { scores[i] = 0.0f; } break; } } } // Sync discarded detections __syncthreads(); } } int batchedNms(int nms_method, int batch_size, const void *const *inputs, void *TRT_CONST_ENQUEUE*outputs, size_t count, int detections_per_im, float nms_thresh, void *workspace, size_t workspace_size, cudaStream_t stream) { if (!workspace || !workspace_size) { // Return required scratch space size cub style workspace_size += get_size_aligned(count); // indices workspace_size += get_size_aligned(count); // indices_sorted workspace_size += get_size_aligned(count); // scores_sorted size_t temp_size_sort = 0; cub::DeviceRadixSort::SortPairsDescending( static_cast(nullptr), temp_size_sort, static_cast(nullptr), static_cast(nullptr), static_cast(nullptr), static_cast(nullptr), count); workspace_size += temp_size_sort; return workspace_size; } auto on_stream = thrust::cuda::par.on(stream); auto indices = get_next_ptr(count, workspace, 
workspace_size); std::vector indices_h(count); for (int i = 0; i < count; i++) indices_h[i] = i; cudaMemcpyAsync(indices, indices_h.data(), count * sizeof * indices, cudaMemcpyHostToDevice, stream); auto indices_sorted = get_next_ptr(count, workspace, workspace_size); auto scores_sorted = get_next_ptr(count, workspace, workspace_size); for (int batch = 0; batch < batch_size; batch++) { auto in_scores = static_cast(inputs[0]) + batch * count; auto in_boxes = static_cast(inputs[1]) + batch * count; auto in_classes = static_cast(inputs[2]) + batch * count; auto out_scores = static_cast(outputs[0]) + batch * detections_per_im; auto out_boxes = static_cast(outputs[1]) + batch * detections_per_im; auto out_classes = static_cast(outputs[2]) + batch * detections_per_im; // Sort scores and corresponding indices int num_detections = count; cub::DeviceRadixSort::SortPairsDescending(workspace, workspace_size, in_scores, scores_sorted, indices, indices_sorted, num_detections, 0, sizeof(*scores_sorted) * 8, stream); // Launch actual NMS kernel - 1 block with each thread handling n detections // TODO: different device has differnet max threads const int max_threads = 1024; int num_per_thread = ceil(static_cast(num_detections) / max_threads); batched_nms_kernel << > > (nms_method, nms_thresh, num_detections, indices_sorted, scores_sorted, in_classes, in_boxes); // Re-sort with updated scores cub::DeviceRadixSort::SortPairsDescending(workspace, workspace_size, scores_sorted, scores_sorted, indices_sorted, indices, num_detections, 0, sizeof(*scores_sorted) * 8, stream); // Gather filtered scores, boxes, classes num_detections = min(detections_per_im, num_detections); cudaMemcpyAsync(out_scores, scores_sorted, num_detections * sizeof *scores_sorted, cudaMemcpyDeviceToDevice, stream); if (num_detections < detections_per_im) { thrust::fill_n(on_stream, out_scores + num_detections, detections_per_im - num_detections, 0); } thrust::gather(on_stream, indices, indices + num_detections, 
in_boxes, out_boxes); thrust::gather(on_stream, indices, indices + num_detections, in_classes, out_classes); } return 0; } } // namespace nvinfer1 ================================================ FILE: rcnn/BatchedNmsPlugin.h ================================================ #pragma once #include #include #include #include "macros.h" using namespace nvinfer1; #define PLUGIN_NAME "BatchedNms" #define PLUGIN_VERSION "1" #define PLUGIN_NAMESPACE "" namespace nvinfer1 { int batchedNms(int nms_method, int batchSize, const void *const *inputs, void *TRT_CONST_ENQUEUE*outputs, size_t count, int detections_per_im, float nms_thresh, void *workspace, size_t workspace_size, cudaStream_t stream); /* input1: scores{C, 1} C->topk input2: boxes{C, 4} C->topk format:XYXY input3: classes{C, 1} C->topk output1: scores{C, 1} C->detections_per_img output2: boxes{C, 4} C->detections_per_img format:XYXY output3: classes{C, 1} C->detections_per_img Description: implement batched nms */ class BatchedNmsPlugin : public IPluginV2Ext { int _nms_method; float _nms_thresh; int _detections_per_im; size_t _count = 1; protected: void deserialize(void const* data, size_t length) { const char* d = static_cast(data); read(d, _nms_method); read(d, _nms_thresh); read(d, _detections_per_im); read(d, _count); } size_t getSerializationSize() const override { return sizeof(_nms_method) + sizeof(_nms_thresh) + sizeof(_detections_per_im) + sizeof(_count); } void serialize(void *buffer) const TRT_NOEXCEPT override { char* d = static_cast(buffer); write(d, _nms_method); write(d, _nms_thresh); write(d, _detections_per_im); write(d, _count); } public: BatchedNmsPlugin(int nms_method, float nms_thresh, int detections_per_im) : _nms_method(nms_method), _nms_thresh(nms_thresh), _detections_per_im(detections_per_im) { assert(nms_method >= 0); assert(nms_thresh > 0); assert(detections_per_im > 0); } BatchedNmsPlugin(int nms_method, float nms_thresh, int detections_per_im, size_t count) : _nms_method(nms_method), 
_nms_thresh(nms_thresh), _detections_per_im(detections_per_im), _count(count) { assert(nms_method >= 0); assert(nms_thresh > 0); assert(detections_per_im > 0); assert(count > 0); } BatchedNmsPlugin(void const* data, size_t length) { this->deserialize(data, length); } const char *getPluginType() const TRT_NOEXCEPT override { return PLUGIN_NAME; } const char *getPluginVersion() const TRT_NOEXCEPT override { return PLUGIN_VERSION; } int getNbOutputs() const TRT_NOEXCEPT override { return 3; } Dims getOutputDimensions(int index, const Dims *inputs, int nbInputDims) TRT_NOEXCEPT override { assert(nbInputDims == 3); assert(index < this->getNbOutputs()); return Dims2(_detections_per_im, index == 1 ? 4 : 1); } bool supportsFormat(DataType type, PluginFormat format) const TRT_NOEXCEPT override { return type == DataType::kFLOAT && format == PluginFormat::kLINEAR; } int initialize() TRT_NOEXCEPT override { return 0; } void terminate() TRT_NOEXCEPT override {} size_t getWorkspaceSize(int maxBatchSize) const TRT_NOEXCEPT override { static int size = -1; if (size < 0) { size = batchedNms(_nms_method, maxBatchSize, nullptr, nullptr, _count, _detections_per_im, _nms_thresh, nullptr, 0, nullptr); } return size; } int enqueue(int batchSize, const void *const *inputs, void *TRT_CONST_ENQUEUE*outputs, void *workspace, cudaStream_t stream) TRT_NOEXCEPT override { return batchedNms(_nms_method, batchSize, inputs, outputs, _count, _detections_per_im, _nms_thresh, workspace, getWorkspaceSize(batchSize), stream); } void destroy() TRT_NOEXCEPT override { delete this; } const char *getPluginNamespace() const TRT_NOEXCEPT override { return PLUGIN_NAMESPACE; } void setPluginNamespace(const char *N) TRT_NOEXCEPT override { } // IPluginV2Ext Methods DataType getOutputDataType(int index, const DataType* inputTypes, int nbInputs) const TRT_NOEXCEPT override { assert(index < 3); return DataType::kFLOAT; } bool isOutputBroadcastAcrossBatch(int outputIndex, const bool* inputIsBroadcasted, int 
nbInputs) const TRT_NOEXCEPT override { return false; } bool canBroadcastInputAcrossBatch(int inputIndex) const TRT_NOEXCEPT override { return false; } void configurePlugin(const Dims* inputDims, int nbInputs, const Dims* outputDims, int nbOutputs, const DataType* inputTypes, const DataType* outputTypes, const bool* inputIsBroadcast, const bool* outputIsBroadcast, PluginFormat floatFormat, int maxBatchSize) TRT_NOEXCEPT override { assert(*inputTypes == nvinfer1::DataType::kFLOAT && floatFormat == nvinfer1::PluginFormat::kLINEAR); assert(nbInputs == 3); assert(inputDims[0].d[0] == inputDims[2].d[0]); assert(inputDims[1].d[0] == inputDims[2].d[0]); _count = inputDims[0].d[0]; } IPluginV2Ext *clone() const TRT_NOEXCEPT override { return new BatchedNmsPlugin(_nms_method, _nms_thresh, _detections_per_im, _count); } private: template void write(char*& buffer, const T& val) const { *reinterpret_cast(buffer) = val; buffer += sizeof(T); } template void read(const char*& buffer, T& val) { val = *reinterpret_cast(buffer); buffer += sizeof(T); } }; class BatchedNmsPluginCreator : public IPluginCreator { public: BatchedNmsPluginCreator() {} const char *getPluginNamespace() const TRT_NOEXCEPT override { return PLUGIN_NAMESPACE; } const char *getPluginName() const TRT_NOEXCEPT override { return PLUGIN_NAME; } const char *getPluginVersion() const TRT_NOEXCEPT override { return PLUGIN_VERSION; } IPluginV2 *deserializePlugin(const char *name, const void *serialData, size_t serialLength) TRT_NOEXCEPT override { return new BatchedNmsPlugin(serialData, serialLength); } void setPluginNamespace(const char *N) TRT_NOEXCEPT override {} const PluginFieldCollection *getFieldNames() TRT_NOEXCEPT override { return nullptr; } IPluginV2 *createPlugin(const char *name, const PluginFieldCollection *fc) TRT_NOEXCEPT override { return nullptr; } }; REGISTER_TENSORRT_PLUGIN(BatchedNmsPluginCreator); } // namespace nvinfer1 #undef PLUGIN_NAME #undef PLUGIN_VERSION #undef PLUGIN_NAMESPACE 
================================================ FILE: rcnn/CMakeLists.txt ================================================ cmake_minimum_required(VERSION 3.1) project(rcnn) add_definitions(-std=c++14) option(CUDA_USE_STATIC_CUDA_RUNTIME OFF) set(CMAKE_CXX_STANDARD 14) set(CMAKE_BUILD_TYPE Debug) set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS};--extended-lambda) find_package(CUDA REQUIRED) include_directories(${PROJECT_SOURCE_DIR}/include) # include and link dirs of cuda and tensorrt, you need adapt them if yours are different # cuda include_directories(/usr/local/cuda/include) link_directories(/usr/local/cuda/lib64) # tensorrt include_directories(/home/jushi/TensorRT-8.2.1.6/include) link_directories(/home/jushi/TensorRT-8.2.1.6/lib) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++14 -Wall -Ofast -Wfatal-errors -D_MWAITXINTRIN_H_INCLUDED") cuda_add_library(myplugins SHARED ${PROJECT_SOURCE_DIR}/BatchedNms.cu ${PROJECT_SOURCE_DIR}/PredictorDecode.cu ${PROJECT_SOURCE_DIR}/RoiAlign.cu ${PROJECT_SOURCE_DIR}/RpnDecode.cu ${PROJECT_SOURCE_DIR}/RpnNms.cu ${PROJECT_SOURCE_DIR}/MaskRcnnInference.cu) target_link_libraries(myplugins nvinfer cudart) find_package(OpenCV) include_directories(${OpenCV_INCLUDE_DIRS}) add_executable(rcnn ${PROJECT_SOURCE_DIR}/rcnn.cpp) target_link_libraries(rcnn nvinfer) target_link_libraries(rcnn cudart) target_link_libraries(rcnn myplugins) target_link_libraries(rcnn ${OpenCV_LIBS}) add_definitions(-O2 -pthread) ================================================ FILE: rcnn/MaskRcnnInference.cu ================================================
#include "MaskRcnnInferencePlugin.h"
#include "macros.h"

namespace nvinfer1 {

// Logistic sigmoid: 1 / (1 + e^-data).
__device__ float Logist(float data) { return 1.0f / (1.0f + expf(-data)); }

// Selects, for every detection, the mask channel of its predicted class and
// applies the sigmoid to it.
//
// One thread handles one element of the {detection, class, h, w} input; only
// the threads whose class equals the detection's class (read from `indices`)
// write an output element, so every out_masks element is written at most once.
//
//   nthreads           total number of input mask elements to visit
//   detections_per_im  not used by the kernel body
//   output_size        side length of the square mask
//   num_classes        number of class channels per detection in `masks`
//   indices            per-detection class id, stored as float
//   masks              input, indexed as [detection][class][h][w]
//   out_masks          output, indexed as [detection][h][w]
__global__ void MaskRcnnInferenceKernel(
    const int nthreads,
    const int detections_per_im,
    const int output_size,
    const int num_classes,
    const float* indices,
    const float* masks,
    float* out_masks) {
    size_t index = blockIdx.x * blockDim.x + threadIdx.x;
    if (index < nthreads) {
        int ind = index / output_size / output_size / num_classes;
        int ind_class = indices[ind];  // float class id truncated to int
        int cur_class = index / output_size / output_size % num_classes;
        if (ind_class == cur_class) {
            int w = index % output_size;
            int h = index / output_size % output_size;
            // Offset of (ind, cur_class, h, w) in the input; computed once and
            // reused instead of being spelled out a second time.
            int in_offset = ind * num_classes * output_size * output_size
                + cur_class * output_size * output_size + h * output_size + w;
            float maskVal = masks[in_offset];
            out_masks[ind * output_size * output_size + h * output_size + w] = Logist(maskVal);
        }
    }
}

// Host entry point used by MaskRcnnInferencePlugin::enqueue.
// Launches MaskRcnnInferenceKernel once per batch item, advancing the input
// and output pointers by one image worth of elements each time, and waits for
// the device after every launch. Always returns 0.
int maskRcnnInference(int batchSize,
    const void *const *inputs, void *TRT_CONST_ENQUEUE*outputs,
    int detections_per_im, int output_size, int num_classes, cudaStream_t stream) {
    for (int batch = 0; batch < batchSize; batch++) {
        auto in_indices = static_cast(inputs[0]) + batch * detections_per_im;
        auto in_masks = static_cast(inputs[1]) + batch * detections_per_im * num_classes * output_size * output_size;
        auto out_masks = static_cast(outputs[0]) + batch * detections_per_im * output_size * output_size;

        int nthreads = detections_per_im * num_classes * output_size * output_size;
        const int max_threads = 1024;
        // Enough blocks of max_threads threads to cover every input element.
        int blocksPerGrid = ceil(static_cast(nthreads) / max_threads);
        // TODO: can implement this function with thrust?
        // NOTE(review): the launch configuration between the angle brackets is
        // not visible in this copy - confirm grid/block/stream arguments.
        MaskRcnnInferenceKernel << > > (
            nthreads,
            detections_per_im,
            output_size,
            num_classes,
            in_indices,
            in_masks,
            out_masks);
        cudaDeviceSynchronize();
    }
    return 0;
}

}  // namespace nvinfer1
================================================ FILE: rcnn/MaskRcnnInferencePlugin.h ================================================ #pragma once #include #include #include #include "macros.h" using namespace nvinfer1; #define PLUGIN_NAME "MaskRcnnInference" #define PLUGIN_VERSION "1" #define PLUGIN_NAMESPACE "" namespace nvinfer1 { int maskRcnnInference(int batchSize, const void *const *inputs, void *TRT_CONST_ENQUEUE*outputs, int detections_per_im, int output_size, int num_classes, cudaStream_t stream); /* input1: indices{C, 1} C->topk input2: masks{C, NUM_CLASS, size, size} C->topk format:XYXY output1: masks{C, 1, size, size} C->detections_per_img Description: implement index select */ class MaskRcnnInferencePlugin : public IPluginV2Ext { int _detections_per_im; int _output_size; int _num_classes = 1; protected: void deserialize(void const* data, size_t length) { const char* d = static_cast(data); read(d, _detections_per_im); read(d, _output_size); read(d, _num_classes); } size_t getSerializationSize() const TRT_NOEXCEPT override { return sizeof(_detections_per_im) + sizeof(_output_size) + sizeof(_num_classes); } void serialize(void *buffer) const TRT_NOEXCEPT override { char* d = static_cast(buffer); write(d, _detections_per_im); write(d, _output_size); write(d, _num_classes); } public: MaskRcnnInferencePlugin(int detections_per_im, int output_size) : _detections_per_im(detections_per_im), _output_size(output_size) { assert(detections_per_im > 0); assert(output_size > 0); } MaskRcnnInferencePlugin(int detections_per_im, int output_size, int num_classes) : _detections_per_im(detections_per_im), _output_size(output_size), _num_classes(num_classes) { assert(detections_per_im > 0); assert(output_size > 0); assert(num_classes > 0); } MaskRcnnInferencePlugin(void const* data, size_t
length) { this->deserialize(data, length); } const char *getPluginType() const TRT_NOEXCEPT override { return PLUGIN_NAME; } const char *getPluginVersion() const TRT_NOEXCEPT override { return PLUGIN_VERSION; } int getNbOutputs() const TRT_NOEXCEPT override { return 1; } Dims getOutputDimensions(int index, const Dims *inputs, int nbInputDims) TRT_NOEXCEPT override { assert(index < this->getNbOutputs()); return Dims4(_detections_per_im, 1, _output_size, _output_size); } bool supportsFormat(DataType type, PluginFormat format) const TRT_NOEXCEPT override { return type == DataType::kFLOAT && format == PluginFormat::kLINEAR; } int initialize() TRT_NOEXCEPT override { return 0; } void terminate() TRT_NOEXCEPT override {} size_t getWorkspaceSize(int maxBatchSize) const TRT_NOEXCEPT override { return 0; } int enqueue(int batchSize, const void *const *inputs, void *TRT_CONST_ENQUEUE*outputs, void *workspace, cudaStream_t stream) TRT_NOEXCEPT override { return maskRcnnInference(batchSize, inputs, outputs, _detections_per_im, _output_size, _num_classes, stream); } void destroy() TRT_NOEXCEPT override { delete this; } const char *getPluginNamespace() const TRT_NOEXCEPT override { return PLUGIN_NAMESPACE; } void setPluginNamespace(const char *N) TRT_NOEXCEPT override { } // IPluginV2Ext Methods DataType getOutputDataType(int index, const DataType* inputTypes, int nbInputs) const TRT_NOEXCEPT override { assert(index < 1); return DataType::kFLOAT; } bool isOutputBroadcastAcrossBatch(int outputIndex, const bool* inputIsBroadcasted, int nbInputs) const TRT_NOEXCEPT override { return false; } bool canBroadcastInputAcrossBatch(int inputIndex) const TRT_NOEXCEPT override { return false; } void configurePlugin(const Dims* inputDims, int nbInputs, const Dims* outputDims, int nbOutputs, const DataType* inputTypes, const DataType* outputTypes, const bool* inputIsBroadcast, const bool* outputIsBroadcast, PluginFormat floatFormat, int maxBatchSize) TRT_NOEXCEPT override { assert(*inputTypes 
== nvinfer1::DataType::kFLOAT && floatFormat == nvinfer1::PluginFormat::kLINEAR); assert(nbInputs == 2); assert(inputDims[0].d[0] == _detections_per_im); assert(inputDims[1].d[0] == _detections_per_im); assert(inputDims[1].d[2] == _output_size); assert(inputDims[1].d[3] == _output_size); _num_classes = inputDims[1].d[1]; } IPluginV2Ext *clone() const TRT_NOEXCEPT override { return new MaskRcnnInferencePlugin(_detections_per_im, _output_size, _num_classes); } private: template void write(char*& buffer, const T& val) const { *reinterpret_cast(buffer) = val; buffer += sizeof(T); } template void read(const char*& buffer, T& val) { val = *reinterpret_cast(buffer); buffer += sizeof(T); } }; class MaskRcnnInferencePluginCreator : public IPluginCreator { public: MaskRcnnInferencePluginCreator() {} const char *getPluginNamespace() const TRT_NOEXCEPT override { return PLUGIN_NAMESPACE; } const char *getPluginName() const TRT_NOEXCEPT override { return PLUGIN_NAME; } const char *getPluginVersion() const TRT_NOEXCEPT override { return PLUGIN_VERSION; } IPluginV2 *deserializePlugin(const char *name, const void *serialData, size_t serialLength) TRT_NOEXCEPT override { return new MaskRcnnInferencePlugin(serialData, serialLength); } void setPluginNamespace(const char *N) TRT_NOEXCEPT override {} const PluginFieldCollection *getFieldNames() TRT_NOEXCEPT override { return nullptr; } IPluginV2 *createPlugin(const char *name, const PluginFieldCollection *fc) TRT_NOEXCEPT override { return nullptr; } }; REGISTER_TENSORRT_PLUGIN(MaskRcnnInferencePluginCreator); } // namespace nvinfer1 #undef PLUGIN_NAME #undef PLUGIN_VERSION #undef PLUGIN_NAMESPACE ================================================ FILE: rcnn/PredictorDecode.cu ================================================ #include #include #include #include #include #include #include "PredictorDecodePlugin.h" #include "./cuda_utils.h" #include "macros.h" #ifdef CUDA_11 #include #include #else #include #include namespace cub = 
thrust::cuda_cub::cub;
#endif

namespace nvinfer1 {

// Decodes the per-class box regression deltas of the box head against their
// proposals and keeps the `num_boxes` highest-scoring (proposal, class) pairs.
//
// Two modes, selected by the workspace arguments:
//   * workspace == nullptr or workspace_size == 0: nothing is launched; the
//     function returns the number of scratch bytes it needs (device copy of
//     bbox_reg_weights, the index buffer, the sorted index buffer, the sorted
//     score buffer and the CUB sort temporary storage).
//   * otherwise: runs the decode for every batch item on `stream` and
//     returns 0.
//
//   inputs[0]   scores, num_boxes * num_classes values per image
//   inputs[1]   box deltas, one 4-component entry per (proposal, class) pair
//   inputs[2]   proposals, one 4-component box per proposal
//   outputs[0]  kept scores (0 for boxes that are empty after clipping)
//   outputs[1]  decoded boxes, clipped to [0, image_width] x [0, image_height]
//   outputs[2]  class id of each kept box
//   bbox_reg_weights  divisors applied to (dx, dy, dw, dh) before decoding
int predictorDecode(int batchSize, const void *const *inputs,
    void *TRT_CONST_ENQUEUE*outputs, unsigned int num_boxes, unsigned int num_classes,
    unsigned int image_height, unsigned int image_width,
    const std::vector& bbox_reg_weights, void *workspace, size_t workspace_size, cudaStream_t stream) {
    int scores_size = num_boxes * num_classes;

    if (!workspace || !workspace_size) {
        // Return required scratch space size cub style
        workspace_size = get_size_aligned(bbox_reg_weights.size());  // anchors
        workspace_size += get_size_aligned(scores_size);  // indices
        workspace_size += get_size_aligned(scores_size);  // indices_sorted
        workspace_size += get_size_aligned(scores_size);  // scores_sorted

        // Size-only query: a null temp-storage pointer makes CUB report the
        // bytes it needs in temp_size_sort without sorting anything.
        size_t temp_size_sort = 0;
        cub::DeviceRadixSort::SortPairsDescending(
            static_cast(nullptr), temp_size_sort,
            static_cast(nullptr),
            static_cast(nullptr),
            static_cast(nullptr),
            static_cast(nullptr), scores_size);
        workspace_size += temp_size_sort;

        return workspace_size;
    }

    // Carve the scratch buffers out of the workspace in the same order the
    // size query above accounted for them.
    auto bbox_reg_weights_d = get_next_ptr(bbox_reg_weights.size(), workspace, workspace_size);
    cudaMemcpyAsync(bbox_reg_weights_d, bbox_reg_weights.data(),
        bbox_reg_weights.size() * sizeof *bbox_reg_weights_d, cudaMemcpyHostToDevice, stream);

    auto on_stream = thrust::cuda::par.on(stream);

    // Identity permutation 0..scores_size-1; sorted alongside the scores so
    // each sorted score can be traced back to its (proposal, class) pair.
    auto indices = get_next_ptr(scores_size, workspace, workspace_size);
    std::vector indices_h(scores_size, 0);
    for (int i = 0; i < scores_size; i++)
        indices_h[i] = i;
    cudaMemcpyAsync(indices, indices_h.data(), scores_size * sizeof(int), cudaMemcpyHostToDevice, stream);
    auto indices_sorted = get_next_ptr(scores_size, workspace, workspace_size);
    auto scores_sorted = get_next_ptr(scores_size, workspace, workspace_size);

    for (int batch = 0; batch < batchSize; batch++) {
        auto in_scores = static_cast(inputs[0]) + batch * scores_size;
        auto in_boxes = static_cast(inputs[1]) + batch * scores_size;
        auto in_proposals = static_cast(inputs[2]) + batch * num_boxes;

        auto out_scores = static_cast(outputs[0]) + batch * num_boxes;
        auto out_boxes = static_cast(outputs[1]) + batch * num_boxes;
        auto out_classes = static_cast(outputs[2]) + batch * num_boxes;

        // Only keep top n scores
        cub::DeviceRadixSort::SortPairsDescending(workspace, workspace_size,
            in_scores, scores_sorted, indices, indices_sorted, scores_size, 0, sizeof(*scores_sorted) * 8, stream);

        // Gather boxes
        thrust::transform(on_stream, indices_sorted, indices_sorted + num_boxes,
            thrust::make_zip_iterator(thrust::make_tuple(out_scores, out_boxes, out_classes)),
            [=] __device__(int i) {
            // Flat index i encodes (proposal n, class cls).
            int cls = i % num_classes;
            int n = i / num_classes;
            float4 deltas = in_boxes[i];
            float4 boxes = in_proposals[n];
            float w = boxes.z - boxes.x;
            float h = boxes.w - boxes.y;
            float pred_ctr_x = (deltas.x / bbox_reg_weights_d[0]) * w + boxes.x + 0.5f * w;
            float pred_ctr_y = (deltas.y / bbox_reg_weights_d[1]) * h + boxes.y + 0.5f * h;
            float pred_w = exp(deltas.z / bbox_reg_weights_d[2]) * w;
            float pred_h = exp(deltas.w / bbox_reg_weights_d[3]) * h;
            // Clip to the image: x coordinates against the width, y
            // coordinates against the height.
            boxes = float4{
                max(0.0f, pred_ctr_x - 0.5f * pred_w),
                max(0.0f, pred_ctr_y - 0.5f * pred_h),
                min(pred_ctr_x + 0.5f * pred_w, static_cast(image_width)),
                min(pred_ctr_y + 0.5f * pred_h, static_cast(image_height))
            };

            // filter empty boxes
            if (boxes.z - boxes.x <= 0.0f || boxes.w - boxes.y <= 0.0f)
                return thrust::make_tuple(0.0f, boxes, cls);
            else
                return thrust::make_tuple(in_scores[i], boxes, cls);
        });
    }

    return 0;
}

}  // namespace nvinfer1
================================================ FILE: rcnn/PredictorDecodePlugin.h ================================================ #pragma once #include #include #include #include "macros.h" using namespace nvinfer1; #define PLUGIN_NAME "PredictorDecode" #define PLUGIN_VERSION "1" #define PLUGIN_NAMESPACE "" namespace nvinfer1 { int predictorDecode(int batchSize, const void *const *inputs, void *TRT_CONST_ENQUEUE*outputs, unsigned int num_boxes, unsigned int num_classes, unsigned int image_height, unsigned int image_width, const std::vector& bbox_reg_weights,
void *workspace, size_t workspace_size, cudaStream_t stream); /* input1: scores{N,C,1,1} N->nums C->num of classes input2: boxes{N,C*4,1,1} N->nums C->num of classes input3: proposals{N,4} N->nums output1: scores{N, 1} N->nums output2: boxes{N, 4} N->nums format:XYXY output3: classes{N, 1} N->nums Description: implement fast rcnn decode */ class PredictorDecodePlugin : public IPluginV2Ext { unsigned int _num_boxes; unsigned int _num_classes; unsigned int _image_height; unsigned int _image_width; std::vector _bbox_reg_weights; mutable int size = -1; protected: void deserialize(void const* data, size_t length) { const char* d = static_cast(data); read(d, _num_boxes); read(d, _num_classes); read(d, _image_height); read(d, _image_width); size_t bbox_reg_weights_size; read(d, bbox_reg_weights_size); while (bbox_reg_weights_size--) { float val; read(d, val); _bbox_reg_weights.push_back(val); } } size_t getSerializationSize() const TRT_NOEXCEPT override { return sizeof(_num_boxes) + sizeof(_num_classes) + sizeof(_image_height) + sizeof(_image_width) + sizeof(size_t) + sizeof(float)*_bbox_reg_weights.size(); } void serialize(void *buffer) const TRT_NOEXCEPT override { char* d = static_cast(buffer); write(d, _num_boxes); write(d, _num_classes); write(d, _image_height); write(d, _image_width); write(d, _bbox_reg_weights.size()); for (auto &val : _bbox_reg_weights) { write(d, val); } } public: PredictorDecodePlugin(unsigned int num_boxes, unsigned int image_height, unsigned int image_width, std::vector const& bbox_reg_weights) : _num_boxes(num_boxes), _image_height(image_height), _image_width(image_width), _bbox_reg_weights(bbox_reg_weights) {} PredictorDecodePlugin(unsigned int num_boxes, unsigned int num_classes, unsigned int image_height, unsigned int image_width, std::vector const& bbox_reg_weights) : _num_boxes(num_boxes), _num_classes(num_classes), _image_height(image_height), _image_width(image_width), _bbox_reg_weights(bbox_reg_weights) {} PredictorDecodePlugin(void 
const* data, size_t length) { this->deserialize(data, length); } const char *getPluginType() const TRT_NOEXCEPT override { return PLUGIN_NAME; } const char *getPluginVersion() const TRT_NOEXCEPT override { return PLUGIN_VERSION; } int getNbOutputs() const TRT_NOEXCEPT override { return 3; } Dims getOutputDimensions(int index, const Dims *inputs, int nbInputDims) TRT_NOEXCEPT override { assert(nbInputDims == 3); assert(index < this->getNbOutputs()); return Dims2(_num_boxes, (index == 1 ? 4 : 1)); } bool supportsFormat(DataType type, PluginFormat format) const TRT_NOEXCEPT override { return type == DataType::kFLOAT && format == PluginFormat::kLINEAR; } int initialize() TRT_NOEXCEPT override { return 0; } void terminate() TRT_NOEXCEPT override {} size_t getWorkspaceSize(int maxBatchSize) const TRT_NOEXCEPT override { if (size < 0) { size = predictorDecode(maxBatchSize, nullptr, nullptr, _num_boxes, _num_classes, _image_height, _image_width, _bbox_reg_weights, nullptr, 0, nullptr); } return size; } int enqueue(int batchSize, const void *const *inputs, void *TRT_CONST_ENQUEUE*outputs, void *workspace, cudaStream_t stream) TRT_NOEXCEPT override { return predictorDecode(batchSize, inputs, outputs, _num_boxes, _num_classes, _image_height, _image_width, _bbox_reg_weights, workspace, getWorkspaceSize(batchSize), stream); } void destroy() TRT_NOEXCEPT override { delete this; }; const char *getPluginNamespace() const TRT_NOEXCEPT override { return PLUGIN_NAMESPACE; } void setPluginNamespace(const char *N) TRT_NOEXCEPT override {} // IPluginV2Ext Methods DataType getOutputDataType(int index, const DataType* inputTypes, int nbInputs) const TRT_NOEXCEPT override { assert(index < this->getNbOutputs()); return DataType::kFLOAT; } bool isOutputBroadcastAcrossBatch(int outputIndex, const bool* inputIsBroadcasted, int nbInputs) const TRT_NOEXCEPT override { return false; } bool canBroadcastInputAcrossBatch(int inputIndex) const TRT_NOEXCEPT override { return false; } void 
configurePlugin(const Dims* inputDims, int nbInputs, const Dims* outputDims, int nbOutputs, const DataType* inputTypes, const DataType* outputTypes, const bool* inputIsBroadcast, const bool* outputIsBroadcast, PluginFormat floatFormat, int maxBatchSize) TRT_NOEXCEPT override { assert(*inputTypes == nvinfer1::DataType::kFLOAT && floatFormat == nvinfer1::PluginFormat::kLINEAR); assert(nbInputs == 3); assert(nbOutputs == 3); auto const& scores_dims = inputDims[0]; auto const& boxes_dims = inputDims[1]; auto const& proposals_dims = inputDims[2]; assert(scores_dims.d[0] == _num_boxes); assert(scores_dims.d[0] == boxes_dims.d[0]); assert(scores_dims.d[0] == proposals_dims.d[0]); assert(scores_dims.d[1] * 4 == boxes_dims.d[1]); assert(proposals_dims.d[1] == 4); _num_classes = scores_dims.d[1]; } IPluginV2Ext *clone() const TRT_NOEXCEPT override { return new PredictorDecodePlugin(_num_boxes, _num_classes, _image_height, _image_width, _bbox_reg_weights); } private: template void write(char*& buffer, const T& val) const { *reinterpret_cast(buffer) = val; buffer += sizeof(T); } template void read(const char*& buffer, T& val) { val = *reinterpret_cast(buffer); buffer += sizeof(T); } }; class PredictorDecodePluginCreator : public IPluginCreator { public: PredictorDecodePluginCreator() {} const char *getPluginName() const TRT_NOEXCEPT override { return PLUGIN_NAME; } const char *getPluginVersion() const TRT_NOEXCEPT override { return PLUGIN_VERSION; } const char *getPluginNamespace() const TRT_NOEXCEPT override { return PLUGIN_NAMESPACE; } IPluginV2 *deserializePlugin(const char *name, const void *serialData, size_t serialLength) TRT_NOEXCEPT override { return new PredictorDecodePlugin(serialData, serialLength); } void setPluginNamespace(const char *N) TRT_NOEXCEPT override {} const PluginFieldCollection *getFieldNames() TRT_NOEXCEPT override { return nullptr; } IPluginV2 *createPlugin(const char *name, const PluginFieldCollection *fc) TRT_NOEXCEPT override { return nullptr; } 
}; REGISTER_TENSORRT_PLUGIN(PredictorDecodePluginCreator); } // namespace nvinfer1 #undef PLUGIN_NAME #undef PLUGIN_VERSION #undef PLUGIN_NAMESPACE ================================================ FILE: rcnn/README.md ================================================ # Rcnn The Pytorch implementation is [facebookresearch/detectron2](https://github.com/facebookresearch/detectron2). Now, outputting instance segmentation results on the original image size and selecting different nms methods are available, which is more convenient for engineering applications. ## Models - [x] Faster R-CNN(C4) - [x] Mask R-CNN(C4) ## Test Environment - GTX3090 / Ubuntu20.04 / cuda11 / cudnn8.0.4 / TensorRT8.1.1 / OpenCV4.5 from docker hakuyyf/tensorrtx:trt8_cuda11 - GTX2080Ti / Ubuntu16.04 / cuda10.2 / cudnn8.0.4 / TensorRT7.2.1 / OpenCV4.2 - GTX2080Ti / win10 / cuda10.2 / cudnn8.0.4 / TensorRT7.2.1 / OpenCV4.2 / VS2017 (need to replace function corresponding to the dirent.h and add "--extended-lambda" in CUDA C/C++ -> Command Line -> Other options) TensorRT7.2 is recommended because the Resize layer in 7.0 with kLINEAR mode is a little different from OpenCV. You can also implement data preprocessing outside of TensorRT if you want to use TensorRT7.0 or an earlier version. TensorRT 8.x is supported and you can use it. **The fp32 results match PyTorch to about 4 decimal places**! ## Contributors ## How to Run 1.
generate .wts from pytorch with .pkl or .pth ``` // git clone -b v0.4 https://github.com/facebookresearch/detectron2.git // go to facebookresearch/detectron2 python setup.py build develop // more install information see https://github.com/facebookresearch/detectron2/blob/master/INSTALL.md // download https://dl.fbaipublicfiles.com/detectron2/COCO-Detection/faster_rcnn_R_50_C4_1x/137257644/model_final_721ade.pkl // download https://raw.githubusercontent.com/freedenS/TestImage/main/demo.jpg // copy tensorrtx/rcnn/gen_wts.py and demo.jpg into facebookresearch/detectron2 // ensure cfg.MODEL.WEIGHTS in gen_wts.py is correct // go to facebookresearch/detectron2 python gen_wts.py // a file 'faster.wts' will be generated. ``` 2. build tensorrtx/rcnn and run ``` // put faster.wts into tensorrtx/rcnn // go to tensorrtx/rcnn // update parameters in rcnn.cpp if your model is trained on custom dataset.The parameters are corresponding to config in detectron2. mkdir build cd build cmake .. make sudo ./rcnn -s [.wts] [m] // serialize model to plan file, add m for maskrcnn sudo ./rcnn -d [.engine] [image folder] [m] // deserialize and run inference, the images in [image folder] will be processed. add m for maskrcnn // For example sudo ./rcnn -s faster.wts faster.engine sudo ./rcnn -d faster.engine ../samples // sudo ./rcnn -s mask.wts mask.engine m // sudo ./rcnn -d mask.engine ../samples m ``` 3. check the images generated, as follows. _demo.jpg and so on. 
## Backbone #### R18, R34, R152 ``` // python 1.download pretrained model R18: https://download.pytorch.org/models/resnet18-f37072fd.pth R34: https://download.pytorch.org/models/resnet34-b627a593.pth R50: https://download.pytorch.org/models/resnet50-0676ba61.pth R101: https://download.pytorch.org/models/resnet101-63fe2227.pth R152: https://download.pytorch.org/models/resnet152-394f9c45.pth 2.convert pth to pkl by facebookresearch/detectron2/tools/convert-torchvision-to-d2.py 3.set merge_from_file in gen_wts.py ./configs/COCO-Detection/faster_rcnn_R_50_C4_1x.yaml for fasterRcnn ./configs/COCO-InstanceSegmentation/mask_rcnn_R_50_C4_1x.yaml for maskRcnn 4.set cfg.MODEL.RESNETS.DEPTH = 18(34,50,101,152), cfg.MODEL.RESNETS.STRIDE_IN_1X1 = False, cfg.MODEL.RESNETS.RES2_OUT_CHANNELS = 64, // for R18, R34; 256 for others cfg.MODEL.PIXEL_MEAN = [123.675, 116.280, 103.530], cfg.MODEL.PIXEL_STD = [58.395, 57.120, 57.375], cfg.INPUT.FORMAT = "RGB" and then train your own model 5.generate your wts file. 
// c++ 6.set BACKBONE_RESNETTYPE = R18(R34,R50,R101,R152) in rcnn.cpp line 14 7.modify PIXEL_MEAN and PIXEL_STD in rcnn.cpp 8.set STRIDE_IN_1X1=false in backbone.hpp line 9 9.set other parameters if it's not same with default 10.build your engine, refer to how to run 11.convert your image to RGB before inference ``` #### R50, R101 ``` 1.download pretrained model R50: https://dl.fbaipublicfiles.com/detectron2/COCO-Detection/faster_rcnn_R_50_C4_1x/137257644/model_final_721ade.pkl for fasterRcnn https://dl.fbaipublicfiles.com/detectron2/COCO-InstanceSegmentation/mask_rcnn_R_50_C4_1x/137259246/model_final_9243eb.pkl for maskRcnn R101: https://dl.fbaipublicfiles.com/detectron2/COCO-Detection/faster_rcnn_R_101_C4_3x/138204752/model_final_298dad.pkl for fasterRcnn https://dl.fbaipublicfiles.com/detectron2/COCO-InstanceSegmentation/mask_rcnn_R_101_C4_3x/138363239/model_final_a2914c.pkl for maskRcnn 2.set merge_from_file in gen_wts.py R50-faster: ./configs/COCO-Detection/faster_rcnn_R_50_C4_1x.yaml R101-faster: ./configs/COCO-Detection/faster_rcnn_R_101_C4_3x.yaml R50-mask: ./configs/COCO-InstanceSegmentation/mask_rcnn_R_50_C4_1x.yaml R101-mask: ./configs/COCO-InstanceSegmentation/mask_rcnn_R_101_C4_3x.yaml 3.set BACKBONE_RESNETTYPE = R50(R101) rcnn.cpp line 14 4.set STRIDE_IN_1X1=true in backbone.hpp 5.follow how to run ``` ## NOTE - if you meet the error below, just try to make again. 
The flag has been added in CMakeLists.txt ``` error: __host__ or __device__ annotation on lambda requires --extended-lambda nvcc flag ``` - the image preprocess of sizing and padding was moved out from tensorrt, see DataPreprocess in rcnn.cpp, so the input data is {H, W, C} - now, left-right and top-bottom padding preprocessing is optionally available in preprocessImg of common.hpp, and you can set arbitrary sizes of INPUT_H_ and INPUT_W_ - the predicted boxes are relative to the new image size including padding, so the final boxes need to subtract the padding size and be multiplied by the ratio, see preprocessImg in common.hpp and calculateSize in rcnn.cpp - tensorrt uses a fixed input size; if the size of your data is different from the engine, you need to adjust your data and the result. - if you want to use maskrcnn with cuda10.2, please be sure that you have upgraded cuda to the latest patch. see https://github.com/NVIDIA/TensorRT/issues/1151 for detail. - you can build fasterRcnn with a maskRcnn weights file. - initialize _pre_nms_topk in RpnNmsPlugin, _count in BatchedNmsPlugin and _num_classes in MaskRcnnInferencePlugin inside the class to prevent assertion errors, because the configurePlugin function is called after clone() and before serialize(). one can also set them through the constructor. ## Quantization 1. quantizationType:fp32,fp16,int8. see BuildRcnnModel(rcnn.cpp line 345) for detail. 2. the usage of int8 is the same as [tensorrtx/yolov5](../yolov5/README.md). 
## Latency average cost of doInference(in rcnn.cpp) from second time with batch=1 under the ubuntu environment above, input size: 640(w)*480(h) | | fp32 | fp16 | int8 | | ------------- | ----- | ---- | ---- | | Faster-R50C4 | 138ms | 36ms | 30ms | | Faster-R101C4 | 146ms | 38ms | 32ms | | Mask-R50C4 | 153ms | 44ms | 33ms | | Mask-R101C4 | 168ms | 45ms | 35ms | ## Plugins decode and nms plugins are modified from [retinanet-examples](https://github.com/NVIDIA/retinanet-examples/tree/master/csrc/plugins) - RpnDecodePlugin: calculate coordinates of proposals which is the first n ``` parameters: top_n: num of proposals to select anchors: coordinates of all anchors stride: stride of current feature map image_height: image height after DataPreprocess for clipping the box beyond the boundary image_width: image width after DataPreprocess for clipping the box beyond the boundary Inputs: scores{C,H,W} C is number of anchors, H and W are the size of feature map boxes{C,H,W} C is 4*number of anchors, H and W are the size of feature map Outputs: scores{C,1} C is equal to top_n boxes{C,4} C is equal to top_n ``` - RpnNmsPlugin: apply nms to proposals ``` parameters: nms_thresh: thresh of nms post_nms_topk: number of proposals to select Inputs: scores{C,1} C is equal to top_n boxes{C,4} C is equal to top_n Outputs: boxes{C,4} C is equal to post_nms_topk ``` - RoiAlignPlugin: implement of RoiAlign(align=True). 
see https://github.com/facebookresearch/detectron2/blob/f50ec07cf220982e2c4861c5a9a17c4864ab5bfd/detectron2/layers/roi_align.py#L7 for detail ``` parameters: pooler_resolution: output size spatial_scale: scale the input boxes by this number sampling_ratio: number of inputs samples to take for each output num_proposals: number of proposals Inputs: boxes{N,4} N is number of boxes features{C,H,W} C is channels of feature map, H and W are sizes of feature map Outputs: features{N,C,H,W} N is number of boxes, C is channels of feature map, H and W are equal to pooler_resolution ``` - PredictorDecodePlugin: calculate coordinates of predicted boxes by applying delta to proposals ``` parameters: num_boxes: num of proposals image_height: image height after DataPreprocess for clipping the box beyond the boundary image_width: image width after DataPreprocess for clipping the box beyond the boundary bbox_reg_weights: the weights for dx,dy,dw,dh. see https://github.com/facebookresearch/detectron2/blob/master/detectron2/config/defaults.py#L292 for detail Inputs: scores{N,C,1,1} N is equal to num_boxes, C is the num of classes boxes{N,C*4,1,1} N is equal to num_boxes, C is the num of classes proposals{N,4} N is equal to num_boxes Outputs: scores{N,1} N is equal to num_boxes boxes{N,4} N is equal to num_boxes classes{N,1} N is equal to num_boxes ``` - BatchedNmsPlugin: apply nms to predicted boxes with different classes. same with https://github.com/facebookresearch/detectron2/blob/master/detectron2/layers/nms.py#L19 ``` parameters: nms_thresh: thresh of nms detections_per_im: number of detections to return per image Inputs: scores{N,1} N is the number of the boxes boxes{N,4} N is the number of the boxes classes{N,1} N is the number of the boxes Outputs: scores{N,1} N is equal to detections_per_im boxes{N,4} N is equal to detections_per_im classes{N,1} N is equal to detections_per_im ``` - MaskRcnnInferencePlugin: extract the masks for the predicted classes and do sigmoid. 
same with https://github.com/facebookresearch/detectron2/blob/9c7f8a142216ebc52d3617c11f8fafd75b74e637/detectron2/modeling/roi_heads/mask_head.py#L114 ``` parameters: detections_per_im: number of detections to return per image output_size: same with output size of RoiAlign Inputs: indices{N,1} N is the number of the predicted boxes masks{N,C,H,W} N is the number of the predicted boxes Outputs: selected_masks{N,1,H,W} N is the number of the predicted boxes, H and W is equal to output_size ``` ================================================ FILE: rcnn/RoiAlign.cu ================================================ #include #include #include #include #include #include #include #include #include #include #include #include #include "RoiAlignPlugin.h" #include "./cuda_utils.h" #include "macros.h" #ifdef CUDA_11 #include #include #else #include #include namespace cub = thrust::cuda_cub::cub; #endif
namespace nvinfer1 {

// Bilinearly samples one height x width feature plane at the real-valued
// location (y, x).
// Locations more than one pixel outside the plane yield 0; locations within
// that one-pixel margin are clamped onto the border before interpolating.
template __device__ T bilinear_interpolate(
    const T* bottom_data,
    const int height, const int width,
    T y, T x) {
    // deal with cases that inverse elements are out of feature map boundary
    if (y < -1.0 || y > height || x < -1.0 || x > width) {
        // empty
        return 0;
    }

    if (y <= 0) {
        y = 0;
    }
    if (x <= 0) {
        x = 0;
    }

    int y_low = static_cast(y);
    int x_low = static_cast(x);
    int y_high;
    int x_high;

    // On the last row/column both neighbours collapse onto the border pixel.
    if (y_low >= height - 1) {
        y_high = y_low = height - 1;
        y = (T)y_low;
    } else {
        y_high = y_low + 1;
    }

    if (x_low >= width - 1) {
        x_high = x_low = width - 1;
        x = (T)x_low;
    } else {
        x_high = x_low + 1;
    }

    // Fractional distances to the low neighbours and their complements.
    T ly = y - y_low;
    T lx = x - x_low;
    T hy = 1. - ly, hx = 1. - lx;
    // do bilinear interpolation
    T v1 = bottom_data[y_low * width + x_low];
    T v2 = bottom_data[y_low * width + x_high];
    T v3 = bottom_data[y_high * width + x_low];
    T v4 = bottom_data[y_high * width + x_high];
    T w1 = hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx;

    T val = w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4;  // mode Avg

    return val;
}

// Average-pooling RoIAlign: each output element (n, c, ph, pw) is the mean of
// bilinear samples taken on a regular grid inside bin (ph, pw) of ROI n on
// channel c of the feature map.
//
//   nthreads        number of output elements
//   bottom_data     feature map, indexed as [c][h][w]
//   spatial_scale   factor applied to ROI coordinates before sampling
//   sampling_ratio  samples per bin side; when <= 0 it is derived from the
//                   ROI size as ceil(roi_size / pooled_size)
//   bottom_rois     one (x1, y1, x2, y2) box per ROI
//   top_data        output, indexed as [n][c][ph][pw]
__global__ void RoIAlignForward(
    const int nthreads,
    const float* bottom_data,
    const float spatial_scale,
    const int channels,
    const int height, const int width,
    const int pooled_height, const int pooled_width,
    const int sampling_ratio,
    const float4* bottom_rois,
    float* top_data) {
    for (size_t index = blockIdx.x * blockDim.x + threadIdx.x; index < nthreads; index += blockDim.x * gridDim.x) {
        // (n, c, ph, pw) is an element in the pooled output
        int pw = index % pooled_width;
        int ph = (index / pooled_width) % pooled_height;
        int c = (index / pooled_width / pooled_height) % channels;
        int n = index / pooled_width / pooled_height / channels;

        const float4* offset_bottom_rois = bottom_rois + n;

        // Do not using rounding; this implementation detail is critical
        float roi_offset = 0.5f;
        float roi_start_w = offset_bottom_rois->x * spatial_scale - roi_offset;
        float roi_start_h = offset_bottom_rois->y * spatial_scale - roi_offset;
        float roi_end_w = offset_bottom_rois->z * spatial_scale - roi_offset;
        float roi_end_h = offset_bottom_rois->w * spatial_scale - roi_offset;

        float roi_width = roi_end_w - roi_start_w;
        float roi_height = roi_end_h - roi_start_h;
        float bin_size_h = static_cast(roi_height) / static_cast(pooled_height);
        float bin_size_w = static_cast(roi_width) / static_cast(pooled_width);

        const float* offset_bottom_data =
            bottom_data + static_cast(c * height * width);

        // We use roi_bin_grid to sample the grid and mimic integral
        int roi_bin_grid_h = (sampling_ratio > 0)
            ? sampling_ratio
            : ceil(roi_height / pooled_height);  // e.g., = 2
        int roi_bin_grid_w =
            (sampling_ratio > 0) ? sampling_ratio : ceil(roi_width / pooled_width);

        // We do average (integral) pooling inside a bin.
        // A degenerate ROI (zero width or height) with an adaptive sampling
        // ratio gives an empty grid; clamp the divisor to 1 so the result is
        // 0 instead of 0/0 = NaN.
        const float count = max(roi_bin_grid_h * roi_bin_grid_w, 1);  // e.g. = 4

        float output_val = 0.f;
        // bool max_flag = false;
        // e.g., iy = 0, 1
        for (int iy = 0; iy < roi_bin_grid_h; iy++) {
            const float y = roi_start_h + ph * bin_size_h +
                static_cast(iy + .5f) * bin_size_h /
                static_cast(roi_bin_grid_h);  // e.g., 0.5, 1.5
            for (int ix = 0; ix < roi_bin_grid_w; ix++) {
                const float x = roi_start_w + pw * bin_size_w +
                    static_cast(ix + .5f) * bin_size_w /
                    static_cast(roi_bin_grid_w);

                float val = bilinear_interpolate(
                    offset_bottom_data, height, width, y, x);
                output_val += val;
            }
        }
        output_val /= count;

        top_data[index] = output_val;
    }
}

// Host entry point used by RoiAlignPlugin::enqueue.
// Launches RoIAlignForward once per batch item, advancing the box, feature and
// output pointers by one image worth of elements each time, and waits for the
// device after every launch. Always returns 0.
int roiAlign(int batchSize, const void *const *inputs, void *TRT_CONST_ENQUEUE*outputs, int pooler_resolution,
    float spatial_scale, int sampling_ratio, int num_proposals, int out_channels, int feature_h, int feature_w,
    cudaStream_t stream) {
    for (int batch = 0; batch < batchSize; batch++) {
        auto in_boxes = static_cast(inputs[0]) + batch * num_proposals;
        auto in_features = static_cast(inputs[1]) + batch * out_channels * feature_h * feature_w;

        // One thread per pooled output element.
        int nthreads = num_proposals * out_channels * pooler_resolution * pooler_resolution;
        auto out_features = static_cast(outputs[0]) + batch * nthreads;

        const int max_threads = 1024;
        int blocksPerGrid = ceil(static_cast(nthreads) / max_threads);

        // NOTE(review): the launch configuration between the angle brackets is
        // not visible in this copy - confirm grid/block/stream arguments.
        RoIAlignForward<< > > (
            nthreads,
            in_features,
            spatial_scale,
            out_channels,
            feature_h,
            feature_w,
            pooler_resolution,
            pooler_resolution,
            sampling_ratio,
            in_boxes,
            out_features);

        cudaDeviceSynchronize();
    }

    return 0;
}

}  // namespace nvinfer1
================================================ FILE: rcnn/RoiAlignPlugin.h ================================================ #pragma once #include #include #include #include "macros.h" using namespace nvinfer1; #define PLUGIN_NAME "RoiAlign" #define PLUGIN_VERSION "1" #define PLUGIN_NAMESPACE "" namespace nvinfer1 { int roiAlign(int batchSize,
const void *const *inputs, void *TRT_CONST_ENQUEUE*outputs, int pooler_resolution, float spatial_scale, int sampling_ratio, int num_proposals, int out_channels, int feature_h, int feature_w, cudaStream_t stream); /* input1: boxes{N,4} N->post_nms_topk input2: features{C,H,W} C->num of feature map channels output1: features{N, C, H, W} N:nums of proposals C:output out_channels H,W:roialign size Description: roialign */ class RoiAlignPlugin : public IPluginV2Ext { int _pooler_resolution; float _spatial_scale; int _sampling_ratio; int _num_proposals; int _out_channels; int _feature_h; int _feature_w; protected: void deserialize(void const* data, size_t length) { const char* d = static_cast(data); read(d, _pooler_resolution); read(d, _spatial_scale); read(d, _sampling_ratio); read(d, _num_proposals); read(d, _out_channels); read(d, _feature_h); read(d, _feature_w); } size_t getSerializationSize() const TRT_NOEXCEPT override { return sizeof(_pooler_resolution) + sizeof(_spatial_scale) + sizeof(_sampling_ratio) + sizeof(_num_proposals) + sizeof(_out_channels) + sizeof(_feature_h) + sizeof(_feature_w); } void serialize(void *buffer) const TRT_NOEXCEPT override { char* d = static_cast(buffer); write(d, _pooler_resolution); write(d, _spatial_scale); write(d, _sampling_ratio); write(d, _num_proposals); write(d, _out_channels); write(d, _feature_h); write(d, _feature_w); } public: RoiAlignPlugin(int pooler_resolution, float spatial_scale, int sampling_ratio, int num_proposals, int out_channels) : _pooler_resolution(pooler_resolution), _spatial_scale(spatial_scale), _sampling_ratio(sampling_ratio), _num_proposals(num_proposals), _out_channels(out_channels) {} RoiAlignPlugin(int pooler_resolution, float spatial_scale, int sampling_ratio, int num_proposals, int out_channels, int feature_h, int feature_w) : _pooler_resolution(pooler_resolution), _spatial_scale(spatial_scale), _sampling_ratio(sampling_ratio), _num_proposals(num_proposals), _out_channels(out_channels), 
_feature_h(feature_h), _feature_w(feature_w) {} RoiAlignPlugin(void const* data, size_t length) { this->deserialize(data, length); } const char *getPluginType() const TRT_NOEXCEPT override { return PLUGIN_NAME; } const char *getPluginVersion() const TRT_NOEXCEPT override { return PLUGIN_VERSION; } int getNbOutputs() const TRT_NOEXCEPT override { return 1; } Dims getOutputDimensions(int index, const Dims *inputs, int nbInputDims) TRT_NOEXCEPT override { assert(index < this->getNbOutputs()); return Dims4(_num_proposals, _out_channels, _pooler_resolution, _pooler_resolution); } bool supportsFormat(DataType type, PluginFormat format) const TRT_NOEXCEPT override { return type == DataType::kFLOAT && format == PluginFormat::kLINEAR; } int initialize() TRT_NOEXCEPT override { return 0; } void terminate() TRT_NOEXCEPT override {} size_t getWorkspaceSize(int maxBatchSize) const TRT_NOEXCEPT override { return 0; } int enqueue(int batchSize, const void *const *inputs, void *TRT_CONST_ENQUEUE*outputs, void *workspace, cudaStream_t stream) TRT_NOEXCEPT override { return roiAlign(batchSize, inputs, outputs, _pooler_resolution, _spatial_scale, _sampling_ratio, _num_proposals, _out_channels, _feature_h, _feature_w, stream); } void destroy() TRT_NOEXCEPT override { delete this; }; const char *getPluginNamespace() const TRT_NOEXCEPT override { return PLUGIN_NAMESPACE; } void setPluginNamespace(const char *N) TRT_NOEXCEPT override { } // IPluginV2Ext Methods DataType getOutputDataType(int index, const DataType* inputTypes, int nbInputs) const TRT_NOEXCEPT override { assert(index < this->getNbOutputs()); return DataType::kFLOAT; } bool isOutputBroadcastAcrossBatch(int outputIndex, const bool* inputIsBroadcasted, int nbInputs) const TRT_NOEXCEPT override { return false; } bool canBroadcastInputAcrossBatch(int inputIndex) const TRT_NOEXCEPT override { return false; } void configurePlugin(const Dims* inputDims, int nbInputs, const Dims* outputDims, int nbOutputs, const DataType* 
inputTypes, const DataType* outputTypes, const bool* inputIsBroadcast, const bool* outputIsBroadcast, PluginFormat floatFormat, int maxBatchSize) TRT_NOEXCEPT override { assert(*inputTypes == nvinfer1::DataType::kFLOAT && floatFormat == nvinfer1::PluginFormat::kLINEAR); assert(nbInputs == 2); assert(nbOutputs == 1); auto const& boxes_dims = inputDims[0]; auto const& feature_dims = inputDims[1]; assert(_num_proposals == boxes_dims.d[0]); assert(_out_channels == feature_dims.d[0]); _feature_h = feature_dims.d[1]; _feature_w = feature_dims.d[2]; } IPluginV2Ext *clone() const TRT_NOEXCEPT override { return new RoiAlignPlugin(_pooler_resolution, _spatial_scale, _sampling_ratio, _num_proposals, _out_channels, _feature_h, _feature_w); } private: template void write(char*& buffer, const T& val) const { *reinterpret_cast(buffer) = val; buffer += sizeof(T); } template void read(const char*& buffer, T& val) { val = *reinterpret_cast(buffer); buffer += sizeof(T); } }; class RoiAlignPluginCreator : public IPluginCreator { public: RoiAlignPluginCreator() {} const char *getPluginName() const TRT_NOEXCEPT override { return PLUGIN_NAME; } const char *getPluginVersion() const TRT_NOEXCEPT override { return PLUGIN_VERSION; } const char *getPluginNamespace() const TRT_NOEXCEPT override { return PLUGIN_NAMESPACE; } IPluginV2 *deserializePlugin(const char *name, const void *serialData, size_t serialLength) TRT_NOEXCEPT override { return new RoiAlignPlugin(serialData, serialLength); } void setPluginNamespace(const char *N) TRT_NOEXCEPT override {} const PluginFieldCollection *getFieldNames() TRT_NOEXCEPT override { return nullptr; } IPluginV2 *createPlugin(const char *name, const PluginFieldCollection *fc) TRT_NOEXCEPT override { return nullptr; } }; REGISTER_TENSORRT_PLUGIN(RoiAlignPluginCreator); } // namespace nvinfer1 #undef PLUGIN_NAME #undef PLUGIN_VERSION #undef PLUGIN_NAMESPACE ================================================ FILE: rcnn/RpnDecode.cu 
================================================ #include #include #include #include #include #include #include #include #include #include "RpnDecodePlugin.h" #include "./cuda_utils.h" #include "macros.h" #ifdef CUDA_11 #include #include #else #include #include namespace cub = thrust::cuda_cub::cub; #endif namespace nvinfer1 { int rpnDecode(int batch_size, const void *const *inputs, void *TRT_CONST_ENQUEUE*outputs, size_t height, size_t width, size_t image_height, size_t image_width, float stride, const std::vector &anchors, int top_n, void *workspace, size_t workspace_size, cudaStream_t stream) { size_t num_anchors = anchors.size() / 4; int scores_size = num_anchors * height * width; if (!workspace || !workspace_size) { // Return required scratch space size cub style workspace_size = get_size_aligned(anchors.size()); // anchors workspace_size += get_size_aligned(scores_size); // indices workspace_size += get_size_aligned(scores_size); // indices_sorted workspace_size += get_size_aligned(scores_size); // scores_sorted size_t temp_size_sort = 0; if (scores_size > top_n) { cub::DeviceRadixSort::SortPairsDescending( static_cast(nullptr), temp_size_sort, static_cast(nullptr), static_cast(nullptr), static_cast(nullptr), static_cast(nullptr), scores_size); workspace_size += temp_size_sort; } return workspace_size; } auto anchors_d = get_next_ptr(anchors.size(), workspace, workspace_size); cudaMemcpyAsync(anchors_d, anchors.data(), anchors.size() * sizeof *anchors_d, cudaMemcpyHostToDevice, stream); auto on_stream = thrust::cuda::par.on(stream); auto indices = get_next_ptr(scores_size, workspace, workspace_size); // TODO: how to generate sequence on gpu directly? 
std::vector indices_h(scores_size); for (int i = 0; i < scores_size; i++) indices_h[i] = i; cudaMemcpyAsync(indices, indices_h.data(), scores_size * sizeof * indices, cudaMemcpyHostToDevice, stream); auto indices_sorted = get_next_ptr(scores_size, workspace, workspace_size); auto scores_sorted = get_next_ptr(scores_size, workspace, workspace_size); for (int batch = 0; batch < batch_size; batch++) { auto in_scores = static_cast(inputs[0]) + batch * scores_size; auto in_boxes = static_cast(inputs[1]) + batch * scores_size * 4; auto out_scores = static_cast(outputs[0]) + batch * top_n; auto out_boxes = static_cast(outputs[1]) + batch * top_n; // Only keep top n scores int num_detections = scores_size; auto indices_filtered = indices; if (num_detections > top_n) { cub::DeviceRadixSort::SortPairsDescending(workspace, workspace_size, in_scores, scores_sorted, indices, indices_sorted, scores_size, 0, sizeof(*scores_sorted) * 8, stream); indices_filtered = indices_sorted; num_detections = top_n; } // Gather boxes bool has_anchors = !anchors.empty(); thrust::transform(on_stream, indices_filtered, indices_filtered + num_detections, thrust::make_zip_iterator(thrust::make_tuple(out_scores, out_boxes)), [=] __device__(int i) { int x = i % width; int y = (i / width) % height; int a = (i / height / width) % num_anchors; float4 box = float4{ in_boxes[((a * 4 + 0) * height + y) * width + x], in_boxes[((a * 4 + 1) * height + y) * width + x], in_boxes[((a * 4 + 2) * height + y) * width + x], in_boxes[((a * 4 + 3) * height + y) * width + x] }; if (has_anchors) { // Add anchors offsets to deltas float x = (i % width) * stride; float y = ((i / width) % height) * stride; float *d = anchors_d + 4 * a; float x1 = x + d[0]; float y1 = y + d[1]; float x2 = x + d[2]; float y2 = y + d[3]; float w = x2 - x1; float h = y2 - y1; float pred_ctr_x = box.x * w + x1 + 0.5f * w; float pred_ctr_y = box.y * h + y1 + 0.5f * h; float pred_w = exp(box.z) * w; float pred_h = exp(box.w) * h; // TODO: set 
image size as parameter box = float4{ max(0.0f, pred_ctr_x - 0.5f * pred_w), max(0.0f, pred_ctr_y - 0.5f * pred_h), min(pred_ctr_x + 0.5f * pred_w, static_cast(image_width)), min(pred_ctr_y + 0.5f * pred_h, static_cast(image_height)) }; } // filter empty boxes if (box.z - box.x <= 0.0f || box.w - box.y <= 0.0f) return thrust::make_tuple(-FLT_MAX, box); else return thrust::make_tuple(in_scores[i], box); }); // Zero-out unused scores if (num_detections < top_n) { thrust::fill(on_stream, out_scores + num_detections, out_scores + top_n, -FLT_MAX); } } return 0; } } // namespace nvinfer1 ================================================ FILE: rcnn/RpnDecodePlugin.h ================================================ #pragma once #include #include #include #include "macros.h" using namespace nvinfer1; #define PLUGIN_NAME "RpnDecode" #define PLUGIN_VERSION "1" #define PLUGIN_NAMESPACE "" namespace nvinfer1 { int rpnDecode(int batchSize, const void *const *inputs, void *TRT_CONST_ENQUEUE*outputs, size_t height, size_t width, size_t image_height, size_t image_width, float stride, const std::vector &anchors, int top_n, void *workspace, size_t workspace_size, cudaStream_t stream); /* input1: scores{C,H,W} C->anchors input2: boxes{C,H,W} C->4*anchors output1: scores{C, 1} C->topk output2: boxes{C, 4} C->topk format:XYXY Description: implement anchor decode */ class RpnDecodePlugin : public IPluginV2Ext { int _top_n; std::vector _anchors; float _stride; size_t _height; size_t _width; size_t _image_height; // for cliping the boxes by limiting y coordinates to the range [0, height] size_t _image_width; // for cliping the boxes by limiting x coordinates to the range [0, width] mutable int size = -1; protected: void deserialize(void const* data, size_t length) { const char* d = static_cast(data); read(d, _top_n); size_t anchors_size; read(d, anchors_size); while (anchors_size--) { float val; read(d, val); _anchors.push_back(val); } read(d, _stride); read(d, _height); read(d, _width); 
read(d, _image_height); read(d, _image_width); } size_t getSerializationSize() const TRT_NOEXCEPT override { return sizeof(_top_n) + sizeof(size_t) + sizeof(float) * _anchors.size() + sizeof(_stride) + sizeof(_height) + sizeof(_width) + sizeof(_image_height) + sizeof(_image_width); } void serialize(void *buffer) const TRT_NOEXCEPT override { char* d = static_cast(buffer); write(d, _top_n); write(d, _anchors.size()); for (auto &val : _anchors) { write(d, val); } write(d, _stride); write(d, _height); write(d, _width); write(d, _image_height); write(d, _image_width); } public: RpnDecodePlugin(int top_n, std::vector const& anchors, float stride, size_t image_height, size_t image_width) : _top_n(top_n), _anchors(anchors), _stride(stride), _image_height(image_height), _image_width(image_width) {} RpnDecodePlugin(int top_n, std::vector const& anchors, float stride, size_t height, size_t width, size_t image_height, size_t image_width) : _top_n(top_n), _anchors(anchors), _stride(stride), _height(height), _width(width), _image_height(image_height), _image_width(image_width) {} RpnDecodePlugin(void const* data, size_t length) { this->deserialize(data, length); } const char *getPluginType() const TRT_NOEXCEPT override { return PLUGIN_NAME; } const char *getPluginVersion() const TRT_NOEXCEPT override { return PLUGIN_VERSION; } int getNbOutputs() const TRT_NOEXCEPT override { return 2; } Dims getOutputDimensions(int index, const Dims *inputs, int nbInputDims) TRT_NOEXCEPT override { assert(nbInputDims == 2); assert(index < this->getNbOutputs()); return Dims2(_top_n, (index == 1 ? 
4 : 1)); } bool supportsFormat(DataType type, PluginFormat format) const TRT_NOEXCEPT override { return type == DataType::kFLOAT && format == PluginFormat::kLINEAR; } int initialize() TRT_NOEXCEPT override { return 0; } void terminate() TRT_NOEXCEPT override {} size_t getWorkspaceSize(int maxBatchSize) const TRT_NOEXCEPT override { if (size < 0) { size = rpnDecode(maxBatchSize, nullptr, nullptr, _height, _width, _image_height, _image_width, _stride, _anchors, _top_n, nullptr, 0, nullptr); } return size; } int enqueue(int batchSize, const void *const *inputs, void *TRT_CONST_ENQUEUE*outputs, void *workspace, cudaStream_t stream) TRT_NOEXCEPT override { return rpnDecode(batchSize, inputs, outputs, _height, _width, _image_height, _image_width, _stride, _anchors, _top_n, workspace, getWorkspaceSize(batchSize), stream); } void destroy() TRT_NOEXCEPT override { delete this; }; const char *getPluginNamespace() const TRT_NOEXCEPT override { return PLUGIN_NAMESPACE; } void setPluginNamespace(const char *N) TRT_NOEXCEPT override { } // IPluginV2Ext Methods DataType getOutputDataType(int index, const DataType* inputTypes, int nbInputs) const TRT_NOEXCEPT override { assert(index < 3); return DataType::kFLOAT; } bool isOutputBroadcastAcrossBatch(int outputIndex, const bool* inputIsBroadcasted, int nbInputs) const TRT_NOEXCEPT override { return false; } bool canBroadcastInputAcrossBatch(int inputIndex) const TRT_NOEXCEPT override { return false; } void configurePlugin(const Dims* inputDims, int nbInputs, const Dims* outputDims, int nbOutputs, const DataType* inputTypes, const DataType* outputTypes, const bool* inputIsBroadcast, const bool* outputIsBroadcast, PluginFormat floatFormat, int maxBatchSize) TRT_NOEXCEPT override { assert(*inputTypes == nvinfer1::DataType::kFLOAT && floatFormat == nvinfer1::PluginFormat::kLINEAR); assert(nbInputs == 2); assert(nbOutputs == 2); auto const& scores_dims = inputDims[0]; auto const& boxes_dims = inputDims[1]; assert(scores_dims.d[1] == 
boxes_dims.d[1]); assert(scores_dims.d[2] == boxes_dims.d[2]); _height = scores_dims.d[1]; _width = scores_dims.d[2]; } IPluginV2Ext *clone() const TRT_NOEXCEPT override { return new RpnDecodePlugin(_top_n, _anchors, _stride, _height, _width, _image_height, _image_width); } private: template void write(char*& buffer, const T& val) const { *reinterpret_cast(buffer) = val; buffer += sizeof(T); } template void read(const char*& buffer, T& val) { val = *reinterpret_cast(buffer); buffer += sizeof(T); } }; class RpnDecodePluginCreator : public IPluginCreator { public: RpnDecodePluginCreator() {} const char *getPluginName() const TRT_NOEXCEPT override { return PLUGIN_NAME; } const char *getPluginVersion() const TRT_NOEXCEPT override { return PLUGIN_VERSION; } const char *getPluginNamespace() const TRT_NOEXCEPT override { return PLUGIN_NAMESPACE; } IPluginV2 *deserializePlugin(const char *name, const void *serialData, size_t serialLength) TRT_NOEXCEPT override { return new RpnDecodePlugin(serialData, serialLength); } void setPluginNamespace(const char *N) TRT_NOEXCEPT override {} const PluginFieldCollection *getFieldNames() TRT_NOEXCEPT override { return nullptr; } IPluginV2 *createPlugin(const char *name, const PluginFieldCollection *fc) TRT_NOEXCEPT override { return nullptr; } }; REGISTER_TENSORRT_PLUGIN(RpnDecodePluginCreator); } // namespace nvinfer1 #undef PLUGIN_NAME #undef PLUGIN_VERSION #undef PLUGIN_NAMESPACE ================================================ FILE: rcnn/RpnNms.cu ================================================ #include #include #include #include #include #include #include #include #include #include "RpnNmsPlugin.h" #include "./cuda_utils.h" #include "macros.h" #ifdef CUDA_11 #include #include #else #include #include namespace cub = thrust::cuda_cub::cub; #endif namespace nvinfer1 { __global__ void rpn_nms_kernel( const float threshold, const int num_detections, const int *indices, float *scores, const float4 *boxes) { // Go through detections by 
descending score for (int m = 0; m < num_detections; m++) { int i = blockIdx.x * blockDim.x + threadIdx.x; if (i < num_detections && m < i && scores[m] > -FLT_MAX) { int idx = indices[i]; int max_idx = indices[m]; float4 ibox = boxes[idx]; float4 mbox = boxes[max_idx]; float x1 = max(ibox.x, mbox.x); float y1 = max(ibox.y, mbox.y); float x2 = min(ibox.z, mbox.z); float y2 = min(ibox.w, mbox.w); float w = max(0.0f, x2 - x1); float h = max(0.0f, y2 - y1); float iarea = (ibox.z - ibox.x) * (ibox.w - ibox.y); float marea = (mbox.z - mbox.x) * (mbox.w - mbox.y); float inter = w * h; float overlap = inter / (iarea + marea - inter); if (overlap > threshold) { scores[i] = -FLT_MAX; } } // Sync discarded detections __syncthreads(); } } int rpnNms(int batch_size, const void *const *inputs, void *TRT_CONST_ENQUEUE*outputs, size_t pre_nms_topk, int post_nms_topk, float nms_thresh, void *workspace, size_t workspace_size, cudaStream_t stream) { if (!workspace || !workspace_size) { // Return required scratch space size cub style workspace_size += get_size_aligned(pre_nms_topk); // indices workspace_size += get_size_aligned(pre_nms_topk); // indices_sorted workspace_size += get_size_aligned(pre_nms_topk); // scores workspace_size += get_size_aligned(pre_nms_topk); // scores_sorted size_t temp_size_sort = 0; cub::DeviceRadixSort::SortPairsDescending( static_cast(nullptr), temp_size_sort, static_cast(nullptr), static_cast(nullptr), static_cast(nullptr), static_cast(nullptr), pre_nms_topk); workspace_size += temp_size_sort; return workspace_size; } auto on_stream = thrust::cuda::par.on(stream); auto indices = get_next_ptr(pre_nms_topk, workspace, workspace_size); std::vector indices_h(pre_nms_topk); for (int i = 0; i < pre_nms_topk; i++) indices_h[i] = i; cudaMemcpyAsync(indices, indices_h.data(), pre_nms_topk * sizeof * indices, cudaMemcpyHostToDevice, stream); auto indices_sorted = get_next_ptr(pre_nms_topk, workspace, workspace_size); auto scores = get_next_ptr(pre_nms_topk, 
workspace, workspace_size); auto scores_sorted = get_next_ptr(pre_nms_topk, workspace, workspace_size); for (int batch = 0; batch < batch_size; batch++) { auto in_scores = static_cast(inputs[0]) + batch * pre_nms_topk; auto in_boxes = static_cast(inputs[1]) + batch * pre_nms_topk; auto out_boxes = static_cast(outputs[0]) + batch * post_nms_topk; int num_detections = pre_nms_topk; cub::DeviceRadixSort::SortPairsDescending(workspace, workspace_size, in_scores, scores_sorted, indices, indices_sorted, num_detections, 0, sizeof(*scores_sorted) * 8, stream); // Launch actual NMS kernel - 1 block with each thread handling n detections // TODO: different device has differnet max threads const int max_threads = 1024; int num_per_thread = ceil(static_cast(num_detections) / max_threads); rpn_nms_kernel << > > (nms_thresh, num_detections, indices_sorted, scores_sorted, in_boxes); // Re-sort with updated scores cub::DeviceRadixSort::SortPairsDescending(workspace, workspace_size, scores_sorted, scores, indices_sorted, indices, num_detections, 0, sizeof(*scores_sorted) * 8, stream); // Gather filtered scores, boxes, classes num_detections = min(post_nms_topk, num_detections); thrust::gather(on_stream, indices, indices + num_detections, in_boxes, out_boxes); } return 0; } } // namespace nvinfer1 ================================================ FILE: rcnn/RpnNmsPlugin.h ================================================ #pragma once #include #include #include #include "macros.h" using namespace nvinfer1; #define PLUGIN_NAME "RpnNms" #define PLUGIN_VERSION "1" #define PLUGIN_NAMESPACE "" namespace nvinfer1 { int rpnNms(int batchSize, const void *const *inputs, void *TRT_CONST_ENQUEUE*outputs, size_t pre_nms_topk, int post_nms_topk, float nms_thresh, void *workspace, size_t workspace_size, cudaStream_t stream); /* input1: scores{C, 1} C->pre_nms_topk input2: boxes{C, 4} C->pre_nms_topk format:XYXY output1: boxes{C, 4} C->post_nms_topk format:XYXY Description: implement rpn nms */ class 
RpnNmsPlugin : public IPluginV2Ext { float _nms_thresh; int _post_nms_topk; size_t _pre_nms_topk = 1; mutable int size = -1; protected: void deserialize(void const* data, size_t length) { const char* d = static_cast(data); read(d, _nms_thresh); read(d, _post_nms_topk); read(d, _pre_nms_topk); } size_t getSerializationSize() const TRT_NOEXCEPT override { return sizeof(_nms_thresh) + sizeof(_post_nms_topk) + sizeof(_pre_nms_topk); } void serialize(void *buffer) const TRT_NOEXCEPT override { char* d = static_cast(buffer); write(d, _nms_thresh); write(d, _post_nms_topk); write(d, _pre_nms_topk); } public: RpnNmsPlugin(float nms_thresh, int post_nms_topk) : _nms_thresh(nms_thresh), _post_nms_topk(post_nms_topk) { assert(nms_thresh > 0); assert(post_nms_topk > 0); } RpnNmsPlugin(float nms_thresh, int post_nms_topk, size_t pre_nms_topk) : _nms_thresh(nms_thresh), _post_nms_topk(post_nms_topk), _pre_nms_topk(pre_nms_topk) { assert(nms_thresh > 0); assert(post_nms_topk > 0); assert(pre_nms_topk > 0); } RpnNmsPlugin(void const* data, size_t length) { this->deserialize(data, length); } const char *getPluginType() const TRT_NOEXCEPT override { return PLUGIN_NAME; } const char *getPluginVersion() const TRT_NOEXCEPT override { return PLUGIN_VERSION; } int getNbOutputs() const TRT_NOEXCEPT override { return 1; } Dims getOutputDimensions(int index, const Dims *inputs, int nbInputDims) TRT_NOEXCEPT override { assert(nbInputDims == 2); assert(index < this->getNbOutputs()); return Dims2(_post_nms_topk, 4); } bool supportsFormat(DataType type, PluginFormat format) const TRT_NOEXCEPT override { return type == DataType::kFLOAT && format == PluginFormat::kLINEAR; } int initialize() TRT_NOEXCEPT override { return 0; } void terminate() TRT_NOEXCEPT override {} size_t getWorkspaceSize(int maxBatchSize) const TRT_NOEXCEPT override { if (size < 0) { size = rpnNms(maxBatchSize, nullptr, nullptr, _pre_nms_topk, _post_nms_topk, _nms_thresh, nullptr, 0, nullptr); } return size; } int enqueue(int 
batchSize, const void *const *inputs, void *TRT_CONST_ENQUEUE*outputs, void *workspace, cudaStream_t stream) TRT_NOEXCEPT override { return rpnNms(batchSize, inputs, outputs, _pre_nms_topk, _post_nms_topk, _nms_thresh, workspace, getWorkspaceSize(batchSize), stream); } void destroy() TRT_NOEXCEPT override { delete this; } const char *getPluginNamespace() const TRT_NOEXCEPT override { return PLUGIN_NAMESPACE; } void setPluginNamespace(const char *N) TRT_NOEXCEPT override { } // IPluginV2Ext Methods DataType getOutputDataType(int index, const DataType* inputTypes, int nbInputs) const TRT_NOEXCEPT override { assert(index < 1); return DataType::kFLOAT; } bool isOutputBroadcastAcrossBatch(int outputIndex, const bool* inputIsBroadcasted, int nbInputs) const TRT_NOEXCEPT override { return false; } bool canBroadcastInputAcrossBatch(int inputIndex) const TRT_NOEXCEPT override { return false; } void configurePlugin(const Dims* inputDims, int nbInputs, const Dims* outputDims, int nbOutputs, const DataType* inputTypes, const DataType* outputTypes, const bool* inputIsBroadcast, const bool* outputIsBroadcast, PluginFormat floatFormat, int maxBatchSize) TRT_NOEXCEPT override { assert(*inputTypes == nvinfer1::DataType::kFLOAT && floatFormat == nvinfer1::PluginFormat::kLINEAR); assert(nbInputs == 2); assert(inputDims[0].d[0] == inputDims[1].d[0]); _pre_nms_topk = inputDims[0].d[0]; } IPluginV2Ext *clone() const TRT_NOEXCEPT override { return new RpnNmsPlugin(_nms_thresh, _post_nms_topk, _pre_nms_topk); } private: template void write(char*& buffer, const T& val) const { *reinterpret_cast(buffer) = val; buffer += sizeof(T); } template void read(const char*& buffer, T& val) { val = *reinterpret_cast(buffer); buffer += sizeof(T); } }; class RpnNmsPluginCreator : public IPluginCreator { public: RpnNmsPluginCreator() {} const char *getPluginNamespace() const TRT_NOEXCEPT override { return PLUGIN_NAMESPACE; } const char *getPluginName() const TRT_NOEXCEPT override { return PLUGIN_NAME; } 
const char *getPluginVersion() const TRT_NOEXCEPT override { return PLUGIN_VERSION; } IPluginV2 *deserializePlugin(const char *name, const void *serialData, size_t serialLength) TRT_NOEXCEPT override { return new RpnNmsPlugin(serialData, serialLength); } void setPluginNamespace(const char *N) TRT_NOEXCEPT override {} const PluginFieldCollection *getFieldNames() TRT_NOEXCEPT override { return nullptr; } IPluginV2 *createPlugin(const char *name, const PluginFieldCollection *fc) TRT_NOEXCEPT override { return nullptr; } }; REGISTER_TENSORRT_PLUGIN(RpnNmsPluginCreator); } // namespace nvinfer1 #undef PLUGIN_NAME #undef PLUGIN_VERSION #undef PLUGIN_NAMESPACE ================================================ FILE: rcnn/backbone.hpp ================================================ #pragma once #include #include #include #include "common.hpp" /* when stride>1, whether to put stride in the first 1x1 convolution or the bottleneck 3x3 convolution. set false when use backbone from torchvision*/ #define STRIDE_IN_1X1 true enum RESNETTYPE { R18 = 0, R34, R50, R101, R152 }; const std::map> num_blocks_per_stage = { {R18, {2, 2, 2, 2}}, {R34, {3, 4, 6, 3}}, {R50, {3, 4, 6, 3}}, {R101, {3, 4, 23, 3}}, {R152, {3, 8, 36, 3}} }; ILayer* BasicStem(INetworkDefinition *network, std::map& weightMap, const std::string& lname, ITensor& input, int out_channels, int group_num = 1) { // conv1 IConvolutionLayer* conv1 = network->addConvolutionNd(input, out_channels, DimsHW{ 7, 7 }, weightMap[lname + ".conv1.weight"], weightMap[lname + ".conv1.bias"]); assert(conv1); conv1->setStrideNd(DimsHW{ 2, 2 }); conv1->setPaddingNd(DimsHW{ 3, 3 }); conv1->setNbGroups(group_num); auto r1 = network->addActivation(*conv1->getOutput(0), ActivationType::kRELU); assert(r1); auto max_pool2d = network->addPoolingNd(*r1->getOutput(0), PoolingType::kMAX, DimsHW{ 3, 3 }); max_pool2d->setStrideNd(DimsHW{ 2, 2 }); max_pool2d->setPaddingNd(DimsHW{ 1, 1 }); // auto mp_dim = max_pool2d->getOutput(0)->getDimensions(); 
return max_pool2d; } ITensor* BasicBlock(INetworkDefinition *network, std::map& weightMap, const std::string& lname, ITensor& input, int in_channels, int out_channels, int stride = 1) { // conv1 IConvolutionLayer* conv1 = network->addConvolutionNd(input, out_channels, DimsHW{ 3, 3 }, weightMap[lname + ".conv1.weight"], weightMap[lname + ".conv1.bias"]); assert(conv1); conv1->setStrideNd(DimsHW{ stride, stride }); conv1->setPaddingNd(DimsHW{ 1, 1 }); auto r1 = network->addActivation(*conv1->getOutput(0), ActivationType::kRELU); assert(r1); // conv2 IConvolutionLayer* conv2 = network->addConvolutionNd(*r1->getOutput(0), out_channels, DimsHW{ 3, 3 }, weightMap[lname + ".conv2.weight"], weightMap[lname + ".conv2.bias"]); assert(conv2); conv2->setStrideNd(DimsHW{ 1, 1 }); conv2->setPaddingNd(DimsHW{ 1, 1 }); // shortcut ITensor* shortcut_value = nullptr; if (in_channels != out_channels) { auto shortcut = network->addConvolutionNd(input, out_channels, DimsHW{ 1, 1 }, weightMap[lname + ".shortcut.weight"], weightMap[lname + ".shortcut.bias"]); assert(shortcut); shortcut->setStrideNd(DimsHW{ stride, stride }); shortcut_value = shortcut->getOutput(0); } else { shortcut_value = &input; } // add auto ew = network->addElementWise(*conv2->getOutput(0), *shortcut_value, ElementWiseOperation::kSUM); assert(ew); auto r3 = network->addActivation(*ew->getOutput(0), ActivationType::kRELU); assert(r3); return r3->getOutput(0); } ITensor* BottleneckBlock(INetworkDefinition *network, std::map& weightMap, const std::string& lname, ITensor& input, int in_channels, int bottleneck_channels, int out_channels, int stride = 1, int dilation = 1, int group_num = 1) { int stride_1x1 = STRIDE_IN_1X1 ? stride : 1; int stride_3x3 = STRIDE_IN_1X1 ? 
1 : stride; // conv1 IConvolutionLayer* conv1 = network->addConvolutionNd(input, bottleneck_channels, DimsHW{ 1, 1 }, weightMap[lname + ".conv1.weight"], weightMap[lname + ".conv1.bias"]); assert(conv1); conv1->setStrideNd(DimsHW{ stride_1x1, stride_1x1 }); conv1->setNbGroups(group_num); auto r1 = network->addActivation(*conv1->getOutput(0), ActivationType::kRELU); assert(r1); // conv2 IConvolutionLayer* conv2 = network->addConvolutionNd(*r1->getOutput(0), bottleneck_channels, DimsHW{ 3, 3 }, weightMap[lname + ".conv2.weight"], weightMap[lname + ".conv2.bias"]); assert(conv2); conv2->setStrideNd(DimsHW{ stride_3x3, stride_3x3 }); conv2->setPaddingNd(DimsHW{ 1 * dilation, 1 * dilation }); conv2->setDilationNd(DimsHW{ dilation, dilation }); conv2->setNbGroups(group_num); auto r2 = network->addActivation(*conv2->getOutput(0), ActivationType::kRELU); assert(r2); // conv3 IConvolutionLayer* conv3 = network->addConvolutionNd(*r2->getOutput(0), out_channels, DimsHW{ 1, 1 }, weightMap[lname + ".conv3.weight"], weightMap[lname + ".conv3.bias"]); assert(conv3); conv3->setStrideNd(DimsHW{ 1, 1 }); conv3->setNbGroups(group_num); // shortcut ITensor* shortcut_value = nullptr; if (in_channels != out_channels) { auto shortcut = network->addConvolutionNd(input, out_channels, DimsHW{ 1, 1 }, weightMap[lname + ".shortcut.weight"], weightMap[lname + ".shortcut.bias"]); assert(shortcut); shortcut->setStrideNd(DimsHW{stride, stride}); shortcut->setNbGroups(group_num); shortcut_value = shortcut->getOutput(0); } else { shortcut_value = &input; } // add auto ew = network->addElementWise(*conv3->getOutput(0), *shortcut_value, ElementWiseOperation::kSUM); assert(ew); auto r3 = network->addActivation(*ew->getOutput(0), ActivationType::kRELU); assert(r3); return r3->getOutput(0); } ITensor* MakeStage(INetworkDefinition *network, std::map& weightMap, const std::string& lname, ITensor& input, int stage, RESNETTYPE resnet_type, int in_channels, int bottleneck_channels, int out_channels, int 
first_stride = 1, int dilation = 1) { ITensor* out = &input; for (int i = 0; i < stage; i++) { std::string layerName = lname + "." + std::to_string(i); int stride = i == 0 ? first_stride : 1; if (resnet_type == R18 || resnet_type == R34) out = BasicBlock(network, weightMap, layerName, *out, in_channels, out_channels, stride); else out = BottleneckBlock(network, weightMap, layerName, *out, in_channels, bottleneck_channels, out_channels, stride, dilation); in_channels = out_channels; } return out; } ITensor* BuildResNet(INetworkDefinition *network, std::map& weightMap, ITensor& input, RESNETTYPE resnet_type, int stem_out_channels, int bottleneck_channels, int res2_out_channels, int res5_dilation = 1) { assert(res5_dilation == 1 || res5_dilation == 2); // "res5_dilation must be 1 or 2" if (resnet_type == R18 || resnet_type == R34) { assert(res2_out_channels == 64); // "res2_out_channels must be 64 for R18/R34" assert(res5_dilation == 1); // "res5_dilation must be 1 for R18/R34" } int out_channels = res2_out_channels; ITensor* out = nullptr; // stem auto stem = BasicStem(network, weightMap, "backbone.stem", input, stem_out_channels); out = stem->getOutput(0); // res for (int i = 0; i < 3; i++) { int dilation = (i == 3) ? res5_dilation : 1; int first_stride = (i == 0 || (i == 3 && dilation == 2)) ? 1 : 2; out = MakeStage(network, weightMap, "backbone.res" + std::to_string(i + 2), *out, num_blocks_per_stage.at(resnet_type)[i], resnet_type, stem_out_channels, bottleneck_channels, out_channels, first_stride, dilation); stem_out_channels = out_channels; bottleneck_channels *= 2; out_channels *= 2; } return out; } ================================================ FILE: rcnn/calibrator.hpp ================================================ #pragma once #include "NvInfer.h" #include #include #include #include #include #include #include "./cuda_utils.h" #include "common.hpp" #include "macros.h" //! \class Int8EntropyCalibrator2 //! //! \brief Implements Entropy calibrator 2. //! 
CalibrationAlgoType is kENTROPY_CALIBRATION_2. //! class Int8EntropyCalibrator2 : public nvinfer1::IInt8EntropyCalibrator2 { public: Int8EntropyCalibrator2(int batchsize, int input_w, int input_h, const char* img_dir, const char* calib_table_name, const char* input_blob_name, bool read_cache = true); virtual ~Int8EntropyCalibrator2(); int getBatchSize() const TRT_NOEXCEPT override; bool getBatch(void* bindings[], const char* names[], int nbBindings) TRT_NOEXCEPT override; const void* readCalibrationCache(size_t& length) TRT_NOEXCEPT override; void writeCalibrationCache(const void* cache, size_t length) TRT_NOEXCEPT override; private: int batchsize_; int input_w_; int input_h_; int img_idx_; std::string img_dir_; std::vector img_files_; size_t input_count_; std::string calib_table_name_; const char* input_blob_name_; bool read_cache_; void* device_input_; std::vector calib_cache_; }; Int8EntropyCalibrator2::Int8EntropyCalibrator2(int batchsize, int input_w, int input_h, const char* img_dir, const char* calib_table_name, const char* input_blob_name, bool read_cache) : batchsize_(batchsize) , input_w_(input_w) , input_h_(input_h) , img_idx_(0) , img_dir_(img_dir) , calib_table_name_(calib_table_name) , input_blob_name_(input_blob_name) , read_cache_(read_cache) { input_count_ = 3 * input_w * input_h * batchsize; CUDA_CHECK(cudaMalloc(&device_input_, input_count_ * sizeof(float))); read_files_in_dir(img_dir, img_files_); } Int8EntropyCalibrator2::~Int8EntropyCalibrator2() { CUDA_CHECK(cudaFree(device_input_)); } int Int8EntropyCalibrator2::getBatchSize() const TRT_NOEXCEPT { return batchsize_; } bool Int8EntropyCalibrator2::getBatch(void* bindings[], const char* names[], int nbBindings) TRT_NOEXCEPT { if (img_idx_ + batchsize_ > static_cast(img_files_.size())) { return false; } std::vector input_imgs_(input_count_, 0); for (int i = img_idx_; i < img_idx_ + batchsize_; i++) { std::cout << img_files_[i] << " " << i << std::endl; cv::Mat temp = cv::imread(img_dir_ + 
img_files_[i]); int X_LEFT_PAD = 0; int X_RIGHT_PAD = 0; int Y_TOP_PAD = 0; int Y_BOTTOM_PAD = 0; temp = preprocessImg(temp, input_w_, input_h_, X_LEFT_PAD, X_RIGHT_PAD, Y_TOP_PAD, Y_BOTTOM_PAD); if (temp.empty()) { std::cerr << "Fatal error: image cannot open!" << std::endl; return false; } for (int ind = 0; ind < input_w_*input_h_*3; ind++) input_imgs_[(i-img_idx_)*input_w_*input_h_*3 + ind] = static_cast(*(temp.data + ind)); } img_idx_ += batchsize_; CUDA_CHECK(cudaMemcpy(device_input_, input_imgs_.data(), input_count_ * sizeof(float), cudaMemcpyHostToDevice)); assert(!strcmp(names[0], input_blob_name_)); bindings[0] = device_input_; return true; } const void* Int8EntropyCalibrator2::readCalibrationCache(size_t& length) TRT_NOEXCEPT { std::cout << "reading calib cache: " << calib_table_name_ << std::endl; calib_cache_.clear(); std::ifstream input(calib_table_name_, std::ios::binary); input >> std::noskipws; if (read_cache_ && input.good()) { std::copy(std::istream_iterator(input), std::istream_iterator(), std::back_inserter(calib_cache_)); } length = calib_cache_.size(); return length ? 
calib_cache_.data() : nullptr; } void Int8EntropyCalibrator2::writeCalibrationCache(const void* cache, size_t length) TRT_NOEXCEPT { std::cout << "writing calib cache: " << calib_table_name_ << " size: " << length << std::endl; std::ofstream output(calib_table_name_, std::ios::binary); output.write(reinterpret_cast(cache), length); } ================================================ FILE: rcnn/common.hpp ================================================ #pragma once #include #include #include #include #include #include #include #include #include #include #include #include #include "./logging.h" #include "./cuda_utils.h" static Logger gLogger; using namespace nvinfer1; void loadWeights(const std::string file, std::map& weightMap) { std::cout << "Loading weights: " << file << std::endl; // Open weights file std::ifstream input(file); assert(input.is_open() && "Unable to load weight file. please check if the .wts file path is right!!!!!!"); // Read number of weight blobs int32_t count; input >> count; assert(count > 0 && "Invalid weight map file."); while (count--) { Weights wt{ DataType::kFLOAT, nullptr, 0 }; uint32_t size; // Read name and type of blob std::string name; input >> name >> std::dec >> size; wt.type = DataType::kFLOAT; // Load blob uint32_t* val = reinterpret_cast(malloc(sizeof(val) * size)); for (uint32_t x = 0, y = size; x < y; ++x) { input >> std::hex >> val[x]; } wt.values = val; wt.count = size; weightMap[name] = wt; } } static inline int read_files_in_dir(const char *p_dir_name, std::vector &file_names) { DIR *p_dir = opendir(p_dir_name); if (p_dir == nullptr) { return -1; } struct dirent* p_file = nullptr; while ((p_file = readdir(p_dir)) != nullptr) { if (strcmp(p_file->d_name, ".") != 0 && strcmp(p_file->d_name, "..") != 0) { // std::string cur_file_name(p_dir_name); // cur_file_name += "/"; // cur_file_name += p_file->d_name; std::string cur_file_name(p_file->d_name); file_names.push_back(cur_file_name); } } closedir(p_dir); return 0; } static 
inline cv::Mat preprocessImg(cv::Mat& img, int input_w, int input_h, int& X_LEFT_PAD, int& X_RIGHT_PAD, int& Y_TOP_PAD, int& Y_BOTTOM_PAD) { int w, h; float x, y; float r_w = input_w / (img.cols*1.0); float r_h = input_h / (img.rows*1.0); // this code can also support left-right and top-bottom padding if you need if (r_h > r_w) { w = input_w; h = r_w * img.rows; x = 0.0; y = (input_h - h) / 2.f; } else { w = r_h * img.cols; h = input_h; x = (input_w - w) / 2.f; y = 0.0; } // support both odd and even cases X_LEFT_PAD = (int)(round(x - 0.1)); X_RIGHT_PAD = (int)(round(x + 0.1)); Y_TOP_PAD = (int)(round(y - 0.1)); Y_BOTTOM_PAD = (int)(round(y + 0.1)); cv::Mat re(h, w, CV_8UC3); cv::resize(img, re, re.size(), 0, 0, cv::INTER_LINEAR); cv::Mat out(input_h, input_w, CV_8UC3, cv::Scalar(128, 128, 128)); re.copyTo(out(cv::Rect(X_LEFT_PAD, Y_TOP_PAD, re.cols, re.rows))); return out; } ================================================ FILE: rcnn/cuda_utils.h ================================================ #pragma once #include #include #include #define CUDA_ALIGN 256 template inline size_t get_size_aligned(size_t num_elem) { size_t size = num_elem * sizeof(T); size_t extra_align = 0; if (size % CUDA_ALIGN != 0) { extra_align = CUDA_ALIGN - size % CUDA_ALIGN; } return size + extra_align; } template inline T *get_next_ptr(size_t num_elem, void *&workspace, size_t &workspace_size) { size_t size = get_size_aligned(num_elem); if (size > workspace_size) { throw std::runtime_error("Workspace is too small!"); } workspace_size -= size; T *ptr = reinterpret_cast(workspace); workspace = reinterpret_cast(reinterpret_cast(workspace) + size); return ptr; } #ifndef CUDA_CHECK #define CUDA_CHECK(callstr)\ {\ cudaError_t error_code = callstr;\ if (error_code != cudaSuccess) {\ std::cerr << "CUDA error " << error_code << " at " << __FILE__ << ":" << __LINE__;\ assert(0);\ }\ } #endif // CUDA_CHECK ================================================ FILE: rcnn/gen_wts.py 
================================================ from detectron2.layers import Conv2d from torch import nn import torch import numpy as np import struct def fuse_conv_and_bn(conv): # Fuse convolution and batchnorm layers https://tehnokv.com/posts/fusing-batchnorm-and-conv/ bn = conv.norm # init fusedconv = nn.Conv2d(conv.in_channels, conv.out_channels, kernel_size=conv.kernel_size, stride=conv.stride, padding=conv.padding, groups=conv.groups, bias=True).requires_grad_(False).to(conv.weight.device) # prepare filters w_conv = conv.weight.clone().view(conv.out_channels, -1) w_bn = torch.diag(bn.weight.div(torch.sqrt(bn.eps + bn.running_var))) fusedconv.weight.copy_(torch.mm(w_bn, w_conv).view(fusedconv.weight.size())) # prepare spatial bias b_conv = torch.zeros(conv.weight.size(0), device=conv.weight.device) if conv.bias is None else conv.bias b_bn = bn.bias - bn.weight.mul(bn.running_mean).div(torch.sqrt(bn.running_var + bn.eps)) fusedconv.bias.copy_(torch.mm(w_bn, b_conv.reshape(-1, 1)).reshape(-1) + b_bn) return fusedconv def fuse_bn(model): for child_name, child in model.named_children(): if isinstance(child, Conv2d) and child.norm is not None: setattr(model, child_name, fuse_conv_and_bn(child)) else: fuse_bn(child) def gen_wts(model, filename): f = open('./' + filename + '.wts', 'w') f.write('{}\n'.format(len(model.state_dict().keys()))) for k, v in model.state_dict().items(): vr = v.reshape(-1).cpu().numpy() f.write('{} {} '.format(k, len(vr))) for vv in vr: f.write(' ') f.write(struct.pack('>f',float(vv)).hex()) f.write('\n') f.close() # construct model from detectron2.config import get_cfg from detectron2.modeling import build_model from detectron2.checkpoint import DetectionCheckpointer cfg = get_cfg() cfg.merge_from_file('./configs/COCO-Detection/faster_rcnn_R_50_C4_1x.yaml') cfg.MODEL.WEIGHTS = './model_final_721ade.pkl' cfg.MODEL.DEVICE = 'cpu' model = build_model(cfg) DetectionCheckpointer(model).load(cfg.MODEL.WEIGHTS) model.eval() fuse_bn(model) 
gen_wts(model, 'faster') # test data # from detectron2.data.detection_utils import read_image # from detectron2.data import transforms as T # import cv2 # original_image = cv2.imread('./demo.jpg') # original_image = original_image.astype('float32') # transform_gen = T.ResizeShortestEdge( # [cfg.INPUT.MIN_SIZE_TEST, cfg.INPUT.MIN_SIZE_TEST], cfg.INPUT.MAX_SIZE_TEST # ) # height, width = original_image.shape[:2] # image = transform_gen.get_transform(original_image).apply_image(original_image) # image = torch.as_tensor(image.astype("float32").transpose(2, 0, 1)) # # model test # inputs = {"image": image, "height": height, "width": width} # with torch.no_grad(): # predictions = model([inputs])[0] # print (predictions) ================================================ FILE: rcnn/logging.h ================================================ /* * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
*/ #ifndef TENSORRT_LOGGING_H #define TENSORRT_LOGGING_H #include "NvInferRuntimeCommon.h" #include "macros.h" #include #include #include #include #include #include #include using Severity = nvinfer1::ILogger::Severity; class LogStreamConsumerBuffer : public std::stringbuf { public: LogStreamConsumerBuffer(std::ostream& stream, const std::string& prefix, bool shouldLog) : mOutput(stream) , mPrefix(prefix) , mShouldLog(shouldLog) {} LogStreamConsumerBuffer(LogStreamConsumerBuffer&& other) : mOutput(other.mOutput) {} ~LogStreamConsumerBuffer() { // std::streambuf::pbase() gives a pointer to the beginning of the buffered part of the output sequence // std::streambuf::pptr() gives a pointer to the current position of the output sequence // if the pointer to the beginning is not equal to the pointer to the current position, // call putOutput() to log the output to the stream if (pbase() != pptr()) { putOutput(); } } // synchronizes the stream buffer and returns 0 on success // synchronizing the stream buffer consists of inserting the buffer contents into the stream, // resetting the buffer and flushing the stream virtual int sync() { putOutput(); return 0; } void putOutput() { if (mShouldLog) { // prepend timestamp std::time_t timestamp = std::time(nullptr); tm* tm_local = std::localtime(×tamp); std::cout << "["; std::cout << std::setw(2) << std::setfill('0') << 1 + tm_local->tm_mon << "/"; std::cout << std::setw(2) << std::setfill('0') << tm_local->tm_mday << "/"; std::cout << std::setw(4) << std::setfill('0') << 1900 + tm_local->tm_year << "-"; std::cout << std::setw(2) << std::setfill('0') << tm_local->tm_hour << ":"; std::cout << std::setw(2) << std::setfill('0') << tm_local->tm_min << ":"; std::cout << std::setw(2) << std::setfill('0') << tm_local->tm_sec << "] "; // std::stringbuf::str() gets the string contents of the buffer // insert the buffer contents pre-appended by the appropriate prefix into the stream mOutput << mPrefix << str(); // set the buffer to empty 
str(""); // flush the stream mOutput.flush(); } } void setShouldLog(bool shouldLog) { mShouldLog = shouldLog; } private: std::ostream& mOutput; std::string mPrefix; bool mShouldLog; }; //! //! \class LogStreamConsumerBase //! \brief Convenience object used to initialize LogStreamConsumerBuffer before std::ostream in LogStreamConsumer //! class LogStreamConsumerBase { public: LogStreamConsumerBase(std::ostream& stream, const std::string& prefix, bool shouldLog) : mBuffer(stream, prefix, shouldLog) {} protected: LogStreamConsumerBuffer mBuffer; }; //! //! \class LogStreamConsumer //! \brief Convenience object used to facilitate use of C++ stream syntax when logging messages. //! Order of base classes is LogStreamConsumerBase and then std::ostream. //! This is because the LogStreamConsumerBase class is used to initialize the LogStreamConsumerBuffer member field //! in LogStreamConsumer and then the address of the buffer is passed to std::ostream. //! This is necessary to prevent the address of an uninitialized buffer from being passed to std::ostream. //! Please do not change the order of the parent classes. //! class LogStreamConsumer : protected LogStreamConsumerBase, public std::ostream { public: //! \brief Creates a LogStreamConsumer which logs messages with level severity. //! Reportable severity determines if the messages are severe enough to be logged. 
LogStreamConsumer(Severity reportableSeverity, Severity severity) : LogStreamConsumerBase(severityOstream(severity), severityPrefix(severity), severity <= reportableSeverity) , std::ostream(&mBuffer) // links the stream buffer with the stream , mShouldLog(severity <= reportableSeverity) , mSeverity(severity) {} LogStreamConsumer(LogStreamConsumer&& other) : LogStreamConsumerBase(severityOstream(other.mSeverity), severityPrefix(other.mSeverity), other.mShouldLog) , std::ostream(&mBuffer) // links the stream buffer with the stream , mShouldLog(other.mShouldLog) , mSeverity(other.mSeverity) {} void setReportableSeverity(Severity reportableSeverity) { mShouldLog = mSeverity <= reportableSeverity; mBuffer.setShouldLog(mShouldLog); } private: static std::ostream& severityOstream(Severity severity) { return severity >= Severity::kINFO ? std::cout : std::cerr; } static std::string severityPrefix(Severity severity) { switch (severity) { case Severity::kINTERNAL_ERROR: return "[F] "; case Severity::kERROR: return "[E] "; case Severity::kWARNING: return "[W] "; case Severity::kINFO: return "[I] "; case Severity::kVERBOSE: return "[V] "; default: assert(0); return ""; } } bool mShouldLog; Severity mSeverity; }; //! \class Logger //! //! \brief Class which manages logging of TensorRT tools and samples //! //! \details This class provides a common interface for TensorRT tools and samples to log information to the console, //! and supports logging two types of messages: //! //! - Debugging messages with an associated severity (info, warning, error, or internal error/fatal) //! - Test pass/fail messages //! //! The advantage of having all samples use this class for logging as opposed to emitting directly to stdout/stderr is //! that the logic for controlling the verbosity and formatting of sample output is centralized in one location. //! //! In the future, this class could be extended to support dumping test results to a file in some standard format //! 
//! (for example, JUnit XML), and providing additional metadata (e.g. timing the duration of a test run).
//!
//! TODO: For backwards compatibility with existing samples, this class inherits directly from the nvinfer1::ILogger
//! interface, which is problematic since there isn't a clean separation between messages coming from the TensorRT
//! library and messages coming from the sample.
//!
//! In the future (once all samples are updated to use Logger::getTRTLogger() to access the ILogger) we can refactor the
//! class to eliminate the inheritance and instead make the nvinfer1::ILogger implementation a member of the Logger
//! object.
class Logger : public nvinfer1::ILogger {
 public:
    //! \brief Creates a logger that reports messages of the given severity (default kWARNING) or more severe.
    explicit Logger(Severity severity = Severity::kWARNING)
        : mReportableSeverity(severity) {}

    //!
    //! \enum TestResult
    //! \brief Represents the state of a given test
    //!
    enum class TestResult {
        kRUNNING,  //!< The test is running
        kPASSED,   //!< The test passed
        kFAILED,   //!< The test failed
        kWAIVED    //!< The test was waived
    };

    //!
    //! \brief Forward-compatible method for retrieving the nvinfer::ILogger associated with this Logger
    //! \return The nvinfer1::ILogger associated with this Logger
    //!
    //! TODO Once all samples are updated to use this method to register the logger with TensorRT,
    //! we can eliminate the inheritance of Logger from ILogger
    //!
    nvinfer1::ILogger& getTRTLogger() {
        return *this;
    }

    //!
    //! \brief Implementation of the nvinfer1::ILogger::log() virtual method
    //!
    //! Note samples should not be calling this function directly; it will eventually go away once we eliminate the
    //! inheritance from nvinfer1::ILogger
    //!
    void log(Severity severity, const char* msg) TRT_NOEXCEPT override {
        // A temporary consumer does the filtering and formatting; every message is tagged "[TRT] ".
        LogStreamConsumer(mReportableSeverity, severity) << "[TRT] " << std::string(msg) << std::endl;
    }

    //!
    //! \brief Method for controlling the verbosity of logging output
    //!
    //! \param severity The logger will only emit messages that have severity of this level or higher.
    //!
    void setReportableSeverity(Severity severity) {
        mReportableSeverity = severity;
    }

    //!
    //! \brief Opaque handle that holds logging information for a particular test
    //!
    //! This object is an opaque handle to information used by the Logger to print test results.
    //! The sample must call Logger::defineTest() in order to obtain a TestAtom that can be used
    //! with Logger::reportTest{Start,End}().
    //!
    class TestAtom {
     public:
        TestAtom(TestAtom&&) = default;

     private:
        friend class Logger;

        // Only Logger (a friend) can create atoms, via defineTest().
        TestAtom(bool started, const std::string& name, const std::string& cmdline)
            : mStarted(started)
            , mName(name)
            , mCmdline(cmdline) {}

        bool mStarted;         // set by reportTestStart()
        std::string mName;
        std::string mCmdline;
    };

    //!
    //! \brief Define a test for logging
    //!
    //! \param[in] name The name of the test.  This should be a string starting with
    //!                  "TensorRT" and containing dot-separated strings containing
    //!                  the characters [A-Za-z0-9_].
    //!                  For example, "TensorRT.sample_googlenet"
    //! \param[in] cmdline The command line used to reproduce the test
    //
    //! \return a TestAtom that can be used in Logger::reportTest{Start,End}().
    //!
    static TestAtom defineTest(const std::string& name, const std::string& cmdline) {
        return TestAtom(false, name, cmdline);
    }

    //!
    //! \brief A convenience overloaded version of defineTest() that accepts an array of command-line arguments
    //!        as input
    //!
    //! \param[in] name The name of the test
    //! \param[in] argc The number of command-line arguments
    //! \param[in] argv The array of command-line arguments (given as C strings)
    //!
    //! \return a TestAtom that can be used in Logger::reportTest{Start,End}().
    static TestAtom defineTest(const std::string& name, int argc, char const* const* argv) {
        auto cmdline = genCmdlineString(argc, argv);
        return defineTest(name, cmdline);
    }

    //!
    //! \brief Report that a test has started.
    //!
    //! \pre reportTestStart() has not been called yet for the given testAtom
    //!
    //! \param[in] testAtom The handle to the test that has started
    //!
    static void reportTestStart(TestAtom& testAtom) {
        // The RUNNING line is printed before the precondition is asserted.
        reportTestResult(testAtom, TestResult::kRUNNING);
        assert(!testAtom.mStarted);
        testAtom.mStarted = true;
    }

    //!
    //! \brief Report that a test has ended.
    //!
    //! \pre reportTestStart() has been called for the given testAtom
    //!
    //! \param[in] testAtom The handle to the test that has ended
    //! \param[in] result The result of the test. Should be one of TestResult::kPASSED,
    //!                   TestResult::kFAILED, TestResult::kWAIVED
    //!
    static void reportTestEnd(const TestAtom& testAtom, TestResult result) {
        assert(result != TestResult::kRUNNING);
        assert(testAtom.mStarted);
        reportTestResult(testAtom, result);
    }

    //! \brief Reports the test as passed and returns EXIT_SUCCESS.
    static int reportPass(const TestAtom& testAtom) {
        reportTestEnd(testAtom, TestResult::kPASSED);
        return EXIT_SUCCESS;
    }

    //! \brief Reports the test as failed and returns EXIT_FAILURE.
    static int reportFail(const TestAtom& testAtom) {
        reportTestEnd(testAtom, TestResult::kFAILED);
        return EXIT_FAILURE;
    }

    //! \brief Reports the test as waived and returns EXIT_SUCCESS.
    static int reportWaive(const TestAtom& testAtom) {
        reportTestEnd(testAtom, TestResult::kWAIVED);
        return EXIT_SUCCESS;
    }

    //! \brief Reports pass or fail depending on \p pass and returns the matching exit code.
    static int reportTest(const TestAtom& testAtom, bool pass) {
        return pass ? reportPass(testAtom) : reportFail(testAtom);
    }

    //! \brief Returns the severity threshold currently used for filtering.
    Severity getReportableSeverity() const {
        return mReportableSeverity;
    }

 private:
    //!
    //! \brief returns an appropriate string for prefixing a log message with the given severity
    //!
    static const char* severityPrefix(Severity severity) {
        switch (severity) {
            case Severity::kINTERNAL_ERROR: return "[F] ";
            case Severity::kERROR: return "[E] ";
            case Severity::kWARNING: return "[W] ";
            case Severity::kINFO: return "[I] ";
            case Severity::kVERBOSE: return "[V] ";
            default: assert(0); return "";
        }
    }

    //!
    //! \brief returns an appropriate string for prefixing a test result message with the given result
    //!
    static const char* testResultString(TestResult result) {
        switch (result) {
            case TestResult::kRUNNING: return "RUNNING";
            case TestResult::kPASSED: return "PASSED";
            case TestResult::kFAILED: return "FAILED";
            case TestResult::kWAIVED: return "WAIVED";
            default: assert(0); return "";
        }
    }

    //!
    //! \brief returns an appropriate output stream (cout or cerr) to use with the given severity
    //!
    static std::ostream& severityOstream(Severity severity) {
        return severity >= Severity::kINFO ? std::cout : std::cerr;
    }

    //!
    //! \brief method that implements logging test results
    //!
    static void reportTestResult(const TestAtom& testAtom, TestResult result) {
        severityOstream(Severity::kINFO) << "&&&& " << testResultString(result) << " " << testAtom.mName << " # "
                                         << testAtom.mCmdline << std::endl;
    }

    //!
    //! \brief generate a command line string from the given (argc, argv) values
    //!
    static std::string genCmdlineString(int argc, char const* const* argv) {
        std::stringstream ss;
        for (int i = 0; i < argc; i++) {
            if (i > 0)
                ss << " ";
            ss << argv[i];
        }
        return ss.str();
    }

    Severity mReportableSeverity;  // threshold handed to every LogStreamConsumer created by this logger
};

namespace {

//!
//! \brief produces a LogStreamConsumer object that can be used to log messages of severity kVERBOSE
//!
//! Example usage:
//!
//!     LOG_VERBOSE(logger) << "hello world" << std::endl;
//!
inline LogStreamConsumer LOG_VERBOSE(const Logger& logger) {
    return LogStreamConsumer(logger.getReportableSeverity(), Severity::kVERBOSE);
}

//!
//! \brief produces a LogStreamConsumer object that can be used to log messages of severity kINFO
//!
//! Example usage:
//!
//!     LOG_INFO(logger) << "hello world" << std::endl;
//!
inline LogStreamConsumer LOG_INFO(const Logger& logger) {
    return LogStreamConsumer(logger.getReportableSeverity(), Severity::kINFO);
}

//!
//! \brief produces a LogStreamConsumer object that can be used to log messages of severity kWARNING
//!
//! Example usage:
//!
//!     LOG_WARN(logger) << "hello world" << std::endl;
//!
inline LogStreamConsumer LOG_WARN(const Logger& logger) {
    return LogStreamConsumer(logger.getReportableSeverity(), Severity::kWARNING);
}

//!
//! \brief produces a LogStreamConsumer object that can be used to log messages of severity kERROR
//!
//! Example usage:
//!
//!     LOG_ERROR(logger) << "hello world" << std::endl;
//!
inline LogStreamConsumer LOG_ERROR(const Logger& logger) {
    return LogStreamConsumer(logger.getReportableSeverity(), Severity::kERROR);
}

//!
//! \brief produces a LogStreamConsumer object that can be used to log messages of severity kINTERNAL_ERROR
//         ("fatal" severity)
//!
//! Example usage:
//!
//!     LOG_FATAL(logger) << "hello world" << std::endl;
//!
inline LogStreamConsumer LOG_FATAL(const Logger& logger) {
    return LogStreamConsumer(logger.getReportableSeverity(), Severity::kINTERNAL_ERROR);
}

}  // anonymous namespace

#endif  // TENSORRT_LOGGING_H
================================================
FILE: rcnn/macros.h
================================================
#pragma once

// NOTE(review): the header names of the two bare #include lines below are missing in this copy
// of the file; the macros tested further down (CUDA_VERSION, NV_TENSORRT_MAJOR) presumably come
// from them - restore from upstream.
#include
#include

// CUDA_11 is defined when building against CUDA 11.0 or newer.
#if CUDA_VERSION >=11000
#define CUDA_11
#endif

// From TensorRT 8 on the overriding declarations get `noexcept` / a const enqueue; for older
// versions both macros expand to nothing.
#if NV_TENSORRT_MAJOR >= 8
#define TRT_NOEXCEPT noexcept
#define TRT_CONST_ENQUEUE const
#else
#define TRT_NOEXCEPT
#define TRT_CONST_ENQUEUE
#endif
================================================
FILE: rcnn/rcnn.cpp
================================================
// NOTE(review): the header names of the two bare #include lines below, and the element types of
// the std::vector constants, are missing in this copy of the file - restore from upstream.
#include
#include
#include "backbone.hpp"
#include "RpnDecodePlugin.h"
#include "RpnNmsPlugin.h"
#include "RoiAlignPlugin.h"
#include "PredictorDecodePlugin.h"
#include "BatchedNmsPlugin.h"
#include "MaskRcnnInferencePlugin.h"
#include "calibrator.hpp"

#define DEVICE 0
#define BATCH_SIZE 1
#define BACKBONE_RESNETTYPE R50

// data
// Per-channel values subtracted from / divided into the input by DataPreprocess().
static const std::vector PIXEL_MEAN = { 103.53, 116.28, 123.675 };
static const std::vector PIXEL_STD = {1.0, 1.0, 1.0};
static constexpr float MIN_SIZE = 800.0;
static constexpr float MAX_SIZE = 1333.0;
static constexpr int NUM_CLASSES = 80;
static int INPUT_H;  // size of model input
static int INPUT_W;
static constexpr
int INPUT_H_ = 480; // size of original image, you can change it to arbitrary size static constexpr int INPUT_W_ = 640; static int X_LEFT_PAD; // pad in preprocessImg static int X_RIGHT_PAD; static int Y_TOP_PAD; static int Y_BOTTOM_PAD; static int h_ori; // used when h_ori is not equal to INPUT_H_ static int w_ori; // backbone static const int RES2_OUT_CHANNELS = (BACKBONE_RESNETTYPE == R18 || BACKBONE_RESNETTYPE == R34) ? 64 : 256; // rpn static const std::vector ANCHOR_SIZES = { 32, 64, 128, 256, 512 }; static const std::vector ASPECT_RATIOS = { 0.5, 1.0, 2.0 }; static constexpr int PRE_NMS_TOP_K_TEST = 6000; static constexpr float RPN_NMS_THRESH = 0.7; static constexpr int POST_NMS_TOPK = 1000; // roialign static constexpr int STRIDES = 16; static constexpr int SAMPLING_RATIO = 0; static constexpr int POOLER_RESOLUTION = 14; // roihead static constexpr float NMS_THRESH_TEST = 0.5; static constexpr int DETECTIONS_PER_IMAGE = 100; static constexpr float SCORE_THRESH = 0.6; static const std::vector BBOX_REG_WEIGHTS = { 10.0, 10.0, 5.0, 5.0 }; static bool MASK_ON = false; static const char* INPUT_NODE_NAME = "images"; static const std::vector OUTPUT_NAMES = { "scores", "boxes", "labels", "masks" }; //nms methods selection in the second stage // 0: original nms // 1: soft-nms (linear) // 2: soft-nms (gaussian) static int NMS_METHOD = 1; static std::vector NMS_METHOD_VEC = {0, 1, 2}; std::vector GenerateAnchors(const std::vector& anchor_sizes, const std::vector& aspect_ratios) { std::vector res; for (auto as : anchor_sizes) { float area = as * as; for (auto ar : aspect_ratios) { float w = sqrt(area / ar); float h = ar * w; res.push_back(-w / 2.0); res.push_back(-h / 2.0); res.push_back(w / 2.0); res.push_back(h / 2.0); } } return res; } // transpose && resize && normalization && padding ITensor* DataPreprocess(INetworkDefinition *network, ITensor& input) { // HWC->CHW auto channel_permute = network->addShuffle(input); assert(channel_permute); 
channel_permute->setFirstTranspose(Permutation{ 2, 0, 1 }); // sub pixel mean auto pixel_mean = network->addConstant(Dims3{ 3, 1, 1 }, Weights{ DataType::kFLOAT, PIXEL_MEAN.data(), 3 }); assert(pixel_mean); auto sub = network->addElementWise(*channel_permute->getOutput(0), *pixel_mean->getOutput(0), ElementWiseOperation::kSUB); assert(sub); auto pixel_std = network->addConstant(Dims3{ 3, 1, 1 }, Weights{DataType::kFLOAT, PIXEL_STD.data(), 3}); assert(pixel_std); auto div = network->addElementWise(*sub->getOutput(0), *pixel_std->getOutput(0), ElementWiseOperation::kDIV); assert(div); return div->getOutput(0); } ITensor* RPN(INetworkDefinition *network, std::map& weightMap, ITensor& features) { int num_anchors = ANCHOR_SIZES.size() * ASPECT_RATIOS.size(); int box_dim = 4; // rpn head conv auto rpn_head_conv = network->addConvolutionNd(features, features.getDimensions().d[0], DimsHW{ 3, 3 }, weightMap["proposal_generator.rpn_head.conv.weight"], weightMap["proposal_generator.rpn_head.conv.bias"]); assert(rpn_head_conv); rpn_head_conv->setStrideNd(DimsHW{ 1, 1 }); rpn_head_conv->setPaddingNd(DimsHW{ 1, 1 }); auto rpn_head_relu = network->addActivation(*rpn_head_conv->getOutput(0), ActivationType::kRELU); assert(rpn_head_relu); // objectness logits auto rpn_head_logits = network->addConvolutionNd(*rpn_head_relu->getOutput(0), num_anchors, DimsHW{ 1, 1 }, weightMap["proposal_generator.rpn_head.objectness_logits.weight"], weightMap["proposal_generator.rpn_head.objectness_logits.bias"]); assert(rpn_head_logits); rpn_head_logits->setStrideNd(DimsHW{ 1, 1 }); // anchor deltas auto rpn_head_deltas = network->addConvolutionNd(*rpn_head_relu->getOutput(0), num_anchors * box_dim, DimsHW{ 1, 1 }, weightMap["proposal_generator.rpn_head.anchor_deltas.weight"], weightMap["proposal_generator.rpn_head.anchor_deltas.bias"]); assert(rpn_head_deltas); auto rpn_head_deltas_dim = rpn_head_deltas->getOutput(0)->getDimensions(); rpn_head_deltas->setStrideNd(DimsHW{ 1, 1 }); auto anchors = 
GenerateAnchors(ANCHOR_SIZES, ASPECT_RATIOS); auto rpnDecodePlugin = RpnDecodePlugin(PRE_NMS_TOP_K_TEST, anchors, STRIDES, INPUT_H, INPUT_W); std::vector faster_decode_inputs = { rpn_head_logits->getOutput(0), rpn_head_deltas->getOutput(0) }; auto rpnDecodeLayer = network->addPluginV2(faster_decode_inputs.data(), faster_decode_inputs.size(), rpnDecodePlugin); std::vector nms_input = { rpnDecodeLayer->getOutput(0), rpnDecodeLayer->getOutput(1) }; // nms auto nmsPlugin = RpnNmsPlugin(RPN_NMS_THRESH, POST_NMS_TOPK); auto nmsLayer = network->addPluginV2(nms_input.data(), nms_input.size(), nmsPlugin); return nmsLayer->getOutput(0); } ITensor* SharedRoiTransform(INetworkDefinition *network, std::map& weightMap, ITensor* proposals, ITensor* features, int num_proposals) { std::vector roi_inputs = { proposals, features }; auto roiAlignPlugin = RoiAlignPlugin(POOLER_RESOLUTION, 1 / static_cast(STRIDES), SAMPLING_RATIO, num_proposals, features->getDimensions().d[0]); auto roiAlignLayer = network->addPluginV2(roi_inputs.data(), roi_inputs.size(), roiAlignPlugin); // res5 /* same with https://github.com/facebookresearch/detectron2/ blob/9246ebc3af1c023cfbdae77e5d976edbcf9a2933/detectron2/modeling/roi_heads/roi_heads.py#L430, use bottleneck here, so pass R50*/ auto box_features = MakeStage(network, weightMap, "roi_heads.res5", *roiAlignLayer->getOutput(0), 3, R50, roiAlignLayer->getOutput(0)->getDimensions().d[1], 512, RES2_OUT_CHANNELS * 8, 2); return box_features; } void BoxHead(INetworkDefinition *network, std::map& weightMap, ITensor* proposals, ITensor* features, std::vector& instances) { auto box_features = SharedRoiTransform(network, weightMap, proposals, features, POST_NMS_TOPK); auto box_features_mean = network->addReduce(*box_features, ReduceOperation::kAVG, 12, true); // score auto scores = network->addFullyConnected(*box_features_mean->getOutput(0), NUM_CLASSES + 1, weightMap["roi_heads.box_predictor.cls_score.weight"], 
weightMap["roi_heads.box_predictor.cls_score.bias"]); auto probs = network->addSoftMax(*scores->getOutput(0)); auto probs_dim = probs->getOutput(0)->getDimensions(); auto score_slice = network->addSlice(*probs->getOutput(0), Dims4{ 0, 0, 0, 0 }, Dims4{ probs_dim.d[0], probs_dim.d[1] - 1, 1, 1 }, Dims4{ 1, 1, 1, 1 }); auto proposal_deltas = network->addFullyConnected(*box_features_mean->getOutput(0), NUM_CLASSES * 4, weightMap["roi_heads.box_predictor.bbox_pred.weight"], weightMap["roi_heads.box_predictor.bbox_pred.bias"]); // decode std::vector predictorDecodeInput = { score_slice->getOutput(0), proposal_deltas->getOutput(0), proposals }; auto predictorDecodePlugin = PredictorDecodePlugin(probs_dim.d[0], INPUT_H, INPUT_W, BBOX_REG_WEIGHTS); auto predictorDecodeLayer = network->addPluginV2(predictorDecodeInput.data(), predictorDecodeInput.size(), predictorDecodePlugin); // nms std::vector nmsInput = { predictorDecodeLayer->getOutput(0), predictorDecodeLayer->getOutput(1), predictorDecodeLayer->getOutput(2) }; auto batchedNmsPlugin = BatchedNmsPlugin(NMS_METHOD, NMS_THRESH_TEST, DETECTIONS_PER_IMAGE); auto batchedNmsLayer = network->addPluginV2(nmsInput.data(), nmsInput.size(), batchedNmsPlugin); // instances instances.push_back(batchedNmsLayer->getOutput(0)); instances.push_back(batchedNmsLayer->getOutput(1)); instances.push_back(batchedNmsLayer->getOutput(2)); } void MaskHead(INetworkDefinition *network, std::map& weightMap, ITensor* features, std::vector& instances, int out_channels = 256) { auto mask_features = SharedRoiTransform(network, weightMap, instances[1], features, DETECTIONS_PER_IMAGE); // mask_fcn auto mask_deconv = network->addDeconvolutionNd(*mask_features, out_channels, DimsHW{ 2, 2 }, weightMap["roi_heads.mask_head.deconv.weight"], weightMap["roi_heads.mask_head.deconv.bias"]); mask_deconv->setStrideNd(DimsHW{ 2, 2 }); auto deconv_relu = network->addActivation(*mask_deconv->getOutput(0), ActivationType::kRELU); assert(deconv_relu); auto predictor = 
network->addConvolutionNd(*deconv_relu->getOutput(0), NUM_CLASSES, DimsHW{ 1, 1 }, weightMap["roi_heads.mask_head.predictor.weight"], weightMap["roi_heads.mask_head.predictor.bias"]); predictor->setStrideNd(DimsHW{ 1, 1 }); ITensor* masks; if (NUM_CLASSES == 1) { auto mask_probs_pred = network->addActivation(*predictor->getOutput(0), ActivationType::kSIGMOID); masks = mask_probs_pred->getOutput(0); } else { std::vector mask_rcnn_inference_inputs = { instances[2], predictor->getOutput(0) }; auto maskRcnnInferencePlugin = MaskRcnnInferencePlugin(DETECTIONS_PER_IMAGE, POOLER_RESOLUTION); auto maskRcnnInferenceLayer = network->addPluginV2(mask_rcnn_inference_inputs.data(), mask_rcnn_inference_inputs.size(), maskRcnnInferencePlugin); masks = maskRcnnInferenceLayer->getOutput(0); } instances.push_back(masks); } std::vector ROIHeads(INetworkDefinition *network, std::map& weightMap, ITensor* proposals, ITensor* features) { std::vector instances; // box head BoxHead(network, weightMap, proposals, features, instances); if (MASK_ON) { // mask head MaskHead(network, weightMap, features, instances); } return instances; } ICudaEngine* createEngine_rcnn(unsigned int maxBatchSize, const std::string& wtsfile, IBuilder* builder, IBuilderConfig* config, DataType dt, const std::string& quantizationType) { /* description: after fuse bn */ INetworkDefinition* network = builder->createNetworkV2(0U); // Create input tensor of shape {INPUT_H, INPUT_W, 3} with name INPUT_BLOB_NAME ITensor* data = network->addInput(INPUT_NODE_NAME, dt, Dims3{ INPUT_H, INPUT_W, 3 }); assert(data); // preprocess data = DataPreprocess(network, *data); std::map weightMap; loadWeights(wtsfile, weightMap); // backbone ITensor* features = BuildResNet(network, weightMap, *data, BACKBONE_RESNETTYPE, 64, 64, RES2_OUT_CHANNELS); auto proposals = RPN(network, weightMap, *features); auto results = ROIHeads(network, weightMap, proposals, features); // build output for (int i = 0; i < results.size(); i++) { 
network->markOutput(*results[i]); results[i]->setName(OUTPUT_NAMES[i].c_str()); } // build engine builder->setMaxBatchSize(maxBatchSize); config->setMaxWorkspaceSize(1ULL << 30); if (quantizationType == "fp32") { } else if (quantizationType == "fp16") { config->setFlag(BuilderFlag::kFP16); } else if (quantizationType == "int8") { std::cout << "Your platform support int8: " << (builder->platformHasFastInt8() ? "true" : "false") << std::endl; assert(builder->platformHasFastInt8()); config->setFlag(BuilderFlag::kINT8); Int8EntropyCalibrator2* calibrator = new Int8EntropyCalibrator2(1, INPUT_W, INPUT_H, "./coco_calib/", "int8calib.table", INPUT_NODE_NAME); config->setInt8Calibrator(calibrator); } else { throw("does not support model type"); } std::cout << "Building engine, please wait for a while..." << std::endl; ICudaEngine* engine = builder->buildEngineWithConfig(*network, *config); std::cout << "Build engine successfully!" << std::endl; // destroy network network->destroy(); // Release host memory for (auto& mem : weightMap) { delete[] mem.second.values; } return engine; } void BuildRcnnModel(unsigned int maxBatchSize, IHostMemory** modelStream, const std::string& wtsfile, const std::string& quantizationType = "fp32") { // Create builder IBuilder* builder = createInferBuilder(gLogger); IBuilderConfig* config = builder->createBuilderConfig(); ICudaEngine* engine = createEngine_rcnn(maxBatchSize, wtsfile, builder, config, DataType::kFLOAT, quantizationType); assert(engine != nullptr); // Serialize the engine (*modelStream) = engine->serialize(); // Close everything down engine->destroy(); builder->destroy(); } void doInference(IExecutionContext& context, cudaStream_t& stream, std::vector& buffers, std::vector& input, std::vector& output) { CUDA_CHECK(cudaMemcpyAsync(buffers[0], input.data(), BATCH_SIZE * INPUT_H * INPUT_W * 3 * sizeof(float), cudaMemcpyHostToDevice, stream)); context.enqueue(BATCH_SIZE, buffers.data(), stream, nullptr); 
CUDA_CHECK(cudaMemcpyAsync(output[0], buffers[1], BATCH_SIZE * DETECTIONS_PER_IMAGE * sizeof(float), cudaMemcpyDeviceToHost, stream)); CUDA_CHECK(cudaMemcpyAsync(output[1], buffers[2], BATCH_SIZE * DETECTIONS_PER_IMAGE * 4 * sizeof(float), cudaMemcpyDeviceToHost, stream)); CUDA_CHECK(cudaMemcpyAsync(output[2], buffers[3], BATCH_SIZE * DETECTIONS_PER_IMAGE * sizeof(float), cudaMemcpyDeviceToHost, stream)); if (MASK_ON) CUDA_CHECK(cudaMemcpyAsync(output[3], buffers[4], BATCH_SIZE * DETECTIONS_PER_IMAGE * POOLER_RESOLUTION * POOLER_RESOLUTION * sizeof(float), cudaMemcpyDeviceToHost, stream)); cudaStreamSynchronize(stream); } void calculateSize() { float ratio = MIN_SIZE / static_cast(std::min(INPUT_H_, INPUT_W_)); float newh = 0, neww = 0; if (INPUT_H_ < INPUT_W_) { newh = MIN_SIZE; neww = ratio * INPUT_W_; } else { newh = ratio * INPUT_H_; neww = MIN_SIZE; } if (std::max(newh, neww) > MAX_SIZE) { ratio = MAX_SIZE / static_cast(std::max(newh, neww)); newh = newh * ratio; neww = neww * ratio; } INPUT_H = static_cast(newh + 0.5); INPUT_W = static_cast(neww + 0.5); } bool parse_args(int argc, char** argv, std::string& wtsFile, std::string& engineFile, std::string& imgDir) { if (argc < 4) return false; if (std::string(argv[1]) == "-s") { wtsFile = std::string(argv[2]); engineFile = std::string(argv[3]); } else if (std::string(argv[1]) == "-d") { engineFile = std::string(argv[2]); imgDir = std::string(argv[3]); } else { return false; } if (argc >= 5 && std::string(argv[4]) == "m") MASK_ON = true; return true; } int main(int argc, char** argv) { int flag = 0; for (int &item : NMS_METHOD_VEC) { if (item == NMS_METHOD) { flag = 1; printf("The nms method %d is applied.\n", NMS_METHOD); break; } } if (flag == 0) { printf("[WARNING] The nms_method %d is not supported, please choose from [0, 1, 2].\n", NMS_METHOD); printf("[WARNING] To make the nms robust, the default nms method 0 is applied.\n"); NMS_METHOD = 0; } // calculate size calculateSize(); cudaSetDevice(DEVICE); 
std::string wtsFile = ""; std::string engineFile = ""; std::string imgDir; if (!parse_args(argc, argv, wtsFile, engineFile, imgDir)) { std::cerr << "arguments not right!" << std::endl; std::cerr << "./rcnn -s [.wts] [.engine] [m] // serialize model to plan file" << std::endl; std::cerr << "./rcnn -d [.engine] ../samples [m] // deserialize plan file and run inference" << std::endl; return -1; } if (!wtsFile.empty()) { IHostMemory* modelStream{ nullptr }; BuildRcnnModel(BATCH_SIZE, &modelStream, wtsFile, "fp32"); assert(modelStream != nullptr); std::ofstream p(engineFile, std::ios::binary); if (!p) { std::cerr << "could not open plan output file" << std::endl; return -1; } p.write(reinterpret_cast(modelStream->data()), modelStream->size()); modelStream->destroy(); return 0; } // deserialize the .engine and run inference std::ifstream file(engineFile, std::ios::binary); if (!file.good()) { std::cerr << "read " << engineFile << " error!" << std::endl; return -1; } std::string trtModelStream; size_t modelSize{ 0 }; file.seekg(0, file.end); modelSize = file.tellg(); file.seekg(0, file.beg); trtModelStream.resize(modelSize); assert(!trtModelStream.empty()); file.read(const_cast(trtModelStream.c_str()), modelSize); file.close(); // build engine std::cout << "build engine" << std::endl; IRuntime* runtime = createInferRuntime(gLogger); assert(runtime != nullptr); ICudaEngine* engine = runtime->deserializeCudaEngine(trtModelStream.c_str(), modelSize); assert(engine != nullptr); IExecutionContext* context = engine->createExecutionContext(); assert(context != nullptr); runtime->destroy(); cudaStream_t stream; CUDA_CHECK(cudaStreamCreate(&stream)); // prepare input file std::vector fileList; if (read_files_in_dir(imgDir.c_str(), fileList) < 0) { std::cerr << "read_files_in_dir failed." 
<< std::endl; return -1; } // prepare input data std::vector data(BATCH_SIZE * INPUT_H * INPUT_W * 3, 0); void *data_d, *scores_d, *boxes_d, *classes_d, *masks_d; CUDA_CHECK(cudaMalloc(&data_d, BATCH_SIZE * INPUT_H * INPUT_W * 3 * sizeof(float))); CUDA_CHECK(cudaMalloc(&scores_d, BATCH_SIZE * DETECTIONS_PER_IMAGE * sizeof(float))); CUDA_CHECK(cudaMalloc(&boxes_d, BATCH_SIZE * DETECTIONS_PER_IMAGE * 4 * sizeof(float))); CUDA_CHECK(cudaMalloc(&classes_d, BATCH_SIZE * DETECTIONS_PER_IMAGE * sizeof(float))); std::vector scores_h(BATCH_SIZE * DETECTIONS_PER_IMAGE); std::vector boxes_h(BATCH_SIZE * DETECTIONS_PER_IMAGE * 4); std::vector classes_h(BATCH_SIZE * DETECTIONS_PER_IMAGE); std::vector masks_h; std::vector buffers = { data_d, scores_d, boxes_d, classes_d }; std::vector outputs = {scores_h.data(), boxes_h.data(), classes_h.data()}; if (MASK_ON) { CUDA_CHECK(cudaMalloc(&masks_d, BATCH_SIZE * DETECTIONS_PER_IMAGE * POOLER_RESOLUTION * POOLER_RESOLUTION * sizeof(float))); masks_h.resize(BATCH_SIZE * DETECTIONS_PER_IMAGE * POOLER_RESOLUTION * POOLER_RESOLUTION); buffers.push_back(masks_d); outputs.push_back(masks_h.data()); } int fcount = 0; int fileLen = fileList.size(); for (int f = 0; f < fileLen; f++) { fcount++; if (fcount < BATCH_SIZE && f + 1 != fileLen) continue; for (int b = 0; b < fcount; b++) { cv::Mat img = cv::imread(imgDir + "/" + fileList[f - fcount + 1 + b]); h_ori = img.rows; w_ori = img.cols; img = preprocessImg(img, INPUT_W, INPUT_H, X_LEFT_PAD, X_RIGHT_PAD, Y_TOP_PAD, Y_BOTTOM_PAD); if (img.empty()) continue; for (int i = 0; i < INPUT_H * INPUT_W * 3; i++) data[b*INPUT_H * INPUT_W * 3 + i] = static_cast(*(img.data + i)); } // Run inference auto start = std::chrono::system_clock::now(); doInference(*context, stream, buffers, data, outputs); auto end = std::chrono::system_clock::now(); std::cout << std::chrono::duration_cast(end - start).count() << "ms" << std::endl; float h_ratio = static_cast(h_ori) / (INPUT_H - (Y_TOP_PAD + Y_BOTTOM_PAD)); // 
ratio of original image size to model input size float w_ratio = static_cast(w_ori) / (INPUT_W - (X_LEFT_PAD + X_RIGHT_PAD)); for (int b = 0; b < fcount; b++) { cv::Mat img = cv::imread(imgDir + "/" + fileList[f - fcount + 1 + b]); for (int i = 0; i < DETECTIONS_PER_IMAGE; i++) { if (scores_h[b * DETECTIONS_PER_IMAGE + i] > SCORE_THRESH) { float x1 = (boxes_h[b * DETECTIONS_PER_IMAGE * 4 + i * 4 + 0] - X_LEFT_PAD) * w_ratio; float y1 = (boxes_h[b * DETECTIONS_PER_IMAGE * 4 + i * 4 + 1] - Y_TOP_PAD) * h_ratio; float x2 = (boxes_h[b * DETECTIONS_PER_IMAGE * 4 + i * 4 + 2] - X_LEFT_PAD) * w_ratio; float y2 = (boxes_h[b * DETECTIONS_PER_IMAGE * 4 + i * 4 + 3] - Y_TOP_PAD) * h_ratio; int label = classes_h[b * DETECTIONS_PER_IMAGE + i]; float score = scores_h[b * DETECTIONS_PER_IMAGE + i]; printf("boxes:[%.6f, %.6f, %.6f, %.6f] scores: %.4f label: %d \n", x1, y1, x2, y2, score, label); cv::Rect r(x1, y1, x2 - x1, y2 - y1); cv::rectangle(img, r, cv::Scalar(0x27, 0xC1, 0x36), 2); cv::putText(img, std::to_string(label), cv::Point(r.x, r.y - 1), cv::FONT_HERSHEY_PLAIN, 1.2, cv::Scalar(0xFF, 0xFF, 0xFF), 2); if (MASK_ON) { cv::Mat maskPart = cv::Mat::zeros(cv::Size(POOLER_RESOLUTION, POOLER_RESOLUTION), CV_32FC1); memcpy(maskPart.data, &masks_h[b * DETECTIONS_PER_IMAGE * POOLER_RESOLUTION * POOLER_RESOLUTION + i * POOLER_RESOLUTION * POOLER_RESOLUTION], POOLER_RESOLUTION * POOLER_RESOLUTION * sizeof(float)); cv::Rect r(cv::Point(floor(x1) - 1 < 0 ? 0 : floor(x1) - 1, floor(y1) - 1 < 0 ? 0 : floor(y1) - 1), cv::Point(ceil(x2) + 1 > w_ori ? w_ori : ceil(x2) + 1, ceil(y2) + 1 > h_ori ? 
h_ori : ceil(y2) + 1)); cv::resize(maskPart, maskPart, cv::Size(r.width, r.height)); cv::Mat curMask = cv::Mat::zeros(cv::Size(w_ori, h_ori), CV_8UC1); cv::threshold(maskPart, maskPart, 0.5, 255, cv::THRESH_BINARY); curMask(r) += maskPart; std::vector> contours; cv::findContours(curMask, contours, cv::RETR_EXTERNAL, cv::CHAIN_APPROX_NONE); for (int c = 0; c < contours.size(); c++) cv::drawContours(img, contours, c, cv::Scalar(0, 0, 255)); } } } cv::imwrite("_" + fileList[f - fcount + 1 + b], img); } fcount = 0; } cudaStreamDestroy(stream); CUDA_CHECK(cudaFree(data_d)); CUDA_CHECK(cudaFree(scores_d)); CUDA_CHECK(cudaFree(boxes_d)); CUDA_CHECK(cudaFree(classes_d)); if (MASK_ON) CUDA_CHECK(cudaFree(masks_d)); context->destroy(); engine->destroy(); return 0; } ================================================ FILE: real-esrgan/general-x4v3/CMakeLists.txt ================================================ cmake_minimum_required(VERSION 3.16) project(real-esrgan) set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${CMAKE_SOURCE_DIR}/cmake/") add_definitions(-std=c++17) add_definitions(-DAPI_EXPORTS) option(CUDA_USE_STATIC_CUDA_RUNTIME OFF) #set(CMAKE_CXX_STANDARD 11) set(CMAKE_BUILD_TYPE Debug) #find_package(CUDA REQUIRED) INCLUDE_DIRECTORIES(${PROJECT_SOURCE_DIR}/src/include) # cuda FIND_PACKAGE(CUDA REQUIRED) #INCLUDE_DIRECTORIES(${CUDA_INCLUDE_DIRS}) include_directories(/usr/local/cuda/include) link_directories(/usr/local/cuda/lib64) # <------------------------TensorRT Related-------------------------> include_directories(YOUR_TENSORRT_INCLUDE_DIR) # TensorRT-8.6.1.6/include link_directories(YOUR_TENSORRT_LIB_DIR) # TensorRT-8.6.1.6/lib # <------------------------OpenCV Related-------------------------> # opencv FIND_PACKAGE(OpenCV REQUIRED) INCLUDE_DIRECTORIES(${OpenCV_INCLUDE_DIRS}) set(CMAKE_CXX_STANDARD 17) add_executable(${PROJECT_NAME} main.cpp) cuda_add_library(myplugins SHARED ${PROJECT_SOURCE_DIR}/src/pixel_shuffle/pixel_shuffle.cu) target_link_libraries(myplugins 
nvinfer cudart) TARGET_LINK_LIBRARIES(${PROJECT_NAME} nvinfer) TARGET_LINK_LIBRARIES(${PROJECT_NAME} cudart) TARGET_LINK_LIBRARIES(${PROJECT_NAME} ${OpenCV_LIBS}) TARGET_LINK_LIBRARIES(${PROJECT_NAME} myplugins) ================================================ FILE: real-esrgan/general-x4v3/README.md ================================================ # Real-ESRGAN realesr-general-x4v3 model ## How to Run 0. Replace YOUR_TENSORRT_INCLUDE_DIR and YOUR_TENSORRT_LIB_DIR in CMakeLists.txt with your TensorRT include and lib directories. 1. Generate the .wts file from the PyTorch .pth weights ``` git clone https://github.com/xinntao/Real-ESRGAN.git cd Real-ESRGAN # Install basicsr - https://github.com/xinntao/BasicSR # We use BasicSR for both training and inference pip install basicsr # facexlib and gfpgan are for face enhancement pip install facexlib pip install gfpgan pip install -r requirements.txt python setup.py develop ``` Download realesr-general-x4v3.pth (and realesr-general-wdn-x4v3.pth if needed) from https://github.com/xinntao/Real-ESRGAN/releases ``` cp {tensorrtx}/real-esrgan/general-x4v3/gen_wts.py {xinntao}/Real-ESRGAN cd {xinntao}/Real-ESRGAN python gen_wts.py # a file 'real-esrgan.wts' will be generated. ``` **Be aware that if you need both realesr-general-x4v3.pth and realesr-general-wdn-x4v3.pth, please write a Python script to average all weights of realesr-general-x4v3.pth and realesr-general-wdn-x4v3.pth (from {xinntao}/Real-ESRGAN), then save it as a .pth file, and use this new file to generate a .wts file. Whenever `--denoise_strength` is not 1 (the default is 0.5), gen_wts.py loads `realesr-general-x4v3-cat.pth` from the same directory as realesr-general-x4v3.pth, so save the averaged weights under that name.** 2. Build tensorrtx/real-esrgan/general-x4v3 and run ``` cd {tensorrtx}/real-esrgan/general-x4v3/ mkdir build cd build cp {xinntao}/Real-ESRGAN/real-esrgan.wts {tensorrtx}/real-esrgan/general-x4v3/weights/ cmake .. 
make ./real-esrgan your_images_dir ``` ================================================ FILE: real-esrgan/general-x4v3/cmake/FindTensorRT.cmake ================================================ # source: # https://github.com/NVIDIA/tensorrt-laboratory/blob/master/cmake/FindTensorRT.cmake # This module defines the following variables: # # :: # # TensorRT_INCLUDE_DIRS # TensorRT_LIBRARIES # TensorRT_FOUND # # :: # # TensorRT_VERSION_STRING - version (x.y.z) # TensorRT_VERSION_MAJOR - major version (x) # TensorRT_VERSION_MINOR - minor version (y) # TensorRT_VERSION_PATCH - patch version (z) # # Hints # ^^^^^ # A user may set ``TensorRT_DIR`` to an installation root to tell this module where to look. # set(_TensorRT_SEARCHES) if(TensorRT_DIR) set(_TensorRT_SEARCH_ROOT PATHS ${TensorRT_DIR} NO_DEFAULT_PATH) list(APPEND _TensorRT_SEARCHES _TensorRT_SEARCH_ROOT) endif() # appends some common paths set(_TensorRT_SEARCH_NORMAL PATHS "/usr" ) list(APPEND _TensorRT_SEARCHES _TensorRT_SEARCH_NORMAL) # Include dir foreach(search ${_TensorRT_SEARCHES}) find_path(TensorRT_INCLUDE_DIR NAMES NvInfer.h ${${search}} PATH_SUFFIXES include) endforeach() if(NOT TensorRT_LIBRARY) foreach(search ${_TensorRT_SEARCHES}) find_library(TensorRT_LIBRARY NAMES nvinfer ${${search}} PATH_SUFFIXES lib) endforeach() endif() if(NOT TensorRT_PARSERS_LIBRARY) foreach(search ${_TensorRT_SEARCHES}) find_library(TensorRT_NVPARSERS_LIBRARY NAMES nvparsers ${${search}} PATH_SUFFIXES lib) endforeach() endif() if(NOT TensorRT_NVONNXPARSER_LIBRARY) foreach(search ${_TensorRT_SEARCHES}) find_library(TensorRT_NVONNXPARSER_LIBRARY NAMES nvonnxparser ${${search}} PATH_SUFFIXES lib) endforeach() endif() mark_as_advanced(TensorRT_INCLUDE_DIR) if(TensorRT_INCLUDE_DIR AND EXISTS "${TensorRT_INCLUDE_DIR}/NvInfer.h") file(STRINGS "${TensorRT_INCLUDE_DIR}/NvInfer.h" TensorRT_MAJOR REGEX "^#define NV_TENSORRT_MAJOR [0-9]+.*$") file(STRINGS "${TensorRT_INCLUDE_DIR}/NvInfer.h" TensorRT_MINOR REGEX "^#define NV_TENSORRT_MINOR 
[0-9]+.*$") file(STRINGS "${TensorRT_INCLUDE_DIR}/NvInfer.h" TensorRT_PATCH REGEX "^#define NV_TENSORRT_PATCH [0-9]+.*$") string(REGEX REPLACE "^#define NV_TENSORRT_MAJOR ([0-9]+).*$" "\\1" TensorRT_VERSION_MAJOR "${TensorRT_MAJOR}") string(REGEX REPLACE "^#define NV_TENSORRT_MINOR ([0-9]+).*$" "\\1" TensorRT_VERSION_MINOR "${TensorRT_MINOR}") string(REGEX REPLACE "^#define NV_TENSORRT_PATCH ([0-9]+).*$" "\\1" TensorRT_VERSION_PATCH "${TensorRT_PATCH}") set(TensorRT_VERSION_STRING "${TensorRT_VERSION_MAJOR}.${TensorRT_VERSION_MINOR}.${TensorRT_VERSION_PATCH}") endif() include(FindPackageHandleStandardArgs) FIND_PACKAGE_HANDLE_STANDARD_ARGS(TensorRT REQUIRED_VARS TensorRT_LIBRARY TensorRT_INCLUDE_DIR VERSION_VAR TensorRT_VERSION_STRING) if(TensorRT_FOUND) set(TensorRT_INCLUDE_DIRS ${TensorRT_INCLUDE_DIR}) if(NOT TensorRT_LIBRARIES) set(TensorRT_LIBRARIES ${TensorRT_LIBRARY} ${TensorRT_NVONNXPARSER_LIBRARY} ${TensorRT_NVPARSERS_LIBRARY}) endif() if(NOT TARGET TensorRT::TensorRT) add_library(TensorRT::TensorRT UNKNOWN IMPORTED) set_target_properties(TensorRT::TensorRT PROPERTIES INTERFACE_INCLUDE_DIRECTORIES "${TensorRT_INCLUDE_DIRS}") set_property(TARGET TensorRT::TensorRT APPEND PROPERTY IMPORTED_LOCATION "${TensorRT_LIBRARY}") endif() endif() ================================================ FILE: real-esrgan/general-x4v3/gen_wts.py ================================================ import argparse import os import struct from realesrgan import RealESRGANer from realesrgan.archs.srvgg_arch import SRVGGNetCompact from basicsr.archs.rrdbnet_arch import RRDBNet from basicsr.utils.download_util import load_file_from_url def main(): parser = argparse.ArgumentParser() parser.add_argument('-i', '--input', type=str, help='Input image or folder') parser.add_argument( '-n', '--model_name', type=str, default='realesr-general-x4v3', help=('RealESRGAN_x2plus Model names: ' 'realesr-animevideov3 | realesr-general-x4v3')) parser.add_argument('-o', '--output', type=str, help='Output 
folder') parser.add_argument( '-dn', '--denoise_strength', type=float, default=0.5, help=('Denoise strength. 0 for weak denoise (keep noise), 1 for strong denoise ability. ' 'Only used for the realesr-general-x4v3 model')) parser.add_argument('-s', '--outscale', type=float, default=4, help='The final upsampling scale of the image') parser.add_argument( '--model_path', type=str, default=None, help='[Option] Model path. Usually, you do not need to specify it') parser.add_argument('--suffix', type=str, default='out', help='Suffix of the restored image') parser.add_argument('-t', '--tile', type=int, default=0, help='Tile size, 0 for no tile during testing') parser.add_argument('--tile_pad', type=int, default=10, help='Tile padding') parser.add_argument('--pre_pad', type=int, default=0, help='Pre padding size at each border') parser.add_argument('--face_enhance', action='store_true', help='Use GFPGAN to enhance face') parser.add_argument( '--fp32', action='store_true', help='Use fp32 precision during inference. Default: fp16 (half precision).') parser.add_argument( '--alpha_upsampler', type=str, default='realesrgan', help='The upsampler for the alpha channels. Options: realesrgan | bicubic') parser.add_argument( '--ext', type=str, default='auto', help='Image extension. 
Options: auto | jpg | png, auto means using the same extension as inputs') parser.add_argument( '-g', '--gpu-id', type=int, default=None, help='gpu device to use (default=None) can be 0,1,2 for multi-gpu') args = parser.parse_args() # determine models according to model names args.model_name = args.model_name.split('.')[0] if args.model_name == 'RealESRGAN_x4plus': # x4 RRDBNet model model = RRDBNet(num_in_ch=3, num_out_ch=3, num_feat=64, num_block=23, num_grow_ch=32, scale=4) netscale = 4 file_url = ['https://github.com/xinntao/Real-ESRGAN/releases/download/v0.1.0/RealESRGAN_x4plus.pth'] elif args.model_name == 'RealESRNet_x4plus': # x4 RRDBNet model model = RRDBNet(num_in_ch=3, num_out_ch=3, num_feat=64, num_block=23, num_grow_ch=32, scale=4) netscale = 4 file_url = ['https://github.com/xinntao/Real-ESRGAN/releases/download/v0.1.1/RealESRNet_x4plus.pth'] elif args.model_name == 'RealESRGAN_x4plus_anime_6B': # x4 RRDBNet model with 6 blocks model = RRDBNet(num_in_ch=3, num_out_ch=3, num_feat=64, num_block=6, num_grow_ch=32, scale=4) netscale = 4 file_url = ['https://github.com/xinntao/Real-ESRGAN/releases/download/v0.2.2.4/RealESRGAN_x4plus_anime_6B.pth'] elif args.model_name == 'RealESRGAN_x2plus': # x2 RRDBNet model model = RRDBNet(num_in_ch=3, num_out_ch=3, num_feat=64, num_block=23, num_grow_ch=32, scale=2) netscale = 2 file_url = ['https://github.com/xinntao/Real-ESRGAN/releases/download/v0.2.1/RealESRGAN_x2plus.pth'] elif args.model_name == 'realesr-animevideov3': # x4 VGG-style model (XS size) model = SRVGGNetCompact(num_in_ch=3, num_out_ch=3, num_feat=64, num_conv=16, upscale=4, act_type='prelu') netscale = 4 file_url = ['https://github.com/xinntao/Real-ESRGAN/releases/download/v0.2.5.0/realesr-animevideov3.pth'] elif args.model_name == 'realesr-general-x4v3': # x4 VGG-style model (S size) model = SRVGGNetCompact(num_in_ch=3, num_out_ch=3, num_feat=64, num_conv=32, upscale=4, act_type='prelu') netscale = 4 file_url = [ 
'https://github.com/xinntao/Real-ESRGAN/releases/download/v0.2.5.0/realesr-general-wdn-x4v3.pth', 'https://github.com/xinntao/Real-ESRGAN/releases/download/v0.2.5.0/realesr-general-x4v3.pth' ] # determine model paths if args.model_path is not None: model_path = args.model_path else: model_path = os.path.join('weights', args.model_name + '.pth') if not os.path.isfile(model_path): ROOT_DIR = os.path.dirname(os.path.abspath(__file__)) for url in file_url: # model_path will be updated model_path = load_file_from_url( url=url, model_dir=os.path.join(ROOT_DIR, 'weights'), progress=True, file_name=None) # use dni to control the denoise strength dni_weight = None if args.model_name == 'realesr-general-x4v3' and args.denoise_strength != 1: # wdn_model_path = model_path.replace('realesr-general-x4v3', 'realesr-general-wdn-x4v3') # model_path = [model_path, wdn_model_path] # dni_weight = [args.denoise_strength, 1 - args.denoise_strength] model_path = model_path.replace('realesr-general-x4v3', 'realesr-general-x4v3-cat') dni_weight = None # restorer upsampler = RealESRGANer( scale=netscale, model_path=model_path, dni_weight=dni_weight, model=model, tile=args.tile, tile_pad=args.tile_pad, pre_pad=args.pre_pad, half=not args.fp32, gpu_id=args.gpu_id) if os.path.isfile('real-esrgan.wts'): print('Already, real-esrgan.wts file exists.') else: print('making real-esrgan.wts file ...') f = open("real-esrgan.wts", 'w') f.write("{}\n".format(len(upsampler.model.state_dict().keys()))) for k, v in upsampler.model.state_dict().items(): print('key: ', k) print('value: ', v.shape) vr = v.reshape(-1).cpu().numpy() f.write("{} {}".format(k, len(vr))) for vv in vr: f.write(" ") f.write(struct.pack(">f", float(vv)).hex()) f.write("\n") print('Completed real-esrgan.wts file!') if __name__ == '__main__': main() ================================================ FILE: real-esrgan/general-x4v3/main.cpp ================================================ #include #include #include #include #include 
#include #include #include "config/config.hpp" #include "cuda_utils.h" #include "logging/logging.h" #include "pixel_shuffle/pixel_shuffle.hpp" #include "preprocess/preprocess.hpp" static Logger gLogger; using namespace nvinfer1; // TensorRT weight files have a simple space delimited format: // [type] [size] std::map loadWeights(const std::string file) { std::cout << "Loading weights: " << file << std::endl; std::map weightMap; // Open weights file std::ifstream input(file); assert(input.is_open() && "Unable to load weight file. please check if the .wts file path is right!!!!!!"); // Read number of weight blobs int32_t count; input >> count; assert(count > 0 && "Invalid weight map file."); while (count--) { Weights wt{DataType::kFLOAT, nullptr, 0}; uint32_t size; // Read name and type of blob std::string name; input >> name >> std::dec >> size; wt.type = DataType::kFLOAT; // Load blob uint32_t* val = reinterpret_cast(malloc(sizeof(val) * size)); for (uint32_t x = 0, y = size; x < y; ++x) { input >> std::hex >> val[x]; } wt.values = val; wt.count = size; weightMap[name] = wt; } return weightMap; } auto* ConvPRelu(INetworkDefinition* network, std::map& weightMap, ITensor& input, int conv_nb, int index) { IConvolutionLayer* conv = network->addConvolutionNd(input, conv_nb, DimsHW{3, 3}, weightMap["body." + std::to_string(index) + ".weight"], weightMap["body." + std::to_string(index) + ".bias"]); assert(conv); conv->setName(("body." + std::to_string(index) + ".weight").c_str()); conv->setStrideNd(DimsHW{1, 1}); conv->setPaddingNd(DimsHW{1, 1}); auto conv_res = conv->getOutput(0); // add prelu layer // slope 64 number //auto slope = network->addConstant( {64}, weightMap["body." + std::to_string(index + 1) + ".weight"] ); auto slope = network->addConstant(Dims4{1, 64, 1, 1}, weightMap["body." + std::to_string(index + 1) + ".weight"]); assert(slope); slope->setName(("body." 
+ std::to_string(index + 1) + ".weight").c_str()); auto prelu = network->addParametricReLU(*conv_res, *slope->getOutput(0)); assert(prelu); return prelu; } void build_engine(DataType dt, std::string& wts_path) { std::map weightMap = loadWeights(wts_path); nvinfer1::IBuilder* builder = nvinfer1::createInferBuilder(gLogger); nvinfer1::IBuilderConfig* config = builder->createBuilderConfig(); nvinfer1::INetworkDefinition* network = builder->createNetworkV2(1U); auto data = network->addInput(INPUT_BLOB_NAME, nvinfer1::DataType::kFLOAT, nvinfer1::Dims4{BATCH_SIZE, INPUT_C, INPUT_H, INPUT_W}); // first auto layer = ConvPRelu(network, weightMap, *data, 64, 0); for (int i = 0; i < 32; ++i) { layer = ConvPRelu(network, weightMap, *layer->getOutput(0), 64, 2 * i + 2); } auto conv_last = network->addConvolutionNd(*layer->getOutput(0), 48, DimsHW{3, 3}, weightMap["body.66.weight"], weightMap["body.66.bias"]); assert(conv_last); conv_last->setName("body.66.weight"); conv_last->setStrideNd(DimsHW{1, 1}); conv_last->setPaddingNd(DimsHW{1, 1}); auto conv_last_res = conv_last->getOutput(0); // add pixel shuffle layer by plugin IPluginCreator* creator = getPluginRegistry()->getPluginCreator("PixelShufflePlugin", "1"); const PluginFieldCollection* pluginFC = creator->getFieldNames(); std::vector pluginData; int upscaleFactor = 4; pluginData.emplace_back(PluginField{"upscaleFactor", &upscaleFactor, PluginFieldType::kINT32, 1}); PluginFieldCollection pluginFCWithData = {static_cast(pluginData.size()), pluginData.data()}; auto pluginObj = creator->createPlugin("PixelShuffle", &pluginFCWithData); auto pixelShuffleLayer = network->addPluginV2(&conv_last_res, 1, *pluginObj); // the input "data" interpolate 4x and add to pixelShuffleLayer->getOutput(0) auto interpolateLayer = network->addResize(*data); interpolateLayer->setResizeMode(ResizeMode::kNEAREST); // Define scale factors float scales[] = {1.0f, 1.0f, 1.0 * OUT_SCALE, 1.0 * OUT_SCALE}; // scale_factor=4 for height and width 
interpolateLayer->setScales(scales, OUT_SCALE); // Add the two tensor as output auto addLayer = network->addElementWise(*interpolateLayer->getOutput(0), *pixelShuffleLayer->getOutput(0), ElementWiseOperation::kSUM); // output addLayer->getOutput(0)->setName(OUTPUT_BLOB_NAME); network->markOutput(*addLayer->getOutput(0)); // fp16 if (USE_FP16) { config->setFlag(BuilderFlag::kFP16); } std::cout << "Building engine, please wait for a while..." << std::endl; IHostMemory* serialized_model = builder->buildSerializedNetwork(*network, *config); std::cout << "Build engine successfully!" << std::endl; // Don't need the network any more delete network; // Release host memory for (auto& mem : weightMap) { free((void*)(mem.second.values)); } std::ofstream ofs("../weights/real-esrgan.engine", std::ios::binary); assert(serialized_model != nullptr); ofs.write(reinterpret_cast(serialized_model->data()), serialized_model->size()); delete config; delete serialized_model; delete builder; } static inline int read_files_in_dir(const char* p_dir_name, std::vector& file_names) { DIR* p_dir = opendir(p_dir_name); if (p_dir == nullptr) { return -1; } struct dirent* p_file = nullptr; while ((p_file = readdir(p_dir)) != nullptr) { if (strcmp(p_file->d_name, ".") != 0 && strcmp(p_file->d_name, "..") != 0) { //std::string cur_file_name(p_dir_name); //cur_file_name += "/"; //cur_file_name += p_file->d_name; std::string cur_file_name(p_file->d_name); file_names.push_back(cur_file_name); } } closedir(p_dir); return 0; } void doInference(IExecutionContext& context, cudaStream_t& stream, void** buffers, float* output) { context.setBindingDimensions(0, Dims4(BATCH_SIZE, INPUT_C, INPUT_H, INPUT_W)); context.enqueueV2(buffers, stream, nullptr); CUDA_CHECK(cudaMemcpyAsync(output, buffers[1], BATCH_SIZE * OUTPUT_SIZE * sizeof(float), cudaMemcpyDeviceToHost, stream)); CUDA_CHECK(cudaStreamSynchronize(stream)); } int main(int argc, char** argv) { std::string img_dir; if (argc < 2) { std::cerr << "Usage: " 
<< argv[0] << " " << std::endl; return -1; } else { img_dir = argv[1]; } std::string wts_path = "../weights/real-esrgan.wts"; build_engine(DataType::kFLOAT, wts_path); std::string engine_name = "../weights/real-esrgan.engine"; // deserialize the .engine and run inference std::ifstream file(engine_name, std::ios::binary); if (!file.good()) { std::cerr << "read " << engine_name << " error!" << std::endl; return -1; } char* trtModelStream = nullptr; size_t size = 0; file.seekg(0, file.end); size = file.tellg(); file.seekg(0, file.beg); trtModelStream = new char[size]; assert(trtModelStream); file.read(trtModelStream, size); file.close(); IRuntime* runtime = createInferRuntime(gLogger); assert(runtime != nullptr); ICudaEngine* engine = runtime->deserializeCudaEngine(trtModelStream, size); assert(engine != nullptr); IExecutionContext* context = engine->createExecutionContext(); assert(context != nullptr); delete[] trtModelStream; assert(engine->getNbBindings() == 2); void* buffers[2]; // In order to bind the buffers, we need to know the names of the input and output tensors. // Note that indices are guaranteed to be less than IEngine::getNbBindings() const int inputIndex = engine->getBindingIndex(INPUT_BLOB_NAME); const int outputIndex = engine->getBindingIndex(OUTPUT_BLOB_NAME); assert(inputIndex == 0); assert(outputIndex == 1); // Create GPU buffers on device CUDA_CHECK(cudaMalloc(&buffers[inputIndex], BATCH_SIZE * INPUT_C * INPUT_H * INPUT_W * sizeof(float))); CUDA_CHECK(cudaMalloc(&buffers[outputIndex], BATCH_SIZE * OUTPUT_SIZE * sizeof(float))); std::vector data; std::vector output; //std::vector res; //data.resize(BATCH_SIZE * 3 * INPUT_H * INPUT_W); data.resize(BATCH_SIZE * INPUT_C * INPUT_H * INPUT_W); output.resize(BATCH_SIZE * OUTPUT_SIZE); // Create stream cudaStream_t stream; CUDA_CHECK(cudaStreamCreate(&stream)); std::vector file_names; if (read_files_in_dir(img_dir.c_str(), file_names) < 0) { std::cerr << "read_files_in_dir failed." 
<< std::endl; return -1; } for (int index = 0; index < file_names.size(); ++index) { auto img = cv::imread(img_dir + "/" + file_names[index]); auto begin = std::chrono::high_resolution_clock::now(); // BATCH_SIZE = 1 for (int b = 0; b < BATCH_SIZE; b++) { int i = 0; for (int row = 0; row < INPUT_H; ++row) { uchar* uc_pixel = img.data + row * img.step; for (int col = 0; col < INPUT_W; ++col) { // static float data[BATCH_SIZE * 3 * INPUT_H * INPUT_W]; // BGR2RGB and normalization data[b * 3 * INPUT_H * INPUT_W + i] = (float)uc_pixel[2] / 255.0; data[b * 3 * INPUT_H * INPUT_W + i + INPUT_H * INPUT_W] = (float)uc_pixel[1] / 255.0; data[b * 3 * INPUT_H * INPUT_W + i + 2 * INPUT_H * INPUT_W] = (float)uc_pixel[0] / 255.0; uc_pixel += 3; ++i; } } } CUDA_CHECK(cudaMemcpyAsync(buffers[inputIndex], data.data(), BATCH_SIZE * INPUT_C * INPUT_H * INPUT_W * sizeof(float), cudaMemcpyHostToDevice, stream)); doInference(*context, stream, (void**)buffers, output.data()); auto end = std::chrono::high_resolution_clock::now(); std::cout << "Inference time: " << std::chrono::duration_cast(end - begin).count() << " ms" << std::endl; int OUTPUT_C = 3; int OUTPUT_H = INPUT_H * OUT_SCALE; int OUTPUT_W = INPUT_W * OUT_SCALE; for (int b = 0; b < BATCH_SIZE; b++) { cv::Mat img_res(OUTPUT_H, OUTPUT_W, CV_8UC3); int i = 0; for (int row = 0; row < OUTPUT_H; ++row) { uchar* uc_pixel = img_res.data + row * img_res.step; for (int col = 0; col < OUTPUT_W; ++col) { // RGB2BGR and de_normalization auto r2 = std::round(output[b * OUTPUT_C * OUTPUT_H * OUTPUT_W + i] * 255.0); if (r2 < 0) r2 = 0; if (r2 > 255) r2 = 255; auto g2 = std::round(output[b * OUTPUT_C * OUTPUT_H * OUTPUT_W + i + 1 * OUTPUT_H * OUTPUT_W] * 255.0); if (g2 < 0) g2 = 0; if (g2 > 255) g2 = 255; auto b2 = std::round(output[b * OUTPUT_C * OUTPUT_H * OUTPUT_W + i + 2 * OUTPUT_H * OUTPUT_W] * 255.0); if (b2 < 0) b2 = 0; if (b2 > 255) b2 = 255; uc_pixel[0] = static_cast(b2); // B uc_pixel[1] = static_cast(g2); // G uc_pixel[2] = 
static_cast(r2); // R // uc_pixel[0] = static_cast(std::round(output[b * OUTPUT_C * OUTPUT_H * OUTPUT_W + i + 2 * OUTPUT_H * OUTPUT_W] * 255.0)); // B // uc_pixel[1] = static_cast(std::round(output[b * OUTPUT_C * OUTPUT_H * OUTPUT_W + i + 1 * OUTPUT_H * OUTPUT_W] * 255.0)); // G // uc_pixel[2] = static_cast(std::round(output[b * OUTPUT_C * OUTPUT_H * OUTPUT_W + i] * 255.0)); // R uc_pixel += 3; ++i; } } cv::imwrite("_" + file_names[index] + ".jpg", img_res); } } // Release stream and buffers cudaStreamDestroy(stream); CUDA_CHECK(cudaFree(buffers[0])); CUDA_CHECK(cudaFree(buffers[1])); // Destroy the engine delete context; delete engine; delete runtime; } ================================================ FILE: real-esrgan/general-x4v3/src/include/config/config.hpp ================================================ #ifndef REAL_ESRGAN_TRT_CONFIG_HPP #define REAL_ESRGAN_TRT_CONFIG_HPP #include //std::string INPUT_BLOB_NAME = "input"; //std::string OUTPUT_BLOB_NAME = "output"; const char* INPUT_BLOB_NAME = "input_0"; const char* OUTPUT_BLOB_NAME = "output_0"; const bool USE_FP16 = false; static const int BATCH_SIZE = 1; static const int INPUT_C = 3; static const int INPUT_H = 450; static const int INPUT_W = 300; static const int OUT_SCALE = 4; //static const int OUTPUT_SIZE = INPUT_C * INPUT_H * OUT_SCALE * INPUT_W * OUT_SCALE; static const int OUTPUT_SIZE = BATCH_SIZE * 48 * 450 * 300; //INPUT_C * INPUT_H * OUT_SCALE * INPUT_W * OUT_SCALE; #endif //REAL_ESRGAN_TRT_CONFIG_HPP ================================================ FILE: real-esrgan/general-x4v3/src/include/cuda_utils.h ================================================ #ifndef TRTX_CUDA_UTILS_H_ #define TRTX_CUDA_UTILS_H_ #include #include #include #include #include #ifndef CUDA_CHECK #define CUDA_CHECK(callstr) \ { \ cudaError_t error_code = callstr; \ if (error_code != cudaSuccess) { \ std::cerr << "CUDA error " << error_code << " at " << __FILE__ << ":" << __LINE__; \ assert(0); \ } \ } #endif // CUDA_CHECK 
#endif // TRTX_CUDA_UTILS_H_ ================================================ FILE: real-esrgan/general-x4v3/src/include/logging/logging.h ================================================ /* * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ #ifndef TENSORRT_LOGGING_H #define TENSORRT_LOGGING_H #include #include #include #include #include #include #include #include "NvInferRuntimeCommon.h" using Severity = nvinfer1::ILogger::Severity; class LogStreamConsumerBuffer : public std::stringbuf { public: LogStreamConsumerBuffer(std::ostream& stream, const std::string& prefix, bool shouldLog) : mOutput(stream), mPrefix(prefix), mShouldLog(shouldLog) {} LogStreamConsumerBuffer(LogStreamConsumerBuffer&& other) : mOutput(other.mOutput) {} ~LogStreamConsumerBuffer() { // std::streambuf::pbase() gives a pointer to the beginning of the buffered part of the output sequence // std::streambuf::pptr() gives a pointer to the current position of the output sequence // if the pointer to the beginning is not equal to the pointer to the current position, // call putOutput() to log the output to the stream if (pbase() != pptr()) { putOutput(); } } // synchronizes the stream buffer and returns 0 on success // synchronizing the stream buffer consists of inserting the buffer contents into the stream, // resetting the buffer and flushing the stream virtual int sync() { putOutput(); return 0; } void putOutput() { if (mShouldLog) { // prepend 
timestamp std::time_t timestamp = std::time(nullptr); tm* tm_local = std::localtime(×tamp); std::cout << "["; std::cout << std::setw(2) << std::setfill('0') << 1 + tm_local->tm_mon << "/"; std::cout << std::setw(2) << std::setfill('0') << tm_local->tm_mday << "/"; std::cout << std::setw(4) << std::setfill('0') << 1900 + tm_local->tm_year << "-"; std::cout << std::setw(2) << std::setfill('0') << tm_local->tm_hour << ":"; std::cout << std::setw(2) << std::setfill('0') << tm_local->tm_min << ":"; std::cout << std::setw(2) << std::setfill('0') << tm_local->tm_sec << "] "; // std::stringbuf::str() gets the string contents of the buffer // insert the buffer contents pre-appended by the appropriate prefix into the stream mOutput << mPrefix << str(); // set the buffer to empty str(""); // flush the stream mOutput.flush(); } } void setShouldLog(bool shouldLog) { mShouldLog = shouldLog; } private: std::ostream& mOutput; std::string mPrefix; bool mShouldLog; }; //! //! \class LogStreamConsumerBase //! \brief Convenience object used to initialize LogStreamConsumerBuffer before std::ostream in LogStreamConsumer //! class LogStreamConsumerBase { public: LogStreamConsumerBase(std::ostream& stream, const std::string& prefix, bool shouldLog) : mBuffer(stream, prefix, shouldLog) {} protected: LogStreamConsumerBuffer mBuffer; }; //! //! \class LogStreamConsumer //! \brief Convenience object used to facilitate use of C++ stream syntax when logging messages. //! Order of base classes is LogStreamConsumerBase and then std::ostream. //! This is because the LogStreamConsumerBase class is used to initialize the LogStreamConsumerBuffer member field //! in LogStreamConsumer and then the address of the buffer is passed to std::ostream. //! This is necessary to prevent the address of an uninitialized buffer from being passed to std::ostream. //! Please do not change the order of the parent classes. //! class LogStreamConsumer : protected LogStreamConsumerBase, public std::ostream { public: //! 
\brief Creates a LogStreamConsumer which logs messages with level severity. //! Reportable severity determines if the messages are severe enough to be logged. LogStreamConsumer(Severity reportableSeverity, Severity severity) : LogStreamConsumerBase(severityOstream(severity), severityPrefix(severity), severity <= reportableSeverity), std::ostream(&mBuffer) // links the stream buffer with the stream , mShouldLog(severity <= reportableSeverity), mSeverity(severity) {} LogStreamConsumer(LogStreamConsumer&& other) : LogStreamConsumerBase(severityOstream(other.mSeverity), severityPrefix(other.mSeverity), other.mShouldLog), std::ostream(&mBuffer) // links the stream buffer with the stream , mShouldLog(other.mShouldLog), mSeverity(other.mSeverity) {} void setReportableSeverity(Severity reportableSeverity) { mShouldLog = mSeverity <= reportableSeverity; mBuffer.setShouldLog(mShouldLog); } private: static std::ostream& severityOstream(Severity severity) { return severity >= Severity::kINFO ? std::cout : std::cerr; } static std::string severityPrefix(Severity severity) { switch (severity) { case Severity::kINTERNAL_ERROR: return "[F] "; case Severity::kERROR: return "[E] "; case Severity::kWARNING: return "[W] "; case Severity::kINFO: return "[I] "; case Severity::kVERBOSE: return "[V] "; default: assert(0); return ""; } } bool mShouldLog; Severity mSeverity; }; //! \class Logger //! //! \brief Class which manages logging of TensorRT tools and samples //! //! \details This class provides a common interface for TensorRT tools and samples to log information to the console, //! and supports logging two types of messages: //! //! - Debugging messages with an associated severity (info, warning, error, or internal error/fatal) //! - Test pass/fail messages //! //! The advantage of having all samples use this class for logging as opposed to emitting directly to stdout/stderr is //! 
that the logic for controlling the verbosity and formatting of sample output is centralized in one location. //! //! In the future, this class could be extended to support dumping test results to a file in some standard format //! (for example, JUnit XML), and providing additional metadata (e.g. timing the duration of a test run). //! //! TODO: For backwards compatibility with existing samples, this class inherits directly from the nvinfer1::ILogger //! interface, which is problematic since there isn't a clean separation between messages coming from the TensorRT //! library and messages coming from the sample. //! //! In the future (once all samples are updated to use Logger::getTRTLogger() to access the ILogger) we can refactor the //! class to eliminate the inheritance and instead make the nvinfer1::ILogger implementation a member of the Logger //! object. class Logger : public nvinfer1::ILogger { public: Logger(Severity severity = Severity::kWARNING) : mReportableSeverity(severity) {} //! //! \enum TestResult //! \brief Represents the state of a given test //! enum class TestResult { kRUNNING, //!< The test is running kPASSED, //!< The test passed kFAILED, //!< The test failed kWAIVED //!< The test was waived }; //! //! \brief Forward-compatible method for retrieving the nvinfer::ILogger associated with this Logger //! \return The nvinfer1::ILogger associated with this Logger //! //! TODO Once all samples are updated to use this method to register the logger with TensorRT, //! we can eliminate the inheritance of Logger from ILogger //! nvinfer1::ILogger& getTRTLogger() { return *this; } //! //! \brief Implementation of the nvinfer1::ILogger::log() virtual method //! //! Note samples should not be calling this function directly; it will eventually go away once we eliminate the //! inheritance from nvinfer1::ILogger //! 
void log(Severity severity, const char* msg) noexcept override { LogStreamConsumer(mReportableSeverity, severity) << "[TRT] " << std::string(msg) << std::endl; } //! //! \brief Method for controlling the verbosity of logging output //! //! \param severity The logger will only emit messages that have severity of this level or higher. //! void setReportableSeverity(Severity severity) { mReportableSeverity = severity; } //! //! \brief Opaque handle that holds logging information for a particular test //! //! This object is an opaque handle to information used by the Logger to print test results. //! The sample must call Logger::defineTest() in order to obtain a TestAtom that can be used //! with Logger::reportTest{Start,End}(). //! class TestAtom { public: TestAtom(TestAtom&&) = default; private: friend class Logger; TestAtom(bool started, const std::string& name, const std::string& cmdline) : mStarted(started), mName(name), mCmdline(cmdline) {} bool mStarted; std::string mName; std::string mCmdline; }; //! //! \brief Define a test for logging //! //! \param[in] name The name of the test. This should be a string starting with //! "TensorRT" and containing dot-separated strings containing //! the characters [A-Za-z0-9_]. //! For example, "TensorRT.sample_googlenet" //! \param[in] cmdline The command line used to reproduce the test // //! \return a TestAtom that can be used in Logger::reportTest{Start,End}(). //! static TestAtom defineTest(const std::string& name, const std::string& cmdline) { return TestAtom(false, name, cmdline); } //! //! \brief A convenience overloaded version of defineTest() that accepts an array of command-line arguments //! as input //! //! \param[in] name The name of the test //! \param[in] argc The number of command-line arguments //! \param[in] argv The array of command-line arguments (given as C strings) //! //! \return a TestAtom that can be used in Logger::reportTest{Start,End}(). 
static TestAtom defineTest(const std::string& name, int argc, char const* const* argv) { auto cmdline = genCmdlineString(argc, argv); return defineTest(name, cmdline); } //! //! \brief Report that a test has started. //! //! \pre reportTestStart() has not been called yet for the given testAtom //! //! \param[in] testAtom The handle to the test that has started //! static void reportTestStart(TestAtom& testAtom) { reportTestResult(testAtom, TestResult::kRUNNING); assert(!testAtom.mStarted); testAtom.mStarted = true; } //! //! \brief Report that a test has ended. //! //! \pre reportTestStart() has been called for the given testAtom //! //! \param[in] testAtom The handle to the test that has ended //! \param[in] result The result of the test. Should be one of TestResult::kPASSED, //! TestResult::kFAILED, TestResult::kWAIVED //! static void reportTestEnd(const TestAtom& testAtom, TestResult result) { assert(result != TestResult::kRUNNING); assert(testAtom.mStarted); reportTestResult(testAtom, result); } static int reportPass(const TestAtom& testAtom) { reportTestEnd(testAtom, TestResult::kPASSED); return EXIT_SUCCESS; } static int reportFail(const TestAtom& testAtom) { reportTestEnd(testAtom, TestResult::kFAILED); return EXIT_FAILURE; } static int reportWaive(const TestAtom& testAtom) { reportTestEnd(testAtom, TestResult::kWAIVED); return EXIT_SUCCESS; } static int reportTest(const TestAtom& testAtom, bool pass) { return pass ? reportPass(testAtom) : reportFail(testAtom); } Severity getReportableSeverity() const { return mReportableSeverity; } private: //! //! \brief returns an appropriate string for prefixing a log message with the given severity //! static const char* severityPrefix(Severity severity) { switch (severity) { case Severity::kINTERNAL_ERROR: return "[F] "; case Severity::kERROR: return "[E] "; case Severity::kWARNING: return "[W] "; case Severity::kINFO: return "[I] "; case Severity::kVERBOSE: return "[V] "; default: assert(0); return ""; } } //! //! 
\brief returns an appropriate string for prefixing a test result message with the given result //! static const char* testResultString(TestResult result) { switch (result) { case TestResult::kRUNNING: return "RUNNING"; case TestResult::kPASSED: return "PASSED"; case TestResult::kFAILED: return "FAILED"; case TestResult::kWAIVED: return "WAIVED"; default: assert(0); return ""; } } //! //! \brief returns an appropriate output stream (cout or cerr) to use with the given severity //! static std::ostream& severityOstream(Severity severity) { return severity >= Severity::kINFO ? std::cout : std::cerr; } //! //! \brief method that implements logging test results //! static void reportTestResult(const TestAtom& testAtom, TestResult result) { severityOstream(Severity::kINFO) << "&&&& " << testResultString(result) << " " << testAtom.mName << " # " << testAtom.mCmdline << std::endl; } //! //! \brief generate a command line string from the given (argc, argv) values //! static std::string genCmdlineString(int argc, char const* const* argv) { std::stringstream ss; for (int i = 0; i < argc; i++) { if (i > 0) ss << " "; ss << argv[i]; } return ss.str(); } Severity mReportableSeverity; }; namespace { //! //! \brief produces a LogStreamConsumer object that can be used to log messages of severity kVERBOSE //! //! Example usage: //! //! LOG_VERBOSE(logger) << "hello world" << std::endl; //! inline LogStreamConsumer LOG_VERBOSE(const Logger& logger) { return LogStreamConsumer(logger.getReportableSeverity(), Severity::kVERBOSE); } //! //! \brief produces a LogStreamConsumer object that can be used to log messages of severity kINFO //! //! Example usage: //! //! LOG_INFO(logger) << "hello world" << std::endl; //! inline LogStreamConsumer LOG_INFO(const Logger& logger) { return LogStreamConsumer(logger.getReportableSeverity(), Severity::kINFO); } //! //! \brief produces a LogStreamConsumer object that can be used to log messages of severity kWARNING //! //! Example usage: //! //! 
LOG_WARN(logger) << "hello world" << std::endl; //! inline LogStreamConsumer LOG_WARN(const Logger& logger) { return LogStreamConsumer(logger.getReportableSeverity(), Severity::kWARNING); } //! //! \brief produces a LogStreamConsumer object that can be used to log messages of severity kERROR //! //! Example usage: //! //! LOG_ERROR(logger) << "hello world" << std::endl; //! inline LogStreamConsumer LOG_ERROR(const Logger& logger) { return LogStreamConsumer(logger.getReportableSeverity(), Severity::kERROR); } //! //! \brief produces a LogStreamConsumer object that can be used to log messages of severity kINTERNAL_ERROR // ("fatal" severity) //! //! Example usage: //! //! LOG_FATAL(logger) << "hello world" << std::endl; //! inline LogStreamConsumer LOG_FATAL(const Logger& logger) { return LogStreamConsumer(logger.getReportableSeverity(), Severity::kINTERNAL_ERROR); } } // anonymous namespace #endif // TENSORRT_LOGGING_H ================================================ FILE: real-esrgan/general-x4v3/src/include/pixel_shuffle/pixel_shuffle.hpp ================================================ #ifndef REAL_ESRGAN_TRT_PIXEL_SHUFFLE_HPP #define REAL_ESRGAN_TRT_PIXEL_SHUFFLE_HPP #include #include #include "NvInfer.h" class PixelShufflePlugin : public nvinfer1::IPluginV2DynamicExt { public: PixelShufflePlugin(int upscaleFactor) : mUpscaleFactor(upscaleFactor) {} PixelShufflePlugin(const void* data, size_t length) { memcpy(&mUpscaleFactor, data, sizeof(mUpscaleFactor)); } const char* getPluginType() const noexcept override { return "PixelShufflePlugin"; } const char* getPluginVersion() const noexcept override { return "1"; } int getNbOutputs() const noexcept override { return 1; } // nvinfer1::DimsExprs getOutputDimensions(int outputIndex, const nvinfer1::DimsExprs* inputs, int nbInputs, nvinfer1::IExprBuilder& exprBuilder) noexcept override // { // assert(outputIndex == 0); // auto* in = &inputs[0]; // nvinfer1::DimsExprs outputDims = *in; // int channels = in->d[0]; // int 
height = in->d[1]; // int width = in->d[2]; // int upscaleFactor = mUpscaleFactor; // outputDims.d[0] = exprBuilder.constant(channels / (upscaleFactor * upscaleFactor)); // outputDims.d[1] = exprBuilder.operation(nvinfer1::DimensionOperation::kPROD, {height, exprBuilder.constant(upscaleFactor)}); // outputDims.d[2] = exprBuilder.operation(nvinfer1::DimensionOperation::kPROD, {width, exprBuilder.constant(upscaleFactor)}); // return outputDims; // } nvinfer1::DimsExprs getOutputDimensions(int32_t outputIndex, nvinfer1::DimsExprs const* inputs, int32_t nbInputs, nvinfer1::IExprBuilder& exprBuilder) noexcept override { // assert(nbInputs == 1); auto inDims = inputs[0]; // assert(inDims.nbDims == 4); int c = inDims.d[1]->getConstantValue() / (mUpscaleFactor * mUpscaleFactor); int h = inDims.d[2]->getConstantValue() * mUpscaleFactor; int w = inDims.d[3]->getConstantValue() * mUpscaleFactor; nvinfer1::DimsExprs outDims; outDims.nbDims = 4; outDims.d[0] = inDims.d[0]; outDims.d[1] = exprBuilder.constant(c); outDims.d[2] = exprBuilder.constant(h); outDims.d[3] = exprBuilder.constant(w); return outDims; } bool supportsFormatCombination(int pos, const nvinfer1::PluginTensorDesc* inOut, int nbInputs, int nbOutputs) noexcept override { return inOut[pos].format == nvinfer1::TensorFormat::kLINEAR && inOut[pos].type == nvinfer1::DataType::kFLOAT; } nvinfer1::DataType getOutputDataType(int index, const nvinfer1::DataType* inputTypes, int nbInputs) const noexcept override { return inputTypes[0]; } // bool canBroadcastInputAcrossBatch(int inputIndex) const noexcept override // { // return false; // } void configurePlugin(const nvinfer1::DynamicPluginTensorDesc* inputs, int nbInputs, const nvinfer1::DynamicPluginTensorDesc* outputs, int nbOutputs) noexcept override {} // void configurePlugin(const nvinfer1::DynamicPluginTensorDesc* in, int nbInputs, const nvinfer1::DynamicPluginTensorDesc* out, int nbOutputs) noexcept override // { // // Optionally configure plugin if necessary // } 
size_t getWorkspaceSize(const nvinfer1::PluginTensorDesc* inputs, int nbInputs, const nvinfer1::PluginTensorDesc* outputs, int nbOutputs) const noexcept override { return 0; } size_t getSerializationSize() const noexcept override { return sizeof(mUpscaleFactor); } void serialize(void* buffer) const noexcept override { memcpy(buffer, &mUpscaleFactor, sizeof(mUpscaleFactor)); } void destroy() noexcept override { // delete this; } nvinfer1::IPluginV2DynamicExt* clone() const noexcept override { return new PixelShufflePlugin(mUpscaleFactor); } void setPluginNamespace(const char* pluginNamespace) noexcept override { mNamespace = pluginNamespace; } const char* getPluginNamespace() const noexcept override { return mNamespace.c_str(); } int initialize() noexcept override { return 0; } int32_t enqueue(nvinfer1::PluginTensorDesc const* inputDesc, nvinfer1::PluginTensorDesc const* outputDesc, void const* const* inputs, void* const* outputs, void* workspace, cudaStream_t stream) noexcept override; void terminate() noexcept override {} private: int mUpscaleFactor; std::string mNamespace; }; class PixelShufflePluginCreator : public nvinfer1::IPluginCreator { public: PixelShufflePluginCreator() { mPluginAttributes.clear(); mPluginAttributes.emplace_back( nvinfer1::PluginField("upscaleFactor", nullptr, nvinfer1::PluginFieldType::kINT32, 1)); mFC.nbFields = mPluginAttributes.size(); mFC.fields = mPluginAttributes.data(); } ~PixelShufflePluginCreator() override = default; const char* getPluginName() const noexcept override { return "PixelShufflePlugin"; } const char* getPluginVersion() const noexcept override { return "1"; } const nvinfer1::PluginFieldCollection* getFieldNames() noexcept override { return &mFC; } nvinfer1::IPluginV2* createPlugin(const char* name, const nvinfer1::PluginFieldCollection* fc) noexcept override { int upscaleFactor = 0; for (int i = 0; i < fc->nbFields; ++i) { if (strcmp(fc->fields[i].name, "upscaleFactor") == 0) { upscaleFactor = 
*static_cast(fc->fields[i].data); } } return new PixelShufflePlugin(upscaleFactor); } nvinfer1::IPluginV2* deserializePlugin(const char* name, const void* serialData, size_t serialLength) noexcept override { return new PixelShufflePlugin(serialData, serialLength); } void setPluginNamespace(const char* pluginNamespace) noexcept override { mNamespace = pluginNamespace; } const char* getPluginNamespace() const noexcept override { return mNamespace.c_str(); } private: static nvinfer1::PluginFieldCollection mFC; static std::vector mPluginAttributes; std::string mNamespace; }; nvinfer1::PluginFieldCollection PixelShufflePluginCreator::mFC{}; std::vector PixelShufflePluginCreator::mPluginAttributes{ nvinfer1::PluginField{"upscaleFactor", nullptr, nvinfer1::PluginFieldType::kINT32, 1}}; REGISTER_TENSORRT_PLUGIN(PixelShufflePluginCreator); #endif //REAL_ESRGAN_TRT_PIXEL_SHUFFLE_HPP ================================================ FILE: real-esrgan/general-x4v3/src/include/preprocess/preprocess.hpp ================================================ #ifndef REAL_ESRGAN_TRT_PREPROCESS_HPP #define REAL_ESRGAN_TRT_PREPROCESS_HPP struct PreprocessStruct { int N; int C; int H; int W; }; #endif //REAL_ESRGAN_TRT_PREPROCESS_HPP ================================================ FILE: real-esrgan/general-x4v3/src/pixel_shuffle/pixel_shuffle.cpp ================================================ // PixelShufflePlugin.cpp // // #include "pixel_shuffle/pixel_shuffle.hpp" // #include // #include // // PixelShufflePlugin::PixelShufflePlugin(int upscaleFactor) // : mUpscaleFactor(upscaleFactor) { // // Initialize other members // } // // PixelShufflePlugin::PixelShufflePlugin(const void* data, size_t length) { // // Deserialize data to initialize members // const char* d = static_cast(data); // mUpscaleFactor = *reinterpret_cast(d); // d += sizeof(int); // mInputVolume = *reinterpret_cast(d); // d += sizeof(size_t); // mOutputVolume = *reinterpret_cast(d); // } // // int 
PixelShufflePlugin::getNbOutputs() const { // return 1; // } // // nvinfer1::Dims PixelShufflePlugin::getOutputDimensions(int index, const nvinfer1::Dims* inputs, int nbInputDims) { // assert(index == 0); // assert(nbInputDims == 1); // int c = inputs[0].d[0]; // int h = inputs[0].d[1]; // int w = inputs[0].d[2]; // int upscaleFactor = mUpscaleFactor; // // assert(c % (upscaleFactor * upscaleFactor) == 0); // int newC = c / (upscaleFactor * upscaleFactor); // int newH = h * upscaleFactor; // int newW = w * upscaleFactor; // // return nvinfer1::Dims3(newC, newH, newW); // } // // int PixelShufflePlugin::initialize() { // return 0; // } // // void PixelShufflePlugin::terminate() { // // Clean up // } // // size_t PixelShufflePlugin::getWorkspaceSize(int maxBatchSize) const { // return 0; // } // // int PixelShufflePlugin::enqueue(int batchSize, const void* const* inputs, void** outputs, void* workspace, cudaStream_t stream) { // // Launch CUDA kernel for PixelShuffle // // Assume inputs[0] and outputs[0] are pointers to device memory // const float* input = static_cast(inputs[0]); // float* output = static_cast(outputs[0]); // // int c = mInputVolume / (mUpscaleFactor * mUpscaleFactor); // int h = mOutputVolume / (c * mUpscaleFactor); // int w = h; // Assuming square input for simplicity // int upscaleFactor = mUpscaleFactor; // // // Launch CUDA kernel (to be implemented) // // pixelShuffleKernel(input, output, c, h, w, upscaleFactor, stream); // // return 0; // } // // size_t PixelShufflePlugin::getSerializationSize() const { // return sizeof(int) + sizeof(size_t) * 2; // } // // void PixelShufflePlugin::serialize(void* buffer) const { // char* d = static_cast(buffer); // *reinterpret_cast(d) = mUpscaleFactor; // d += sizeof(int); // *reinterpret_cast(d) = mInputVolume; // d += sizeof(size_t); // *reinterpret_cast(d) = mOutputVolume; // } // // void PixelShufflePlugin::destroy() { // delete this; // } // // const char* PixelShufflePlugin::getPluginType() const { // 
return "PixelShufflePlugin"; // } // // const char* PixelShufflePlugin::getPluginVersion() const { // return "1"; // } // // void PixelShufflePlugin::setPluginNamespace(const char* pluginNamespace) { // mPluginNamespace = pluginNamespace; // } // // const char* PixelShufflePlugin::getPluginNamespace() const { // return mPluginNamespace; // } // // nvinfer1::IPluginV2IOExt* PixelShufflePlugin::clone() const { // return new PixelShufflePlugin(mUpscaleFactor); // } // // bool PixelShufflePlugin::supportsFormatCombination(int pos, const nvinfer1::PluginTensorDesc* inOut, int nbInputs, int nbOutputs) const { // return inOut[pos].format == nvinfer1::TensorFormat::kLINEAR && inOut[pos].type == nvinfer1::DataType::kFLOAT; // } // // void PixelShufflePlugin::configurePlugin(const nvinfer1::DynamicPluginTensorDesc* in, int nbInputs, const nvinfer1::DynamicPluginTensorDesc* out, int nbOutputs) { // // Configure the plugin based on the input and output descriptions // mInputVolume = in[0].desc.volume(); // mOutputVolume = out[0].desc.volume(); // } // // nvinfer1::DataType PixelShufflePlugin::getOutputDataType(int index, const nvinfer1::DataType* inputTypes, int nbInputs) const { // return inputTypes[0]; // } // // bool PixelShufflePlugin::isOutputBroadcastAcrossBatch(int outputIndex, const bool* inputIsBroadcasted, int nbInputs) const { // return false; // } // // bool PixelShufflePlugin::canBroadcastInputAcrossBatch(int inputIndex) const { // return false; // } ================================================ FILE: real-esrgan/general-x4v3/src/pixel_shuffle/pixel_shuffle.cu ================================================ #include #include #include "pixel_shuffle/pixel_shuffle.hpp" // CUDA kernel for PixelShuffle __global__ void PixelShuffleKernel(const float* input, float* output, int batchSize, int channels, int height, int width, int upscaleFactor) { int outHeight = height * upscaleFactor; int outWidth = width * upscaleFactor; int outChannels = channels / (upscaleFactor * 
upscaleFactor); int idx = blockIdx.x * blockDim.x + threadIdx.x; if (idx >= batchSize * outChannels * outHeight * outWidth) return; int out_w = idx % outWidth; int out_h = (idx / outWidth) % outHeight; int out_c = (idx / outWidth / outHeight) % outChannels; int b = idx / (outWidth * outHeight * outChannels); int in_c = out_c * upscaleFactor * upscaleFactor + (out_h % upscaleFactor) * upscaleFactor + (out_w % upscaleFactor); int in_h = out_h / upscaleFactor; int in_w = out_w / upscaleFactor; output[idx] = input[((b * channels + in_c) * height + in_h) * width + in_w]; } int32_t PixelShufflePlugin::enqueue(nvinfer1::PluginTensorDesc const* inputDesc, nvinfer1::PluginTensorDesc const* outputDesc, void const* const* inputs, void* const* outputs, void* workspace, cudaStream_t stream) noexcept { const float* input = static_cast(inputs[0]); float* output = static_cast(outputs[0]); int batchSize = inputDesc[0].dims.d[0]; int channels = inputDesc[0].dims.d[1]; int height = inputDesc[0].dims.d[2]; int width = inputDesc[0].dims.d[3]; int upscaleFactor = mUpscaleFactor; int outChannels = channels / (upscaleFactor * upscaleFactor); int outHeight = height * upscaleFactor; int outWidth = width * upscaleFactor; int numElements = batchSize * outChannels * outHeight * outWidth; PixelShuffleKernel<<<(numElements + 255) / 256, 256>>>(input, output, batchSize, channels, height, width, upscaleFactor); return cudaGetLastError() != cudaSuccess; } ================================================ FILE: real-esrgan/x4plus/CMakeLists.txt ================================================ cmake_minimum_required(VERSION 2.6) project(real-esrgan) add_definitions(-std=c++11) add_definitions(-DAPI_EXPORTS) option(CUDA_USE_STATIC_CUDA_RUNTIME OFF) set(CMAKE_CXX_STANDARD 11) set(CMAKE_BUILD_TYPE Debug) find_package(CUDA REQUIRED) if(WIN32) enable_language(CUDA) endif(WIN32) include_directories(${PROJECT_SOURCE_DIR}/include) # include and link dirs of cuda and tensorrt, you need adapt them if yours are 
# different
# cuda
include_directories(/usr/local/cuda/include)
link_directories(/usr/local/cuda/lib64)
# tensorrt
include_directories(/usr/include/x86_64-linux-gnu/)
link_directories(/usr/lib/x86_64-linux-gnu/)

set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11 -Wall -Ofast -g -Wfatal-errors -D_MWAITXINTRIN_H_INCLUDED")

cuda_add_library(myplugins SHARED preprocess.cu postprocess.cu)
target_link_libraries(myplugins nvinfer cudart)

# REQUIRED: without OpenCV the executable cannot be built, so fail at configure time with a
# clear message instead of later with missing headers or unresolved symbols.
find_package(OpenCV REQUIRED)
include_directories(${OpenCV_INCLUDE_DIRS})

cuda_add_executable(real-esrgan real-esrgan.cpp)
target_link_libraries(real-esrgan nvinfer)
target_link_libraries(real-esrgan cudart)
target_link_libraries(real-esrgan myplugins)
target_link_libraries(real-esrgan ${OpenCV_LIBS})

if(UNIX)
add_definitions(-O2 -pthread)
endif(UNIX)

================================================
FILE: real-esrgan/x4plus/README.md
================================================
# Real-ESRGAN

The Pytorch implementation is [real-esrgan](https://github.com/xinntao/Real-ESRGAN).

## Config

- Input shape (**INPUT_H**, **INPUT_W**, **INPUT_C**) is defined in real-esrgan.cpp
- GPU id (**DEVICE**) can be selected by the macro in real-esrgan.cpp
- **BATCH_SIZE** can be selected by the macro in real-esrgan.cpp
- FP16/FP32 can be selected by **PRECISION_MODE** in real-esrgan.cpp
- The example result can be visualized by **VISUALIZATION**.

## How to Run, real-esrgan as example

0. prepare a test image

- download : [OST_009.png](https://drive.google.com/file/d/1KAyAiQ8qHc5jSBkk2Uft2LfIhzi9XSyH/view?usp=sharing)

```
cd {tensorrtx}/real-esrgan/
mkdir sample
cp ~/Download/OST_009.png {tensorrtx}/real-esrgan/sample
```

1. generate .wts from pytorch with .pth, or download .wts from model zoo

```
git clone https://github.com/xinntao/Real-ESRGAN.git
cd Real-ESRGAN
pip install basicsr
pip install facexlib
pip install gfpgan
pip install -r requirements.txt
python setup.py develop

// download https://github.com/xinntao/Real-ESRGAN/releases/download/v0.1.0/RealESRGAN_x4plus.pth
cp ~/RealESRGAN_x4plus.pth {xinntao}/Real-ESRGAN/experiments/pretrained_models
cp {tensorrtx}/real-esrgan/gen_wts.py {xinntao}/Real-ESRGAN
cd {xinntao}/Real-ESRGAN
python gen_wts.py
// a file 'real-esrgan.wts' will be generated.
```

2. build tensorrtx/real-esrgan and run

```
cd {tensorrtx}/real-esrgan/
mkdir build
cd build
cp {xinntao}/Real-ESRGAN/real-esrgan.wts {tensorrtx}/real-esrgan/build
cmake ..
make
sudo ./real-esrgan -s [.wts] [.engine]  // serialize model to plan file
sudo ./real-esrgan -d [.engine] [image folder]  // deserialize and run inference, the images in [image folder] will be processed.
// For example
// sudo ./real-esrgan -s ./real-esrgan.wts ./real-esrgan_f32.engine
// sudo ./real-esrgan -d ./real-esrgan_f32.engine ../sample
```

3. check the images generated, as follows.
_OST_009.png ================================================ FILE: real-esrgan/x4plus/common.hpp ================================================ #ifndef REAL_ESRGAN_COMMON_H_ #define REAL_ESRGAN_COMMON_H_ #include #include #include #include #include #include "NvInfer.h" using namespace nvinfer1; // TensorRT weight files have a simple space delimited format: // [type] [size] std::map loadWeights(const std::string file) { std::cout << "Loading weights: " << file << std::endl; std::map weightMap; // Open weights file std::ifstream input(file); assert(input.is_open() && "Unable to load weight file. please check if the .wts file path is right!!!!!!"); // Read number of weight blobs int32_t count; input >> count; assert(count > 0 && "Invalid weight map file."); while (count--) { Weights wt{ DataType::kFLOAT, nullptr, 0 }; uint32_t size; // Read name and type of blob std::string name; input >> name >> std::dec >> size; wt.type = DataType::kFLOAT; // Load blob uint32_t* val = reinterpret_cast(malloc(sizeof(val) * size)); for (uint32_t x = 0, y = size; x < y; ++x) { input >> std::hex >> val[x]; } wt.values = val; wt.count = size; weightMap[name] = wt; } return weightMap; } ITensor* residualDenseBlock(INetworkDefinition *network, std::map& weightMap, ITensor* x, std::string lname) { IConvolutionLayer* conv_1 = network->addConvolutionNd(*x, 32, DimsHW{ 3, 3 }, weightMap[lname + ".conv1.weight"], weightMap[lname + ".conv1.bias"]); conv_1->setStrideNd(DimsHW{ 1, 1 }); conv_1->setPaddingNd(DimsHW{ 1, 1 }); IActivationLayer* leaky_relu_1 = network->addActivation(*conv_1->getOutput(0), ActivationType::kLEAKY_RELU); leaky_relu_1->setAlpha(0.2); ITensor* x1 = leaky_relu_1->getOutput(0); ITensor* concat_input2[] = { x, x1 }; IConcatenationLayer* concat2 = network->addConcatenation(concat_input2, 2); concat2->setAxis(0); IConvolutionLayer* conv_2 = network->addConvolutionNd(*concat2->getOutput(0), 32, DimsHW{ 3, 3 }, weightMap[lname + ".conv2.weight"], weightMap[lname + 
".conv2.bias"]); conv_2->setStrideNd(DimsHW{ 1, 1 }); conv_2->setPaddingNd(DimsHW{ 1, 1 }); IActivationLayer* leaky_relu_2 = network->addActivation(*conv_2->getOutput(0), ActivationType::kLEAKY_RELU); leaky_relu_2->setAlpha(0.2); ITensor* x2 = leaky_relu_2->getOutput(0); ITensor* concat_input3[] = { x, x1, x2 }; IConcatenationLayer* concat3 = network->addConcatenation(concat_input3, 3); concat3->setAxis(0); IConvolutionLayer* conv_3 = network->addConvolutionNd(*concat3->getOutput(0), 32, DimsHW{ 3, 3 }, weightMap[lname + ".conv3.weight"], weightMap[lname + ".conv3.bias"]); conv_3->setStrideNd(DimsHW{ 1, 1 }); conv_3->setPaddingNd(DimsHW{ 1, 1 }); IActivationLayer* leaky_relu_3 = network->addActivation(*conv_3->getOutput(0), ActivationType::kLEAKY_RELU); leaky_relu_3->setAlpha(0.2); ITensor* x3 = leaky_relu_3->getOutput(0); ITensor* concat_input4[] = { x, x1, x2, x3 }; IConcatenationLayer* concat4 = network->addConcatenation(concat_input4, 4); concat4->setAxis(0); IConvolutionLayer* conv_4 = network->addConvolutionNd(*concat4->getOutput(0), 32, DimsHW{ 3, 3 }, weightMap[lname + ".conv4.weight"], weightMap[lname + ".conv4.bias"]); conv_4->setStrideNd(DimsHW{ 1, 1 }); conv_4->setPaddingNd(DimsHW{ 1, 1 }); IActivationLayer* leaky_relu_4 = network->addActivation(*conv_4->getOutput(0), ActivationType::kLEAKY_RELU); leaky_relu_4->setAlpha(0.2); ITensor* x4 = leaky_relu_4->getOutput(0); ITensor* concat_input5[] = { x, x1, x2, x3, x4 }; IConcatenationLayer* concat5 = network->addConcatenation(concat_input5, 5); concat5->setAxis(0); IConvolutionLayer* conv_5 = network->addConvolutionNd(*concat5->getOutput(0), 64, DimsHW{ 3, 3 }, weightMap[lname + ".conv5.weight"], weightMap[lname + ".conv5.bias"]); conv_5->setStrideNd(DimsHW{ 1, 1 }); conv_5->setPaddingNd(DimsHW{ 1, 1 }); ITensor* x5 = conv_5->getOutput(0); float *scval = reinterpret_cast(malloc(sizeof(float))); *scval = 0.2; Weights scale{ DataType::kFLOAT, scval, 1 }; float *shval = reinterpret_cast(malloc(sizeof(float))); 
*shval = 0.0; Weights shift{ DataType::kFLOAT, shval, 1 }; float *pval = reinterpret_cast(malloc(sizeof(float))); *pval = 1.0; Weights power{ DataType::kFLOAT, pval, 1 }; IScaleLayer* scaled = network->addScale(*x5, ScaleMode::kUNIFORM, shift, scale, power); IElementWiseLayer* ew1 = network->addElementWise(*scaled->getOutput(0), *x, ElementWiseOperation::kSUM); return ew1->getOutput(0); } ITensor* RRDB(INetworkDefinition *network, std::map& weightMap, ITensor* x, std::string lname) { ITensor* out = residualDenseBlock(network, weightMap, x, lname + ".rdb1"); out = residualDenseBlock(network, weightMap, out, lname + ".rdb2"); out = residualDenseBlock(network, weightMap, out, lname + ".rdb3"); float *scval = reinterpret_cast(malloc(sizeof(float))); *scval = 0.2; Weights scale{ DataType::kFLOAT, scval, 1 }; float *shval = reinterpret_cast(malloc(sizeof(float))); *shval = 0.0; Weights shift{ DataType::kFLOAT, shval, 1 }; float *pval = reinterpret_cast(malloc(sizeof(float))); *pval = 1.0; Weights power{ DataType::kFLOAT, pval, 1 }; IScaleLayer* scaled = network->addScale(*out, ScaleMode::kUNIFORM, shift, scale, power); IElementWiseLayer* ew1 = network->addElementWise(*scaled->getOutput(0), *x, ElementWiseOperation::kSUM); return ew1->getOutput(0); } #endif ================================================ FILE: real-esrgan/x4plus/cuda_utils.h ================================================ #ifndef TRTX_CUDA_UTILS_H_ #define TRTX_CUDA_UTILS_H_ #include #include #include #include #include #ifndef CUDA_CHECK #define CUDA_CHECK(callstr)\ {\ cudaError_t error_code = callstr;\ if (error_code != cudaSuccess) {\ std::cerr << "CUDA error " << error_code << " at " << __FILE__ << ":" << __LINE__;\ assert(0);\ }\ } #endif // CUDA_CHECK #endif // TRTX_CUDA_UTILS_H_ ================================================ FILE: real-esrgan/x4plus/gen_wts.py ================================================ import argparse import os import struct from basicsr.archs.rrdbnet_arch import RRDBNet 
from realesrgan import RealESRGANer
from realesrgan.archs.srvgg_arch import SRVGGNetCompact


def _build_model(model_name):
    """Return ``(model, netscale)`` for a supported Real-ESRGAN model name.

    Raises:
        ValueError: if ``model_name`` is not one of the supported names.
    """
    if model_name in ('RealESRGAN_x4plus', 'RealESRNet_x4plus'):
        # x4 RRDBNet model
        model = RRDBNet(num_in_ch=3, num_out_ch=3, num_feat=64, num_block=23, num_grow_ch=32, scale=4)
        return model, 4
    if model_name == 'RealESRGAN_x4plus_anime_6B':
        # x4 RRDBNet model with 6 blocks
        model = RRDBNet(num_in_ch=3, num_out_ch=3, num_feat=64, num_block=6, num_grow_ch=32, scale=4)
        return model, 4
    if model_name == 'RealESRGAN_x2plus':
        # x2 RRDBNet model
        model = RRDBNet(num_in_ch=3, num_out_ch=3, num_feat=64, num_block=23, num_grow_ch=32, scale=2)
        return model, 2
    if model_name == 'realesr-animevideov3':
        # x4 VGG-style model (XS size)
        model = SRVGGNetCompact(num_in_ch=3, num_out_ch=3, num_feat=64, num_conv=16, upscale=4, act_type='prelu')
        return model, 4
    # Previously an unknown name fell through and crashed later with an
    # unrelated UnboundLocalError; fail early with a clear message instead.
    raise ValueError(f'Unsupported model name: {model_name}')


def _find_model_path(model_name):
    """Return the path of ``<model_name>.pth`` from the known weight folders.

    Raises:
        ValueError: if the checkpoint is in neither folder.
    """
    for folder in ('experiments/pretrained_models', 'realesrgan/weights'):
        model_path = os.path.join(folder, model_name + '.pth')
        if os.path.isfile(model_path):
            return model_path
    raise ValueError(f'Model {model_name} does not exist.')


def _write_wts(state_dict, wts_path):
    """Write ``state_dict`` to ``wts_path`` as a text ``.wts`` file.

    The first line holds the number of entries. Every following line is
    ``<name> <count> <hex> <hex> ...`` where each value is packed as a
    big-endian float32 and rendered as hexadecimal text.
    """
    # The context manager guarantees the file is flushed and closed even if
    # a tensor conversion raises part-way through.
    with open(wts_path, 'w') as f:
        f.write("{}\n".format(len(state_dict)))
        for k, v in state_dict.items():
            print('key: ', k)
            print('value: ', v.shape)
            vr = v.reshape(-1).cpu().numpy()
            f.write("{} {}".format(k, len(vr)))
            f.write("".join(" " + struct.pack(">f", float(vv)).hex() for vv in vr))
            f.write("\n")


def main():
    """Export the weights of a Real-ESRGAN model to ``real-esrgan.wts``.

    Parses the command line, builds the network selected by ``--model_name``,
    locates its ``.pth`` checkpoint, wraps both in ``RealESRGANer`` and dumps
    the resulting ``state_dict`` as text. Nothing is written when
    ``real-esrgan.wts`` already exists in the current directory.

    Raises:
        ValueError: if the model name is unsupported or its checkpoint file
            cannot be found.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('-i', '--input', type=str, default='inputs', help='Input image or folder')
    parser.add_argument(
        '-n',
        '--model_name',
        type=str,
        default='RealESRGAN_x4plus',
        help=('Model names: RealESRGAN_x4plus | RealESRNet_x4plus | RealESRGAN_x4plus_anime_6B | RealESRGAN_x2plus | '
              'realesr-animevideov3'))
    parser.add_argument('-o', '--output', type=str, default='results', help='Output folder')
    parser.add_argument('-s', '--outscale', type=float, default=4, help='The final upsampling scale of the image')
    parser.add_argument('--suffix', type=str, default='out', help='Suffix of the restored image')
    parser.add_argument('-t', '--tile', type=int, default=0, help='Tile size, 0 for no tile during testing')
    parser.add_argument('--tile_pad', type=int, default=10, help='Tile padding')
    parser.add_argument('--pre_pad', type=int, default=0, help='Pre padding size at each border')
    parser.add_argument('--face_enhance', action='store_true', help='Use GFPGAN to enhance face')
    parser.add_argument(
        '--fp32', action='store_true', help='Use fp32 precision during inference. Default: fp16 (half precision).')
    parser.add_argument(
        '--alpha_upsampler',
        type=str,
        default='realesrgan',
        help='The upsampler for the alpha channels. Options: realesrgan | bicubic')
    parser.add_argument(
        '--ext',
        type=str,
        default='auto',
        help='Image extension. Options: auto | jpg | png, auto means using the same extension as inputs')
    args = parser.parse_args()

    # Keep only the part before the first dot, so "RealESRGAN_x4plus.pth"
    # is accepted as well as the bare model name.
    args.model_name = args.model_name.split('.')[0]
    model, netscale = _build_model(args.model_name)
    model_path = _find_model_path(args.model_name)

    # NOTE(review): ``half`` receives ``args.fp32`` directly, which is the
    # opposite of what the --fp32 help text describes. It is left unchanged
    # because the default (half=False) is what the export has always used;
    # confirm the intended meaning before touching it.
    upsampler = RealESRGANer(
        scale=netscale,
        model_path=model_path,
        model=model,
        tile=args.tile,
        tile_pad=args.tile_pad,
        pre_pad=args.pre_pad,
        half=args.fp32)

    if os.path.isfile('real-esrgan.wts'):
        print('Already, real-esrgan.wts file exists.')
    else:
        print('making real-esrgan.wts file ...')
        _write_wts(upsampler.model.state_dict(), 'real-esrgan.wts')
        print('Completed real-esrgan.wts file!')


if __name__ == '__main__':
    main()
================================================ FILE: real-esrgan/x4plus/logging.h ================================================ /* * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ #ifndef TENSORRT_LOGGING_H #define TENSORRT_LOGGING_H #include "NvInferRuntimeCommon.h" #include #include #include #include #include #include #include #include "macros.h" using Severity = nvinfer1::ILogger::Severity; class LogStreamConsumerBuffer : public std::stringbuf { public: LogStreamConsumerBuffer(std::ostream& stream, const std::string& prefix, bool shouldLog) : mOutput(stream) , mPrefix(prefix) , mShouldLog(shouldLog) { } LogStreamConsumerBuffer(LogStreamConsumerBuffer&& other) : mOutput(other.mOutput) { } ~LogStreamConsumerBuffer() { // std::streambuf::pbase() gives a pointer to the beginning of the buffered part of the output sequence // std::streambuf::pptr() gives a pointer to the current position of the output sequence // if the pointer to the beginning is not equal to the pointer to the current position, // call putOutput() to log the output to the stream if (pbase() != pptr()) { putOutput(); } } // synchronizes the stream buffer and returns 0 on success // synchronizing the stream buffer consists of inserting the buffer contents into the stream, // resetting the buffer and flushing the stream virtual int sync() { putOutput(); return 0; } void putOutput() { if (mShouldLog) { // prepend timestamp std::time_t timestamp = 
std::time(nullptr); tm* tm_local = std::localtime(×tamp); std::cout << "["; std::cout << std::setw(2) << std::setfill('0') << 1 + tm_local->tm_mon << "/"; std::cout << std::setw(2) << std::setfill('0') << tm_local->tm_mday << "/"; std::cout << std::setw(4) << std::setfill('0') << 1900 + tm_local->tm_year << "-"; std::cout << std::setw(2) << std::setfill('0') << tm_local->tm_hour << ":"; std::cout << std::setw(2) << std::setfill('0') << tm_local->tm_min << ":"; std::cout << std::setw(2) << std::setfill('0') << tm_local->tm_sec << "] "; // std::stringbuf::str() gets the string contents of the buffer // insert the buffer contents pre-appended by the appropriate prefix into the stream mOutput << mPrefix << str(); // set the buffer to empty str(""); // flush the stream mOutput.flush(); } } void setShouldLog(bool shouldLog) { mShouldLog = shouldLog; } private: std::ostream& mOutput; std::string mPrefix; bool mShouldLog; }; //! //! \class LogStreamConsumerBase //! \brief Convenience object used to initialize LogStreamConsumerBuffer before std::ostream in LogStreamConsumer //! class LogStreamConsumerBase { public: LogStreamConsumerBase(std::ostream& stream, const std::string& prefix, bool shouldLog) : mBuffer(stream, prefix, shouldLog) { } protected: LogStreamConsumerBuffer mBuffer; }; //! //! \class LogStreamConsumer //! \brief Convenience object used to facilitate use of C++ stream syntax when logging messages. //! Order of base classes is LogStreamConsumerBase and then std::ostream. //! This is because the LogStreamConsumerBase class is used to initialize the LogStreamConsumerBuffer member field //! in LogStreamConsumer and then the address of the buffer is passed to std::ostream. //! This is necessary to prevent the address of an uninitialized buffer from being passed to std::ostream. //! Please do not change the order of the parent classes. //! class LogStreamConsumer : protected LogStreamConsumerBase, public std::ostream { public: //! 
\brief Creates a LogStreamConsumer which logs messages with level severity. //! Reportable severity determines if the messages are severe enough to be logged. LogStreamConsumer(Severity reportableSeverity, Severity severity) : LogStreamConsumerBase(severityOstream(severity), severityPrefix(severity), severity <= reportableSeverity) , std::ostream(&mBuffer) // links the stream buffer with the stream , mShouldLog(severity <= reportableSeverity) , mSeverity(severity) { } LogStreamConsumer(LogStreamConsumer&& other) : LogStreamConsumerBase(severityOstream(other.mSeverity), severityPrefix(other.mSeverity), other.mShouldLog) , std::ostream(&mBuffer) // links the stream buffer with the stream , mShouldLog(other.mShouldLog) , mSeverity(other.mSeverity) { } void setReportableSeverity(Severity reportableSeverity) { mShouldLog = mSeverity <= reportableSeverity; mBuffer.setShouldLog(mShouldLog); } private: static std::ostream& severityOstream(Severity severity) { return severity >= Severity::kINFO ? std::cout : std::cerr; } static std::string severityPrefix(Severity severity) { switch (severity) { case Severity::kINTERNAL_ERROR: return "[F] "; case Severity::kERROR: return "[E] "; case Severity::kWARNING: return "[W] "; case Severity::kINFO: return "[I] "; case Severity::kVERBOSE: return "[V] "; default: assert(0); return ""; } } bool mShouldLog; Severity mSeverity; }; //! \class Logger //! //! \brief Class which manages logging of TensorRT tools and samples //! //! \details This class provides a common interface for TensorRT tools and samples to log information to the console, //! and supports logging two types of messages: //! //! - Debugging messages with an associated severity (info, warning, error, or internal error/fatal) //! - Test pass/fail messages //! //! The advantage of having all samples use this class for logging as opposed to emitting directly to stdout/stderr is //! 
that the logic for controlling the verbosity and formatting of sample output is centralized in one location. //! //! In the future, this class could be extended to support dumping test results to a file in some standard format //! (for example, JUnit XML), and providing additional metadata (e.g. timing the duration of a test run). //! //! TODO: For backwards compatibility with existing samples, this class inherits directly from the nvinfer1::ILogger //! interface, which is problematic since there isn't a clean separation between messages coming from the TensorRT //! library and messages coming from the sample. //! //! In the future (once all samples are updated to use Logger::getTRTLogger() to access the ILogger) we can refactor the //! class to eliminate the inheritance and instead make the nvinfer1::ILogger implementation a member of the Logger //! object. class Logger : public nvinfer1::ILogger { public: Logger(Severity severity = Severity::kWARNING) : mReportableSeverity(severity) { } //! //! \enum TestResult //! \brief Represents the state of a given test //! enum class TestResult { kRUNNING, //!< The test is running kPASSED, //!< The test passed kFAILED, //!< The test failed kWAIVED //!< The test was waived }; //! //! \brief Forward-compatible method for retrieving the nvinfer::ILogger associated with this Logger //! \return The nvinfer1::ILogger associated with this Logger //! //! TODO Once all samples are updated to use this method to register the logger with TensorRT, //! we can eliminate the inheritance of Logger from ILogger //! nvinfer1::ILogger& getTRTLogger() { return *this; } //! //! \brief Implementation of the nvinfer1::ILogger::log() virtual method //! //! Note samples should not be calling this function directly; it will eventually go away once we eliminate the //! inheritance from nvinfer1::ILogger //! 
void log(Severity severity, const char* msg) TRT_NOEXCEPT override { LogStreamConsumer(mReportableSeverity, severity) << "[TRT] " << std::string(msg) << std::endl; } //! //! \brief Method for controlling the verbosity of logging output //! //! \param severity The logger will only emit messages that have severity of this level or higher. //! void setReportableSeverity(Severity severity) { mReportableSeverity = severity; } //! //! \brief Opaque handle that holds logging information for a particular test //! //! This object is an opaque handle to information used by the Logger to print test results. //! The sample must call Logger::defineTest() in order to obtain a TestAtom that can be used //! with Logger::reportTest{Start,End}(). //! class TestAtom { public: TestAtom(TestAtom&&) = default; private: friend class Logger; TestAtom(bool started, const std::string& name, const std::string& cmdline) : mStarted(started) , mName(name) , mCmdline(cmdline) { } bool mStarted; std::string mName; std::string mCmdline; }; //! //! \brief Define a test for logging //! //! \param[in] name The name of the test. This should be a string starting with //! "TensorRT" and containing dot-separated strings containing //! the characters [A-Za-z0-9_]. //! For example, "TensorRT.sample_googlenet" //! \param[in] cmdline The command line used to reproduce the test // //! \return a TestAtom that can be used in Logger::reportTest{Start,End}(). //! static TestAtom defineTest(const std::string& name, const std::string& cmdline) { return TestAtom(false, name, cmdline); } //! //! \brief A convenience overloaded version of defineTest() that accepts an array of command-line arguments //! as input //! //! \param[in] name The name of the test //! \param[in] argc The number of command-line arguments //! \param[in] argv The array of command-line arguments (given as C strings) //! //! \return a TestAtom that can be used in Logger::reportTest{Start,End}(). 
static TestAtom defineTest(const std::string& name, int argc, char const* const* argv) { auto cmdline = genCmdlineString(argc, argv); return defineTest(name, cmdline); } //! //! \brief Report that a test has started. //! //! \pre reportTestStart() has not been called yet for the given testAtom //! //! \param[in] testAtom The handle to the test that has started //! static void reportTestStart(TestAtom& testAtom) { reportTestResult(testAtom, TestResult::kRUNNING); assert(!testAtom.mStarted); testAtom.mStarted = true; } //! //! \brief Report that a test has ended. //! //! \pre reportTestStart() has been called for the given testAtom //! //! \param[in] testAtom The handle to the test that has ended //! \param[in] result The result of the test. Should be one of TestResult::kPASSED, //! TestResult::kFAILED, TestResult::kWAIVED //! static void reportTestEnd(const TestAtom& testAtom, TestResult result) { assert(result != TestResult::kRUNNING); assert(testAtom.mStarted); reportTestResult(testAtom, result); } static int reportPass(const TestAtom& testAtom) { reportTestEnd(testAtom, TestResult::kPASSED); return EXIT_SUCCESS; } static int reportFail(const TestAtom& testAtom) { reportTestEnd(testAtom, TestResult::kFAILED); return EXIT_FAILURE; } static int reportWaive(const TestAtom& testAtom) { reportTestEnd(testAtom, TestResult::kWAIVED); return EXIT_SUCCESS; } static int reportTest(const TestAtom& testAtom, bool pass) { return pass ? reportPass(testAtom) : reportFail(testAtom); } Severity getReportableSeverity() const { return mReportableSeverity; } private: //! //! \brief returns an appropriate string for prefixing a log message with the given severity //! static const char* severityPrefix(Severity severity) { switch (severity) { case Severity::kINTERNAL_ERROR: return "[F] "; case Severity::kERROR: return "[E] "; case Severity::kWARNING: return "[W] "; case Severity::kINFO: return "[I] "; case Severity::kVERBOSE: return "[V] "; default: assert(0); return ""; } } //! //! 
\brief returns an appropriate string for prefixing a test result message with the given result //! static const char* testResultString(TestResult result) { switch (result) { case TestResult::kRUNNING: return "RUNNING"; case TestResult::kPASSED: return "PASSED"; case TestResult::kFAILED: return "FAILED"; case TestResult::kWAIVED: return "WAIVED"; default: assert(0); return ""; } } //! //! \brief returns an appropriate output stream (cout or cerr) to use with the given severity //! static std::ostream& severityOstream(Severity severity) { return severity >= Severity::kINFO ? std::cout : std::cerr; } //! //! \brief method that implements logging test results //! static void reportTestResult(const TestAtom& testAtom, TestResult result) { severityOstream(Severity::kINFO) << "&&&& " << testResultString(result) << " " << testAtom.mName << " # " << testAtom.mCmdline << std::endl; } //! //! \brief generate a command line string from the given (argc, argv) values //! static std::string genCmdlineString(int argc, char const* const* argv) { std::stringstream ss; for (int i = 0; i < argc; i++) { if (i > 0) ss << " "; ss << argv[i]; } return ss.str(); } Severity mReportableSeverity; }; namespace { //! //! \brief produces a LogStreamConsumer object that can be used to log messages of severity kVERBOSE //! //! Example usage: //! //! LOG_VERBOSE(logger) << "hello world" << std::endl; //! inline LogStreamConsumer LOG_VERBOSE(const Logger& logger) { return LogStreamConsumer(logger.getReportableSeverity(), Severity::kVERBOSE); } //! //! \brief produces a LogStreamConsumer object that can be used to log messages of severity kINFO //! //! Example usage: //! //! LOG_INFO(logger) << "hello world" << std::endl; //! inline LogStreamConsumer LOG_INFO(const Logger& logger) { return LogStreamConsumer(logger.getReportableSeverity(), Severity::kINFO); } //! //! \brief produces a LogStreamConsumer object that can be used to log messages of severity kWARNING //! //! Example usage: //! //! 
LOG_WARN(logger) << "hello world" << std::endl; //! inline LogStreamConsumer LOG_WARN(const Logger& logger) { return LogStreamConsumer(logger.getReportableSeverity(), Severity::kWARNING); } //! //! \brief produces a LogStreamConsumer object that can be used to log messages of severity kERROR //! //! Example usage: //! //! LOG_ERROR(logger) << "hello world" << std::endl; //! inline LogStreamConsumer LOG_ERROR(const Logger& logger) { return LogStreamConsumer(logger.getReportableSeverity(), Severity::kERROR); } //! //! \brief produces a LogStreamConsumer object that can be used to log messages of severity kINTERNAL_ERROR // ("fatal" severity) //! //! Example usage: //! //! LOG_FATAL(logger) << "hello world" << std::endl; //! inline LogStreamConsumer LOG_FATAL(const Logger& logger) { return LogStreamConsumer(logger.getReportableSeverity(), Severity::kINTERNAL_ERROR); } } // anonymous namespace #endif // TENSORRT_LOGGING_H ================================================ FILE: real-esrgan/x4plus/macros.h ================================================ #ifndef __MACROS_H #define __MACROS_H #ifdef API_EXPORTS #if defined(_MSC_VER) #define API __declspec(dllexport) #else #define API __attribute__((visibility("default"))) #endif #else #if defined(_MSC_VER) #define API __declspec(dllimport) #else #define API #endif #endif // API_EXPORTS #if NV_TENSORRT_MAJOR >= 8 #define TRT_NOEXCEPT noexcept #define TRT_CONST_ENQUEUE const #else #define TRT_NOEXCEPT #define TRT_CONST_ENQUEUE #endif #endif // __MACROS_H ================================================ FILE: real-esrgan/x4plus/postprocess.cu ================================================ #include "cuda_utils.h" using namespace std; // postprocess (NCHW->NHWC, RGB->BGR, *255, ROUND, uint8) __global__ void postprocess_kernel(uint8_t* output, float* input, const int batchSize, const int height, const int width, const int channel, const int thread_count) { int index = threadIdx.x + blockIdx.x * blockDim.x; if (index >= 
thread_count) return; const int c_idx = index % channel; int idx = index / channel; const int w_idx = idx % width; idx /= width; const int h_idx = idx % height; const int b_idx = idx / height; int g_idx = b_idx * height * width * channel + (2 - c_idx)* height * width + h_idx * width + w_idx; float tt = input[g_idx] * 255.f; if (tt > 255) tt = 255; output[index] = tt; } void postprocess(uint8_t* output, float*input, int batchSize, int height, int width, int channel, cudaStream_t stream) { int thread_count = batchSize * height * width * channel; int block = 512; int grid = (thread_count - 1) / block + 1; postprocess_kernel << > > (output, input, batchSize, height, width, channel, thread_count); } #include "postprocess.hpp" namespace nvinfer1 { int PostprocessPluginV2::enqueue(int batchSize, const void* const* inputs, void* const* outputs, void* workspace, cudaStream_t stream) noexcept { float* input = (float*)inputs[0]; uint8_t* output = (uint8_t*)outputs[0]; const int H = mPostprocess.H; const int W = mPostprocess.W; const int C = mPostprocess.C; postprocess(output, input, batchSize, H, W, C, stream); return 0; } } ================================================ FILE: real-esrgan/x4plus/postprocess.hpp ================================================ #pragma once #include #include #include "macros.h" #include struct Postprocess { int N; int C; int H; int W; }; namespace nvinfer1 { class PostprocessPluginV2 : public IPluginV2IOExt { public: PostprocessPluginV2(const Postprocess& arg) { mPostprocess = arg; } PostprocessPluginV2(const void* data, size_t length) { const char* d = static_cast(data); const char* const a = d; mPostprocess = read(d); assert(d == a + length); } PostprocessPluginV2() = delete; virtual ~PostprocessPluginV2() {} public: int getNbOutputs() const noexcept override { return 1; } Dims getOutputDimensions(int index, const Dims* inputs, int nbInputDims) noexcept override { return Dims3(mPostprocess.H, mPostprocess.W, mPostprocess.C); } int 
initialize() noexcept override { return 0; } void terminate() noexcept override { } size_t getWorkspaceSize(int maxBatchSize) const noexcept override { return 0; } int enqueue(int batchSize, const void* const* inputs, void* const* outputs, void* workspace, cudaStream_t stream) noexcept override; size_t getSerializationSize() const noexcept override { size_t serializationSize = 0; serializationSize += sizeof(mPostprocess); return serializationSize; } void serialize(void* buffer) const noexcept override { char* d = static_cast(buffer); const char* const a = d; write(d, mPostprocess); assert(d == a + getSerializationSize()); } void configurePlugin(const PluginTensorDesc* in, int nbInput, const PluginTensorDesc* out, int nbOutput) noexcept override { } //! The combination of kLINEAR + kINT8/kHALF/kFLOAT is supported. bool supportsFormatCombination(int pos, const PluginTensorDesc* inOut, int nbInputs, int nbOutputs) const noexcept override { assert(nbInputs == 1 && nbOutputs == 1 && pos < nbInputs + nbOutputs); bool condition = inOut[pos].format == TensorFormat::kLINEAR; condition &= inOut[pos].type != DataType::kINT32; condition &= inOut[pos].type == inOut[0].type; return condition; } DataType getOutputDataType(int index, const DataType* inputTypes, int nbInputs) const noexcept override { assert(inputTypes && nbInputs == 1); return DataType::kFLOAT; // } const char* getPluginType() const noexcept override { return "postprocess"; } const char* getPluginVersion() const noexcept override { return "1"; } void destroy() noexcept override { delete this; } IPluginV2Ext* clone() const noexcept override { PostprocessPluginV2* plugin = new PostprocessPluginV2(*this); return plugin; } void setPluginNamespace(const char* libNamespace) noexcept override { mNamespace = libNamespace; } const char* getPluginNamespace() const noexcept override { return mNamespace.data(); } bool isOutputBroadcastAcrossBatch(int outputIndex, const bool* inputIsBroadcasted, int nbInputs) const noexcept 
// Converts an interleaved 8-bit image batch (b, h, w, c) into a planar float
// batch (b, c, h, w), reversing the channel order (BGR -> RGB) and scaling
// [0, 255] to [0, 1]. One thread produces one output element.
// NOTE: the "2 - channel" mirror below assumes channel == 3; batchSize is unused.
__global__ void preprocess_kernel(float* output, uint8_t* input, const int batchSize, const int height,
                                  const int width, const int channel, const int thread_count) {
    const int tid = threadIdx.x + blockIdx.x * blockDim.x;
    if (tid < thread_count) {
        // Split the flat output index, fastest-varying dimension first: w, h, c, b.
        const int col = tid % width;
        const int row = (tid / width) % height;
        const int plane = (tid / width) / height;
        const int ch = plane % channel;
        const int img = plane / channel;

        // Matching source element in the interleaved layout, with the channel mirrored.
        const int src = img * height * width * channel + row * width * channel + col * channel + 2 - ch;
        output[tid] = input[src] / 255.f;
    }
}
override { return 0; } int enqueue(int batchSize, const void* const* inputs, void* const* outputs, void* workspace, cudaStream_t stream) noexcept override; size_t getSerializationSize() const noexcept override { size_t serializationSize = 0; serializationSize += sizeof(mPreprocess); return serializationSize; } void serialize(void* buffer) const noexcept override { char* d = static_cast(buffer); const char* const a = d; write(d, mPreprocess); assert(d == a + getSerializationSize()); } void configurePlugin(const PluginTensorDesc* in, int nbInput, const PluginTensorDesc* out, int nbOutput) noexcept override { } //! The combination of kLINEAR + kINT8/kHALF/kFLOAT is supported. bool supportsFormatCombination(int pos, const PluginTensorDesc* inOut, int nbInputs, int nbOutputs) const noexcept override { assert(nbInputs == 1 && nbOutputs == 1 && pos < nbInputs + nbOutputs); bool condition = inOut[pos].format == TensorFormat::kLINEAR; condition &= inOut[pos].type != DataType::kINT32; condition &= inOut[pos].type == inOut[0].type; return condition; } DataType getOutputDataType(int index, const DataType* inputTypes, int nbInputs) const noexcept override { assert(inputTypes && nbInputs == 1); return DataType::kFLOAT; // } const char* getPluginType() const noexcept override { return "preprocess"; } const char* getPluginVersion() const noexcept override { return "1"; } void destroy() noexcept override { delete this; } IPluginV2Ext* clone() const noexcept override { PreprocessPluginV2* plugin = new PreprocessPluginV2(*this); return plugin; } void setPluginNamespace(const char* libNamespace) noexcept override { mNamespace = libNamespace; } const char* getPluginNamespace() const noexcept override { return mNamespace.data(); } bool isOutputBroadcastAcrossBatch(int outputIndex, const bool* inputIsBroadcasted, int nbInputs) const noexcept override { return false; } bool canBroadcastInputAcrossBatch(int inputIndex) const noexcept override { return false; } private: template void 
//! Creator for PreprocessPluginV2. It identifies itself as plugin "preprocess",
//! version "1", which is the pair build_engine() asks the plugin registry for.
class PreprocessPluginV2Creator : public IPluginCreator {
public:
    // Name and version reported by this creator.
    const char* getPluginName() const noexcept override { return "preprocess"; }
    const char* getPluginVersion() const noexcept override { return "1"; }
    // No field metadata is published; callers receive nullptr.
    const PluginFieldCollection* getFieldNames() noexcept override { return nullptr; }
    // `fc` is NOT treated as a PluginFieldCollection: it is cast back to the
    // Preprocess struct and copied into the new plugin, so callers must pass a
    // pointer to a Preprocess object disguised as a PluginFieldCollection*.
    // The plugin is heap-allocated and returned as a raw pointer.
    IPluginV2* createPlugin(const char* name, const PluginFieldCollection* fc) noexcept override { PreprocessPluginV2* plugin = new PreprocessPluginV2(*(Preprocess*)fc); mPluginName = name; return plugin; }
    // Rebuilds a plugin from the bytes previously written by PreprocessPluginV2::serialize().
    IPluginV2* deserializePlugin(const char* name, const void* serialData, size_t serialLength) noexcept override { auto plugin = new PreprocessPluginV2(serialData, serialLength); mPluginName = name; return plugin; }
    void setPluginNamespace(const char* libNamespace) noexcept override { mNamespace = libNamespace; }
    const char* getPluginNamespace() const noexcept override { return mNamespace.c_str(); }
private:
    std::string mNamespace;   // namespace set through setPluginNamespace()
    std::string mPluginName;  // name passed to the most recent create/deserialize call
};
INPUT_C * INPUT_H * OUT_SCALE * INPUT_W * OUT_SCALE; const char* INPUT_BLOB_NAME = "data"; const char* OUTPUT_BLOB_NAME = "prob"; static Logger gLogger; // Creat the engine using only the API and not any parser. ICudaEngine* build_engine(unsigned int maxBatchSize, IBuilder* builder, IBuilderConfig* config, DataType dt, std::string& wts_name) { INetworkDefinition* network = builder->createNetworkV2(0U); // Create input tensor of shape {INPUT_H, INPUT_W, INPUT_C} with name INPUT_BLOB_NAME ITensor* data = network->addInput(INPUT_BLOB_NAME, dt, Dims3{ INPUT_H, INPUT_W, INPUT_C }); assert(data); std::map weightMap = loadWeights(wts_name); // Custom preprocess (NHWC->NCHW, BGR->RGB, [0, 255]->[0, 1](Normalize)) Preprocess preprocess{ maxBatchSize, INPUT_C, INPUT_H, INPUT_W }; IPluginCreator* preprocess_creator = getPluginRegistry()->getPluginCreator("preprocess", "1"); IPluginV2 *preprocess_plugin = preprocess_creator->createPlugin("preprocess_plugin", (PluginFieldCollection*)&preprocess); IPluginV2Layer* preprocess_layer = network->addPluginV2(&data, 1, *preprocess_plugin); preprocess_layer->setName("preprocess_layer"); ITensor* prep = preprocess_layer->getOutput(0); // conv_first IConvolutionLayer* conv_first = network->addConvolutionNd(*prep, 64, DimsHW{ 3, 3 }, weightMap["conv_first.weight"], weightMap["conv_first.bias"]); conv_first->setStrideNd(DimsHW{ 1, 1 }); conv_first->setPaddingNd(DimsHW{ 1, 1 }); conv_first->setName("conv_first"); ITensor* feat = conv_first->getOutput(0); // conv_body ITensor* body_feat = RRDB(network, weightMap, feat, "body.0"); for (int idx = 1; idx < 23; idx++) { body_feat = RRDB(network, weightMap, body_feat, "body." 
+ std::to_string(idx)); } IConvolutionLayer* conv_body = network->addConvolutionNd(*body_feat, 64, DimsHW{ 3, 3 }, weightMap["conv_body.weight"], weightMap["conv_body.bias"]); conv_body->setStrideNd(DimsHW{ 1, 1 }); conv_body->setPaddingNd(DimsHW{ 1, 1 }); IElementWiseLayer* ew1 = network->addElementWise(*feat, *conv_body->getOutput(0), ElementWiseOperation::kSUM); feat = ew1->getOutput(0); //upsample IResizeLayer* interpolate_nearest = network->addResize(*feat); float sclaes1[] = { 1, 2, 2 }; interpolate_nearest->setScales(sclaes1, 3); interpolate_nearest->setResizeMode(ResizeMode::kNEAREST); IConvolutionLayer* conv_up1 = network->addConvolutionNd(*interpolate_nearest->getOutput(0), 64, DimsHW{ 3, 3 }, weightMap["conv_up1.weight"], weightMap["conv_up1.bias"]); conv_up1->setStrideNd(DimsHW{ 1, 1 }); conv_up1->setPaddingNd(DimsHW{ 1, 1 }); IActivationLayer* leaky_relu_1 = network->addActivation(*conv_up1->getOutput(0), ActivationType::kLEAKY_RELU); leaky_relu_1->setAlpha(0.2); IResizeLayer* interpolate_nearest2 = network->addResize(*leaky_relu_1->getOutput(0)); float sclaes2[] = { 1, 2, 2 }; interpolate_nearest2->setScales(sclaes2, 3); interpolate_nearest2->setResizeMode(ResizeMode::kNEAREST); IConvolutionLayer* conv_up2 = network->addConvolutionNd(*interpolate_nearest2->getOutput(0), 64, DimsHW{ 3, 3 }, weightMap["conv_up2.weight"], weightMap["conv_up2.bias"]); conv_up2->setStrideNd(DimsHW{ 1, 1 }); conv_up2->setPaddingNd(DimsHW{ 1, 1 }); IActivationLayer* leaky_relu_2 = network->addActivation(*conv_up2->getOutput(0), ActivationType::kLEAKY_RELU); leaky_relu_2->setAlpha(0.2); IConvolutionLayer* conv_hr = network->addConvolutionNd(*leaky_relu_2->getOutput(0), 64, DimsHW{ 3, 3 }, weightMap["conv_hr.weight"], weightMap["conv_hr.bias"]); conv_hr->setStrideNd(DimsHW{ 1, 1 }); conv_hr->setPaddingNd(DimsHW{ 1, 1 }); IActivationLayer* leaky_relu_hr = network->addActivation(*conv_hr->getOutput(0), ActivationType::kLEAKY_RELU); leaky_relu_hr->setAlpha(0.2); 
IConvolutionLayer* conv_last = network->addConvolutionNd(*leaky_relu_hr->getOutput(0), 3, DimsHW{ 3, 3 }, weightMap["conv_last.weight"], weightMap["conv_last.bias"]); conv_last->setStrideNd(DimsHW{ 1, 1 }); conv_last->setPaddingNd(DimsHW{ 1, 1 }); ITensor* out = conv_last->getOutput(0); // Custom postprocess (RGB -> BGR, NCHW->NHWC, *255, ROUND, uint8) Postprocess postprocess{ maxBatchSize, out->getDimensions().d[0], out->getDimensions().d[1], out->getDimensions().d[2] }; IPluginCreator* postprocess_creator = getPluginRegistry()->getPluginCreator("postprocess", "1"); IPluginV2 *postprocess_plugin = postprocess_creator->createPlugin("postprocess_plugin", (PluginFieldCollection*)&postprocess); IPluginV2Layer* postprocess_layer = network->addPluginV2(&out, 1, *postprocess_plugin); postprocess_layer->setName("postprocess_layer"); ITensor* final_tensor = postprocess_layer->getOutput(0); final_tensor->setName(OUTPUT_BLOB_NAME); network->markOutput(*final_tensor); // Build engine builder->setMaxBatchSize(maxBatchSize); config->setMaxWorkspaceSize(16 * (1 << 20)); // 16MB if (PRECISION_MODE == 16) { std::cout << "==== precision f16 ====" << std::endl << std::endl; config->setFlag(BuilderFlag::kFP16); } else { std::cout << "==== precision f32 ====" << std::endl << std::endl; } std::cout << "Building engine, please wait for a while..." << std::endl; ICudaEngine* engine = builder->buildEngineWithConfig(*network, *config); std::cout << "Build engine successfully!" 
<< std::endl; // Don't need the network any more delete network; // Release host memory for (auto& mem : weightMap) { free((void*)(mem.second.values)); } return engine; } void APIToModel(unsigned int maxBatchSize, IHostMemory** modelStream, std::string& wts_name) { // Create builder IBuilder* builder = createInferBuilder(gLogger); IBuilderConfig* config = builder->createBuilderConfig(); // Create model to populate the network, then set the outputs and create an engine ICudaEngine *engine = build_engine(maxBatchSize, builder, config, DataType::kFLOAT, wts_name); assert(engine != nullptr); // Serialize the engine (*modelStream) = engine->serialize(); // Close everything down delete engine; delete builder; delete config; } void doInference(IExecutionContext& context, cudaStream_t& stream, void **buffers, uint8_t* output, int batchSize) { // infer on the batch asynchronously, and DMA output back to host context.enqueue(batchSize, buffers, stream, nullptr); CUDA_CHECK(cudaMemcpyAsync(output, buffers[1], batchSize * OUTPUT_SIZE * sizeof(uint8_t), cudaMemcpyDeviceToHost, stream)); cudaStreamSynchronize(stream); } bool parse_args(int argc, char** argv, std::string& wts, std::string& engine, std::string& img_dir) { if (argc < 4) return false; if (std::string(argv[1]) == "-s" && argc == 4) { wts = std::string(argv[2]); engine = std::string(argv[3]); } else if (std::string(argv[1]) == "-d" && argc == 4) { engine = std::string(argv[2]); img_dir = std::string(argv[3]); } else { return false; } return true; } // ./real-esrgan -s ./real-esrgan.wts ./real-esrgan_f32.engine // ./real-esrgan -d ./real-esrgan_f32.engine ../samples int main(int argc, char** argv) { std::string wts_name = ""; std::string engine_name = ""; std::string img_dir; if (!parse_args(argc, argv, wts_name, engine_name, img_dir)) { std::cerr << "arguments not right!" 
<< std::endl; std::cerr << "./real-esrgan -s [.wts] [.engine] // serialize model to plan file" << std::endl; std::cerr << "./real-esrgan -d [.engine] ../samples // deserialize plan file and run inference" << std::endl; return -1; } // create a model using the API directly and serialize it to a stream if (!wts_name.empty()) { IHostMemory* modelStream{ nullptr }; APIToModel(BATCH_SIZE, &modelStream, wts_name); assert(modelStream != nullptr); std::ofstream p(engine_name, std::ios::binary); if (!p) { std::cerr << "could not open plan output file" << std::endl; return -1; } p.write(reinterpret_cast(modelStream->data()), modelStream->size()); delete modelStream; return 0; } // deserialize the .engine and run inference std::ifstream file(engine_name, std::ios::binary); if (!file.good()) { std::cerr << "read " << engine_name << " error!" << std::endl; return -1; } char *trtModelStream = nullptr; size_t size = 0; file.seekg(0, file.end); size = file.tellg(); file.seekg(0, file.beg); trtModelStream = new char[size]; assert(trtModelStream); file.read(trtModelStream, size); file.close(); std::vector file_names; if (read_files_in_dir(img_dir.c_str(), file_names) < 0) { std::cerr << "read_files_in_dir failed." << std::endl; return -1; } IRuntime* runtime = createInferRuntime(gLogger); assert(runtime != nullptr); ICudaEngine* engine = runtime->deserializeCudaEngine(trtModelStream, size); assert(engine != nullptr); IExecutionContext* context = engine->createExecutionContext(); assert(context != nullptr); delete[] trtModelStream; assert(engine->getNbBindings() == 2); void* buffers[2]; // In order to bind the buffers, we need to know the names of the input and output tensors. 
// Note that indices are guaranteed to be less than IEngine::getNbBindings() const int inputIndex = engine->getBindingIndex(INPUT_BLOB_NAME); const int outputIndex = engine->getBindingIndex(OUTPUT_BLOB_NAME); assert(inputIndex == 0); assert(outputIndex == 1); // Create GPU buffers on device CUDA_CHECK(cudaMalloc(&buffers[inputIndex], BATCH_SIZE * INPUT_C * INPUT_H * INPUT_W * sizeof(uint8_t))); CUDA_CHECK(cudaMalloc(&buffers[outputIndex], BATCH_SIZE * OUTPUT_SIZE * sizeof(uint8_t))); std::vector input(BATCH_SIZE * INPUT_H * INPUT_W * INPUT_C); std::vector outputs(BATCH_SIZE * OUTPUT_SIZE); // Create stream cudaStream_t stream; CUDA_CHECK(cudaStreamCreate(&stream)); std::vector imgs_buffer(BATCH_SIZE); for (int f = 0; f < (int)file_names.size(); f++) { for (int b = 0; b < BATCH_SIZE; b++) { cv::Mat img = cv::imread(img_dir + "/" + file_names[f]); if (img.empty()) continue; memcpy(input.data() + b * INPUT_H * INPUT_W * INPUT_C, img.data, INPUT_H * INPUT_W * INPUT_C); } CUDA_CHECK(cudaMemcpyAsync(buffers[inputIndex], input.data(), BATCH_SIZE * INPUT_C * INPUT_H * INPUT_W * sizeof(uint8_t), cudaMemcpyHostToDevice, stream)); // Run inference auto start = std::chrono::system_clock::now(); doInference(*context, stream, (void**)buffers, outputs.data(), BATCH_SIZE); auto end = std::chrono::system_clock::now(); std::cout << "inference time: " << std::chrono::duration_cast(end - start).count() << "ms" << std::endl; } cv::Mat frame = cv::Mat(INPUT_H * OUT_SCALE, INPUT_W * OUT_SCALE, CV_8UC3, outputs.data()); cv::imwrite("../_" + file_names[0] + ".png", frame); if (VISUALIZATION) { cv::imshow("result : " + file_names[0], frame); cv::waitKey(0); } // Release stream and buffers cudaStreamDestroy(stream); CUDA_CHECK(cudaFree(buffers[inputIndex])); CUDA_CHECK(cudaFree(buffers[outputIndex])); // Destroy the engine delete context; delete engine; delete runtime; } ================================================ FILE: real-esrgan/x4plus/utils.h 
// Appends the name of every entry in directory `p_dir_name` (except "." and
// "..") to `file_names`. Only the bare entry names are stored, not full paths,
// and entries are appended in the order readdir() reports them.
// Returns 0 on success, -1 if the name is null or the directory cannot be opened.
static inline int read_files_in_dir(const char* p_dir_name, std::vector<std::string>& file_names) {
    if (p_dir_name == nullptr) {
        return -1;  // opendir() must not be handed a null pointer
    }
    DIR* p_dir = opendir(p_dir_name);
    if (p_dir == nullptr) {
        return -1;
    }

    struct dirent* p_file = nullptr;
    while ((p_file = readdir(p_dir)) != nullptr) {
        if (strcmp(p_file->d_name, ".") != 0 && strcmp(p_file->d_name, "..") != 0) {
            file_names.push_back(std::string(p_file->d_name));
        }
    }

    closedir(p_dir);
    return 0;
}
## How to run

```
1. Generate the .wts file from the PyTorch model
   python gen_wts_refinedet.py
   // a file 'refinedet0312.wts' will be generated (the name is set by path_save_wts in the script)

2. Build tensorrtx/refinedet and run it (opening the project in CLion is recommended)
   The configuration lives in configure.h: set your own paths and choose the mode (SERIALIZE or INFER).
   See the comments in configure.h for details.
   mkdir build
   cd build
   cmake ..
   make
```
input_blob_name_(input_blob_name) , read_cache_(read_cache) { input_count_ = 3 * input_w * input_h * batchsize; CUDA_CHECK(cudaMalloc(&device_input_, input_count_ * sizeof(float))); read_files_in_dir(img_dir, img_files_); } Int8EntropyCalibrator2::~Int8EntropyCalibrator2() { CUDA_CHECK(cudaFree(device_input_)); } int Int8EntropyCalibrator2::getBatchSize() const { return batchsize_; } bool Int8EntropyCalibrator2::getBatch(void* bindings[], const char* names[], int nbBindings) { if (img_idx_ + batchsize_ > (int)img_files_.size()) { return false; } std::vector input_imgs_; for (int i = img_idx_; i < img_idx_ + batchsize_; i++) { std::cout << img_files_[i] << " " << i << std::endl; cv::Mat temp = cv::imread(img_dir_ + img_files_[i]); if (temp.empty()){ std::cerr << "Fatal error: image cannot open!" << std::endl; return false; } // cv::Mat pr_img = preprocess_img(temp, input_w_, input_h_); input_imgs_.push_back(temp); } img_idx_ += batchsize_; cv::Mat blob = cv::dnn::blobFromImages(input_imgs_, 1.0, cv::Size(input_w_, input_h_), cv::Scalar(123.0, 117.0, 104.0), true, false); // cv::Mat blob = cv::dnn::blobFromImages(input_imgs_, 1.0 / 255.0, cv::Size(input_w_, input_h_), cv::Scalar(0, 0, 0), true, false); CUDA_CHECK(cudaMemcpy(device_input_, blob.ptr(0), input_count_ * sizeof(float), cudaMemcpyHostToDevice)); assert(!strcmp(names[0], input_blob_name_)); bindings[0] = device_input_; return true; } const void* Int8EntropyCalibrator2::readCalibrationCache(size_t& length) { std::cout << "reading calib cache: " << calib_table_name_ << std::endl; calib_cache_.clear(); std::ifstream input(calib_table_name_, std::ios::binary); input >> std::noskipws; if (read_cache_ && input.good()) { std::copy(std::istream_iterator(input), std::istream_iterator(), std::back_inserter(calib_cache_)); } length = calib_cache_.size(); return length ? 
calib_cache_.data() : nullptr; } void Int8EntropyCalibrator2::writeCalibrationCache(const void* cache, size_t length) { std::cout << "writing calib cache: " << calib_table_name_ << " size: " << length << std::endl; std::ofstream output(calib_table_name_, std::ios::binary); output.write(reinterpret_cast(cache), length); } ================================================ FILE: refinedet/calibrator.h ================================================ #ifndef ENTROPY_CALIBRATOR_H #define ENTROPY_CALIBRATOR_H #include "NvInfer.h" #include #include //! \class Int8EntropyCalibrator2 //! //! \brief Implements Entropy calibrator 2. //! CalibrationAlgoType is kENTROPY_CALIBRATION_2. //! class Int8EntropyCalibrator2 : public nvinfer1::IInt8EntropyCalibrator2 { public: Int8EntropyCalibrator2(int batchsize, int input_w, int input_h, const char* img_dir, const char* calib_table_name, const char* input_blob_name, bool read_cache = true); virtual ~Int8EntropyCalibrator2(); int getBatchSize() const override; bool getBatch(void* bindings[], const char* names[], int nbBindings) override; const void* readCalibrationCache(size_t& length) override; void writeCalibrationCache(const void* cache, size_t length) override; private: int batchsize_; int input_w_; int input_h_; int img_idx_; std::string img_dir_; std::vector img_files_; size_t input_count_; std::string calib_table_name_; const char* input_blob_name_; bool read_cache_; void* device_input_; std::vector calib_cache_; }; #endif // ENTROPY_CALIBRATOR_H ================================================ FILE: refinedet/configure.h ================================================ #define USE_FP32 // set USE_INT8 or USE_FP16 or USE_FP32 const int num_class = 25; //num_class + 1 //Including background class //SERIALIZE: It indicates that to generate engin by serialization, the following path needs to be set,path_wts_ and path_save_engine //INFER: It shows that it is a reasoning mode,the following path needs to be set,path_engine #define 
import torch
import torch.nn as nn
import struct
from models.refinedet import build_refinedet

# Export the weights of a trained RefineDet model to the plain-text .wts format:
# first line = number of tensors, then one line per tensor holding
# "<name> <element count>" followed by every value as big-endian float32 hex.

num_classes = 25
path_model = "/data_2/project_2021/pytorch_refinedet/2021/20210308.pth"
path_save_wts = "./refinedet0312.wts"
input_size = 320

net = build_refinedet('test', input_size, num_classes)  # initialize net
# map_location lets a checkpoint saved on a GPU be loaded on a CPU-only machine.
net.load_state_dict(torch.load(path_model, map_location='cpu'))
net.eval()

state_dict = net.state_dict()
# The context manager guarantees the file is flushed and closed, even on error.
with open(path_save_wts, 'w') as f:
    f.write('{}\n'.format(len(state_dict)))
    for k, v in state_dict.items():
        vr = v.reshape(-1).cpu().numpy()
        f.write('{} {} '.format(k, len(vr)))
        f.write(''.join(' ' + struct.pack('>f', float(vv)).hex() for vv in vr))
        f.write('\n')

print("success generate wts!")
//! String buffer that collects one log message and, when the owning stream is
//! flushed (or the buffer is destroyed with pending text), forwards it to the
//! wrapped output stream as "[MM/DD/YYYY-HH:MM:SS] <prefix><message>".
//! Nothing is forwarded while logging is disabled.
class LogStreamConsumerBuffer : public std::stringbuf {
public:
    //! \param stream    destination of the formatted messages (must outlive this buffer)
    //! \param prefix    severity tag written in front of every message
    //! \param shouldLog whether messages are forwarded at all
    LogStreamConsumerBuffer(std::ostream& stream, const std::string& prefix, bool shouldLog)
        : mOutput(stream), mPrefix(prefix), mShouldLog(shouldLog) {}

    //! Takes over the destination, prefix and enable flag of `other`.
    //! (All three must be copied: leaving mShouldLog unset makes it indeterminate.)
    LogStreamConsumerBuffer(LogStreamConsumerBuffer&& other)
        : mOutput(other.mOutput), mPrefix(other.mPrefix), mShouldLog(other.mShouldLog) {}

    ~LogStreamConsumerBuffer() {
        // pbase() is the start of the buffered output, pptr() the current write
        // position; they differ only when unflushed text is still pending.
        if (pbase() != pptr()) {
            putOutput();
        }
    }

    //! Called when the owning stream is flushed: emits the buffered message.
    //! Always returns 0 (success).
    virtual int sync() {
        putOutput();
        return 0;
    }

    //! Writes timestamp, prefix and buffered text to the destination stream,
    //! then empties the buffer.
    void putOutput() {
        if (mShouldLog) {
            // Format the timestamp into a local array so that it lands on the
            // same stream as the message and the stream's fill/width state is
            // left untouched.
            char stamp[32] = "";
            std::time_t timestamp = std::time(nullptr);
            if (const std::tm* tm_local = std::localtime(&timestamp)) {
                std::strftime(stamp, sizeof(stamp), "[%m/%d/%Y-%H:%M:%S] ", tm_local);
            }
            mOutput << stamp << mPrefix << str();
            mOutput.flush();
        }
        // Drop the text in either case so a disabled logger does not accumulate it.
        str("");
    }

    void setShouldLog(bool shouldLog) { mShouldLog = shouldLog; }

private:
    std::ostream& mOutput;  // destination stream
    std::string mPrefix;    // severity tag, e.g. "[W] "
    bool mShouldLog;        // false: messages are discarded
};
Reportable severity determines if the messages are severe enough to be logged. LogStreamConsumer(Severity reportableSeverity, Severity severity) : LogStreamConsumerBase(severityOstream(severity), severityPrefix(severity), severity <= reportableSeverity) , std::ostream(&mBuffer) // links the stream buffer with the stream , mShouldLog(severity <= reportableSeverity) , mSeverity(severity) { } LogStreamConsumer(LogStreamConsumer&& other) : LogStreamConsumerBase(severityOstream(other.mSeverity), severityPrefix(other.mSeverity), other.mShouldLog) , std::ostream(&mBuffer) // links the stream buffer with the stream , mShouldLog(other.mShouldLog) , mSeverity(other.mSeverity) { } void setReportableSeverity(Severity reportableSeverity) { mShouldLog = mSeverity <= reportableSeverity; mBuffer.setShouldLog(mShouldLog); } private: static std::ostream& severityOstream(Severity severity) { return severity >= Severity::kINFO ? std::cout : std::cerr; } static std::string severityPrefix(Severity severity) { switch (severity) { case Severity::kINTERNAL_ERROR: return "[F] "; case Severity::kERROR: return "[E] "; case Severity::kWARNING: return "[W] "; case Severity::kINFO: return "[I] "; case Severity::kVERBOSE: return "[V] "; default: assert(0); return ""; } } bool mShouldLog; Severity mSeverity; }; //! \class Logger //! //! \brief Class which manages logging of TensorRT tools and samples //! //! \details This class provides a common interface for TensorRT tools and samples to log information to the console, //! and supports logging two types of messages: //! //! - Debugging messages with an associated severity (info, warning, error, or internal error/fatal) //! - Test pass/fail messages //! //! The advantage of having all samples use this class for logging as opposed to emitting directly to stdout/stderr is //! that the logic for controlling the verbosity and formatting of sample output is centralized in one location. //! //! 
In the future, this class could be extended to support dumping test results to a file in some standard format //! (for example, JUnit XML), and providing additional metadata (e.g. timing the duration of a test run). //! //! TODO: For backwards compatibility with existing samples, this class inherits directly from the nvinfer1::ILogger //! interface, which is problematic since there isn't a clean separation between messages coming from the TensorRT //! library and messages coming from the sample. //! //! In the future (once all samples are updated to use Logger::getTRTLogger() to access the ILogger) we can refactor the //! class to eliminate the inheritance and instead make the nvinfer1::ILogger implementation a member of the Logger //! object. class Logger : public nvinfer1::ILogger { public: Logger(Severity severity = Severity::kWARNING) : mReportableSeverity(severity) { } //! //! \enum TestResult //! \brief Represents the state of a given test //! enum class TestResult { kRUNNING, //!< The test is running kPASSED, //!< The test passed kFAILED, //!< The test failed kWAIVED //!< The test was waived }; //! //! \brief Forward-compatible method for retrieving the nvinfer::ILogger associated with this Logger //! \return The nvinfer1::ILogger associated with this Logger //! //! TODO Once all samples are updated to use this method to register the logger with TensorRT, //! we can eliminate the inheritance of Logger from ILogger //! nvinfer1::ILogger& getTRTLogger() { return *this; } //! //! \brief Implementation of the nvinfer1::ILogger::log() virtual method //! //! Note samples should not be calling this function directly; it will eventually go away once we eliminate the //! inheritance from nvinfer1::ILogger //! void log(Severity severity, const char* msg) override { LogStreamConsumer(mReportableSeverity, severity) << "[TRT] " << std::string(msg) << std::endl; } //! //! \brief Method for controlling the verbosity of logging output //! //! 
\param severity The logger will only emit messages that have severity of this level or higher. //! void setReportableSeverity(Severity severity) { mReportableSeverity = severity; } //! //! \brief Opaque handle that holds logging information for a particular test //! //! This object is an opaque handle to information used by the Logger to print test results. //! The sample must call Logger::defineTest() in order to obtain a TestAtom that can be used //! with Logger::reportTest{Start,End}(). //! class TestAtom { public: TestAtom(TestAtom&&) = default; private: friend class Logger; TestAtom(bool started, const std::string& name, const std::string& cmdline) : mStarted(started) , mName(name) , mCmdline(cmdline) { } bool mStarted; std::string mName; std::string mCmdline; }; //! //! \brief Define a test for logging //! //! \param[in] name The name of the test. This should be a string starting with //! "TensorRT" and containing dot-separated strings containing //! the characters [A-Za-z0-9_]. //! For example, "TensorRT.sample_googlenet" //! \param[in] cmdline The command line used to reproduce the test // //! \return a TestAtom that can be used in Logger::reportTest{Start,End}(). //! static TestAtom defineTest(const std::string& name, const std::string& cmdline) { return TestAtom(false, name, cmdline); } //! //! \brief A convenience overloaded version of defineTest() that accepts an array of command-line arguments //! as input //! //! \param[in] name The name of the test //! \param[in] argc The number of command-line arguments //! \param[in] argv The array of command-line arguments (given as C strings) //! //! \return a TestAtom that can be used in Logger::reportTest{Start,End}(). static TestAtom defineTest(const std::string& name, int argc, char const* const* argv) { auto cmdline = genCmdlineString(argc, argv); return defineTest(name, cmdline); } //! //! \brief Report that a test has started. //! //! 
\pre reportTestStart() has not been called yet for the given testAtom //! //! \param[in] testAtom The handle to the test that has started //! static void reportTestStart(TestAtom& testAtom) { reportTestResult(testAtom, TestResult::kRUNNING); assert(!testAtom.mStarted); testAtom.mStarted = true; } //! //! \brief Report that a test has ended. //! //! \pre reportTestStart() has been called for the given testAtom //! //! \param[in] testAtom The handle to the test that has ended //! \param[in] result The result of the test. Should be one of TestResult::kPASSED, //! TestResult::kFAILED, TestResult::kWAIVED //! static void reportTestEnd(const TestAtom& testAtom, TestResult result) { assert(result != TestResult::kRUNNING); assert(testAtom.mStarted); reportTestResult(testAtom, result); } static int reportPass(const TestAtom& testAtom) { reportTestEnd(testAtom, TestResult::kPASSED); return EXIT_SUCCESS; } static int reportFail(const TestAtom& testAtom) { reportTestEnd(testAtom, TestResult::kFAILED); return EXIT_FAILURE; } static int reportWaive(const TestAtom& testAtom) { reportTestEnd(testAtom, TestResult::kWAIVED); return EXIT_SUCCESS; } static int reportTest(const TestAtom& testAtom, bool pass) { return pass ? reportPass(testAtom) : reportFail(testAtom); } Severity getReportableSeverity() const { return mReportableSeverity; } private: //! //! \brief returns an appropriate string for prefixing a log message with the given severity //! static const char* severityPrefix(Severity severity) { switch (severity) { case Severity::kINTERNAL_ERROR: return "[F] "; case Severity::kERROR: return "[E] "; case Severity::kWARNING: return "[W] "; case Severity::kINFO: return "[I] "; case Severity::kVERBOSE: return "[V] "; default: assert(0); return ""; } } //! //! \brief returns an appropriate string for prefixing a test result message with the given result //! 
static const char* testResultString(TestResult result) { switch (result) { case TestResult::kRUNNING: return "RUNNING"; case TestResult::kPASSED: return "PASSED"; case TestResult::kFAILED: return "FAILED"; case TestResult::kWAIVED: return "WAIVED"; default: assert(0); return ""; } } //! //! \brief returns an appropriate output stream (cout or cerr) to use with the given severity //! static std::ostream& severityOstream(Severity severity) { return severity >= Severity::kINFO ? std::cout : std::cerr; } //! //! \brief method that implements logging test results //! static void reportTestResult(const TestAtom& testAtom, TestResult result) { severityOstream(Severity::kINFO) << "&&&& " << testResultString(result) << " " << testAtom.mName << " # " << testAtom.mCmdline << std::endl; } //! //! \brief generate a command line string from the given (argc, argv) values //! static std::string genCmdlineString(int argc, char const* const* argv) { std::stringstream ss; for (int i = 0; i < argc; i++) { if (i > 0) ss << " "; ss << argv[i]; } return ss.str(); } Severity mReportableSeverity; }; namespace { //! //! \brief produces a LogStreamConsumer object that can be used to log messages of severity kVERBOSE //! //! Example usage: //! //! LOG_VERBOSE(logger) << "hello world" << std::endl; //! inline LogStreamConsumer LOG_VERBOSE(const Logger& logger) { return LogStreamConsumer(logger.getReportableSeverity(), Severity::kVERBOSE); } //! //! \brief produces a LogStreamConsumer object that can be used to log messages of severity kINFO //! //! Example usage: //! //! LOG_INFO(logger) << "hello world" << std::endl; //! inline LogStreamConsumer LOG_INFO(const Logger& logger) { return LogStreamConsumer(logger.getReportableSeverity(), Severity::kINFO); } //! //! \brief produces a LogStreamConsumer object that can be used to log messages of severity kWARNING //! //! Example usage: //! //! LOG_WARN(logger) << "hello world" << std::endl; //! 
inline LogStreamConsumer LOG_WARN(const Logger& logger) { return LogStreamConsumer(logger.getReportableSeverity(), Severity::kWARNING); } //! //! \brief produces a LogStreamConsumer object that can be used to log messages of severity kERROR //! //! Example usage: //! //! LOG_ERROR(logger) << "hello world" << std::endl; //! inline LogStreamConsumer LOG_ERROR(const Logger& logger) { return LogStreamConsumer(logger.getReportableSeverity(), Severity::kERROR); } //! //! \brief produces a LogStreamConsumer object that can be used to log messages of severity kINTERNAL_ERROR // ("fatal" severity) //! //! Example usage: //! //! LOG_FATAL(logger) << "hello world" << std::endl; //! inline LogStreamConsumer LOG_FATAL(const Logger& logger) { return LogStreamConsumer(logger.getReportableSeverity(), Severity::kINTERNAL_ERROR); } } // anonymous namespace #endif // TENSORRT_LOGGING_H ================================================ FILE: refinedet/refinedet.cpp ================================================ #include #include #include #include #include #include #include "NvInfer.h" #include "cuda_runtime_api.h" #include "utils.h" #include "logging.h" #include "calibrator.h" #include "configure.h" #include // One-stop header. 
#include "torch/torch.h"
#include "torch/jit.h"

using namespace nvinfer1;

static Logger gLogger;

// Adjusts rectangle `r` in place so that it can be used as an ROI of image `m`.
// Negative origins are moved to 0, an origin at or beyond the last column/row is reset to 0,
// non-positive sizes become 1, and the size is shrunk so that x + width <= cols - 1 and
// y + height <= rows - 1.
void RoiCorrect(const cv::Mat &m, cv::Rect &r)
{
    if (r.x < 0) r.x = 0;
    if (r.y < 0) r.y = 0;
    if (r.x >= m.cols - 1) r.x = 0;
    if (r.y >= m.rows - 1) r.y = 0;
    if (r.width <= 0) r.width = 1;
    if (r.height <= 0) r.height = 1;
    if (r.x + r.width > m.cols - 1) r.width = m.cols - 1 - r.x;
    if (r.y + r.height > m.rows - 1) r.height = m.rows - 1 - r.y;
}

// Reads a text weight file into a name -> Weights map.
//
// Layout consumed by this parser: a leading blob count, then for every blob its name, its
// element count in decimal and that many whitespace-separated hexadecimal words. Each word is
// stored as the raw 32-bit pattern of one kFLOAT element.
//
// The value buffers are malloc'ed here and owned by the returned map; the caller releases them
// with free().
std::map<std::string, Weights> loadWeights(const std::string file)
{
    std::cout << "Loading weights: " << file << std::endl;
    std::map<std::string, Weights> weightMap;

    // Open weights file
    std::ifstream input(file);
    assert(input.is_open() && "Unable to load weight file.");

    // Read number of weight blobs; start from 0 so a failed read trips the assert
    // instead of looping on an indeterminate value.
    int32_t count = 0;
    input >> count;
    assert(count > 0 && "Invalid weight map file.");

    while (count--)
    {
        Weights wt{DataType::kFLOAT, nullptr, 0};
        uint32_t size = 0;

        // Read name and element count of the blob
        std::string name;
        input >> name >> std::dec >> size;
        wt.type = DataType::kFLOAT;

        // One 32-bit word per element. The previous sizeof(val) measured the pointer,
        // which over-allocated every blob on 64-bit targets.
        uint32_t* val = reinterpret_cast<uint32_t*>(malloc(sizeof(uint32_t) * size));
        for (uint32_t x = 0, y = size; x < y; ++x)
        {
            input >> std::hex >> val[x];
        }
        wt.values = val;
        wt.count = size;
        weightMap[name] = wt;
    }

    return weightMap;
}

// Adds an inference-time BatchNorm2d as a per-channel IScaleLayer.
//
// Reads `<lname>.weight`, `.bias`, `.running_mean` and `.running_var` from weightMap and folds
// them into scale = gamma / sqrt(var + eps), shift = beta - mean * gamma / sqrt(var + eps) and
// power = 1. The three new buffers are stored back into weightMap under `<lname>.scale`,
// `.shift` and `.power`, so whoever frees the map frees them too.
IScaleLayer* addBatchNorm2d(INetworkDefinition *network, std::map<std::string, Weights>& weightMap, ITensor& input, std::string lname, float eps)
{
    float *gamma = (float*)weightMap[lname + ".weight"].values;
    float *beta = (float*)weightMap[lname + ".bias"].values;
    float *mean = (float*)weightMap[lname + ".running_mean"].values;
    float *var = (float*)weightMap[lname + ".running_var"].values;
    int len = weightMap[lname + ".running_var"].count;

    float *scval = reinterpret_cast<float*>(malloc(sizeof(float) * len));
    for (int i = 0; i < len; i++)
    {
        scval[i] = gamma[i] / sqrt(var[i] + eps);
    }
    Weights scale{DataType::kFLOAT, scval, len};

    float *shval = reinterpret_cast<float*>(malloc(sizeof(float) * len));
    for (int i = 0; i < len; i++)
    {
        shval[i] = beta[i] - mean[i] * gamma[i] / sqrt(var[i] + eps);
    }
    Weights shift{DataType::kFLOAT, shval, len};

    float *pval = reinterpret_cast<float*>(malloc(sizeof(float) * len));
    for (int i = 0; i < len; i++)
    {
        pval[i] = 1.0;
    }
    Weights power{DataType::kFLOAT, pval, len};

    weightMap[lname + ".scale"] = scale;
    weightMap[lname + ".shift"] = shift;
    weightMap[lname + ".power"] = power;
    IScaleLayer* scale_1 = network->addScale(input, ScaleMode::kCHANNEL, shift, scale, power);
    assert(scale_1);
    return scale_1;
}

// Convolution followed by ReLU, with weights looked up as `<pre_name><linx>.weight/.bias`.
//
//   outch / ksize / s / p : output channels, square kernel size, stride, padding
//   linx                  : numeric layer index used in the weight key
//   pre_name              : key prefix (defaults to "vgg.")
//   b_dilate              : when true the convolution uses a 3x3 dilation
//
// A missing weight key is only reported on stdout; the lookup that follows then inserts an
// empty entry into weightMap.
ILayer* convRelu(INetworkDefinition *network, std::map<std::string, Weights>& weightMap, ITensor& input, int outch, int ksize, int s, int p,
                 int linx, const std::string pre_name = "vgg.", bool b_dilate = false)
{
    const std::string weightKey = pre_name + std::to_string(linx) + ".weight";
    const std::string biasKey = pre_name + std::to_string(linx) + ".bias";
    if (weightMap.count(weightKey) == 0)
    {
        std::cout << "no key: " << weightKey << std::endl;
    }

    IConvolutionLayer* conv1 = network->addConvolutionNd(input, outch, DimsHW{ksize, ksize}, weightMap[weightKey], weightMap[biasKey]);
    assert(conv1);
    conv1->setStrideNd(DimsHW{s, s});
    conv1->setPaddingNd(DimsHW{p, p});
    if (b_dilate)
    {
        conv1->setDilation(DimsHW{3, 3});
    }

    auto lr = network->addActivation(*conv1->getOutput(0), ActivationType::kRELU);
    assert(lr);
    return lr;
}

// Convolution followed by ReLU, with the weight and bias keys given explicitly.
// A missing weight key is only reported on stdout.
ILayer* convRelu_extras(INetworkDefinition *network, std::map<std::string, Weights>& weightMap, ITensor& input, int outch, int ksize, int s, int p, const std::string weight_name, const std::string bias_name)
{
    if (weightMap.count(weight_name) == 0)
    {
        std::cout << "no key: " << weight_name << std::endl;
    }

    IConvolutionLayer* conv1 = network->addConvolutionNd(input, outch, DimsHW{ksize, ksize}, weightMap[weight_name], weightMap[bias_name]);
    assert(conv1);
    conv1->setStrideNd(DimsHW{s, s});
    conv1->setPaddingNd(DimsHW{p, p});

    auto lr = network->addActivation(*conv1->getOutput(0), ActivationType::kRELU);
    assert(lr);
    return lr;
}

//convBnLeaky(network, weightMap, *data, 32, 3, 1, 1, 0);
IConvolutionLayer*
convReluconv_tcb0(INetworkDefinition *network, std::map& weightMap, ITensor& input, int outch, int ksize, int s, int p, int indx_0, int indx_1){ std::string name_w0 = "tcb0." + (std::string)std::to_string(indx_0) + ".weight"; std::string name_b0 = "tcb0." + (std::string)std::to_string(indx_0) + ".bias"; std::string name_w1 = "tcb0." + (std::string)std::to_string(indx_1) + ".weight"; std::string name_b1 = "tcb0." + (std::string)std::to_string(indx_1) + ".bias"; if (weightMap.count(name_w0) == 0) std::cout << "no key: " <addConvolutionNd(input, outch, DimsHW{ksize, ksize}, weightMap[name_w0], weightMap[name_b0]); assert(conv1); conv1->setStrideNd(DimsHW{s, s}); conv1->setPaddingNd(DimsHW{p, p}); auto lr = network->addActivation(*conv1->getOutput(0), ActivationType::kRELU); IConvolutionLayer* conv2 = network->addConvolutionNd(*lr->getOutput(0), 256, DimsHW{3, 3}, weightMap[name_w1], weightMap[name_b1]); assert(conv2); conv2->setStrideNd(DimsHW{1, 1}); conv2->setPaddingNd(DimsHW{1, 1}); return conv2; } ILayer* ReluconvRelu_tcb2(INetworkDefinition *network, std::map& weightMap, ITensor& input, int outch, int ksize, int s, int p, int indx_0){ auto lr = network->addActivation(input, ActivationType::kRELU); std::string name_w0 = "tcb2." + (std::string)std::to_string(indx_0) + ".weight"; std::string name_b0 = "tcb2." 
+ (std::string)std::to_string(indx_0) + ".bias"; if (weightMap.count(name_w0) == 0) std::cout << "no key: " <addConvolutionNd(*lr->getOutput(0), outch, DimsHW{ksize, ksize}, weightMap[name_w0], weightMap[name_b0]); assert(conv1); conv1->setStrideNd(DimsHW{s, s}); conv1->setPaddingNd(DimsHW{p, p}); auto lr1 = network->addActivation(*conv1->getOutput(0), ActivationType::kRELU); return lr1; } ILayer* conv_permutation(INetworkDefinition *network, std::map& weightMap, ITensor& input, int outch, int ksize, int s, int p, const std::string weight_name, const std::string bias_name) { if (weightMap.count(weight_name) == 0) std::cout << "no key: " <addConvolutionNd(input, outch, DimsHW{ksize, ksize}, weightMap[weight_name], weightMap[bias_name]); assert(a0); a0->setStrideNd(DimsHW{s, s}); a0->setPaddingNd(DimsHW{p, p}); auto sfl = network->addShuffle(*a0->getOutput(0)); sfl->setFirstTranspose(Permutation{1, 2, 0}); return sfl; } ILayer* cat_4_tensor(INetworkDefinition *network, ILayer*tensor_0, ILayer*tensor_1, ILayer*tensor_2, ILayer*tensor_3) { Dims dim_; dim_.nbDims=1; dim_.d[0]=-1; //40 40 12 --->>40*40*12 auto arm_loc_00 = network->addShuffle(*tensor_0->getOutput(0)); assert(arm_loc_00); arm_loc_00->setReshapeDimensions(dim_); //20 20 12 --->>20*20*12 auto arm_loc_11 = network->addShuffle(*tensor_1->getOutput(0)); assert(arm_loc_11); arm_loc_11->setReshapeDimensions(dim_); //Dims2(-1, 1) //10 10 12 --->>10*10*12 auto arm_loc_22 = network->addShuffle(*tensor_2->getOutput(0)); assert(arm_loc_22); arm_loc_22->setReshapeDimensions(dim_); //5 5 12 --->>5*5*12 auto arm_loc_33 = network->addShuffle(*tensor_3->getOutput(0)); assert(arm_loc_33); arm_loc_33->setReshapeDimensions(dim_); // // Dims dim0 = arm_loc_00->getOutput(0)->getDimensions(); // std::cout <<"debug arm_loc_0 dim==" << dim0.d[0] << " " << dim0.d[1] << " " << dim0.d[2] << " " << dim0.d[3] << std::endl; // Dims dim1 = arm_loc_11->getOutput(0)->getDimensions(); // std::cout <<"debug arm_loc_1 dim==" << dim1.d[0] << 
" " << dim1.d[1] << " " << dim1.d[2] << " " << dim1.d[3] << std::endl; // Dims dim2 = arm_loc_22->getOutput(0)->getDimensions(); // std::cout <<"debug arm_loc_2 dim==" << dim2.d[0] << " " << dim2.d[1] << " " << dim2.d[2] << " " << dim2.d[3] << std::endl; // Dims dim3 = arm_loc_33->getOutput(0)->getDimensions(); // std::cout <<"debug arm_loc_3 dim==" << dim3.d[0] << " " << dim3.d[1] << " " << dim3.d[2] << " " << dim3.d[3] << std::endl; ITensor* arm_loc_t[] = {arm_loc_00->getOutput(0), arm_loc_11->getOutput(0), arm_loc_22->getOutput(0), arm_loc_33->getOutput(0)}; auto arm_loc = network->addConcatenation(arm_loc_t, 4); //[25500] return arm_loc; } ILayer* reshapeSoftmax(INetworkDefinition *network, ITensor& input, int ch) { //The input is one-dimensional[12750] //reshape[XX,ch] auto re1 = network->addShuffle(input); assert(re1); re1->setReshapeDimensions(Dims3(1, -1, ch)); //[1,6375,2]; // re1->setReshapeDimensions(Dims2(-1, ch)); //[6375,2]; Dims dim0 = re1->getOutput(0)->getDimensions(); std::cout <<"debug re1 dim==" << dim0.d[0] << " " << dim0.d[1] << " " << dim0.d[2] << " " << dim0.d[3] << std::endl; // return re1;///////////////////////////////////////// auto sm = network->addSoftMax(*re1->getOutput(0)); sm->setAxes(1<<2); assert(sm); //And then reshape one-dimensional again, and it's the same shape as it came in Dims dim_; dim_.nbDims=1; dim_.d[0]=-1; auto re2 = network->addShuffle(*sm->getOutput(0)); assert(re2); re2->setReshapeDimensions(dim_); return re2; } IScaleLayer* L2norm(INetworkDefinition *network, std::map& weightMap, ITensor& input, const std::string pre_name = "conv4_3_L2Norm.weight") { //aa = x.pow(2) ## [1,512,40,40] const static float pval1[3]{0.0, 1.0, 2.0}; Weights wshift1{DataType::kFLOAT, pval1, 1}; Weights wscale1{DataType::kFLOAT, pval1+1, 1}; Weights wpower1{DataType::kFLOAT, pval1+2, 1}; IScaleLayer* scale1 = network->addScale( input, ScaleMode::kUNIFORM, wshift1, wscale1, wpower1); assert(scale1); //bb = x.pow(2).sum(dim=1, keepdim=True) 
## [1,1,40,40] IReduceLayer* reduce1 = network->addReduce(*scale1->getOutput(0), ReduceOperation::kSUM, 1, true); assert(reduce1); //norm = x.pow(2).sum(dim=1, keepdim=True).sqrt()+self.eps # [1,1,40,40] const static float pval2[3]{0.0, 1.0, 0.5}; Weights wshift2{DataType::kFLOAT, pval2, 1}; Weights wscale2{DataType::kFLOAT, pval2+1, 1}; Weights wpower2{DataType::kFLOAT, pval2+2, 1}; IScaleLayer* scale2 = network->addScale( *reduce1->getOutput(0), ScaleMode::kUNIFORM, wshift2, wscale2, wpower2); assert(scale2); // x = torch.div(x,norm) IElementWiseLayer* ew2 = network->addElementWise(input, *scale2->getOutput(0), ElementWiseOperation::kDIV); assert(ew2); //out = self.weight.unsqueeze(0).unsqueeze(2).unsqueeze(3).expand_as(x) * x int len = weightMap[pre_name].count; float* pval3 = reinterpret_cast(malloc(sizeof(float) * len)); std::fill_n(pval3, len, 1.0); Weights wpower3{DataType::kFLOAT, pval3, len}; weightMap[pre_name + ".power3"] = wpower3; float* pval4 = reinterpret_cast(malloc(sizeof(float) * len)); std::fill_n(pval4, len, 0.0); Weights wpower4{DataType::kFLOAT, pval4, len}; weightMap[pre_name + ".power4"] = wpower4; IScaleLayer* scale3 = network->addScale( *ew2->getOutput(0), ScaleMode::kCHANNEL, wpower4, weightMap[pre_name], wpower3); assert(scale3); return scale3; } //convBnLeaky(network, weightMap, *data, 32, 3, 1, 1, 0); ILayer* convBnLeaky(INetworkDefinition *network, std::map& weightMap, ITensor& input, int outch, int ksize, int s, int p, int linx) { Weights emptywts{DataType::kFLOAT, nullptr, 0}; IConvolutionLayer* conv1 = network->addConvolutionNd(input, outch, DimsHW{ksize, ksize}, weightMap["module_list." + std::to_string(linx) + ".Conv2d.weight"], emptywts); assert(conv1); conv1->setStrideNd(DimsHW{s, s}); conv1->setPaddingNd(DimsHW{p, p}); IScaleLayer* bn1 = addBatchNorm2d(network, weightMap, *conv1->getOutput(0), "module_list." 
+ std::to_string(linx) + ".BatchNorm2d", 1e-5); auto lr = network->addActivation(*bn1->getOutput(0), ActivationType::kLEAKY_RELU); lr->setAlpha(0.1); return lr; } // Creat the engine using only the API and not any parser. ICudaEngine* createEngine(unsigned int maxBatchSize, IBuilder* builder, IBuilderConfig* config, DataType dt) { INetworkDefinition* network = builder->createNetworkV2(0U); ITensor* data = network->addInput(INPUT_BLOB_NAME, dt, Dims3{3, INPUT_H, INPUT_W}); assert(data); std::map weightMap = loadWeights(path_wts); Weights emptywts{DataType::kFLOAT, nullptr, 0}; DimsHW maxpool_hw = DimsHW(2,2); auto lr0 = convRelu(network, weightMap, *data, 64, 3, 1, 1, 0); auto lr1 = convRelu(network, weightMap, *lr0->getOutput(0), 64, 3, 1, 1, 2); IPoolingLayer* pool1 = network->addPoolingNd(*lr1->getOutput(0), PoolingType::kMAX, DimsHW{2, 2}); assert(pool1); pool1->setStrideNd(DimsHW{2, 2}); auto lr2 = convRelu(network, weightMap, *pool1->getOutput(0), 128, 3, 1, 1, 5); auto lr3 = convRelu(network, weightMap, *lr2->getOutput(0), 128, 3, 1, 1, 7); IPoolingLayer* pool2 = network->addPoolingNd(*lr3->getOutput(0), PoolingType::kMAX, DimsHW{2, 2}); assert(pool2); pool2->setStrideNd(DimsHW{2, 2}); auto lr4 = convRelu(network, weightMap, *pool2->getOutput(0), 256, 3, 1, 1, 10); auto lr5 = convRelu(network, weightMap, *lr4->getOutput(0), 256, 3, 1, 1, 12); auto lr6 = convRelu(network, weightMap, *lr5->getOutput(0), 256, 3, 1, 1, 14); IPoolingLayer* pool3 = network->addPoolingNd(*lr6->getOutput(0), PoolingType::kMAX, DimsHW{2, 2}); assert(pool3); pool3->setStrideNd(DimsHW{2, 2}); auto lr7 = convRelu(network, weightMap, *pool3->getOutput(0), 512, 3, 1, 1, 17); auto lr8 = convRelu(network, weightMap, *lr7->getOutput(0), 512, 3, 1, 1, 19); auto lr9 = convRelu(network, weightMap, *lr8->getOutput(0), 512, 3, 1, 1, 21); IPoolingLayer* pool4 = network->addPoolingNd(*lr9->getOutput(0), PoolingType::kMAX, DimsHW{2, 2}); assert(pool4); pool4->setStrideNd(DimsHW{2, 2}); auto lr24 = 
convRelu(network, weightMap, *pool4->getOutput(0), 512, 3, 1, 1, 24); auto lr26 = convRelu(network, weightMap, *lr24->getOutput(0), 512, 3, 1, 1, 26); auto lr28 = convRelu(network, weightMap, *lr26->getOutput(0), 512, 3, 1, 1, 28); IPoolingLayer* pool5 = network->addPoolingNd(*lr28->getOutput(0), PoolingType::kMAX, DimsHW{2, 2}); assert(pool5); pool5->setStrideNd(DimsHW{2, 2}); auto lr31 = convRelu(network, weightMap, *pool5->getOutput(0), 1024, 3, 1, 3, 31,"vgg.",true); //s_0 auto out_conv4_3_L2Norm = L2norm(network, weightMap, *lr9->getOutput(0),"conv4_3_L2Norm.weight"); //s_1 auto out_conv5_3_L2Norm = L2norm(network, weightMap, *lr28->getOutput(0),"conv5_3_L2Norm.weight"); //s_2 auto lr33 = convRelu(network, weightMap, *lr31->getOutput(0), 1024, 1, 1, 0, 33); auto extras0 = convRelu_extras(network, weightMap, *lr33->getOutput(0), 256, 1, 1, 0, "extras.0.weight", "extras.0.bias"); //s_3 auto extras1 = convRelu_extras(network, weightMap, *extras0->getOutput(0), 512, 3, 2, 1, "extras.1.weight", "extras.1.bias"); auto arm_loc_0 = conv_permutation(network, weightMap, *out_conv4_3_L2Norm->getOutput(0), 12, 3, 1, 1, "arm_loc.0.weight", "arm_loc.0.bias"); auto arm_loc_1 = conv_permutation(network, weightMap, *out_conv5_3_L2Norm->getOutput(0), 12, 3, 1, 1, "arm_loc.1.weight", "arm_loc.1.bias"); auto arm_loc_2 = conv_permutation(network, weightMap, *lr33->getOutput(0), 12, 3, 1, 1, "arm_loc.2.weight", "arm_loc.2.bias"); auto arm_loc_3 = conv_permutation(network, weightMap, *extras1->getOutput(0), 12, 3, 1, 1, "arm_loc.3.weight", "arm_loc.3.bias"); auto arm_conf_0 = conv_permutation(network, weightMap, *out_conv4_3_L2Norm->getOutput(0), 6, 3, 1, 1, "arm_conf.0.weight", "arm_conf.0.bias"); auto arm_conf_1 = conv_permutation(network, weightMap, *out_conv5_3_L2Norm->getOutput(0), 6, 3, 1, 1, "arm_conf.1.weight", "arm_conf.1.bias"); auto arm_conf_2 = conv_permutation(network, weightMap, *lr33->getOutput(0), 6, 3, 1, 1, "arm_conf.2.weight", "arm_conf.2.bias"); auto arm_conf_3 = 
conv_permutation(network, weightMap, *extras1->getOutput(0), 6, 3, 1, 1, "arm_conf.3.weight", "arm_conf.3.bias"); auto arm_loc = cat_4_tensor(network, arm_loc_0, arm_loc_1, arm_loc_2, arm_loc_3); auto arm_conf = cat_4_tensor(network, arm_conf_0, arm_conf_1, arm_conf_2, arm_conf_3); auto ss_0 = convReluconv_tcb0(network, weightMap, *extras1->getOutput(0), 256, 3, 1, 1, 9, 11); auto ss_00 = ReluconvRelu_tcb2(network, weightMap, *ss_0->getOutput(0), 256, 3, 1, 1, 10); auto ss_1 = convReluconv_tcb0(network, weightMap, *lr33->getOutput(0), 256, 3, 1, 1, 6, 8); IDeconvolutionLayer* tcb1_2 = network->addDeconvolutionNd(*ss_00->getOutput(0), 256, DimsHW{2, 2}, weightMap["tcb1.2.weight"], weightMap["tcb1.2.bias"]); //nn.ConvTranspose2d(256, 256, 2, 2) tcb1_2->setStrideNd(DimsHW{2, 2}); assert(tcb1_2); auto ss_1_add = network->addElementWise(*ss_1->getOutput(0), *tcb1_2->getOutput(0), ElementWiseOperation::kSUM); auto ss_11 = ReluconvRelu_tcb2(network, weightMap, *ss_1_add->getOutput(0), 256, 3, 1, 1, 7); auto ss_2 = convReluconv_tcb0(network, weightMap, *out_conv5_3_L2Norm->getOutput(0), 256, 3, 1, 1, 3, 5); IDeconvolutionLayer* tcb1_1 = network->addDeconvolutionNd(*ss_11->getOutput(0), 256, DimsHW{2, 2}, weightMap["tcb1.1.weight"], weightMap["tcb1.1.bias"]); //nn.ConvTranspose2d(256, 256, 2, 2) tcb1_1->setStrideNd(DimsHW{2, 2}); assert(tcb1_1); auto ss_2_add = network->addElementWise(*ss_2->getOutput(0), *tcb1_1->getOutput(0), ElementWiseOperation::kSUM); auto ss_22 = ReluconvRelu_tcb2(network, weightMap, *ss_2_add->getOutput(0), 256, 3, 1, 1, 4); auto ss_3 = convReluconv_tcb0(network, weightMap, *out_conv4_3_L2Norm->getOutput(0), 256, 3, 1, 1, 0, 2); IDeconvolutionLayer* tcb1_0 = network->addDeconvolutionNd(*ss_22->getOutput(0), 256, DimsHW{2, 2}, weightMap["tcb1.0.weight"], weightMap["tcb1.0.bias"]); //nn.ConvTranspose2d(256, 256, 2, 2) tcb1_0->setStrideNd(DimsHW{2, 2}); assert(tcb1_0); auto ss_3_add = network->addElementWise(*ss_3->getOutput(0), *tcb1_0->getOutput(0), 
ElementWiseOperation::kSUM); auto ss_33 = ReluconvRelu_tcb2(network, weightMap, *ss_3_add->getOutput(0), 256, 3, 1, 1, 1); auto odm_loc_0 = conv_permutation(network, weightMap, *ss_33->getOutput(0), 12, 3, 1, 1, "odm_loc.0.weight", "odm_loc.0.bias"); auto odm_loc_1 = conv_permutation(network, weightMap, *ss_22->getOutput(0), 12, 3, 1, 1, "odm_loc.1.weight", "odm_loc.1.bias"); auto odm_loc_2 = conv_permutation(network, weightMap, *ss_11->getOutput(0), 12, 3, 1, 1, "odm_loc.2.weight", "odm_loc.2.bias"); auto odm_loc_3 = conv_permutation(network, weightMap, *ss_00->getOutput(0), 12, 3, 1, 1, "odm_loc.3.weight", "odm_loc.3.bias"); auto odm_conf_0 = conv_permutation(network, weightMap, *ss_33->getOutput(0), 3 * num_class, 3, 1, 1, "odm_conf.0.weight", "odm_conf.0.bias"); auto odm_conf_1 = conv_permutation(network, weightMap, *ss_22->getOutput(0), 3 * num_class, 3, 1, 1, "odm_conf.1.weight", "odm_conf.1.bias"); auto odm_conf_2 = conv_permutation(network, weightMap, *ss_11->getOutput(0), 3 * num_class, 3, 1, 1, "odm_conf.2.weight", "odm_conf.2.bias"); auto odm_conf_3 = conv_permutation(network, weightMap, *ss_00->getOutput(0), 3 * num_class, 3, 1, 1, "odm_conf.3.weight", "odm_conf.3.bias"); auto odm_loc = cat_4_tensor(network, odm_loc_0, odm_loc_1, odm_loc_2, odm_loc_3); auto odm_conf = cat_4_tensor(network, odm_conf_0, odm_conf_1, odm_conf_2, odm_conf_3); //25500 Dims dim = arm_loc->getOutput(0)->getDimensions(); std::cout <<"debug arm_loc dim==" << dim.d[0] << " " << dim.d[1] << " " << dim.d[2] << " " << dim.d[3] << std::endl; arm_loc->getOutput(0)->setName(OUTPUT_BLOB_NAME_arm_loc); network->markOutput(*arm_loc->getOutput(0)); auto arm_conf_111 = reshapeSoftmax(network, *arm_conf->getOutput(0), 2); //12750 Dims dim2 = arm_conf_111->getOutput(0)->getDimensions(); std::cout <<"debug arm_conf dim==" << dim2.d[0] << " " << dim2.d[1] << " " << dim2.d[2] << " " << dim2.d[3] << std::endl; arm_conf_111->getOutput(0)->setName(OUTPUT_BLOB_NAME_arm_conf); 
network->markOutput(*arm_conf_111->getOutput(0)); //25500 Dims dim3 = odm_loc->getOutput(0)->getDimensions(); std::cout <<"debug odm_loc dim==" << dim3.d[0] << " " << dim3.d[1] << " " << dim3.d[2] << " " << dim3.d[3] << std::endl; odm_loc->getOutput(0)->setName(OUTPUT_BLOB_NAME_odm_loc); network->markOutput(*odm_loc->getOutput(0)); //159375 Dims dim4 = odm_conf->getOutput(0)->getDimensions(); odm_conf = reshapeSoftmax(network, *odm_conf->getOutput(0), 25); std::cout <<"debug odm_conf dim==" << dim4.d[0] << " " << dim4.d[1] << " " << dim4.d[2] << " " << dim4.d[3] << std::endl; odm_conf->getOutput(0)->setName(OUTPUT_BLOB_NAME_odm_conf); network->markOutput(*odm_conf->getOutput(0)); builder->setMaxBatchSize(maxBatchSize); config->setMaxWorkspaceSize(16 * (1 << 20)); // 16MB #if defined(USE_FP16) config->setFlag(BuilderFlag::kFP16); #elif defined(USE_INT8) std::cout << "Your platform support int8: " << (builder->platformHasFastInt8() ? "true" : "false") << std::endl; assert(builder->platformHasFastInt8()); config->setFlag(BuilderFlag::kINT8); Int8EntropyCalibrator2 *calibrator = new Int8EntropyCalibrator2(1, INPUT_W, INPUT_H, "./coco_calib/", "int8calib.table", INPUT_BLOB_NAME); config->setInt8Calibrator(calibrator); #endif std::cout << "Building engine, please wait for a while..." << std::endl; ICudaEngine* engine = builder->buildEngineWithConfig(*network, *config); std::cout << "Build engine successfully!" 
<< std::endl; // Don't need the network any more network->destroy(); // Release host memory for (auto& mem : weightMap) { free((void*) (mem.second.values)); } return engine; } void APIToModel(unsigned int maxBatchSize, IHostMemory** modelStream) { // Create builder IBuilder* builder = createInferBuilder(gLogger); IBuilderConfig* config = builder->createBuilderConfig(); // Create model to populate the network, then set the outputs and create an engine ICudaEngine* engine = createEngine(maxBatchSize, builder, config, DataType::kFLOAT); assert(engine != nullptr); // Serialize the engine (*modelStream) = engine->serialize(); // Close everything down engine->destroy(); builder->destroy(); } torch::Tensor PriorBox() { std::vector mean; std::vector feature_maps = {40,20,10,5}; int image_size = 320; std::vector steps = {8,16,32,64}; std::vector min_sizes = {32,64,128,256}; std::vector aspect_ratios = {2,2,2,2}; for(int k=0;k variance({0.1,0.2}); torch::Tensor top_2 = torch::tensor({0,1}).cuda().to(torch::kLong); torch::Tensor bottom_2 = torch::tensor({2,3}).cuda().to(torch::kLong); auto c1 = _prior.index_select(1,top_2)+_loc.index_select(1,top_2).mul(variance[0])*_prior.index_select(1,bottom_2); auto c2 = _prior.index_select(1,bottom_2)*torch::exp(_loc.index_select(1,bottom_2)*variance[1]); auto _retv = torch::cat({c1,c2},1); if(b_form_pt) { auto c3 = _retv.index_select(1,top_2)-_retv.index_select(1,bottom_2).div(2); auto c4 = c3 + _retv.index_select(1,bottom_2); return torch::cat({c3,c4},1); } else { return _retv; } } torch::Tensor center(torch::Tensor retv) { auto c1 = retv.select(1,0).unsqueeze(1); auto c2 = retv.select(1,1).unsqueeze(1); auto c3 = retv.select(1,2).unsqueeze(1); auto c4 = retv.select(1,3).unsqueeze(1); auto _retv = torch::cat({(c1+c3).div(2),(c2+c4).div(2),c3-c1,c4-c2},1); return _retv; } bool nms(const torch::Tensor& boxes, const torch::Tensor& scores, torch::Tensor &keep, int &count,float overlap, int top_k) { count =0; keep = 
torch::zeros({scores.size(0)}).to(torch::kLong).to(scores.device()); if(0 == boxes.numel()) { return false; } torch::Tensor x1 = boxes.select(1,0).clone(); torch::Tensor y1 = boxes.select(1,1).clone(); torch::Tensor x2 = boxes.select(1,2).clone(); torch::Tensor y2 = boxes.select(1,3).clone(); torch::Tensor area = (x2-x1)*(y2-y1); // std::cout< sort_ret = torch::sort(scores.unsqueeze(1), 0, 0); torch::Tensor v = std::get<0>(sort_ret).squeeze(1).to(scores.device()); torch::Tensor idx = std::get<1>(sort_ret).squeeze(1).to(scores.device()); int num_ = idx.size(0); if(num_ > top_k) //python:idx = idx[-top_k:] { idx = idx.slice(0,num_-top_k,num_).clone(); } torch::Tensor xx1,yy1,xx2,yy2,w,h; while(idx.numel() > 0) { auto i = idx[-1]; keep[count] = i; count += 1; if(1 == idx.size(0)) { break; } idx = idx.slice(0,0,idx.size(0)-1).clone(); xx1 = x1.index_select(0,idx); yy1 = y1.index_select(0,idx); xx2 = x2.index_select(0,idx); yy2 = y2.index_select(0,idx); xx1 = xx1.clamp(x1[i].item().toFloat(),INT_MAX*1.0); yy1 = yy1.clamp(y1[i].item().toFloat(),INT_MAX*1.0); xx2 = xx2.clamp(INT_MIN*1.0,x2[i].item().toFloat()); yy2 = yy2.clamp(INT_MIN*1.0,y2[i].item().toFloat()); w = xx2 - xx1; h = yy2 - yy1; w = w.clamp(0,INT_MAX); h = h.clamp(0,INT_MAX); torch::Tensor inter = w * h; torch::Tensor rem_areas = area.index_select(0,idx); torch::Tensor union_ = (rem_areas - inter) + area[i]; torch::Tensor Iou = inter * 1.0 / union_; torch::Tensor index_small = Iou < overlap; auto mask_idx = torch::nonzero(index_small).squeeze(); idx = idx.index_select(0,mask_idx);//pthon: idx = idx[IoU.le(overlap)] } return true; } void doInference(IExecutionContext& context, void* buffers[], cudaStream_t &stream, float* input, std::vector> &detections) { auto start_infer = std::chrono::system_clock::now(); detections.clear(); int batchSize = 1; const ICudaEngine& engine = context.getEngine(); // Pointers to input and output device buffers to pass to engine. 
// Engine requires exactly IEngine::getNbBindings() number of buffers. // std::cout<<"engine.getNbBindings()==="<(end_infer - start_infer).count(); std::cout <<"time consume context.enqueue===" << during_time << "ms" << std::endl; auto start_houchuli = std::chrono::system_clock::now(); int m_prior_size = 6375; torch::Tensor m_prior = PriorBox(); torch::Tensor arm_loc = torch::from_blob(buffers[outputIndex_arm_loc],{m_prior_size,4}).cuda().toType(torch::kFloat64).unsqueeze(0); torch::Tensor arm_conf = torch::from_blob(buffers[outputIndex_arm_conf],{m_prior_size,2}).cuda().toType(torch::kFloat64).unsqueeze(0); torch::Tensor odm_loc = torch::from_blob(buffers[outputIndex_odm_loc],{m_prior_size,4}).cuda().toType(torch::kFloat64).unsqueeze(0); torch::Tensor odm_conf = torch::from_blob(buffers[outputIndex_odm_conf],{m_prior_size,25}).cuda().toType(torch::kFloat64).unsqueeze(0); float obj_threshed = 0.01; torch::Tensor arm_object_conf = arm_conf.squeeze(0).select(1,1); torch::Tensor object_index = arm_object_conf > obj_threshed; object_index=object_index.unsqueeze(1); torch::Tensor object_index_1 = object_index.expand_as(odm_conf.squeeze(0)).toType(torch::kFloat64); auto filter_odm_conf = odm_conf.squeeze(0).toType(torch::kFloat64) * object_index_1; torch::Tensor conf_preds_ = filter_odm_conf.clone().toType(torch::kFloat64); torch::Tensor conf_preds = conf_preds_.transpose(1,0).toType(torch::kFloat64); torch::Tensor default_m = decode(arm_loc[0],m_prior); // default_m = center(default_m); bool b_form_pt = true; torch::Tensor decode_boxes_m = decode(odm_loc[0],default_m,b_form_pt);//6375,4 float conf_thresh = 0.01; float mask_thresh = 0.01; torch::Tensor result_out; for(int i=1;i<25;i++) { torch::Tensor c_mask_m = conf_preds[i] > mask_thresh; torch::Tensor nonzero_index = torch::nonzero(c_mask_m); torch::Tensor score_m = torch::index_select(conf_preds[i],0,nonzero_index.squeeze(1)); torch::Tensor boxes_m = torch::index_select(decode_boxes_m,0,nonzero_index.squeeze(1)); 
torch::Tensor keep; int count = 0; float overlap = 0.45; int top_k=1000; nms(boxes_m, score_m, keep, count, overlap, top_k); if(0 == count) { continue; } keep = keep.slice(0,0,count).clone(); torch::Tensor score_my = score_m.index_select(0,keep); torch::Tensor boxes_my = boxes_m.index_select(0,keep); if(score_my[0].item().toFloat() < conf_thresh) { continue; } // boxes_my.select(1,0).mul_(width); // boxes_my.select(1,1).mul_(height); // boxes_my.select(1,2).mul_(width); // boxes_my.select(1,3).mul_(height); torch::Tensor label_tensor = torch::full_like(score_my.unsqueeze(1),i); torch::Tensor result_ = torch::cat({boxes_my.toType(torch::kFloat64),score_my.unsqueeze(1).toType(torch::kFloat64),label_tensor.toType(torch::kFloat64)},1); if(0 == result_out.numel()) { result_out = result_.clone(); }else { result_out = torch::cat({result_out,result_},0);//Splicing by line } } if(0 == result_out.numel()) { std::cout<<"libtorch refinedet obj_small: nothing detect!"<(); for(int i=0;i v_detections; v_detections.push_back(0); //image_id v_detections.push_back(id_label); //label v_detections.push_back(score); //score v_detections.push_back(x1); //xmin v_detections.push_back(y1); //ymin v_detections.push_back(x2); //xmax v_detections.push_back(y2); //ymax detections.push_back(v_detections); } cudaDeviceSynchronize(); auto end_houchuli = std::chrono::system_clock::now(); double during_time_houchuli = std::chrono::duration_cast(end_houchuli - start_houchuli).count(); std::cout <<"time consume houchuli===" << during_time_houchuli << "ms" << std::endl; } void base_transform(const cv::Mat &m_src,float *data) { cv::Mat image; cv::resize(m_src,image,cv::Size(INPUT_W,INPUT_H)); if(1 == image.channels()) { cv::cvtColor(image,image,CV_GRAY2BGR); } for(int i=0;i(i); //Get the first address of the row pointer for(int j=0;j(modelStream->data()), modelStream->size()); modelStream->destroy(); return 0; #elif defined INFER std::ifstream file(path_engine, std::ios::binary); if (file.good()) { 
file.seekg(0, file.end); size = file.tellg(); file.seekg(0, file.beg); trtModelStream = new char[size]; assert(trtModelStream); file.read(trtModelStream, size); file.close(); } #else std::cerr << "arguments not right!" << std::endl; std::cerr << "configure.h should difine SERIALIZE INFER" << std::endl; std::cerr << "please check!" << std::endl; return -1; #endif std::vector file_names; if (read_files_in_dir(p_dir_name, file_names) < 0) { std::cout << "read_files_in_dir failed." << std::endl; return -1; } // prepare input data --------------------------- float data[3 * INPUT_H * INPUT_W]; IRuntime* runtime = createInferRuntime(gLogger); //400M assert(runtime != nullptr); ICudaEngine* engine = runtime->deserializeCudaEngine(trtModelStream, size); //777M assert(engine != nullptr); IExecutionContext* context = engine->createExecutionContext(); //971M assert(context != nullptr); delete[] trtModelStream; const int batchSize = 1; const int inputIndex=0; const int outputIndex_arm_loc=1; const int outputIndex_arm_conf=3; const int outputIndex_odm_loc=2; const int outputIndex_odm_conf=4; //Initialize cuda memory: input and 4 output memory void* buffers[5]; // Create GPU buffers on device CUDA_CHECK(cudaMalloc(&buffers[0], batchSize * 3 * INPUT_H * INPUT_W * sizeof(float))); const int OUTPUT_SIZE_arm_loc = 25500; //40*40*12 + 20*20*12 + 10*10*12 + 5*5*12 = 25500 (Fixed value) CUDA_CHECK(cudaMalloc(&buffers[outputIndex_arm_loc], batchSize * OUTPUT_SIZE_arm_loc * sizeof(float))); const int OUTPUT_SIZE_arm_conf = 12750; //40*40*6 + 20*20*6 + 10*10*6 + 5*5*6 = 12750 (Fixed value) CUDA_CHECK(cudaMalloc(&buffers[outputIndex_arm_conf], batchSize * OUTPUT_SIZE_arm_conf * sizeof(float))); const int OUTPUT_SIZE_odm_loc = 25500; //40*40*12 + 20*20*12 + 10*10*12 + 5*5*12 = 25500 (Fixed value) CUDA_CHECK(cudaMalloc(&buffers[outputIndex_odm_loc], batchSize * OUTPUT_SIZE_odm_loc * sizeof(float))); const int OUTPUT_SIZE_odm_conf = 159375; //40*40*(num_class*3) + 20*20**(num_class*3) + 
10*10**(num_class*3) + 5*5**(num_class*3) //here num_class=25// =159375 CUDA_CHECK(cudaMalloc(&buffers[outputIndex_odm_conf], batchSize * OUTPUT_SIZE_odm_conf * sizeof(float))); // Create stream cudaStream_t stream; CUDA_CHECK(cudaStreamCreate(&stream)); int fcount = 0; auto t_0 = std::chrono::steady_clock::now(); for (auto f: file_names) { fcount++; std::cout << "\n" << fcount << " " << f << std::endl; std::cout << std::string(p_dir_name) + "/" + f << std::endl; auto start_read = std::chrono::system_clock::now(); cv::Mat img = cv::imread(std::string(p_dir_name) + "/" + f); cudaDeviceSynchronize(); auto end_read = std::chrono::system_clock::now(); double during_time_read = std::chrono::duration_cast(end_read - start_read).count(); std::cout <<"time consume during_time_read===" << during_time_read << "ms" << std::endl; if (img.empty()) continue; auto start_yuchuli = std::chrono::system_clock::now(); base_transform(img,data); cudaDeviceSynchronize(); auto end_yuchuli = std::chrono::system_clock::now(); double during_time_yuchuli = std::chrono::duration_cast(end_yuchuli - start_yuchuli).count(); std::cout <<"time consume base_transform===" << during_time_yuchuli << "ms" << std::endl; auto start_doInfer = std::chrono::system_clock::now(); std::vector> detections; doInference(*context, buffers, stream, data, detections); cudaDeviceSynchronize(); auto end_doInfer = std::chrono::system_clock::now(); double during_doinfer = std::chrono::duration_cast(end_doInfer - start_doInfer).count(); std::cout <<"time consume doInference===" << during_doinfer << "ms" << std::endl; /* Print the detection results. */ for (size_t i = 0; i < detections.size(); ++i) { const std::vector &d = detections[i]; CHECK_EQ(d.size(), 7); const float score = d[2]; int label = int(d[1]); if (label >= num_class || label < 0) { std::cout << "label_Error!" 
// List the entries of a directory.
//
// Appends the bare name (not the full path) of every entry found in `p_dir_name`
// to `file_names`, skipping the "." and ".." pseudo-entries. Existing contents of
// `file_names` are left in place.
//
// Returns 0 on success, or -1 when the directory cannot be opened.
static inline int read_files_in_dir(const char* p_dir_name, std::vector<std::string>& file_names) {
    DIR* dir_handle = opendir(p_dir_name);
    if (!dir_handle) {
        return -1;
    }

    for (struct dirent* entry = readdir(dir_handle); entry != nullptr; entry = readdir(dir_handle)) {
        const bool is_self = (strcmp(entry->d_name, ".") == 0);
        const bool is_parent = (strcmp(entry->d_name, "..") == 0);
        if (is_self || is_parent) {
            continue;
        }
        // Only the entry name is stored; callers prepend the directory themselves.
        file_names.push_back(std::string(entry->d_name));
    }

    closedir(dir_handle);
    return 0;
}
def main(args):
    """Convert a RepVGG PyTorch ``state_dict`` into a TensorRT ``.wts`` text file.

    The output starts with a line holding the number of tensors, followed by
    one line per tensor: its name, its element count, and every element
    written as the hex form of a big-endian IEEE-754 float32.

    Args:
        args: Parsed command-line namespace providing ``weight`` (path of the
            checkpoint to read) and ``save_path`` (path of the ``.wts`` file
            to write).
    """
    # map_location="cpu" lets checkpoints that were saved from a GPU be
    # converted on a machine without CUDA; every tensor is moved to the CPU
    # below anyway, so the written values are unaffected.
    state_dict = torch.load(args.weight, map_location="cpu")
    with open(args.save_path, "w") as f:
        f.write("{}\n".format(len(state_dict)))
        for k, v in state_dict.items():
            vr = v.reshape(-1).cpu().numpy()
            # Keep the established layout byte-for-byte: "<name> <count> "
            # and then each value preceded by a single space.
            f.write("{} {} ".format(k, len(vr)))
            f.write("".join(" " + struct.pack(">f", float(vv)).hex() for vv in vr))
            f.write("\n")
messages with severity enum value greater than the reportable if (severity > reportableSeverity) return; switch (severity) { case Severity::kINTERNAL_ERROR: std::cerr << "INTERNAL_ERROR: "; break; case Severity::kERROR: std::cerr << "ERROR: "; break; case Severity::kWARNING: std::cerr << "WARNING: "; break; case Severity::kINFO: std::cerr << "INFO: "; break; default: std::cerr << "UNKNOWN: "; break; } std::cerr << msg << std::endl; } Severity reportableSeverity{Severity::kWARNING}; }; #endif // TENSORRT_LOGGING_H ================================================ FILE: repvgg/repvgg.cpp ================================================ #include "NvInfer.h" #include "cuda_runtime_api.h" #include "logging.h" #include #include #include #include #include #include #include #include #define CHECK(status) \ do \ { \ auto ret = (status); \ if (ret != 0) \ { \ std::cerr << "Cuda failure: " << ret << std::endl; \ abort(); \ } \ } while (0) // stuff we know about the network and the input/output blobs #define MAX_BATCH_SIZE 1 const std::vector groupwise_layers{2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26}; const std::map groupwise_counts = { {"RepVGG-A0", 1}, {"RepVGG-A1", 1}, {"RepVGG-A2", 1}, {"RepVGG-B0", 1}, {"RepVGG-B1", 1}, {"RepVGG-B1g2", 2}, {"RepVGG-B1g4", 4}, {"RepVGG-B2", 1}, {"RepVGG-B2g2", 2}, {"RepVGG-B2g4", 4}, {"RepVGG-B3", 1}, {"RepVGG-B3g2", 2}, {"RepVGG-B3g4", 4}}; const std::map> num_blocks = { {"RepVGG-A0", {2, 4, 14, 1}}, {"RepVGG-A1", {2, 4, 14, 1}}, {"RepVGG-A2", {2, 4, 14, 1}}, {"RepVGG-B0", {4, 6, 16, 1}}, {"RepVGG-B1", {4, 6, 16, 1}}, {"RepVGG-B1g2", {4, 6, 16, 1}}, {"RepVGG-B1g4", {4, 6, 16, 1}}, {"RepVGG-B2", {4, 6, 16, 1}}, {"RepVGG-B2g2", {4, 6, 16, 1}}, {"RepVGG-B2g4", {4, 6, 16, 1}}, {"RepVGG-B3", {4, 6, 16, 1}}, {"RepVGG-B3g2", {4, 6, 16, 1}}, {"RepVGG-B3g4", {4, 6, 16, 1}}}; const std::map> width_multiplier = { {"RepVGG-A0", {0.75, 0.75, 0.75, 2.5}}, {"RepVGG-A1", {1, 1, 1, 2.5}}, {"RepVGG-A2", {1.5, 1.5, 1.5, 2.75}}, {"RepVGG-B0", {1, 1, 1, 
2.5}}, {"RepVGG-B1", {2, 2, 2, 4}}, {"RepVGG-B1g2", {2, 2, 2, 4}}, {"RepVGG-B1g4", {2, 2, 2, 4}}, {"RepVGG-B2", {2.5, 2.5, 2.5, 5}}, {"RepVGG-B2g2", {2.5, 2.5, 2.5, 5}}, {"RepVGG-B2g4", {2.5, 2.5, 2.5, 5}}, {"RepVGG-B3", {3, 3, 3, 5}}, {"RepVGG-B3g2", {3, 3, 3, 5}}, {"RepVGG-B3g4", {3, 3, 3, 5}}}; static const int INPUT_H = 224; static const int INPUT_W = 224; static const int OUTPUT_SIZE = 1000; const char *INPUT_BLOB_NAME = "data"; const char *OUTPUT_BLOB_NAME = "prob"; using namespace nvinfer1; static Logger gLogger; // Load weights from files shared with TensorRT samples. // TensorRT weight files have a simple space delimited format: // [type] [size] std::map loadWeights(const std::string file) { std::cout << "Loading weights: " << file << std::endl; std::map weightMap; // Open weights file std::ifstream input(file); assert(input.is_open() && "Unable to load weight file."); // Read number of weight blobs int32_t count; input >> count; assert(count > 0 && "Invalid weight map file."); while (count--) { Weights wt{DataType::kFLOAT, nullptr, 0}; uint32_t size; // Read name and type of blob std::string name; input >> name >> std::dec >> size; wt.type = DataType::kFLOAT; // Load blob uint32_t *val = reinterpret_cast(malloc(sizeof(val) * size)); for (uint32_t x = 0, y = size; x < y; ++x) { input >> std::hex >> val[x]; } wt.values = val; wt.count = size; weightMap[name] = wt; } std::cout << "Finished Load weights: " << file << std::endl; return weightMap; } IActivationLayer *RepVGGBlock(INetworkDefinition *network, std::map &weightMap, ITensor &input, int inch, int outch, int stride, int groups, std::string lname) { IConvolutionLayer *conv = network->addConvolutionNd(input, outch, DimsHW{3, 3}, weightMap[lname + "rbr_reparam.weight"], weightMap[lname + "rbr_reparam.bias"]); conv->setStrideNd(DimsHW{stride, stride}); conv->setPaddingNd(DimsHW{1, 1}); conv->setNbGroups(groups); assert(conv); IActivationLayer *relu = network->addActivation(*conv->getOutput(0), 
ActivationType::kRELU); assert(relu); return relu; } IActivationLayer *makeStage(INetworkDefinition *network, std::map &weightMap, int &layer_idx, const int group_count, ITensor &input, int inch, int outch, int stride, int blocks, std::string lname) { IActivationLayer *layer; for (int i = 0; i < blocks; ++i) { int group = 1; if (std::find(groupwise_layers.begin(), groupwise_layers.end(), layer_idx) != groupwise_layers.end()) group = group_count; if (i == 0) layer = RepVGGBlock(network, weightMap, input, inch, outch, 2, group, lname + std::to_string(i) + "."); else layer = RepVGGBlock(network, weightMap, *layer->getOutput(0), inch, outch, 1, group, lname + std::to_string(i) + "."); layer_idx += 1; } return layer; } // Creat the engine using only the API and not any parser. ICudaEngine *createEngine(std::string netName, unsigned int maxBatchSize, IBuilder *builder, IBuilderConfig *config, DataType dt) { const std::vector blocks = num_blocks.at(netName); const std::vector widths = width_multiplier.at(netName); const int group_count = groupwise_counts.at(netName); int layer_idx = 1; std::map weightMap = loadWeights("../" + netName + ".wts"); INetworkDefinition *network = builder->createNetworkV2(0U); // Create input tensor of shape { 3, INPUT_H, INPUT_W } with name INPUT_BLOB_NAME ITensor *data = network->addInput(INPUT_BLOB_NAME, dt, Dims3{3, INPUT_H, INPUT_W}); assert(data); int in_planes = std::min(64, int(64 * widths[0])); auto stage0 = RepVGGBlock(network, weightMap, *data, 3, in_planes, 2, 1, "stage0."); assert(stage0); auto stage1 = makeStage(network, weightMap, layer_idx, group_count, *stage0->getOutput(0), in_planes, int(64 * widths[0]), 2, blocks[0], "stage1."); assert(stage1); auto stage2 = makeStage(network, weightMap, layer_idx, group_count, *stage1->getOutput(0), int(64 * widths[0]), int(128 * widths[1]), 2, blocks[1], "stage2."); assert(stage2); auto stage3 = makeStage(network, weightMap, layer_idx, group_count, *stage2->getOutput(0), int(128 * 
widths[1]), int(256 * widths[2]), 2, blocks[2], "stage3."); assert(stage3); auto stage4 = makeStage(network, weightMap, layer_idx, group_count, *stage3->getOutput(0), int(256 * widths[2]), int(512 * widths[3]), 2, blocks[3], "stage4."); assert(stage4); IPoolingLayer *pool = network->addPoolingNd(*stage4->getOutput(0), PoolingType::kAVERAGE, DimsHW{7, 7}); pool->setStrideNd(DimsHW{7, 7}); pool->setPaddingNd(DimsHW{0, 0}); assert(pool); IFullyConnectedLayer *linear = network->addFullyConnected(*pool->getOutput(0), 1000, weightMap["linear.weight"], weightMap["linear.bias"]); assert(linear); linear->getOutput(0)->setName(OUTPUT_BLOB_NAME); std::cout << "set name out" << std::endl; network->markOutput(*linear->getOutput(0)); // Build engine builder->setMaxBatchSize(maxBatchSize); config->setMaxWorkspaceSize(1 << 20); ICudaEngine *engine = builder->buildEngineWithConfig(*network, *config); std::cout << "build out" << std::endl; // Don't need the network any more network->destroy(); // Release host memory for (auto &mem : weightMap) { free((void *)(mem.second.values)); } return engine; } void APIToModel(std::string netName, unsigned int maxBatchSize, IHostMemory **modelStream) { // Create builder IBuilder *builder = createInferBuilder(gLogger); IBuilderConfig *config = builder->createBuilderConfig(); // Create model to populate the network, then set the outputs and create an engine ICudaEngine *engine = createEngine(netName, maxBatchSize, builder, config, DataType::kFLOAT); assert(engine != nullptr); // Serialize the engine (*modelStream) = engine->serialize(); // Close everything down engine->destroy(); builder->destroy(); config->destroy(); } void doInference(IExecutionContext &context, float *input, float *output, int batchSize) { const ICudaEngine &engine = context.getEngine(); // Pointers to input and output device buffers to pass to engine. // Engine requires exactly IEngine::getNbBindings() number of buffers. 
assert(engine.getNbBindings() == 2); void *buffers[2]; // In order to bind the buffers, we need to know the names of the input and output tensors. // Note that indices are guaranteed to be less than IEngine::getNbBindings() const int inputIndex = engine.getBindingIndex(INPUT_BLOB_NAME); const int outputIndex = engine.getBindingIndex(OUTPUT_BLOB_NAME); // Create GPU buffers on device CHECK(cudaMalloc(&buffers[inputIndex], batchSize * 3 * INPUT_H * INPUT_W * sizeof(float))); CHECK(cudaMalloc(&buffers[outputIndex], batchSize * OUTPUT_SIZE * sizeof(float))); // Create stream cudaStream_t stream; CHECK(cudaStreamCreate(&stream)); // DMA input batch data to device, infer on the batch asynchronously, and DMA output back to host CHECK(cudaMemcpyAsync(buffers[inputIndex], input, batchSize * 3 * INPUT_H * INPUT_W * sizeof(float), cudaMemcpyHostToDevice, stream)); context.enqueue(batchSize, buffers, stream, nullptr); CHECK(cudaMemcpyAsync(output, buffers[outputIndex], batchSize * OUTPUT_SIZE * sizeof(float), cudaMemcpyDeviceToHost, stream)); cudaStreamSynchronize(stream); // Release stream and buffers cudaStreamDestroy(stream); CHECK(cudaFree(buffers[inputIndex])); CHECK(cudaFree(buffers[outputIndex])); } int main(int argc, char **argv) { if (argc != 3) { std::cerr << "arguments not right!" 
<< std::endl; std::cerr << "./repvgg -s RepVGG-B1g2 // serialize model to plan file" << std::endl; std::cerr << "./repvgg -d RepVGG-B1g2 // deserialize plan file and run inference" << std::endl; return -1; } // create a model using the API directly and serialize it to a stream char *trtModelStream{nullptr}; size_t size{0}; if (std::string(argv[1]) == "-s") { std::string netName = std::string(argv[2]); IHostMemory *modelStream{nullptr}; APIToModel(netName, MAX_BATCH_SIZE, &modelStream); assert(modelStream != nullptr); std::ofstream p(netName + ".engine", std::ios::binary); if (!p) { std::cerr << "could not open plan output file" << std::endl; return -1; } p.write(reinterpret_cast(modelStream->data()), modelStream->size()); modelStream->destroy(); return 1; } else if (std::string(argv[1]) == "-d") { std::string netName = std::string(argv[2]); std::ifstream file(netName + ".engine", std::ios::binary); if (file.good()) { file.seekg(0, file.end); size = file.tellg(); file.seekg(0, file.beg); trtModelStream = new char[size]; assert(trtModelStream); file.read(trtModelStream, size); file.close(); } } else { return -1; } static float data[3 * INPUT_H * INPUT_W]; for (int i = 0; i < 3 * INPUT_H * INPUT_W; i++) data[i] = 1.0; IRuntime *runtime = createInferRuntime(gLogger); assert(runtime != nullptr); ICudaEngine *engine = runtime->deserializeCudaEngine(trtModelStream, size, nullptr); assert(engine != nullptr); IExecutionContext *context = engine->createExecutionContext(); assert(context != nullptr); delete[] trtModelStream; // Run inference static float prob[OUTPUT_SIZE]; for (int i = 0; i < 100; i++) { auto start = std::chrono::system_clock::now(); doInference(*context, data, prob, 1); auto end = std::chrono::system_clock::now(); std::cout << std::chrono::duration_cast(end - start).count() << "ms" << std::endl; } // Destroy the engine context->destroy(); engine->destroy(); runtime->destroy(); // Print histogram of the output distribution std::cout << "\nOutput:\n\n"; for 
(unsigned int i = 0; i < 10; i++) { std::cout << prob[i] << ", "; } std::cout << std::endl; for (unsigned int i = 0; i < 10; i++) { std::cout << prob[OUTPUT_SIZE - 10 + i] << ", "; } std::cout << std::endl; return 0; } ================================================ FILE: resnet/CMakeLists.txt ================================================ cmake_minimum_required(VERSION 2.6) project(resnet) add_definitions(-std=c++11) option(CUDA_USE_STATIC_CUDA_RUNTIME OFF) set(CMAKE_CXX_STANDARD 11) set(CMAKE_BUILD_TYPE Debug) include_directories(${PROJECT_SOURCE_DIR}/include) # include and link dirs of cuda and tensorrt, you need adapt them if yours are different # cuda include_directories(/usr/local/cuda/include) link_directories(/usr/local/cuda/lib64) # tensorrt include_directories(/usr/include/x86_64-linux-gnu/) link_directories(/usr/lib/x86_64-linux-gnu/) add_executable(resnet18 ${PROJECT_SOURCE_DIR}/resnet18.cpp) target_link_libraries(resnet18 nvinfer) target_link_libraries(resnet18 cudart) add_executable(resnet34 ${PROJECT_SOURCE_DIR}/resnet34.cpp) target_link_libraries(resnet34 nvinfer) target_link_libraries(resnet34 cudart) add_executable(resnet50 ${PROJECT_SOURCE_DIR}/resnet50.cpp) target_link_libraries(resnet50 nvinfer) target_link_libraries(resnet50 cudart) add_executable(resnext50 ${PROJECT_SOURCE_DIR}/resnext50_32x4d.cpp) target_link_libraries(resnext50 nvinfer) target_link_libraries(resnext50 cudart) add_executable(wideresnet50 ${PROJECT_SOURCE_DIR}/wideresnet50.cpp) target_link_libraries(wideresnet50 nvinfer) target_link_libraries(wideresnet50 cudart) add_definitions(-O2 -pthread) ================================================ FILE: resnet/README.md ================================================ # resnet ResNet-18 and ResNet-50 model from "Deep Residual Learning for Image Recognition" For the Pytorch implementation, you can refer to [pytorchx/resnet](https://github.com/wang-xinyu/pytorchx/tree/master/resnet) Wide Resnet-50 model from "Wide Residual 
Networks". For the Pytorch implementation, you can refer to [BlueMirrors/torchtrtz](https://github.com/BlueMirrors/torchtrtz) The following techniques are used in this resnet; nothing special, just residual connections and batchnorm. - Batchnorm layer, implemented with scale layer. ## TensorRT C++ API ``` // 1a. generate resnet18.wts, resnet34.wts or resnet50.wts from [pytorchx/resnet](https://github.com/wang-xinyu/pytorchx/tree/master/resnet) // 1b. generate wide_resnet50.wts from [BlueMirrors/torchtrtz](https://github.com/BlueMirrors/torchtrtz) // 2. put resnet18.wts, resnet34.wts or resnet50.wts into tensorrtx/resnet // 3. build and run cd tensorrtx/resnet mkdir build cd build cmake .. make sudo ./resnet18 -s // serialize model to plan file i.e. 'resnet18.engine' sudo ./resnet18 -d // deserialize plan file and run inference or sudo ./resnet34 -s // serialize model to plan file i.e. 'resnet34.engine' sudo ./resnet34 -d // deserialize plan file and run inference or sudo ./resnet50 -s // serialize model to plan file i.e. 'resnet50.engine' sudo ./resnet50 -d // deserialize plan file and run inference or sudo ./resnext50 -s // serialize model to plan file i.e. 'resnext50.engine' sudo ./resnext50 -d // deserialize plan file and run inference or sudo ./wideresnet50 -s // serialize model to plan file i.e. 'wide_resnet50.engine' sudo ./wideresnet50 -d // deserialize plan file and run inference // 4. see if the output is the same as - [pytorchx/resnet](https://github.com/wang-xinyu/pytorchx/tree/master/resnet) - for resnet18, resnet34, resnet50, resnext50 - [BlueMirrors/torchtrtz](https://github.com/BlueMirrors/torchtrtz) - for wide_resnet50 ``` ### TensorRT Python API ``` # 1a. generate resnet50.wts from [pytorchx/resnet](https://github.com/wang-xinyu/pytorchx/tree/master/resnet) # 1b. generate wide_resnet50.wts from [BlueMirrors/torchtrtz](https://github.com/BlueMirrors/torchtrtz) # 2. put resnet50.wts or wide_resnet50.wts into tensorrtx/resnet # 3.
install Python dependencies (tensorrt/pycuda/numpy) cd tensorrtx/resnet python resnet50.py -s // serialize model to plan file i.e. 'resnet50.engine' python resnet50.py -d // deserialize plan file and run inference or python wide_resnet50.py -s // serialize model to plan file i.e. 'wide_resnet50.engine' python wide_resnet50.py -d // deserialize plan file and run inference # 4. see if the output is same as - pytorchx/resnet - for resnet50 - BlueMirrors/torchtrtz - for wide_resnet50 ``` ================================================ FILE: resnet/logging.h ================================================ /* * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
//!
//! \class LogStreamConsumerBuffer
//! \brief String buffer that forwards its contents to an output stream whenever it is synced
//!        or destroyed, prefixed with a fixed string. A timestamp is printed to std::cout
//!        before each forwarded chunk. Nothing is emitted while logging is disabled.
//!
class LogStreamConsumerBuffer : public std::stringbuf {
   public:
    //! \param stream    Stream that receives the prefixed buffer contents.
    //! \param prefix    Text written in front of every flushed chunk.
    //! \param shouldLog When false, putOutput() does nothing.
    LogStreamConsumerBuffer(std::ostream& stream, const std::string& prefix, bool shouldLog)
        : mOutput(stream), mPrefix(prefix), mShouldLog(shouldLog) {}

    //! Move constructor. Carries over the target stream, the prefix and the logging flag.
    //! Previously only the stream reference was initialized, which left mShouldLog
    //! indeterminate (read later by putOutput()) and silently dropped the prefix.
    //! Text already buffered in `other` is not transferred.
    LogStreamConsumerBuffer(LogStreamConsumerBuffer&& other)
        : mOutput(other.mOutput), mPrefix(other.mPrefix), mShouldLog(other.mShouldLog) {}

    ~LogStreamConsumerBuffer() {
        // std::streambuf::pbase() gives a pointer to the beginning of the buffered part of the output sequence
        // std::streambuf::pptr() gives a pointer to the current position of the output sequence
        // if the pointer to the beginning is not equal to the pointer to the current position,
        // call putOutput() to log the output to the stream
        if (pbase() != pptr()) {
            putOutput();
        }
    }

    // synchronizes the stream buffer and returns 0 on success
    // synchronizing the stream buffer consists of inserting the buffer contents into the stream,
    // resetting the buffer and flushing the stream
    virtual int sync() {
        putOutput();
        return 0;
    }

    //! Emit the timestamp, then the prefix and the buffered text, and reset the buffer.
    void putOutput() {
        if (mShouldLog) {
            // prepend timestamp
            std::time_t timestamp = std::time(nullptr);
            std::tm* tm_local = std::localtime(&timestamp);
            std::cout << "[";
            std::cout << std::setw(2) << std::setfill('0') << 1 + tm_local->tm_mon << "/";
            std::cout << std::setw(2) << std::setfill('0') << tm_local->tm_mday << "/";
            std::cout << std::setw(4) << std::setfill('0') << 1900 + tm_local->tm_year << "-";
            std::cout << std::setw(2) << std::setfill('0') << tm_local->tm_hour << ":";
            std::cout << std::setw(2) << std::setfill('0') << tm_local->tm_min << ":";
            std::cout << std::setw(2) << std::setfill('0') << tm_local->tm_sec << "] ";
            // std::stringbuf::str() gets the string contents of the buffer
            // insert the buffer contents pre-appended by the appropriate prefix into the stream
            mOutput << mPrefix << str();
            // set the buffer to empty
            str("");
            // flush the stream
            mOutput.flush();
        }
    }

    //! Enable or disable forwarding of buffered text.
    void setShouldLog(bool shouldLog) { mShouldLog = shouldLog; }

   private:
    std::ostream& mOutput;  // destination of the prefixed text
    std::string mPrefix;    // written before every flushed chunk
    bool mShouldLog;        // gate checked by putOutput()
};
LogStreamConsumer(Severity reportableSeverity, Severity severity) : LogStreamConsumerBase(severityOstream(severity), severityPrefix(severity), severity <= reportableSeverity) , std::ostream(&mBuffer) // links the stream buffer with the stream , mShouldLog(severity <= reportableSeverity) , mSeverity(severity) { } LogStreamConsumer(LogStreamConsumer&& other) : LogStreamConsumerBase(severityOstream(other.mSeverity), severityPrefix(other.mSeverity), other.mShouldLog) , std::ostream(&mBuffer) // links the stream buffer with the stream , mShouldLog(other.mShouldLog) , mSeverity(other.mSeverity) { } void setReportableSeverity(Severity reportableSeverity) { mShouldLog = mSeverity <= reportableSeverity; mBuffer.setShouldLog(mShouldLog); } private: static std::ostream& severityOstream(Severity severity) { return severity >= Severity::kINFO ? std::cout : std::cerr; } static std::string severityPrefix(Severity severity) { switch (severity) { case Severity::kINTERNAL_ERROR: return "[F] "; case Severity::kERROR: return "[E] "; case Severity::kWARNING: return "[W] "; case Severity::kINFO: return "[I] "; case Severity::kVERBOSE: return "[V] "; default: assert(0); return ""; } } bool mShouldLog; Severity mSeverity; }; //! \class Logger //! //! \brief Class which manages logging of TensorRT tools and samples //! //! \details This class provides a common interface for TensorRT tools and samples to log information to the console, //! and supports logging two types of messages: //! //! - Debugging messages with an associated severity (info, warning, error, or internal error/fatal) //! - Test pass/fail messages //! //! The advantage of having all samples use this class for logging as opposed to emitting directly to stdout/stderr is //! that the logic for controlling the verbosity and formatting of sample output is centralized in one location. //! //! In the future, this class could be extended to support dumping test results to a file in some standard format //! 
(for example, JUnit XML), and providing additional metadata (e.g. timing the duration of a test run). //! //! TODO: For backwards compatibility with existing samples, this class inherits directly from the nvinfer1::ILogger //! interface, which is problematic since there isn't a clean separation between messages coming from the TensorRT //! library and messages coming from the sample. //! //! In the future (once all samples are updated to use Logger::getTRTLogger() to access the ILogger) we can refactor the //! class to eliminate the inheritance and instead make the nvinfer1::ILogger implementation a member of the Logger //! object. class Logger : public nvinfer1::ILogger { public: Logger(Severity severity = Severity::kWARNING) : mReportableSeverity(severity) { } //! //! \enum TestResult //! \brief Represents the state of a given test //! enum class TestResult { kRUNNING, //!< The test is running kPASSED, //!< The test passed kFAILED, //!< The test failed kWAIVED //!< The test was waived }; //! //! \brief Forward-compatible method for retrieving the nvinfer::ILogger associated with this Logger //! \return The nvinfer1::ILogger associated with this Logger //! //! TODO Once all samples are updated to use this method to register the logger with TensorRT, //! we can eliminate the inheritance of Logger from ILogger //! nvinfer1::ILogger& getTRTLogger() { return *this; } //! //! \brief Implementation of the nvinfer1::ILogger::log() virtual method //! //! Note samples should not be calling this function directly; it will eventually go away once we eliminate the //! inheritance from nvinfer1::ILogger //! void log(Severity severity, const char* msg) override { LogStreamConsumer(mReportableSeverity, severity) << "[TRT] " << std::string(msg) << std::endl; } //! //! \brief Method for controlling the verbosity of logging output //! //! \param severity The logger will only emit messages that have severity of this level or higher. //! 
void setReportableSeverity(Severity severity) { mReportableSeverity = severity; } //! //! \brief Opaque handle that holds logging information for a particular test //! //! This object is an opaque handle to information used by the Logger to print test results. //! The sample must call Logger::defineTest() in order to obtain a TestAtom that can be used //! with Logger::reportTest{Start,End}(). //! class TestAtom { public: TestAtom(TestAtom&&) = default; private: friend class Logger; TestAtom(bool started, const std::string& name, const std::string& cmdline) : mStarted(started) , mName(name) , mCmdline(cmdline) { } bool mStarted; std::string mName; std::string mCmdline; }; //! //! \brief Define a test for logging //! //! \param[in] name The name of the test. This should be a string starting with //! "TensorRT" and containing dot-separated strings containing //! the characters [A-Za-z0-9_]. //! For example, "TensorRT.sample_googlenet" //! \param[in] cmdline The command line used to reproduce the test // //! \return a TestAtom that can be used in Logger::reportTest{Start,End}(). //! static TestAtom defineTest(const std::string& name, const std::string& cmdline) { return TestAtom(false, name, cmdline); } //! //! \brief A convenience overloaded version of defineTest() that accepts an array of command-line arguments //! as input //! //! \param[in] name The name of the test //! \param[in] argc The number of command-line arguments //! \param[in] argv The array of command-line arguments (given as C strings) //! //! \return a TestAtom that can be used in Logger::reportTest{Start,End}(). static TestAtom defineTest(const std::string& name, int argc, char const* const* argv) { auto cmdline = genCmdlineString(argc, argv); return defineTest(name, cmdline); } //! //! \brief Report that a test has started. //! //! \pre reportTestStart() has not been called yet for the given testAtom //! //! \param[in] testAtom The handle to the test that has started //! 
static void reportTestStart(TestAtom& testAtom) { reportTestResult(testAtom, TestResult::kRUNNING); assert(!testAtom.mStarted); testAtom.mStarted = true; } //! //! \brief Report that a test has ended. //! //! \pre reportTestStart() has been called for the given testAtom //! //! \param[in] testAtom The handle to the test that has ended //! \param[in] result The result of the test. Should be one of TestResult::kPASSED, //! TestResult::kFAILED, TestResult::kWAIVED //! static void reportTestEnd(const TestAtom& testAtom, TestResult result) { assert(result != TestResult::kRUNNING); assert(testAtom.mStarted); reportTestResult(testAtom, result); } static int reportPass(const TestAtom& testAtom) { reportTestEnd(testAtom, TestResult::kPASSED); return EXIT_SUCCESS; } static int reportFail(const TestAtom& testAtom) { reportTestEnd(testAtom, TestResult::kFAILED); return EXIT_FAILURE; } static int reportWaive(const TestAtom& testAtom) { reportTestEnd(testAtom, TestResult::kWAIVED); return EXIT_SUCCESS; } static int reportTest(const TestAtom& testAtom, bool pass) { return pass ? reportPass(testAtom) : reportFail(testAtom); } Severity getReportableSeverity() const { return mReportableSeverity; } private: //! //! \brief returns an appropriate string for prefixing a log message with the given severity //! static const char* severityPrefix(Severity severity) { switch (severity) { case Severity::kINTERNAL_ERROR: return "[F] "; case Severity::kERROR: return "[E] "; case Severity::kWARNING: return "[W] "; case Severity::kINFO: return "[I] "; case Severity::kVERBOSE: return "[V] "; default: assert(0); return ""; } } //! //! \brief returns an appropriate string for prefixing a test result message with the given result //! 
static const char* testResultString(TestResult result) { switch (result) { case TestResult::kRUNNING: return "RUNNING"; case TestResult::kPASSED: return "PASSED"; case TestResult::kFAILED: return "FAILED"; case TestResult::kWAIVED: return "WAIVED"; default: assert(0); return ""; } } //! //! \brief returns an appropriate output stream (cout or cerr) to use with the given severity //! static std::ostream& severityOstream(Severity severity) { return severity >= Severity::kINFO ? std::cout : std::cerr; } //! //! \brief method that implements logging test results //! static void reportTestResult(const TestAtom& testAtom, TestResult result) { severityOstream(Severity::kINFO) << "&&&& " << testResultString(result) << " " << testAtom.mName << " # " << testAtom.mCmdline << std::endl; } //! //! \brief generate a command line string from the given (argc, argv) values //! static std::string genCmdlineString(int argc, char const* const* argv) { std::stringstream ss; for (int i = 0; i < argc; i++) { if (i > 0) ss << " "; ss << argv[i]; } return ss.str(); } Severity mReportableSeverity; }; namespace { //! //! \brief produces a LogStreamConsumer object that can be used to log messages of severity kVERBOSE //! //! Example usage: //! //! LOG_VERBOSE(logger) << "hello world" << std::endl; //! inline LogStreamConsumer LOG_VERBOSE(const Logger& logger) { return LogStreamConsumer(logger.getReportableSeverity(), Severity::kVERBOSE); } //! //! \brief produces a LogStreamConsumer object that can be used to log messages of severity kINFO //! //! Example usage: //! //! LOG_INFO(logger) << "hello world" << std::endl; //! inline LogStreamConsumer LOG_INFO(const Logger& logger) { return LogStreamConsumer(logger.getReportableSeverity(), Severity::kINFO); } //! //! \brief produces a LogStreamConsumer object that can be used to log messages of severity kWARNING //! //! Example usage: //! //! LOG_WARN(logger) << "hello world" << std::endl; //! 
inline LogStreamConsumer LOG_WARN(const Logger& logger) { return LogStreamConsumer(logger.getReportableSeverity(), Severity::kWARNING); } //! //! \brief produces a LogStreamConsumer object that can be used to log messages of severity kERROR //! //! Example usage: //! //! LOG_ERROR(logger) << "hello world" << std::endl; //! inline LogStreamConsumer LOG_ERROR(const Logger& logger) { return LogStreamConsumer(logger.getReportableSeverity(), Severity::kERROR); } //! //! \brief produces a LogStreamConsumer object that can be used to log messages of severity kINTERNAL_ERROR // ("fatal" severity) //! //! Example usage: //! //! LOG_FATAL(logger) << "hello world" << std::endl; //! inline LogStreamConsumer LOG_FATAL(const Logger& logger) { return LogStreamConsumer(logger.getReportableSeverity(), Severity::kINTERNAL_ERROR); } } // anonymous namespace #endif // TENSORRT_LOGGING_H ================================================ FILE: resnet/resnet18.cpp ================================================ #include "NvInfer.h" #include "cuda_runtime_api.h" #include "logging.h" #include #include #include #include #include #include #include #define CHECK(status) \ do\ {\ auto ret = (status);\ if (ret != 0)\ {\ std::cerr << "Cuda failure: " << ret << std::endl;\ abort();\ }\ } while (0) // stuff we know about the network and the input/output blobs static const int INPUT_H = 224; static const int INPUT_W = 224; static const int OUTPUT_SIZE = 1000; const char* INPUT_BLOB_NAME = "data"; const char* OUTPUT_BLOB_NAME = "prob"; using namespace nvinfer1; static Logger gLogger; // Load weights from files shared with TensorRT samples. 
// TensorRT weight files have a simple space delimited format: // [type] [size] std::map loadWeights(const std::string file) { std::cout << "Loading weights: " << file << std::endl; std::map weightMap; // Open weights file std::ifstream input(file); assert(input.is_open() && "Unable to load weight file."); // Read number of weight blobs int32_t count; input >> count; assert(count > 0 && "Invalid weight map file."); while (count--) { Weights wt{DataType::kFLOAT, nullptr, 0}; uint32_t size; // Read name and type of blob std::string name; input >> name >> std::dec >> size; wt.type = DataType::kFLOAT; // Load blob uint32_t* val = reinterpret_cast(malloc(sizeof(val) * size)); for (uint32_t x = 0, y = size; x < y; ++x) { input >> std::hex >> val[x]; } wt.values = val; wt.count = size; weightMap[name] = wt; } return weightMap; } IScaleLayer* addBatchNorm2d(INetworkDefinition *network, std::map& weightMap, ITensor& input, std::string lname, float eps) { float *gamma = (float*)weightMap[lname + ".weight"].values; float *beta = (float*)weightMap[lname + ".bias"].values; float *mean = (float*)weightMap[lname + ".running_mean"].values; float *var = (float*)weightMap[lname + ".running_var"].values; int len = weightMap[lname + ".running_var"].count; std::cout << "len " << len << std::endl; float *scval = reinterpret_cast(malloc(sizeof(float) * len)); for (int i = 0; i < len; i++) { scval[i] = gamma[i] / sqrt(var[i] + eps); } Weights scale{DataType::kFLOAT, scval, len}; float *shval = reinterpret_cast(malloc(sizeof(float) * len)); for (int i = 0; i < len; i++) { shval[i] = beta[i] - mean[i] * gamma[i] / sqrt(var[i] + eps); } Weights shift{DataType::kFLOAT, shval, len}; float *pval = reinterpret_cast(malloc(sizeof(float) * len)); for (int i = 0; i < len; i++) { pval[i] = 1.0; } Weights power{DataType::kFLOAT, pval, len}; weightMap[lname + ".scale"] = scale; weightMap[lname + ".shift"] = shift; weightMap[lname + ".power"] = power; IScaleLayer* scale_1 = network->addScale(input, 
ScaleMode::kCHANNEL, shift, scale, power); assert(scale_1); return scale_1; } IActivationLayer* basicBlock(INetworkDefinition *network, std::map& weightMap, ITensor& input, int inch, int outch, int stride, std::string lname) { Weights emptywts{DataType::kFLOAT, nullptr, 0}; IConvolutionLayer* conv1 = network->addConvolutionNd(input, outch, DimsHW{3, 3}, weightMap[lname + "conv1.weight"], emptywts); assert(conv1); conv1->setStrideNd(DimsHW{stride, stride}); conv1->setPaddingNd(DimsHW{1, 1}); IScaleLayer* bn1 = addBatchNorm2d(network, weightMap, *conv1->getOutput(0), lname + "bn1", 1e-5); IActivationLayer* relu1 = network->addActivation(*bn1->getOutput(0), ActivationType::kRELU); assert(relu1); IConvolutionLayer* conv2 = network->addConvolutionNd(*relu1->getOutput(0), outch, DimsHW{3, 3}, weightMap[lname + "conv2.weight"], emptywts); assert(conv2); conv2->setPaddingNd(DimsHW{1, 1}); IScaleLayer* bn2 = addBatchNorm2d(network, weightMap, *conv2->getOutput(0), lname + "bn2", 1e-5); IElementWiseLayer* ew1; if (inch != outch) { IConvolutionLayer* conv3 = network->addConvolutionNd(input, outch, DimsHW{1, 1}, weightMap[lname + "downsample.0.weight"], emptywts); assert(conv3); conv3->setStrideNd(DimsHW{stride, stride}); IScaleLayer* bn3 = addBatchNorm2d(network, weightMap, *conv3->getOutput(0), lname + "downsample.1", 1e-5); ew1 = network->addElementWise(*bn3->getOutput(0), *bn2->getOutput(0), ElementWiseOperation::kSUM); } else { ew1 = network->addElementWise(input, *bn2->getOutput(0), ElementWiseOperation::kSUM); } IActivationLayer* relu2 = network->addActivation(*ew1->getOutput(0), ActivationType::kRELU); assert(relu2); return relu2; } // Creat the engine using only the API and not any parser. 
ICudaEngine* createEngine(unsigned int maxBatchSize, IBuilder* builder, IBuilderConfig* config, DataType dt) { INetworkDefinition* network = builder->createNetworkV2(0U); // Create input tensor of shape { 3, INPUT_H, INPUT_W } with name INPUT_BLOB_NAME ITensor* data = network->addInput(INPUT_BLOB_NAME, dt, Dims3{3, INPUT_H, INPUT_W}); assert(data); std::map weightMap = loadWeights("../resnet18.wts"); Weights emptywts{DataType::kFLOAT, nullptr, 0}; IConvolutionLayer* conv1 = network->addConvolutionNd(*data, 64, DimsHW{7, 7}, weightMap["conv1.weight"], emptywts); assert(conv1); conv1->setStrideNd(DimsHW{2, 2}); conv1->setPaddingNd(DimsHW{3, 3}); IScaleLayer* bn1 = addBatchNorm2d(network, weightMap, *conv1->getOutput(0), "bn1", 1e-5); IActivationLayer* relu1 = network->addActivation(*bn1->getOutput(0), ActivationType::kRELU); assert(relu1); IPoolingLayer* pool1 = network->addPoolingNd(*relu1->getOutput(0), PoolingType::kMAX, DimsHW{3, 3}); assert(pool1); pool1->setStrideNd(DimsHW{2, 2}); pool1->setPaddingNd(DimsHW{1, 1}); IActivationLayer* relu2 = basicBlock(network, weightMap, *pool1->getOutput(0), 64, 64, 1, "layer1.0."); IActivationLayer* relu3 = basicBlock(network, weightMap, *relu2->getOutput(0), 64, 64, 1, "layer1.1."); IActivationLayer* relu4 = basicBlock(network, weightMap, *relu3->getOutput(0), 64, 128, 2, "layer2.0."); IActivationLayer* relu5 = basicBlock(network, weightMap, *relu4->getOutput(0), 128, 128, 1, "layer2.1."); IActivationLayer* relu6 = basicBlock(network, weightMap, *relu5->getOutput(0), 128, 256, 2, "layer3.0."); IActivationLayer* relu7 = basicBlock(network, weightMap, *relu6->getOutput(0), 256, 256, 1, "layer3.1."); IActivationLayer* relu8 = basicBlock(network, weightMap, *relu7->getOutput(0), 256, 512, 2, "layer4.0."); IActivationLayer* relu9 = basicBlock(network, weightMap, *relu8->getOutput(0), 512, 512, 1, "layer4.1."); IPoolingLayer* pool2 = network->addPoolingNd(*relu9->getOutput(0), PoolingType::kAVERAGE, DimsHW{7, 7}); assert(pool2); 
pool2->setStrideNd(DimsHW{1, 1}); IFullyConnectedLayer* fc1 = network->addFullyConnected(*pool2->getOutput(0), 1000, weightMap["fc.weight"], weightMap["fc.bias"]); assert(fc1); fc1->getOutput(0)->setName(OUTPUT_BLOB_NAME); std::cout << "set name out" << std::endl; network->markOutput(*fc1->getOutput(0)); // Build engine builder->setMaxBatchSize(maxBatchSize); config->setMaxWorkspaceSize(1 << 20); ICudaEngine* engine = builder->buildEngineWithConfig(*network, *config); std::cout << "build out" << std::endl; // Don't need the network any more network->destroy(); // Release host memory for (auto& mem : weightMap) { free((void*) (mem.second.values)); } return engine; } void APIToModel(unsigned int maxBatchSize, IHostMemory** modelStream) { // Create builder IBuilder* builder = createInferBuilder(gLogger); IBuilderConfig* config = builder->createBuilderConfig(); // Create model to populate the network, then set the outputs and create an engine ICudaEngine* engine = createEngine(maxBatchSize, builder, config, DataType::kFLOAT); assert(engine != nullptr); // Serialize the engine (*modelStream) = engine->serialize(); // Close everything down engine->destroy(); builder->destroy(); config->destroy(); } void doInference(IExecutionContext& context, float* input, float* output, int batchSize) { const ICudaEngine& engine = context.getEngine(); // Pointers to input and output device buffers to pass to engine. // Engine requires exactly IEngine::getNbBindings() number of buffers. assert(engine.getNbBindings() == 2); void* buffers[2]; // In order to bind the buffers, we need to know the names of the input and output tensors. 
// Note that indices are guaranteed to be less than IEngine::getNbBindings() const int inputIndex = engine.getBindingIndex(INPUT_BLOB_NAME); const int outputIndex = engine.getBindingIndex(OUTPUT_BLOB_NAME); // Create GPU buffers on device CHECK(cudaMalloc(&buffers[inputIndex], batchSize * 3 * INPUT_H * INPUT_W * sizeof(float))); CHECK(cudaMalloc(&buffers[outputIndex], batchSize * OUTPUT_SIZE * sizeof(float))); // Create stream cudaStream_t stream; CHECK(cudaStreamCreate(&stream)); // DMA input batch data to device, infer on the batch asynchronously, and DMA output back to host CHECK(cudaMemcpyAsync(buffers[inputIndex], input, batchSize * 3 * INPUT_H * INPUT_W * sizeof(float), cudaMemcpyHostToDevice, stream)); context.enqueue(batchSize, buffers, stream, nullptr); CHECK(cudaMemcpyAsync(output, buffers[outputIndex], batchSize * OUTPUT_SIZE * sizeof(float), cudaMemcpyDeviceToHost, stream)); cudaStreamSynchronize(stream); // Release stream and buffers cudaStreamDestroy(stream); CHECK(cudaFree(buffers[inputIndex])); CHECK(cudaFree(buffers[outputIndex])); } int main(int argc, char** argv) { if (argc != 2) { std::cerr << "arguments not right!" 
<< std::endl; std::cerr << "./resnet18 -s // serialize model to plan file" << std::endl; std::cerr << "./resnet18 -d // deserialize plan file and run inference" << std::endl; return -1; } // create a model using the API directly and serialize it to a stream char *trtModelStream{nullptr}; size_t size{0}; if (std::string(argv[1]) == "-s") { IHostMemory* modelStream{nullptr}; APIToModel(1, &modelStream); assert(modelStream != nullptr); std::ofstream p("resnet18.engine", std::ios::binary); if (!p) { std::cerr << "could not open plan output file" << std::endl; return -1; } p.write(reinterpret_cast(modelStream->data()), modelStream->size()); modelStream->destroy(); return 1; } else if (std::string(argv[1]) == "-d") { std::ifstream file("resnet18.engine", std::ios::binary); if (file.good()) { file.seekg(0, file.end); size = file.tellg(); file.seekg(0, file.beg); trtModelStream = new char[size]; assert(trtModelStream); file.read(trtModelStream, size); file.close(); } } else { return -1; } // Subtract mean from image static float data[3 * INPUT_H * INPUT_W]; for (int i = 0; i < 3 * INPUT_H * INPUT_W; i++) data[i] = 1.0; IRuntime* runtime = createInferRuntime(gLogger); assert(runtime != nullptr); ICudaEngine* engine = runtime->deserializeCudaEngine(trtModelStream, size, nullptr); assert(engine != nullptr); IExecutionContext* context = engine->createExecutionContext(); assert(context != nullptr); delete[] trtModelStream; // Run inference static float prob[OUTPUT_SIZE]; for (int i = 0; i < 100; i++) { auto start = std::chrono::system_clock::now(); doInference(*context, data, prob, 1); auto end = std::chrono::system_clock::now(); std::cout << std::chrono::duration_cast(end - start).count() << "ms" << std::endl; } // Destroy the engine context->destroy(); engine->destroy(); runtime->destroy(); // Print histogram of the output distribution std::cout << "\nOutput:\n\n"; for (unsigned int i = 0; i < 10; i++) { std::cout << prob[i] << ", "; } std::cout << std::endl; for (unsigned 
int i = 0; i < 10; i++) { std::cout << prob[OUTPUT_SIZE - 10 + i] << ", "; } std::cout << std::endl; return 0; } ================================================ FILE: resnet/resnet34.cpp ================================================ #include "NvInfer.h" #include "cuda_runtime_api.h" #include "logging.h" #include #include #include #include #include #include #include #define CHECK(status) \ do\ {\ auto ret = (status);\ if(ret != 0)\ {\ std::cerr << "Cuda failure: " << ret << std::endl;\ abort();\ }\ } while(0) // stuff we know about the network and the input/output blobs static const int INPUT_H = 224; static const int INPUT_W = 224; static const int OUTPUT_SIZE = 1000; const char* INPUT_BLOB_NAME = "data"; const char* OUTPUT_BLOB_NAME = "prob"; using namespace nvinfer1; static Logger gLogger; // Load weights from files shared with TensorRT samples. // TensorRT weigths files have a simple space delimited format: // [tpyt] [size] std::map loadWeights(const std::string file) { std::cout << "Loading weights: " << file << std::endl; std::mapweightMap; // Open weights file std::ifstream input(file); assert(input.is_open() && "Unable to load weight file."); // Read number of weight blobs int32_t count; input >> count; assert(count > 0 && "Invalis weight map file"); while (count--) { Weights wt{ DataType::kFLOAT, nullptr,0 }; uint32_t size; // Read name and type of blob std::string name; input >> name >> std::dec >> size; wt.type = DataType::kFLOAT; // Load blob uint32_t* val = reinterpret_cast(malloc(sizeof(val)* size)); for (uint32_t x = 0, y = size; x < y; ++x) { input >> std::hex >> val[x]; } wt.values = val; wt.count = size; weightMap[name] = wt; } return weightMap; } IScaleLayer* addBatchNorm2d(INetworkDefinition *network, std::map& weightMap, ITensor& input, std::string lname, float eps) { float *gamma = (float*)weightMap[lname + ".weight"].values; float *bata = (float*)weightMap[lname + ".bias"].values; float *mean = (float*)weightMap[lname + 
".running_mean"].values; float *var = (float*)weightMap[lname + ".running_var"].values; int len = weightMap[lname + ".running_var"].count; std::cout << "len " << len << std::endl; float *scval = reinterpret_cast(malloc(sizeof(float) * len)); for(int i=0; i < len; i++){ scval[i] = gamma[i] / sqrt(var[i] + eps); } Weights scale{ DataType::kFLOAT,scval,len }; float *shval = reinterpret_cast(malloc(sizeof(float) * len)); for (int i = 0; i < len; i++) { shval[i] = bata[i] - mean[i] * gamma[i] / sqrt(var[i] + eps); } Weights shift{ DataType::kFLOAT, shval, len }; float *pval = reinterpret_cast(malloc(sizeof(float) * len)); for (int i = 0; i < len; i++) { pval[i] = 1.0; } Weights power{ DataType::kFLOAT, pval, len }; weightMap[lname + ".scale"] = scale; weightMap[lname + ".shift"] = shift; weightMap[lname + ".power"] = power; IScaleLayer* scale_1 = network->addScale(input, ScaleMode::kCHANNEL, shift, scale, power); assert(scale_1); return scale_1; } IActivationLayer* basicBlock(INetworkDefinition* network, std::map& weightMap, ITensor& input, int inch, int outch, int stride, std::string lname) { Weights emptywts{ DataType::kFLOAT, nullptr, 0 }; IConvolutionLayer* conv1 = network->addConvolutionNd(input, outch, DimsHW{ 3,3 }, weightMap[lname + "conv1.weight"], emptywts); assert(conv1); conv1->setStrideNd(DimsHW{ stride,stride }); conv1->setPaddingNd(DimsHW{ 1,1 }); IScaleLayer* bn1 = addBatchNorm2d(network, weightMap, *conv1->getOutput(0), lname + "bn1", 1e-5); IActivationLayer* relu1 = network->addActivation(*bn1->getOutput(0), ActivationType::kRELU); assert(relu1); IConvolutionLayer* conv2 = network->addConvolutionNd(*relu1->getOutput(0), outch, DimsHW{ 3,3 }, weightMap[lname + "conv2.weight"], emptywts); assert(conv2); conv2->setPaddingNd(DimsHW{ 1,1 }); IScaleLayer* bn2 = addBatchNorm2d(network, weightMap, *conv2->getOutput(0), lname + "bn2", 1e-5); IElementWiseLayer* ew1; if (inch != outch) { IConvolutionLayer* conv3 = network->addConvolutionNd(input, outch, DimsHW{ 
1,1 }, weightMap[lname + "downsample.0.weight"], emptywts); assert(conv3); conv3->setStrideNd(DimsHW{ stride, stride }); IScaleLayer* bn3 = addBatchNorm2d(network, weightMap, *conv3->getOutput(0), lname + "downsample.1", 1e-5); ew1 = network->addElementWise(*bn3->getOutput(0), *bn2->getOutput(0), ElementWiseOperation::kSUM); }else { ew1 = network->addElementWise(input, *bn2->getOutput(0), ElementWiseOperation::kSUM); } IActivationLayer* relu2 = network->addActivation(*ew1->getOutput(0), ActivationType::kRELU); assert(relu2); return relu2; } // Create the engine using only the API and not any parser. ICudaEngine* createEngine(unsigned int maxBatchSize, IBuilder* builder, IBuilderConfig* config, DataType dt) { INetworkDefinition* network = builder->createNetworkV2(0U); // Create input tensor of shpae { 3, INPUT_H INPPUT_W} with name INPUT_BLOB_NAME ITensor* data = network->addInput(INPUT_BLOB_NAME, dt, Dims3{ 3,INPUT_H,INPUT_W }); assert(data); std::map weightMap = loadWeights("../resnet34.wts"); Weights emptywts{ DataType::kFLOAT,nullptr,0 }; IConvolutionLayer* conv1 = network->addConvolutionNd(*data, 64, DimsHW{ 7,7 }, weightMap["conv1.weight"], emptywts); assert(conv1); conv1->setStrideNd(DimsHW{ 2,2 }); conv1->setPaddingNd(DimsHW{ 3,3 }); IScaleLayer* bn1 = addBatchNorm2d(network, weightMap, *conv1->getOutput(0), "bn1", 1e-5); IActivationLayer* relu1 = network->addActivation(*bn1->getOutput(0), ActivationType::kRELU); assert(relu1); IPoolingLayer* pool1 = network->addPoolingNd(*relu1->getOutput(0), PoolingType::kMAX, DimsHW{ 3,3 }); assert(pool1); pool1->setStrideNd(DimsHW{ 2,2 }); pool1->setPaddingNd(DimsHW{ 1,1 }); IActivationLayer* relu2 = basicBlock(network, weightMap, *pool1->getOutput(0), 64, 64, 1, "layer1.0."); IActivationLayer* relu3 = basicBlock(network, weightMap, *relu2->getOutput(0), 64, 64, 1, "layer1.1."); IActivationLayer* relu4 = basicBlock(network, weightMap, *relu3->getOutput(0), 64, 64, 1, "layer1.2."); IActivationLayer* relu5 = 
basicBlock(network, weightMap, *relu4->getOutput(0), 64, 128, 2, "layer2.0."); IActivationLayer* relu6 = basicBlock(network, weightMap, *relu5->getOutput(0), 128, 128, 1, "layer2.1."); IActivationLayer* relu7 = basicBlock(network, weightMap, *relu6->getOutput(0), 128, 128, 1, "layer2.2."); IActivationLayer* relu8 = basicBlock(network, weightMap, *relu7->getOutput(0), 128, 128, 1, "layer2.3."); IActivationLayer* relu9 = basicBlock(network, weightMap, *relu8->getOutput(0), 128, 256, 2, "layer3.0."); IActivationLayer* relu10 = basicBlock(network, weightMap, *relu9->getOutput(0), 256, 256, 1, "layer3.1."); IActivationLayer* relu11 = basicBlock(network, weightMap, *relu10->getOutput(0), 256, 256, 1, "layer3.2."); IActivationLayer* relu12 = basicBlock(network, weightMap, *relu11->getOutput(0), 256, 256, 1, "layer3.3."); IActivationLayer* relu13 = basicBlock(network, weightMap, *relu12->getOutput(0), 256, 256, 1, "layer3.4."); IActivationLayer* relu14 = basicBlock(network, weightMap, *relu13->getOutput(0), 256, 256, 1, "layer3.5."); IActivationLayer* relu15 = basicBlock(network, weightMap, *relu14->getOutput(0), 256, 512, 2, "layer4.0."); IActivationLayer* relu16 = basicBlock(network, weightMap, *relu15->getOutput(0), 512, 512, 1, "layer4.1."); IActivationLayer* relu17 = basicBlock(network, weightMap, *relu16->getOutput(0), 512, 512, 1, "layer4.2."); IPoolingLayer* pool2 = network->addPoolingNd(*relu17->getOutput(0), PoolingType::kAVERAGE, DimsHW{ 7,7 }); assert(pool2); pool2->setStrideNd(DimsHW{ 1,1 }); IFullyConnectedLayer* fc1 = network->addFullyConnected(*pool2->getOutput(0), 1000, weightMap["fc.weight"], weightMap["fc.bias"]); assert(fc1); fc1->getOutput(0)->setName(OUTPUT_BLOB_NAME); std::cout << "set name out" << std::endl; network->markOutput(*fc1->getOutput(0)); // Build engine builder->setMaxBatchSize(maxBatchSize); config->setMaxWorkspaceSize(1 << 20); ICudaEngine* engine = builder->buildEngineWithConfig(*network, *config); std::cout << "build out" << 
std::endl; // Don't need the network any more network->destroy(); // Release host memory for (auto& mem : weightMap) { free((void*)(mem.second.values)); } return engine; } void APIToModel(unsigned int maxBatchSize, IHostMemory** modelStream) { // Create builder IBuilder* builder = createInferBuilder(gLogger); IBuilderConfig* config = builder->createBuilderConfig(); // Create model to populate the network, then set outputs and create an engine ICudaEngine* engine = createEngine(maxBatchSize, builder, config, DataType::kFLOAT); assert(engine != nullptr); // Serialize the engine (*modelStream) = engine->serialize(); // Close everything down engine->destroy(); builder->destroy(); config->destroy(); } void doInference(IExecutionContext& context, float* input, float* output, int batchSize) { const ICudaEngine& engine = context.getEngine(); // Pointers to input and output device buffers to pass to engine. // Engine requires exactly IEngine::getNbBingdings() number of buffers. assert(engine.getNbBindings() == 2); void* buffers[2]; // In order to bind the buffers, we need to konow the names of the input and output tensors. 
// Note that indices are guaranteed to be less than IEngine::getNbBindings() const int inputIndex = engine.getBindingIndex(INPUT_BLOB_NAME); const int outputIndex = engine.getBindingIndex(OUTPUT_BLOB_NAME); // Create GPU buffers on device CHECK(cudaMalloc(&buffers[inputIndex], batchSize * 3 * INPUT_H* INPUT_W * sizeof(float))); CHECK(cudaMalloc(&buffers[outputIndex], batchSize * OUTPUT_SIZE * sizeof(float))); // Create stream cudaStream_t stream; CHECK(cudaStreamCreate(&stream)); // DMA input batch data to device, infer on the batch asynchronously, and DMA output back to host CHECK(cudaMemcpyAsync(buffers[inputIndex], input, batchSize * 3 * INPUT_H * INPUT_W * sizeof(float), cudaMemcpyHostToDevice, stream)); context.enqueue(batchSize, buffers, stream, nullptr); CHECK(cudaMemcpyAsync(output, buffers[outputIndex], batchSize * OUTPUT_SIZE * sizeof(float), cudaMemcpyDeviceToHost, stream)); cudaStreamSynchronize(stream); // Release stream and buffers cudaStreamDestroy(stream); CHECK(cudaFree(buffers[inputIndex])); CHECK(cudaFree(buffers[outputIndex])); } int main(int argc, char** argv) { if (argc != 2) { std::cerr << "arguments not right!" 
<< std::endl; std::cerr << "./resnet34 -s // serialize model to plan file" << std::endl; std::cerr << "./resnet34 -d // desrialize plan file and run inference" << std::endl; return -1; } // Create a model using the API directly and serialize it to a stream char *trtModelStream{ nullptr }; size_t size(0); if (std::string(argv[1]) == "-s") { IHostMemory* modelStream{ nullptr }; APIToModel(1, &modelStream); assert(modelStream != nullptr); std::ofstream p("resnet34.engine", std::ios::binary); if (!p) { std::cerr << "could not open plan output file" << std::endl; return -1; } p.write(reinterpret_cast(modelStream->data()), modelStream->size()); modelStream->destroy(); return 1; }else if (std::string(argv[1]) == "-d") { std::ifstream file("resnet34.engine", std::ios::binary); if (file.good()) { file.seekg(0, file.end); size = file.tellg(); file.seekg(0, file.beg); trtModelStream = new char[size]; assert(trtModelStream); file.read(trtModelStream, size); file.close(); } }else { return -1; } // Subtract mean from image static float data[3 * INPUT_H * INPUT_W]; for (int i = 0; i < 3 * INPUT_H * INPUT_W; i++) { data[i] = 1.0; } IRuntime* runtime = createInferRuntime(gLogger); assert(runtime != nullptr); ICudaEngine* engine = runtime->deserializeCudaEngine(trtModelStream, size, nullptr); assert(engine != nullptr); IExecutionContext* context = engine->createExecutionContext(); assert(context != nullptr); delete[] trtModelStream; // Run inference static float prob[OUTPUT_SIZE]; for (int i = 0; i < 100; i++) { auto start = std::chrono::system_clock::now(); doInference(*context, data, prob, 1); auto end = std::chrono::system_clock::now(); std::cout << std::chrono::duration_cast(end - start).count() << "ms" << std::endl; } // Destroy the engine context->destroy(); engine->destroy(); runtime->destroy(); // Print historgram of the output distribution std::cout << "\nOutput:\n\n"; for (unsigned int i = 0; i < 10; i++) { std::cout << prob[i] << ","; } std::cout << std::endl; for 
(unsigned int i = 0; i < 10; i++) { std::cout << prob[OUTPUT_SIZE - 10 + i] << ","; } std::cout << std::endl; return 0; } ================================================ FILE: resnet/resnet50.cpp ================================================ #include "NvInfer.h" #include "cuda_runtime_api.h" #include "logging.h" #include #include #include #include #include #include #include #define CHECK(status) \ do\ {\ auto ret = (status);\ if (ret != 0)\ {\ std::cerr << "Cuda failure: " << ret << std::endl;\ abort();\ }\ } while (0) // stuff we know about the network and the input/output blobs static const int INPUT_H = 224; static const int INPUT_W = 224; static const int OUTPUT_SIZE = 1000; const char* INPUT_BLOB_NAME = "data"; const char* OUTPUT_BLOB_NAME = "prob"; using namespace nvinfer1; static Logger gLogger; // Load weights from files shared with TensorRT samples. // TensorRT weight files have a simple space delimited format: // [type] [size] std::map loadWeights(const std::string file) { std::cout << "Loading weights: " << file << std::endl; std::map weightMap; // Open weights file std::ifstream input(file); assert(input.is_open() && "Unable to load weight file."); // Read number of weight blobs int32_t count; input >> count; assert(count > 0 && "Invalid weight map file."); while (count--) { Weights wt{DataType::kFLOAT, nullptr, 0}; uint32_t size; // Read name and type of blob std::string name; input >> name >> std::dec >> size; wt.type = DataType::kFLOAT; // Load blob uint32_t* val = reinterpret_cast(malloc(sizeof(val) * size)); for (uint32_t x = 0, y = size; x < y; ++x) { input >> std::hex >> val[x]; } wt.values = val; wt.count = size; weightMap[name] = wt; } return weightMap; } IScaleLayer* addBatchNorm2d(INetworkDefinition *network, std::map& weightMap, ITensor& input, std::string lname, float eps) { float *gamma = (float*)weightMap[lname + ".weight"].values; float *beta = (float*)weightMap[lname + ".bias"].values; float *mean = (float*)weightMap[lname + 
".running_mean"].values; float *var = (float*)weightMap[lname + ".running_var"].values; int len = weightMap[lname + ".running_var"].count; std::cout << "len " << len << std::endl; float *scval = reinterpret_cast(malloc(sizeof(float) * len)); for (int i = 0; i < len; i++) { scval[i] = gamma[i] / sqrt(var[i] + eps); } Weights scale{DataType::kFLOAT, scval, len}; float *shval = reinterpret_cast(malloc(sizeof(float) * len)); for (int i = 0; i < len; i++) { shval[i] = beta[i] - mean[i] * gamma[i] / sqrt(var[i] + eps); } Weights shift{DataType::kFLOAT, shval, len}; float *pval = reinterpret_cast(malloc(sizeof(float) * len)); for (int i = 0; i < len; i++) { pval[i] = 1.0; } Weights power{DataType::kFLOAT, pval, len}; weightMap[lname + ".scale"] = scale; weightMap[lname + ".shift"] = shift; weightMap[lname + ".power"] = power; IScaleLayer* scale_1 = network->addScale(input, ScaleMode::kCHANNEL, shift, scale, power); assert(scale_1); return scale_1; } IActivationLayer* bottleneck(INetworkDefinition *network, std::map& weightMap, ITensor& input, int inch, int outch, int stride, std::string lname) { Weights emptywts{DataType::kFLOAT, nullptr, 0}; IConvolutionLayer* conv1 = network->addConvolutionNd(input, outch, DimsHW{1, 1}, weightMap[lname + "conv1.weight"], emptywts); assert(conv1); IScaleLayer* bn1 = addBatchNorm2d(network, weightMap, *conv1->getOutput(0), lname + "bn1", 1e-5); IActivationLayer* relu1 = network->addActivation(*bn1->getOutput(0), ActivationType::kRELU); assert(relu1); IConvolutionLayer* conv2 = network->addConvolutionNd(*relu1->getOutput(0), outch, DimsHW{3, 3}, weightMap[lname + "conv2.weight"], emptywts); assert(conv2); conv2->setStrideNd(DimsHW{stride, stride}); conv2->setPaddingNd(DimsHW{1, 1}); IScaleLayer* bn2 = addBatchNorm2d(network, weightMap, *conv2->getOutput(0), lname + "bn2", 1e-5); IActivationLayer* relu2 = network->addActivation(*bn2->getOutput(0), ActivationType::kRELU); assert(relu2); IConvolutionLayer* conv3 = 
network->addConvolutionNd(*relu2->getOutput(0), outch * 4, DimsHW{1, 1}, weightMap[lname + "conv3.weight"], emptywts); assert(conv3); IScaleLayer* bn3 = addBatchNorm2d(network, weightMap, *conv3->getOutput(0), lname + "bn3", 1e-5); IElementWiseLayer* ew1; if (stride != 1 || inch != outch * 4) { IConvolutionLayer* conv4 = network->addConvolutionNd(input, outch * 4, DimsHW{1, 1}, weightMap[lname + "downsample.0.weight"], emptywts); assert(conv4); conv4->setStrideNd(DimsHW{stride, stride}); IScaleLayer* bn4 = addBatchNorm2d(network, weightMap, *conv4->getOutput(0), lname + "downsample.1", 1e-5); ew1 = network->addElementWise(*bn4->getOutput(0), *bn3->getOutput(0), ElementWiseOperation::kSUM); } else { ew1 = network->addElementWise(input, *bn3->getOutput(0), ElementWiseOperation::kSUM); } IActivationLayer* relu3 = network->addActivation(*ew1->getOutput(0), ActivationType::kRELU); assert(relu3); return relu3; } // Creat the engine using only the API and not any parser. ICudaEngine* createEngine(unsigned int maxBatchSize, IBuilder* builder, IBuilderConfig* config, DataType dt) { INetworkDefinition* network = builder->createNetworkV2(0U); // Create input tensor of shape { 3, INPUT_H, INPUT_W } with name INPUT_BLOB_NAME ITensor* data = network->addInput(INPUT_BLOB_NAME, dt, Dims3{3, INPUT_H, INPUT_W}); assert(data); std::map weightMap = loadWeights("../resnet50.wts"); Weights emptywts{DataType::kFLOAT, nullptr, 0}; IConvolutionLayer* conv1 = network->addConvolutionNd(*data, 64, DimsHW{7, 7}, weightMap["conv1.weight"], emptywts); assert(conv1); conv1->setStrideNd(DimsHW{2, 2}); conv1->setPaddingNd(DimsHW{3, 3}); IScaleLayer* bn1 = addBatchNorm2d(network, weightMap, *conv1->getOutput(0), "bn1", 1e-5); // Add activation layer using the ReLU algorithm. IActivationLayer* relu1 = network->addActivation(*bn1->getOutput(0), ActivationType::kRELU); assert(relu1); // Add max pooling layer with stride of 2x2 and kernel size of 2x2. 
IPoolingLayer* pool1 = network->addPoolingNd(*relu1->getOutput(0), PoolingType::kMAX, DimsHW{3, 3}); assert(pool1); pool1->setStrideNd(DimsHW{2, 2}); pool1->setPaddingNd(DimsHW{1, 1}); IActivationLayer* x = bottleneck(network, weightMap, *pool1->getOutput(0), 64, 64, 1, "layer1.0."); x = bottleneck(network, weightMap, *x->getOutput(0), 256, 64, 1, "layer1.1."); x = bottleneck(network, weightMap, *x->getOutput(0), 256, 64, 1, "layer1.2."); x = bottleneck(network, weightMap, *x->getOutput(0), 256, 128, 2, "layer2.0."); x = bottleneck(network, weightMap, *x->getOutput(0), 512, 128, 1, "layer2.1."); x = bottleneck(network, weightMap, *x->getOutput(0), 512, 128, 1, "layer2.2."); x = bottleneck(network, weightMap, *x->getOutput(0), 512, 128, 1, "layer2.3."); x = bottleneck(network, weightMap, *x->getOutput(0), 512, 256, 2, "layer3.0."); x = bottleneck(network, weightMap, *x->getOutput(0), 1024, 256, 1, "layer3.1."); x = bottleneck(network, weightMap, *x->getOutput(0), 1024, 256, 1, "layer3.2."); x = bottleneck(network, weightMap, *x->getOutput(0), 1024, 256, 1, "layer3.3."); x = bottleneck(network, weightMap, *x->getOutput(0), 1024, 256, 1, "layer3.4."); x = bottleneck(network, weightMap, *x->getOutput(0), 1024, 256, 1, "layer3.5."); x = bottleneck(network, weightMap, *x->getOutput(0), 1024, 512, 2, "layer4.0."); x = bottleneck(network, weightMap, *x->getOutput(0), 2048, 512, 1, "layer4.1."); x = bottleneck(network, weightMap, *x->getOutput(0), 2048, 512, 1, "layer4.2."); IPoolingLayer* pool2 = network->addPoolingNd(*x->getOutput(0), PoolingType::kAVERAGE, DimsHW{7, 7}); assert(pool2); pool2->setStrideNd(DimsHW{1, 1}); IFullyConnectedLayer* fc1 = network->addFullyConnected(*pool2->getOutput(0), 1000, weightMap["fc.weight"], weightMap["fc.bias"]); assert(fc1); fc1->getOutput(0)->setName(OUTPUT_BLOB_NAME); std::cout << "set name out" << std::endl; network->markOutput(*fc1->getOutput(0)); // Build engine builder->setMaxBatchSize(maxBatchSize); config->setMaxWorkspaceSize(1 
<< 20); ICudaEngine* engine = builder->buildEngineWithConfig(*network, *config); std::cout << "build out" << std::endl; // Don't need the network any more network->destroy(); // Release host memory for (auto& mem : weightMap) { free((void*) (mem.second.values)); } return engine; } void APIToModel(unsigned int maxBatchSize, IHostMemory** modelStream) { // Create builder IBuilder* builder = createInferBuilder(gLogger); IBuilderConfig* config = builder->createBuilderConfig(); // Create model to populate the network, then set the outputs and create an engine ICudaEngine* engine = createEngine(maxBatchSize, builder, config, DataType::kFLOAT); assert(engine != nullptr); // Serialize the engine (*modelStream) = engine->serialize(); // Close everything down engine->destroy(); builder->destroy(); config->destroy(); } void doInference(IExecutionContext& context, float* input, float* output, int batchSize) { const ICudaEngine& engine = context.getEngine(); // Pointers to input and output device buffers to pass to engine. // Engine requires exactly IEngine::getNbBindings() number of buffers. assert(engine.getNbBindings() == 2); void* buffers[2]; // In order to bind the buffers, we need to know the names of the input and output tensors. 
// Note that indices are guaranteed to be less than IEngine::getNbBindings() const int inputIndex = engine.getBindingIndex(INPUT_BLOB_NAME); const int outputIndex = engine.getBindingIndex(OUTPUT_BLOB_NAME); // Create GPU buffers on device CHECK(cudaMalloc(&buffers[inputIndex], batchSize * 3 * INPUT_H * INPUT_W * sizeof(float))); CHECK(cudaMalloc(&buffers[outputIndex], batchSize * OUTPUT_SIZE * sizeof(float))); // Create stream cudaStream_t stream; CHECK(cudaStreamCreate(&stream)); // DMA input batch data to device, infer on the batch asynchronously, and DMA output back to host CHECK(cudaMemcpyAsync(buffers[inputIndex], input, batchSize * 3 * INPUT_H * INPUT_W * sizeof(float), cudaMemcpyHostToDevice, stream)); context.enqueue(batchSize, buffers, stream, nullptr); CHECK(cudaMemcpyAsync(output, buffers[outputIndex], batchSize * OUTPUT_SIZE * sizeof(float), cudaMemcpyDeviceToHost, stream)); cudaStreamSynchronize(stream); // Release stream and buffers cudaStreamDestroy(stream); CHECK(cudaFree(buffers[inputIndex])); CHECK(cudaFree(buffers[outputIndex])); } int main(int argc, char** argv) { if (argc != 2) { std::cerr << "arguments not right!" 
<< std::endl; std::cerr << "./resnet -s // serialize model to plan file" << std::endl; std::cerr << "./resnet -d // deserialize plan file and run inference" << std::endl; return -1; } // create a model using the API directly and serialize it to a stream char *trtModelStream{nullptr}; size_t size{0}; if (std::string(argv[1]) == "-s") { IHostMemory* modelStream{nullptr}; APIToModel(1, &modelStream); assert(modelStream != nullptr); std::ofstream p("resnet50.engine", std::ios::binary); if (!p) { std::cerr << "could not open plan output file" << std::endl; return -1; } p.write(reinterpret_cast(modelStream->data()), modelStream->size()); modelStream->destroy(); return 1; } else if (std::string(argv[1]) == "-d") { std::ifstream file("resnet50.engine", std::ios::binary); if (file.good()) { file.seekg(0, file.end); size = file.tellg(); file.seekg(0, file.beg); trtModelStream = new char[size]; assert(trtModelStream); file.read(trtModelStream, size); file.close(); } } else { return -1; } // Subtract mean from image static float data[3 * INPUT_H * INPUT_W]; for (int i = 0; i < 3 * INPUT_H * INPUT_W; i++) data[i] = 1.0; IRuntime* runtime = createInferRuntime(gLogger); assert(runtime != nullptr); ICudaEngine* engine = runtime->deserializeCudaEngine(trtModelStream, size, nullptr); assert(engine != nullptr); IExecutionContext* context = engine->createExecutionContext(); assert(context != nullptr); delete[] trtModelStream; // Run inference static float prob[OUTPUT_SIZE]; for (int i = 0; i < 100; i++) { auto start = std::chrono::system_clock::now(); doInference(*context, data, prob, 1); auto end = std::chrono::system_clock::now(); std::cout << std::chrono::duration_cast(end - start).count() << "ms" << std::endl; } // Destroy the engine context->destroy(); engine->destroy(); runtime->destroy(); // Print histogram of the output distribution std::cout << "\nOutput:\n\n"; for (unsigned int i = 0; i < 10; i++) { std::cout << prob[i] << ", "; } std::cout << std::endl; for (unsigned int i 
= 0; i < 10; i++) { std::cout << prob[OUTPUT_SIZE - 10 + i] << ", "; } std::cout << std::endl; return 0; } ================================================ FILE: resnet/resnet50.py ================================================ import argparse import os import struct import sys import numpy as np import pycuda.autoinit # noqa import pycuda.driver as cuda import tensorrt as trt BATCH_SIZE = 1 INPUT_H = 224 INPUT_W = 224 OUTPUT_SIZE = 1000 INPUT_BLOB_NAME = "data" OUTPUT_BLOB_NAME = "prob" EPS = 1e-5 WEIGHT_PATH = "./resnet50.wts" ENGINE_PATH = "./resnet50.engine" TRT_LOGGER = trt.Logger(trt.Logger.INFO) def load_weights(file): print(f"Loading weights: {file}") assert os.path.exists(file), 'Unable to load weight file.' weight_map = {} with open(file, "r") as f: lines = [line.strip() for line in f] count = int(lines[0]) assert count == len(lines) - 1 for i in range(1, count + 1): splits = lines[i].split(" ") name = splits[0] cur_count = int(splits[1]) assert cur_count + 2 == len(splits) values = [] for j in range(2, len(splits)): # hex string to bytes to float values.append(struct.unpack(">f", bytes.fromhex(splits[j]))) weight_map[name] = np.array(values, dtype=np.float32) return weight_map def addBatchNorm2d(network, weight_map, input, layer_name, eps): gamma = weight_map[layer_name + ".weight"] beta = weight_map[layer_name + ".bias"] mean = weight_map[layer_name + ".running_mean"] var = weight_map[layer_name + ".running_var"] var = np.sqrt(var + eps) scale = gamma / var shift = -mean / var * gamma + beta return network.add_scale(input=input, mode=trt.ScaleMode.CHANNEL, shift=shift, scale=scale) def bottleneck(network, weight_map, input, in_channels, out_channels, stride, layer_name): conv1 = network.add_convolution(input=input, num_output_maps=out_channels, kernel_shape=(1, 1), kernel=weight_map[layer_name + "conv1.weight"], bias=trt.Weights()) assert conv1 bn1 = addBatchNorm2d(network, weight_map, conv1.get_output(0), layer_name + "bn1", EPS) assert bn1 relu1 = 
network.add_activation(bn1.get_output(0), type=trt.ActivationType.RELU) assert relu1 conv2 = network.add_convolution(input=relu1.get_output(0), num_output_maps=out_channels, kernel_shape=(3, 3), kernel=weight_map[layer_name + "conv2.weight"], bias=trt.Weights()) assert conv2 conv2.stride = (stride, stride) conv2.padding = (1, 1) bn2 = addBatchNorm2d(network, weight_map, conv2.get_output(0), layer_name + "bn2", EPS) assert bn2 relu2 = network.add_activation(bn2.get_output(0), type=trt.ActivationType.RELU) assert relu2 conv3 = network.add_convolution(input=relu2.get_output(0), num_output_maps=out_channels * 4, kernel_shape=(1, 1), kernel=weight_map[layer_name + "conv3.weight"], bias=trt.Weights()) assert conv3 bn3 = addBatchNorm2d(network, weight_map, conv3.get_output(0), layer_name + "bn3", EPS) assert bn3 if stride != 1 or in_channels != 4 * out_channels: conv4 = network.add_convolution( input=input, num_output_maps=out_channels * 4, kernel_shape=(1, 1), kernel=weight_map[layer_name + "downsample.0.weight"], bias=trt.Weights()) assert conv4 conv4.stride = (stride, stride) bn4 = addBatchNorm2d(network, weight_map, conv4.get_output(0), layer_name + "downsample.1", EPS) assert bn4 ew1 = network.add_elementwise(bn4.get_output(0), bn3.get_output(0), trt.ElementWiseOperation.SUM) else: ew1 = network.add_elementwise(input, bn3.get_output(0), trt.ElementWiseOperation.SUM) assert ew1 relu3 = network.add_activation(ew1.get_output(0), type=trt.ActivationType.RELU) assert relu3 return relu3 def create_engine(maxBatchSize, builder, config, dt): weight_map = load_weights(WEIGHT_PATH) network = builder.create_network() data = network.add_input(INPUT_BLOB_NAME, dt, (3, INPUT_H, INPUT_W)) assert data conv1 = network.add_convolution(input=data, num_output_maps=64, kernel_shape=(7, 7), kernel=weight_map["conv1.weight"], bias=trt.Weights()) assert conv1 conv1.stride = (2, 2) conv1.padding = (3, 3) bn1 = addBatchNorm2d(network, weight_map, conv1.get_output(0), "bn1", EPS) assert bn1 
relu1 = network.add_activation(bn1.get_output(0), type=trt.ActivationType.RELU) assert relu1 pool1 = network.add_pooling(input=relu1.get_output(0), window_size=trt.DimsHW(3, 3), type=trt.PoolingType.MAX) assert pool1 pool1.stride = (2, 2) pool1.padding = (1, 1) x = bottleneck(network, weight_map, pool1.get_output(0), 64, 64, 1, "layer1.0.") x = bottleneck(network, weight_map, x.get_output(0), 256, 64, 1, "layer1.1.") x = bottleneck(network, weight_map, x.get_output(0), 256, 64, 1, "layer1.2.") x = bottleneck(network, weight_map, x.get_output(0), 256, 128, 2, "layer2.0.") x = bottleneck(network, weight_map, x.get_output(0), 512, 128, 1, "layer2.1.") x = bottleneck(network, weight_map, x.get_output(0), 512, 128, 1, "layer2.2.") x = bottleneck(network, weight_map, x.get_output(0), 512, 128, 1, "layer2.3.") x = bottleneck(network, weight_map, x.get_output(0), 512, 256, 2, "layer3.0.") x = bottleneck(network, weight_map, x.get_output(0), 1024, 256, 1, "layer3.1.") x = bottleneck(network, weight_map, x.get_output(0), 1024, 256, 1, "layer3.2.") x = bottleneck(network, weight_map, x.get_output(0), 1024, 256, 1, "layer3.3.") x = bottleneck(network, weight_map, x.get_output(0), 1024, 256, 1, "layer3.4.") x = bottleneck(network, weight_map, x.get_output(0), 1024, 256, 1, "layer3.5.") x = bottleneck(network, weight_map, x.get_output(0), 1024, 512, 2, "layer4.0.") x = bottleneck(network, weight_map, x.get_output(0), 2048, 512, 1, "layer4.1.") x = bottleneck(network, weight_map, x.get_output(0), 2048, 512, 1, "layer4.2.") pool2 = network.add_pooling(x.get_output(0), window_size=trt.DimsHW(7, 7), type=trt.PoolingType.AVERAGE) assert pool2 pool2.stride = (1, 1) fc1 = network.add_fully_connected(input=pool2.get_output(0), num_outputs=OUTPUT_SIZE, kernel=weight_map['fc.weight'], bias=weight_map['fc.bias']) assert fc1 fc1.get_output(0).name = OUTPUT_BLOB_NAME network.mark_output(fc1.get_output(0)) # Build engine builder.max_batch_size = maxBatchSize builder.max_workspace_size = 1 << 
20 engine = builder.build_engine(network, config) del network del weight_map return engine def APIToModel(maxBatchSize): builder = trt.Builder(TRT_LOGGER) config = builder.create_builder_config() engine = create_engine(maxBatchSize, builder, config, trt.float32) assert engine with open(ENGINE_PATH, "wb") as f: f.write(engine.serialize()) del engine del builder def doInference(context, host_in, host_out, batchSize): engine = context.engine assert engine.num_bindings == 2 devide_in = cuda.mem_alloc(host_in.nbytes) devide_out = cuda.mem_alloc(host_out.nbytes) bindings = [int(devide_in), int(devide_out)] stream = cuda.Stream() cuda.memcpy_htod_async(devide_in, host_in, stream) context.execute_async(bindings=bindings, stream_handle=stream.handle) cuda.memcpy_dtoh_async(host_out, devide_out, stream) stream.synchronize() if __name__ == '__main__': parser = argparse.ArgumentParser() parser.add_argument("-s", action='store_true') parser.add_argument("-d", action='store_true') args = parser.parse_args() if not (args.s ^ args.d): print( "arguments not right!\n" "python resnet50.py -s # serialize model to plan file\n" "python resnet50.py -d # deserialize plan file and run inference" ) sys.exit() if args.s: APIToModel(BATCH_SIZE) else: runtime = trt.Runtime(TRT_LOGGER) assert runtime with open(ENGINE_PATH, "rb") as f: engine = runtime.deserialize_cuda_engine(f.read()) assert engine context = engine.create_execution_context() assert context data = np.ones((BATCH_SIZE * 3 * INPUT_H * INPUT_W), dtype=np.float32) host_in = cuda.pagelocked_empty(BATCH_SIZE * 3 * INPUT_H * INPUT_W, dtype=np.float32) np.copyto(host_in, data.ravel()) host_out = cuda.pagelocked_empty(OUTPUT_SIZE, dtype=np.float32) doInference(context, host_in, host_out, BATCH_SIZE) print(f'Output: \n{host_out[:10]}\n{host_out[-10:]}') ================================================ FILE: resnet/resnext50_32x4d.cpp ================================================ #include "NvInfer.h" #include "cuda_runtime_api.h" 
#include "logging.h"
// NOTE(review): the seven standard header names below were stripped from this
// copy of the file; they are reconstructed from what the code uses — confirm
// against the upstream source.
#include <fstream>
#include <iostream>
#include <map>
#include <sstream>
#include <vector>
#include <chrono>
#include <cmath>

// Abort the process with a message when a CUDA runtime call returns non-zero.
#define CHECK(status) \
    do\
    {\
        auto ret = (status);\
        if (ret != 0)\
        {\
            std::cerr << "Cuda failure: " << ret << std::endl;\
            abort();\
        }\
    } while (0)

// stuff we know about the network and the input/output blobs
static const int INPUT_H = 224;
static const int INPUT_W = 224;
static const int OUTPUT_SIZE = 1000;

const char* INPUT_BLOB_NAME = "data";
const char* OUTPUT_BLOB_NAME = "prob";

using namespace nvinfer1;

static Logger gLogger;

// Load weights from files shared with TensorRT samples.
// The file starts with the number of blobs; each blob is then read as a
// whitespace-separated record: a name, a decimal element count, and that many
// 32-bit values written in hexadecimal.
//
// Returns a map from blob name to a Weights struct (type kFLOAT) whose
// `values` buffer is malloc'ed here; the caller releases it with free().
std::map<std::string, Weights> loadWeights(const std::string file) {
    std::cout << "Loading weights: " << file << std::endl;
    std::map<std::string, Weights> weightMap;

    // Open weights file
    std::ifstream input(file);
    assert(input.is_open() && "Unable to load weight file.");

    // Read number of weight blobs
    int32_t count;
    input >> count;
    assert(count > 0 && "Invalid weight map file.");

    while (count--) {
        Weights wt{DataType::kFLOAT, nullptr, 0};
        uint32_t size;

        // Read name and type of blob
        std::string name;
        input >> name >> std::dec >> size;
        wt.type = DataType::kFLOAT;

        // Load blob. The element size is sizeof(uint32_t); the previous
        // sizeof(val) measured the pointer and over-allocated on 64-bit hosts.
        uint32_t* val = reinterpret_cast<uint32_t*>(malloc(sizeof(uint32_t) * size));
        assert((val != nullptr || size == 0) && "Out of memory while loading weights.");
        for (uint32_t x = 0, y = size; x < y; ++x) {
            input >> std::hex >> val[x];
        }
        wt.values = val;

        wt.count = size;
        weightMap[name] = wt;
    }

    return weightMap;
}

// Add a per-channel scale layer that applies an inference-time batch norm.
//
// Reads "<lname>.weight", ".bias", ".running_mean" and ".running_var" from
// weightMap and folds them into
//     scale = gamma / sqrt(var + eps)
//     shift = beta - mean * gamma / sqrt(var + eps)
//     power = 1
// The three new buffers are stored in weightMap under "<lname>.scale",
// ".shift" and ".power", so they are freed together with the loaded weights.
IScaleLayer* addBatchNorm2d(INetworkDefinition *network, std::map<std::string, Weights>& weightMap, ITensor& input, std::string lname, float eps) {
    float *gamma = (float*)weightMap[lname + ".weight"].values;
    float *beta = (float*)weightMap[lname + ".bias"].values;
    float *mean = (float*)weightMap[lname + ".running_mean"].values;
    float *var = (float*)weightMap[lname + ".running_var"].values;
    int len = weightMap[lname + ".running_var"].count;
    std::cout << "len " << len << std::endl;

    float *scval = reinterpret_cast<float*>(malloc(sizeof(float) * len));
    for (int i = 0; i < len; i++) {
        scval[i] = gamma[i] / sqrt(var[i] + eps);
    }
    Weights scale{DataType::kFLOAT, scval, len};

    float *shval = reinterpret_cast<float*>(malloc(sizeof(float) * len));
    for (int i = 0; i < len; i++) {
        shval[i] = beta[i] - mean[i] * gamma[i] / sqrt(var[i] + eps);
    }
    Weights shift{DataType::kFLOAT, shval, len};

    float *pval = reinterpret_cast<float*>(malloc(sizeof(float) * len));
    for (int i = 0; i < len; i++) {
        pval[i] = 1.0;
    }
    Weights power{DataType::kFLOAT, pval, len};

    weightMap[lname + ".scale"] = scale;
    weightMap[lname + ".shift"] = shift;
    weightMap[lname + ".power"] = power;
    IScaleLayer* scale_1 = network->addScale(input, ScaleMode::kCHANNEL, shift, scale, power);
    assert(scale_1);
    return scale_1;
}

// Add one grouped bottleneck residual block: 1x1 conv -> 3x3 grouped conv
// (32 groups, carries the stride) -> 1x1 conv to outch * 4 channels, each
// followed by batch norm, then a sum with the shortcut and a final ReLU.
//
// inch/outch: input channels and the block's base width.
// stride:     stride of the 3x3 convolution and of the projection shortcut.
// lname:      weight-name prefix, e.g. "layer1.0.".
//
// A 1x1 projection ("downsample.0"/"downsample.1") replaces the identity
// shortcut when stride != 1 or inch != outch * 4.
IActivationLayer* bottleneck(INetworkDefinition *network, std::map<std::string, Weights>& weightMap, ITensor& input, int inch, int outch, int stride, std::string lname) {
    Weights emptywts{DataType::kFLOAT, nullptr, 0};

    int groups = 32;
    // Channel count of the two inner convolutions (integer arithmetic).
    int width = outch * 4 / 64 * 32;

    IConvolutionLayer* conv1 = network->addConvolutionNd(input, width, DimsHW{1, 1}, weightMap[lname + "conv1.weight"], emptywts);
    assert(conv1);

    IScaleLayer* bn1 = addBatchNorm2d(network, weightMap, *conv1->getOutput(0), lname + "bn1", 1e-5);

    IActivationLayer* relu1 = network->addActivation(*bn1->getOutput(0), ActivationType::kRELU);
    assert(relu1);

    IConvolutionLayer* conv2 = network->addConvolutionNd(*relu1->getOutput(0), width, DimsHW{3, 3}, weightMap[lname + "conv2.weight"], emptywts);
    assert(conv2);
    conv2->setStrideNd(DimsHW{stride, stride});
    conv2->setPaddingNd(DimsHW{1, 1});
    conv2->setNbGroups(groups);

    IScaleLayer* bn2 = addBatchNorm2d(network, weightMap, *conv2->getOutput(0), lname + "bn2", 1e-5);

    IActivationLayer* relu2 = network->addActivation(*bn2->getOutput(0), ActivationType::kRELU);
    assert(relu2);

    IConvolutionLayer* conv3 = network->addConvolutionNd(*relu2->getOutput(0), outch * 4, DimsHW{1, 1}, weightMap[lname + "conv3.weight"], emptywts);
    assert(conv3);

    IScaleLayer* bn3 = addBatchNorm2d(network, weightMap, *conv3->getOutput(0), lname + "bn3", 1e-5);

    IElementWiseLayer* ew1;
    if (stride != 1 || inch != outch * 4) {
        IConvolutionLayer* conv4 = network->addConvolutionNd(input, outch * 4, DimsHW{1, 1}, weightMap[lname + "downsample.0.weight"], emptywts);
        assert(conv4);
        conv4->setStrideNd(DimsHW{stride, stride});

        IScaleLayer* bn4 = addBatchNorm2d(network, weightMap, *conv4->getOutput(0), lname + "downsample.1", 1e-5);
        ew1 = network->addElementWise(*bn4->getOutput(0), *bn3->getOutput(0), ElementWiseOperation::kSUM);
    } else {
        ew1 = network->addElementWise(input, *bn3->getOutput(0), ElementWiseOperation::kSUM);
    }
    assert(ew1);
    IActivationLayer* relu3 = network->addActivation(*ew1->getOutput(0), ActivationType::kRELU);
    assert(relu3);
    return relu3;
}

// Create the engine using only the API and not any parser.
//
// Loads "../resnext50.wts", builds the graph (stem, 3/4/6/3 grouped
// bottleneck blocks, average pool, fully connected layer), builds the engine
// and frees the host-side weights. The return value is whatever
// buildEngineWithConfig() produced; the caller checks it.
ICudaEngine* createEngine(unsigned int maxBatchSize, IBuilder* builder, IBuilderConfig* config, DataType dt) {
    INetworkDefinition* network = builder->createNetworkV2(0U);

    // Create input tensor of shape { 3, INPUT_H, INPUT_W } with name INPUT_BLOB_NAME
    ITensor* data = network->addInput(INPUT_BLOB_NAME, dt, Dims3{3, INPUT_H, INPUT_W});
    assert(data);

    std::map<std::string, Weights> weightMap = loadWeights("../resnext50.wts");
    Weights emptywts{DataType::kFLOAT, nullptr, 0};

    IConvolutionLayer* conv1 = network->addConvolutionNd(*data, 64, DimsHW{7, 7}, weightMap["conv1.weight"], emptywts);
    assert(conv1);
    conv1->setStrideNd(DimsHW{2, 2});
    conv1->setPaddingNd(DimsHW{3, 3});

    IScaleLayer* bn1 = addBatchNorm2d(network, weightMap, *conv1->getOutput(0), "bn1", 1e-5);

    // Add activation layer using the ReLU algorithm.
    IActivationLayer* relu1 = network->addActivation(*bn1->getOutput(0), ActivationType::kRELU);
    assert(relu1);

    IPoolingLayer* pool1 = network->addPoolingNd(*relu1->getOutput(0), PoolingType::kMAX, DimsHW{3, 3});
    assert(pool1);
    pool1->setStrideNd(DimsHW{2, 2});
    pool1->setPaddingNd(DimsHW{1, 1});

    IActivationLayer* x = bottleneck(network, weightMap, *pool1->getOutput(0), 64, 64, 1, "layer1.0.");
    x = bottleneck(network, weightMap, *x->getOutput(0), 256, 64, 1, "layer1.1.");
    x = bottleneck(network, weightMap, *x->getOutput(0), 256, 64, 1, "layer1.2.");

    x = bottleneck(network, weightMap, *x->getOutput(0), 256, 128, 2, "layer2.0.");
    x = bottleneck(network, weightMap, *x->getOutput(0), 512, 128, 1, "layer2.1.");
    x = bottleneck(network, weightMap, *x->getOutput(0), 512, 128, 1, "layer2.2.");
    x = bottleneck(network, weightMap, *x->getOutput(0), 512, 128, 1, "layer2.3.");

    x = bottleneck(network, weightMap, *x->getOutput(0), 512, 256, 2, "layer3.0.");
    x = bottleneck(network, weightMap, *x->getOutput(0), 1024, 256, 1, "layer3.1.");
    x = bottleneck(network, weightMap, *x->getOutput(0), 1024, 256, 1, "layer3.2.");
    x = bottleneck(network, weightMap, *x->getOutput(0), 1024, 256, 1, "layer3.3.");
    x = bottleneck(network, weightMap, *x->getOutput(0), 1024, 256, 1, "layer3.4.");
    x = bottleneck(network, weightMap, *x->getOutput(0), 1024, 256, 1, "layer3.5.");

    x = bottleneck(network, weightMap, *x->getOutput(0), 1024, 512, 2, "layer4.0.");
    x = bottleneck(network, weightMap, *x->getOutput(0), 2048, 512, 1, "layer4.1.");
    x = bottleneck(network, weightMap, *x->getOutput(0), 2048, 512, 1, "layer4.2.");

    IPoolingLayer* pool2 = network->addPoolingNd(*x->getOutput(0), PoolingType::kAVERAGE, DimsHW{7, 7});
    assert(pool2);
    pool2->setStrideNd(DimsHW{1, 1});

    IFullyConnectedLayer* fc1 = network->addFullyConnected(*pool2->getOutput(0), 1000, weightMap["fc.weight"], weightMap["fc.bias"]);
    assert(fc1);

    fc1->getOutput(0)->setName(OUTPUT_BLOB_NAME);
    std::cout << "set name out" << std::endl;
    network->markOutput(*fc1->getOutput(0));

    // Build engine
    builder->setMaxBatchSize(maxBatchSize);
    config->setMaxWorkspaceSize(1 << 20);
    ICudaEngine* engine = builder->buildEngineWithConfig(*network, *config);
    std::cout << "build out" << std::endl;

    // Don't need the network any more
    network->destroy();

    // Release host memory (loaded blobs and the buffers added by addBatchNorm2d)
    for (auto& mem : weightMap) {
        free((void*) (mem.second.values));
    }

    return engine;
}

// Build the engine through createEngine() and hand its serialized form back
// to the caller through modelStream; the caller must destroy() it.
void APIToModel(unsigned int maxBatchSize, IHostMemory** modelStream) {
    // Create builder
    IBuilder* builder = createInferBuilder(gLogger);
    IBuilderConfig* config = builder->createBuilderConfig();

    // Create model to populate the network, then set the outputs and create an engine
    ICudaEngine* engine = createEngine(maxBatchSize, builder, config, DataType::kFLOAT);
    assert(engine != nullptr);

    // Serialize the engine
    (*modelStream) = engine->serialize();

    // Close everything down, in reverse order of creation (the config was
    // created by the builder, so it is released before the builder).
    engine->destroy();
    config->destroy();
    builder->destroy();
}

// Run one synchronous inference pass.
//
// input:  host buffer holding batchSize * 3 * INPUT_H * INPUT_W floats.
// output: host buffer receiving batchSize * OUTPUT_SIZE floats.
//
// Device buffers and the CUDA stream are created and released on every call.
// Any CUDA or TensorRT failure aborts the process.
void doInference(IExecutionContext& context, float* input, float* output, int batchSize) {
    const ICudaEngine& engine = context.getEngine();

    // Pointers to input and output device buffers to pass to engine.
    // Engine requires exactly IEngine::getNbBindings() number of buffers.
    assert(engine.getNbBindings() == 2);
    void* buffers[2];

    // In order to bind the buffers, we need to know the names of the input and output tensors.
    // Note that indices are guaranteed to be less than IEngine::getNbBindings()
    const int inputIndex = engine.getBindingIndex(INPUT_BLOB_NAME);
    const int outputIndex = engine.getBindingIndex(OUTPUT_BLOB_NAME);
    // A lookup that fails would otherwise index buffers[] out of bounds.
    assert(inputIndex >= 0 && inputIndex < 2);
    assert(outputIndex >= 0 && outputIndex < 2);

    // Compute the byte sizes once, in size_t, so the same value is used for
    // allocation and for both copies.
    const size_t inputBytes = static_cast<size_t>(batchSize) * 3 * INPUT_H * INPUT_W * sizeof(float);
    const size_t outputBytes = static_cast<size_t>(batchSize) * OUTPUT_SIZE * sizeof(float);

    // Create GPU buffers on device
    CHECK(cudaMalloc(&buffers[inputIndex], inputBytes));
    CHECK(cudaMalloc(&buffers[outputIndex], outputBytes));

    // Create stream
    cudaStream_t stream;
    CHECK(cudaStreamCreate(&stream));

    // DMA input batch data to device, infer on the batch asynchronously, and DMA output back to host
    CHECK(cudaMemcpyAsync(buffers[inputIndex], input, inputBytes, cudaMemcpyHostToDevice, stream));
    if (!context.enqueue(batchSize, buffers, stream, nullptr)) {
        // enqueue() reports failure through its return value, not a CUDA error code.
        std::cerr << "TensorRT enqueue failed" << std::endl;
        abort();
    }
    CHECK(cudaMemcpyAsync(output, buffers[outputIndex], outputBytes, cudaMemcpyDeviceToHost, stream));
    CHECK(cudaStreamSynchronize(stream));

    // Release stream and buffers
    CHECK(cudaStreamDestroy(stream));
    CHECK(cudaFree(buffers[inputIndex]));
    CHECK(cudaFree(buffers[outputIndex]));
}

// Entry point.
//   -s  build the network, serialize it to resnext50.engine
//   -d  load resnext50.engine, run 100 timed inferences on an all-ones input
//       and print the first and last ten outputs
int main(int argc, char** argv) {
    if (argc != 2) {
        std::cerr << "arguments not right!" << std::endl;
        std::cerr << "./resnext -s // serialize model to plan file" << std::endl;
        std::cerr << "./resnext -d // deserialize plan file and run inference" << std::endl;
        return -1;
    }

    // create a model using the API directly and serialize it to a stream
    char* trtModelStream{nullptr};
    size_t size{0};

    if (std::string(argv[1]) == "-s") {
        IHostMemory* modelStream{nullptr};
        APIToModel(1, &modelStream);
        assert(modelStream != nullptr);

        std::ofstream p("resnext50.engine", std::ios::binary);
        if (!p) {
            std::cerr << "could not open plan output file" << std::endl;
            modelStream->destroy();  // do not leak the serialized engine on the error path
            return -1;
        }
        p.write(reinterpret_cast<const char*>(modelStream->data()), modelStream->size());
        modelStream->destroy();
        // NOTE(review): a successful serialization exits with status 1; kept
        // unchanged because scripts may already depend on it.
        return 1;
    } else if (std::string(argv[1]) == "-d") {
        std::ifstream file("resnext50.engine", std::ios::binary);
        if (!file.good()) {
            // Without this check a null buffer of size 0 would be handed to
            // deserializeCudaEngine() below.
            std::cerr << "could not open plan file resnext50.engine" << std::endl;
            return -1;
        }
        file.seekg(0, file.end);
        size = file.tellg();
        file.seekg(0, file.beg);
        trtModelStream = new char[size];
        assert(trtModelStream);
        file.read(trtModelStream, size);
        file.close();
    } else {
        return -1;
    }

    // Dummy input: every value is set to 1.0 (no real image is loaded).
    static float data[3 * INPUT_H * INPUT_W];
    for (int i = 0; i < 3 * INPUT_H * INPUT_W; i++)
        data[i] = 1.0;

    IRuntime* runtime = createInferRuntime(gLogger);
    assert(runtime != nullptr);
    ICudaEngine* engine = runtime->deserializeCudaEngine(trtModelStream, size, nullptr);
    assert(engine != nullptr);
    IExecutionContext* context = engine->createExecutionContext();
    assert(context != nullptr);
    delete[] trtModelStream;

    // Run inference
    static float prob[OUTPUT_SIZE];
    for (int i = 0; i < 100; i++) {
        auto start = std::chrono::system_clock::now();
        doInference(*context, data, prob, 1);
        auto end = std::chrono::system_clock::now();
        std::cout << std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count() << "ms" << std::endl;
    }

    // Destroy the engine
    context->destroy();
    engine->destroy();
    runtime->destroy();

    // Print the first and the last ten values of the output
    std::cout << "\nOutput:\n\n";
    for (unsigned int i = 0; i < 10; i++) {
        std::cout << prob[i] << ", ";
    }
    std::cout << std::endl;
    for (unsigned
int i = 0; i < 10; i++) { std::cout << prob[OUTPUT_SIZE - 10 + i] << ", "; } std::cout << std::endl; return 0; } ================================================ FILE: resnet/wide_resnet50.py ================================================ import os import sys import struct import argparse import numpy as np import pycuda.autoinit import pycuda.driver as cuda import tensorrt as trt BATCH_SIZE = 1 INPUT_H = 224 INPUT_W = 224 OUTPUT_SIZE = 1000 BS = 1 INPUT_BLOB_NAME = "data" OUTPUT_BLOB_NAME = "prob" EPS = 1e-5 WEIGHT_PATH = "./wide_resnet50.wts" ENGINE_PATH = "./wide_resnet50.engine" TRT_LOGGER = trt.Logger(trt.Logger.INFO) def load_weights(file): print(f"Loading weights: {file}") assert os.path.exists(file), 'Unable to load weight file.' weight_map = {} with open(file, "r") as f: lines = [line.strip() for line in f] count = int(lines[0]) assert count == len(lines) - 1 for i in range(1, count + 1): splits = lines[i].split(" ") name = splits[0] cur_count = int(splits[1]) assert cur_count + 2 == len(splits) values = [] for j in range(2, len(splits)): # hex string to bytes to float values.append(struct.unpack(">f", bytes.fromhex(splits[j]))) weight_map[name] = np.array(values, dtype=np.float32) return weight_map def addBatchNorm2d(network, weight_map, inputs, layer_name, eps): gamma = weight_map[layer_name + ".weight"] beta = weight_map[layer_name + ".bias"] mean = weight_map[layer_name + ".running_mean"] var = weight_map[layer_name + ".running_var"] print(layer_name + " " + str(len(weight_map[layer_name + ".running_var"]))) var = np.sqrt(var + eps) scale = gamma / var shift = -mean / var * gamma + beta return network.add_scale(input=inputs, mode=trt.ScaleMode.CHANNEL, shift=shift, scale=scale) def bottleneck(network, weight_map, input, in_channels, out_channels, stride, layer_name): # empty weights for bias emptywts = trt.Weights() conv1 = network.add_convolution(input=input, num_output_maps=out_channels, kernel_shape=(1, 1), kernel=weight_map[layer_name + 
"conv1.weight"], bias=emptywts) assert conv1 bn1 = addBatchNorm2d(network, weight_map, conv1.get_output(0), layer_name + "bn1", EPS) assert bn1 relu1 = network.add_activation(bn1.get_output(0), type=trt.ActivationType.RELU) assert relu1 conv2 = network.add_convolution(input=relu1.get_output(0), num_output_maps=out_channels, kernel_shape=(3, 3), kernel=weight_map[layer_name + "conv2.weight"], bias=emptywts) assert conv2 conv2.stride = (stride, stride) conv2.padding = (1, 1) bn2 = addBatchNorm2d(network, weight_map, conv2.get_output(0), layer_name + "bn2", EPS) assert bn2 relu2 = network.add_activation(bn2.get_output(0), type=trt.ActivationType.RELU) assert relu2 conv3 = network.add_convolution(input=relu2.get_output(0), num_output_maps=out_channels * 2, kernel_shape=(1, 1), kernel=weight_map[layer_name + "conv3.weight"], bias=emptywts) assert conv3 bn3 = addBatchNorm2d(network, weight_map, conv3.get_output(0), layer_name + "bn3", EPS) assert bn3 if stride != 1 or in_channels != 2 * out_channels: conv4 = network.add_convolution( input=input, num_output_maps=out_channels * 2, kernel_shape=(1, 1), kernel=weight_map[layer_name + "downsample.0.weight"], bias=emptywts) assert conv4 conv4.stride = (stride, stride) bn4 = addBatchNorm2d(network, weight_map, conv4.get_output(0), layer_name + "downsample.1", EPS) assert bn4 ew1 = network.add_elementwise(bn4.get_output(0), bn3.get_output(0), trt.ElementWiseOperation.SUM) else: ew1 = network.add_elementwise(input, bn3.get_output(0), trt.ElementWiseOperation.SUM) assert ew1 relu3 = network.add_activation(ew1.get_output(0), type=trt.ActivationType.RELU) assert relu3 return relu3 def create_engine(maxBatchSize, builder, config, dt): weight_map = load_weights(WEIGHT_PATH) network = builder.create_network() data = network.add_input(INPUT_BLOB_NAME, dt, (3, INPUT_H, INPUT_W)) assert data # empty weights for bias emptywts = trt.Weights() conv1 = network.add_convolution(input=data, num_output_maps=64, kernel_shape=(7, 7), 
kernel=weight_map["conv1.weight"], bias=emptywts) assert conv1 conv1.stride = (2, 2) conv1.padding = (3, 3) bn1 = addBatchNorm2d(network, weight_map, conv1.get_output(0), "bn1", EPS) assert bn1 relu1 = network.add_activation(bn1.get_output(0), type=trt.ActivationType.RELU) assert relu1 pool1 = network.add_pooling(input=relu1.get_output(0), window_size=trt.DimsHW(3, 3), type=trt.PoolingType.MAX) assert pool1 pool1.stride = (2, 2) pool1.padding = (1, 1) x = bottleneck(network, weight_map, pool1.get_output(0), 64, 128, 1, "layer1.0.") x = bottleneck(network, weight_map, x.get_output(0), 256, 128, 1, "layer1.1.") x = bottleneck(network, weight_map, x.get_output(0), 256, 128, 1, "layer1.2.") x = bottleneck(network, weight_map, x.get_output(0), 256, 256, 2, "layer2.0.") x = bottleneck(network, weight_map, x.get_output(0), 512, 256, 1, "layer2.1.") x = bottleneck(network, weight_map, x.get_output(0), 512, 256, 1, "layer2.2.") x = bottleneck(network, weight_map, x.get_output(0), 512, 256, 1, "layer2.3.") x = bottleneck(network, weight_map, x.get_output(0), 512, 512, 2, "layer3.0.") x = bottleneck(network, weight_map, x.get_output(0), 1024, 512, 1, "layer3.1.") x = bottleneck(network, weight_map, x.get_output(0), 1024, 512, 1, "layer3.2.") x = bottleneck(network, weight_map, x.get_output(0), 1024, 512, 1, "layer3.3.") x = bottleneck(network, weight_map, x.get_output(0), 1024, 512, 1, "layer3.4.") x = bottleneck(network, weight_map, x.get_output(0), 1024, 512, 1, "layer3.5.") x = bottleneck(network, weight_map, x.get_output(0), 1024, 1024, 2, "layer4.0.") x = bottleneck(network, weight_map, x.get_output(0), 2048, 1024, 1, "layer4.1.") x = bottleneck(network, weight_map, x.get_output(0), 2048, 1024, 1, "layer4.2.") pool2 = network.add_pooling(x.get_output(0), window_size=trt.DimsHW(7, 7), type=trt.PoolingType.AVERAGE) assert pool2 pool2.stride = (1, 1) fc1 = network.add_fully_connected(input=pool2.get_output(0), num_outputs=OUTPUT_SIZE, kernel=weight_map['fc.weight'], 
bias=weight_map['fc.bias']) assert fc1 fc1.get_output(0).name = OUTPUT_BLOB_NAME network.mark_output(fc1.get_output(0)) # Build engine builder.max_batch_size = maxBatchSize builder.max_workspace_size = 1 << 20 engine = builder.build_engine(network, config) print("build out") del network del weight_map return engine def APIToModel(maxBatchSize): builder = trt.Builder(TRT_LOGGER) config = builder.create_builder_config() engine = create_engine(maxBatchSize, builder, config, trt.float32) assert engine with open(ENGINE_PATH, "wb") as f: f.write(engine.serialize()) del engine del builder def doInference(context, host_in, host_out, batchSize): engine = context.engine assert engine.num_bindings == 2 devide_in = cuda.mem_alloc(host_in.nbytes) devide_out = cuda.mem_alloc(host_out.nbytes) bindings = [int(devide_in), int(devide_out)] stream = cuda.Stream() cuda.memcpy_htod_async(devide_in, host_in, stream) context.execute_async(bindings=bindings, stream_handle=stream.handle) cuda.memcpy_dtoh_async(host_out, devide_out, stream) stream.synchronize() if __name__ == '__main__': parser = argparse.ArgumentParser() parser.add_argument("-s", action='store_true') parser.add_argument("-d", action='store_true') args = parser.parse_args() if not (args.s ^ args.d): print( "arguments not right!\n" "python wide_resnet50.py -s # serialize model to plan file\n" "python wide_resnet50.py -d # deserialize plan file and run inference" ) sys.exit() if args.s: APIToModel(BATCH_SIZE) else: runtime = trt.Runtime(TRT_LOGGER) assert runtime with open(ENGINE_PATH, "rb") as f: engine = runtime.deserialize_cuda_engine(f.read()) assert engine context = engine.create_execution_context() assert context data = np.ones((BATCH_SIZE * 3 * INPUT_H * INPUT_W), dtype=np.float32) host_in = cuda.pagelocked_empty(BATCH_SIZE * 3 * INPUT_H * INPUT_W, dtype=np.float32) np.copyto(host_in, data.ravel()) host_out = cuda.pagelocked_empty(OUTPUT_SIZE, dtype=np.float32) doInference(context, host_in, host_out, BATCH_SIZE) 
print(f'Output: \n{host_out[:10]}\n{host_out[-10:]}') ================================================ FILE: resnet/wideresnet50.cpp ================================================ #include "NvInfer.h" #include "cuda_runtime_api.h" #include "logging.h" #include #include #include #include #include #include #include #define CHECK(status) \ do\ {\ auto ret = (status);\ if (ret != 0)\ {\ std::cerr << "Cuda failure: " << ret << std::endl;\ abort();\ }\ } while (0) // stuff we know about the network and the input/output blobs static const int INPUT_H = 224; static const int INPUT_W = 224; static const int OUTPUT_SIZE = 1000; const char* INPUT_BLOB_NAME = "data"; const char* OUTPUT_BLOB_NAME = "prob"; using namespace nvinfer1; static Logger gLogger; // Load weights from files shared with TensorRT samples. // TensorRT weight files have a simple space delimited format: // [type] [size] std::map loadWeights(const std::string file) { std::cout << "Loading weights: " << file << std::endl; std::map weightMap; // Open weights file std::ifstream input(file); assert(input.is_open() && "Unable to load weight file."); // Read number of weight blobs int32_t count; input >> count; assert(count > 0 && "Invalid weight map file."); while (count--) { Weights wt{DataType::kFLOAT, nullptr, 0}; uint32_t size; // Read name and type of blob std::string name; input >> name >> std::dec >> size; wt.type = DataType::kFLOAT; // Load blob uint32_t* val = reinterpret_cast(malloc(sizeof(val) * size)); for (uint32_t x = 0, y = size; x < y; ++x) { input >> std::hex >> val[x]; } wt.values = val; wt.count = size; weightMap[name] = wt; } return weightMap; } IScaleLayer* addBatchNorm2d(INetworkDefinition *network, std::map& weightMap, ITensor& input, std::string lname, float eps) { float *gamma = (float*)weightMap[lname + ".weight"].values; float *beta = (float*)weightMap[lname + ".bias"].values; float *mean = (float*)weightMap[lname + ".running_mean"].values; float *var = (float*)weightMap[lname + 
".running_var"].values; int len = weightMap[lname + ".running_var"].count; std::cout << "len " << len << std::endl; float *scval = reinterpret_cast(malloc(sizeof(float) * len)); for (int i = 0; i < len; i++) { scval[i] = gamma[i] / sqrt(var[i] + eps); } Weights scale{DataType::kFLOAT, scval, len}; float *shval = reinterpret_cast(malloc(sizeof(float) * len)); for (int i = 0; i < len; i++) { shval[i] = beta[i] - mean[i] * gamma[i] / sqrt(var[i] + eps); } Weights shift{DataType::kFLOAT, shval, len}; float *pval = reinterpret_cast(malloc(sizeof(float) * len)); for (int i = 0; i < len; i++) { pval[i] = 1.0; } Weights power{DataType::kFLOAT, pval, len}; weightMap[lname + ".scale"] = scale; weightMap[lname + ".shift"] = shift; weightMap[lname + ".power"] = power; IScaleLayer* scale_1 = network->addScale(input, ScaleMode::kCHANNEL, shift, scale, power); assert(scale_1); return scale_1; } IActivationLayer* bottleneck(INetworkDefinition *network, std::map& weightMap, ITensor& input, int inch, int outch, int stride, std::string lname) { Weights emptywts{DataType::kFLOAT, nullptr, 0}; IConvolutionLayer* conv1 = network->addConvolutionNd(input, outch, DimsHW{1, 1}, weightMap[lname + "conv1.weight"], emptywts); assert(conv1); IScaleLayer* bn1 = addBatchNorm2d(network, weightMap, *conv1->getOutput(0), lname + "bn1", 1e-5); IActivationLayer* relu1 = network->addActivation(*bn1->getOutput(0), ActivationType::kRELU); assert(relu1); IConvolutionLayer* conv2 = network->addConvolutionNd(*relu1->getOutput(0), outch, DimsHW{3, 3}, weightMap[lname + "conv2.weight"], emptywts); assert(conv2); conv2->setStrideNd(DimsHW{stride, stride}); conv2->setPaddingNd(DimsHW{1, 1}); IScaleLayer* bn2 = addBatchNorm2d(network, weightMap, *conv2->getOutput(0), lname + "bn2", 1e-5); IActivationLayer* relu2 = network->addActivation(*bn2->getOutput(0), ActivationType::kRELU); assert(relu2); IConvolutionLayer* conv3 = network->addConvolutionNd(*relu2->getOutput(0), outch * 2, DimsHW{1, 1}, weightMap[lname + 
"conv3.weight"], emptywts); assert(conv3); IScaleLayer* bn3 = addBatchNorm2d(network, weightMap, *conv3->getOutput(0), lname + "bn3", 1e-5); IElementWiseLayer* ew1; if (stride != 1 || inch != outch * 2) { IConvolutionLayer* conv4 = network->addConvolutionNd(input, outch * 2, DimsHW{1, 1}, weightMap[lname + "downsample.0.weight"], emptywts); assert(conv4); conv4->setStrideNd(DimsHW{stride, stride}); IScaleLayer* bn4 = addBatchNorm2d(network, weightMap, *conv4->getOutput(0), lname + "downsample.1", 1e-5); ew1 = network->addElementWise(*bn4->getOutput(0), *bn3->getOutput(0), ElementWiseOperation::kSUM); } else { ew1 = network->addElementWise(input, *bn3->getOutput(0), ElementWiseOperation::kSUM); } IActivationLayer* relu3 = network->addActivation(*ew1->getOutput(0), ActivationType::kRELU); assert(relu3); return relu3; } // Create the engine using only the API and not any parser. ICudaEngine* createEngine(unsigned int maxBatchSize, IBuilder* builder, IBuilderConfig* config, DataType dt) { INetworkDefinition* network = builder->createNetworkV2(0U); // Create input tensor of shape { 3, INPUT_H, INPUT_W } with name INPUT_BLOB_NAME ITensor* data = network->addInput(INPUT_BLOB_NAME, dt, Dims3{3, INPUT_H, INPUT_W}); assert(data); std::map weightMap = loadWeights("../wideresnet50.wts"); Weights emptywts{DataType::kFLOAT, nullptr, 0}; IConvolutionLayer* conv1 = network->addConvolutionNd(*data, 64, DimsHW{7, 7}, weightMap["conv1.weight"], emptywts); assert(conv1); conv1->setStrideNd(DimsHW{2, 2}); conv1->setPaddingNd(DimsHW{3, 3}); IScaleLayer* bn1 = addBatchNorm2d(network, weightMap, *conv1->getOutput(0), "bn1", 1e-5); // Add activation layer using the ReLU algorithm. IActivationLayer* relu1 = network->addActivation(*bn1->getOutput(0), ActivationType::kRELU); assert(relu1); // Add max pooling layer with stride of 2x2 and kernel size of 2x2. 
IPoolingLayer* pool1 = network->addPoolingNd(*relu1->getOutput(0), PoolingType::kMAX, DimsHW{3, 3}); assert(pool1); pool1->setStrideNd(DimsHW{2, 2}); pool1->setPaddingNd(DimsHW{1, 1}); IActivationLayer* x = bottleneck(network, weightMap, *pool1->getOutput(0), 64, 128, 1, "layer1.0."); x = bottleneck(network, weightMap, *x->getOutput(0), 256, 128, 1, "layer1.1."); x = bottleneck(network, weightMap, *x->getOutput(0), 256, 128, 1, "layer1.2."); x = bottleneck(network, weightMap, *x->getOutput(0), 256, 256, 2, "layer2.0."); x = bottleneck(network, weightMap, *x->getOutput(0), 512, 256, 1, "layer2.1."); x = bottleneck(network, weightMap, *x->getOutput(0), 512, 256, 1, "layer2.2."); x = bottleneck(network, weightMap, *x->getOutput(0), 512, 256, 1, "layer2.3."); x = bottleneck(network, weightMap, *x->getOutput(0), 512, 512, 2, "layer3.0."); x = bottleneck(network, weightMap, *x->getOutput(0), 1024, 512, 1, "layer3.1."); x = bottleneck(network, weightMap, *x->getOutput(0), 1024, 512, 1, "layer3.2."); x = bottleneck(network, weightMap, *x->getOutput(0), 1024, 512, 1, "layer3.3."); x = bottleneck(network, weightMap, *x->getOutput(0), 1024, 512, 1, "layer3.4."); x = bottleneck(network, weightMap, *x->getOutput(0), 1024, 512, 1, "layer3.5."); x = bottleneck(network, weightMap, *x->getOutput(0), 1024, 1024, 2, "layer4.0."); x = bottleneck(network, weightMap, *x->getOutput(0), 2048, 1024, 1, "layer4.1."); x = bottleneck(network, weightMap, *x->getOutput(0), 2048, 1024, 1, "layer4.2."); IPoolingLayer* pool2 = network->addPoolingNd(*x->getOutput(0), PoolingType::kAVERAGE, DimsHW{7, 7}); assert(pool2); pool2->setStrideNd(DimsHW{1, 1}); IFullyConnectedLayer* fc1 = network->addFullyConnected(*pool2->getOutput(0), 1000, weightMap["fc.weight"], weightMap["fc.bias"]); assert(fc1); fc1->getOutput(0)->setName(OUTPUT_BLOB_NAME); std::cout << "set name out" << std::endl; network->markOutput(*fc1->getOutput(0)); // Build engine builder->setMaxBatchSize(maxBatchSize); 
config->setMaxWorkspaceSize(1 << 20); ICudaEngine* engine = builder->buildEngineWithConfig(*network, *config); std::cout << "build out" << std::endl; // Don't need the network any more network->destroy(); // Release host memory for (auto& mem : weightMap) { free((void*) (mem.second.values)); } return engine; } void APIToModel(unsigned int maxBatchSize, IHostMemory** modelStream) { // Create builder IBuilder* builder = createInferBuilder(gLogger); IBuilderConfig* config = builder->createBuilderConfig(); // Create model to populate the network, then set the outputs and create an engine ICudaEngine* engine = createEngine(maxBatchSize, builder, config, DataType::kFLOAT); assert(engine != nullptr); // Serialize the engine (*modelStream) = engine->serialize(); // Close everything down engine->destroy(); builder->destroy(); config->destroy(); } void doInference(IExecutionContext& context, float* input, float* output, int batchSize) { const ICudaEngine& engine = context.getEngine(); // Pointers to input and output device buffers to pass to engine. // Engine requires exactly IEngine::getNbBindings() number of buffers. assert(engine.getNbBindings() == 2); void* buffers[2]; // In order to bind the buffers, we need to know the names of the input and output tensors. 
// Note that indices are guaranteed to be less than IEngine::getNbBindings() const int inputIndex = engine.getBindingIndex(INPUT_BLOB_NAME); const int outputIndex = engine.getBindingIndex(OUTPUT_BLOB_NAME); // Create GPU buffers on device CHECK(cudaMalloc(&buffers[inputIndex], batchSize * 3 * INPUT_H * INPUT_W * sizeof(float))); CHECK(cudaMalloc(&buffers[outputIndex], batchSize * OUTPUT_SIZE * sizeof(float))); // Create stream cudaStream_t stream; CHECK(cudaStreamCreate(&stream)); // DMA input batch data to device, infer on the batch asynchronously, and DMA output back to host CHECK(cudaMemcpyAsync(buffers[inputIndex], input, batchSize * 3 * INPUT_H * INPUT_W * sizeof(float), cudaMemcpyHostToDevice, stream)); context.enqueue(batchSize, buffers, stream, nullptr); CHECK(cudaMemcpyAsync(output, buffers[outputIndex], batchSize * OUTPUT_SIZE * sizeof(float), cudaMemcpyDeviceToHost, stream)); cudaStreamSynchronize(stream); // Release stream and buffers cudaStreamDestroy(stream); CHECK(cudaFree(buffers[inputIndex])); CHECK(cudaFree(buffers[outputIndex])); } int main(int argc, char** argv) { if (argc != 2) { std::cerr << "arguments not right!" 
<< std::endl; std::cerr << "./wideresnet -s // serialize model to plan file" << std::endl; std::cerr << "./wideresnet -d // deserialize plan file and run inference" << std::endl; return -1; } // create a model using the API directly and serialize it to a stream char *trtModelStream{nullptr}; size_t size{0}; if (std::string(argv[1]) == "-s") { IHostMemory* modelStream{nullptr}; APIToModel(1, &modelStream); assert(modelStream != nullptr); std::ofstream p("wideresnet50.engine", std::ios::binary); if (!p) { std::cerr << "could not open plan output file" << std::endl; return -1; } p.write(reinterpret_cast(modelStream->data()), modelStream->size()); modelStream->destroy(); return 1; } else if (std::string(argv[1]) == "-d") { std::ifstream file("wideresnet50.engine", std::ios::binary); if (file.good()) { file.seekg(0, file.end); size = file.tellg(); file.seekg(0, file.beg); trtModelStream = new char[size]; assert(trtModelStream); file.read(trtModelStream, size); file.close(); } } else { return -1; } // Subtract mean from image static float data[3 * INPUT_H * INPUT_W]; for (int i = 0; i < 3 * INPUT_H * INPUT_W; i++) data[i] = 1.0; IRuntime* runtime = createInferRuntime(gLogger); assert(runtime != nullptr); ICudaEngine* engine = runtime->deserializeCudaEngine(trtModelStream, size, nullptr); assert(engine != nullptr); IExecutionContext* context = engine->createExecutionContext(); assert(context != nullptr); delete[] trtModelStream; // Run inference static float prob[OUTPUT_SIZE]; for (int i = 0; i < 100; i++) { auto start = std::chrono::system_clock::now(); doInference(*context, data, prob, 1); auto end = std::chrono::system_clock::now(); std::cout << std::chrono::duration_cast(end - start).count() << "ms" << std::endl; } // Destroy the engine context->destroy(); engine->destroy(); runtime->destroy(); // Print histogram of the output distribution std::cout << "\nOutput:\n\n"; for (unsigned int i = 0; i < 10; i++) { std::cout << prob[i] << ", "; } std::cout << std::endl; for 
    (unsigned int i = 0; i < 10; i++)
    {
        std::cout << prob[OUTPUT_SIZE - 10 + i] << ", ";
    }
    std::cout << std::endl;

    return 0;
}

================================================
FILE: retinaface/CMakeLists.txt
================================================
# Build script for the RetinaFace sample: one CUDA shared library
# (decodeplugin) and two executables (retina_r50, retina_mnet) that link it.
cmake_minimum_required(VERSION 2.6)

project(retinaface)

add_definitions(-std=c++11)

# NOTE(review): with only two arguments, "OFF" is taken as the option's help
# text rather than its initial value - confirm this is intended.
option(CUDA_USE_STATIC_CUDA_RUNTIME OFF)
set(CMAKE_CXX_STANDARD 11)
set(CMAKE_BUILD_TYPE Debug)

find_package(CUDA REQUIRED)

# Pick the CUDA include/lib directories by target processor.
if (CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64")
    message("embed_platform on")
    include_directories(/usr/local/cuda/targets/aarch64-linux/include)
    link_directories(/usr/local/cuda/targets/aarch64-linux/lib)
else()
    message("embed_platform off")
    include_directories(/usr/local/cuda/include)
    link_directories(/usr/local/cuda/lib64)
endif()

set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11 -Wall -Ofast -Wfatal-errors -D_MWAITXINTRIN_H_INCLUDED")

# Decode plugin, built from decode.cu as a shared library.
cuda_add_library(decodeplugin SHARED ${PROJECT_SOURCE_DIR}/decode.cu)
target_link_libraries(decodeplugin nvinfer cudart)

find_package(OpenCV)
include_directories(${OpenCV_INCLUDE_DIRS})

# Both executables share calibrator.cpp and link the same libraries.
add_executable(retina_r50 ${PROJECT_SOURCE_DIR}/calibrator.cpp ${PROJECT_SOURCE_DIR}/retina_r50.cpp)
target_link_libraries(retina_r50 nvinfer)
target_link_libraries(retina_r50 cudart)
target_link_libraries(retina_r50 decodeplugin)
target_link_libraries(retina_r50 ${OpenCV_LIBRARIES})

add_executable(retina_mnet ${PROJECT_SOURCE_DIR}/calibrator.cpp ${PROJECT_SOURCE_DIR}/retina_mnet.cpp)
target_link_libraries(retina_mnet nvinfer)
target_link_libraries(retina_mnet cudart)
target_link_libraries(retina_mnet decodeplugin)
target_link_libraries(retina_mnet ${OpenCV_LIBRARIES})

add_definitions(-O2 -pthread)

================================================
FILE: retinaface/README.md
================================================
# RetinaFace

The pytorch implementation is [biubug6/Pytorch_Retinaface](https://github.com/biubug6/Pytorch_Retinaface), I forked it into
[wang-xinyu/Pytorch_Retinaface](https://github.com/wang-xinyu/Pytorch_Retinaface) and added genwts.py.

This branch uses the TensorRT 7 API; the branch [trt4->retinaface](https://github.com/wang-xinyu/tensorrtx/tree/trt4/retinaface) uses TensorRT 4.

## Config

- Input shape `INPUT_H`, `INPUT_W` is defined in `decode.h`
- INT8/FP16/FP32 can be selected by the macro `USE_FP16` or `USE_INT8` or `USE_FP32` in `retina_r50.cpp`
- GPU id can be selected by the macro `DEVICE` in `retina_r50.cpp`
- Batch size can be selected by the macro `BATCHSIZE` in `retina_r50.cpp`

## Run

The following describes how to run `retina_r50`. `retina_mnet` is nearly the same: just generate `retinaface.wts` with `mobilenet0.25_Final.pth` and run `retina_mnet`.

1. Generate retinaface.wts from the pytorch implementation https://github.com/wang-xinyu/Pytorch_Retinaface

```
git clone https://github.com/wang-xinyu/Pytorch_Retinaface.git
// download its weights 'Resnet50_Final.pth', put it in Pytorch_Retinaface/weights
cd Pytorch_Retinaface
python detect.py --save_model
python genwts.py
// a file 'retinaface.wts' will be generated.
```

2. Put retinaface.wts into tensorrtx/retinaface, build and run

```
git clone https://github.com/wang-xinyu/tensorrtx.git
cd tensorrtx/retinaface
// put retinaface.wts here
mkdir build
cd build
cmake ..
make
sudo ./retina_r50 -s // build and serialize model to file i.e. 'retina_r50.engine'
wget https://github.com/Tencent/FaceDetection-DSFD/raw/master/data/worlds-largest-selfie.jpg
sudo ./retina_r50 -d // deserialize model file and run inference.
```

3. Check the generated images, as follows. 0_result.jpg

4. We also provide a python wrapper

```
// install python-tensorrt, pycuda, etc.
// ensure the retina_r50.engine and libdecodeplugin.so have been built
python retinaface_trt.py
```

# INT8 Quantization

1. Prepare calibration images; you can randomly select about 1000 images from your training set.
   For widerface, you can also download my calibration images `widerface_calib` from [GoogleDrive](https://drive.google.com/drive/folders/1s7jE9DtOngZMzJC1uL307J2MiaGwdRSI?usp=sharing) or [BaiduPan](https://pan.baidu.com/s/1GOm_-JobpyLMAqZWCDUhKg) (pwd: a9wh).

2. Unzip it in retinaface/build.

3. Set the macro `USE_INT8` in retina_r50.cpp and run make.

4. Serialize the model and test.

## More Information Check the readme in [home page.](https://github.com/wang-xinyu/tensorrtx) ================================================ FILE: retinaface/calibrator.cpp ================================================ #include #include #include #include #include "calibrator.h" #include "cuda_runtime_api.h" #include "common.hpp" Int8EntropyCalibrator2::Int8EntropyCalibrator2(int batchsize, int input_w, int input_h, const char* img_dir, const char* calib_table_name, const char* input_blob_name, bool read_cache) : batchsize_(batchsize) , input_w_(input_w) , input_h_(input_h) , img_idx_(0) , img_dir_(img_dir) , calib_table_name_(calib_table_name) , input_blob_name_(input_blob_name) , read_cache_(read_cache) { input_count_ = 3 * input_w * input_h * batchsize; CHECK(cudaMalloc(&device_input_, input_count_ * sizeof(float))); read_files_in_dir(img_dir, img_files_); } Int8EntropyCalibrator2::~Int8EntropyCalibrator2() { CHECK(cudaFree(device_input_)); } int Int8EntropyCalibrator2::getBatchSize() const TRT_NOEXCEPT { return batchsize_; } bool Int8EntropyCalibrator2::getBatch(void* bindings[], const char* names[], int nbBindings) TRT_NOEXCEPT { if (img_idx_ + batchsize_ > (int)img_files_.size()) { return false; } std::vector input_imgs_; for (int i = img_idx_; i < img_idx_ + batchsize_; i++) { std::cout << img_files_[i] << " " << i << std::endl; cv::Mat temp = cv::imread(img_dir_ + img_files_[i]); if (temp.empty()){ std::cerr << "Fatal error: image cannot open!" 
<< std::endl; return false; } cv::Mat pr_img = preprocess_img(temp, input_w_, input_h_); input_imgs_.push_back(pr_img); } img_idx_ += batchsize_; cv::Mat blob = cv::dnn::blobFromImages(input_imgs_, 1.0, cv::Size(input_w_, input_h_), cv::Scalar(104, 117, 123), false, false); CHECK(cudaMemcpy(device_input_, blob.ptr(0), input_count_ * sizeof(float), cudaMemcpyHostToDevice)); assert(!strcmp(names[0], input_blob_name_)); bindings[0] = device_input_; return true; } const void* Int8EntropyCalibrator2::readCalibrationCache(size_t& length) TRT_NOEXCEPT { std::cout << "reading calib cache: " << calib_table_name_ << std::endl; calib_cache_.clear(); std::ifstream input(calib_table_name_, std::ios::binary); input >> std::noskipws; if (read_cache_ && input.good()) { std::copy(std::istream_iterator(input), std::istream_iterator(), std::back_inserter(calib_cache_)); } length = calib_cache_.size(); return length ? calib_cache_.data() : nullptr; } void Int8EntropyCalibrator2::writeCalibrationCache(const void* cache, size_t length) TRT_NOEXCEPT { std::cout << "writing calib cache: " << calib_table_name_ << " size: " << length << std::endl; std::ofstream output(calib_table_name_, std::ios::binary); output.write(reinterpret_cast(cache), length); } ================================================ FILE: retinaface/calibrator.h ================================================ #ifndef ENTROPY_CALIBRATOR_H #define ENTROPY_CALIBRATOR_H #include "NvInfer.h" #include #include #include "macros.h" //! \class Int8EntropyCalibrator2 //! //! \brief Implements Entropy calibrator 2. //! CalibrationAlgoType is kENTROPY_CALIBRATION_2. //! 
class Int8EntropyCalibrator2 : public nvinfer1::IInt8EntropyCalibrator2 { public: Int8EntropyCalibrator2(int batchsize, int input_w, int input_h, const char* img_dir, const char* calib_table_name, const char* input_blob_name, bool read_cache = true); virtual ~Int8EntropyCalibrator2(); int getBatchSize() const TRT_NOEXCEPT override; bool getBatch(void* bindings[], const char* names[], int nbBindings) TRT_NOEXCEPT override; const void* readCalibrationCache(size_t& length) TRT_NOEXCEPT override; void writeCalibrationCache(const void* cache, size_t length) TRT_NOEXCEPT override; private: int batchsize_; int input_w_; int input_h_; int img_idx_; std::string img_dir_; std::vector img_files_; size_t input_count_; std::string calib_table_name_; const char* input_blob_name_; bool read_cache_; void* device_input_; std::vector calib_cache_; }; #endif // ENTROPY_CALIBRATOR_H ================================================ FILE: retinaface/common.hpp ================================================ #ifndef RETINAFACE_COMMON_H_ #define RETINAFACE_COMMON_H_ #include #include #include "NvInfer.h" #include "decode.h" using namespace nvinfer1; #define CHECK(status) \ do\ {\ auto ret = (status);\ if (ret != 0)\ {\ std::cerr << "Cuda failure: " << ret << std::endl;\ abort();\ }\ } while (0) static inline cv::Mat preprocess_img(cv::Mat& img, int input_w, int input_h) { int w, h, x, y; float r_w = input_w / (img.cols*1.0); float r_h = input_h / (img.rows*1.0); if (r_h > r_w) { w = input_w; h = r_w * img.rows; x = 0; y = (input_h - h) / 2; } else { w = r_h * img.cols; h = input_h; x = (input_w - w) / 2; y = 0; } cv::Mat re(h, w, CV_8UC3); cv::resize(img, re, re.size(), 0, 0, cv::INTER_LINEAR); cv::Mat out(input_h, input_w, CV_8UC3, cv::Scalar(128, 128, 128)); re.copyTo(out(cv::Rect(x, y, re.cols, re.rows))); return out; } static inline int read_files_in_dir(const char *p_dir_name, std::vector &file_names) { DIR *p_dir = opendir(p_dir_name); if (p_dir == nullptr) { return -1; } struct 
dirent* p_file = nullptr; while ((p_file = readdir(p_dir)) != nullptr) { if (strcmp(p_file->d_name, ".") != 0 && strcmp(p_file->d_name, "..") != 0) { //std::string cur_file_name(p_dir_name); //cur_file_name += "/"; //cur_file_name += p_file->d_name; std::string cur_file_name(p_file->d_name); file_names.push_back(cur_file_name); } } closedir(p_dir); return 0; } static inline cv::Rect get_rect_adapt_landmark(cv::Mat& img, int input_w, int input_h, float bbox[4], float lmk[10]) { int l, r, t, b; float r_w = input_w / (img.cols * 1.0); float r_h = input_h / (img.rows * 1.0); if (r_h > r_w) { l = bbox[0] / r_w; r = bbox[2] / r_w; t = (bbox[1] - (input_h - r_w * img.rows) / 2) / r_w; b = (bbox[3] - (input_h - r_w * img.rows) / 2) / r_w; for (int i = 0; i < 10; i += 2) { lmk[i] /= r_w; lmk[i + 1] = (lmk[i + 1] - (input_h - r_w * img.rows) / 2) / r_w; } } else { l = (bbox[0] - (input_w - r_h * img.cols) / 2) / r_h; r = (bbox[2] - (input_w - r_h * img.cols) / 2) / r_h; t = bbox[1] / r_h; b = bbox[3] / r_h; for (int i = 0; i < 10; i += 2) { lmk[i] = (lmk[i] - (input_w - r_h * img.cols) / 2) / r_h; lmk[i + 1] /= r_h; } } return cv::Rect(l, t, r-l, b-t); } static float iou(float lbox[4], float rbox[4]) { float interBox[] = { std::max(lbox[0], rbox[0]), //left std::min(lbox[2], rbox[2]), //right std::max(lbox[1], rbox[1]), //top std::min(lbox[3], rbox[3]), //bottom }; if(interBox[2] > interBox[3] || interBox[0] > interBox[1]) return 0.0f; float interBoxS = (interBox[1] - interBox[0]) * (interBox[3] - interBox[2]); return interBoxS / ((lbox[2] - lbox[0]) * (lbox[3] - lbox[1]) + (rbox[2] - rbox[0]) * (rbox[3] - rbox[1]) -interBoxS + 0.000001f); } static bool cmp(const decodeplugin::Detection& a, const decodeplugin::Detection& b) { return a.class_confidence > b.class_confidence; } static inline void nms(std::vector& res, float *output, float nms_thresh = 0.4) { std::vector dets; for (int i = 0; i < output[0]; i++) { if (output[15 * i + 1 + 4] <= 0.1) continue; 
decodeplugin::Detection det; memcpy(&det, &output[15 * i + 1], sizeof(decodeplugin::Detection)); dets.push_back(det); } std::sort(dets.begin(), dets.end(), cmp); for (size_t m = 0; m < dets.size(); ++m) { auto& item = dets[m]; res.push_back(item); //std::cout << item.class_confidence << " bbox " << item.bbox[0] << ", " << item.bbox[1] << ", " << item.bbox[2] << ", " << item.bbox[3] << std::endl; for (size_t n = m + 1; n < dets.size(); ++n) { if (iou(item.bbox, dets[n].bbox) > nms_thresh) { dets.erase(dets.begin()+n); --n; } } } } // Load weights from files // TensorRT weight files have a simple space delimited format: // [type] [size] static inline std::map loadWeights(const std::string file) { std::cout << "Loading weights: " << file << std::endl; std::map weightMap; // Open weights file std::ifstream input(file); assert(input.is_open() && "Unable to load weight file."); // Read number of weight blobs int32_t count; input >> count; assert(count > 0 && "Invalid weight map file."); while (count--) { Weights wt{DataType::kFLOAT, nullptr, 0}; uint32_t size; // Read name and type of blob std::string name; input >> name >> std::dec >> size; wt.type = DataType::kFLOAT; // Load blob uint32_t* val = reinterpret_cast(malloc(sizeof(val) * size)); for (uint32_t x = 0, y = size; x < y; ++x) { input >> std::hex >> val[x]; } wt.values = val; wt.count = size; weightMap[name] = wt; } return weightMap; } static inline Weights getWeights(std::map& weightMap, std::string key) { if (weightMap.count(key) != 1) { std::cerr << key << " not existed in weight map, fatal error!!!" 
<< std::endl; exit(-1); } return weightMap[key]; } static inline IScaleLayer* addBatchNorm2d(INetworkDefinition *network, std::map& weightMap, ITensor& input, std::string lname, float eps) { float *gamma = (float*)weightMap[lname + ".weight"].values; float *beta = (float*)weightMap[lname + ".bias"].values; float *mean = (float*)weightMap[lname + ".running_mean"].values; float *var = (float*)weightMap[lname + ".running_var"].values; int len = weightMap[lname + ".running_var"].count; float *scval = reinterpret_cast(malloc(sizeof(float) * len)); for (int i = 0; i < len; i++) { scval[i] = gamma[i] / sqrt(var[i] + eps); } Weights scale{DataType::kFLOAT, scval, len}; float *shval = reinterpret_cast(malloc(sizeof(float) * len)); for (int i = 0; i < len; i++) { shval[i] = beta[i] - mean[i] * gamma[i] / sqrt(var[i] + eps); } Weights shift{DataType::kFLOAT, shval, len}; float *pval = reinterpret_cast(malloc(sizeof(float) * len)); for (int i = 0; i < len; i++) { pval[i] = 1.0; } Weights power{DataType::kFLOAT, pval, len}; weightMap[lname + ".scale"] = scale; weightMap[lname + ".shift"] = shift; weightMap[lname + ".power"] = power; IScaleLayer* scale_1 = network->addScale(input, ScaleMode::kCHANNEL, shift, scale, power); assert(scale_1); return scale_1; } #endif ================================================ FILE: retinaface/decode.cu ================================================ #include "decode.h" #include "stdio.h" namespace nvinfer1 { DecodePlugin::DecodePlugin() { } DecodePlugin::~DecodePlugin() { } // create the plugin at runtime from a byte stream DecodePlugin::DecodePlugin(const void* data, size_t length) { } void DecodePlugin::serialize(void* buffer) const TRT_NOEXCEPT { } size_t DecodePlugin::getSerializationSize() const TRT_NOEXCEPT { return 0; } int DecodePlugin::initialize() TRT_NOEXCEPT { return 0; } Dims DecodePlugin::getOutputDimensions(int index, const Dims* inputs, int nbInputDims) TRT_NOEXCEPT { //output the result to channel int totalCount = 0; 
totalCount += decodeplugin::INPUT_H / 8 * decodeplugin::INPUT_W / 8 * 2 * sizeof(decodeplugin::Detection) / sizeof(float); totalCount += decodeplugin::INPUT_H / 16 * decodeplugin::INPUT_W / 16 * 2 * sizeof(decodeplugin::Detection) / sizeof(float); totalCount += decodeplugin::INPUT_H / 32 * decodeplugin::INPUT_W / 32 * 2 * sizeof(decodeplugin::Detection) / sizeof(float); return Dims3(totalCount + 1, 1, 1); } // Set plugin namespace void DecodePlugin::setPluginNamespace(const char* pluginNamespace) TRT_NOEXCEPT { mPluginNamespace = pluginNamespace; } const char* DecodePlugin::getPluginNamespace() const TRT_NOEXCEPT { return mPluginNamespace; } // Return the DataType of the plugin output at the requested index DataType DecodePlugin::getOutputDataType(int index, const nvinfer1::DataType* inputTypes, int nbInputs) const TRT_NOEXCEPT { return DataType::kFLOAT; } // Return true if output tensor is broadcast across a batch. bool DecodePlugin::isOutputBroadcastAcrossBatch(int outputIndex, const bool* inputIsBroadcasted, int nbInputs) const TRT_NOEXCEPT { return false; } // Return true if plugin can use input that is broadcast across batch without replication. bool DecodePlugin::canBroadcastInputAcrossBatch(int inputIndex) const TRT_NOEXCEPT { return false; } void DecodePlugin::configurePlugin(const PluginTensorDesc* in, int nbInput, const PluginTensorDesc* out, int nbOutput) TRT_NOEXCEPT { } // Attach the plugin object to an execution context and grant the plugin the access to some context resource. void DecodePlugin::attachToContext(cudnnContext* cudnnContext, cublasContext* cublasContext, IGpuAllocator* gpuAllocator) TRT_NOEXCEPT { } // Detach the plugin object from its execution context. 
void DecodePlugin::detachFromContext() TRT_NOEXCEPT {} const char* DecodePlugin::getPluginType() const TRT_NOEXCEPT { return "Decode_TRT"; } const char* DecodePlugin::getPluginVersion() const TRT_NOEXCEPT { return "1"; } void DecodePlugin::destroy() TRT_NOEXCEPT { delete this; } // Clone the plugin IPluginV2IOExt* DecodePlugin::clone() const TRT_NOEXCEPT { DecodePlugin *p = new DecodePlugin(); p->setPluginNamespace(mPluginNamespace); return p; } __device__ float Logist(float data){ return 1./(1. + expf(-data)); }; __global__ void CalDetection(const float *input, float *output, int num_elem, int step, int anchor, int output_elem) { int idx = threadIdx.x + blockDim.x * blockIdx.x; if (idx >= num_elem) return; int h = decodeplugin::INPUT_H / step; int w = decodeplugin::INPUT_W / step; int total_grid = h * w; int bn_idx = idx / total_grid; idx = idx - bn_idx * total_grid; int y = idx / w; int x = idx % w; const float* cur_input = input + bn_idx * (4 + 2 + 10) * 2 * total_grid; const float *bbox_reg = &cur_input[0]; const float *cls_reg = &cur_input[2 * 4 * total_grid]; const float *lmk_reg = &cur_input[2 * 4 * total_grid + 2 * 2 * total_grid]; for (int k = 0; k < 2; ++k) { float conf1 = cls_reg[idx + k * total_grid * 2]; float conf2 = cls_reg[idx + k * total_grid * 2 + total_grid]; conf2 = expf(conf2) / (expf(conf1) + expf(conf2)); if (conf2 <= 0.02) continue; float *res_count = output + bn_idx * output_elem; int count = (int)atomicAdd(res_count, 1); char* data = (char *)res_count + sizeof(float) + count * sizeof(decodeplugin::Detection); decodeplugin::Detection* det = (decodeplugin::Detection*)(data); float prior[4]; prior[0] = ((float)x + 0.5) / w; prior[1] = ((float)y + 0.5) / h; prior[2] = (float)anchor * (k + 1) / decodeplugin::INPUT_W; prior[3] = (float)anchor * (k + 1) / decodeplugin::INPUT_H; //Location det->bbox[0] = prior[0] + bbox_reg[idx + k * total_grid * 4] * 0.1 * prior[2]; det->bbox[1] = prior[1] + bbox_reg[idx + k * total_grid * 4 + total_grid] * 0.1 * 
prior[3]; det->bbox[2] = prior[2] * expf(bbox_reg[idx + k * total_grid * 4 + total_grid * 2] * 0.2); det->bbox[3] = prior[3] * expf(bbox_reg[idx + k * total_grid * 4 + total_grid * 3] * 0.2); det->bbox[0] -= det->bbox[2] / 2; det->bbox[1] -= det->bbox[3] / 2; det->bbox[2] += det->bbox[0]; det->bbox[3] += det->bbox[1]; det->bbox[0] *= decodeplugin::INPUT_W; det->bbox[1] *= decodeplugin::INPUT_H; det->bbox[2] *= decodeplugin::INPUT_W; det->bbox[3] *= decodeplugin::INPUT_H; det->class_confidence = conf2; for (int i = 0; i < 10; i += 2) { det->landmark[i] = prior[0] + lmk_reg[idx + k * total_grid * 10 + total_grid * i] * 0.1 * prior[2]; det->landmark[i+1] = prior[1] + lmk_reg[idx + k * total_grid * 10 + total_grid * (i + 1)] * 0.1 * prior[3]; det->landmark[i] *= decodeplugin::INPUT_W; det->landmark[i+1] *= decodeplugin::INPUT_H; } } } void DecodePlugin::forwardGpu(const float *const * inputs, float * output, cudaStream_t stream, int batchSize) { int num_elem = 0; int base_step = 8; int base_anchor = 16; int thread_count; int totalCount = 1; totalCount += decodeplugin::INPUT_H / 8 * decodeplugin::INPUT_W / 8 * 2 * sizeof(decodeplugin::Detection) / sizeof(float); totalCount += decodeplugin::INPUT_H / 16 * decodeplugin::INPUT_W / 16 * 2 * sizeof(decodeplugin::Detection) / sizeof(float); totalCount += decodeplugin::INPUT_H / 32 * decodeplugin::INPUT_W / 32 * 2 * sizeof(decodeplugin::Detection) / sizeof(float); for(int idx = 0 ; idx < batchSize; ++idx) { cudaMemsetAsync(output + idx * totalCount, 0, sizeof(float), stream); } for (unsigned int i = 0; i < 3; ++i) { num_elem = batchSize * decodeplugin::INPUT_H / base_step * decodeplugin::INPUT_W / base_step; thread_count = (num_elem < thread_count_) ? 
num_elem : thread_count_; CalDetection<<< (num_elem + thread_count - 1) / thread_count, thread_count, 0, stream>>> (inputs[i], output, num_elem, base_step, base_anchor, totalCount); base_step *= 2; base_anchor *= 4; } } int DecodePlugin::enqueue(int batchSize, const void*const * inputs, void*TRT_CONST_ENQUEUE* outputs, void* workspace, cudaStream_t stream) TRT_NOEXCEPT { //GPU //CUDA_CHECK(cudaStreamSynchronize(stream)); forwardGpu((const float *const *)inputs, (float *)outputs[0], stream, batchSize); return 0; }; PluginFieldCollection DecodePluginCreator::mFC{}; std::vector DecodePluginCreator::mPluginAttributes; DecodePluginCreator::DecodePluginCreator() { mPluginAttributes.clear(); mFC.nbFields = mPluginAttributes.size(); mFC.fields = mPluginAttributes.data(); } const char* DecodePluginCreator::getPluginName() const TRT_NOEXCEPT { return "Decode_TRT"; } const char* DecodePluginCreator::getPluginVersion() const TRT_NOEXCEPT { return "1"; } const PluginFieldCollection* DecodePluginCreator::getFieldNames() TRT_NOEXCEPT { return &mFC; } IPluginV2IOExt* DecodePluginCreator::createPlugin(const char* name, const PluginFieldCollection* fc) TRT_NOEXCEPT { DecodePlugin* obj = new DecodePlugin(); obj->setPluginNamespace(mNamespace.c_str()); return obj; } IPluginV2IOExt* DecodePluginCreator::deserializePlugin(const char* name, const void* serialData, size_t serialLength) TRT_NOEXCEPT { // This object will be deleted when the network is destroyed, which will // call PReluPlugin::destroy() DecodePlugin* obj = new DecodePlugin(serialData, serialLength); obj->setPluginNamespace(mNamespace.c_str()); return obj; } } ================================================ FILE: retinaface/decode.h ================================================ #ifndef _DECODE_CU_H #define _DECODE_CU_H #include #include #include "NvInfer.h" #include "macros.h" namespace decodeplugin { struct alignas(float) Detection{ float bbox[4]; //x1 y1 x2 y2 float class_confidence; float landmark[10]; }; static 
const int INPUT_H = 480; static const int INPUT_W = 640; } namespace nvinfer1 { class DecodePlugin: public IPluginV2IOExt { public: DecodePlugin(); DecodePlugin(const void* data, size_t length); ~DecodePlugin(); int getNbOutputs() const TRT_NOEXCEPT override { return 1; } Dims getOutputDimensions(int index, const Dims* inputs, int nbInputDims) TRT_NOEXCEPT override; int initialize() TRT_NOEXCEPT override; virtual void terminate() TRT_NOEXCEPT override {}; virtual size_t getWorkspaceSize(int maxBatchSize) const TRT_NOEXCEPT override { return 0;} virtual int enqueue(int batchSize, const void*const * inputs, void*TRT_CONST_ENQUEUE* outputs, void* workspace, cudaStream_t stream) TRT_NOEXCEPT override; virtual size_t getSerializationSize() const TRT_NOEXCEPT override; virtual void serialize(void* buffer) const TRT_NOEXCEPT override; bool supportsFormatCombination(int pos, const PluginTensorDesc* inOut, int nbInputs, int nbOutputs) const TRT_NOEXCEPT override { return inOut[pos].format == TensorFormat::kLINEAR && inOut[pos].type == DataType::kFLOAT; } const char* getPluginType() const TRT_NOEXCEPT override; const char* getPluginVersion() const TRT_NOEXCEPT override; void destroy() TRT_NOEXCEPT override; IPluginV2IOExt* clone() const TRT_NOEXCEPT override; void setPluginNamespace(const char* pluginNamespace) TRT_NOEXCEPT override; const char* getPluginNamespace() const TRT_NOEXCEPT override; DataType getOutputDataType(int index, const nvinfer1::DataType* inputTypes, int nbInputs) const TRT_NOEXCEPT override; bool isOutputBroadcastAcrossBatch(int outputIndex, const bool* inputIsBroadcasted, int nbInputs) const TRT_NOEXCEPT override; bool canBroadcastInputAcrossBatch(int inputIndex) const TRT_NOEXCEPT override; void attachToContext( cudnnContext* cudnnContext, cublasContext* cublasContext, IGpuAllocator* gpuAllocator) TRT_NOEXCEPT override; void configurePlugin(const PluginTensorDesc* in, int nbInput, const PluginTensorDesc* out, int nbOutput) TRT_NOEXCEPT override; void 
detachFromContext() TRT_NOEXCEPT override; int input_size_; private: void forwardGpu(const float *const * inputs, float* output, cudaStream_t stream, int batchSize = 1); int thread_count_ = 256; const char* mPluginNamespace; }; class DecodePluginCreator : public IPluginCreator { public: DecodePluginCreator(); ~DecodePluginCreator() override = default; const char* getPluginName() const TRT_NOEXCEPT override; const char* getPluginVersion() const TRT_NOEXCEPT override; const PluginFieldCollection* getFieldNames() TRT_NOEXCEPT override; IPluginV2IOExt* createPlugin(const char* name, const PluginFieldCollection* fc) TRT_NOEXCEPT override; IPluginV2IOExt* deserializePlugin(const char* name, const void* serialData, size_t serialLength) TRT_NOEXCEPT override; void setPluginNamespace(const char* libNamespace) TRT_NOEXCEPT override { mNamespace = libNamespace; } const char* getPluginNamespace() const TRT_NOEXCEPT override { return mNamespace.c_str(); } private: std::string mNamespace; static PluginFieldCollection mFC; static std::vector mPluginAttributes; }; REGISTER_TENSORRT_PLUGIN(DecodePluginCreator); }; #endif ================================================ FILE: retinaface/logging.h ================================================ /* * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
*/ #ifndef TENSORRT_LOGGING_H #define TENSORRT_LOGGING_H #include "NvInferRuntimeCommon.h" #include #include #include #include #include #include #include #include "macros.h" using Severity = nvinfer1::ILogger::Severity; class LogStreamConsumerBuffer : public std::stringbuf { public: LogStreamConsumerBuffer(std::ostream& stream, const std::string& prefix, bool shouldLog) : mOutput(stream) , mPrefix(prefix) , mShouldLog(shouldLog) { } LogStreamConsumerBuffer(LogStreamConsumerBuffer&& other) : mOutput(other.mOutput) { } ~LogStreamConsumerBuffer() { // std::streambuf::pbase() gives a pointer to the beginning of the buffered part of the output sequence // std::streambuf::pptr() gives a pointer to the current position of the output sequence // if the pointer to the beginning is not equal to the pointer to the current position, // call putOutput() to log the output to the stream if (pbase() != pptr()) { putOutput(); } } // synchronizes the stream buffer and returns 0 on success // synchronizing the stream buffer consists of inserting the buffer contents into the stream, // resetting the buffer and flushing the stream virtual int sync() { putOutput(); return 0; } void putOutput() { if (mShouldLog) { // prepend timestamp std::time_t timestamp = std::time(nullptr); tm* tm_local = std::localtime(×tamp); std::cout << "["; std::cout << std::setw(2) << std::setfill('0') << 1 + tm_local->tm_mon << "/"; std::cout << std::setw(2) << std::setfill('0') << tm_local->tm_mday << "/"; std::cout << std::setw(4) << std::setfill('0') << 1900 + tm_local->tm_year << "-"; std::cout << std::setw(2) << std::setfill('0') << tm_local->tm_hour << ":"; std::cout << std::setw(2) << std::setfill('0') << tm_local->tm_min << ":"; std::cout << std::setw(2) << std::setfill('0') << tm_local->tm_sec << "] "; // std::stringbuf::str() gets the string contents of the buffer // insert the buffer contents pre-appended by the appropriate prefix into the stream mOutput << mPrefix << str(); // set the buffer to 
empty str(""); // flush the stream mOutput.flush(); } } void setShouldLog(bool shouldLog) { mShouldLog = shouldLog; } private: std::ostream& mOutput; std::string mPrefix; bool mShouldLog; }; //! //! \class LogStreamConsumerBase //! \brief Convenience object used to initialize LogStreamConsumerBuffer before std::ostream in LogStreamConsumer //! class LogStreamConsumerBase { public: LogStreamConsumerBase(std::ostream& stream, const std::string& prefix, bool shouldLog) : mBuffer(stream, prefix, shouldLog) { } protected: LogStreamConsumerBuffer mBuffer; }; //! //! \class LogStreamConsumer //! \brief Convenience object used to facilitate use of C++ stream syntax when logging messages. //! Order of base classes is LogStreamConsumerBase and then std::ostream. //! This is because the LogStreamConsumerBase class is used to initialize the LogStreamConsumerBuffer member field //! in LogStreamConsumer and then the address of the buffer is passed to std::ostream. //! This is necessary to prevent the address of an uninitialized buffer from being passed to std::ostream. //! Please do not change the order of the parent classes. //! class LogStreamConsumer : protected LogStreamConsumerBase, public std::ostream { public: //! \brief Creates a LogStreamConsumer which logs messages with level severity. //! Reportable severity determines if the messages are severe enough to be logged. 
LogStreamConsumer(Severity reportableSeverity, Severity severity) : LogStreamConsumerBase(severityOstream(severity), severityPrefix(severity), severity <= reportableSeverity) , std::ostream(&mBuffer) // links the stream buffer with the stream , mShouldLog(severity <= reportableSeverity) , mSeverity(severity) { } LogStreamConsumer(LogStreamConsumer&& other) : LogStreamConsumerBase(severityOstream(other.mSeverity), severityPrefix(other.mSeverity), other.mShouldLog) , std::ostream(&mBuffer) // links the stream buffer with the stream , mShouldLog(other.mShouldLog) , mSeverity(other.mSeverity) { } void setReportableSeverity(Severity reportableSeverity) { mShouldLog = mSeverity <= reportableSeverity; mBuffer.setShouldLog(mShouldLog); } private: static std::ostream& severityOstream(Severity severity) { return severity >= Severity::kINFO ? std::cout : std::cerr; } static std::string severityPrefix(Severity severity) { switch (severity) { case Severity::kINTERNAL_ERROR: return "[F] "; case Severity::kERROR: return "[E] "; case Severity::kWARNING: return "[W] "; case Severity::kINFO: return "[I] "; case Severity::kVERBOSE: return "[V] "; default: assert(0); return ""; } } bool mShouldLog; Severity mSeverity; }; //! \class Logger //! //! \brief Class which manages logging of TensorRT tools and samples //! //! \details This class provides a common interface for TensorRT tools and samples to log information to the console, //! and supports logging two types of messages: //! //! - Debugging messages with an associated severity (info, warning, error, or internal error/fatal) //! - Test pass/fail messages //! //! The advantage of having all samples use this class for logging as opposed to emitting directly to stdout/stderr is //! that the logic for controlling the verbosity and formatting of sample output is centralized in one location. //! //! In the future, this class could be extended to support dumping test results to a file in some standard format //! 
//! (for example, JUnit XML), and providing additional metadata (e.g. timing the duration of a test run).
//!
//! TODO: For backwards compatibility with existing samples, this class inherits directly from the nvinfer1::ILogger
//! interface, which is problematic since there isn't a clean separation between messages coming from the TensorRT
//! library and messages coming from the sample.
//!
//! In the future (once all samples are updated to use Logger::getTRTLogger() to access the ILogger) we can refactor the
//! class to eliminate the inheritance and instead make the nvinfer1::ILogger implementation a member of the Logger
//! object.

class Logger : public nvinfer1::ILogger
{
public:
    Logger(Severity severity = Severity::kWARNING)
        : mReportableSeverity(severity)
    {
    }

    //!
    //! \enum TestResult
    //! \brief Represents the state of a given test
    //!
    enum class TestResult
    {
        kRUNNING, //!< The test is running
        kPASSED,  //!< The test passed
        kFAILED,  //!< The test failed
        kWAIVED   //!< The test was waived
    };

    //!
    //! \brief Forward-compatible method for retrieving the nvinfer::ILogger associated with this Logger
    //! \return The nvinfer1::ILogger associated with this Logger
    //!
    //! TODO Once all samples are updated to use this method to register the logger with TensorRT,
    //! we can eliminate the inheritance of Logger from ILogger
    //!
    nvinfer1::ILogger& getTRTLogger()
    {
        return *this;
    }

    //!
    //! \brief Implementation of the nvinfer1::ILogger::log() virtual method
    //!
    //! Note samples should not be calling this function directly; it will eventually go away once we eliminate the
    //! inheritance from nvinfer1::ILogger
    //!
    void log(Severity severity, const char* msg) TRT_NOEXCEPT override
    {
        // A temporary consumer is created per message; it emits the text when it is destroyed.
        LogStreamConsumer(mReportableSeverity, severity) << "[TRT] " << std::string(msg) << std::endl;
    }

    //!
    //! \brief Method for controlling the verbosity of logging output
    //!
    //! \param severity The logger will only emit messages that have severity of this level or higher.
    //!
    void setReportableSeverity(Severity severity)
    {
        mReportableSeverity = severity;
    }

    //!
    //! \brief Opaque handle that holds logging information for a particular test
    //!
    //! This object is an opaque handle to information used by the Logger to print test results.
    //! The sample must call Logger::defineTest() in order to obtain a TestAtom that can be used
    //! with Logger::reportTest{Start,End}().
    //!
    class TestAtom
    {
    public:
        TestAtom(TestAtom&&) = default;

    private:
        friend class Logger;

        TestAtom(bool started, const std::string& name, const std::string& cmdline)
            : mStarted(started)
            , mName(name)
            , mCmdline(cmdline)
        {
        }

        bool mStarted;
        std::string mName;
        std::string mCmdline;
    };

    //!
    //! \brief Define a test for logging
    //!
    //! \param[in] name The name of the test.  This should be a string starting with
    //!                  "TensorRT" and containing dot-separated strings containing
    //!                  the characters [A-Za-z0-9_].
    //!                  For example, "TensorRT.sample_googlenet"
    //! \param[in] cmdline The command line used to reproduce the test
    //!
    //! \return a TestAtom that can be used in Logger::reportTest{Start,End}().
    //!
    static TestAtom defineTest(const std::string& name, const std::string& cmdline)
    {
        return TestAtom(false, name, cmdline);
    }

    //!
    //! \brief A convenience overloaded version of defineTest() that accepts an array of command-line arguments
    //!        as input
    //!
    //! \param[in] name The name of the test
    //! \param[in] argc The number of command-line arguments
    //! \param[in] argv The array of command-line arguments (given as C strings)
    //!
    //! \return a TestAtom that can be used in Logger::reportTest{Start,End}().
    static TestAtom defineTest(const std::string& name, int argc, char const* const* argv)
    {
        auto cmdline = genCmdlineString(argc, argv);
        return defineTest(name, cmdline);
    }

    //!
    //! \brief Report that a test has started.
    //!
    //! \pre reportTestStart() has not been called yet for the given testAtom
    //!
    //! \param[in] testAtom The handle to the test that has started
    //!
    static void reportTestStart(TestAtom& testAtom)
    {
        reportTestResult(testAtom, TestResult::kRUNNING);
        assert(!testAtom.mStarted);
        testAtom.mStarted = true;
    }

    //!
    //! \brief Report that a test has ended.
    //!
    //! \pre reportTestStart() has been called for the given testAtom
    //!
    //! \param[in] testAtom The handle to the test that has ended
    //! \param[in] result The result of the test. Should be one of TestResult::kPASSED,
    //!                   TestResult::kFAILED, TestResult::kWAIVED
    //!
    static void reportTestEnd(const TestAtom& testAtom, TestResult result)
    {
        assert(result != TestResult::kRUNNING);
        assert(testAtom.mStarted);
        reportTestResult(testAtom, result);
    }

    //! \brief Report a passed test; returns EXIT_SUCCESS.
    static int reportPass(const TestAtom& testAtom)
    {
        reportTestEnd(testAtom, TestResult::kPASSED);
        return EXIT_SUCCESS;
    }

    //! \brief Report a failed test; returns EXIT_FAILURE.
    static int reportFail(const TestAtom& testAtom)
    {
        reportTestEnd(testAtom, TestResult::kFAILED);
        return EXIT_FAILURE;
    }

    //! \brief Report a waived test; returns EXIT_SUCCESS.
    static int reportWaive(const TestAtom& testAtom)
    {
        reportTestEnd(testAtom, TestResult::kWAIVED);
        return EXIT_SUCCESS;
    }

    //! \brief Report pass or fail depending on \p pass and return the matching exit code.
    static int reportTest(const TestAtom& testAtom, bool pass)
    {
        return pass ? reportPass(testAtom) : reportFail(testAtom);
    }

    Severity getReportableSeverity() const
    {
        return mReportableSeverity;
    }

private:
    //!
    //! \brief returns an appropriate string for prefixing a log message with the given severity
    //!
    static const char* severityPrefix(Severity severity)
    {
        switch (severity)
        {
        case Severity::kINTERNAL_ERROR: return "[F] ";
        case Severity::kERROR: return "[E] ";
        case Severity::kWARNING: return "[W] ";
        case Severity::kINFO: return "[I] ";
        case Severity::kVERBOSE: return "[V] ";
        default: assert(0); return "";
        }
    }

    //!
    //! \brief returns an appropriate string for prefixing a test result message with the given result
    //!
    static const char* testResultString(TestResult result)
    {
        switch (result)
        {
        case TestResult::kRUNNING: return "RUNNING";
        case TestResult::kPASSED: return "PASSED";
        case TestResult::kFAILED: return "FAILED";
        case TestResult::kWAIVED: return "WAIVED";
        default: assert(0); return "";
        }
    }

    //!
    //! \brief returns an appropriate output stream (cout or cerr) to use with the given severity
    //!
    static std::ostream& severityOstream(Severity severity)
    {
        return severity >= Severity::kINFO ? std::cout : std::cerr;
    }

    //!
    //! \brief method that implements logging test results
    //!
    static void reportTestResult(const TestAtom& testAtom, TestResult result)
    {
        severityOstream(Severity::kINFO) << "&&&& " << testResultString(result) << " " << testAtom.mName << " # "
                                         << testAtom.mCmdline << std::endl;
    }

    //!
    //! \brief generate a command line string from the given (argc, argv) values
    //!
    static std::string genCmdlineString(int argc, char const* const* argv)
    {
        std::stringstream ss;
        for (int i = 0; i < argc; i++)
        {
            if (i > 0)
                ss << " ";
            ss << argv[i];
        }
        return ss.str();
    }

    Severity mReportableSeverity;
};

namespace
{

//!
//! \brief produces a LogStreamConsumer object that can be used to log messages of severity kVERBOSE
//!
//! Example usage:
//!
//!     LOG_VERBOSE(logger) << "hello world" << std::endl;
//!
inline LogStreamConsumer LOG_VERBOSE(const Logger& logger)
{
    return LogStreamConsumer(logger.getReportableSeverity(), Severity::kVERBOSE);
}

//!
//! \brief produces a LogStreamConsumer object that can be used to log messages of severity kINFO
//!
//! Example usage:
//!
//!     LOG_INFO(logger) << "hello world" << std::endl;
//!
inline LogStreamConsumer LOG_INFO(const Logger& logger)
{
    return LogStreamConsumer(logger.getReportableSeverity(), Severity::kINFO);
}

//!
//! \brief produces a LogStreamConsumer object that can be used to log messages of severity kWARNING
//!
//! Example usage:
//!
//!     LOG_WARN(logger) << "hello world" << std::endl;
//!
inline LogStreamConsumer LOG_WARN(const Logger& logger)
{
    return LogStreamConsumer(logger.getReportableSeverity(), Severity::kWARNING);
}

//!
//! \brief produces a LogStreamConsumer object that can be used to log messages of severity kERROR
//!
//! Example usage:
//!
//!     LOG_ERROR(logger) << "hello world" << std::endl;
//!
inline LogStreamConsumer LOG_ERROR(const Logger& logger)
{
    return LogStreamConsumer(logger.getReportableSeverity(), Severity::kERROR);
}

//!
//! \brief produces a LogStreamConsumer object that can be used to log messages of severity kINTERNAL_ERROR
//!        ("fatal" severity)
//!
//! Example usage:
//!
//!     LOG_FATAL(logger) << "hello world" << std::endl;
//!
inline LogStreamConsumer LOG_FATAL(const Logger& logger)
{
    return LogStreamConsumer(logger.getReportableSeverity(), Severity::kINTERNAL_ERROR);
}

} // anonymous namespace

#endif // TENSORRT_LOGGING_H

================================================
FILE: retinaface/macros.h
================================================
#ifndef __MACROS_H
#define __MACROS_H

// Compatibility shims selected on the TensorRT major version: for version 8 and
// later the overrides are declared noexcept and enqueue() takes const output
// pointers; for older versions both macros expand to nothing.
#if NV_TENSORRT_MAJOR >= 8
#define TRT_NOEXCEPT noexcept
#define TRT_CONST_ENQUEUE const
#else
#define TRT_NOEXCEPT
#define TRT_CONST_ENQUEUE
#endif

#endif  // __MACROS_H

================================================
FILE: retinaface/retina_mnet.cpp
================================================
// NOTE(review): the header names of the next five #include directives are missing
// in this copy of the file (angle-bracket contents were lost) - restore them from
// the upstream source before building.
#include
#include
#include
#include
#include
#include "cuda_runtime_api.h"
#include "logging.h"
#include "common.hpp"
#include "calibrator.h"

#define USE_FP16  // set USE_INT8 or USE_FP16 or USE_FP32
#define DEVICE 0  // GPU id
#define BATCH_SIZE 1
#define CONF_THRESH 0.75
#define IOU_THRESH 0.4

// stuff we know about the network and the input/output blobs
static const int INPUT_H = decodeplugin::INPUT_H;  // H, W must be able to be divided by 32.
static const int INPUT_W = decodeplugin::INPUT_W;; static const int OUTPUT_SIZE = (INPUT_H / 8 * INPUT_W / 8 + INPUT_H / 16 * INPUT_W / 16 + INPUT_H / 32 * INPUT_W / 32) * 2 * 15 + 1; const char* INPUT_BLOB_NAME = "data"; const char* OUTPUT_BLOB_NAME = "prob"; static Logger gLogger; ILayer* conv_bn(INetworkDefinition *network, std::map& weightMap, ITensor& input, std::string lname, int oup, int s = 1, float leaky = 0.1) { Weights emptywts{DataType::kFLOAT, nullptr, 0}; IConvolutionLayer* conv1 = network->addConvolutionNd(input, oup, DimsHW{3, 3}, getWeights(weightMap, lname + ".0.weight"), emptywts); assert(conv1); conv1->setStrideNd(DimsHW{s, s}); conv1->setPaddingNd(DimsHW{1, 1}); IScaleLayer* bn1 = addBatchNorm2d(network, weightMap, *conv1->getOutput(0), lname + ".1", 1e-5); auto lr = network->addActivation(*bn1->getOutput(0), ActivationType::kLEAKY_RELU); lr->setAlpha(leaky); assert(lr); return lr; } ILayer* conv_bn_no_relu(INetworkDefinition *network, std::map& weightMap, ITensor& input, std::string lname, int oup, int s = 1) { Weights emptywts{DataType::kFLOAT, nullptr, 0}; IConvolutionLayer* conv1 = network->addConvolutionNd(input, oup, DimsHW{3, 3}, getWeights(weightMap, lname + ".0.weight"), emptywts); assert(conv1); conv1->setStrideNd(DimsHW{s, s}); conv1->setPaddingNd(DimsHW{1, 1}); IScaleLayer* bn1 = addBatchNorm2d(network, weightMap, *conv1->getOutput(0), lname + ".1", 1e-5); return bn1; } ILayer* conv_bn1X1(INetworkDefinition *network, std::map& weightMap, ITensor& input, std::string lname, int oup, int s = 1, float leaky = 0.1) { Weights emptywts{DataType::kFLOAT, nullptr, 0}; IConvolutionLayer* conv1 = network->addConvolutionNd(input, oup, DimsHW{1, 1}, getWeights(weightMap, lname + ".0.weight"), emptywts); assert(conv1); conv1->setStrideNd(DimsHW{s, s}); conv1->setPaddingNd(DimsHW{0, 0}); IScaleLayer* bn1 = addBatchNorm2d(network, weightMap, *conv1->getOutput(0), lname + ".1", 1e-5); auto lr = network->addActivation(*bn1->getOutput(0), 
ActivationType::kLEAKY_RELU); lr->setAlpha(leaky); assert(lr); return lr; } ILayer* conv_dw(INetworkDefinition *network, std::map& weightMap, ITensor& input, std::string lname, int inp, int oup, int s = 1, float leaky = 0.1) { Weights emptywts{DataType::kFLOAT, nullptr, 0}; IConvolutionLayer* conv1 = network->addConvolutionNd(input, inp, DimsHW{3, 3}, getWeights(weightMap, lname + ".0.weight"), emptywts); assert(conv1); conv1->setStrideNd(DimsHW{s, s}); conv1->setPaddingNd(DimsHW{1, 1}); conv1->setNbGroups(inp); IScaleLayer* bn1 = addBatchNorm2d(network, weightMap, *conv1->getOutput(0), lname + ".1", 1e-5); auto lr1 = network->addActivation(*bn1->getOutput(0), ActivationType::kLEAKY_RELU); lr1->setAlpha(leaky); assert(lr1); IConvolutionLayer* conv2 = network->addConvolutionNd(*lr1->getOutput(0), oup, DimsHW{1, 1}, getWeights(weightMap, lname + ".3.weight"), emptywts); assert(conv2); IScaleLayer* bn2 = addBatchNorm2d(network, weightMap, *conv2->getOutput(0), lname + ".4", 1e-5); auto lr2 = network->addActivation(*bn2->getOutput(0), ActivationType::kLEAKY_RELU); lr2->setAlpha(leaky); assert(lr2); return lr2; } IActivationLayer* ssh(INetworkDefinition *network, std::map& weightMap, ITensor& input, std::string lname, int oup) { auto conv3x3 = conv_bn_no_relu(network, weightMap, input, lname + ".conv3X3", oup / 2); auto conv5x5_1 = conv_bn(network, weightMap, input, lname + ".conv5X5_1", oup / 4); auto conv5x5 = conv_bn_no_relu(network, weightMap, *conv5x5_1->getOutput(0), lname + ".conv5X5_2", oup / 4); auto conv7x7 = conv_bn(network, weightMap, *conv5x5_1->getOutput(0), lname + ".conv7X7_2", oup / 4); conv7x7 = conv_bn_no_relu(network, weightMap, *conv7x7->getOutput(0), lname + ".conv7x7_3", oup / 4); ITensor* inputTensors[] = {conv3x3->getOutput(0), conv5x5->getOutput(0), conv7x7->getOutput(0)}; auto cat = network->addConcatenation(inputTensors, 3); IActivationLayer* relu1 = network->addActivation(*cat->getOutput(0), ActivationType::kRELU); assert(relu1); return 
relu1; } // Creat the engine using only the API and not any parser. ICudaEngine* createEngine(unsigned int maxBatchSize, IBuilder* builder, IBuilderConfig* config, DataType dt) { INetworkDefinition* network = builder->createNetworkV2(0U); // Create input tensor with name INPUT_BLOB_NAME ITensor* data = network->addInput(INPUT_BLOB_NAME, dt, Dims3{3, INPUT_H, INPUT_W}); assert(data); std::map weightMap = loadWeights("../retinaface.wts"); Weights emptywts{DataType::kFLOAT, nullptr, 0}; // ------------- backbone mobilenet0.25 --------------- // stage 1 auto x = conv_bn(network, weightMap, *data, "body.stage1.0", 8, 2); x = conv_dw(network, weightMap, *x->getOutput(0), "body.stage1.1", 8, 16); x = conv_dw(network, weightMap, *x->getOutput(0), "body.stage1.2", 16, 32, 2); x = conv_dw(network, weightMap, *x->getOutput(0), "body.stage1.3", 32, 32); x = conv_dw(network, weightMap, *x->getOutput(0), "body.stage1.4", 32, 64, 2); x = conv_dw(network, weightMap, *x->getOutput(0), "body.stage1.5", 64, 64); auto stage1 = x; // stage 2 x = conv_dw(network, weightMap, *x->getOutput(0), "body.stage2.0", 64, 128, 2); x = conv_dw(network, weightMap, *x->getOutput(0), "body.stage2.1", 128, 128); x = conv_dw(network, weightMap, *x->getOutput(0), "body.stage2.2", 128, 128); x = conv_dw(network, weightMap, *x->getOutput(0), "body.stage2.3", 128, 128); x = conv_dw(network, weightMap, *x->getOutput(0), "body.stage2.4", 128, 128); x = conv_dw(network, weightMap, *x->getOutput(0), "body.stage2.5", 128, 128); auto stage2 = x; // stage 3 x = conv_dw(network, weightMap, *x->getOutput(0), "body.stage3.0", 128, 256, 2); x = conv_dw(network, weightMap, *x->getOutput(0), "body.stage3.1", 256, 256); auto stage3 = x; //Dims d1 = stage1->getOutput(0)->getDimensions(); //std::cout << d1.d[0] << " " << d1.d[1] << " " << d1.d[2] << std::endl; // ------------- FPN --------------- auto output1 = conv_bn1X1(network, weightMap, *stage1->getOutput(0), "fpn.output1", 64); auto output2 = conv_bn1X1(network, 
weightMap, *stage2->getOutput(0), "fpn.output2", 64); auto output3 = conv_bn1X1(network, weightMap, *stage3->getOutput(0), "fpn.output3", 64); float *deval = reinterpret_cast(malloc(sizeof(float) * 64 * 2 * 2)); for (int i = 0; i < 64 * 2 * 2; i++) { deval[i] = 1.0; } Weights deconvwts{DataType::kFLOAT, deval, 64 * 2 * 2}; IDeconvolutionLayer* up3 = network->addDeconvolutionNd(*output3->getOutput(0), 64, DimsHW{2, 2}, deconvwts, emptywts); assert(up3); up3->setStrideNd(DimsHW{2, 2}); up3->setNbGroups(64); weightMap["up3"] = deconvwts; output2 = network->addElementWise(*output2->getOutput(0), *up3->getOutput(0), ElementWiseOperation::kSUM); output2 = conv_bn(network, weightMap, *output2->getOutput(0), "fpn.merge2", 64); IDeconvolutionLayer* up2 = network->addDeconvolutionNd(*output2->getOutput(0), 64, DimsHW{2, 2}, deconvwts, emptywts); assert(up2); up2->setStrideNd(DimsHW{2, 2}); up2->setNbGroups(64); output1 = network->addElementWise(*output1->getOutput(0), *up2->getOutput(0), ElementWiseOperation::kSUM); output1 = conv_bn(network, weightMap, *output1->getOutput(0), "fpn.merge1", 64); // ------------- SSH --------------- auto ssh1 = ssh(network, weightMap, *output1->getOutput(0), "ssh1", 64); auto ssh2 = ssh(network, weightMap, *output2->getOutput(0), "ssh2", 64); auto ssh3 = ssh(network, weightMap, *output3->getOutput(0), "ssh3", 64); //// ------------- Head --------------- auto bbox_head1 = network->addConvolutionNd(*ssh1->getOutput(0), 2 * 4, DimsHW{1, 1}, weightMap["BboxHead.0.conv1x1.weight"], weightMap["BboxHead.0.conv1x1.bias"]); auto bbox_head2 = network->addConvolutionNd(*ssh2->getOutput(0), 2 * 4, DimsHW{1, 1}, weightMap["BboxHead.1.conv1x1.weight"], weightMap["BboxHead.1.conv1x1.bias"]); auto bbox_head3 = network->addConvolutionNd(*ssh3->getOutput(0), 2 * 4, DimsHW{1, 1}, weightMap["BboxHead.2.conv1x1.weight"], weightMap["BboxHead.2.conv1x1.bias"]); auto cls_head1 = network->addConvolutionNd(*ssh1->getOutput(0), 2 * 2, DimsHW{1, 1}, 
weightMap["ClassHead.0.conv1x1.weight"], weightMap["ClassHead.0.conv1x1.bias"]); auto cls_head2 = network->addConvolutionNd(*ssh2->getOutput(0), 2 * 2, DimsHW{1, 1}, weightMap["ClassHead.1.conv1x1.weight"], weightMap["ClassHead.1.conv1x1.bias"]); auto cls_head3 = network->addConvolutionNd(*ssh3->getOutput(0), 2 * 2, DimsHW{1, 1}, weightMap["ClassHead.2.conv1x1.weight"], weightMap["ClassHead.2.conv1x1.bias"]); auto lmk_head1 = network->addConvolutionNd(*ssh1->getOutput(0), 2 * 10, DimsHW{1, 1}, weightMap["LandmarkHead.0.conv1x1.weight"], weightMap["LandmarkHead.0.conv1x1.bias"]); auto lmk_head2 = network->addConvolutionNd(*ssh2->getOutput(0), 2 * 10, DimsHW{1, 1}, weightMap["LandmarkHead.1.conv1x1.weight"], weightMap["LandmarkHead.1.conv1x1.bias"]); auto lmk_head3 = network->addConvolutionNd(*ssh3->getOutput(0), 2 * 10, DimsHW{1, 1}, weightMap["LandmarkHead.2.conv1x1.weight"], weightMap["LandmarkHead.2.conv1x1.bias"]); //// ------------- Decode bbox, conf, landmark --------------- ITensor* inputTensors1[] = {bbox_head1->getOutput(0), cls_head1->getOutput(0), lmk_head1->getOutput(0)}; auto cat1 = network->addConcatenation(inputTensors1, 3); ITensor* inputTensors2[] = {bbox_head2->getOutput(0), cls_head2->getOutput(0), lmk_head2->getOutput(0)}; auto cat2 = network->addConcatenation(inputTensors2, 3); ITensor* inputTensors3[] = {bbox_head3->getOutput(0), cls_head3->getOutput(0), lmk_head3->getOutput(0)}; auto cat3 = network->addConcatenation(inputTensors3, 3); auto creator = getPluginRegistry()->getPluginCreator("Decode_TRT", "1"); PluginFieldCollection pfc; IPluginV2 *pluginObj = creator->createPlugin("decode", &pfc); ITensor* inputTensors[] = {cat1->getOutput(0), cat2->getOutput(0), cat3->getOutput(0)}; auto decodelayer = network->addPluginV2(inputTensors, 3, *pluginObj); assert(decodelayer); decodelayer->getOutput(0)->setName(OUTPUT_BLOB_NAME); network->markOutput(*decodelayer->getOutput(0)); // Build engine builder->setMaxBatchSize(maxBatchSize); 
config->setMaxWorkspaceSize(1 << 20); #if defined(USE_FP16) config->setFlag(BuilderFlag::kFP16); #elif defined(USE_INT8) std::cout << "Your platform support int8: " << builder->platformHasFastInt8() << std::endl; assert(builder->platformHasFastInt8()); config->setFlag(BuilderFlag::kINT8); Int8EntropyCalibrator2 *calibrator = new Int8EntropyCalibrator2(1, INPUT_W, INPUT_H, "./widerface_calib/", "mnet_int8calib.table", INPUT_BLOB_NAME); config->setInt8Calibrator(calibrator); #endif std::cout << "Building engine, please wait for a while..." << std::endl; ICudaEngine* engine = builder->buildEngineWithConfig(*network, *config); std::cout << "Build engine successfully!" << std::endl; // Don't need the network any more network->destroy(); // Release host memory for (auto& mem : weightMap) { free((void*)(mem.second.values)); mem.second.values = NULL; } return engine; } void APIToModel(unsigned int maxBatchSize, IHostMemory** modelStream) { // Create builder IBuilder* builder = createInferBuilder(gLogger); IBuilderConfig* config = builder->createBuilderConfig(); // Create model to populate the network, then set the outputs and create an engine ICudaEngine* engine = createEngine(maxBatchSize, builder, config, DataType::kFLOAT); assert(engine != nullptr); // Serialize the engine (*modelStream) = engine->serialize(); // Close everything down engine->destroy(); config->destroy(); builder->destroy(); } void doInference(IExecutionContext& context, float* input, float* output, int batchSize) { const ICudaEngine& engine = context.getEngine(); // Pointers to input and output device buffers to pass to engine. // Engine requires exactly IEngine::getNbBindings() number of buffers. assert(engine.getNbBindings() == 2); void* buffers[2]; // In order to bind the buffers, we need to know the names of the input and output tensors. 
// Note that indices are guaranteed to be less than IEngine::getNbBindings() const int inputIndex = engine.getBindingIndex(INPUT_BLOB_NAME); const int outputIndex = engine.getBindingIndex(OUTPUT_BLOB_NAME); // Create GPU buffers on device CHECK(cudaMalloc(&buffers[inputIndex], batchSize * 3 * INPUT_H * INPUT_W * sizeof(float))); CHECK(cudaMalloc(&buffers[outputIndex], batchSize * OUTPUT_SIZE * sizeof(float))); // Create stream cudaStream_t stream; CHECK(cudaStreamCreate(&stream)); // DMA input batch data to device, infer on the batch asynchronously, and DMA output back to host CHECK(cudaMemcpyAsync(buffers[inputIndex], input, batchSize * 3 * INPUT_H * INPUT_W * sizeof(float), cudaMemcpyHostToDevice, stream)); context.enqueue(batchSize, buffers, stream, nullptr); CHECK(cudaMemcpyAsync(output, buffers[outputIndex], batchSize * OUTPUT_SIZE * sizeof(float), cudaMemcpyDeviceToHost, stream)); cudaStreamSynchronize(stream); // Release stream and buffers cudaStreamDestroy(stream); CHECK(cudaFree(buffers[inputIndex])); CHECK(cudaFree(buffers[outputIndex])); } int main(int argc, char** argv) { if (argc != 2) { std::cerr << "arguments not right!" 
<< std::endl; std::cerr << "./retina_mnet -s // serialize model to plan file" << std::endl; std::cerr << "./retina_mnet -d // deserialize plan file and run inference" << std::endl; return -1; } cudaSetDevice(DEVICE); // create a model using the API directly and serialize it to a stream char *trtModelStream{nullptr}; size_t size{0}; if (std::string(argv[1]) == "-s") { IHostMemory* modelStream{nullptr}; APIToModel(BATCH_SIZE, &modelStream); assert(modelStream != nullptr); std::ofstream p("retina_mnet.engine", std::ios::binary); if (!p) { std::cerr << "could not open plan output file" << std::endl; return -1; } p.write(reinterpret_cast(modelStream->data()), modelStream->size()); modelStream->destroy(); return 1; } else if (std::string(argv[1]) == "-d") { std::ifstream file("retina_mnet.engine", std::ios::binary); if (file.good()) { file.seekg(0, file.end); size = file.tellg(); file.seekg(0, file.beg); trtModelStream = new char[size]; assert(trtModelStream); file.read(trtModelStream, size); file.close(); } } else { return -1; } // prepare input data --------------------------- static float data[BATCH_SIZE * 3 * INPUT_H * INPUT_W]; //for (int i = 0; i < 3 * INPUT_H * INPUT_W; i++) // data[i] = 1.0; cv::Mat img = cv::imread("worlds-largest-selfie.jpg"); cv::Mat pr_img = preprocess_img(img, INPUT_W, INPUT_H); //cv::imwrite("preprocessed.jpg", pr_img); // For multi-batch, I feed the same image multiple times. // If you want to process different images in a batch, you need adapt it. 
for (int b = 0; b < BATCH_SIZE; b++) { float *p_data = &data[b * 3 * INPUT_H * INPUT_W]; for (int i = 0; i < INPUT_H * INPUT_W; i++) { p_data[i] = pr_img.at(i)[0] - 104.0; p_data[i + INPUT_H * INPUT_W] = pr_img.at(i)[1] - 117.0; p_data[i + 2 * INPUT_H * INPUT_W] = pr_img.at(i)[2] - 123.0; } } IRuntime* runtime = createInferRuntime(gLogger); assert(runtime != nullptr); ICudaEngine* engine = runtime->deserializeCudaEngine(trtModelStream, size); //ICudaEngine* engine = runtime->deserializeCudaEngine(trtModelStream, size, nullptr); assert(engine != nullptr); IExecutionContext* context = engine->createExecutionContext(); assert(context != nullptr); // Run inference static float prob[BATCH_SIZE * OUTPUT_SIZE]; auto start = std::chrono::system_clock::now(); doInference(*context, data, prob, BATCH_SIZE); auto end = std::chrono::system_clock::now(); std::cout << std::chrono::duration_cast(end - start).count() << "us" << std::endl; for (int b = 0; b < BATCH_SIZE; b++) { std::vector res; nms(res, &prob[b * OUTPUT_SIZE], IOU_THRESH); std::cout << "number of detections -> " << prob[b * OUTPUT_SIZE] << std::endl; std::cout << " -> " << prob[b * OUTPUT_SIZE + 10] << std::endl; std::cout << "after nms -> " << res.size() << std::endl; cv::Mat tmp = img.clone(); for (size_t j = 0; j < res.size(); j++) { if (res[j].class_confidence < CONF_THRESH) continue; cv::Rect r = get_rect_adapt_landmark(tmp, INPUT_W, INPUT_H, res[j].bbox, res[j].landmark); cv::rectangle(tmp, r, cv::Scalar(0x27, 0xC1, 0x36), 2); //cv::putText(tmp, std::to_string((int)(res[j].class_confidence * 100)) + "%", cv::Point(r.x, r.y - 1), cv::FONT_HERSHEY_PLAIN, 1.2, cv::Scalar(0xFF, 0xFF, 0xFF), 1); for (int k = 0; k < 10; k += 2) { cv::circle(tmp, cv::Point(res[j].landmark[k], res[j].landmark[k + 1]), 1, cv::Scalar(255 * (k > 2), 255 * (k > 0 && k < 8), 255 * (k < 6)), 4); } } cv::imwrite(std::to_string(b) + "_result.jpg", tmp); } // Destroy the engine context->destroy(); engine->destroy(); runtime->destroy(); // 
Print histogram of the output distribution //std::cout << "\nOutput:\n\n"; //for (unsigned int i = 0; i < OUTPUT_SIZE; i++) //{ // std::cout << prob[i] << ", "; // if (i % 10 == 0) std::cout << i / 10 << std::endl; //} //std::cout << std::endl; return 0; } ================================================ FILE: retinaface/retina_r50.cpp ================================================ #include #include #include #include #include #include #include "cuda_runtime_api.h" #include "logging.h" #include "common.hpp" #include "calibrator.h" #define USE_INT8 // set USE_INT8 or USE_FP16 or USE_FP32 #define DEVICE 0 // GPU id #define BATCH_SIZE 1 #define CONF_THRESH 0.75 #define IOU_THRESH 0.4 // stuff we know about the network and the input/output blobs static const int INPUT_H = decodeplugin::INPUT_H; // H, W must be able to be divided by 32. static const int INPUT_W = decodeplugin::INPUT_W;; static const int OUTPUT_SIZE = (INPUT_H / 8 * INPUT_W / 8 + INPUT_H / 16 * INPUT_W / 16 + INPUT_H / 32 * INPUT_W / 32) * 2 * 15 + 1; const char* INPUT_BLOB_NAME = "data"; const char* OUTPUT_BLOB_NAME = "prob"; static Logger gLogger; IActivationLayer* bottleneck(INetworkDefinition *network, std::map& weightMap, ITensor& input, int inch, int outch, int stride, std::string lname) { Weights emptywts{DataType::kFLOAT, nullptr, 0}; IConvolutionLayer* conv1 = network->addConvolutionNd(input, outch, DimsHW{1, 1}, weightMap[lname + "conv1.weight"], emptywts); assert(conv1); IScaleLayer* bn1 = addBatchNorm2d(network, weightMap, *conv1->getOutput(0), lname + "bn1", 1e-5); IActivationLayer* relu1 = network->addActivation(*bn1->getOutput(0), ActivationType::kRELU); assert(relu1); IConvolutionLayer* conv2 = network->addConvolutionNd(*relu1->getOutput(0), outch, DimsHW{3, 3}, weightMap[lname + "conv2.weight"], emptywts); assert(conv2); conv2->setStrideNd(DimsHW{stride, stride}); conv2->setPaddingNd(DimsHW{1, 1}); IScaleLayer* bn2 = addBatchNorm2d(network, weightMap, *conv2->getOutput(0), lname + 
"bn2", 1e-5); IActivationLayer* relu2 = network->addActivation(*bn2->getOutput(0), ActivationType::kRELU); assert(relu2); IConvolutionLayer* conv3 = network->addConvolutionNd(*relu2->getOutput(0), outch * 4, DimsHW{1, 1}, weightMap[lname + "conv3.weight"], emptywts); assert(conv3); IScaleLayer* bn3 = addBatchNorm2d(network, weightMap, *conv3->getOutput(0), lname + "bn3", 1e-5); IElementWiseLayer* ew1; if (stride != 1 || inch != outch * 4) { IConvolutionLayer* conv4 = network->addConvolutionNd(input, outch * 4, DimsHW{1, 1}, weightMap[lname + "downsample.0.weight"], emptywts); assert(conv4); conv4->setStrideNd(DimsHW{stride, stride}); IScaleLayer* bn4 = addBatchNorm2d(network, weightMap, *conv4->getOutput(0), lname + "downsample.1", 1e-5); ew1 = network->addElementWise(*bn4->getOutput(0), *bn3->getOutput(0), ElementWiseOperation::kSUM); } else { ew1 = network->addElementWise(input, *bn3->getOutput(0), ElementWiseOperation::kSUM); } IActivationLayer* relu3 = network->addActivation(*ew1->getOutput(0), ActivationType::kRELU); assert(relu3); return relu3; } ILayer* conv_bn_relu(INetworkDefinition *network, std::map& weightMap, ITensor& input, int outch, int kernelsize, int stride, int padding, bool userelu, std::string lname) { Weights emptywts{DataType::kFLOAT, nullptr, 0}; IConvolutionLayer* conv1 = network->addConvolutionNd(input, outch, DimsHW{kernelsize, kernelsize}, getWeights(weightMap, lname + ".0.weight"), emptywts); assert(conv1); conv1->setStrideNd(DimsHW{stride, stride}); conv1->setPaddingNd(DimsHW{padding, padding}); IScaleLayer* bn1 = addBatchNorm2d(network, weightMap, *conv1->getOutput(0), lname + ".1", 1e-5); if (!userelu) return bn1; IActivationLayer* relu1 = network->addActivation(*bn1->getOutput(0), ActivationType::kRELU); assert(relu1); return relu1; } IActivationLayer* ssh(INetworkDefinition *network, std::map& weightMap, ITensor& input, std::string lname) { auto conv3x3 = conv_bn_relu(network, weightMap, input, 256 / 2, 3, 1, 1, false, lname + 
".conv3X3"); auto conv5x5_1 = conv_bn_relu(network, weightMap, input, 256 / 4, 3, 1, 1, true, lname + ".conv5X5_1"); auto conv5x5 = conv_bn_relu(network, weightMap, *conv5x5_1->getOutput(0), 256 / 4, 3, 1, 1, false, lname + ".conv5X5_2"); auto conv7x7 = conv_bn_relu(network, weightMap, *conv5x5_1->getOutput(0), 256 / 4, 3, 1, 1, true, lname + ".conv7X7_2"); conv7x7 = conv_bn_relu(network, weightMap, *conv7x7->getOutput(0), 256 / 4, 3, 1, 1, false, lname + ".conv7x7_3"); ITensor* inputTensors[] = {conv3x3->getOutput(0), conv5x5->getOutput(0), conv7x7->getOutput(0)}; auto cat = network->addConcatenation(inputTensors, 3); IActivationLayer* relu1 = network->addActivation(*cat->getOutput(0), ActivationType::kRELU); assert(relu1); return relu1; } // Creat the engine using only the API and not any parser. ICudaEngine* createEngine(unsigned int maxBatchSize, IBuilder* builder, IBuilderConfig* config, DataType dt) { INetworkDefinition* network = builder->createNetworkV2(0U); // Create input tensor with name INPUT_BLOB_NAME ITensor* data = network->addInput(INPUT_BLOB_NAME, dt, Dims3{3, INPUT_H, INPUT_W}); assert(data); std::map weightMap = loadWeights("../retinaface.wts"); Weights emptywts{DataType::kFLOAT, nullptr, 0}; // ------------- backbone resnet50 --------------- IConvolutionLayer* conv1 = network->addConvolutionNd(*data, 64, DimsHW{7, 7}, weightMap["body.conv1.weight"], emptywts); assert(conv1); conv1->setStrideNd(DimsHW{2, 2}); conv1->setPaddingNd(DimsHW{3, 3}); IScaleLayer* bn1 = addBatchNorm2d(network, weightMap, *conv1->getOutput(0), "body.bn1", 1e-5); // Add activation layer using the ReLU algorithm. IActivationLayer* relu1 = network->addActivation(*bn1->getOutput(0), ActivationType::kRELU); assert(relu1); // Add max pooling layer with stride of 2x2 and kernel size of 2x2. 
IPoolingLayer* pool1 = network->addPoolingNd(*relu1->getOutput(0), PoolingType::kMAX, DimsHW{3, 3}); assert(pool1); pool1->setStrideNd(DimsHW{2, 2}); pool1->setPaddingNd(DimsHW{1, 1}); IActivationLayer* x = bottleneck(network, weightMap, *pool1->getOutput(0), 64, 64, 1, "body.layer1.0."); x = bottleneck(network, weightMap, *x->getOutput(0), 256, 64, 1, "body.layer1.1."); x = bottleneck(network, weightMap, *x->getOutput(0), 256, 64, 1, "body.layer1.2."); x = bottleneck(network, weightMap, *x->getOutput(0), 256, 128, 2, "body.layer2.0."); x = bottleneck(network, weightMap, *x->getOutput(0), 512, 128, 1, "body.layer2.1."); x = bottleneck(network, weightMap, *x->getOutput(0), 512, 128, 1, "body.layer2.2."); x = bottleneck(network, weightMap, *x->getOutput(0), 512, 128, 1, "body.layer2.3."); IActivationLayer* layer2 = x; x = bottleneck(network, weightMap, *x->getOutput(0), 512, 256, 2, "body.layer3.0."); x = bottleneck(network, weightMap, *x->getOutput(0), 1024, 256, 1, "body.layer3.1."); x = bottleneck(network, weightMap, *x->getOutput(0), 1024, 256, 1, "body.layer3.2."); x = bottleneck(network, weightMap, *x->getOutput(0), 1024, 256, 1, "body.layer3.3."); x = bottleneck(network, weightMap, *x->getOutput(0), 1024, 256, 1, "body.layer3.4."); x = bottleneck(network, weightMap, *x->getOutput(0), 1024, 256, 1, "body.layer3.5."); IActivationLayer* layer3 = x; x = bottleneck(network, weightMap, *x->getOutput(0), 1024, 512, 2, "body.layer4.0."); x = bottleneck(network, weightMap, *x->getOutput(0), 2048, 512, 1, "body.layer4.1."); x = bottleneck(network, weightMap, *x->getOutput(0), 2048, 512, 1, "body.layer4.2."); IActivationLayer* layer4 = x; // ------------- FPN --------------- auto output1 = conv_bn_relu(network, weightMap, *layer2->getOutput(0), 256, 1, 1, 0, true, "fpn.output1"); auto output2 = conv_bn_relu(network, weightMap, *layer3->getOutput(0), 256, 1, 1, 0, true, "fpn.output2"); auto output3 = conv_bn_relu(network, weightMap, *layer4->getOutput(0), 256, 1, 1, 0, 
true, "fpn.output3"); float *deval = reinterpret_cast(malloc(sizeof(float) * 256 * 2 * 2)); for (int i = 0; i < 256 * 2 * 2; i++) { deval[i] = 1.0; } Weights deconvwts{DataType::kFLOAT, deval, 256 * 2 * 2}; IDeconvolutionLayer* up3 = network->addDeconvolutionNd(*output3->getOutput(0), 256, DimsHW{2, 2}, deconvwts, emptywts); assert(up3); up3->setStrideNd(DimsHW{2, 2}); up3->setNbGroups(256); weightMap["up3"] = deconvwts; output2 = network->addElementWise(*output2->getOutput(0), *up3->getOutput(0), ElementWiseOperation::kSUM); output2 = conv_bn_relu(network, weightMap, *output2->getOutput(0), 256, 3, 1, 1, true, "fpn.merge2"); IDeconvolutionLayer* up2 = network->addDeconvolutionNd(*output2->getOutput(0), 256, DimsHW{2, 2}, deconvwts, emptywts); assert(up2); up2->setStrideNd(DimsHW{2, 2}); up2->setNbGroups(256); output1 = network->addElementWise(*output1->getOutput(0), *up2->getOutput(0), ElementWiseOperation::kSUM); output1 = conv_bn_relu(network, weightMap, *output1->getOutput(0), 256, 3, 1, 1, true, "fpn.merge1"); // ------------- SSH --------------- auto ssh1 = ssh(network, weightMap, *output1->getOutput(0), "ssh1"); auto ssh2 = ssh(network, weightMap, *output2->getOutput(0), "ssh2"); auto ssh3 = ssh(network, weightMap, *output3->getOutput(0), "ssh3"); // ------------- Head --------------- auto bbox_head1 = network->addConvolutionNd(*ssh1->getOutput(0), 2 * 4, DimsHW{1, 1}, weightMap["BboxHead.0.conv1x1.weight"], weightMap["BboxHead.0.conv1x1.bias"]); auto bbox_head2 = network->addConvolutionNd(*ssh2->getOutput(0), 2 * 4, DimsHW{1, 1}, weightMap["BboxHead.1.conv1x1.weight"], weightMap["BboxHead.1.conv1x1.bias"]); auto bbox_head3 = network->addConvolutionNd(*ssh3->getOutput(0), 2 * 4, DimsHW{1, 1}, weightMap["BboxHead.2.conv1x1.weight"], weightMap["BboxHead.2.conv1x1.bias"]); auto cls_head1 = network->addConvolutionNd(*ssh1->getOutput(0), 2 * 2, DimsHW{1, 1}, weightMap["ClassHead.0.conv1x1.weight"], weightMap["ClassHead.0.conv1x1.bias"]); auto cls_head2 = 
network->addConvolutionNd(*ssh2->getOutput(0), 2 * 2, DimsHW{1, 1}, weightMap["ClassHead.1.conv1x1.weight"], weightMap["ClassHead.1.conv1x1.bias"]); auto cls_head3 = network->addConvolutionNd(*ssh3->getOutput(0), 2 * 2, DimsHW{1, 1}, weightMap["ClassHead.2.conv1x1.weight"], weightMap["ClassHead.2.conv1x1.bias"]); auto lmk_head1 = network->addConvolutionNd(*ssh1->getOutput(0), 2 * 10, DimsHW{1, 1}, weightMap["LandmarkHead.0.conv1x1.weight"], weightMap["LandmarkHead.0.conv1x1.bias"]); auto lmk_head2 = network->addConvolutionNd(*ssh2->getOutput(0), 2 * 10, DimsHW{1, 1}, weightMap["LandmarkHead.1.conv1x1.weight"], weightMap["LandmarkHead.1.conv1x1.bias"]); auto lmk_head3 = network->addConvolutionNd(*ssh3->getOutput(0), 2 * 10, DimsHW{1, 1}, weightMap["LandmarkHead.2.conv1x1.weight"], weightMap["LandmarkHead.2.conv1x1.bias"]); // ------------- Decode bbox, conf, landmark --------------- ITensor* inputTensors1[] = {bbox_head1->getOutput(0), cls_head1->getOutput(0), lmk_head1->getOutput(0)}; auto cat1 = network->addConcatenation(inputTensors1, 3); ITensor* inputTensors2[] = {bbox_head2->getOutput(0), cls_head2->getOutput(0), lmk_head2->getOutput(0)}; auto cat2 = network->addConcatenation(inputTensors2, 3); ITensor* inputTensors3[] = {bbox_head3->getOutput(0), cls_head3->getOutput(0), lmk_head3->getOutput(0)}; auto cat3 = network->addConcatenation(inputTensors3, 3); auto creator = getPluginRegistry()->getPluginCreator("Decode_TRT", "1"); PluginFieldCollection pfc; IPluginV2 *pluginObj = creator->createPlugin("decode", &pfc); ITensor* inputTensors[] = {cat1->getOutput(0), cat2->getOutput(0), cat3->getOutput(0)}; auto decodelayer = network->addPluginV2(inputTensors, 3, *pluginObj); assert(decodelayer); decodelayer->getOutput(0)->setName(OUTPUT_BLOB_NAME); network->markOutput(*decodelayer->getOutput(0)); // Build engine builder->setMaxBatchSize(maxBatchSize); config->setMaxWorkspaceSize(1 << 20); #if defined(USE_FP16) config->setFlag(BuilderFlag::kFP16); #elif 
defined(USE_INT8) std::cout << "Your platform support int8: " << builder->platformHasFastInt8() << std::endl; assert(builder->platformHasFastInt8()); config->setFlag(BuilderFlag::kINT8); Int8EntropyCalibrator2 *calibrator = new Int8EntropyCalibrator2(1, INPUT_W, INPUT_H, "./widerface_calib/", "r50_int8calib.table", INPUT_BLOB_NAME); config->setInt8Calibrator(calibrator); #endif std::cout << "Building engine, please wait for a while..." << std::endl; ICudaEngine* engine = builder->buildEngineWithConfig(*network, *config); std::cout << "Build engine successfully!" << std::endl; // Don't need the network any more network->destroy(); // Release host memory for (auto& mem : weightMap) { free((void*)(mem.second.values)); mem.second.values = NULL; } return engine; } void APIToModel(unsigned int maxBatchSize, IHostMemory** modelStream) { // Create builder IBuilder* builder = createInferBuilder(gLogger); IBuilderConfig* config = builder->createBuilderConfig(); // Create model to populate the network, then set the outputs and create an engine ICudaEngine* engine = createEngine(maxBatchSize, builder, config, DataType::kFLOAT); assert(engine != nullptr); // Serialize the engine (*modelStream) = engine->serialize(); // Close everything down engine->destroy(); config->destroy(); builder->destroy(); } void doInference(IExecutionContext& context, float* input, float* output, int batchSize) { const ICudaEngine& engine = context.getEngine(); // Pointers to input and output device buffers to pass to engine. // Engine requires exactly IEngine::getNbBindings() number of buffers. assert(engine.getNbBindings() == 2); void* buffers[2]; // In order to bind the buffers, we need to know the names of the input and output tensors. 
// Note that indices are guaranteed to be less than IEngine::getNbBindings() const int inputIndex = engine.getBindingIndex(INPUT_BLOB_NAME); const int outputIndex = engine.getBindingIndex(OUTPUT_BLOB_NAME); // Create GPU buffers on device CHECK(cudaMalloc(&buffers[inputIndex], batchSize * 3 * INPUT_H * INPUT_W * sizeof(float))); CHECK(cudaMalloc(&buffers[outputIndex], batchSize * OUTPUT_SIZE * sizeof(float))); // Create stream cudaStream_t stream; CHECK(cudaStreamCreate(&stream)); // DMA input batch data to device, infer on the batch asynchronously, and DMA output back to host CHECK(cudaMemcpyAsync(buffers[inputIndex], input, batchSize * 3 * INPUT_H * INPUT_W * sizeof(float), cudaMemcpyHostToDevice, stream)); context.enqueue(batchSize, buffers, stream, nullptr); CHECK(cudaMemcpyAsync(output, buffers[outputIndex], batchSize * OUTPUT_SIZE * sizeof(float), cudaMemcpyDeviceToHost, stream)); cudaStreamSynchronize(stream); // Release stream and buffers cudaStreamDestroy(stream); CHECK(cudaFree(buffers[inputIndex])); CHECK(cudaFree(buffers[outputIndex])); } int main(int argc, char** argv) { if (argc != 2) { std::cerr << "arguments not right!" 
<< std::endl; std::cerr << "./retina_r50 -s // serialize model to plan file" << std::endl; std::cerr << "./retina_r50 -d // deserialize plan file and run inference" << std::endl; return -1; } cudaSetDevice(DEVICE); // create a model using the API directly and serialize it to a stream char *trtModelStream{nullptr}; size_t size{0}; if (std::string(argv[1]) == "-s") { IHostMemory* modelStream{nullptr}; APIToModel(BATCH_SIZE, &modelStream); assert(modelStream != nullptr); std::ofstream p("retina_r50.engine", std::ios::binary); if (!p) { std::cerr << "could not open plan output file" << std::endl; return -1; } p.write(reinterpret_cast(modelStream->data()), modelStream->size()); modelStream->destroy(); return 1; } else if (std::string(argv[1]) == "-d") { std::ifstream file("retina_r50.engine", std::ios::binary); if (file.good()) { file.seekg(0, file.end); size = file.tellg(); file.seekg(0, file.beg); trtModelStream = new char[size]; assert(trtModelStream); file.read(trtModelStream, size); file.close(); } } else { return -1; } // prepare input data --------------------------- static float data[BATCH_SIZE * 3 * INPUT_H * INPUT_W]; //for (int i = 0; i < 3 * INPUT_H * INPUT_W; i++) // data[i] = 1.0; cv::Mat img = cv::imread("worlds-largest-selfie.jpg"); cv::Mat pr_img = preprocess_img(img, INPUT_W, INPUT_H); //cv::imwrite("preprocessed.jpg", pr_img); // For multi-batch, I feed the same image multiple times. // If you want to process different images in a batch, you need adapt it. 
for (int b = 0; b < BATCH_SIZE; b++) { float *p_data = &data[b * 3 * INPUT_H * INPUT_W]; for (int i = 0; i < INPUT_H * INPUT_W; i++) { p_data[i] = pr_img.at(i)[0] - 104.0; p_data[i + INPUT_H * INPUT_W] = pr_img.at(i)[1] - 117.0; p_data[i + 2 * INPUT_H * INPUT_W] = pr_img.at(i)[2] - 123.0; } } IRuntime* runtime = createInferRuntime(gLogger); assert(runtime != nullptr); ICudaEngine* engine = runtime->deserializeCudaEngine(trtModelStream, size); //ICudaEngine* engine = runtime->deserializeCudaEngine(trtModelStream, size, nullptr); assert(engine != nullptr); IExecutionContext* context = engine->createExecutionContext(); assert(context != nullptr); // Run inference static float prob[BATCH_SIZE * OUTPUT_SIZE]; for (int cc = 0; cc < 1000; cc++) { auto start = std::chrono::system_clock::now(); doInference(*context, data, prob, BATCH_SIZE); auto end = std::chrono::system_clock::now(); std::cout << std::chrono::duration_cast(end - start).count() << "us" << std::endl; } for (int b = 0; b < BATCH_SIZE; b++) { std::vector res; nms(res, &prob[b * OUTPUT_SIZE], IOU_THRESH); std::cout << "number of detections -> " << prob[b * OUTPUT_SIZE] << std::endl; std::cout << " -> " << prob[b * OUTPUT_SIZE + 10] << std::endl; std::cout << "after nms -> " << res.size() << std::endl; cv::Mat tmp = img.clone(); for (size_t j = 0; j < res.size(); j++) { if (res[j].class_confidence < CONF_THRESH) continue; cv::Rect r = get_rect_adapt_landmark(tmp, INPUT_W, INPUT_H, res[j].bbox, res[j].landmark); cv::rectangle(tmp, r, cv::Scalar(0x27, 0xC1, 0x36), 2); //cv::putText(tmp, std::to_string((int)(res[j].class_confidence * 100)) + "%", cv::Point(r.x, r.y - 1), cv::FONT_HERSHEY_PLAIN, 1.2, cv::Scalar(0xFF, 0xFF, 0xFF), 1); for (int k = 0; k < 10; k += 2) { cv::circle(tmp, cv::Point(res[j].landmark[k], res[j].landmark[k + 1]), 1, cv::Scalar(255 * (k > 2), 255 * (k > 0 && k < 8), 255 * (k < 6)), 4); } } cv::imwrite(std::to_string(b) + "_result.jpg", tmp); } // Destroy the engine context->destroy(); 
engine->destroy(); runtime->destroy(); // Print histogram of the output distribution //std::cout << "\nOutput:\n\n"; //for (unsigned int i = 0; i < OUTPUT_SIZE; i++) //{ // std::cout << prob[i] << ", "; // if (i % 10 == 0) std::cout << i / 10 << std::endl; //} //std::cout << std::endl; return 0; } ================================================ FILE: retinaface/retinaface_trt.py ================================================ """ Use TensorRT's Python api to make inferences. """ # -*- coding: utf-8 -* import ctypes import os import random import sys import threading import time import cv2 import numpy as np import pycuda.autoinit import pycuda.driver as cuda import tensorrt as trt import torch import torchvision INPUT_H = 480 #defined in decode.h INPUT_W = 640 CONF_THRESH = 0.75 IOU_THRESHOLD = 0.4 np.set_printoptions(threshold=np.inf) def plot_one_box(x, landmark,img, color=None, label=None, line_thickness=None): """ description: Plots one bounding box on image img, param: x: a box likes [x1,y1,x2,y2] img: a opencv image object color: color to draw rectangle, such as (0,255,0) label: str line_thickness: int return: no return """ tl = ( line_thickness or round(0.001 * (img.shape[0] + img.shape[1]) / 2) + 1 ) # line/font thickness color = color or [random.randint(0, 255) for _ in range(3)] c1, c2 = (int(x[0]), int(x[1])), (int(x[2]), int(x[3])) cv2.rectangle(img, c1, c2, color, thickness=tl, lineType=cv2.LINE_AA) cv2.circle(img, (int(landmark[0]), int(landmark[1])), 1, (0, 0, 255), 4) cv2.circle(img, (int(landmark[2]), int(landmark[3])), 1, (0, 255, 255), 4) cv2.circle(img, (int(landmark[4]), int(landmark[5])), 1, (255, 0, 255), 4) cv2.circle(img, (int(landmark[6]), int(landmark[7])), 1, (0, 255, 0), 4) cv2.circle(img, (int(landmark[8]), int(landmark[9])), 1, (255, 0, 0), 4) if label: tf = max(tl - 1, 1) # font thickness t_size = cv2.getTextSize(label, 0, fontScale=tl / 3, thickness=tf)[0] c2 = c1[0] + t_size[0], c1[1] - t_size[1] - 3 cv2.rectangle(img, c1, c2, 
class Retinaface_trt(object):
    """
    description: A RetinaFace class that wraps TensorRT ops, preprocess and postprocess ops.
    """

    def __init__(self, engine_file_path):
        """
        description: Deserialize a TensorRT engine and allocate one page-locked
                     host buffer and one device buffer per engine binding.
        param:
            engine_file_path: str, path of the serialized engine file
        raises:
            RuntimeError: if the engine cannot be deserialized
        """
        # Create a Context on this device. make_context() also makes it the
        # current context of the calling thread; destroy() pops it again.
        self.cfx = cuda.Device(0).make_context()
        try:
            stream = cuda.Stream()
            trt_logger = trt.Logger(trt.Logger.INFO)
            runtime = trt.Runtime(trt_logger)

            # Deserialize the engine from file
            with open(engine_file_path, "rb") as f:
                engine = runtime.deserialize_cuda_engine(f.read())
            if engine is None:
                raise RuntimeError(
                    "failed to deserialize TensorRT engine: {}".format(engine_file_path)
                )
            context = engine.create_execution_context()

            host_inputs = []
            cuda_inputs = []
            host_outputs = []
            cuda_outputs = []
            bindings = []

            for binding in engine:
                size = trt.volume(engine.get_binding_shape(binding)) * engine.max_batch_size
                dtype = trt.nptype(engine.get_binding_dtype(binding))
                # Allocate host and device buffers
                host_mem = cuda.pagelocked_empty(size, dtype)
                cuda_mem = cuda.mem_alloc(host_mem.nbytes)
                # Append the device buffer to device bindings.
                bindings.append(int(cuda_mem))
                # Append to the appropriate list.
                if engine.binding_is_input(binding):
                    host_inputs.append(host_mem)
                    cuda_inputs.append(cuda_mem)
                else:
                    host_outputs.append(host_mem)
                    cuda_outputs.append(cuda_mem)
        except Exception:
            # destroy() is never reached when construction fails, so do not
            # leave the freshly created context on the context stack.
            self.cfx.pop()
            raise

        # Store
        self.stream = stream
        self.context = context
        self.engine = engine
        self.host_inputs = host_inputs
        self.cuda_inputs = cuda_inputs
        self.host_outputs = host_outputs
        self.cuda_outputs = cuda_outputs
        self.bindings = bindings

    def infer(self, input_image_path):
        """
        description: Run detection on one image and write the annotated result
                     next to it as "output_<filename>".
        param:
            input_image_path: str, image path
        return:
            no return
        """
        # Restore
        stream = self.stream
        context = self.context
        host_inputs = self.host_inputs
        cuda_inputs = self.cuda_inputs
        host_outputs = self.host_outputs
        cuda_outputs = self.cuda_outputs
        bindings = self.bindings

        # Make self the active context, pushing it on top of the context stack.
        self.cfx.push()
        try:
            # Do image preprocess
            input_image, image_raw, origin_h, origin_w = self.preprocess_image(
                input_image_path
            )
            start = time.time()
            # Copy input image to host buffer
            np.copyto(host_inputs[0], input_image.ravel())
            # Transfer input data to the GPU.
            cuda.memcpy_htod_async(cuda_inputs[0], host_inputs[0], stream)
            # Run inference.
            context.execute_async(bindings=bindings, stream_handle=stream.handle)
            # Transfer predictions back from the GPU.
            cuda.memcpy_dtoh_async(host_outputs[0], cuda_outputs[0], stream)
            # Synchronize the stream
            stream.synchronize()
        finally:
            # Remove the context from the top of the context stack even when
            # preprocessing or inference raised, so the stack stays balanced.
            self.cfx.pop()

        # Here we use the first row of output in that batch_size = 1
        output = host_outputs[0]
        # Do postprocess
        result_boxes, result_scores, result_landmark = self.post_process(
            output, origin_h, origin_w
        )
        # Elapsed seconds for copy + inference + postprocess
        print(time.time() - start)

        # Draw rectangles and labels on the original image
        for box, landmark, score in zip(result_boxes, result_landmark, result_scores):
            plot_one_box(
                box,
                landmark,
                image_raw,
                label="{}:{:.2f}".format('Face', score),
            )
        # Save image
        parent, filename = os.path.split(input_image_path)
        save_name = os.path.join(parent, "output_" + filename)
        cv2.imwrite(save_name, image_raw)

    def destroy(self):
        # Remove any context from the top of the context stack, deactivating it.
        self.cfx.pop()

    def preprocess_image(self, input_image_path):
        """
        description: Read an image from image path, resize it keeping the aspect
                     ratio, pad it to (INPUT_H, INPUT_W) with gray (128,128,128),
                     subtract the per-channel mean (104, 117, 123) and transform
                     it to NCHW format.
        param:
            input_image_path: str, image path
        return:
            image:  the processed image
            image_raw: the original image
            h: original height
            w: original width
        raises:
            FileNotFoundError: if the image cannot be read
        """
        image_raw = cv2.imread(input_image_path)
        if image_raw is None:
            # cv2.imread signals failure by returning None instead of raising.
            raise FileNotFoundError(
                "could not read image: {}".format(input_image_path)
            )
        h, w = image_raw.shape[:2]

        # Calculate width and height and paddings
        r_w = INPUT_W / w
        r_h = INPUT_H / h
        if r_h > r_w:
            tw = INPUT_W
            th = int(r_w * h)
            tx1 = tx2 = 0
            ty1 = int((INPUT_H - th) / 2)
            ty2 = INPUT_H - th - ty1
        else:
            tw = int(r_h * w)
            th = INPUT_H
            tx1 = int((INPUT_W - tw) / 2)
            tx2 = INPUT_W - tw - tx1
            ty1 = ty2 = 0
        # Resize the image with long side while maintaining ratio
        image = cv2.resize(image_raw, (tw, th))
        # Pad the short side with (128,128,128). The colour must be passed as
        # `value`: the 7th positional parameter of copyMakeBorder is `dst`.
        image = cv2.copyMakeBorder(
            image, ty1, ty2, tx1, tx2, cv2.BORDER_CONSTANT, value=(128, 128, 128)
        )
        image = image.astype(np.float32)
        # Subtract the per-channel mean
        image -= (104, 117, 123)
        # HWC to CHW format:
        image = np.transpose(image, [2, 0, 1])
        # CHW to NCHW format
        image = np.expand_dims(image, axis=0)
        # Convert the image to row-major order, also known as "C order":
        image = np.ascontiguousarray(image)
        return image, image_raw, h, w

    def xywh2xyxy(self, origin_h, origin_w, x, landmark):
        """
        description: Map boxes and landmarks from the padded network input
                     (INPUT_H x INPUT_W) back to original image coordinates.
        param:
            origin_h: height of original image
            origin_w: width of original image
            x:        boxes, each row is [x1, y1, x2, y2] in network input coordinates
            landmark: landmarks, each row is [x1, y1, ..., x5, y5];
                      it is modified in place
        return:
            y:        a new array/tensor holding the rescaled boxes
            landmark: the same object that was passed in, rescaled
        """
        y = torch.zeros_like(x) if isinstance(x, torch.Tensor) else np.zeros_like(x)
        r_w = INPUT_W / origin_w
        r_h = INPUT_H / origin_h
        # Only the axis that was padded during preprocessing needs an offset.
        if r_h > r_w:
            scale = r_w
            pad_x = 0.0
            pad_y = (INPUT_H - r_w * origin_h) / 2
        else:
            scale = r_h
            pad_x = (INPUT_W - r_h * origin_w) / 2
            pad_y = 0.0
        # Even columns are x coordinates, odd columns are y coordinates.
        y[:, 0:4:2] = (x[:, 0:4:2] - pad_x) / scale
        y[:, 1:4:2] = (x[:, 1:4:2] - pad_y) / scale
        landmark[:, 0:10:2] = (landmark[:, 0:10:2] - pad_x) / scale
        landmark[:, 1:10:2] = (landmark[:, 1:10:2] - pad_y) / scale
        return y, landmark

    def post_process(self, output, origin_h, origin_w):
        """
        description: postprocess the prediction
        param:
            output:     A flat array like [num_boxes, x1,y1,x2,y2,conf,landmark_x1,landmark_y1,
                        landmark_x2,landmark_y2,...], 15 values per box after the count
            origin_h:   height of original image
            origin_w:   width of original image
        return:
            result_boxes: final boxes, a boxes tensor, each row is a box [x1, y1, x2, y2]
            result_scores: final scores, a tensor, each element is the score corresponding to box
            result_landmark: final landmarks, a tensor, each row holds the 10 landmark
                             coordinates corresponding to box
        """
        # Get the num of boxes detected
        num = int(output[0])
        # Reshape to a two dimensional ndarray
        pred = np.reshape(output[1:], (-1, 15))[:num, :]
        # to torch Tensor
        pred = torch.Tensor(pred).cuda()
        # Get the boxes
        boxes = pred[:, :4]
        # Get the scores
        scores = pred[:, 4]
        # Get the landmark
        landmark = pred[:, 5:15]
        # Choose those boxes that score > CONF_THRESH
        si = scores > CONF_THRESH
        boxes = boxes[si, :]
        scores = scores[si]
        landmark = landmark[si, :]
        # Map boxes and landmarks back to original image coordinates
        boxes, landmark = self.xywh2xyxy(origin_h, origin_w, boxes, landmark)
        # Do nms
        indices = torchvision.ops.nms(boxes, scores, iou_threshold=IOU_THRESHOLD).cpu()
        result_boxes = boxes[indices, :].cpu()
        result_scores = scores[indices].cpu()
        result_landmark = landmark[indices].cpu()
        return result_boxes, result_scores, result_landmark
myThread(threading.Thread): def __init__(self, func, args): threading.Thread.__init__(self) self.func = func self.args = args def run(self): self.func(*self.args) if __name__ == "__main__": # load custom plugins,make sure it has been generated PLUGIN_LIBRARY = "build/libdecodeplugin.so" ctypes.CDLL(PLUGIN_LIBRARY) engine_file_path = "build/retina_r50.engine" retinaface = Retinaface_trt(engine_file_path) input_image_paths = ["zidane.jpg"] for i in range(10): for input_image_path in input_image_paths: # create a new thread to do inference thread = myThread(retinaface.infer, [input_image_path]) thread.start() thread.join() # destroy the instance retinaface.destroy() ================================================ FILE: retinafaceAntiCov/CMakeLists.txt ================================================ cmake_minimum_required(VERSION 2.6) project(retinafaceAntiCov) add_definitions(-std=c++11) option(CUDA_USE_STATIC_CUDA_RUNTIME OFF) set(CMAKE_CXX_STANDARD 11) set(CMAKE_BUILD_TYPE Debug) find_package(CUDA REQUIRED) include_directories(${PROJECT_SOURCE_DIR}/include) if (CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64") message("embed_platform on") include_directories(/usr/local/cuda/targets/aarch64-linux/include) link_directories(/usr/local/cuda/targets/aarch64-linux/lib) else() message("embed_platform off") # cuda include_directories(/usr/local/cuda/include) link_directories(/usr/local/cuda/lib64) # tensorrt include_directories(/home/lindsay/TensorRT-8.6.1.6/include) link_directories(/home/lindsay/TensorRT-8.6.1.6/lib) # include_directories(/home/lindsay/TensorRT-7.2.3.4/include) # link_directories(/home/lindsay/TensorRT-7.2.3.4/lib) endif() set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11 -Wall -Ofast -Wfatal-errors -D_MWAITXINTRIN_H_INCLUDED") cuda_add_library(myplugins SHARED ${PROJECT_SOURCE_DIR}/decode.cu) find_package(OpenCV) include_directories(${OpenCV_INCLUDE_DIRS}) add_executable(retinafaceAntiCov ${PROJECT_SOURCE_DIR}/retinafaceAntiCov.cpp) 
target_link_libraries(retinafaceAntiCov nvinfer)
target_link_libraries(retinafaceAntiCov cudart)
target_link_libraries(retinafaceAntiCov myplugins)
target_link_libraries(retinafaceAntiCov ${OpenCV_LIBS})

add_definitions(-O2 -pthread)

================================================
FILE: retinafaceAntiCov/README.md
================================================
# RetinaFaceAntiCov

The MXNet implementation is [deepinsight/insightface/RetinaFaceAntiCov](https://github.com/deepinsight/insightface/tree/master/RetinaFaceAntiCov).

## Run

```
1. generate retinafaceAntiCov.wts from the MXNet implementation.

git clone https://github.com/deepinsight/insightface.git
cd insightface/RetinaFaceAntiCov
// download its weights 'cov2.zip', put it into insightface/RetinaFaceAntiCov, and unzip it
// put tensorrtx/retinafaceAntiCov/gen_wts.py into insightface/RetinaFaceAntiCov
python gen_wts.py
// a file 'retinafaceAntiCov.wts' will be generated.

2. put retinafaceAntiCov.wts into tensorrtx/retinafaceAntiCov, build and run

git clone https://github.com/wang-xinyu/tensorrtx.git
cd tensorrtx/retinafaceAntiCov
// put retinafaceAntiCov.wts here
mkdir build
cd build
cmake ..
make
sudo ./retinafaceAntiCov -s // build and serialize model to file i.e. 'retinafaceAntiCov.engine'
wget http://www.kaixian.tv/gd/d/file/201611/07/23efff3a26e2385620e719378c654fb1.jpg -O test.jpg
sudo ./retinafaceAntiCov -d // deserialize model file and run inference.

3. check the generated image 'out.jpg'
```

## Config - Input shape `INPUT_H`, `INPUT_W` defined in `decode.h` - FP16/FP32 can be selected by the macro `USE_FP16` in `retinafaceAntiCov.cpp` - GPU id can be selected by the macro `DEVICE` in `retinafaceAntiCov.cpp` ## More Information See the readme in [home page.](https://github.com/wang-xinyu/tensorrtx) ================================================ FILE: retinafaceAntiCov/decode.cu ================================================ #include "decode.h" #include "stdio.h" namespace nvinfer1 { DecodePlugin::DecodePlugin() { } DecodePlugin::~DecodePlugin() { } // create the plugin at runtime from a byte stream DecodePlugin::DecodePlugin(const void* data, size_t length) { } void DecodePlugin::serialize(void* buffer) const TRT_NOEXCEPT { } size_t DecodePlugin::getSerializationSize() const TRT_NOEXCEPT { return 0; } int DecodePlugin::initialize() TRT_NOEXCEPT { return 0; } Dims DecodePlugin::getOutputDimensions(int index, const Dims* inputs, int nbInputDims) TRT_NOEXCEPT { //output the result to channel int totalCount = 0; totalCount += decodeplugin::INPUT_H / 8 * decodeplugin::INPUT_W / 8 * 2 * sizeof(decodeplugin::Detection) / sizeof(float); totalCount += decodeplugin::INPUT_H / 16 * decodeplugin::INPUT_W / 16 * 2 * sizeof(decodeplugin::Detection) / sizeof(float); totalCount += decodeplugin::INPUT_H / 32 * decodeplugin::INPUT_W / 32 * 2 * sizeof(decodeplugin::Detection) / sizeof(float); return Dims3(totalCount + 1, 1, 1); } // Set plugin namespace void DecodePlugin::setPluginNamespace(const char* pluginNamespace) TRT_NOEXCEPT { mPluginNamespace = pluginNamespace; } const char* DecodePlugin::getPluginNamespace() const TRT_NOEXCEPT { return mPluginNamespace; } // Return the DataType of the plugin output at the requested index DataType DecodePlugin::getOutputDataType(int index, const nvinfer1::DataType* inputTypes, int nbInputs) const TRT_NOEXCEPT { return DataType::kFLOAT; } // Return true if output tensor is broadcast across a batch. 
bool DecodePlugin::isOutputBroadcastAcrossBatch(int outputIndex, const bool* inputIsBroadcasted, int nbInputs) const TRT_NOEXCEPT { return false; } // Return true if plugin can use input that is broadcast across batch without replication. bool DecodePlugin::canBroadcastInputAcrossBatch(int inputIndex) const TRT_NOEXCEPT { return false; } void DecodePlugin::configurePlugin(const PluginTensorDesc* in, int nbInput, const PluginTensorDesc* out, int nbOutput) TRT_NOEXCEPT { } // Attach the plugin object to an execution context and grant the plugin the access to some context resource. void DecodePlugin::attachToContext(cudnnContext* cudnnContext, cublasContext* cublasContext, IGpuAllocator* gpuAllocator) TRT_NOEXCEPT { } // Detach the plugin object from its execution context. void DecodePlugin::detachFromContext() TRT_NOEXCEPT {} const char* DecodePlugin::getPluginType() const TRT_NOEXCEPT { return "Decode_TRT"; } const char* DecodePlugin::getPluginVersion() const TRT_NOEXCEPT { return "1"; } void DecodePlugin::destroy() TRT_NOEXCEPT { delete this; } // Clone the plugin IPluginV2IOExt* DecodePlugin::clone() const TRT_NOEXCEPT { DecodePlugin *p = new DecodePlugin(); p->setPluginNamespace(mPluginNamespace); return p; } __device__ float Logist(float data){ return 1./(1. 
+ expf(-data)); }; __global__ void CalDetection(const float *input, float *output, int num_elem, int step, int anchor) { int idx = threadIdx.x + blockDim.x * blockIdx.x; if (idx >= num_elem) return; int h = decodeplugin::INPUT_H / step; int w = decodeplugin::INPUT_W / step; int y = idx / w; int x = idx % w; const float *cls_reg = &input[2 * num_elem]; const float *bbox_reg = &input[4 * num_elem]; const float *lmk_reg = &input[12 * num_elem]; const float *mask_reg = &input[36 * num_elem]; for (int k = 0; k < 2; ++k) { float conf = cls_reg[idx + k * num_elem]; if (conf < 0.5) continue; float *res_count = output; int count = (int)atomicAdd(res_count, 1); char* data = (char *)res_count + sizeof(float) + count * sizeof(decodeplugin::Detection); decodeplugin::Detection* det = (decodeplugin::Detection*)(data); float prior[4]; prior[0] = 7.5 + (float)(x * step); prior[1] = 7.5 + (float)(y * step); prior[2] = anchor * 2 / (k + 1); prior[3] = prior[2]; //Location det->bbox[0] = prior[0] + bbox_reg[idx + k * num_elem * 4] * prior[2]; det->bbox[1] = prior[1] + bbox_reg[idx + k * num_elem * 4 + num_elem] * prior[3]; det->bbox[2] = prior[2] * expf(bbox_reg[idx + k * num_elem * 4 + num_elem * 2]); det->bbox[3] = prior[3] * expf(bbox_reg[idx + k * num_elem * 4 + num_elem * 3]); det->bbox[0] -= (det->bbox[2] - 1) / 2; det->bbox[1] -= (det->bbox[3] - 1) / 2; det->bbox[2] += det->bbox[0]; det->bbox[3] += det->bbox[1]; det->class_confidence = conf; for (int i = 0; i < 10; i += 2) { det->landmark[i] = prior[0] + lmk_reg[idx + k * num_elem * 10 + num_elem * i] * 0.2 * prior[2]; det->landmark[i+1] = prior[1] + lmk_reg[idx + k * num_elem * 10 + num_elem * (i + 1)] * 0.2 * prior[3]; } det->mask_confidence = mask_reg[idx + k * num_elem];; } } void DecodePlugin::forwardGpu(const float *const * inputs, float * output, cudaStream_t stream, int batchSize) { int num_elem = 0; int base_step = 8; int base_anchor = 16; int thread_count; cudaMemset(output, 0, sizeof(float)); for (unsigned int i = 0; 
i < 3; ++i) { num_elem = decodeplugin::INPUT_H / base_step * decodeplugin::INPUT_W / base_step; thread_count = (num_elem < thread_count_) ? num_elem : thread_count_; CalDetection<<< (num_elem + thread_count - 1) / thread_count, thread_count>>> (inputs[i], output, num_elem, base_step, base_anchor); base_step *= 2; base_anchor *= 4; } } int DecodePlugin::enqueue(int batchSize, const void*const * inputs, void*TRT_CONST_ENQUEUE* outputs, void* workspace, cudaStream_t stream) TRT_NOEXCEPT { //assert(batchSize == 1); //GPU //CUDA_CHECK(cudaStreamSynchronize(stream)); forwardGpu((const float *const *)inputs,(float *)outputs[0],stream,batchSize); return 0; }; PluginFieldCollection DecodePluginCreator::mFC{}; std::vector DecodePluginCreator::mPluginAttributes; DecodePluginCreator::DecodePluginCreator() { mPluginAttributes.clear(); mFC.nbFields = mPluginAttributes.size(); mFC.fields = mPluginAttributes.data(); } const char* DecodePluginCreator::getPluginName() const TRT_NOEXCEPT { return "Decode_TRT"; } const char* DecodePluginCreator::getPluginVersion() const TRT_NOEXCEPT { return "1"; } const PluginFieldCollection* DecodePluginCreator::getFieldNames() TRT_NOEXCEPT { return &mFC; } IPluginV2IOExt* DecodePluginCreator::createPlugin(const char* name, const PluginFieldCollection* fc) TRT_NOEXCEPT { DecodePlugin* obj = new DecodePlugin(); obj->setPluginNamespace(mNamespace.c_str()); return obj; } IPluginV2IOExt* DecodePluginCreator::deserializePlugin(const char* name, const void* serialData, size_t serialLength) TRT_NOEXCEPT { // This object will be deleted when the network is destroyed, which will // call PReluPlugin::destroy() DecodePlugin* obj = new DecodePlugin(serialData, serialLength); obj->setPluginNamespace(mNamespace.c_str()); return obj; } } ================================================ FILE: retinafaceAntiCov/decode.h ================================================ #ifndef _DECODE_CU_H #define _DECODE_CU_H #include #include #include #include "NvInfer.h" #include 
"macros.h" namespace decodeplugin { struct alignas(float) Detection{ float bbox[4]; //x1 y1 x2 y2 float class_confidence; float landmark[10]; float mask_confidence; }; static const int INPUT_H = 640; static const int INPUT_W = 640; // std::ostream& operator << (std::ostream& os, const decodeplugin::Detection& det) { // for(int i = 0; i < 10; i += 2){ // os << det.mask_confidence << " "; // } // return os; // } } namespace nvinfer1 { class DecodePlugin: public IPluginV2IOExt { public: DecodePlugin(); DecodePlugin(const void* data, size_t length); ~DecodePlugin(); int getNbOutputs() const TRT_NOEXCEPT override { return 1; } Dims getOutputDimensions(int index, const Dims* inputs, int nbInputDims) TRT_NOEXCEPT override; int initialize() TRT_NOEXCEPT override; virtual void terminate() TRT_NOEXCEPT override {}; virtual size_t getWorkspaceSize(int maxBatchSize) const TRT_NOEXCEPT override { return 0;} virtual int enqueue(int batchSize, const void*const * inputs, void*TRT_CONST_ENQUEUE* outputs, void* workspace, cudaStream_t stream) TRT_NOEXCEPT override; virtual size_t getSerializationSize() const TRT_NOEXCEPT override; virtual void serialize(void* buffer) const TRT_NOEXCEPT override; bool supportsFormatCombination(int pos, const PluginTensorDesc* inOut, int nbInputs, int nbOutputs) const TRT_NOEXCEPT override { return inOut[pos].format == TensorFormat::kLINEAR && inOut[pos].type == DataType::kFLOAT; } const char* getPluginType() const TRT_NOEXCEPT override; const char* getPluginVersion() const TRT_NOEXCEPT override; void destroy() TRT_NOEXCEPT override; IPluginV2IOExt* clone() const TRT_NOEXCEPT override; void setPluginNamespace(const char* pluginNamespace) TRT_NOEXCEPT override; const char* getPluginNamespace() const TRT_NOEXCEPT override; DataType getOutputDataType(int index, const nvinfer1::DataType* inputTypes, int nbInputs) const TRT_NOEXCEPT override; bool isOutputBroadcastAcrossBatch(int outputIndex, const bool* inputIsBroadcasted, int nbInputs) const TRT_NOEXCEPT 
override; bool canBroadcastInputAcrossBatch(int inputIndex) const TRT_NOEXCEPT override; void attachToContext( cudnnContext* cudnnContext, cublasContext* cublasContext, IGpuAllocator* gpuAllocator) TRT_NOEXCEPT override; void configurePlugin(const PluginTensorDesc* in, int nbInput, const PluginTensorDesc* out, int nbOutput) TRT_NOEXCEPT override; void detachFromContext() TRT_NOEXCEPT override; int input_size_; private: void forwardGpu(const float *const * inputs, float* output, cudaStream_t stream, int batchSize = 1); int thread_count_ = 256; const char* mPluginNamespace; }; class DecodePluginCreator : public IPluginCreator { public: DecodePluginCreator(); ~DecodePluginCreator() TRT_NOEXCEPT override = default; const char* getPluginName() const TRT_NOEXCEPT override; const char* getPluginVersion() const TRT_NOEXCEPT override; const PluginFieldCollection* getFieldNames() TRT_NOEXCEPT override; IPluginV2IOExt* createPlugin(const char* name, const PluginFieldCollection* fc) TRT_NOEXCEPT override; IPluginV2IOExt* deserializePlugin(const char* name, const void* serialData, size_t serialLength) TRT_NOEXCEPT override; void setPluginNamespace(const char* libNamespace) TRT_NOEXCEPT override { mNamespace = libNamespace; } const char* getPluginNamespace() const TRT_NOEXCEPT override { return mNamespace.c_str(); } private: std::string mNamespace; static PluginFieldCollection mFC; static std::vector mPluginAttributes; }; }; #endif ================================================ FILE: retinafaceAntiCov/gen_wts.py ================================================ import struct from retinaface_cov import RetinaFaceCoV gpuid = 0 model = RetinaFaceCoV('./cov2/mnet_cov2', 0, gpuid, 'net3l') f = open('retinafaceAntiCov.wts', 'w') f.write('{}\n'.format(len(model.model.get_params()[0].keys()) + len(model.model.get_params()[1].keys()))) for k, v in model.model.get_params()[0].items(): vr = v.reshape(-1).asnumpy() f.write('{} {} '.format(k, len(vr))) for vv in vr: f.write(' ') 
f.write(struct.pack('>f',float(vv)).hex()) f.write('\n') for k, v in model.model.get_params()[1].items(): vr = v.reshape(-1).asnumpy() f.write('{} {} '.format(k, len(vr))) for vv in vr: f.write(' ') f.write(struct.pack('>f',float(vv)).hex()) f.write('\n') ================================================ FILE: retinafaceAntiCov/logging.h ================================================ /* * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ #ifndef TENSORRT_LOGGING_H #define TENSORRT_LOGGING_H #include "NvInferRuntimeCommon.h" #include #include #include #include #include #include #include #include "macros.h" using Severity = nvinfer1::ILogger::Severity; class LogStreamConsumerBuffer : public std::stringbuf { public: LogStreamConsumerBuffer(std::ostream& stream, const std::string& prefix, bool shouldLog) : mOutput(stream) , mPrefix(prefix) , mShouldLog(shouldLog) { } LogStreamConsumerBuffer(LogStreamConsumerBuffer&& other) : mOutput(other.mOutput) { } ~LogStreamConsumerBuffer() { // std::streambuf::pbase() gives a pointer to the beginning of the buffered part of the output sequence // std::streambuf::pptr() gives a pointer to the current position of the output sequence // if the pointer to the beginning is not equal to the pointer to the current position, // call putOutput() to log the output to the stream if (pbase() != pptr()) { putOutput(); } } // synchronizes the stream buffer and returns 0 on success // synchronizing 
the stream buffer consists of inserting the buffer contents into the stream, // resetting the buffer and flushing the stream virtual int sync() { putOutput(); return 0; } void putOutput() { if (mShouldLog) { // prepend timestamp std::time_t timestamp = std::time(nullptr); tm* tm_local = std::localtime(×tamp); std::cout << "["; std::cout << std::setw(2) << std::setfill('0') << 1 + tm_local->tm_mon << "/"; std::cout << std::setw(2) << std::setfill('0') << tm_local->tm_mday << "/"; std::cout << std::setw(4) << std::setfill('0') << 1900 + tm_local->tm_year << "-"; std::cout << std::setw(2) << std::setfill('0') << tm_local->tm_hour << ":"; std::cout << std::setw(2) << std::setfill('0') << tm_local->tm_min << ":"; std::cout << std::setw(2) << std::setfill('0') << tm_local->tm_sec << "] "; // std::stringbuf::str() gets the string contents of the buffer // insert the buffer contents pre-appended by the appropriate prefix into the stream mOutput << mPrefix << str(); // set the buffer to empty str(""); // flush the stream mOutput.flush(); } } void setShouldLog(bool shouldLog) { mShouldLog = shouldLog; } private: std::ostream& mOutput; std::string mPrefix; bool mShouldLog; }; //! //! \class LogStreamConsumerBase //! \brief Convenience object used to initialize LogStreamConsumerBuffer before std::ostream in LogStreamConsumer //! class LogStreamConsumerBase { public: LogStreamConsumerBase(std::ostream& stream, const std::string& prefix, bool shouldLog) : mBuffer(stream, prefix, shouldLog) { } protected: LogStreamConsumerBuffer mBuffer; }; //! //! \class LogStreamConsumer //! \brief Convenience object used to facilitate use of C++ stream syntax when logging messages. //! Order of base classes is LogStreamConsumerBase and then std::ostream. //! This is because the LogStreamConsumerBase class is used to initialize the LogStreamConsumerBuffer member field //! in LogStreamConsumer and then the address of the buffer is passed to std::ostream. //! 
This is necessary to prevent the address of an uninitialized buffer from being passed to std::ostream. //! Please do not change the order of the parent classes. //! class LogStreamConsumer : protected LogStreamConsumerBase, public std::ostream { public: //! \brief Creates a LogStreamConsumer which logs messages with level severity. //! Reportable severity determines if the messages are severe enough to be logged. LogStreamConsumer(Severity reportableSeverity, Severity severity) : LogStreamConsumerBase(severityOstream(severity), severityPrefix(severity), severity <= reportableSeverity) , std::ostream(&mBuffer) // links the stream buffer with the stream , mShouldLog(severity <= reportableSeverity) , mSeverity(severity) { } LogStreamConsumer(LogStreamConsumer&& other) : LogStreamConsumerBase(severityOstream(other.mSeverity), severityPrefix(other.mSeverity), other.mShouldLog) , std::ostream(&mBuffer) // links the stream buffer with the stream , mShouldLog(other.mShouldLog) , mSeverity(other.mSeverity) { } void setReportableSeverity(Severity reportableSeverity) { mShouldLog = mSeverity <= reportableSeverity; mBuffer.setShouldLog(mShouldLog); } private: static std::ostream& severityOstream(Severity severity) { return severity >= Severity::kINFO ? std::cout : std::cerr; } static std::string severityPrefix(Severity severity) { switch (severity) { case Severity::kINTERNAL_ERROR: return "[F] "; case Severity::kERROR: return "[E] "; case Severity::kWARNING: return "[W] "; case Severity::kINFO: return "[I] "; case Severity::kVERBOSE: return "[V] "; default: assert(0); return ""; } } bool mShouldLog; Severity mSeverity; }; //! \class Logger //! //! \brief Class which manages logging of TensorRT tools and samples //! //! \details This class provides a common interface for TensorRT tools and samples to log information to the console, //! and supports logging two types of messages: //! //! 
- Debugging messages with an associated severity (info, warning, error, or internal error/fatal) //! - Test pass/fail messages //! //! The advantage of having all samples use this class for logging as opposed to emitting directly to stdout/stderr is //! that the logic for controlling the verbosity and formatting of sample output is centralized in one location. //! //! In the future, this class could be extended to support dumping test results to a file in some standard format //! (for example, JUnit XML), and providing additional metadata (e.g. timing the duration of a test run). //! //! TODO: For backwards compatibility with existing samples, this class inherits directly from the nvinfer1::ILogger //! interface, which is problematic since there isn't a clean separation between messages coming from the TensorRT //! library and messages coming from the sample. //! //! In the future (once all samples are updated to use Logger::getTRTLogger() to access the ILogger) we can refactor the //! class to eliminate the inheritance and instead make the nvinfer1::ILogger implementation a member of the Logger //! object. class Logger : public nvinfer1::ILogger { public: Logger(Severity severity = Severity::kWARNING) : mReportableSeverity(severity) { } //! //! \enum TestResult //! \brief Represents the state of a given test //! enum class TestResult { kRUNNING, //!< The test is running kPASSED, //!< The test passed kFAILED, //!< The test failed kWAIVED //!< The test was waived }; //! //! \brief Forward-compatible method for retrieving the nvinfer::ILogger associated with this Logger //! \return The nvinfer1::ILogger associated with this Logger //! //! TODO Once all samples are updated to use this method to register the logger with TensorRT, //! we can eliminate the inheritance of Logger from ILogger //! nvinfer1::ILogger& getTRTLogger() { return *this; } //! //! \brief Implementation of the nvinfer1::ILogger::log() virtual method //! //! 
Note samples should not be calling this function directly; it will eventually go away once we eliminate the //! inheritance from nvinfer1::ILogger //! void log(Severity severity, const char* msg) TRT_NOEXCEPT override { LogStreamConsumer(mReportableSeverity, severity) << "[TRT] " << std::string(msg) << std::endl; } //! //! \brief Method for controlling the verbosity of logging output //! //! \param severity The logger will only emit messages that have severity of this level or higher. //! void setReportableSeverity(Severity severity) { mReportableSeverity = severity; } //! //! \brief Opaque handle that holds logging information for a particular test //! //! This object is an opaque handle to information used by the Logger to print test results. //! The sample must call Logger::defineTest() in order to obtain a TestAtom that can be used //! with Logger::reportTest{Start,End}(). //! class TestAtom { public: TestAtom(TestAtom&&) = default; private: friend class Logger; TestAtom(bool started, const std::string& name, const std::string& cmdline) : mStarted(started) , mName(name) , mCmdline(cmdline) { } bool mStarted; std::string mName; std::string mCmdline; }; //! //! \brief Define a test for logging //! //! \param[in] name The name of the test. This should be a string starting with //! "TensorRT" and containing dot-separated strings containing //! the characters [A-Za-z0-9_]. //! For example, "TensorRT.sample_googlenet" //! \param[in] cmdline The command line used to reproduce the test // //! \return a TestAtom that can be used in Logger::reportTest{Start,End}(). //! static TestAtom defineTest(const std::string& name, const std::string& cmdline) { return TestAtom(false, name, cmdline); } //! //! \brief A convenience overloaded version of defineTest() that accepts an array of command-line arguments //! as input //! //! \param[in] name The name of the test //! \param[in] argc The number of command-line arguments //! 
\param[in] argv The array of command-line arguments (given as C strings) //! //! \return a TestAtom that can be used in Logger::reportTest{Start,End}(). static TestAtom defineTest(const std::string& name, int argc, char const* const* argv) { auto cmdline = genCmdlineString(argc, argv); return defineTest(name, cmdline); } //! //! \brief Report that a test has started. //! //! \pre reportTestStart() has not been called yet for the given testAtom //! //! \param[in] testAtom The handle to the test that has started //! static void reportTestStart(TestAtom& testAtom) { reportTestResult(testAtom, TestResult::kRUNNING); assert(!testAtom.mStarted); testAtom.mStarted = true; } //! //! \brief Report that a test has ended. //! //! \pre reportTestStart() has been called for the given testAtom //! //! \param[in] testAtom The handle to the test that has ended //! \param[in] result The result of the test. Should be one of TestResult::kPASSED, //! TestResult::kFAILED, TestResult::kWAIVED //! static void reportTestEnd(const TestAtom& testAtom, TestResult result) { assert(result != TestResult::kRUNNING); assert(testAtom.mStarted); reportTestResult(testAtom, result); } static int reportPass(const TestAtom& testAtom) { reportTestEnd(testAtom, TestResult::kPASSED); return EXIT_SUCCESS; } static int reportFail(const TestAtom& testAtom) { reportTestEnd(testAtom, TestResult::kFAILED); return EXIT_FAILURE; } static int reportWaive(const TestAtom& testAtom) { reportTestEnd(testAtom, TestResult::kWAIVED); return EXIT_SUCCESS; } static int reportTest(const TestAtom& testAtom, bool pass) { return pass ? reportPass(testAtom) : reportFail(testAtom); } Severity getReportableSeverity() const { return mReportableSeverity; } private: //! //! \brief returns an appropriate string for prefixing a log message with the given severity //! 
static const char* severityPrefix(Severity severity) { switch (severity) { case Severity::kINTERNAL_ERROR: return "[F] "; case Severity::kERROR: return "[E] "; case Severity::kWARNING: return "[W] "; case Severity::kINFO: return "[I] "; case Severity::kVERBOSE: return "[V] "; default: assert(0); return ""; } } //! //! \brief returns an appropriate string for prefixing a test result message with the given result //! static const char* testResultString(TestResult result) { switch (result) { case TestResult::kRUNNING: return "RUNNING"; case TestResult::kPASSED: return "PASSED"; case TestResult::kFAILED: return "FAILED"; case TestResult::kWAIVED: return "WAIVED"; default: assert(0); return ""; } } //! //! \brief returns an appropriate output stream (cout or cerr) to use with the given severity //! static std::ostream& severityOstream(Severity severity) { return severity >= Severity::kINFO ? std::cout : std::cerr; } //! //! \brief method that implements logging test results //! static void reportTestResult(const TestAtom& testAtom, TestResult result) { severityOstream(Severity::kINFO) << "&&&& " << testResultString(result) << " " << testAtom.mName << " # " << testAtom.mCmdline << std::endl; } //! //! \brief generate a command line string from the given (argc, argv) values //! static std::string genCmdlineString(int argc, char const* const* argv) { std::stringstream ss; for (int i = 0; i < argc; i++) { if (i > 0) ss << " "; ss << argv[i]; } return ss.str(); } Severity mReportableSeverity; }; namespace { //! //! \brief produces a LogStreamConsumer object that can be used to log messages of severity kVERBOSE //! //! Example usage: //! //! LOG_VERBOSE(logger) << "hello world" << std::endl; //! inline LogStreamConsumer LOG_VERBOSE(const Logger& logger) { return LogStreamConsumer(logger.getReportableSeverity(), Severity::kVERBOSE); } //! //! \brief produces a LogStreamConsumer object that can be used to log messages of severity kINFO //! //! Example usage: //! //! 
LOG_INFO(logger) << "hello world" << std::endl; //! inline LogStreamConsumer LOG_INFO(const Logger& logger) { return LogStreamConsumer(logger.getReportableSeverity(), Severity::kINFO); } //! //! \brief produces a LogStreamConsumer object that can be used to log messages of severity kWARNING //! //! Example usage: //! //! LOG_WARN(logger) << "hello world" << std::endl; //! inline LogStreamConsumer LOG_WARN(const Logger& logger) { return LogStreamConsumer(logger.getReportableSeverity(), Severity::kWARNING); } //! //! \brief produces a LogStreamConsumer object that can be used to log messages of severity kERROR //! //! Example usage: //! //! LOG_ERROR(logger) << "hello world" << std::endl; //! inline LogStreamConsumer LOG_ERROR(const Logger& logger) { return LogStreamConsumer(logger.getReportableSeverity(), Severity::kERROR); } //! //! \brief produces a LogStreamConsumer object that can be used to log messages of severity kINTERNAL_ERROR // ("fatal" severity) //! //! Example usage: //! //! LOG_FATAL(logger) << "hello world" << std::endl; //! 
inline LogStreamConsumer LOG_FATAL(const Logger& logger) { return LogStreamConsumer(logger.getReportableSeverity(), Severity::kINTERNAL_ERROR); } } // anonymous namespace #endif // TENSORRT_LOGGING_H ================================================ FILE: retinafaceAntiCov/macros.h ================================================ #ifndef __MACROS_H #define __MACROS_H #if NV_TENSORRT_MAJOR >= 8 #define TRT_NOEXCEPT noexcept #define TRT_CONST_ENQUEUE const #else #define TRT_NOEXCEPT #define TRT_CONST_ENQUEUE #endif #endif // __MACROS_H ================================================ FILE: retinafaceAntiCov/retinafaceAntiCov.cpp ================================================ #include #include #include #include #include #include #include #include #include "NvInfer.h" #include "cuda_runtime_api.h" #include "logging.h" #include "decode.h" #define CHECK(status) \ do\ {\ auto ret = (status);\ if (ret != 0)\ {\ std::cerr << "Cuda failure: " << ret << std::endl;\ abort();\ }\ } while (0) //#define USE_FP16 // comment out this if want to use FP32 #define DEVICE 0 // GPU id #define BATCH_SIZE 1 // currently, only support BATCH=1 using namespace nvinfer1; // stuff we know about the network and the input/output blobs static const int INPUT_H = decodeplugin::INPUT_H; static const int INPUT_W = decodeplugin::INPUT_W; static const int DETECTION_SIZE = sizeof(decodeplugin::Detection) / sizeof(float); static const int OUTPUT_SIZE = (INPUT_H / 8 * INPUT_W / 8 + INPUT_H / 16 * INPUT_W / 16 + INPUT_H / 32 * INPUT_W / 32) * 2 * DETECTION_SIZE + 1; const char* INPUT_BLOB_NAME = "data"; const char* OUTPUT_BLOB_NAME = "prob"; static Logger gLogger; REGISTER_TENSORRT_PLUGIN(DecodePluginCreator); cv::Mat preprocess_img(cv::Mat& img) { int w, h, x, y; float r_w = INPUT_W / (img.cols*1.0); float r_h = INPUT_H / (img.rows*1.0); if (r_h > r_w) { w = INPUT_W; h = r_w * img.rows; x = 0; y = (INPUT_H - h) / 2; } else { w = r_h* img.cols; h = INPUT_H; x = (INPUT_W - w) / 2; y = 0; } cv::Mat re(h, 
w, CV_8UC3); cv::resize(img, re, re.size(), 0, 0, cv::INTER_CUBIC); cv::Mat out(INPUT_H, INPUT_W, CV_8UC3, cv::Scalar(128, 128, 128)); re.copyTo(out(cv::Rect(x, y, re.cols, re.rows))); return out; } cv::Rect get_rect_adapt_landmark(cv::Mat& img, float bbox[4], float lmk[10]) { int l, r, t, b; float r_w = INPUT_W / (img.cols * 1.0); float r_h = INPUT_H / (img.rows * 1.0); if (r_h > r_w) { l = bbox[0] / r_w; r = bbox[2] / r_w; t = (bbox[1] - (INPUT_H - r_w * img.rows) / 2) / r_w; b = (bbox[3] - (INPUT_H - r_w * img.rows) / 2) / r_w; for (int i = 0; i < 10; i += 2) { lmk[i] /= r_w; lmk[i + 1] = (lmk[i + 1] - (INPUT_H - r_w * img.rows) / 2) / r_w; } } else { l = (bbox[0] - (INPUT_W - r_h * img.cols) / 2) / r_h; r = (bbox[2] - (INPUT_W - r_h * img.cols) / 2) / r_h; t = bbox[1] / r_h; b = bbox[3] / r_h; for (int i = 0; i < 10; i += 2) { lmk[i] = (lmk[i] - (INPUT_W - r_h * img.cols) / 2) / r_h; lmk[i + 1] /= r_h; } } return cv::Rect(l, t, r-l, b-t); } float iou(float lbox[4], float rbox[4]) { float interBox[] = { std::max(lbox[0], rbox[0]), //left std::min(lbox[2], rbox[2]), //right std::max(lbox[1], rbox[1]), //top std::min(lbox[3], rbox[3]), //bottom }; if(interBox[2] > interBox[3] || interBox[0] > interBox[1]) return 0.0f; float interBoxS = (interBox[1] - interBox[0]) * (interBox[3] - interBox[2]); return interBoxS / ((lbox[2] - lbox[0]) * (lbox[3] - lbox[1]) + (rbox[2] - rbox[0]) * (rbox[3] - rbox[1]) -interBoxS + 0.000001f); } bool cmp(decodeplugin::Detection& a, decodeplugin::Detection& b) { return a.class_confidence > b.class_confidence; } void nms(std::vector& res, float *output, float nms_thresh = 0.4) { std::vector dets; for (int i = 0; i < output[0]; i++) { if (output[DETECTION_SIZE * i + 1 + 4] <= 0.1) continue; decodeplugin::Detection det; memcpy(&det, &output[DETECTION_SIZE * i + 1], sizeof(decodeplugin::Detection)); dets.push_back(det); } std::sort(dets.begin(), dets.end(), cmp); if (dets.size() > 5000) dets.erase(dets.begin() + 5000, dets.end()); for 
(size_t m = 0; m < dets.size(); ++m) { auto& item = dets[m]; res.push_back(item); //std::cout << item.class_confidence << " bbox " << item.bbox[0] << ", " << item.bbox[1] << ", " << item.bbox[2] << ", " << item.bbox[3] << std::endl; for (size_t n = m + 1; n < dets.size(); ++n) { if (iou(item.bbox, dets[n].bbox) > nms_thresh) { dets.erase(dets.begin()+n); --n; } } } } // TensorRT weight files have a simple space delimited format: // [type] [size] std::map loadWeights(const std::string file) { std::cout << "Loading weights: " << file << std::endl; std::map weightMap; // Open weights file std::ifstream input(file); assert(input.is_open() && "Unable to load weight file."); // Read number of weight blobs int32_t count; input >> count; assert(count > 0 && "Invalid weight map file."); while (count--) { Weights wt{DataType::kFLOAT, nullptr, 0}; uint32_t size; // Read name and type of blob std::string name; input >> name >> std::dec >> size; wt.type = DataType::kFLOAT; // Load blob uint32_t* val = reinterpret_cast(malloc(sizeof(val) * size)); for (uint32_t x = 0, y = size; x < y; ++x) { input >> std::hex >> val[x]; } wt.values = val; wt.count = size; weightMap[name] = wt; } return weightMap; } IScaleLayer* addBatchNorm2d(INetworkDefinition *network, std::map& weightMap, ITensor& input, std::string lname, float eps) { float *gamma = (float*)weightMap[lname + "_gamma"].values; float *beta = (float*)weightMap[lname + "_beta"].values; float *mean = (float*)weightMap[lname + "_moving_mean"].values; float *var = (float*)weightMap[lname + "_moving_var"].values; int len = weightMap[lname + "_moving_var"].count; float *scval = reinterpret_cast(malloc(sizeof(float) * len)); for (int i = 0; i < len; i++) { scval[i] = gamma[i] / sqrt(var[i] + eps); } Weights scale{DataType::kFLOAT, scval, len}; float *shval = reinterpret_cast(malloc(sizeof(float) * len)); for (int i = 0; i < len; i++) { shval[i] = beta[i] - mean[i] * gamma[i] / sqrt(var[i] + eps); } Weights shift{DataType::kFLOAT, 
shval, len}; float *pval = reinterpret_cast(malloc(sizeof(float) * len)); for (int i = 0; i < len; i++) { pval[i] = 1.0; } Weights power{DataType::kFLOAT, pval, len}; weightMap[lname + ".scale"] = scale; weightMap[lname + ".shift"] = shift; weightMap[lname + ".power"] = power; IScaleLayer* scale_1 = network->addScale(input, ScaleMode::kCHANNEL, shift, scale, power); assert(scale_1); return scale_1; } ILayer* convBnRelu(INetworkDefinition *network, std::map& weightMap, ITensor& input, int num_filters, int k, int s, int p, int g, std::string lname) { Weights emptywts{DataType::kFLOAT, nullptr, 0}; IConvolutionLayer* conv = network->addConvolutionNd(input, num_filters, DimsHW{k, k}, weightMap[lname + "_conv2d_weight"], emptywts); assert(conv); conv->setStrideNd(DimsHW{s, s}); conv->setPaddingNd(DimsHW{p, p}); conv->setNbGroups(g); auto bn = addBatchNorm2d(network, weightMap, *conv->getOutput(0), lname + "_batchnorm", 1e-3); IActivationLayer* relu = network->addActivation(*bn->getOutput(0), ActivationType::kRELU); assert(relu); return relu; } ILayer* convBiasBnRelu(INetworkDefinition *network, std::map& weightMap, ITensor& input, int num_filters, int k, int s, int p, std::string lname) { IConvolutionLayer* conv = network->addConvolutionNd(input, num_filters, DimsHW{k, k}, weightMap[lname + "_weight"], weightMap[lname + "_bias"]); assert(conv); conv->setStrideNd(DimsHW{s, s}); conv->setPaddingNd(DimsHW{p, p}); auto bn = addBatchNorm2d(network, weightMap, *conv->getOutput(0), lname + "_bn", 2e-5); IActivationLayer* relu = network->addActivation(*bn->getOutput(0), ActivationType::kRELU); assert(relu); return relu; } ILayer* head(INetworkDefinition *network, std::map& weightMap, ITensor& input, std::string lname) { auto conv1 = network->addConvolutionNd(input, 32, DimsHW{3, 3}, weightMap[lname + "_conv1_weight"], weightMap[lname + "_conv1_bias"]); assert(conv1); conv1->setPaddingNd(DimsHW{1, 1}); auto conv1bn = addBatchNorm2d(network, weightMap, *conv1->getOutput(0), lname 
+ "_conv1_bn", 2e-5); auto ctxconv1 = convBiasBnRelu(network, weightMap, input, 16, 3, 1, 1, lname + "_context_conv1"); auto ctxconv2 = network->addConvolutionNd(*ctxconv1->getOutput(0), 16, DimsHW{3, 3}, weightMap[lname + "_context_conv2_weight"], weightMap[lname + "_context_conv2_bias"]); assert(ctxconv2); ctxconv2->setPaddingNd(DimsHW{1, 1}); auto ctxconv2bn = addBatchNorm2d(network, weightMap, *ctxconv2->getOutput(0), lname + "_context_conv2_bn", 2e-5); auto ctxconv3_1 = convBiasBnRelu(network, weightMap, *ctxconv1->getOutput(0), 16, 3, 1, 1, lname + "_context_conv3_1"); auto ctxconv3_2 = network->addConvolutionNd(*ctxconv3_1->getOutput(0), 16, DimsHW{3, 3}, weightMap[lname + "_context_conv3_2_weight"], weightMap[lname + "_context_conv3_2_bias"]); assert(ctxconv3_2); ctxconv3_2->setPaddingNd(DimsHW{1, 1}); auto ctxconv3_2bn = addBatchNorm2d(network, weightMap, *ctxconv3_2->getOutput(0), lname + "_context_conv3_2_bn", 2e-5); ITensor* inputTensors[] = {conv1bn->getOutput(0), ctxconv2bn->getOutput(0), ctxconv3_2bn->getOutput(0)}; auto cat = network->addConcatenation(inputTensors, 3); assert(cat); IActivationLayer* relu = network->addActivation(*cat->getOutput(0), ActivationType::kRELU); assert(relu); return relu; } ILayer* reshapeSoftmax(INetworkDefinition *network, ITensor& input, int c) { auto re1 = network->addShuffle(input); assert(re1); re1->setReshapeDimensions(Dims3(c / 2, -1, 0)); auto sm = network->addSoftMax(*re1->getOutput(0)); assert(sm); auto re2 = network->addShuffle(*sm->getOutput(0)); assert(re2); re2->setReshapeDimensions(Dims3(c, -1, 0)); return re2; } // Creat the engine using only the API and not any parser. 
ICudaEngine* createEngine(unsigned int maxBatchSize, IBuilder* builder, IBuilderConfig* config, DataType dt) { INetworkDefinition* network = builder->createNetworkV2(0U); // Create input tensor of shape {3, INPUT_H, INPUT_W} with name INPUT_BLOB_NAME ITensor* data = network->addInput(INPUT_BLOB_NAME, dt, Dims3{3, INPUT_H, INPUT_W}); assert(data); std::map weightMap = loadWeights("../retinafaceAntiCov.wts"); Weights emptywts{DataType::kFLOAT, nullptr, 0}; auto conv1 = convBnRelu(network, weightMap, *data, 16, 3, 2, 1, 1, "conv_1"); auto conv2 = convBnRelu(network, weightMap, *conv1->getOutput(0), 32, 1, 1, 0, 1, "conv_2"); auto conv3dw = convBnRelu(network, weightMap, *conv2->getOutput(0), 32, 3, 2, 1, 32, "conv_3_dw"); auto conv3 = convBnRelu(network, weightMap, *conv3dw->getOutput(0), 32, 1, 1, 0, 1, "conv_3"); auto conv4dw = convBnRelu(network, weightMap, *conv3->getOutput(0), 32, 3, 1, 1, 32, "conv_4_dw"); auto conv4 = convBnRelu(network, weightMap, *conv4dw->getOutput(0), 32, 1, 1, 0, 1, "conv_4"); auto conv5dw = convBnRelu(network, weightMap, *conv4->getOutput(0), 32, 3, 2, 1, 32, "conv_5_dw"); auto conv5 = convBnRelu(network, weightMap, *conv5dw->getOutput(0), 64, 1, 1, 0, 1, "conv_5"); auto conv6dw = convBnRelu(network, weightMap, *conv5->getOutput(0), 64, 3, 1, 1, 64, "conv_6_dw"); auto conv6 = convBnRelu(network, weightMap, *conv6dw->getOutput(0), 64, 1, 1, 0, 1, "conv_6"); // conv6 to c1 auto conv7dw = convBnRelu(network, weightMap, *conv6->getOutput(0), 64, 3, 2, 1, 64, "conv_7_dw"); auto conv7 = convBnRelu(network, weightMap, *conv7dw->getOutput(0), 128, 1, 1, 0, 1, "conv_7"); auto conv8dw = convBnRelu(network, weightMap, *conv7->getOutput(0), 128, 3, 1, 1, 128, "conv_8_dw"); auto conv8 = convBnRelu(network, weightMap, *conv8dw->getOutput(0), 128, 1, 1, 0, 1, "conv_8"); auto conv9dw = convBnRelu(network, weightMap, *conv8->getOutput(0), 128, 3, 1, 1, 128, "conv_9_dw"); auto conv9 = convBnRelu(network, weightMap, *conv9dw->getOutput(0), 128, 1, 1, 0, 1, 
"conv_9"); auto conv10dw = convBnRelu(network, weightMap, *conv9->getOutput(0), 128, 3, 1, 1, 128, "conv_10_dw"); auto conv10 = convBnRelu(network, weightMap, *conv10dw->getOutput(0), 128, 1, 1, 0, 1, "conv_10"); auto conv11dw = convBnRelu(network, weightMap, *conv10->getOutput(0), 128, 3, 1, 1, 128, "conv_11_dw"); auto conv11 = convBnRelu(network, weightMap, *conv11dw->getOutput(0), 128, 1, 1, 0, 1, "conv_11"); auto conv12dw = convBnRelu(network, weightMap, *conv11->getOutput(0), 128, 3, 1, 1, 128, "conv_12_dw"); auto conv12 = convBnRelu(network, weightMap, *conv12dw->getOutput(0), 128, 1, 1, 0, 1, "conv_12"); // conv12 to c2 auto conv13dw = convBnRelu(network, weightMap, *conv12->getOutput(0), 128, 3, 2, 1, 128, "conv_13_dw"); auto conv13 = convBnRelu(network, weightMap, *conv13dw->getOutput(0), 256, 1, 1, 0, 1, "conv_13"); auto conv14dw = convBnRelu(network, weightMap, *conv13->getOutput(0), 256, 3, 1, 1, 256, "conv_14_dw"); auto conv14 = convBnRelu(network, weightMap, *conv14dw->getOutput(0), 256, 1, 1, 0, 1, "conv_14"); auto conv_final = convBnRelu(network, weightMap, *conv14->getOutput(0), 256, 1, 1, 0, 1, "conv_final"); // convfinal to c3 auto rf_c3_lateral = convBiasBnRelu(network, weightMap, *conv_final->getOutput(0), 64, 1, 1, 0, "rf_c3_lateral"); auto rf_head_s32 = head(network, weightMap, *rf_c3_lateral->getOutput(0), "rf_head_stride32"); ILayer *cls_score_s32 = network->addConvolutionNd(*rf_head_s32->getOutput(0), 4, DimsHW{1, 1}, weightMap["face_rpn_cls_score_stride32_weight"], weightMap["face_rpn_cls_score_stride32_bias"]); cls_score_s32 = reshapeSoftmax(network, *cls_score_s32->getOutput(0), 4); auto bbox_s32 = network->addConvolutionNd(*rf_head_s32->getOutput(0), 8, DimsHW{1, 1}, weightMap["face_rpn_bbox_pred_stride32_weight"], weightMap["face_rpn_bbox_pred_stride32_bias"]); auto landmark_s32 = network->addConvolutionNd(*rf_head_s32->getOutput(0), 20, DimsHW{1, 1}, weightMap["face_rpn_landmark_pred_stride32_weight"], 
weightMap["face_rpn_landmark_pred_stride32_bias"]); auto rf_head2_s32 = head(network, weightMap, *rf_c3_lateral->getOutput(0), "rf_head2_stride32"); ILayer *type_score_s32 = network->addConvolutionNd(*rf_head2_s32->getOutput(0), 6, DimsHW{1, 1}, weightMap["face_rpn_type_score_stride32_weight"], weightMap["face_rpn_type_score_stride32_bias"]); type_score_s32 = reshapeSoftmax(network, *type_score_s32->getOutput(0), 6); float *deval = reinterpret_cast(malloc(sizeof(float) * 64 * 2 * 2)); for (int i = 0; i < 64 * 2 * 2; i++) { deval[i] = 1.0; } Weights deconvwts{DataType::kFLOAT, deval, 64 * 2 * 2}; IDeconvolutionLayer* c3_deconv = network->addDeconvolutionNd(*rf_c3_lateral->getOutput(0), 64, DimsHW{2, 2}, deconvwts, emptywts); assert(c3_deconv); c3_deconv->setStrideNd(DimsHW{2, 2}); c3_deconv->setNbGroups(64); weightMap["c3_deconv"] = deconvwts; auto rf_c2_lateral = convBiasBnRelu(network, weightMap, *conv12->getOutput(0), 64, 1, 1, 0, "rf_c2_lateral"); auto plus0 = network->addElementWise(*c3_deconv->getOutput(0), *rf_c2_lateral->getOutput(0), ElementWiseOperation::kSUM); auto rf_c2_aggr = convBiasBnRelu(network, weightMap, *plus0->getOutput(0), 64, 3, 1, 1, "rf_c2_aggr"); auto rf_head_s16 = head(network, weightMap, *rf_c2_aggr->getOutput(0), "rf_head_stride16"); ILayer *cls_score_s16 = network->addConvolutionNd(*rf_head_s16->getOutput(0), 4, DimsHW{1, 1}, weightMap["face_rpn_cls_score_stride16_weight"], weightMap["face_rpn_cls_score_stride16_bias"]); cls_score_s16 = reshapeSoftmax(network, *cls_score_s16->getOutput(0), 4); auto bbox_s16 = network->addConvolutionNd(*rf_head_s16->getOutput(0), 8, DimsHW{1, 1}, weightMap["face_rpn_bbox_pred_stride16_weight"], weightMap["face_rpn_bbox_pred_stride16_bias"]); auto landmark_s16 = network->addConvolutionNd(*rf_head_s16->getOutput(0), 20, DimsHW{1, 1}, weightMap["face_rpn_landmark_pred_stride16_weight"], weightMap["face_rpn_landmark_pred_stride16_bias"]); auto rf_head2_s16 = head(network, weightMap, 
*rf_c2_aggr->getOutput(0), "rf_head2_stride16"); ILayer *type_score_s16 = network->addConvolutionNd(*rf_head2_s16->getOutput(0), 6, DimsHW{1, 1}, weightMap["face_rpn_type_score_stride16_weight"], weightMap["face_rpn_type_score_stride16_bias"]); type_score_s16 = reshapeSoftmax(network, *type_score_s16->getOutput(0), 6); IDeconvolutionLayer* c2_deconv = network->addDeconvolutionNd(*rf_c2_aggr->getOutput(0), 64, DimsHW{2, 2}, deconvwts, emptywts); assert(c2_deconv); c2_deconv->setStrideNd(DimsHW{2, 2}); c2_deconv->setNbGroups(64); auto rf_c1_red = convBiasBnRelu(network, weightMap, *conv6->getOutput(0), 64, 1, 1, 0, "rf_c1_red_conv"); auto plus1 = network->addElementWise(*c2_deconv->getOutput(0), *rf_c1_red->getOutput(0), ElementWiseOperation::kSUM); auto rf_c1_aggr = convBiasBnRelu(network, weightMap, *plus1->getOutput(0), 64, 3, 1, 1, "rf_c1_aggr"); auto rf_head_s8 = head(network, weightMap, *rf_c1_aggr->getOutput(0), "rf_head_stride8"); ILayer *cls_score_s8 = network->addConvolutionNd(*rf_head_s8->getOutput(0), 4, DimsHW{1, 1}, weightMap["face_rpn_cls_score_stride8_weight"], weightMap["face_rpn_cls_score_stride8_bias"]); cls_score_s8 = reshapeSoftmax(network, *cls_score_s8->getOutput(0), 4); auto bbox_s8 = network->addConvolutionNd(*rf_head_s8->getOutput(0), 8, DimsHW{1, 1}, weightMap["face_rpn_bbox_pred_stride8_weight"], weightMap["face_rpn_bbox_pred_stride8_bias"]); auto landmark_s8 = network->addConvolutionNd(*rf_head_s8->getOutput(0), 20, DimsHW{1, 1}, weightMap["face_rpn_landmark_pred_stride8_weight"], weightMap["face_rpn_landmark_pred_stride8_bias"]); auto rf_head2_s8 = head(network, weightMap, *rf_c1_aggr->getOutput(0), "rf_head2_stride8"); ILayer *type_score_s8 = network->addConvolutionNd(*rf_head2_s8->getOutput(0), 6, DimsHW{1, 1}, weightMap["face_rpn_type_score_stride8_weight"], weightMap["face_rpn_type_score_stride8_bias"]); type_score_s8 = reshapeSoftmax(network, *type_score_s8->getOutput(0), 6); ITensor* inputTensors_s32[] = 
{cls_score_s32->getOutput(0), bbox_s32->getOutput(0), landmark_s32->getOutput(0), type_score_s32->getOutput(0)}; auto cat_s32 = network->addConcatenation(inputTensors_s32, 4); assert(cat_s32); ITensor* inputTensors_s16[] = {cls_score_s16->getOutput(0), bbox_s16->getOutput(0), landmark_s16->getOutput(0), type_score_s16->getOutput(0)}; auto cat_s16 = network->addConcatenation(inputTensors_s16, 4); assert(cat_s16); ITensor* inputTensors_s8[] = {cls_score_s8->getOutput(0), bbox_s8->getOutput(0), landmark_s8->getOutput(0), type_score_s8->getOutput(0)}; auto cat_s8 = network->addConcatenation(inputTensors_s8, 4); assert(cat_s8); auto creator = getPluginRegistry()->getPluginCreator("Decode_TRT", "1"); PluginFieldCollection pfc; IPluginV2 *pluginObj = creator->createPlugin("decode", &pfc); ITensor* inputTensors[] = {cat_s8->getOutput(0), cat_s16->getOutput(0), cat_s32->getOutput(0)}; auto decodelayer = network->addPluginV2(inputTensors, 3, *pluginObj); assert(decodelayer); decodelayer->getOutput(0)->setName(OUTPUT_BLOB_NAME); network->markOutput(*decodelayer->getOutput(0)); // Build engine builder->setMaxBatchSize(maxBatchSize); config->setMaxWorkspaceSize(16 * (1 << 20)); // 16MB #ifdef USE_FP16 config->setFlag(BuilderFlag::kFP16); #endif std::cout << "Building engine, please wait for a while..." << std::endl; ICudaEngine* engine = builder->buildEngineWithConfig(*network, *config); std::cout << "Build engine successfully!" 
<< std::endl; // Don't need the network any more network->destroy(); // Release host memory for (auto& mem : weightMap) { free((void*) (mem.second.values)); } return engine; } void APIToModel(unsigned int maxBatchSize, IHostMemory** modelStream) { // Create builder IBuilder* builder = createInferBuilder(gLogger); IBuilderConfig* config = builder->createBuilderConfig(); // Create model to populate the network, then set the outputs and create an engine ICudaEngine* engine = createEngine(maxBatchSize, builder, config, DataType::kFLOAT); assert(engine != nullptr); // Serialize the engine (*modelStream) = engine->serialize(); // Close everything down engine->destroy(); builder->destroy(); } void doInference(IExecutionContext& context, float* input, float* output, int batchSize) { const ICudaEngine& engine = context.getEngine(); // Pointers to input and output device buffers to pass to engine. // Engine requires exactly IEngine::getNbBindings() number of buffers. assert(engine.getNbBindings() == 2); void* buffers[2]; // In order to bind the buffers, we need to know the names of the input and output tensors. 
// Note that indices are guaranteed to be less than IEngine::getNbBindings() const int inputIndex = engine.getBindingIndex(INPUT_BLOB_NAME); const int outputIndex = engine.getBindingIndex(OUTPUT_BLOB_NAME); // Create GPU buffers on device CHECK(cudaMalloc(&buffers[inputIndex], batchSize * 3 * INPUT_H * INPUT_W * sizeof(float))); CHECK(cudaMalloc(&buffers[outputIndex], batchSize * OUTPUT_SIZE * sizeof(float))); // Create stream cudaStream_t stream; CHECK(cudaStreamCreate(&stream)); // DMA input batch data to device, infer on the batch asynchronously, and DMA output back to host CHECK(cudaMemcpyAsync(buffers[inputIndex], input, batchSize * 3 * INPUT_H * INPUT_W * sizeof(float), cudaMemcpyHostToDevice, stream)); context.enqueue(batchSize, buffers, stream, nullptr); CHECK(cudaMemcpyAsync(output, buffers[outputIndex], batchSize * OUTPUT_SIZE * sizeof(float), cudaMemcpyDeviceToHost, stream)); cudaStreamSynchronize(stream); // Release stream and buffers cudaStreamDestroy(stream); CHECK(cudaFree(buffers[inputIndex])); CHECK(cudaFree(buffers[outputIndex])); } int read_files_in_dir(const char *p_dir_name, std::vector &file_names) { DIR *p_dir = opendir(p_dir_name); if (p_dir == nullptr) { return -1; } struct dirent* p_file = nullptr; while ((p_file = readdir(p_dir)) != nullptr) { if (strcmp(p_file->d_name, ".") != 0 && strcmp(p_file->d_name, "..") != 0) { //std::string cur_file_name(p_dir_name); //cur_file_name += "/"; //cur_file_name += p_file->d_name; std::string cur_file_name(p_file->d_name); file_names.push_back(cur_file_name); } } closedir(p_dir); return 0; } int main(int argc, char** argv) { cudaSetDevice(DEVICE); // create a model using the API directly and serialize it to a stream char *trtModelStream{nullptr}; size_t size{0}; if (argc == 2 && std::string(argv[1]) == "-s") { IHostMemory* modelStream{nullptr}; APIToModel(BATCH_SIZE, &modelStream); assert(modelStream != nullptr); std::ofstream p("retinafaceAntiCov.engine", std::ios::binary); if (!p) { std::cerr << 
"could not open plan output file" << std::endl; return -1; } p.write(reinterpret_cast(modelStream->data()), modelStream->size()); modelStream->destroy(); return 0; } else if (argc == 2 && std::string(argv[1]) == "-d") { std::ifstream file("retinafaceAntiCov.engine", std::ios::binary); if (file.good()) { file.seekg(0, file.end); size = file.tellg(); file.seekg(0, file.beg); trtModelStream = new char[size]; assert(trtModelStream); file.read(trtModelStream, size); file.close(); } } else { std::cerr << "arguments not right!" << std::endl; std::cerr << "./retinafaceAntiCov -s // serialize model to plan file" << std::endl; std::cerr << "./retinafaceAntiCov -d // deserialize plan file and run inference" << std::endl; return -1; } // prepare input data --------------------------- static float data[BATCH_SIZE * 3 * INPUT_H * INPUT_W]; //for (int i = 0; i < 3 * INPUT_H * INPUT_W; i++) // data[i] = 1.0; static float prob[BATCH_SIZE * OUTPUT_SIZE]; IRuntime* runtime = createInferRuntime(gLogger); assert(runtime != nullptr); ICudaEngine* engine = runtime->deserializeCudaEngine(trtModelStream, size); assert(engine != nullptr); IExecutionContext* context = engine->createExecutionContext(); assert(context != nullptr); delete[] trtModelStream; cv::Mat img = cv::imread("test.jpg"); cv::Mat pr_img = preprocess_img(img); for (int i = 0; i < INPUT_H * INPUT_W; i++) { data[i] = ((float)pr_img.at(i)[2] - 127.5) * 0.0078125; data[i + INPUT_H * INPUT_W] = ((float)pr_img.at(i)[1] - 127.5) * 0.0078125; data[i + 2 * INPUT_H * INPUT_W] = ((float)pr_img.at(i)[0] - 127.5) * 0.0078125; } // Run inference auto start = std::chrono::system_clock::now(); doInference(*context, data, prob, BATCH_SIZE); auto end = std::chrono::system_clock::now(); std::cout << std::chrono::duration_cast(end - start).count() << "ms" << std::endl; std::vector res; nms(res, prob); for (size_t j = 0; j < res.size(); j++) { //if (res[j].class_confidence < 0.1) continue; cv::Rect r = get_rect_adapt_landmark(img, res[j].bbox, 
res[j].landmark);
        cv::rectangle(img, r, cv::Scalar(0x27, 0xC1, 0x36), 2);
        cv::putText(img, "face: " + std::to_string((int)(res[j].class_confidence * 100)) + "%", cv::Point(r.x, r.y + 20), cv::FONT_HERSHEY_PLAIN, 1.2, cv::Scalar(0xFF, 0xFF, 0xFF), 1);
        for (int k = 0; k < 10; k += 2) {
            cv::circle(img, cv::Point(res[j].landmark[k], res[j].landmark[k + 1]), 1, cv::Scalar(255 * (k > 2), 255 * (k > 0 && k < 8), 255 * (k < 6)), 4);
        }
        cv::putText(img, "mask: " + std::to_string((int)(res[j].mask_confidence * 100)) + "%", cv::Point(r.x, r.y + 40), cv::FONT_HERSHEY_PLAIN, 1.2, cv::Scalar(0x00, 0x00, 0xFF), 1);
    }
    cv::imwrite("out.jpg", img);

    // Destroy the engine
    context->destroy();
    engine->destroy();
    runtime->destroy();

    //Print histogram of the output distribution
    //std::cout << "\nOutput:\n\n";
    //for (unsigned int i = 0; i < OUTPUT_SIZE; i++)
    //{
    //    std::cout << prob[i] << ", ";
    //    if (i % 10 == 0) std::cout << i / 10 << std::endl;
    //}
    //std::cout << std::endl;

    return 0;
}

================================================
FILE: scaled-yolov4/CMakeLists.txt
================================================
cmake_minimum_required(VERSION 2.6)

project(yolov4)

add_definitions(-std=c++11)

# option() takes (<variable> "<help text>" [value]); without the help text the
# literal OFF was parsed as the description instead of the default value.
# Must stay ahead of find_package(CUDA), which reads this variable.
option(CUDA_USE_STATIC_CUDA_RUNTIME "Use the static version of the CUDA runtime library" OFF)
set(CMAKE_CXX_STANDARD 11)
set(CMAKE_BUILD_TYPE Debug)

find_package(CUDA REQUIRED)

include_directories(${PROJECT_SOURCE_DIR}/include)
# include and link dirs of cuda and tensorrt, you need adapt them if yours are different
# cuda
include_directories(/usr/local/cuda/include)
link_directories(/usr/local/cuda/lib64)
# tensorrt
include_directories(/usr/include/x86_64-linux-gnu/)
link_directories(/usr/lib/x86_64-linux-gnu/)

set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11 -Wall -Ofast -Wfatal-errors -D_MWAITXINTRIN_H_INCLUDED")

cuda_add_library(myplugins SHARED ${PROJECT_SOURCE_DIR}/yololayer.cu ${PROJECT_SOURCE_DIR}/mish.cu)
target_link_libraries(myplugins nvinfer cudart)

# The sources include OpenCV headers unconditionally, so fail at configure
# time instead of at compile time when OpenCV is missing.
find_package(OpenCV REQUIRED)
include_directories(${OpenCV_INCLUDE_DIRS})
add_executable(yolov4csp ${PROJECT_SOURCE_DIR}/yolov4_csp.cpp)
target_link_libraries(yolov4csp nvinfer)
target_link_libraries(yolov4csp cudart)
target_link_libraries(yolov4csp myplugins)
target_link_libraries(yolov4csp ${OpenCV_LIBS})

add_definitions(-O2 -pthread)

================================================
FILE: scaled-yolov4/README.md
================================================
# scaled-yolov4

The PyTorch implementation is from [WongKinYiu/ScaledYOLOv4 yolov4-csp branch](https://github.com/WongKinYiu/ScaledYOLOv4/tree/yolov4-csp).
It can load yolov4-csp.cfg and yolov4-csp.weights (from AlexeyAB/darknet).

Note: There is a slight difference between the yolov4-csp.cfg used by darknet and the one used by PyTorch. Use the one given in the above repo.

## Config

- Input shape `INPUT_H`, `INPUT_W` defined in yololayer.h
- Number of classes `CLASS_NUM` defined in yololayer.h
- FP16/FP32 can be selected by the macro `USE_FP16` in yolov4_csp.cpp
- GPU id can be selected by the macro `DEVICE` in yolov4_csp.cpp
- NMS threshold `NMS_THRESH` in yolov4_csp.cpp
- bbox confidence threshold `BBOX_CONF_THRESH` in yolov4_csp.cpp
- `BATCH_SIZE` in yolov4_csp.cpp

## How to run

1. Generate yolov4_csp.wts from the PyTorch implementation with yolov4-csp.cfg and yolov4-csp.weights.

```
git clone https://github.com/wang-xinyu/tensorrtx.git
git clone -b yolov4-csp https://github.com/WongKinYiu/ScaledYOLOv4.git
// download yolov4-csp.weights from https://github.com/WongKinYiu/ScaledYOLOv4/tree/yolov4-csp#yolov4-csp
cp {tensorrtx}/scaled-yolov4/gen_wts.py {ScaledYOLOv4/}
cd {ScaledYOLOv4/}
python gen_wts.py yolov4-csp.weights
// a file 'yolov4_csp.wts' will be generated.
```

2. Put yolov4_csp.wts into {tensorrtx}/scaled-yolov4, then build and run.

```
mv yolov4_csp.wts {tensorrtx}/scaled-yolov4/
cd {tensorrtx}/scaled-yolov4
mkdir build
cd build
cmake ..
make
sudo ./yolov4csp -s // serialize model to plan file i.e. 'yolov4csp.engine'
sudo ./yolov4csp -d ../../yolov3-spp/samples // deserialize plan file and run inference, the images in samples will be processed.
```

3. Check the generated images, e.g. _zidane.jpg and _bus.jpg

## More Information See the readme in [home page.](https://github.com/wang-xinyu/tensorrtx) ================================================ FILE: scaled-yolov4/common.hpp ================================================ #include #include #include #include #include #include "NvInfer.h" #include "yololayer.h" #include "mish.h" using namespace nvinfer1; cv::Mat preprocess_img(cv::Mat& img) { int w, h, x, y; float r_w = Yolo::INPUT_W / (img.cols*1.0); float r_h = Yolo::INPUT_H / (img.rows*1.0); if (r_h > r_w) { w = Yolo::INPUT_W; h = r_w * img.rows; x = 0; y = (Yolo::INPUT_H - h) / 2; } else { w = r_h* img.cols; h = Yolo::INPUT_H; x = (Yolo::INPUT_W - w) / 2; y = 0; } cv::Mat re(h, w, CV_8UC3); cv::resize(img, re, re.size()); cv::Mat out(Yolo::INPUT_H, Yolo::INPUT_W, CV_8UC3, cv::Scalar(128, 128, 128)); re.copyTo(out(cv::Rect(x, y, re.cols, re.rows))); return out; } cv::Rect get_rect(cv::Mat& img, float bbox[4]) { int l, r, t, b; float r_w = Yolo::INPUT_W / (img.cols * 1.0); float r_h = Yolo::INPUT_H / (img.rows * 1.0); if (r_h > r_w) { l = bbox[0] - bbox[2]/2.f; r = bbox[0] + bbox[2]/2.f; t = bbox[1] - bbox[3]/2.f - (Yolo::INPUT_H - r_w * img.rows) / 2; b = bbox[1] + bbox[3]/2.f - (Yolo::INPUT_H - r_w * img.rows) / 2; l = l / r_w; r = r / r_w; t = t / r_w; b = b / r_w; } else { l = bbox[0] - bbox[2]/2.f - (Yolo::INPUT_W - r_h * img.cols) / 2; r = bbox[0] + bbox[2]/2.f - (Yolo::INPUT_W - r_h * img.cols) / 2; t = bbox[1] - bbox[3]/2.f; b = bbox[1] + bbox[3]/2.f; l = l / r_h; r = r / r_h; t = t / r_h; b = b / r_h; } return cv::Rect(l, t, r-l, b-t); } float iou(float lbox[4], float rbox[4]) { float interBox[] = { std::max(lbox[0] - lbox[2]/2.f , rbox[0] - rbox[2]/2.f), //left std::min(lbox[0] + lbox[2]/2.f , rbox[0] + rbox[2]/2.f), //right std::max(lbox[1] - lbox[3]/2.f , rbox[1] - rbox[3]/2.f), //top std::min(lbox[1] + lbox[3]/2.f , rbox[1] + rbox[3]/2.f), //bottom }; if(interBox[2] > interBox[3] || interBox[0] > interBox[1]) return 0.0f; float interBoxS 
=(interBox[1]-interBox[0])*(interBox[3]-interBox[2]); return interBoxS/(lbox[2]*lbox[3] + rbox[2]*rbox[3] -interBoxS); } bool cmp(const Yolo::Detection& a, const Yolo::Detection& b) { return a.det_confidence > b.det_confidence; } void nms(std::vector& res, float *output, float conf_thresh, float nms_thresh = 0.5) { int det_size = sizeof(Yolo::Detection) / sizeof(float); std::map> m; for (int i = 0; i < output[0] && i < Yolo::MAX_OUTPUT_BBOX_COUNT; i++) { if (output[1 + det_size * i + 4] <= conf_thresh) continue; Yolo::Detection det; memcpy(&det, &output[1 + det_size * i], det_size * sizeof(float)); if (m.count(det.class_id) == 0) m.emplace(det.class_id, std::vector()); m[det.class_id].push_back(det); } for (auto it = m.begin(); it != m.end(); it++) { //std::cout << it->second[0].class_id << " --- " << std::endl; auto& dets = it->second; std::sort(dets.begin(), dets.end(), cmp); for (size_t m = 0; m < dets.size(); ++m) { auto& item = dets[m]; res.push_back(item); for (size_t n = m + 1; n < dets.size(); ++n) { if (iou(item.bbox, dets[n].bbox) > nms_thresh) { dets.erase(dets.begin()+n); --n; } } } } } // TensorRT weight files have a simple space delimited format: // [type] [size] std::map loadWeights(const std::string file) { std::cout << "Loading weights: " << file << std::endl; std::map weightMap; // Open weights file std::ifstream input(file); assert(input.is_open() && "Unable to load weight file."); // Read number of weight blobs int32_t count; input >> count; assert(count > 0 && "Invalid weight map file."); while (count--) { Weights wt{DataType::kFLOAT, nullptr, 0}; uint32_t size; // Read name and type of blob std::string name; input >> name >> std::dec >> size; wt.type = DataType::kFLOAT; // Load blob uint32_t* val = reinterpret_cast(malloc(sizeof(val) * size)); for (uint32_t x = 0, y = size; x < y; ++x) { input >> std::hex >> val[x]; } wt.values = val; wt.count = size; weightMap[name] = wt; } return weightMap; } IScaleLayer* addBatchNorm2d(INetworkDefinition 
*network, std::map& weightMap, ITensor& input, std::string lname, float eps) { float *gamma = (float*)weightMap[lname + ".weight"].values; float *beta = (float*)weightMap[lname + ".bias"].values; float *mean = (float*)weightMap[lname + ".running_mean"].values; float *var = (float*)weightMap[lname + ".running_var"].values; int len = weightMap[lname + ".running_var"].count; float *scval = reinterpret_cast(malloc(sizeof(float) * len)); for (int i = 0; i < len; i++) { scval[i] = gamma[i] / sqrt(var[i] + eps); } Weights scale{DataType::kFLOAT, scval, len}; float *shval = reinterpret_cast(malloc(sizeof(float) * len)); for (int i = 0; i < len; i++) { shval[i] = beta[i] - mean[i] * gamma[i] / sqrt(var[i] + eps); } Weights shift{DataType::kFLOAT, shval, len}; float *pval = reinterpret_cast(malloc(sizeof(float) * len)); for (int i = 0; i < len; i++) { pval[i] = 1.0; } Weights power{DataType::kFLOAT, pval, len}; weightMap[lname + ".scale"] = scale; weightMap[lname + ".shift"] = shift; weightMap[lname + ".power"] = power; IScaleLayer* scale_1 = network->addScale(input, ScaleMode::kCHANNEL, shift, scale, power); assert(scale_1); return scale_1; } ILayer* convBnMish(INetworkDefinition *network, std::map& weightMap, ITensor& input, int outch, int ksize, int s, int p, int linx) { Weights emptywts{DataType::kFLOAT, nullptr, 0}; IConvolutionLayer* conv1 = network->addConvolutionNd(input, outch, DimsHW{ksize, ksize}, weightMap["module_list." + std::to_string(linx) + ".Conv2d.weight"], emptywts); assert(conv1); conv1->setStrideNd(DimsHW{s, s}); conv1->setPaddingNd(DimsHW{p, p}); IScaleLayer* bn1 = addBatchNorm2d(network, weightMap, *conv1->getOutput(0), "module_list." 
+ std::to_string(linx) + ".BatchNorm2d", 1e-4); auto creator = getPluginRegistry()->getPluginCreator("Mish_TRT", "1"); const PluginFieldCollection* pluginData = creator->getFieldNames(); IPluginV2 *pluginObj = creator->createPlugin(("mish" + std::to_string(linx)).c_str(), pluginData); ITensor* inputTensors[] = {bn1->getOutput(0)}; auto mish = network->addPluginV2(&inputTensors[0], 1, *pluginObj); return mish; } ================================================ FILE: scaled-yolov4/gen_wts.py ================================================ import struct import sys from models.models import * from utils import * model = Darknet('models/yolov4-csp.cfg', (512, 512)) weights = sys.argv[1] device = torch_utils.select_device('0') if weights.endswith('.pt'): # pytorch format model.load_state_dict(torch.load(weights, map_location=device)['model']) else: # darknet format load_darknet_weights(model, weights) with open('yolov4_csp.wts', 'w') as f: f.write('{}\n'.format(len(model.state_dict().keys()))) for k, v in model.state_dict().items(): vr = v.reshape(-1).cpu().numpy() f.write('{} {} '.format(k, len(vr))) for vv in vr: f.write(' ') f.write(struct.pack('>f',float(vv)).hex()) f.write('\n') ================================================ FILE: scaled-yolov4/logging.h ================================================ /* * Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
*/ #ifndef TENSORRT_LOGGING_H #define TENSORRT_LOGGING_H #include "NvInferRuntimeCommon.h" #include #include #include #include #include #include #include using Severity = nvinfer1::ILogger::Severity; class LogStreamConsumerBuffer : public std::stringbuf { public: LogStreamConsumerBuffer(std::ostream& stream, const std::string& prefix, bool shouldLog) : mOutput(stream) , mPrefix(prefix) , mShouldLog(shouldLog) { } LogStreamConsumerBuffer(LogStreamConsumerBuffer&& other) : mOutput(other.mOutput) , mPrefix(other.mPrefix) , mShouldLog(other.mShouldLog) { } ~LogStreamConsumerBuffer() { // std::streambuf::pbase() gives a pointer to the beginning of the buffered part of the output sequence // std::streambuf::pptr() gives a pointer to the current position of the output sequence // if the pointer to the beginning is not equal to the pointer to the current position, // call putOutput() to log the output to the stream if (pbase() != pptr()) { putOutput(); } } // synchronizes the stream buffer and returns 0 on success // synchronizing the stream buffer consists of inserting the buffer contents into the stream, // resetting the buffer and flushing the stream virtual int sync() { putOutput(); return 0; } void putOutput() { if (mShouldLog) { // prepend timestamp std::time_t timestamp = std::time(nullptr); tm* tm_local = std::localtime(×tamp); std::cout << "["; std::cout << std::setw(2) << std::setfill('0') << 1 + tm_local->tm_mon << "/"; std::cout << std::setw(2) << std::setfill('0') << tm_local->tm_mday << "/"; std::cout << std::setw(4) << std::setfill('0') << 1900 + tm_local->tm_year << "-"; std::cout << std::setw(2) << std::setfill('0') << tm_local->tm_hour << ":"; std::cout << std::setw(2) << std::setfill('0') << tm_local->tm_min << ":"; std::cout << std::setw(2) << std::setfill('0') << tm_local->tm_sec << "] "; // std::stringbuf::str() gets the string contents of the buffer // insert the buffer contents pre-appended by the appropriate prefix into the stream mOutput << 
mPrefix << str(); // set the buffer to empty str(""); // flush the stream mOutput.flush(); } } void setShouldLog(bool shouldLog) { mShouldLog = shouldLog; } private: std::ostream& mOutput; std::string mPrefix; bool mShouldLog; }; //! //! \class LogStreamConsumerBase //! \brief Convenience object used to initialize LogStreamConsumerBuffer before std::ostream in LogStreamConsumer //! class LogStreamConsumerBase { public: LogStreamConsumerBase(std::ostream& stream, const std::string& prefix, bool shouldLog) : mBuffer(stream, prefix, shouldLog) { } protected: LogStreamConsumerBuffer mBuffer; }; //! //! \class LogStreamConsumer //! \brief Convenience object used to facilitate use of C++ stream syntax when logging messages. //! Order of base classes is LogStreamConsumerBase and then std::ostream. //! This is because the LogStreamConsumerBase class is used to initialize the LogStreamConsumerBuffer member field //! in LogStreamConsumer and then the address of the buffer is passed to std::ostream. //! This is necessary to prevent the address of an uninitialized buffer from being passed to std::ostream. //! Please do not change the order of the parent classes. //! class LogStreamConsumer : protected LogStreamConsumerBase, public std::ostream { public: //! \brief Creates a LogStreamConsumer which logs messages with level severity. //! Reportable severity determines if the messages are severe enough to be logged. 
LogStreamConsumer(Severity reportableSeverity, Severity severity) : LogStreamConsumerBase(severityOstream(severity), severityPrefix(severity), severity <= reportableSeverity) , std::ostream(&mBuffer) // links the stream buffer with the stream , mShouldLog(severity <= reportableSeverity) , mSeverity(severity) { } LogStreamConsumer(LogStreamConsumer&& other) : LogStreamConsumerBase(severityOstream(other.mSeverity), severityPrefix(other.mSeverity), other.mShouldLog) , std::ostream(&mBuffer) // links the stream buffer with the stream , mShouldLog(other.mShouldLog) , mSeverity(other.mSeverity) { } void setReportableSeverity(Severity reportableSeverity) { mShouldLog = mSeverity <= reportableSeverity; mBuffer.setShouldLog(mShouldLog); } private: static std::ostream& severityOstream(Severity severity) { return severity >= Severity::kINFO ? std::cout : std::cerr; } static std::string severityPrefix(Severity severity) { switch (severity) { case Severity::kINTERNAL_ERROR: return "[F] "; case Severity::kERROR: return "[E] "; case Severity::kWARNING: return "[W] "; case Severity::kINFO: return "[I] "; case Severity::kVERBOSE: return "[V] "; default: assert(0); return ""; } } bool mShouldLog; Severity mSeverity; }; //! \class Logger //! //! \brief Class which manages logging of TensorRT tools and samples //! //! \details This class provides a common interface for TensorRT tools and samples to log information to the console, //! and supports logging two types of messages: //! //! - Debugging messages with an associated severity (info, warning, error, or internal error/fatal) //! - Test pass/fail messages //! //! The advantage of having all samples use this class for logging as opposed to emitting directly to stdout/stderr is //! that the logic for controlling the verbosity and formatting of sample output is centralized in one location. //! //! In the future, this class could be extended to support dumping test results to a file in some standard format //! 
(for example, JUnit XML), and providing additional metadata (e.g. timing the duration of a test run). //! //! TODO: For backwards compatibility with existing samples, this class inherits directly from the nvinfer1::ILogger //! interface, which is problematic since there isn't a clean separation between messages coming from the TensorRT //! library and messages coming from the sample. //! //! In the future (once all samples are updated to use Logger::getTRTLogger() to access the ILogger) we can refactor the //! class to eliminate the inheritance and instead make the nvinfer1::ILogger implementation a member of the Logger //! object. class Logger : public nvinfer1::ILogger { public: Logger(Severity severity = Severity::kWARNING) : mReportableSeverity(severity) { } //! //! \enum TestResult //! \brief Represents the state of a given test //! enum class TestResult { kRUNNING, //!< The test is running kPASSED, //!< The test passed kFAILED, //!< The test failed kWAIVED //!< The test was waived }; //! //! \brief Forward-compatible method for retrieving the nvinfer::ILogger associated with this Logger //! \return The nvinfer1::ILogger associated with this Logger //! //! TODO Once all samples are updated to use this method to register the logger with TensorRT, //! we can eliminate the inheritance of Logger from ILogger //! nvinfer1::ILogger& getTRTLogger() { return *this; } //! //! \brief Implementation of the nvinfer1::ILogger::log() virtual method //! //! Note samples should not be calling this function directly; it will eventually go away once we eliminate the //! inheritance from nvinfer1::ILogger //! void log(Severity severity, const char* msg) override { LogStreamConsumer(mReportableSeverity, severity) << "[TRT] " << std::string(msg) << std::endl; } //! //! \brief Method for controlling the verbosity of logging output //! //! \param severity The logger will only emit messages that have severity of this level or higher. //! 
void setReportableSeverity(Severity severity) { mReportableSeverity = severity; } //! //! \brief Opaque handle that holds logging information for a particular test //! //! This object is an opaque handle to information used by the Logger to print test results. //! The sample must call Logger::defineTest() in order to obtain a TestAtom that can be used //! with Logger::reportTest{Start,End}(). //! class TestAtom { public: TestAtom(TestAtom&&) = default; private: friend class Logger; TestAtom(bool started, const std::string& name, const std::string& cmdline) : mStarted(started) , mName(name) , mCmdline(cmdline) { } bool mStarted; std::string mName; std::string mCmdline; }; //! //! \brief Define a test for logging //! //! \param[in] name The name of the test. This should be a string starting with //! "TensorRT" and containing dot-separated strings containing //! the characters [A-Za-z0-9_]. //! For example, "TensorRT.sample_googlenet" //! \param[in] cmdline The command line used to reproduce the test // //! \return a TestAtom that can be used in Logger::reportTest{Start,End}(). //! static TestAtom defineTest(const std::string& name, const std::string& cmdline) { return TestAtom(false, name, cmdline); } //! //! \brief A convenience overloaded version of defineTest() that accepts an array of command-line arguments //! as input //! //! \param[in] name The name of the test //! \param[in] argc The number of command-line arguments //! \param[in] argv The array of command-line arguments (given as C strings) //! //! \return a TestAtom that can be used in Logger::reportTest{Start,End}(). static TestAtom defineTest(const std::string& name, int argc, char const* const* argv) { auto cmdline = genCmdlineString(argc, argv); return defineTest(name, cmdline); } //! //! \brief Report that a test has started. //! //! \pre reportTestStart() has not been called yet for the given testAtom //! //! \param[in] testAtom The handle to the test that has started //! 
static void reportTestStart(TestAtom& testAtom) { reportTestResult(testAtom, TestResult::kRUNNING); assert(!testAtom.mStarted); testAtom.mStarted = true; } //! //! \brief Report that a test has ended. //! //! \pre reportTestStart() has been called for the given testAtom //! //! \param[in] testAtom The handle to the test that has ended //! \param[in] result The result of the test. Should be one of TestResult::kPASSED, //! TestResult::kFAILED, TestResult::kWAIVED //! static void reportTestEnd(const TestAtom& testAtom, TestResult result) { assert(result != TestResult::kRUNNING); assert(testAtom.mStarted); reportTestResult(testAtom, result); } static int reportPass(const TestAtom& testAtom) { reportTestEnd(testAtom, TestResult::kPASSED); return EXIT_SUCCESS; } static int reportFail(const TestAtom& testAtom) { reportTestEnd(testAtom, TestResult::kFAILED); return EXIT_FAILURE; } static int reportWaive(const TestAtom& testAtom) { reportTestEnd(testAtom, TestResult::kWAIVED); return EXIT_SUCCESS; } static int reportTest(const TestAtom& testAtom, bool pass) { return pass ? reportPass(testAtom) : reportFail(testAtom); } Severity getReportableSeverity() const { return mReportableSeverity; } private: //! //! \brief returns an appropriate string for prefixing a log message with the given severity //! static const char* severityPrefix(Severity severity) { switch (severity) { case Severity::kINTERNAL_ERROR: return "[F] "; case Severity::kERROR: return "[E] "; case Severity::kWARNING: return "[W] "; case Severity::kINFO: return "[I] "; case Severity::kVERBOSE: return "[V] "; default: assert(0); return ""; } } //! //! \brief returns an appropriate string for prefixing a test result message with the given result //! 
static const char* testResultString(TestResult result) { switch (result) { case TestResult::kRUNNING: return "RUNNING"; case TestResult::kPASSED: return "PASSED"; case TestResult::kFAILED: return "FAILED"; case TestResult::kWAIVED: return "WAIVED"; default: assert(0); return ""; } } //! //! \brief returns an appropriate output stream (cout or cerr) to use with the given severity //! static std::ostream& severityOstream(Severity severity) { return severity >= Severity::kINFO ? std::cout : std::cerr; } //! //! \brief method that implements logging test results //! static void reportTestResult(const TestAtom& testAtom, TestResult result) { severityOstream(Severity::kINFO) << "&&&& " << testResultString(result) << " " << testAtom.mName << " # " << testAtom.mCmdline << std::endl; } //! //! \brief generate a command line string from the given (argc, argv) values //! static std::string genCmdlineString(int argc, char const* const* argv) { std::stringstream ss; for (int i = 0; i < argc; i++) { if (i > 0) { ss << " "; } ss << argv[i]; } return ss.str(); } Severity mReportableSeverity; }; namespace { //! //! \brief produces a LogStreamConsumer object that can be used to log messages of severity kVERBOSE //! //! Example usage: //! //! LOG_VERBOSE(logger) << "hello world" << std::endl; //! inline LogStreamConsumer LOG_VERBOSE(const Logger& logger) { return LogStreamConsumer(logger.getReportableSeverity(), Severity::kVERBOSE); } //! //! \brief produces a LogStreamConsumer object that can be used to log messages of severity kINFO //! //! Example usage: //! //! LOG_INFO(logger) << "hello world" << std::endl; //! inline LogStreamConsumer LOG_INFO(const Logger& logger) { return LogStreamConsumer(logger.getReportableSeverity(), Severity::kINFO); } //! //! \brief produces a LogStreamConsumer object that can be used to log messages of severity kWARNING //! //! Example usage: //! //! LOG_WARN(logger) << "hello world" << std::endl; //! 
inline LogStreamConsumer LOG_WARN(const Logger& logger) { return LogStreamConsumer(logger.getReportableSeverity(), Severity::kWARNING); } //! //! \brief produces a LogStreamConsumer object that can be used to log messages of severity kERROR //! //! Example usage: //! //! LOG_ERROR(logger) << "hello world" << std::endl; //! inline LogStreamConsumer LOG_ERROR(const Logger& logger) { return LogStreamConsumer(logger.getReportableSeverity(), Severity::kERROR); } //! //! \brief produces a LogStreamConsumer object that can be used to log messages of severity kINTERNAL_ERROR //! ("fatal" severity) //! //! Example usage: //! //! LOG_FATAL(logger) << "hello world" << std::endl; //! inline LogStreamConsumer LOG_FATAL(const Logger& logger) { return LogStreamConsumer(logger.getReportableSeverity(), Severity::kINTERNAL_ERROR); } } // anonymous namespace #endif // TENSORRT_LOGGING_H ================================================ FILE: scaled-yolov4/mish.cu ================================================ #include #include #include #include #include "mish.h" namespace nvinfer1 { MishPlugin::MishPlugin() { } MishPlugin::~MishPlugin() { } // create the plugin at runtime from a byte stream MishPlugin::MishPlugin(const void* data, size_t length) { assert(length == sizeof(input_size_)); input_size_ = *reinterpret_cast(data); } void MishPlugin::serialize(void* buffer) const { *reinterpret_cast(buffer) = input_size_; } size_t MishPlugin::getSerializationSize() const { return sizeof(input_size_); } int MishPlugin::initialize() { return 0; } Dims MishPlugin::getOutputDimensions(int index, const Dims* inputs, int nbInputDims) { assert(nbInputDims == 1); assert(index == 0); input_size_ = inputs[0].d[0] * inputs[0].d[1] * inputs[0].d[2]; // Output dimensions return Dims3(inputs[0].d[0], inputs[0].d[1], inputs[0].d[2]); } // Set plugin namespace void MishPlugin::setPluginNamespace(const char* pluginNamespace) { mPluginNamespace = pluginNamespace; } const char* 
MishPlugin::getPluginNamespace() const { return mPluginNamespace; } // Return the DataType of the plugin output at the requested index DataType MishPlugin::getOutputDataType(int index, const nvinfer1::DataType* inputTypes, int nbInputs) const { return DataType::kFLOAT; } // Return true if output tensor is broadcast across a batch. bool MishPlugin::isOutputBroadcastAcrossBatch(int outputIndex, const bool* inputIsBroadcasted, int nbInputs) const { return false; } // Return true if plugin can use input that is broadcast across batch without replication. bool MishPlugin::canBroadcastInputAcrossBatch(int inputIndex) const { return false; } void MishPlugin::configurePlugin(const PluginTensorDesc* in, int nbInput, const PluginTensorDesc* out, int nbOutput) { } // Attach the plugin object to an execution context and grant the plugin the access to some context resource. void MishPlugin::attachToContext(cudnnContext* cudnnContext, cublasContext* cublasContext, IGpuAllocator* gpuAllocator) { } // Detach the plugin object from its execution context. 
void MishPlugin::detachFromContext() {} const char* MishPlugin::getPluginType() const { return "Mish_TRT"; } const char* MishPlugin::getPluginVersion() const { return "1"; } void MishPlugin::destroy() { delete this; } // Clone the plugin IPluginV2IOExt* MishPlugin::clone() const { MishPlugin *p = new MishPlugin(); p->input_size_ = input_size_; p->setPluginNamespace(mPluginNamespace); return p; } __device__ float tanh_activate_kernel(float x){return (2/(1 + expf(-2*x)) - 1);} __device__ float softplus_kernel(float x, float threshold = 20) { if (x > threshold) return x; // too large else if (x < -threshold) return expf(x); // too small return logf(expf(x) + 1); } __global__ void mish_kernel(const float *input, float *output, int num_elem) { int idx = threadIdx.x + blockDim.x * blockIdx.x; if (idx >= num_elem) return; //float t = exp(input[idx]); //if (input[idx] > 20.0) { // t *= t; // output[idx] = (t - 1.0) / (t + 1.0); //} else { // float tt = t * t; // output[idx] = (tt + 2.0 * t) / (tt + 2.0 * t + 2.0); //} //output[idx] *= input[idx]; output[idx] = input[idx] * tanh_activate_kernel(softplus_kernel(input[idx])); } void MishPlugin::forwardGpu(const float *const * inputs, float* output, cudaStream_t stream, int batchSize) { int block_size = thread_count_; int grid_size = (input_size_ * batchSize + block_size - 1) / block_size; mish_kernel<<>>(inputs[0], output, input_size_ * batchSize); } int MishPlugin::enqueue(int batchSize, const void*const * inputs, void** outputs, void* workspace, cudaStream_t stream) { //assert(batchSize == 1); //GPU //CUDA_CHECK(cudaStreamSynchronize(stream)); forwardGpu((const float *const *)inputs, (float*)outputs[0], stream, batchSize); return 0; } PluginFieldCollection MishPluginCreator::mFC{}; std::vector MishPluginCreator::mPluginAttributes; MishPluginCreator::MishPluginCreator() { mPluginAttributes.clear(); mFC.nbFields = mPluginAttributes.size(); mFC.fields = mPluginAttributes.data(); } const char* MishPluginCreator::getPluginName() 
const { return "Mish_TRT"; } const char* MishPluginCreator::getPluginVersion() const { return "1"; } const PluginFieldCollection* MishPluginCreator::getFieldNames() { return &mFC; } IPluginV2IOExt* MishPluginCreator::createPlugin(const char* name, const PluginFieldCollection* fc) { MishPlugin* obj = new MishPlugin(); obj->setPluginNamespace(mNamespace.c_str()); return obj; } IPluginV2IOExt* MishPluginCreator::deserializePlugin(const char* name, const void* serialData, size_t serialLength) { // This object will be deleted when the network is destroyed, which will // call MishPlugin::destroy() MishPlugin* obj = new MishPlugin(serialData, serialLength); obj->setPluginNamespace(mNamespace.c_str()); return obj; } } ================================================ FILE: scaled-yolov4/mish.h ================================================ #ifndef TRTX_MISH_PLUGIN_H #define TRTX_MISH_PLUGIN_H #include #include #include "NvInfer.h" namespace nvinfer1 { class MishPlugin: public IPluginV2IOExt { public: explicit MishPlugin(); MishPlugin(const void* data, size_t length); ~MishPlugin(); int getNbOutputs() const override { return 1; } Dims getOutputDimensions(int index, const Dims* inputs, int nbInputDims) override; int initialize() override; virtual void terminate() override {}; virtual size_t getWorkspaceSize(int maxBatchSize) const override { return 0;} virtual int enqueue(int batchSize, const void*const * inputs, void** outputs, void* workspace, cudaStream_t stream) override; virtual size_t getSerializationSize() const override; virtual void serialize(void* buffer) const override; bool supportsFormatCombination(int pos, const PluginTensorDesc* inOut, int nbInputs, int nbOutputs) const override { return inOut[pos].format == TensorFormat::kLINEAR && inOut[pos].type == DataType::kFLOAT; } const char* getPluginType() const override; const char* getPluginVersion() const override; void destroy() override; IPluginV2IOExt* clone() const override; void setPluginNamespace(const char* 
pluginNamespace) override; const char* getPluginNamespace() const override; DataType getOutputDataType(int index, const nvinfer1::DataType* inputTypes, int nbInputs) const override; bool isOutputBroadcastAcrossBatch(int outputIndex, const bool* inputIsBroadcasted, int nbInputs) const override; bool canBroadcastInputAcrossBatch(int inputIndex) const override; void attachToContext( cudnnContext* cudnnContext, cublasContext* cublasContext, IGpuAllocator* gpuAllocator) override; void configurePlugin(const PluginTensorDesc* in, int nbInput, const PluginTensorDesc* out, int nbOutput) override; void detachFromContext() override; int input_size_; private: void forwardGpu(const float *const * inputs, float* output, cudaStream_t stream, int batchSize = 1); int thread_count_ = 256; const char* mPluginNamespace; }; class MishPluginCreator : public IPluginCreator { public: MishPluginCreator(); ~MishPluginCreator() override = default; const char* getPluginName() const override; const char* getPluginVersion() const override; const PluginFieldCollection* getFieldNames() override; IPluginV2IOExt* createPlugin(const char* name, const PluginFieldCollection* fc) override; IPluginV2IOExt* deserializePlugin(const char* name, const void* serialData, size_t serialLength) override; void setPluginNamespace(const char* libNamespace) override { mNamespace = libNamespace; } const char* getPluginNamespace() const override { return mNamespace.c_str(); } private: std::string mNamespace; static PluginFieldCollection mFC; static std::vector mPluginAttributes; }; REGISTER_TENSORRT_PLUGIN(MishPluginCreator); }; #endif // TRTX_MISH_PLUGIN_H ================================================ FILE: scaled-yolov4/utils.h ================================================ #ifndef __TRT_UTILS_H_ #define __TRT_UTILS_H_ #include #include #include #include #ifndef CUDA_CHECK #define CUDA_CHECK(callstr) \ { \ cudaError_t error_code = callstr; \ if (error_code != cudaSuccess) { \ std::cerr << "CUDA error " << 
error_code << " at " << __FILE__ << ":" << __LINE__; \ assert(0); \ } \ } #endif namespace Tn { template void write(char*& buffer, const T& val) { *reinterpret_cast(buffer) = val; buffer += sizeof(T); } template void read(const char*& buffer, T& val) { val = *reinterpret_cast(buffer); buffer += sizeof(T); } } #endif ================================================ FILE: scaled-yolov4/yololayer.cu ================================================ #include #include "yololayer.h" #include "utils.h" using namespace Yolo; namespace nvinfer1 { YoloLayerPlugin::YoloLayerPlugin() { mClassCount = CLASS_NUM; mYoloKernel.clear(); mYoloKernel.push_back(yolo1); mYoloKernel.push_back(yolo2); mYoloKernel.push_back(yolo3); mKernelCount = mYoloKernel.size(); CUDA_CHECK(cudaMallocHost(&mAnchor, mKernelCount * sizeof(void*))); size_t AnchorLen = sizeof(float)* CHECK_COUNT*2; for(int ii = 0; ii < mKernelCount; ii ++) { CUDA_CHECK(cudaMalloc(&mAnchor[ii],AnchorLen)); const auto& yolo = mYoloKernel[ii]; CUDA_CHECK(cudaMemcpy(mAnchor[ii], yolo.anchors, AnchorLen, cudaMemcpyHostToDevice)); } } YoloLayerPlugin::~YoloLayerPlugin() { } // create the plugin at runtime from a byte stream YoloLayerPlugin::YoloLayerPlugin(const void* data, size_t length) { using namespace Tn; const char *d = reinterpret_cast(data), *a = d; read(d, mClassCount); read(d, mThreadCount); read(d, mKernelCount); mYoloKernel.resize(mKernelCount); auto kernelSize = mKernelCount*sizeof(YoloKernel); memcpy(mYoloKernel.data(),d,kernelSize); d += kernelSize; CUDA_CHECK(cudaMallocHost(&mAnchor, mKernelCount * sizeof(void*))); size_t AnchorLen = sizeof(float)* CHECK_COUNT*2; for(int ii = 0; ii < mKernelCount; ii ++) { CUDA_CHECK(cudaMalloc(&mAnchor[ii],AnchorLen)); const auto& yolo = mYoloKernel[ii]; CUDA_CHECK(cudaMemcpy(mAnchor[ii], yolo.anchors, AnchorLen, cudaMemcpyHostToDevice)); } assert(d == a + length); } void YoloLayerPlugin::serialize(void* buffer) const { using namespace Tn; char* d = static_cast(buffer), *a = d; 
write(d, mClassCount); write(d, mThreadCount); write(d, mKernelCount); auto kernelSize = mKernelCount*sizeof(YoloKernel); memcpy(d,mYoloKernel.data(),kernelSize); d += kernelSize; assert(d == a + getSerializationSize()); } size_t YoloLayerPlugin::getSerializationSize() const { return sizeof(mClassCount) + sizeof(mThreadCount) + sizeof(mKernelCount) + sizeof(Yolo::YoloKernel) * mYoloKernel.size(); } int YoloLayerPlugin::initialize() { return 0; } Dims YoloLayerPlugin::getOutputDimensions(int index, const Dims* inputs, int nbInputDims) { //output the result to channel int totalsize = MAX_OUTPUT_BBOX_COUNT * sizeof(Detection) / sizeof(float); return Dims3(totalsize + 1, 1, 1); } // Set plugin namespace void YoloLayerPlugin::setPluginNamespace(const char* pluginNamespace) { mPluginNamespace = pluginNamespace; } const char* YoloLayerPlugin::getPluginNamespace() const { return mPluginNamespace; } // Return the DataType of the plugin output at the requested index DataType YoloLayerPlugin::getOutputDataType(int index, const nvinfer1::DataType* inputTypes, int nbInputs) const { return DataType::kFLOAT; } // Return true if output tensor is broadcast across a batch. bool YoloLayerPlugin::isOutputBroadcastAcrossBatch(int outputIndex, const bool* inputIsBroadcasted, int nbInputs) const { return false; } // Return true if plugin can use input that is broadcast across batch without replication. bool YoloLayerPlugin::canBroadcastInputAcrossBatch(int inputIndex) const { return false; } void YoloLayerPlugin::configurePlugin(const PluginTensorDesc* in, int nbInput, const PluginTensorDesc* out, int nbOutput) { } // Attach the plugin object to an execution context and grant the plugin the access to some context resource. void YoloLayerPlugin::attachToContext(cudnnContext* cudnnContext, cublasContext* cublasContext, IGpuAllocator* gpuAllocator) { } // Detach the plugin object from its execution context. 
void YoloLayerPlugin::detachFromContext() {} const char* YoloLayerPlugin::getPluginType() const { return "YoloLayer_TRT"; } const char* YoloLayerPlugin::getPluginVersion() const { return "1"; } void YoloLayerPlugin::destroy() { delete this; } // Clone the plugin IPluginV2IOExt* YoloLayerPlugin::clone() const { YoloLayerPlugin *p = new YoloLayerPlugin(); p->setPluginNamespace(mPluginNamespace); return p; } __device__ float Logist(float data){ return 1./(1. + exp(-data)); }; __global__ void CalDetection(const float *input, float *output, int noElements, int yoloWidth, int yoloHeight, const float anchors[CHECK_COUNT*2],int classes,int outputElem) { int idx = threadIdx.x + blockDim.x * blockIdx.x; if (idx >= noElements) return; int total_grid = yoloWidth * yoloHeight; int bnIdx = idx / total_grid; idx = idx - total_grid*bnIdx; int info_len_i = 5 + classes; const float* curInput = input + bnIdx * (info_len_i * total_grid * CHECK_COUNT); for (int k = 0; k < 3; ++k) { int class_id = 0; float max_cls_prob = 0.0; for (int i = 5; i < info_len_i; ++i) { float p = Logist(curInput[idx + k * info_len_i * total_grid + i * total_grid]); if (p > max_cls_prob) { max_cls_prob = p; class_id = i - 5; } } float box_prob = Logist(curInput[idx + k * info_len_i * total_grid + 4 * total_grid]); if (max_cls_prob < IGNORE_THRESH || box_prob < IGNORE_THRESH) continue; float *res_count = output + bnIdx*outputElem; int count = (int)atomicAdd(res_count, 1); if (count >= MAX_OUTPUT_BBOX_COUNT) return; char* data = (char * )res_count + sizeof(float) + count*sizeof(Detection); Detection* det = (Detection*)(data); int row = idx / yoloWidth; int col = idx % yoloWidth; //Location det->bbox[0] = (col + (2 * (Logist(curInput[idx + k * info_len_i * total_grid + 0 * total_grid]))) - 0.5) * INPUT_W / yoloWidth; det->bbox[1] = (row + (2 * (Logist(curInput[idx + k * info_len_i * total_grid + 1 * total_grid]))) - 0.5) * INPUT_H / yoloHeight; det->bbox[2] = (powf(2 * (Logist(curInput[idx + k * info_len_i * 
total_grid + 2 * total_grid])), 2)) * anchors[2*k]; det->bbox[3] = (powf(2 * (Logist(curInput[idx + k * info_len_i * total_grid + 3 * total_grid])), 2)) * anchors[2*k + 1]; det->det_confidence = box_prob; det->class_id = class_id; det->class_confidence = max_cls_prob; } } void YoloLayerPlugin::forwardGpu(const float *const * inputs, float* output, cudaStream_t stream, int batchSize) { int outputElem = 1 + MAX_OUTPUT_BBOX_COUNT * sizeof(Detection) / sizeof(float); for(int idx = 0 ; idx < batchSize; ++idx) { CUDA_CHECK(cudaMemset(output + idx*outputElem, 0, sizeof(float))); } int numElem = 0; for (unsigned int i = 0;i< mYoloKernel.size();++i) { const auto& yolo = mYoloKernel[i]; numElem = yolo.width*yolo.height*batchSize; if (numElem < mThreadCount) mThreadCount = numElem; CalDetection<<< (yolo.width*yolo.height*batchSize + mThreadCount - 1) / mThreadCount, mThreadCount>>> (inputs[i],output, numElem, yolo.width, yolo.height, (float *)mAnchor[i], mClassCount ,outputElem); } } int YoloLayerPlugin::enqueue(int batchSize, const void*const * inputs, void** outputs, void* workspace, cudaStream_t stream) { //assert(batchSize == 1); //GPU //CUDA_CHECK(cudaStreamSynchronize(stream)); forwardGpu((const float *const *)inputs, (float*)outputs[0], stream, batchSize); return 0; } PluginFieldCollection YoloPluginCreator::mFC{}; std::vector YoloPluginCreator::mPluginAttributes; YoloPluginCreator::YoloPluginCreator() { mPluginAttributes.clear(); mFC.nbFields = mPluginAttributes.size(); mFC.fields = mPluginAttributes.data(); } const char* YoloPluginCreator::getPluginName() const { return "YoloLayer_TRT"; } const char* YoloPluginCreator::getPluginVersion() const { return "1"; } const PluginFieldCollection* YoloPluginCreator::getFieldNames() { return &mFC; } IPluginV2IOExt* YoloPluginCreator::createPlugin(const char* name, const PluginFieldCollection* fc) { YoloLayerPlugin* obj = new YoloLayerPlugin(); obj->setPluginNamespace(mNamespace.c_str()); return obj; } IPluginV2IOExt* 
YoloPluginCreator::deserializePlugin(const char* name, const void* serialData, size_t serialLength) { // This object will be deleted when the network is destroyed, which will // call MishPlugin::destroy() YoloLayerPlugin* obj = new YoloLayerPlugin(serialData, serialLength); obj->setPluginNamespace(mNamespace.c_str()); return obj; } } ================================================ FILE: scaled-yolov4/yololayer.h ================================================ #ifndef _YOLO_LAYER_H #define _YOLO_LAYER_H #include #include #include "NvInfer.h" namespace Yolo { static constexpr int CHECK_COUNT = 3; static constexpr float IGNORE_THRESH = 0.1f; static constexpr int MAX_OUTPUT_BBOX_COUNT = 1000; static constexpr int CLASS_NUM = 80; static constexpr int INPUT_H = 512; static constexpr int INPUT_W = 512; struct YoloKernel { int width; int height; float anchors[CHECK_COUNT*2]; }; static constexpr YoloKernel yolo1 = { INPUT_W / 8, INPUT_H / 8, {12,16, 19,36, 40,28} }; static constexpr YoloKernel yolo2 = { INPUT_W / 16, INPUT_H / 16, {36,75, 76,55, 72,146} }; static constexpr YoloKernel yolo3 = { INPUT_W / 32, INPUT_H / 32, {142,110, 192,243, 459,401} }; static constexpr int LOCATIONS = 4; struct alignas(float) Detection{ //x y w h float bbox[LOCATIONS]; float det_confidence; float class_id; float class_confidence; }; } namespace nvinfer1 { class YoloLayerPlugin: public IPluginV2IOExt { public: explicit YoloLayerPlugin(); YoloLayerPlugin(const void* data, size_t length); ~YoloLayerPlugin(); int getNbOutputs() const override { return 1; } Dims getOutputDimensions(int index, const Dims* inputs, int nbInputDims) override; int initialize() override; virtual void terminate() override {}; virtual size_t getWorkspaceSize(int maxBatchSize) const override { return 0;} virtual int enqueue(int batchSize, const void*const * inputs, void** outputs, void* workspace, cudaStream_t stream) override; virtual size_t getSerializationSize() const override; virtual void serialize(void* buffer) 
const override; bool supportsFormatCombination(int pos, const PluginTensorDesc* inOut, int nbInputs, int nbOutputs) const override { return inOut[pos].format == TensorFormat::kLINEAR && inOut[pos].type == DataType::kFLOAT; } const char* getPluginType() const override; const char* getPluginVersion() const override; void destroy() override; IPluginV2IOExt* clone() const override; void setPluginNamespace(const char* pluginNamespace) override; const char* getPluginNamespace() const override; DataType getOutputDataType(int index, const nvinfer1::DataType* inputTypes, int nbInputs) const override; bool isOutputBroadcastAcrossBatch(int outputIndex, const bool* inputIsBroadcasted, int nbInputs) const override; bool canBroadcastInputAcrossBatch(int inputIndex) const override; void attachToContext( cudnnContext* cudnnContext, cublasContext* cublasContext, IGpuAllocator* gpuAllocator) override; void configurePlugin(const PluginTensorDesc* in, int nbInput, const PluginTensorDesc* out, int nbOutput) override; void detachFromContext() override; private: void forwardGpu(const float *const * inputs,float * output, cudaStream_t stream, int batchSize = 1); int mClassCount; int mKernelCount; std::vector mYoloKernel; int mThreadCount = 256; void** mAnchor; const char* mPluginNamespace; }; class YoloPluginCreator : public IPluginCreator { public: YoloPluginCreator(); ~YoloPluginCreator() override = default; const char* getPluginName() const override; const char* getPluginVersion() const override; const PluginFieldCollection* getFieldNames() override; IPluginV2IOExt* createPlugin(const char* name, const PluginFieldCollection* fc) override; IPluginV2IOExt* deserializePlugin(const char* name, const void* serialData, size_t serialLength) override; void setPluginNamespace(const char* libNamespace) override { mNamespace = libNamespace; } const char* getPluginNamespace() const override { return mNamespace.c_str(); } private: std::string mNamespace; static PluginFieldCollection mFC; static 
std::vector mPluginAttributes; }; REGISTER_TENSORRT_PLUGIN(YoloPluginCreator); }; #endif ================================================ FILE: scaled-yolov4/yolov4_csp.cpp ================================================ #include #include #include #include "logging.h" #include "utils.h" #include "cuda_runtime_api.h" #include "common.hpp" #define USE_FP16 // comment out this if want to use FP32 #define DEVICE 0 // GPU id #define NMS_THRESH 0.4 #define BBOX_CONF_THRESH 0.5 #define BATCH_SIZE 1 // stuff we know about the network and the input/output blobs static const int INPUT_H = Yolo::INPUT_H; static const int INPUT_W = Yolo::INPUT_W; static const int DETECTION_SIZE = sizeof(Yolo::Detection) / sizeof(float); static const int OUTPUT_SIZE = Yolo::MAX_OUTPUT_BBOX_COUNT * DETECTION_SIZE + 1; // we assume the yololayer outputs no more than MAX_OUTPUT_BBOX_COUNT boxes that conf >= 0.1 const char* INPUT_BLOB_NAME = "data"; const char* OUTPUT_BLOB_NAME = "prob"; static Logger gLogger; // Creat the engine using only the API and not any parser. 
ICudaEngine* createEngine(unsigned int maxBatchSize, IBuilder* builder, IBuilderConfig* config, DataType dt) { INetworkDefinition* network = builder -> createNetworkV2(0U); // Create input tensor of shape {3, INPUT_H, INPUT_W} with name INPUT_BLOB_NAME ITensor* data = network -> addInput(INPUT_BLOB_NAME, dt, Dims3{3, INPUT_H, INPUT_W}); assert(data); std::map weightMap = loadWeights("../yolov4_csp.wts"); Weights emptywts{DataType::kFLOAT, nullptr, 0}; // define yolov4 csp layers auto l0 = convBnMish(network, weightMap, *data, 32, 3, 1, 1, 0); auto l1 = convBnMish(network, weightMap, *l0 -> getOutput(0), 64, 3, 2, 1, 1); auto l2 = convBnMish(network, weightMap, *l1 -> getOutput(0), 32, 1, 1, 0, 2); auto l3 = convBnMish(network, weightMap, *l2 -> getOutput(0), 64, 3, 1, 1, 3); auto ew4 = network -> addElementWise(*l3 -> getOutput(0), *l1 -> getOutput(0), ElementWiseOperation::kSUM); auto l5 = convBnMish(network, weightMap, *ew4 -> getOutput(0), 128, 3, 2, 1, 5); auto l6 = convBnMish(network, weightMap, *l5 -> getOutput(0), 64, 1, 1, 0, 6); auto l7 = l5; auto l8 = convBnMish(network, weightMap, *l7 -> getOutput(0), 64, 1, 1, 0, 8); auto l9 = convBnMish(network, weightMap, *l8 -> getOutput(0), 64, 1, 1, 0, 9); auto l10 = convBnMish(network, weightMap, *l9 -> getOutput(0), 64, 3, 1, 1, 10); auto ew11 = network -> addElementWise(*l10 -> getOutput(0), *l8 -> getOutput(0), ElementWiseOperation::kSUM); auto l12 = convBnMish(network, weightMap, *ew11 -> getOutput(0), 64, 1, 1, 0, 12); auto l13 = convBnMish(network, weightMap, *l12 -> getOutput(0), 64, 3, 1, 1, 13); auto ew14 = network -> addElementWise(*l13 -> getOutput(0), *ew11 -> getOutput(0), ElementWiseOperation::kSUM); auto l15 = convBnMish(network, weightMap, *ew14 -> getOutput(0), 64, 1, 1, 0, 15); ITensor* inputTensors16[] = {l15 -> getOutput(0), l6 -> getOutput(0)}; auto cat16 = network -> addConcatenation(inputTensors16, 2); auto l17 = convBnMish(network, weightMap, *cat16 -> getOutput(0), 128, 1, 1, 0, 17); auto 
l18 = convBnMish(network, weightMap, *l17 -> getOutput(0), 256, 3, 2, 1, 18); auto l19 = convBnMish(network, weightMap, *l18 -> getOutput(0), 128, 1, 1, 0, 19); auto l20 = l18; auto l21 = convBnMish(network, weightMap, *l20 -> getOutput(0), 128, 1, 1, 0, 21); auto l22 = convBnMish(network, weightMap, *l21 -> getOutput(0), 128, 1, 1, 0, 22); auto l23 = convBnMish(network, weightMap, *l22 -> getOutput(0), 128, 3, 1, 1, 23); auto ew24 = network -> addElementWise(*l23 -> getOutput(0), *l21 -> getOutput(0), ElementWiseOperation::kSUM); auto l25 = convBnMish(network, weightMap, *ew24 -> getOutput(0), 128, 1, 1, 0, 25); auto l26 = convBnMish(network, weightMap, *l25 -> getOutput(0), 128, 3, 1, 1, 26); auto ew27 = network -> addElementWise(*l26 -> getOutput(0), *ew24 -> getOutput(0), ElementWiseOperation::kSUM); auto l28 = convBnMish(network, weightMap, *ew27 -> getOutput(0), 128, 1, 1, 0, 28); auto l29 = convBnMish(network, weightMap, *l28 -> getOutput(0), 128, 3, 1, 1, 29); auto ew30 = network -> addElementWise(*l29 -> getOutput(0), *ew27 -> getOutput(0), ElementWiseOperation::kSUM); auto l31 = convBnMish(network, weightMap, *ew30 -> getOutput(0), 128, 1, 1, 0, 31); auto l32 = convBnMish(network, weightMap, *l31 -> getOutput(0), 128, 3, 1, 1, 32); auto ew33 = network -> addElementWise(*l32 -> getOutput(0), *ew30 -> getOutput(0), ElementWiseOperation::kSUM); auto l34 = convBnMish(network, weightMap, *ew33 -> getOutput(0), 128, 1, 1, 0, 34); auto l35 = convBnMish(network, weightMap, *l34 -> getOutput(0), 128, 3, 1, 1, 35); auto ew36 = network -> addElementWise(*l35 -> getOutput(0), *ew33 -> getOutput(0), ElementWiseOperation::kSUM); auto l37 = convBnMish(network, weightMap, *ew36 -> getOutput(0), 128, 1, 1, 0, 37); auto l38 = convBnMish(network, weightMap, *l37 -> getOutput(0), 128, 3, 1, 1, 38); auto ew39 = network -> addElementWise(*l38 -> getOutput(0), *ew36 -> getOutput(0), ElementWiseOperation::kSUM); auto l40 = convBnMish(network, weightMap, *ew39 -> getOutput(0), 
128, 1, 1, 0, 40); auto l41 = convBnMish(network, weightMap, *l40 -> getOutput(0), 128, 3, 1, 1, 41); auto ew42 = network -> addElementWise(*l41 -> getOutput(0), *ew39 -> getOutput(0), ElementWiseOperation::kSUM); auto l43 = convBnMish(network, weightMap, *ew42 -> getOutput(0), 128, 1, 1, 0, 43); auto l44 = convBnMish(network, weightMap, *l43 -> getOutput(0), 128, 3, 1, 1, 44); auto ew45 = network -> addElementWise(*l44 -> getOutput(0), *ew42 -> getOutput(0), ElementWiseOperation::kSUM); auto l46 = convBnMish(network, weightMap, *ew45 -> getOutput(0), 128, 1, 1, 0, 46); ITensor* inputTensors47[] = {l46 -> getOutput(0), l19 -> getOutput(0)}; auto cat47 = network -> addConcatenation(inputTensors47, 2); auto l48 = convBnMish(network, weightMap, *cat47 -> getOutput(0), 256, 1, 1, 0, 48); auto l49 = convBnMish(network, weightMap, *l48 -> getOutput(0), 512, 3, 2, 1, 49); auto l50 = convBnMish(network, weightMap, *l49 -> getOutput(0), 256, 1, 1, 0, 50); auto l51 = l49; auto l52 = convBnMish(network, weightMap, *l51 -> getOutput(0), 256, 1, 1, 0, 52); auto l53 = convBnMish(network, weightMap, *l52 -> getOutput(0), 256, 1, 1, 0, 53); auto l54 = convBnMish(network, weightMap, *l53 -> getOutput(0), 256, 3, 1, 1, 54); auto ew55 = network -> addElementWise(*l54 -> getOutput(0), *l52 -> getOutput(0), ElementWiseOperation::kSUM); auto l56 = convBnMish(network, weightMap, *ew55 -> getOutput(0), 256, 1, 1, 0, 56); auto l57 = convBnMish(network, weightMap, *l56 -> getOutput(0), 256, 3, 1, 1, 57); auto ew58 = network -> addElementWise(*l57 -> getOutput(0), *ew55 -> getOutput(0), ElementWiseOperation::kSUM); auto l59 = convBnMish(network, weightMap, *ew58 -> getOutput(0), 256, 1, 1, 0, 59); auto l60 = convBnMish(network, weightMap, *l59 -> getOutput(0), 256, 3, 1, 1, 60); auto ew61 = network -> addElementWise(*l60 -> getOutput(0), *ew58 -> getOutput(0), ElementWiseOperation::kSUM); auto l62 = convBnMish(network, weightMap, *ew61 -> getOutput(0), 256, 1, 1, 0, 62); auto l63 = 
convBnMish(network, weightMap, *l62 -> getOutput(0), 256, 3, 1, 1, 63); auto ew64 = network -> addElementWise(*l63 -> getOutput(0), *ew61 -> getOutput(0), ElementWiseOperation::kSUM); auto l65 = convBnMish(network, weightMap, *ew64 -> getOutput(0), 256, 1, 1, 0, 65); auto l66 = convBnMish(network, weightMap, *l65 -> getOutput(0), 256, 3, 1, 1, 66); auto ew67 = network -> addElementWise(*l66 -> getOutput(0), *ew64 -> getOutput(0), ElementWiseOperation::kSUM); auto l68 = convBnMish(network, weightMap, *ew67 -> getOutput(0), 256, 1, 1, 0, 68); auto l69 = convBnMish(network, weightMap, *l68 -> getOutput(0), 256, 3, 1, 1, 69); auto ew70 = network -> addElementWise(*l69 -> getOutput(0), *ew67 -> getOutput(0), ElementWiseOperation::kSUM); auto l71 = convBnMish(network, weightMap, *ew70 -> getOutput(0), 256, 1, 1, 0, 71); auto l72 = convBnMish(network, weightMap, *l71 -> getOutput(0), 256, 3, 1, 1, 72); auto ew73 = network -> addElementWise(*l72 -> getOutput(0), *ew70 -> getOutput(0), ElementWiseOperation::kSUM); auto l74 = convBnMish(network, weightMap, *ew73 -> getOutput(0), 256, 1, 1, 0, 74); auto l75 = convBnMish(network, weightMap, *l74 -> getOutput(0), 256, 3, 1, 1, 75); auto ew76 = network -> addElementWise(*l75 -> getOutput(0), *ew73 -> getOutput(0), ElementWiseOperation::kSUM); auto l77 = convBnMish(network, weightMap, *ew76 -> getOutput(0), 256, 1, 1, 0, 77); ITensor* inputTensors78[] = {l77 -> getOutput(0), l50 -> getOutput(0)}; auto cat78 = network -> addConcatenation(inputTensors78, 2); auto l79 = convBnMish(network, weightMap, *cat78 -> getOutput(0), 512, 1, 1, 0, 79); auto l80 = convBnMish(network, weightMap, *l79 -> getOutput(0), 1024, 3, 2, 1, 80); auto l81 = convBnMish(network, weightMap, *l80 -> getOutput(0), 512, 1, 1, 0, 81); auto l82 = l80; auto l83 = convBnMish(network, weightMap, *l82 -> getOutput(0), 512, 1, 1, 0, 83); auto l84 = convBnMish(network, weightMap, *l83 -> getOutput(0), 512, 1, 1, 0, 84); auto l85 = convBnMish(network, weightMap, *l84 
-> getOutput(0), 512, 3, 1, 1, 85); auto ew86 = network -> addElementWise(*l85 -> getOutput(0), *l83 -> getOutput(0), ElementWiseOperation::kSUM); auto l87 = convBnMish(network, weightMap, *ew86 -> getOutput(0), 512, 1, 1, 0, 87); auto l88 = convBnMish(network, weightMap, *l87 -> getOutput(0), 512, 3, 1, 1, 88); auto ew89 = network -> addElementWise(*l88 -> getOutput(0), *ew86 -> getOutput(0), ElementWiseOperation::kSUM); auto l90 = convBnMish(network, weightMap, *ew89 -> getOutput(0), 512, 1, 1, 0, 90); auto l91 = convBnMish(network, weightMap, *l90 -> getOutput(0), 512, 3, 1, 1, 91); auto ew92 = network -> addElementWise(*l91 -> getOutput(0), *ew89 -> getOutput(0), ElementWiseOperation::kSUM); auto l93 = convBnMish(network, weightMap, *ew92 -> getOutput(0), 512, 1, 1, 0, 93); auto l94 = convBnMish(network, weightMap, *l93 -> getOutput(0), 512, 3, 1, 1, 94); auto ew95 = network -> addElementWise(*l94 -> getOutput(0), *ew92 -> getOutput(0), ElementWiseOperation::kSUM); auto l96 = convBnMish(network, weightMap, *ew95 -> getOutput(0), 512, 1, 1, 0, 96); ITensor* inputTensors97[] = {l96 -> getOutput(0), l81 -> getOutput(0)}; auto cat97 = network -> addConcatenation(inputTensors97, 2); auto l98 = convBnMish(network, weightMap, *cat97 -> getOutput(0), 1024, 1, 1, 0, 98); // ---- auto l99 = convBnMish(network, weightMap, *l98 -> getOutput(0), 512, 1, 1, 0, 99); auto l100 = l98; auto l101 = convBnMish(network, weightMap, *l100 -> getOutput(0), 512, 1, 1, 0, 101); auto l102 = convBnMish(network, weightMap, *l101 -> getOutput(0), 512, 3, 1, 1, 102); auto l103 = convBnMish(network, weightMap, *l102 -> getOutput(0), 512, 1, 1, 0, 103); auto pool104 = network -> addPoolingNd(*l103 -> getOutput(0), PoolingType::kMAX, DimsHW{5, 5}); pool104 -> setPaddingNd(DimsHW{2, 2}); pool104 -> setStrideNd(DimsHW{1, 1}); auto l105 = l103; auto pool106 = network -> addPoolingNd(*l105 -> getOutput(0), PoolingType::kMAX, DimsHW{9, 9}); pool106 -> setPaddingNd(DimsHW{4, 4}); pool106 -> 
setStrideNd(DimsHW{1, 1}); auto l107 = l103; auto pool108 = network -> addPoolingNd(*l107 -> getOutput(0), PoolingType::kMAX, DimsHW{13, 13}); pool108 -> setPaddingNd(DimsHW{6, 6}); pool108 -> setStrideNd(DimsHW{1, 1}); ITensor* inputTensors109[] = {pool108 -> getOutput(0), pool106 -> getOutput(0), pool104 -> getOutput(0), l103 -> getOutput(0)}; auto cat109 = network -> addConcatenation(inputTensors109, 4); // ---- end spp auto l110 = convBnMish(network, weightMap, *cat109 -> getOutput(0), 512, 1, 1, 0, 110); auto l111 = convBnMish(network, weightMap, *l110 -> getOutput(0), 512, 3, 1, 1, 111); ITensor* inputTensors112[] = { l111 -> getOutput(0), l99 -> getOutput(0) }; auto cat112 = network -> addConcatenation(inputTensors112, 2); auto l113 = convBnMish(network, weightMap, *cat112 -> getOutput(0), 512, 1, 1, 0, 113); auto l114 = convBnMish(network, weightMap, *l113 -> getOutput(0), 256, 1, 1, 0, 114); float *deval = reinterpret_cast(malloc(sizeof(float) * 256 * 2 * 2)); for (int i = 0; i < 256 * 2 * 2; i++) { deval[i] = 1.0; } Weights upsamplewts115{DataType::kFLOAT, deval, 256 * 2 * 2}; IDeconvolutionLayer* upsample115 = network -> addDeconvolutionNd(*l114 -> getOutput(0), 256, DimsHW{2, 2}, upsamplewts115, emptywts); assert(upsample115); upsample115 -> setStrideNd(DimsHW{2, 2}); upsample115 -> setNbGroups(256); weightMap["upsample115"] = upsamplewts115; auto l116 = l79; auto l117 = convBnMish(network, weightMap, *l116 -> getOutput(0), 256, 1, 1, 0, 117); ITensor* inputTensors118[] = {l117 -> getOutput(0), upsample115 -> getOutput(0)}; auto cat118 = network -> addConcatenation(inputTensors118, 2); auto l119 = convBnMish(network, weightMap, *cat118 -> getOutput(0), 256, 1, 1, 0, 119); auto l120 = convBnMish(network, weightMap, *l119 -> getOutput(0), 256, 1, 1, 0, 120); auto l121 = l119; auto l122 = convBnMish(network, weightMap, *l121 -> getOutput(0), 256, 1, 1, 0, 122); auto l123 = convBnMish(network, weightMap, *l122 -> getOutput(0), 256, 3, 1, 1, 123); auto l124 
= convBnMish(network, weightMap, *l123 -> getOutput(0), 256, 1, 1, 0, 124); auto l125 = convBnMish(network, weightMap, *l124 -> getOutput(0), 256, 3, 1, 1, 125); ITensor* inputTensors126[] = {l125 -> getOutput(0), l120 -> getOutput(0)}; auto cat126 = network -> addConcatenation(inputTensors126, 2); auto l127 = convBnMish(network, weightMap, *cat126 -> getOutput(0), 256, 1, 1, 0, 127); auto l128 = convBnMish(network, weightMap, *l127 -> getOutput(0), 128, 1, 1, 0, 128); Weights upsamplewts129{DataType::kFLOAT, deval, 128 * 2 * 2}; IDeconvolutionLayer* upsample129 = network -> addDeconvolutionNd(*l128 -> getOutput(0), 128, DimsHW{2, 2}, upsamplewts129, emptywts); assert(upsample129); upsample129 -> setStrideNd(DimsHW{2, 2}); upsample129 -> setNbGroups(128); auto l130 = l48; auto l131 = convBnMish(network, weightMap, *l130 -> getOutput(0), 128, 1, 1, 0, 131); ITensor* inputTensors132[] = {l131 -> getOutput(0), upsample129 -> getOutput(0)}; auto cat132 = network -> addConcatenation(inputTensors132, 2); auto l133 = convBnMish(network, weightMap, *cat132 -> getOutput(0), 128, 1, 1, 0, 133); auto l134 = convBnMish(network, weightMap, *l133 -> getOutput(0), 128, 1, 1, 0, 134); auto l135 = l133; auto l136 = convBnMish(network, weightMap, *l135 -> getOutput(0), 128, 1, 1, 0, 136); auto l137 = convBnMish(network, weightMap, *l136 -> getOutput(0), 128, 3, 1, 1, 137); auto l138 = convBnMish(network, weightMap, *l137 -> getOutput(0), 128, 1, 1, 0, 138); auto l139 = convBnMish(network, weightMap, *l138 -> getOutput(0), 128, 3, 1, 1, 139); ITensor* inputTensors140[] = {l139 -> getOutput(0), l134 -> getOutput(0)}; auto cat140 = network -> addConcatenation(inputTensors140, 2); auto l141 = convBnMish(network, weightMap, *cat140 -> getOutput(0), 128, 1, 1, 0, 141); // --- auto l142 = convBnMish(network, weightMap, *l141 -> getOutput(0), 256, 3, 1, 1, 142); IConvolutionLayer* conv143 = network -> addConvolutionNd(*l142 -> getOutput(0), 3 * (Yolo::CLASS_NUM + 5), DimsHW{1, 1}, 
weightMap["module_list.143.Conv2d.weight"], weightMap["module_list.143.Conv2d.bias"]); assert(conv143); // 144 is yolo layer auto l145 = l141; auto l146 = convBnMish(network, weightMap, *l145 -> getOutput(0), 256, 3, 2, 1, 146); ITensor* inputTensors147[] = {l146 -> getOutput(0), l127 -> getOutput(0)}; auto cat147 = network -> addConcatenation(inputTensors147, 2); auto l148 = convBnMish(network, weightMap, *cat147 -> getOutput(0), 256, 1, 1, 0, 148); auto l149 = convBnMish(network, weightMap, *l148 -> getOutput(0), 256, 1, 1, 0, 149); auto l150 = l148; auto l151 = convBnMish(network, weightMap, *l150 -> getOutput(0), 256, 1, 1, 0, 151); auto l152 = convBnMish(network, weightMap, *l151 -> getOutput(0), 256, 3, 1, 1, 152); auto l153 = convBnMish(network, weightMap, *l152 -> getOutput(0), 256, 1, 1, 0, 153); auto l154 = convBnMish(network, weightMap, *l153 -> getOutput(0), 256, 3, 1, 1, 154); ITensor* inputTensors155[] = {l154 -> getOutput(0), l149 -> getOutput(0)}; auto cat155 = network -> addConcatenation(inputTensors155, 2); auto l156 = convBnMish(network, weightMap, *cat155 -> getOutput(0), 256, 1, 1, 0, 156); auto l157 = convBnMish(network, weightMap, *l156 -> getOutput(0), 512, 3, 1, 1, 157); IConvolutionLayer* conv158 = network -> addConvolutionNd(*l157 -> getOutput(0), 3 * (Yolo::CLASS_NUM + 5), DimsHW{1, 1}, weightMap["module_list.158.Conv2d.weight"], weightMap["module_list.158.Conv2d.bias"]); assert(conv158); // 159 is yolo layer auto l160 = l156; auto l161 = convBnMish(network, weightMap, *l160 -> getOutput(0), 512, 3, 2, 1, 161); ITensor* inputTensors162[] = {l161 -> getOutput(0), l113 -> getOutput(0)}; auto cat162 = network -> addConcatenation(inputTensors162, 2); auto l163 = convBnMish(network, weightMap, *cat162 -> getOutput(0), 512, 1, 1, 0, 163); auto l164 = convBnMish(network, weightMap, *l163 -> getOutput(0), 512, 1, 1, 0, 164); auto l165 = l163; auto l166 = convBnMish(network, weightMap, *l165 -> getOutput(0), 512, 1, 1, 0, 166); auto l167 = 
convBnMish(network, weightMap, *l166 -> getOutput(0), 512, 3, 1, 1, 167); auto l168 = convBnMish(network, weightMap, *l167 -> getOutput(0), 512, 1, 1, 0, 168); auto l169 = convBnMish(network, weightMap, *l168 -> getOutput(0), 512, 3, 1, 1, 169); ITensor* inputTensors170[] = {l169 -> getOutput(0), l164 -> getOutput(0)}; auto cat170 = network -> addConcatenation(inputTensors170, 2); auto l171 = convBnMish(network, weightMap, *cat170 -> getOutput(0), 512, 1, 1, 0, 171); auto l172 = convBnMish(network, weightMap, *l171 -> getOutput(0), 1024, 3, 1, 1, 172); IConvolutionLayer* conv173 = network -> addConvolutionNd(*l172 -> getOutput(0), 3 * (Yolo::CLASS_NUM + 5), DimsHW{1, 1}, weightMap["module_list.173.Conv2d.weight"], weightMap["module_list.173.Conv2d.bias"]); assert(conv173); // 174 is yolo layer // add yolo plugin auto creator = getPluginRegistry() -> getPluginCreator("YoloLayer_TRT", "1"); const PluginFieldCollection* pluginData = creator -> getFieldNames(); IPluginV2* pluginObj = creator -> createPlugin("yololayer", pluginData); ITensor* inputTensorsYolo[] = {conv143 -> getOutput(0), conv158 -> getOutput(0), conv173 -> getOutput(0)}; auto yolo = network -> addPluginV2(inputTensorsYolo, 3, *pluginObj); yolo -> getOutput(0) -> setName(OUTPUT_BLOB_NAME); network -> markOutput(*yolo -> getOutput(0)); // Build engine builder -> setMaxBatchSize(maxBatchSize); config -> setMaxWorkspaceSize(16 * (1 << 20)); // 16MB #ifdef USE_FP16 config -> setFlag(BuilderFlag::kFP16); #endif std::cout << "Building tensorrt engine, please wait for a while..." << std::endl; ICudaEngine* engine = builder -> buildEngineWithConfig(*network, *config); std::cout << "Build engine successfully!" 
<< std::endl; // Don't need the network any more network -> destroy(); // Release host memory for (auto& mem : weightMap) { free((void*) (mem.second.values)); } return engine; } void APIToModel(unsigned int maxBatchSize, IHostMemory** modelStream) { // create builder IBuilder* builder = createInferBuilder(gLogger); // create builder config IBuilderConfig* config = builder -> createBuilderConfig(); // Create model to populate the network, then set the outputs and create an engine ICudaEngine* engine = createEngine(maxBatchSize, builder, config, DataType::kFLOAT); assert(engine != nullptr); // serialize the trt engine (*modelStream) = engine -> serialize(); // Close everything down engine -> destroy(); builder -> destroy(); config -> destroy(); } void doInference(IExecutionContext& context, float* input, float* output, int batchSize) { const ICudaEngine& engine = context.getEngine(); // Pointers to input and output device buffers to pass to engine. // Engine requires exactly IEngine::getNbBindings() number of buffers. assert(engine.getNbBindings() == 2); void* buffers[2]; // In order to bind the buffers, we need to know the names of the input and output tensors. 
// Note that indices are guaranteed to be less than IEngine::getNbBindings() const int inputIndex = engine.getBindingIndex(INPUT_BLOB_NAME); const int outputIndex = engine.getBindingIndex(OUTPUT_BLOB_NAME); // Create GPU buffers on device CUDA_CHECK(cudaMalloc(&buffers[inputIndex], batchSize * 3 * INPUT_H * INPUT_W * sizeof(float))); CUDA_CHECK(cudaMalloc(&buffers[outputIndex], batchSize * OUTPUT_SIZE * sizeof(float))); // Create stream cudaStream_t stream; CUDA_CHECK(cudaStreamCreate(&stream)); // DMA input batch data to device, infer on the batch asynchronously, and DMA output back to host CUDA_CHECK(cudaMemcpyAsync(buffers[inputIndex], input, batchSize * 3 * INPUT_H * INPUT_W * sizeof(float), cudaMemcpyHostToDevice, stream)); context.enqueue(batchSize, buffers, stream, nullptr); CUDA_CHECK(cudaMemcpyAsync(output, buffers[outputIndex], batchSize * OUTPUT_SIZE * sizeof(float), cudaMemcpyDeviceToHost, stream)); cudaStreamSynchronize(stream); // Release stream and buffers cudaStreamDestroy(stream); CUDA_CHECK(cudaFree(buffers[inputIndex])); CUDA_CHECK(cudaFree(buffers[outputIndex])); } int read_files_in_dir(const char* p_dir_name, std::vector &file_names) { DIR *p_dir = opendir(p_dir_name); if (p_dir == nullptr) { return -1; } struct dirent* p_file = nullptr; while ((p_file = readdir(p_dir)) != nullptr) { if (strcmp(p_file -> d_name, ".") != 0 && strcmp(p_file -> d_name, "..") != 0) { std::string cur_file_name(p_file->d_name); file_names.push_back(cur_file_name); } } closedir(p_dir); return 0; } int main(int argc, char** argv){ cudaSetDevice(DEVICE); // create a model using the API directly and serialize it to a stream char *trtModelStream{nullptr}; size_t size{0}; if (argc == 2 && std::string(argv[1]) == "-s") { IHostMemory* modelStream{nullptr}; APIToModel(BATCH_SIZE, &modelStream); assert(modelStream != nullptr); std::ofstream p("yolov4csp.engine", std::ios::binary); if (!p) { std::cerr << "could not open plan output file" << std::endl; return -1; } 
p.write(reinterpret_cast(modelStream->data()), modelStream->size()); modelStream->destroy(); return 0; } else if (argc == 3 && std::string(argv[1]) == "-d") { std::ifstream file("yolov4csp.engine", std::ios::binary); if (file.good()) { file.seekg(0, file.end); size = file.tellg(); file.seekg(0, file.beg); trtModelStream = new char[size]; assert(trtModelStream); file.read(trtModelStream, size); file.close(); } } else { std::cerr << "arguments not right!" << std::endl; std::cerr << "./yolov4 -s // serialize model to plan file" << std::endl; std::cerr << "./yolov4 -d ../samples // deserialize plan file and run inference" << std::endl; return -1; } std::vector file_names; if (read_files_in_dir(argv[2], file_names) < 0) { std::cout << "read_files_in_dir failed." << std::endl; return -1; } // prepare input data --------------------------- static float data[BATCH_SIZE * 3 * INPUT_H * INPUT_W]; //for (int i = 0; i < 3 * INPUT_H * INPUT_W; i++) // data[i] = 1.0; static float prob[BATCH_SIZE * OUTPUT_SIZE]; IRuntime* runtime = createInferRuntime(gLogger); assert(runtime != nullptr); ICudaEngine* engine = runtime->deserializeCudaEngine(trtModelStream, size); assert(engine != nullptr); IExecutionContext* context = engine->createExecutionContext(); assert(context != nullptr); delete[] trtModelStream; int fcount = 0; for (int f = 0; f < (int)file_names.size(); f++) { fcount++; if (fcount < BATCH_SIZE && f + 1 != (int)file_names.size()) continue; for (int b = 0; b < fcount; b++) { cv::Mat img = cv::imread(std::string(argv[2]) + "/" + file_names[f - fcount + 1 + b]); if (img.empty()) continue; cv::Mat pr_img = preprocess_img(img); for (int i = 0; i < INPUT_H * INPUT_W; i++) { data[b * 3 * INPUT_H * INPUT_W + i] = pr_img.at(i)[2] / 255.0; data[b * 3 * INPUT_H * INPUT_W + i + INPUT_H * INPUT_W] = pr_img.at(i)[1] / 255.0; data[b * 3 * INPUT_H * INPUT_W + i + 2 * INPUT_H * INPUT_W] = pr_img.at(i)[0] / 255.0; } } // Run inference auto start = std::chrono::system_clock::now(); 
doInference(*context, data, prob, BATCH_SIZE); auto end = std::chrono::system_clock::now(); std::cout << std::chrono::duration_cast(end - start).count() << "ms" << std::endl; std::vector> batch_res(fcount); for (int b = 0; b < fcount; b++) { auto& res = batch_res[b]; nms(res, &prob[b * OUTPUT_SIZE], BBOX_CONF_THRESH, NMS_THRESH); } for (int b = 0; b < fcount; b++) { auto& res = batch_res[b]; //std::cout << res.size() << std::endl; cv::Mat img = cv::imread(std::string(argv[2]) + "/" + file_names[f - fcount + 1 + b]); for (size_t j = 0; j < res.size(); j++) { float *p = (float*)&res[j]; for (size_t k = 0; k < 7; k++) { std::cout << p[k] << ", "; } std::cout << std::endl; cv::Rect r = get_rect(img, res[j].bbox); cv::rectangle(img, r, cv::Scalar(0x27, 0xC1, 0x36), 2); cv::putText(img, std::to_string((int)res[j].class_id), cv::Point(r.x, r.y - 1), cv::FONT_HERSHEY_PLAIN, 1.2, cv::Scalar(0xFF, 0xFF, 0xFF), 2); } cv::imwrite("_" + file_names[f - fcount + 1 + b], img); } fcount = 0; } // Destroy the engine context->destroy(); engine->destroy(); runtime->destroy(); //Print histogram of the output distribution //std::cout << "\nOutput:\n\n"; //for (unsigned int i = 0; i < OUTPUT_SIZE; i++) //{ // std::cout << prob[i] << ", "; // if (i % 10 == 0) std::cout << i / 10 << std::endl; //} //std::cout << std::endl; return 0; } ================================================ FILE: senet/CMakeLists.txt ================================================ cmake_minimum_required(VERSION 2.6) project(senet) add_definitions(-std=c++11) option(CUDA_USE_STATIC_CUDA_RUNTIME OFF) set(CMAKE_CXX_STANDARD 11) set(CMAKE_BUILD_TYPE Debug) include_directories(${PROJECT_SOURCE_DIR}/include) # include and link dirs of cuda and tensorrt, you need adapt them if yours are different # cuda include_directories(/usr/local/cuda/include) link_directories(/usr/local/cuda/lib64) # tensorrt include_directories(/usr/include/x86_64-linux-gnu/) link_directories(/usr/lib/x86_64-linux-gnu/) add_executable(se_resnet 
${PROJECT_SOURCE_DIR}/se_resnet50.cpp)

target_link_libraries(se_resnet nvinfer)
target_link_libraries(se_resnet cudart)

add_definitions(-O2 -pthread)


================================================
FILE: senet/README.md
================================================
# SENet

An implementation of SENet, proposed in Squeeze-and-Excitation Networks by Jie Hu, Li Shen, Samuel Albanie, Gang Sun, Enhua Wu

[https://arxiv.org/abs/1709.01507](https://arxiv.org/abs/1709.01507)

For the PyTorch implementation, you can refer to [wang-xinyu/senet.pytorch](https://github.com/wang-xinyu/senet.pytorch), which is forked from [moskomule/senet.pytorch](https://github.com/moskomule/senet.pytorch).

```
// 1. generate se_resnet50.wts from [wang-xinyu/senet.pytorch](https://github.com/wang-xinyu/senet.pytorch)

// 2. put se_resnet50.wts into tensorrtx/senet

// 3. build and run

cd tensorrtx/senet

mkdir build

cd build

cmake ..

make

sudo ./se_resnet -s   // serialize model to plan file i.e. 'se_resnet50.engine'

sudo ./se_resnet -d   // deserialize plan file and run inference

// 4. check that the output matches the output of [wang-xinyu/senet.pytorch]
```


================================================
FILE: senet/logging.h
================================================
/*
 * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
*/ #ifndef TENSORRT_LOGGING_H #define TENSORRT_LOGGING_H #include "NvInferRuntimeCommon.h" #include #include #include #include #include #include #include using Severity = nvinfer1::ILogger::Severity; class LogStreamConsumerBuffer : public std::stringbuf { public: LogStreamConsumerBuffer(std::ostream& stream, const std::string& prefix, bool shouldLog) : mOutput(stream) , mPrefix(prefix) , mShouldLog(shouldLog) { } LogStreamConsumerBuffer(LogStreamConsumerBuffer&& other) : mOutput(other.mOutput) { } ~LogStreamConsumerBuffer() { // std::streambuf::pbase() gives a pointer to the beginning of the buffered part of the output sequence // std::streambuf::pptr() gives a pointer to the current position of the output sequence // if the pointer to the beginning is not equal to the pointer to the current position, // call putOutput() to log the output to the stream if (pbase() != pptr()) { putOutput(); } } // synchronizes the stream buffer and returns 0 on success // synchronizing the stream buffer consists of inserting the buffer contents into the stream, // resetting the buffer and flushing the stream virtual int sync() { putOutput(); return 0; } void putOutput() { if (mShouldLog) { // prepend timestamp std::time_t timestamp = std::time(nullptr); tm* tm_local = std::localtime(×tamp); std::cout << "["; std::cout << std::setw(2) << std::setfill('0') << 1 + tm_local->tm_mon << "/"; std::cout << std::setw(2) << std::setfill('0') << tm_local->tm_mday << "/"; std::cout << std::setw(4) << std::setfill('0') << 1900 + tm_local->tm_year << "-"; std::cout << std::setw(2) << std::setfill('0') << tm_local->tm_hour << ":"; std::cout << std::setw(2) << std::setfill('0') << tm_local->tm_min << ":"; std::cout << std::setw(2) << std::setfill('0') << tm_local->tm_sec << "] "; // std::stringbuf::str() gets the string contents of the buffer // insert the buffer contents pre-appended by the appropriate prefix into the stream mOutput << mPrefix << str(); // set the buffer to empty str(""); // flush 
the stream mOutput.flush(); } } void setShouldLog(bool shouldLog) { mShouldLog = shouldLog; } private: std::ostream& mOutput; std::string mPrefix; bool mShouldLog; }; //! //! \class LogStreamConsumerBase //! \brief Convenience object used to initialize LogStreamConsumerBuffer before std::ostream in LogStreamConsumer //! class LogStreamConsumerBase { public: LogStreamConsumerBase(std::ostream& stream, const std::string& prefix, bool shouldLog) : mBuffer(stream, prefix, shouldLog) { } protected: LogStreamConsumerBuffer mBuffer; }; //! //! \class LogStreamConsumer //! \brief Convenience object used to facilitate use of C++ stream syntax when logging messages. //! Order of base classes is LogStreamConsumerBase and then std::ostream. //! This is because the LogStreamConsumerBase class is used to initialize the LogStreamConsumerBuffer member field //! in LogStreamConsumer and then the address of the buffer is passed to std::ostream. //! This is necessary to prevent the address of an uninitialized buffer from being passed to std::ostream. //! Please do not change the order of the parent classes. //! class LogStreamConsumer : protected LogStreamConsumerBase, public std::ostream { public: //! \brief Creates a LogStreamConsumer which logs messages with level severity. //! Reportable severity determines if the messages are severe enough to be logged. 
LogStreamConsumer(Severity reportableSeverity, Severity severity) : LogStreamConsumerBase(severityOstream(severity), severityPrefix(severity), severity <= reportableSeverity) , std::ostream(&mBuffer) // links the stream buffer with the stream , mShouldLog(severity <= reportableSeverity) , mSeverity(severity) { } LogStreamConsumer(LogStreamConsumer&& other) : LogStreamConsumerBase(severityOstream(other.mSeverity), severityPrefix(other.mSeverity), other.mShouldLog) , std::ostream(&mBuffer) // links the stream buffer with the stream , mShouldLog(other.mShouldLog) , mSeverity(other.mSeverity) { } void setReportableSeverity(Severity reportableSeverity) { mShouldLog = mSeverity <= reportableSeverity; mBuffer.setShouldLog(mShouldLog); } private: static std::ostream& severityOstream(Severity severity) { return severity >= Severity::kINFO ? std::cout : std::cerr; } static std::string severityPrefix(Severity severity) { switch (severity) { case Severity::kINTERNAL_ERROR: return "[F] "; case Severity::kERROR: return "[E] "; case Severity::kWARNING: return "[W] "; case Severity::kINFO: return "[I] "; case Severity::kVERBOSE: return "[V] "; default: assert(0); return ""; } } bool mShouldLog; Severity mSeverity; }; //! \class Logger //! //! \brief Class which manages logging of TensorRT tools and samples //! //! \details This class provides a common interface for TensorRT tools and samples to log information to the console, //! and supports logging two types of messages: //! //! - Debugging messages with an associated severity (info, warning, error, or internal error/fatal) //! - Test pass/fail messages //! //! The advantage of having all samples use this class for logging as opposed to emitting directly to stdout/stderr is //! that the logic for controlling the verbosity and formatting of sample output is centralized in one location. //! //! In the future, this class could be extended to support dumping test results to a file in some standard format //! 
(for example, JUnit XML), and providing additional metadata (e.g. timing the duration of a test run). //! //! TODO: For backwards compatibility with existing samples, this class inherits directly from the nvinfer1::ILogger //! interface, which is problematic since there isn't a clean separation between messages coming from the TensorRT //! library and messages coming from the sample. //! //! In the future (once all samples are updated to use Logger::getTRTLogger() to access the ILogger) we can refactor the //! class to eliminate the inheritance and instead make the nvinfer1::ILogger implementation a member of the Logger //! object. class Logger : public nvinfer1::ILogger { public: Logger(Severity severity = Severity::kWARNING) : mReportableSeverity(severity) { } //! //! \enum TestResult //! \brief Represents the state of a given test //! enum class TestResult { kRUNNING, //!< The test is running kPASSED, //!< The test passed kFAILED, //!< The test failed kWAIVED //!< The test was waived }; //! //! \brief Forward-compatible method for retrieving the nvinfer::ILogger associated with this Logger //! \return The nvinfer1::ILogger associated with this Logger //! //! TODO Once all samples are updated to use this method to register the logger with TensorRT, //! we can eliminate the inheritance of Logger from ILogger //! nvinfer1::ILogger& getTRTLogger() { return *this; } //! //! \brief Implementation of the nvinfer1::ILogger::log() virtual method //! //! Note samples should not be calling this function directly; it will eventually go away once we eliminate the //! inheritance from nvinfer1::ILogger //! void log(Severity severity, const char* msg) override { LogStreamConsumer(mReportableSeverity, severity) << "[TRT] " << std::string(msg) << std::endl; } //! //! \brief Method for controlling the verbosity of logging output //! //! \param severity The logger will only emit messages that have severity of this level or higher. //! 
void setReportableSeverity(Severity severity) { mReportableSeverity = severity; } //! //! \brief Opaque handle that holds logging information for a particular test //! //! This object is an opaque handle to information used by the Logger to print test results. //! The sample must call Logger::defineTest() in order to obtain a TestAtom that can be used //! with Logger::reportTest{Start,End}(). //! class TestAtom { public: TestAtom(TestAtom&&) = default; private: friend class Logger; TestAtom(bool started, const std::string& name, const std::string& cmdline) : mStarted(started) , mName(name) , mCmdline(cmdline) { } bool mStarted; std::string mName; std::string mCmdline; }; //! //! \brief Define a test for logging //! //! \param[in] name The name of the test. This should be a string starting with //! "TensorRT" and containing dot-separated strings containing //! the characters [A-Za-z0-9_]. //! For example, "TensorRT.sample_googlenet" //! \param[in] cmdline The command line used to reproduce the test // //! \return a TestAtom that can be used in Logger::reportTest{Start,End}(). //! static TestAtom defineTest(const std::string& name, const std::string& cmdline) { return TestAtom(false, name, cmdline); } //! //! \brief A convenience overloaded version of defineTest() that accepts an array of command-line arguments //! as input //! //! \param[in] name The name of the test //! \param[in] argc The number of command-line arguments //! \param[in] argv The array of command-line arguments (given as C strings) //! //! \return a TestAtom that can be used in Logger::reportTest{Start,End}(). static TestAtom defineTest(const std::string& name, int argc, char const* const* argv) { auto cmdline = genCmdlineString(argc, argv); return defineTest(name, cmdline); } //! //! \brief Report that a test has started. //! //! \pre reportTestStart() has not been called yet for the given testAtom //! //! \param[in] testAtom The handle to the test that has started //! 
static void reportTestStart(TestAtom& testAtom) { reportTestResult(testAtom, TestResult::kRUNNING); assert(!testAtom.mStarted); testAtom.mStarted = true; } //! //! \brief Report that a test has ended. //! //! \pre reportTestStart() has been called for the given testAtom //! //! \param[in] testAtom The handle to the test that has ended //! \param[in] result The result of the test. Should be one of TestResult::kPASSED, //! TestResult::kFAILED, TestResult::kWAIVED //! static void reportTestEnd(const TestAtom& testAtom, TestResult result) { assert(result != TestResult::kRUNNING); assert(testAtom.mStarted); reportTestResult(testAtom, result); } static int reportPass(const TestAtom& testAtom) { reportTestEnd(testAtom, TestResult::kPASSED); return EXIT_SUCCESS; } static int reportFail(const TestAtom& testAtom) { reportTestEnd(testAtom, TestResult::kFAILED); return EXIT_FAILURE; } static int reportWaive(const TestAtom& testAtom) { reportTestEnd(testAtom, TestResult::kWAIVED); return EXIT_SUCCESS; } static int reportTest(const TestAtom& testAtom, bool pass) { return pass ? reportPass(testAtom) : reportFail(testAtom); } Severity getReportableSeverity() const { return mReportableSeverity; } private: //! //! \brief returns an appropriate string for prefixing a log message with the given severity //! static const char* severityPrefix(Severity severity) { switch (severity) { case Severity::kINTERNAL_ERROR: return "[F] "; case Severity::kERROR: return "[E] "; case Severity::kWARNING: return "[W] "; case Severity::kINFO: return "[I] "; case Severity::kVERBOSE: return "[V] "; default: assert(0); return ""; } } //! //! \brief returns an appropriate string for prefixing a test result message with the given result //! 
static const char* testResultString(TestResult result) { switch (result) { case TestResult::kRUNNING: return "RUNNING"; case TestResult::kPASSED: return "PASSED"; case TestResult::kFAILED: return "FAILED"; case TestResult::kWAIVED: return "WAIVED"; default: assert(0); return ""; } } //! //! \brief returns an appropriate output stream (cout or cerr) to use with the given severity //! static std::ostream& severityOstream(Severity severity) { return severity >= Severity::kINFO ? std::cout : std::cerr; } //! //! \brief method that implements logging test results //! static void reportTestResult(const TestAtom& testAtom, TestResult result) { severityOstream(Severity::kINFO) << "&&&& " << testResultString(result) << " " << testAtom.mName << " # " << testAtom.mCmdline << std::endl; } //! //! \brief generate a command line string from the given (argc, argv) values //! static std::string genCmdlineString(int argc, char const* const* argv) { std::stringstream ss; for (int i = 0; i < argc; i++) { if (i > 0) ss << " "; ss << argv[i]; } return ss.str(); } Severity mReportableSeverity; }; namespace { //! //! \brief produces a LogStreamConsumer object that can be used to log messages of severity kVERBOSE //! //! Example usage: //! //! LOG_VERBOSE(logger) << "hello world" << std::endl; //! inline LogStreamConsumer LOG_VERBOSE(const Logger& logger) { return LogStreamConsumer(logger.getReportableSeverity(), Severity::kVERBOSE); } //! //! \brief produces a LogStreamConsumer object that can be used to log messages of severity kINFO //! //! Example usage: //! //! LOG_INFO(logger) << "hello world" << std::endl; //! inline LogStreamConsumer LOG_INFO(const Logger& logger) { return LogStreamConsumer(logger.getReportableSeverity(), Severity::kINFO); } //! //! \brief produces a LogStreamConsumer object that can be used to log messages of severity kWARNING //! //! Example usage: //! //! LOG_WARN(logger) << "hello world" << std::endl; //! 
inline LogStreamConsumer LOG_WARN(const Logger& logger) { return LogStreamConsumer(logger.getReportableSeverity(), Severity::kWARNING); } //! //! \brief produces a LogStreamConsumer object that can be used to log messages of severity kERROR //! //! Example usage: //! //! LOG_ERROR(logger) << "hello world" << std::endl; //! inline LogStreamConsumer LOG_ERROR(const Logger& logger) { return LogStreamConsumer(logger.getReportableSeverity(), Severity::kERROR); } //! //! \brief produces a LogStreamConsumer object that can be used to log messages of severity kINTERNAL_ERROR // ("fatal" severity) //! //! Example usage: //! //! LOG_FATAL(logger) << "hello world" << std::endl; //! inline LogStreamConsumer LOG_FATAL(const Logger& logger) { return LogStreamConsumer(logger.getReportableSeverity(), Severity::kINTERNAL_ERROR); } } // anonymous namespace #endif // TENSORRT_LOGGING_H ================================================ FILE: senet/se_resnet50.cpp ================================================ #include "NvInfer.h" #include "cuda_runtime_api.h" #include "logging.h" #include #include #include #include #include #include #include #define CHECK(status) \ do\ {\ auto ret = (status);\ if (ret != 0)\ {\ std::cerr << "Cuda failure: " << ret << std::endl;\ abort();\ }\ } while (0) // stuff we know about the network and the input/output blobs static const int INPUT_H = 224; static const int INPUT_W = 224; static const int OUTPUT_SIZE = 1000; const char* INPUT_BLOB_NAME = "data"; const char* OUTPUT_BLOB_NAME = "prob"; using namespace nvinfer1; static Logger gLogger; // Load weights from files shared with TensorRT samples. 
// TensorRT weight files have a simple space delimited format: // [type] [size] std::map loadWeights(const std::string file) { std::cout << "Loading weights: " << file << std::endl; std::map weightMap; // Open weights file std::ifstream input(file); assert(input.is_open() && "Unable to load weight file."); // Read number of weight blobs int32_t count; input >> count; assert(count > 0 && "Invalid weight map file."); while (count--) { Weights wt{DataType::kFLOAT, nullptr, 0}; uint32_t size; // Read name and type of blob std::string name; input >> name >> std::dec >> size; wt.type = DataType::kFLOAT; // Load blob uint32_t* val = reinterpret_cast(malloc(sizeof(val) * size)); for (uint32_t x = 0, y = size; x < y; ++x) { input >> std::hex >> val[x]; } wt.values = val; wt.count = size; weightMap[name] = wt; } return weightMap; } IScaleLayer* addBatchNorm2d(INetworkDefinition *network, std::map& weightMap, ITensor& input, std::string lname, float eps) { float *gamma = (float*)weightMap[lname + ".weight"].values; float *beta = (float*)weightMap[lname + ".bias"].values; float *mean = (float*)weightMap[lname + ".running_mean"].values; float *var = (float*)weightMap[lname + ".running_var"].values; int len = weightMap[lname + ".running_var"].count; std::cout << "len " << len << std::endl; float *scval = reinterpret_cast(malloc(sizeof(float) * len)); for (int i = 0; i < len; i++) { scval[i] = gamma[i] / sqrt(var[i] + eps); } Weights scale{DataType::kFLOAT, scval, len}; float *shval = reinterpret_cast(malloc(sizeof(float) * len)); for (int i = 0; i < len; i++) { shval[i] = beta[i] - mean[i] * gamma[i] / sqrt(var[i] + eps); } Weights shift{DataType::kFLOAT, shval, len}; float *pval = reinterpret_cast(malloc(sizeof(float) * len)); for (int i = 0; i < len; i++) { pval[i] = 1.0; } Weights power{DataType::kFLOAT, pval, len}; weightMap[lname + ".scale"] = scale; weightMap[lname + ".shift"] = shift; weightMap[lname + ".power"] = power; IScaleLayer* scale_1 = network->addScale(input, 
ScaleMode::kCHANNEL, shift, scale, power); assert(scale_1); return scale_1; } ILayer* seLayer(INetworkDefinition *network, std::map& weightMap, ITensor& input, int c, int w, std::string lname) { IPoolingLayer* l1 = network->addPoolingNd(input, PoolingType::kAVERAGE, DimsHW(w, w)); assert(l1); l1->setStrideNd(DimsHW{w, w}); IFullyConnectedLayer* l2 = network->addFullyConnected(*l1->getOutput(0), c / 16, weightMap[lname + "fc.0.weight"], weightMap[lname+"fc.0.bias"]); IActivationLayer* relu1 = network->addActivation(*l2->getOutput(0), ActivationType::kRELU); IFullyConnectedLayer* l4 = network->addFullyConnected(*relu1->getOutput(0), c, weightMap[lname+"fc.2.weight"],weightMap[lname+"fc.2.bias"]); IActivationLayer* l5 = network->addActivation(*l4->getOutput(0), ActivationType::kSIGMOID); ILayer* se = network->addElementWise(input, *l5->getOutput(0), ElementWiseOperation::kPROD); assert(se); return se; } IActivationLayer* bottleneck(INetworkDefinition *network, std::map& weightMap, ITensor& input, int inch, int outch, int stride, std::string lname, int w) { Weights emptywts{DataType::kFLOAT, nullptr, 0}; IConvolutionLayer* conv1 = network->addConvolutionNd(input, outch, DimsHW{1, 1}, weightMap[lname + "conv1.weight"], emptywts); assert(conv1); IScaleLayer* bn1 = addBatchNorm2d(network, weightMap, *conv1->getOutput(0), lname + "bn1", 1e-5); IActivationLayer* relu1 = network->addActivation(*bn1->getOutput(0), ActivationType::kRELU); assert(relu1); IConvolutionLayer* conv2 = network->addConvolutionNd(*relu1->getOutput(0), outch, DimsHW{3, 3}, weightMap[lname + "conv2.weight"], emptywts); assert(conv2); conv2->setStrideNd(DimsHW{stride, stride}); conv2->setPaddingNd(DimsHW{1, 1}); IScaleLayer* bn2 = addBatchNorm2d(network, weightMap, *conv2->getOutput(0), lname + "bn2", 1e-5); IActivationLayer* relu2 = network->addActivation(*bn2->getOutput(0), ActivationType::kRELU); assert(relu2); IConvolutionLayer* conv3 = network->addConvolutionNd(*relu2->getOutput(0), outch * 4, 
DimsHW{1, 1}, weightMap[lname + "conv3.weight"], emptywts); assert(conv3); IScaleLayer* bn3 = addBatchNorm2d(network, weightMap, *conv3->getOutput(0), lname + "bn3", 1e-5); ILayer *se = seLayer(network, weightMap, *bn3->getOutput(0), outch * 4, w, lname + "se."); IElementWiseLayer* ew1; if (stride != 1 || inch != outch * 4) { IConvolutionLayer* conv4 = network->addConvolutionNd(input, outch * 4, DimsHW{1, 1}, weightMap[lname + "downsample.0.weight"], emptywts); assert(conv4); conv4->setStrideNd(DimsHW{stride, stride}); IScaleLayer* bn4 = addBatchNorm2d(network, weightMap, *conv4->getOutput(0), lname + "downsample.1", 1e-5); ew1 = network->addElementWise(*bn4->getOutput(0), *se->getOutput(0), ElementWiseOperation::kSUM); } else { ew1 = network->addElementWise(input, *se->getOutput(0), ElementWiseOperation::kSUM); } IActivationLayer* relu3 = network->addActivation(*ew1->getOutput(0), ActivationType::kRELU); assert(relu3); return relu3; } // Creat the engine using only the API and not any parser. ICudaEngine* createEngine(unsigned int maxBatchSize, IBuilder* builder, IBuilderConfig* config, DataType dt) { INetworkDefinition* network = builder->createNetworkV2(0U); // Create input tensor of shape { 3, INPUT_H, INPUT_W } with name INPUT_BLOB_NAME ITensor* data = network->addInput(INPUT_BLOB_NAME, dt, Dims3{3, INPUT_H, INPUT_W}); assert(data); std::map weightMap = loadWeights("../se_resnet50.wts"); Weights emptywts{DataType::kFLOAT, nullptr, 0}; IConvolutionLayer* conv1 = network->addConvolutionNd(*data, 64, DimsHW{7, 7}, weightMap["conv1.weight"], emptywts); assert(conv1); conv1->setStrideNd(DimsHW{2, 2}); conv1->setPaddingNd(DimsHW{3, 3}); IScaleLayer* bn1 = addBatchNorm2d(network, weightMap, *conv1->getOutput(0), "bn1", 1e-5); // Add activation layer using the ReLU algorithm. 
IActivationLayer* relu1 = network->addActivation(*bn1->getOutput(0), ActivationType::kRELU); assert(relu1); IPoolingLayer* pool1 = network->addPoolingNd(*relu1->getOutput(0), PoolingType::kMAX, DimsHW{3, 3}); assert(pool1); pool1->setStrideNd(DimsHW{2, 2}); pool1->setPaddingNd(DimsHW{1, 1}); IActivationLayer* x = bottleneck(network, weightMap, *pool1->getOutput(0), 64, 64, 1, "layer1.0.", 56); x = bottleneck(network, weightMap, *x->getOutput(0), 256, 64, 1, "layer1.1.", 56); x = bottleneck(network, weightMap, *x->getOutput(0), 256, 64, 1, "layer1.2.", 56); x = bottleneck(network, weightMap, *x->getOutput(0), 256, 128, 2, "layer2.0.", 28); x = bottleneck(network, weightMap, *x->getOutput(0), 512, 128, 1, "layer2.1.", 28); x = bottleneck(network, weightMap, *x->getOutput(0), 512, 128, 1, "layer2.2.", 28); x = bottleneck(network, weightMap, *x->getOutput(0), 512, 128, 1, "layer2.3.", 28); x = bottleneck(network, weightMap, *x->getOutput(0), 512, 256, 2, "layer3.0.", 14); x = bottleneck(network, weightMap, *x->getOutput(0), 1024, 256, 1, "layer3.1.", 14); x = bottleneck(network, weightMap, *x->getOutput(0), 1024, 256, 1, "layer3.2.", 14); x = bottleneck(network, weightMap, *x->getOutput(0), 1024, 256, 1, "layer3.3.", 14); x = bottleneck(network, weightMap, *x->getOutput(0), 1024, 256, 1, "layer3.4.", 14); x = bottleneck(network, weightMap, *x->getOutput(0), 1024, 256, 1, "layer3.5.", 14); x = bottleneck(network, weightMap, *x->getOutput(0), 1024, 512, 2, "layer4.0.", 7); x = bottleneck(network, weightMap, *x->getOutput(0), 2048, 512, 1, "layer4.1.", 7); x = bottleneck(network, weightMap, *x->getOutput(0), 2048, 512, 1, "layer4.2.", 7); IPoolingLayer* pool2 = network->addPoolingNd(*x->getOutput(0), PoolingType::kAVERAGE, DimsHW{7, 7}); assert(pool2); pool2->setStrideNd(DimsHW{1, 1}); IFullyConnectedLayer* fc1 = network->addFullyConnected(*pool2->getOutput(0), 1000, weightMap["fc.weight"], weightMap["fc.bias"]); assert(fc1); fc1->getOutput(0)->setName(OUTPUT_BLOB_NAME); 
std::cout << "set name out" << std::endl; network->markOutput(*fc1->getOutput(0)); // Build engine builder->setMaxBatchSize(maxBatchSize); config->setMaxWorkspaceSize(1 << 20); ICudaEngine* engine = builder->buildEngineWithConfig(*network, *config); std::cout << "build out" << std::endl; // Don't need the network any more network->destroy(); // Release host memory for (auto& mem : weightMap) { free((void*) (mem.second.values)); } return engine; } void APIToModel(unsigned int maxBatchSize, IHostMemory** modelStream) { // Create builder IBuilder* builder = createInferBuilder(gLogger); IBuilderConfig* config = builder->createBuilderConfig(); // Create model to populate the network, then set the outputs and create an engine ICudaEngine* engine = createEngine(maxBatchSize, builder, config, DataType::kFLOAT); assert(engine != nullptr); // Serialize the engine (*modelStream) = engine->serialize(); // Close everything down engine->destroy(); builder->destroy(); config->destroy(); } void doInference(IExecutionContext& context, float* input, float* output, int batchSize) { const ICudaEngine& engine = context.getEngine(); // Pointers to input and output device buffers to pass to engine. // Engine requires exactly IEngine::getNbBindings() number of buffers. assert(engine.getNbBindings() == 2); void* buffers[2]; // In order to bind the buffers, we need to know the names of the input and output tensors. 
// Note that indices are guaranteed to be less than IEngine::getNbBindings() const int inputIndex = engine.getBindingIndex(INPUT_BLOB_NAME); const int outputIndex = engine.getBindingIndex(OUTPUT_BLOB_NAME); // Create GPU buffers on device CHECK(cudaMalloc(&buffers[inputIndex], batchSize * 3 * INPUT_H * INPUT_W * sizeof(float))); CHECK(cudaMalloc(&buffers[outputIndex], batchSize * OUTPUT_SIZE * sizeof(float))); // Create stream cudaStream_t stream; CHECK(cudaStreamCreate(&stream)); // DMA input batch data to device, infer on the batch asynchronously, and DMA output back to host CHECK(cudaMemcpyAsync(buffers[inputIndex], input, batchSize * 3 * INPUT_H * INPUT_W * sizeof(float), cudaMemcpyHostToDevice, stream)); context.enqueue(batchSize, buffers, stream, nullptr); CHECK(cudaMemcpyAsync(output, buffers[outputIndex], batchSize * OUTPUT_SIZE * sizeof(float), cudaMemcpyDeviceToHost, stream)); cudaStreamSynchronize(stream); // Release stream and buffers cudaStreamDestroy(stream); CHECK(cudaFree(buffers[inputIndex])); CHECK(cudaFree(buffers[outputIndex])); } int main(int argc, char** argv) { if (argc != 2) { std::cerr << "arguments not right!" 
<< std::endl; std::cerr << "./se_resnet -s // serialize model to plan file" << std::endl; std::cerr << "./se_resnet -d // deserialize plan file and run inference" << std::endl; return -1; } // create a model using the API directly and serialize it to a stream char *trtModelStream{nullptr}; size_t size{0}; if (std::string(argv[1]) == "-s") { IHostMemory* modelStream{nullptr}; APIToModel(1, &modelStream); assert(modelStream != nullptr); std::ofstream p("se_resnet50.engine", std::ios::binary); if (!p) { std::cerr << "could not open plan output file" << std::endl; return -1; } p.write(reinterpret_cast(modelStream->data()), modelStream->size()); modelStream->destroy(); return 1; } else if (std::string(argv[1]) == "-d") { std::ifstream file("se_resnet50.engine", std::ios::binary); if (file.good()) { file.seekg(0, file.end); size = file.tellg(); file.seekg(0, file.beg); trtModelStream = new char[size]; assert(trtModelStream); file.read(trtModelStream, size); file.close(); } } else { return -1; } static float data[3 * INPUT_H * INPUT_W]; for (int i = 0; i < 3 * INPUT_H * INPUT_W; i++) data[i] = 1.0; IRuntime* runtime = createInferRuntime(gLogger); assert(runtime != nullptr); ICudaEngine* engine = runtime->deserializeCudaEngine(trtModelStream, size, nullptr); assert(engine != nullptr); IExecutionContext* context = engine->createExecutionContext(); assert(context != nullptr); delete[] trtModelStream; // Run inference static float prob[OUTPUT_SIZE]; for (int i = 0; i < 10; i++) { auto start = std::chrono::system_clock::now(); doInference(*context, data, prob, 1); auto end = std::chrono::system_clock::now(); std::cout << std::chrono::duration_cast(end - start).count() << "ms" << std::endl; } // Destroy the engine context->destroy(); engine->destroy(); runtime->destroy(); // Print histogram of the output distribution std::cout << "\nOutput:\n\n"; for (unsigned int i = 0; i < OUTPUT_SIZE; i++) { std::cout << prob[i] << ", "; if (i % 10 == 0) std::cout << std::endl; } std::cout 
<< std::endl; return 0; } ================================================ FILE: shufflenetv2/CMakeLists.txt ================================================ cmake_minimum_required(VERSION 3.14) project( shufflenetv2 VERSION 0.1 LANGUAGES C CXX CUDA) if(NOT DEFINED CMAKE_CUDA_ARCHITECTURES) set(CMAKE_CUDA_ARCHITECTURES 60 70 72 75 80 86 89) endif() set(CMAKE_CXX_STANDARD 17) set(CMAKE_CXX_STANDARD_REQUIRED ON) set(CMAKE_CUDA_STANDARD 17) set(CMAKE_CUDA_STANDARD_REQUIRED ON) set(CMAKE_EXPORT_COMPILE_COMMANDS ON) set(CMAKE_INCLUDE_CURRENT_DIR TRUE) option(CUDA_USE_STATIC_CUDA_RUNTIME "Use static cudaruntime library" OFF) find_package(Threads REQUIRED) find_package(CUDAToolkit REQUIRED) find_package(OpenCV REQUIRED) if(NOT TARGET TensorRT::TensorRT) include(FindTensorRT.cmake) endif() add_executable(${PROJECT_NAME} ${PROJECT_NAME}.cpp) target_include_directories(${PROJECT_NAME} PRIVATE ${OpenCV_INCLUDE_DIRS}) target_link_libraries(${PROJECT_NAME} PRIVATE Threads::Threads CUDA::cudart TensorRT::TensorRT ${OpenCV_LIBS}) ================================================ FILE: shufflenetv2/FindTensorRT.cmake ================================================ cmake_minimum_required(VERSION 3.17.0) function(_guess_path var_name required_files) set(_result "") foreach(path_entry IN LISTS ARGN) if(NOT EXISTS "${path_entry}") message(DEBUG "skip non-existing path '${path_entry}'") continue() endif() set(_ok TRUE) foreach(required_file IN LISTS required_files) if(NOT EXISTS "${path_entry}/${required_file}") set(_ok FALSE) message(DEBUG "'${path_entry}' missing '${required_file}'") break() endif() endforeach() if(_ok) list(APPEND _result "${path_entry}") message(DEBUG "accept '${path_entry}'") else() message(DEBUG "reject '${path_entry}'") endif() endforeach() if(_result STREQUAL "") message( FATAL_ERROR "_guess_path(${var_name}) failed: no valid path found. 
required_files='${required_files}' candidates='${ARGN}'"
    )
  endif()

  set(${var_name}
      "${_result}"
      PARENT_SCOPE)
endfunction()

# add library
add_library(TensorRT IMPORTED INTERFACE)
add_library(TensorRT::TensorRT ALIAS TensorRT)

set(TRT_VERSION
    CACHE
      STRING
      "TensorRT version, e.g. \"8.6.1.6\" or \"8.6.1.6+cuda12.0.1.011\", \"8.6.1.6.Windows10.x86_64.cuda-12.0\" etc"
)

# The environment lookup is quoted on purpose: an unset variable expands to
# nothing, and an unquoted empty operand changes how if() parses the condition.
if(NOT TRT_VERSION STREQUAL "" AND NOT "$ENV{TRT_VERSION}" STREQUAL "")
  message(
    WARNING
      "TRT_VERSION defined by cmake and environment variable both, using the later one"
  )
endif()

if(NOT "$ENV{TRT_VERSION}" STREQUAL "")
  set(TRT_VERSION $ENV{TRT_VERSION})
endif()

# Quote the input so an empty TRT_VERSION still yields a well-formed call; the
# major version is the first run of digits.
string(REGEX MATCH "([0-9]+)" _match "${TRT_VERSION}")
set(TRT_MAJOR_VERSION "${_match}")
unset(_match)

# The numeric comparisons below need a major version; fail with a readable
# message instead of a malformed-command error.
if("${TRT_MAJOR_VERSION}" STREQUAL "")
  message(
    FATAL_ERROR
      "TRT_VERSION is empty or has no numeric major version; pass -DTRT_VERSION=<version> or export TRT_VERSION"
  )
endif()

if(WIN32)
  set(TensorRT_DIR "C:/Program Files/TensorRT-${TRT_VERSION}")
  if(NOT EXISTS "${TensorRT_DIR}")
    message(
      FATAL_ERROR
        "TensorRT_DIR=${TensorRT_DIR} does not exist!"
    )
  endif()

  # TensorRT 10 and later suffix the Windows library names with the major version
  if(${TRT_MAJOR_VERSION} GREATER_EQUAL 10)
    set(_modules nvinfer_10 nvinfer_plugin_10 nvinfer_vc_plugin_10
                 nvinfer_dispatch_10 nvinfer_lean_10)
    message(DEBUG "Using ${_modules}")
  else()
    set(_modules nvinfer nvinfer_plugin nvinfer_vc_plugin nvinfer_dispatch
                 nvinfer_lean)
  endif()

  set(TensorRT_LIBRARY_DIR "${TensorRT_DIR}/lib")
  set(TensorRT_INCLUDE_DIR "${TensorRT_DIR}/include")
elseif(UNIX)
  string(TOLOWER "${CMAKE_SYSTEM_PROCESSOR}" _trt_arch)
  set(_trt_include_candidates)
  if(_trt_arch MATCHES "^(aarch64|arm64|arch64)$")
    set(_trt_include_candidates
        "/usr/include/aarch64-linux-gnu" "/usr/include"
        "/usr/local/cuda/targets/aarch64-linux/include")
    set(_trt_library_candidates
        "/usr/local/tensorrt/targets/aarch64-linux-gnu/lib"
        "/usr/lib/aarch64-linux-gnu" "/usr/lib/aarch64-linux-gnu/tegra"
        "/usr/lib")
  elseif(_trt_arch MATCHES "^(x86_64|amd64)$")
    set(_trt_include_candidates
        "/usr/local/tensorrt/targets/x86_64-linux-gnu/include"
        "/usr/include/x86_64-linux-gnu" "/usr/include")
    set(_trt_library_candidates
        "/usr/local/tensorrt/targets/x86_64-linux-gnu/lib"
        "/usr/lib/x86_64-linux-gnu"
"/usr/lib") else() message(FATAL_ERROR "Unknown architecture") endif() set(_modules nvinfer nvinfer_plugin) if(${TRT_MAJOR_VERSION} GREATER_EQUAL 8) list(APPEND _modules nvinfer_vc_plugin nvinfer_dispatch nvinfer_lean) endif() _guess_path(TensorRT_LIBRARY_DIR "libnvinfer.so;libnvinfer_plugin.so" ${_trt_library_candidates}) message(STATUS "TensorRT libraries: ${TensorRT_LIBRARY_DIR}") _guess_path(TensorRT_INCLUDE_DIR "NvInfer.h" ${_trt_include_candidates}) message(STATUS "TensorRT includes: ${TensorRT_INCLUDE_DIR}") endif() foreach(lib IN LISTS _modules) find_library( TensorRT_${lib}_LIBRARY NAMES ${lib} HINTS ${TensorRT_LIBRARY_DIR}) list(APPEND TensorRT_LIBRARIES ${TensorRT_${lib}_LIBRARY}) endforeach() target_link_libraries(TensorRT INTERFACE ${TensorRT_LIBRARIES}) message(STATUS "Found TensorRT libs: ${TensorRT_LIBRARIES}") set_target_properties( TensorRT PROPERTIES C_STANDARD 17 CXX_STANDARD 17 POSITION_INDEPENDENT_CODE ON SKIP_BUILD_RPATH TRUE BUILD_WITH_INSTALL_RPATH TRUE INSTALL_RPATH "$ORIGIN" INTERFACE_INCLUDE_DIRECTORIES "${TensorRT_INCLUDE_DIR}") unset(TRT_MAJOR_VERSION) unset(_modules) unset(_trt_include_candidates) unset(_trt_library_candidates) unset(_trt_arch) ================================================ FILE: shufflenetv2/README.md ================================================ # shufflenet v2 ShuffleNetV2 with 0.5x output channels, as described in: [ShuffleNet V2: Practical Guidelines for Efficient CNN Architecture Design](https://arxiv.org/abs/1807.11164) Following tricks are used in this demo: - `torch.chunk` is used in shufflenet v2. We implemented the `chunk(2, dim=C)` by tensorrt plugin. Which is the simplest plugin in this tensorrtx project. You can learn the basic procedures of build tensorrt plugin. - shuffle layer is used, the `channel_shuffle()` in `pytorchx/shufflenet` can be implemented by two shuffle layers in tensorrt. - Batchnorm layer, implemented by scale layer. ## Usage 1. use `gen_wts.py` to generate wts file. 
```bash python3 gen_wts.py ``` 2. build C++ code ```bash pushd tensorrtx/shufflenetv2 cmake -S . -B build -G Ninja --fresh cmake --build build ``` 3. serialize wts model to engine file. ```bash ./build/shufflenetv2 -s ``` 4. run inference ```bash ./build/shufflenetv2 -i ``` The inference output looks like: ```bash ... 328us -5.481, -0.1151, 4.004, -1.47, 1.007, -5.943, -2.311, 1.708, 1.569, 0.3112, 1.589, 0.1816, -2.253, -3.261, -3.269, -0.9116, -2.132, -1.159, -2.108, -0.3869, -4.653, ==== ... prediction result: Top: 0 idx: 285, logits: 10.44, label: Egyptian cat Top: 1 idx: 309, logits: 10.19, label: bee Top: 2 idx: 94, logits: 9.399, label: hummingbird ``` ================================================ FILE: shufflenetv2/gen_wts.py ================================================ import struct import cv2 import numpy as np import torch from torchvision.models.shufflenetv2 import ( shufflenet_v2_x0_5, shufflenet_v2_x1_0, shufflenet_v2_x1_5, shufflenet_v2_x2_0, ) def read_imagenet_labels() -> dict[int, str]: """ read ImageNet 1000 labels Returns: dict[int, str]: labels dict """ clsid2label = {} with open("../assets/imagenet1000_clsidx_to_labels.txt", "r") as f: for i in f.readlines(): k, v = i.split(": ") clsid2label.setdefault(int(k), v[1:-3]) return clsid2label def preprocess(img: np.array) -> torch.Tensor: """ a preprocess method align with ImageNet dataset Args: img (np.array): input image Returns: torch.Tensor: preprocessed image in `NCHW` layout """ img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB).astype(np.float32) / 255.0 img = cv2.resize(img, (224, 224), interpolation=cv2.INTER_LINEAR) mean = np.array([0.485, 0.456, 0.406], dtype=np.float32) std = np.array([0.229, 0.224, 0.225], dtype=np.float32) img = (img - mean) / std img = img.transpose(2, 0, 1)[None, ...] 
return torch.from_numpy(img) if __name__ == "__main__": labels = read_imagenet_labels() img = cv2.imread("../assets/cats.jpg", cv2.IMREAD_COLOR) img = preprocess(img) """ NOTE: comment out the model you don't want """ models = [ ("shufflenet_v2_x0_5", shufflenet_v2_x0_5(pretrained=True)), ("shufflenet_v2_x1_0", shufflenet_v2_x1_0(pretrained=True)), ("shufflenet_v2_x1_5", shufflenet_v2_x1_5(pretrained=True)), ("shufflenet_v2_x2_0", shufflenet_v2_x2_0(pretrained=True)), ] for name, model in models: model.eval() with torch.inference_mode(): output = model(img) print(f"{name} result:") for i, batch in enumerate(torch.topk(output, k=3).indices): for j, idx in enumerate(batch): print(f"\tBatch: {i}, Top: {j}, logits: {output[i][idx]:.4f}, label: {labels[int(idx)]}") print(f"{'=' * 32}") with open(f"../models/{name}.wts", "w") as f: f.write("{}\n".format(len(model.state_dict().keys()))) for k, v in model.state_dict().items(): print("key: ", k) print("value: ", v.shape) vr = v.reshape(-1).cpu().numpy() f.write("{} {}".format(k, len(vr))) for vv in vr: f.write(" ") f.write(struct.pack(">f", float(vv)).hex()) f.write("\n") ================================================ FILE: shufflenetv2/logging.h ================================================ /* * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
*/ #ifndef TENSORRT_LOGGING_H #define TENSORRT_LOGGING_H #include #include #include #include #include #include #include #include "NvInferRuntime.h" #include "macros.h" using Severity = nvinfer1::ILogger::Severity; class LogStreamConsumerBuffer : public std::stringbuf { public: LogStreamConsumerBuffer(std::ostream& stream, const std::string& prefix, bool shouldLog) : mOutput(stream), mPrefix(prefix), mShouldLog(shouldLog) {} LogStreamConsumerBuffer(LogStreamConsumerBuffer&& other) : mOutput(other.mOutput) {} ~LogStreamConsumerBuffer() { // std::streambuf::pbase() gives a pointer to the beginning of the buffered part of the output sequence // std::streambuf::pptr() gives a pointer to the current position of the output sequence // if the pointer to the beginning is not equal to the pointer to the current position, // call putOutput() to log the output to the stream if (pbase() != pptr()) { putOutput(); } } // synchronizes the stream buffer and returns 0 on success // synchronizing the stream buffer consists of inserting the buffer contents into the stream, // resetting the buffer and flushing the stream virtual int sync() { putOutput(); return 0; } void putOutput() { if (mShouldLog) { // prepend timestamp std::time_t timestamp = std::time(nullptr); tm* tm_local = std::localtime(×tamp); std::cout << "["; std::cout << std::setw(2) << std::setfill('0') << 1 + tm_local->tm_mon << "/"; std::cout << std::setw(2) << std::setfill('0') << tm_local->tm_mday << "/"; std::cout << std::setw(4) << std::setfill('0') << 1900 + tm_local->tm_year << "-"; std::cout << std::setw(2) << std::setfill('0') << tm_local->tm_hour << ":"; std::cout << std::setw(2) << std::setfill('0') << tm_local->tm_min << ":"; std::cout << std::setw(2) << std::setfill('0') << tm_local->tm_sec << "] "; // std::stringbuf::str() gets the string contents of the buffer // insert the buffer contents pre-appended by the appropriate prefix into the stream mOutput << mPrefix << str(); // set the buffer to empty 
str(""); // flush the stream mOutput.flush(); } } void setShouldLog(bool shouldLog) { mShouldLog = shouldLog; } private: std::ostream& mOutput; std::string mPrefix; bool mShouldLog; }; //! //! \class LogStreamConsumerBase //! \brief Convenience object used to initialize LogStreamConsumerBuffer before std::ostream in LogStreamConsumer //! class LogStreamConsumerBase { public: LogStreamConsumerBase(std::ostream& stream, const std::string& prefix, bool shouldLog) : mBuffer(stream, prefix, shouldLog) {} protected: LogStreamConsumerBuffer mBuffer; }; //! //! \class LogStreamConsumer //! \brief Convenience object used to facilitate use of C++ stream syntax when logging messages. //! Order of base classes is LogStreamConsumerBase and then std::ostream. //! This is because the LogStreamConsumerBase class is used to initialize the LogStreamConsumerBuffer member field //! in LogStreamConsumer and then the address of the buffer is passed to std::ostream. //! This is necessary to prevent the address of an uninitialized buffer from being passed to std::ostream. //! Please do not change the order of the parent classes. //! class LogStreamConsumer : protected LogStreamConsumerBase, public std::ostream { public: //! \brief Creates a LogStreamConsumer which logs messages with level severity. //! Reportable severity determines if the messages are severe enough to be logged. 
LogStreamConsumer(Severity reportableSeverity, Severity severity) : LogStreamConsumerBase(severityOstream(severity), severityPrefix(severity), severity <= reportableSeverity), std::ostream(&mBuffer) // links the stream buffer with the stream , mShouldLog(severity <= reportableSeverity), mSeverity(severity) {} LogStreamConsumer(LogStreamConsumer&& other) : LogStreamConsumerBase(severityOstream(other.mSeverity), severityPrefix(other.mSeverity), other.mShouldLog), std::ostream(&mBuffer) // links the stream buffer with the stream , mShouldLog(other.mShouldLog), mSeverity(other.mSeverity) {} void setReportableSeverity(Severity reportableSeverity) { mShouldLog = mSeverity <= reportableSeverity; mBuffer.setShouldLog(mShouldLog); } private: static std::ostream& severityOstream(Severity severity) { return severity >= Severity::kINFO ? std::cout : std::cerr; } static std::string severityPrefix(Severity severity) { switch (severity) { case Severity::kINTERNAL_ERROR: return "[F] "; case Severity::kERROR: return "[E] "; case Severity::kWARNING: return "[W] "; case Severity::kINFO: return "[I] "; case Severity::kVERBOSE: return "[V] "; default: assert(0); return ""; } } bool mShouldLog; Severity mSeverity; }; //! \class Logger //! //! \brief Class which manages logging of TensorRT tools and samples //! //! \details This class provides a common interface for TensorRT tools and samples to log information to the console, //! and supports logging two types of messages: //! //! - Debugging messages with an associated severity (info, warning, error, or internal error/fatal) //! - Test pass/fail messages //! //! The advantage of having all samples use this class for logging as opposed to emitting directly to stdout/stderr is //! that the logic for controlling the verbosity and formatting of sample output is centralized in one location. //! //! In the future, this class could be extended to support dumping test results to a file in some standard format //! 
(for example, JUnit XML), and providing additional metadata (e.g. timing the duration of a test run). //! //! TODO: For backwards compatibility with existing samples, this class inherits directly from the nvinfer1::ILogger //! interface, which is problematic since there isn't a clean separation between messages coming from the TensorRT //! library and messages coming from the sample. //! //! In the future (once all samples are updated to use Logger::getTRTLogger() to access the ILogger) we can refactor the //! class to eliminate the inheritance and instead make the nvinfer1::ILogger implementation a member of the Logger //! object. class Logger : public nvinfer1::ILogger { public: Logger(Severity severity = Severity::kWARNING) : mReportableSeverity(severity) {} //! //! \enum TestResult //! \brief Represents the state of a given test //! enum class TestResult { kRUNNING, //!< The test is running kPASSED, //!< The test passed kFAILED, //!< The test failed kWAIVED //!< The test was waived }; //! //! \brief Forward-compatible method for retrieving the nvinfer::ILogger associated with this Logger //! \return The nvinfer1::ILogger associated with this Logger //! //! TODO Once all samples are updated to use this method to register the logger with TensorRT, //! we can eliminate the inheritance of Logger from ILogger //! nvinfer1::ILogger& getTRTLogger() { return *this; } //! //! \brief Implementation of the nvinfer1::ILogger::log() virtual method //! //! Note samples should not be calling this function directly; it will eventually go away once we eliminate the //! inheritance from nvinfer1::ILogger //! void log(Severity severity, const char* msg) TRT_NOEXCEPT override { LogStreamConsumer(mReportableSeverity, severity) << "[TRT] " << std::string(msg) << std::endl; } //! //! \brief Method for controlling the verbosity of logging output //! //! \param severity The logger will only emit messages that have severity of this level or higher. //! 
void setReportableSeverity(Severity severity) { mReportableSeverity = severity; } //! //! \brief Opaque handle that holds logging information for a particular test //! //! This object is an opaque handle to information used by the Logger to print test results. //! The sample must call Logger::defineTest() in order to obtain a TestAtom that can be used //! with Logger::reportTest{Start,End}(). //! class TestAtom { public: TestAtom(TestAtom&&) = default; private: friend class Logger; TestAtom(bool started, const std::string& name, const std::string& cmdline) : mStarted(started), mName(name), mCmdline(cmdline) {} bool mStarted; std::string mName; std::string mCmdline; }; //! //! \brief Define a test for logging //! //! \param[in] name The name of the test. This should be a string starting with //! "TensorRT" and containing dot-separated strings containing //! the characters [A-Za-z0-9_]. //! For example, "TensorRT.sample_googlenet" //! \param[in] cmdline The command line used to reproduce the test // //! \return a TestAtom that can be used in Logger::reportTest{Start,End}(). //! static TestAtom defineTest(const std::string& name, const std::string& cmdline) { return TestAtom(false, name, cmdline); } //! //! \brief A convenience overloaded version of defineTest() that accepts an array of command-line arguments //! as input //! //! \param[in] name The name of the test //! \param[in] argc The number of command-line arguments //! \param[in] argv The array of command-line arguments (given as C strings) //! //! \return a TestAtom that can be used in Logger::reportTest{Start,End}(). static TestAtom defineTest(const std::string& name, int argc, char const* const* argv) { auto cmdline = genCmdlineString(argc, argv); return defineTest(name, cmdline); } //! //! \brief Report that a test has started. //! //! \pre reportTestStart() has not been called yet for the given testAtom //! //! \param[in] testAtom The handle to the test that has started //! 
static void reportTestStart(TestAtom& testAtom) { reportTestResult(testAtom, TestResult::kRUNNING); assert(!testAtom.mStarted); testAtom.mStarted = true; } //! //! \brief Report that a test has ended. //! //! \pre reportTestStart() has been called for the given testAtom //! //! \param[in] testAtom The handle to the test that has ended //! \param[in] result The result of the test. Should be one of TestResult::kPASSED, //! TestResult::kFAILED, TestResult::kWAIVED //! static void reportTestEnd(const TestAtom& testAtom, TestResult result) { assert(result != TestResult::kRUNNING); assert(testAtom.mStarted); reportTestResult(testAtom, result); } static int reportPass(const TestAtom& testAtom) { reportTestEnd(testAtom, TestResult::kPASSED); return EXIT_SUCCESS; } static int reportFail(const TestAtom& testAtom) { reportTestEnd(testAtom, TestResult::kFAILED); return EXIT_FAILURE; } static int reportWaive(const TestAtom& testAtom) { reportTestEnd(testAtom, TestResult::kWAIVED); return EXIT_SUCCESS; } static int reportTest(const TestAtom& testAtom, bool pass) { return pass ? reportPass(testAtom) : reportFail(testAtom); } Severity getReportableSeverity() const { return mReportableSeverity; } private: //! //! \brief returns an appropriate string for prefixing a log message with the given severity //! static const char* severityPrefix(Severity severity) { switch (severity) { case Severity::kINTERNAL_ERROR: return "[F] "; case Severity::kERROR: return "[E] "; case Severity::kWARNING: return "[W] "; case Severity::kINFO: return "[I] "; case Severity::kVERBOSE: return "[V] "; default: assert(0); return ""; } } //! //! \brief returns an appropriate string for prefixing a test result message with the given result //! 
static const char* testResultString(TestResult result) { switch (result) { case TestResult::kRUNNING: return "RUNNING"; case TestResult::kPASSED: return "PASSED"; case TestResult::kFAILED: return "FAILED"; case TestResult::kWAIVED: return "WAIVED"; default: assert(0); return ""; } } //! //! \brief returns an appropriate output stream (cout or cerr) to use with the given severity //! static std::ostream& severityOstream(Severity severity) { return severity >= Severity::kINFO ? std::cout : std::cerr; } //! //! \brief method that implements logging test results //! static void reportTestResult(const TestAtom& testAtom, TestResult result) { severityOstream(Severity::kINFO) << "&&&& " << testResultString(result) << " " << testAtom.mName << " # " << testAtom.mCmdline << std::endl; } //! //! \brief generate a command line string from the given (argc, argv) values //! static std::string genCmdlineString(int argc, char const* const* argv) { std::stringstream ss; for (int i = 0; i < argc; i++) { if (i > 0) ss << " "; ss << argv[i]; } return ss.str(); } Severity mReportableSeverity; }; namespace { //! //! \brief produces a LogStreamConsumer object that can be used to log messages of severity kVERBOSE //! //! Example usage: //! //! LOG_VERBOSE(logger) << "hello world" << std::endl; //! inline LogStreamConsumer LOG_VERBOSE(const Logger& logger) { return LogStreamConsumer(logger.getReportableSeverity(), Severity::kVERBOSE); } //! //! \brief produces a LogStreamConsumer object that can be used to log messages of severity kINFO //! //! Example usage: //! //! LOG_INFO(logger) << "hello world" << std::endl; //! inline LogStreamConsumer LOG_INFO(const Logger& logger) { return LogStreamConsumer(logger.getReportableSeverity(), Severity::kINFO); } //! //! \brief produces a LogStreamConsumer object that can be used to log messages of severity kWARNING //! //! Example usage: //! //! LOG_WARN(logger) << "hello world" << std::endl; //! 
inline LogStreamConsumer LOG_WARN(const Logger& logger) { return LogStreamConsumer(logger.getReportableSeverity(), Severity::kWARNING); } //! //! \brief produces a LogStreamConsumer object that can be used to log messages of severity kERROR //! //! Example usage: //! //! LOG_ERROR(logger) << "hello world" << std::endl; //! inline LogStreamConsumer LOG_ERROR(const Logger& logger) { return LogStreamConsumer(logger.getReportableSeverity(), Severity::kERROR); } //! //! \brief produces a LogStreamConsumer object that can be used to log messages of severity kINTERNAL_ERROR // ("fatal" severity) //! //! Example usage: //! //! LOG_FATAL(logger) << "hello world" << std::endl; //! inline LogStreamConsumer LOG_FATAL(const Logger& logger) { return LogStreamConsumer(logger.getReportableSeverity(), Severity::kINTERNAL_ERROR); } } // anonymous namespace #endif // TENSORRT_LOGGING_H ================================================ FILE: shufflenetv2/macros.h ================================================ #pragma once #include #ifdef API_EXPORTS #if defined(_MSC_VER) #define API __declspec(dllexport) #else #define API __attribute__((visibility("default"))) #endif #else #if defined(_MSC_VER) #define API __declspec(dllimport) #else #define API #endif #endif // API_EXPORTS #define TRT_VERSION \ ((NV_TENSORRT_MAJOR * 1000) + (NV_TENSORRT_MINOR * 100) + (NV_TENSORRT_PATCH * 10) + NV_TENSORRT_BUILD) #if TRT_VERSION < 7220 #error "TensorRT >= 7.2.2 is required for this demo." 
#endif #if TRT_VERSION >= 8000 #define TRT_NOEXCEPT noexcept #define TRT_CONST_ENQUEUE const #else #define TRT_NOEXCEPT #define TRT_CONST_ENQUEUE #endif ================================================ FILE: shufflenetv2/shufflenetv2.cpp ================================================ #include #include #include #include #include #include #include #include #include "logging.h" #include "utils.h" struct ShuffleNetV2Params { std::array repeat; std::array output_chn; }; /** * @brief choose one below as the model to be built * @param v2_x0_5 * @param v2_x1_0 * @param v2_x1_5 * @param v2_x2_0 */ [[maybe_unused]] static constexpr ShuffleNetV2Params v2_x0_5 = {{4, 8, 4}, {24, 48, 96, 192, 1024}}; [[maybe_unused]] static constexpr ShuffleNetV2Params v2_x1_0 = {{4, 8, 4}, {24, 116, 232, 464, 1024}}; [[maybe_unused]] static constexpr ShuffleNetV2Params v2_x1_5 = {{4, 8, 4}, {24, 176, 352, 704, 1024}}; [[maybe_unused]] static constexpr ShuffleNetV2Params v2_x2_0 = {{4, 8, 4}, {24, 244, 488, 976, 2048}}; constexpr const std::size_t WORKSPACE_SIZE = 16 << 20; // stuff we know about shufflenet-v2 constexpr const int64_t N = 1; constexpr const int32_t INPUT_H = 224; constexpr const int32_t INPUT_W = 224; constexpr const std::array SIZES = {3 * INPUT_H * INPUT_W, 1000}; constexpr const std::array NAMES = {"data", "logits"}; static constexpr const bool TRT_PREPROCESS = TRT_VERSION >= 8510 ? 
true : false; static constexpr const std::array mean = {0.485f, 0.456f, 0.406f}; static constexpr const std::array stdv = {0.229f, 0.224f, 0.225f}; static constexpr const char* WTS_PATH = "../models/shufflenet_v2_x0_5.wts"; static constexpr const char* ENGINE_PATH = "../models/shufflenet.engine"; static constexpr const char* LABELS_PATH = "../assets/imagenet1000_clsidx_to_labels.txt"; using namespace nvinfer1; using WeightMap = std::map; using M = MatrixOperation; using NDCF = nvinfer1::NetworkDefinitionCreationFlag; static Logger gLogger; Dims debug_shape(const ILayer* l) { Dims dims = l->getOutput(0)->getDimensions(); std::cout << l->getOutput(0)->getName() << ":\t["; for (int i = 0; i < dims.nbDims; i++) { std::cout << dims.d[i] << ", "; } std::cout << "]\n"; return dims; } ILayer* addBatchNorm2d(INetworkDefinition* network, WeightMap& weightMap, ITensor& input, const std::string& lname, float eps = 1e-3f) { float* gamma = (float*)weightMap[lname + ".weight"].values; float* beta = (float*)weightMap[lname + ".bias"].values; float* mean = (float*)weightMap[lname + ".running_mean"].values; float* var = (float*)weightMap[lname + ".running_var"].values; auto len = weightMap[lname + ".running_var"].count; std::cout << lname << " running_var len: " << len << "\n"; auto* scval = reinterpret_cast(malloc(sizeof(float) * len)); for (int i = 0; i < len; i++) { scval[i] = gamma[i] / sqrt(var[i] + eps); } Weights scale{DataType::kFLOAT, scval, len}; auto* shval = static_cast(malloc(sizeof(float) * len)); for (int i = 0; i < len; i++) { shval[i] = beta[i] - mean[i] * gamma[i] / sqrt(var[i] + eps); } Weights shift{DataType::kFLOAT, shval, len}; static const Weights power{DataType::kFLOAT, nullptr, 0ll}; weightMap[lname + ".scale"] = scale; weightMap[lname + ".shift"] = shift; weightMap[lname + ".power"] = power; IScaleLayer* scale_1 = network->addScale(input, ScaleMode::kCHANNEL, shift, scale, power); assert(scale_1); return scale_1; } /** * @brief a basic convolution+bn layer 
with an optional relu layer * * @param network network definition * @param m weight map * @param input input tensor * @param lname layer name * @param ch output channels * @param k kernel * @param s stride * @param p padding * @param g groups * @param with_relu true if with relu * @return ILayer* */ ILayer* CBR(INetworkDefinition* network, WeightMap& m, ITensor& input, const std::string& lname, int ch, int k, int s = 1, int p = 0, int g = 1, bool with_relu = true, int start_index = 0) { static const Weights emptywts{DataType::kFLOAT, nullptr, 0ll}; auto conv_name = lname + "." + std::to_string(start_index++); auto* conv = network->addConvolutionNd(input, ch, DimsHW{k, k}, m[conv_name + ".weight"], emptywts); assert(conv); conv->setStrideNd(DimsHW{s, s}); conv->setPaddingNd(DimsHW{p, p}); conv->setNbGroups(g); conv->setName(conv_name.c_str()); auto bn_name = lname + "." + std::to_string(start_index++); auto* bn = addBatchNorm2d(network, m, *conv->getOutput(0), bn_name, 1e-5f); bn->setName((bn_name + ".bn").c_str()); if (with_relu) { auto* relu = network->addActivation(*bn->getOutput(0), ActivationType::kRELU); auto relu_name = lname + "." 
+ std::to_string(start_index) + ".relu"; assert(relu); relu->setName(relu_name.c_str()); return relu; } return bn; } /** * @brief invered residual block * * @param network network definition * @param m weight map * @param input input tensor * @param lname layer name * @param inch input channels * @param outch output channels * @param s stride * @return ILayer* */ ILayer* invertedRes(INetworkDefinition* net, WeightMap& m, ITensor& input, const std::string& lname, int inch, int outch, int s) { if (s < 1 || s > 3) { std::cerr << "stride must be in [1, 3]\n"; std::abort(); } int32_t bf /* branch features */ = outch / 2; ITensor *x1{nullptr}, *x2{nullptr}; if (s == 1) { auto d = input.getDimensions(); Dims4 stride{1, 1, 1, 1}; Dims4 half{d.d[0], d.d[1] / 2, d.d[2], d.d[3]}; auto* s1 = net->addSlice(input, Dims4{0, 0, 0, 0}, half, stride); auto* s2 = net->addSlice(input, Dims4{0, d.d[1] / 2, 0, 0}, half, stride); debug_shape(s2); x1 = s1->getOutput(0); x2 = s2->getOutput(0); } else { if (s > 1) { auto* b1 = CBR(net, m, input, lname + ".branch1", inch, 3, s, 1, inch, false, 0); b1 = CBR(net, m, *b1->getOutput(0), lname + ".branch1", inch, 1, 1, 0, 1, true, 2); x1 = b1->getOutput(0); debug_shape(b1); } else { x1 = &input; } x2 = &input; } auto* b2 = CBR(net, m, *x2, lname + ".branch2", bf, 1, 1, 0, 1, true, 0); b2 = CBR(net, m, *b2->getOutput(0), lname + ".branch2", bf, 3, s, 1, bf, false, 3); b2 = CBR(net, m, *b2->getOutput(0), lname + ".branch2", bf, 1, 1, 0, 1, true, 5); debug_shape(b2); std::array cat_tensors = {x1, b2->getOutput(0)}; auto* cat = net->addConcatenation(cat_tensors.data(), 2); auto cat_name = lname + ".cat"; assert(cat); cat->setName(cat_name.c_str()); cat->setAxis(1); static_cast(debug_shape(cat)); auto* sf1 = net->addShuffle(*cat->getOutput(0)); assert(sf1); sf1->setName((lname + ".shuffle.1").c_str()); auto d = cat->getOutput(0)->getDimensions(); auto dim_sf1 = Dims{5, {d.d[0], 2, d.d[1] / 2, d.d[2], d.d[3]}}; sf1->setReshapeDimensions(dim_sf1); 
sf1->setSecondTranspose({0, 2, 1, 3, 4}); auto* sf2 = net->addShuffle(*sf1->getOutput(0)); assert(sf2); sf2->setName((lname + ".shuffle.2").c_str()); sf2->setReshapeDimensions(d); return sf2; } /** * @brief Create a Engine object * * @param N max batch size * @param runtime runtime * @param builder builder * @param config config * @param dt data type * @param param the type of model to be built * @return ICudaEngine* */ ICudaEngine* createEngine(int32_t N, IRuntime* runtime, IBuilder* builder, IBuilderConfig* config, DataType dt, ShuffleNetV2Params param = v2_x0_5) { WeightMap m = loadWeights(WTS_PATH); #if TRT_VERSION >= 11200 auto flag = 1U << static_cast(NDCF::kSTRONGLY_TYPED); #elif TRT_VERSION >= 10000 auto flag = 0U; #else auto flag = 1U << static_cast(NDCF::kEXPLICIT_BATCH); #endif auto* net = builder->createNetworkV2(flag); int32_t in_ch = 3; ITensor* input{nullptr}; if constexpr (TRT_PREPROCESS) { // for simplicity, resize image on cpu side dt = DataType::kUINT8; input = net->addInput(NAMES[0], dt, Dims4{N, INPUT_H, INPUT_W, in_ch}); auto* trans = addTransformLayer(net, *input, true, mean, stdv); input = trans->getOutput(0); } else { input = net->addInput(NAMES[0], dt, Dims4{N, in_ch, INPUT_H, INPUT_W}); } assert(input); /** conv1 and maxpool */ auto* cbr1 = CBR(net, m, *input, "conv1", param.output_chn[0], 3, 2, 1); auto* pool1 = net->addPoolingNd(*cbr1->getOutput(0), PoolingType::kMAX, DimsHW{3, 3}); assert(pool1); pool1->setStrideNd(DimsHW{2, 2}); pool1->setPaddingNd(DimsHW{1, 1}); debug_shape(pool1); /** stage 2, 3, 4 */ ILayer* _layer = pool1; in_ch = param.output_chn[0]; for (int stage = 2; stage < 5; ++stage) { int32_t out_ch = param.output_chn[stage - 1]; std::string lname = "stage" + std::to_string(stage); std::cout << "================ " << lname << " ================\n"; _layer = invertedRes(net, m, *_layer->getOutput(0), lname + ".0", in_ch, out_ch, 2); debug_shape(_layer); for (int j = 1; j < param.repeat[stage - 2]; ++j) { _layer = 
invertedRes(net, m, *_layer->getOutput(0), lname + "." + std::to_string(j), out_ch, out_ch, 1); } in_ch = out_ch; } /** conv5, mean and fully connected layer */ auto* conv5 = CBR(net, m, *_layer->getOutput(0), "conv5", param.output_chn[4], 1, 1, 0); auto* mean = net->addReduce(*conv5->getOutput(0), ReduceOperation::kAVG, 0xc, false); mean->setName("global_pool(mean)"); auto* fcw = net->addConstant(DimsHW{1000, 1024}, m["fc.weight"]); auto* fcb = net->addConstant(DimsHW{1, 1000}, m["fc.bias"]); auto* _fc = net->addMatrixMultiply(*mean->getOutput(0), M::kNONE, *fcw->getOutput(0), M::kTRANSPOSE); auto* fc = net->addElementWise(*_fc->getOutput(0), *fcb->getOutput(0), ElementWiseOperation::kSUM); fc->getOutput(0)->setName(NAMES[1]); debug_shape(fc); net->markOutput(*fc->getOutput(0)); #if TRT_VERSION >= 8000 config->setMemoryPoolLimit(MemoryPoolType::kWORKSPACE, WORKSPACE_SIZE); IHostMemory* mem = builder->buildSerializedNetwork(*net, *config); ICudaEngine* engine = runtime->deserializeCudaEngine(mem->data(), mem->size()); delete net; #else builder->setMaxBatchSize(N); config->setMaxWorkspaceSize(WORKSPACE_SIZE); ICudaEngine* engine = builder->buildEngineWithConfig(*net, *config); net->destroy(); #endif std::cout << "build finished\n"; // Release host memory for (auto& mem : m) { free((void*)(mem.second.values)); } return engine; } void APIToModel(int32_t N, IRuntime* runtime, IHostMemory** modelStream) { // Create builder IBuilder* builder = createInferBuilder(gLogger); IBuilderConfig* config = builder->createBuilderConfig(); // Create model to populate the network, then set the outputs and create an engine ICudaEngine* engine = createEngine(N, runtime, builder, config, DataType::kFLOAT); assert(engine != nullptr); // Serialize the engine (*modelStream) = engine->serialize(); #if TRT_VERSION >= 8000 delete engine; delete config; delete builder; #else engine->destroy(); config->destroy(); builder->destroy(); #endif } auto doInference(IExecutionContext& context, void* 
input, int64_t batchSize) -> std::vector> { ICudaEngine const& engine = context.getEngine(); cudaStream_t stream; CHECK(cudaStreamCreate(&stream)); std::vector buffers; #if TRT_VERSION >= 8000 const int32_t nIO = engine.getNbIOTensors(); #else const int32_t nIO = engine.getNbBindings(); #endif buffers.resize(nIO); for (auto i = 0; i < nIO; ++i) { std::size_t size = 0; #if TRT_VERSION >= 8000 auto* tensor_name = engine.getIOTensorName(i); auto s = getSize(engine.getTensorDataType(tensor_name)); size = s * batchSize * SIZES[i]; CHECK(cudaMalloc(&buffers[i], size)); if (i == 0) { CHECK(cudaMemcpyAsync(buffers[i], input, size, cudaMemcpyHostToDevice, stream)); } context.setTensorAddress(tensor_name, buffers[i]); #else const int32_t idx = engine.getBindingIndex(NAMES[i]); auto s = getSize(engine.getBindingDataType(idx)); assert(idx == i); size = s * batchSize * SIZES[i]; CHECK(cudaMalloc(&buffers[i], size)); if (i == 0) { CHECK(cudaMemcpyAsync(buffers[i], input, size, cudaMemcpyHostToDevice, stream)); } #endif } #if TRT_VERSION >= 8000 assert(context.enqueueV3(stream)); #else assert(context.enqueueV2(buffers.data(), stream, nullptr)); #endif std::vector> prob; for (int i = 1; i < nIO; ++i) { std::vector tmp(batchSize * SIZES[i], std::nanf("")); std::size_t size = batchSize * SIZES[i] * sizeof(float); CHECK(cudaMemcpyAsync(tmp.data(), buffers[i], size, cudaMemcpyDeviceToHost, stream)); prob.emplace_back(tmp); } CHECK(cudaStreamSynchronize(stream)); // Release stream and buffers CHECK(cudaStreamDestroy(stream)); for (auto& buffer : buffers) { CHECK(cudaFree(buffer)); } return prob; } int main(int argc, char** argv) { checkTrtEnv(); if (argc != 2) { std::cerr << "arguments not right!\n"; std::cerr << "./shufflenet -s // serialize model to plan file\n"; std::cerr << "./shufflenet -d // deserialize plan file and run inference\n"; return -1; } // create a model using the API directly and serialize it to a stream IRuntime* runtime = createInferRuntime(gLogger); assert(runtime 
!= nullptr); char* trtModelStream{nullptr}; std::streamsize size{0}; if (std::string(argv[1]) == "-s") { IHostMemory* modelStream{nullptr}; APIToModel(1, runtime, &modelStream); assert(modelStream != nullptr); std::ofstream p(ENGINE_PATH, std::ios::binary | std::ios::trunc); if (!p) { std::cerr << "could not open plan output file\n"; return -1; } if (modelStream->size() > static_cast(std::numeric_limits::max())) { std::cerr << "this model is too large to serialize\n"; return -1; } const auto* data_ptr = reinterpret_cast(modelStream->data()); auto data_size = static_cast(modelStream->size()); p.write(data_ptr, data_size); #if TRT_VERSION >= 8000 delete modelStream; #else modelStream->destroy(); #endif return 0; } else if (std::string(argv[1]) == "-d") { std::ifstream file(ENGINE_PATH, std::ios::binary); if (file.good()) { file.seekg(0, file.end); size = file.tellg(); file.seekg(0, file.beg); trtModelStream = new char[size]; assert(trtModelStream); file.read(trtModelStream, size); file.close(); } } else { return -1; } #if TRT_VERSION >= 8000 ICudaEngine* engine = runtime->deserializeCudaEngine(trtModelStream, size); #else ICudaEngine* engine = runtime->deserializeCudaEngine(trtModelStream, size, nullptr); #endif assert(engine != nullptr); IExecutionContext* context = engine->createExecutionContext(); assert(context != nullptr); delete[] trtModelStream; // Run inference void* input = nullptr; std::vector flat_img; cv::Mat img; if constexpr (TRT_PREPROCESS) { // for simplicity, resize image on cpu side img = cv::imread("../assets/cats.jpg", cv::IMREAD_COLOR); cv::resize(img, img, cv::Size(INPUT_W, INPUT_H), 0, 0, cv::INTER_LINEAR); input = static_cast(img.data); } else { img = cv::imread("../assets/cats.jpg", cv::IMREAD_COLOR); flat_img = preprocess_img(img, true, mean, stdv, N, INPUT_H, INPUT_W); input = flat_img.data(); } for (int i = 0; i < 100; ++i) { auto start = std::chrono::system_clock::now(); auto prob = doInference(*context, input, N); auto end = 
std::chrono::system_clock::now(); auto period = std::chrono::duration_cast(end - start); std::cout << period.count() << "us\n"; for (auto& vector : prob) { int idx = 0; for (auto& v : vector) { std::cout << std::setprecision(4) << v << ", " << std::flush; if (++idx > 20) { std::cout << "\n====\n"; break; } } } if (i == 99) { std::cout << "prediction result:\n"; auto labels = loadImagenetLabelMap(LABELS_PATH); int _top = 0; for (auto& [idx, logits] : topk(prob[0], 3)) { std::cout << "Top: " << _top++ << " idx: " << idx << ", logits: " << logits << ", label: " << labels[idx] << "\n"; } } } #if TRT_VERSION >= 8000 delete context; delete engine; delete runtime; #else context->destroy(); engine->destroy(); runtime->destroy(); #endif return 0; } ================================================ FILE: shufflenetv2/utils.h ================================================ #pragma once #include #include #include #include #include #include #include #include #include #include #include "macros.h" using namespace nvinfer1; #define CHECK(status) \ do { \ auto ret = (status); \ if (ret != cudaSuccess) { \ std::cerr << "Cuda failure: " << ret << "\n"; \ std::abort(); \ } \ } while (0) static void checkTrtEnv(int device = 0) { #if TRT_VERSION < 8000 CHECK(cudaGetDevice(&device)); cudaDeviceProp prop{}; CHECK(cudaGetDeviceProperties(&prop, device)); const int sm = prop.major * 10 + prop.minor; if (sm > 86) { std::cerr << "TensorRT < 8 does not support SM > 86 on this GPU."; std::abort(); } #endif } /** * @brief TensorRT weight files have a simple space delimited format: * [type] [size] * * @param file input weight file path * @return std::map */ static std::map loadWeights(const std::string& file) { std::cout << "Loading weights: " << file << "\n"; std::map weightMap; // Open weights file std::ifstream input(file); assert(input.is_open() && "Unable to load weight file."); // Read number of weight blobs int32_t count; input >> count; assert(count > 0 && "Invalid weight map file."); 
/**
 * @brief select the k largest values of a vector together with their indices
 *
 * @param v input values
 * @param k number of entries requested; clamped to v.size()
 * @return std::vector<std::pair<int, float>> (index, value) pairs sorted by descending value,
 * empty when k <= 0 or v is empty
 */
static auto topk(const std::vector<float>& v, int k) -> std::vector<std::pair<int, float>> {
    if (k <= 0)
        return {};

    const auto stride = std::min(k, static_cast<int>(v.size()));
    std::vector<int> idx(v.size());
    std::iota(idx.begin(), idx.end(), 0);
    // sort only the first `stride` positions; using the unclamped `k` here would form an iterator
    // past idx.end() whenever k > v.size()
    std::partial_sort(idx.begin(), idx.begin() + stride, idx.end(), [&](int a, int b) { return v[a] > v[b]; });

    std::vector<std::pair<int, float>> out;
    out.reserve(stride);
    for (int i = 0; i < stride; ++i)
        out.emplace_back(idx[i], v[idx[i]]);
    return out;
}
ITensor* data{nullptr}; if (bgr2rgb) { auto add_slice = [&](int c, const char* name) -> ITensor* { auto dims = perm->getOutput(0)->getDimensions(); Dims4 start = {0, c, 0, 0}, stride = {1, 1, 1, 1}; Dims4 size = {dims.d[0], 1, dims.d[2], dims.d[3]}; auto* _slice = network->addSlice(*perm->getOutput(0), start, size, stride); _slice->setName(name); assert(_slice && _slice->getNbOutputs() == 1); return _slice->getOutput(0); }; std::array channels = {add_slice(2, "R"), add_slice(1, "G"), add_slice(0, "B")}; auto* cat = network->addConcatenation(channels.data(), 3); assert(cat); cat->setName("RGB"); cat->setAxis(1); data = cat->getOutput(0); } else { data = perm->getOutput(0); } // Normalize auto* trans = network->addScale(*data, ScaleMode::kCHANNEL, shift, scale, empty); assert(trans); trans->setName("mean & std"); #if TRT_VERSION >= 8000 trans->setChannelAxis(1); #endif return trans; } static size_t getSize(DataType dt) { switch (dt) { #if TRT_VERSION >= 8510 case DataType::kUINT8: #endif case DataType::kINT8: return sizeof(int8_t); case DataType::kFLOAT: return sizeof(float); case DataType::kHALF: return sizeof(int16_t); case DataType::kINT32: return sizeof(int32_t); default: { std::cerr << "Unsupported data type\n"; std::abort(); } } } ================================================ FILE: squeezenet/CMakeLists.txt ================================================ cmake_minimum_required(VERSION 3.14) project( squeezenet VERSION 0.1 LANGUAGES C CXX CUDA) if(NOT DEFINED CMAKE_CUDA_ARCHITECTURES) set(CMAKE_CUDA_ARCHITECTURES 60 70 72 75 80 86 89) endif() set(CMAKE_CXX_STANDARD 17) set(CMAKE_CXX_STANDARD_REQUIRED ON) set(CMAKE_CUDA_STANDARD 17) set(CMAKE_CUDA_STANDARD_REQUIRED ON) set(CMAKE_EXPORT_COMPILE_COMMANDS ON) set(CMAKE_INCLUDE_CURRENT_DIR TRUE) option(CUDA_USE_STATIC_CUDA_RUNTIME "Use static cudaruntime library" OFF) find_package(Threads REQUIRED) find_package(CUDAToolkit REQUIRED) find_package(OpenCV REQUIRED) if(NOT TARGET TensorRT::TensorRT) 
cmake_minimum_required(VERSION 3.17.0)

# TensorRT version string; taken from the environment on first configure and cached afterwards
set(TRT_VERSION
    $ENV{TRT_VERSION}
    CACHE STRING "TensorRT version, e.g. \"8.6.1.6\" or \"8.6.1.6+cuda12.0.1.011\", etc")

# _guess_path(<var_name> <required_files> <candidate>...)
#
# Stores in <var_name> (parent scope) every candidate directory that exists and contains all of
# <required_files>. Fails the configure step when no candidate qualifies.
function(_guess_path var_name required_files)
  set(_result "")

  foreach(path_entry IN LISTS ARGN)
    if(NOT EXISTS "${path_entry}")
      message(DEBUG "skip non-existing path '${path_entry}'")
      continue()
    endif()

    set(_ok TRUE)
    foreach(required_file IN LISTS required_files)
      if(NOT EXISTS "${path_entry}/${required_file}")
        set(_ok FALSE)
        message(DEBUG "'${path_entry}' missing '${required_file}'")
        break()
      endif()
    endforeach()

    if(_ok)
      list(APPEND _result "${path_entry}")
      message(DEBUG "accept '${path_entry}'")
    else()
      message(DEBUG "reject '${path_entry}'")
    endif()
  endforeach()

  if(_result STREQUAL "")
    message(
      FATAL_ERROR
        "_guess_path(${var_name}) failed: no valid path found. required_files='${required_files}' candidates='${ARGN}'"
    )
  endif()

  set(${var_name}
      "${_result}"
      PARENT_SCOPE)
endfunction()

# find TensorRT include folder
if(NOT DEFINED TensorRT_INCLUDE_DIR)
  if(CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64")
    _guess_path(
      TensorRT_INCLUDE_DIR "NvInfer.h" "/usr/include/aarch64-linux-gnu" "/usr/include"
      "/usr/local/cuda/targets/aarch64-linux/include")
  else()
    _guess_path(
      TensorRT_INCLUDE_DIR "NvInfer.h" "/usr/local/tensorrt/targets/x86_64-linux-gnu/include"
      "/usr/include/x86_64-linux-gnu" "/usr/include")
  endif()
  message(STATUS "TensorRT includes: ${TensorRT_INCLUDE_DIR}")
endif()

# find TensorRT library folder
if(NOT TensorRT_LIBRARY_DIR)
  if(CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64")
    _guess_path(
      TensorRT_LIBRARY_DIR "libnvinfer.so;libnvinfer_plugin.so"
      "/usr/lib/aarch64-linux-gnu;/usr/lib/aarch64-linux-gnu/tegra" "/usr/lib")
  else()
    _guess_path(
      TensorRT_LIBRARY_DIR
      "libnvinfer.so;libnvinfer_plugin.so"
      "/usr/lib/x86_64-linux-gnu;/usr/local/tensorrt/targets/x86_64-linux-gnu/lib;/usr/lib"
    )
  endif()
  message(STATUS "TensorRT libraries: ${TensorRT_LIBRARY_DIR}")
endif()

set(TensorRT_LIBRARIES)

# process for different TensorRT version
if(DEFINED TRT_VERSION AND NOT TRT_VERSION STREQUAL "")
  # the first run of digits in the version string is the major version
  string(REGEX MATCH "([0-9]+)" _match ${TRT_VERSION})
  set(TRT_MAJOR_VERSION "${_match}")
  set(_modules nvinfer nvinfer_plugin)
  unset(_match)

  if(TRT_MAJOR_VERSION GREATER_EQUAL 8)
    list(APPEND _modules nvinfer_vc_plugin nvinfer_dispatch nvinfer_lean)
  endif()
else()
  message(FATAL_ERROR "Please set a environment variable \"TRT_VERSION\"")
endif()

# find and add all modules of TensorRT into list
foreach(lib IN LISTS _modules)
  find_library(
    TensorRT_${lib}_LIBRARY
    NAMES ${lib}
    HINTS ${TensorRT_LIBRARY_DIR})

  list(APPEND TensorRT_LIBRARIES ${TensorRT_${lib}_LIBRARY})
endforeach()

# report only after the list has been populated; printing right after the empty set() above
# always showed an empty value
message(STATUS "Found TensorRT lib: ${TensorRT_LIBRARIES}")

# make the "TensorRT target"
add_library(TensorRT IMPORTED INTERFACE)
add_library(TensorRT::TensorRT ALIAS TensorRT)
target_link_libraries(TensorRT INTERFACE ${TensorRT_LIBRARIES})

set_target_properties(
  TensorRT
  PROPERTIES C_STANDARD 17
             CXX_STANDARD 17
             POSITION_INDEPENDENT_CODE ON
             SKIP_BUILD_RPATH TRUE
             BUILD_WITH_INSTALL_RPATH TRUE
             INSTALL_RPATH "$ORIGIN"
             INTERFACE_INCLUDE_DIRECTORIES "${TensorRT_INCLUDE_DIR}")

unset(TRT_MAJOR_VERSION)
unset(_modules)
def read_imagenet_labels(
    path: str = "../assets/imagenet1000_clsidx_to_labels.txt",
) -> dict[int, str]:
    """
    read ImageNet 1000 labels

    Each useful line has the form ``<class id>: '<label>',``. Lines without a
    colon (e.g. blank lines) are skipped. The label is taken as the text
    between the surrounding quotes, so the result does not depend on the exact
    number of trailing characters (comma, newline, closing brace).

    Args:
        path (str): label file to read, defaults to the file shipped in assets

    Returns:
        dict[int, str]: labels dict; the first occurrence of a class id wins
    """
    clsid2label: dict[int, str] = {}
    with open(path, "r", encoding="utf-8") as f:
        for line in f:
            key, sep, value = line.partition(":")
            if not sep:
                continue
            # drop trailing separators first, then one pair of matching quotes
            value = value.strip().rstrip(",}").strip()
            if len(value) >= 2 and value[0] == value[-1] and value[0] in "'\"":
                value = value[1:-1]
            clsid2label.setdefault(int(key.strip().lstrip("{")), value)
    return clsid2label
def main():
    """
    run the pretrained SqueezeNet 1.1 on the sample image, print the top-3
    predictions and dump all weights to `../models/squeezenet.wts`

    The weight file starts with the number of tensors, followed by one line per
    tensor: name, element count and every element as big-endian float32 hex.
    """
    labels = read_imagenet_labels()
    # NOTE(review): newer torchvision releases prefer the `weights=` argument
    # over `pretrained=True` - confirm against the pinned torchvision version
    model = torchvision.models.squeezenet1_1(pretrained=True)
    model = model.eval()

    img = cv2.imread("../assets/cats.jpg", cv2.IMREAD_COLOR)
    if img is None:
        # cv2.imread reports failure by returning None instead of raising
        raise FileNotFoundError("could not read image: ../assets/cats.jpg")
    img = preprocess(img)

    with torch.inference_mode():
        output = model(img)

    for i, batch in enumerate(torch.topk(output, k=3).indices):
        for j, idx in enumerate(batch):
            print(f"\tBatch: {i}, Top: {j}, logits: {output[i][idx]:.4f}, label: {labels[int(idx)]}")
        print(f"{'=' * 32}")

    # build the state dict once instead of once per use
    state_dict = model.state_dict()
    with open("../models/squeezenet.wts", "w") as f:
        f.write(f"{len(state_dict)}\n")
        for k, v in state_dict.items():
            vr = v.reshape(-1).cpu().numpy()
            f.write(f"{k} {len(vr)} ")
            print(k, v.shape)
            # every value is preceded by a single space, as in the original layout
            f.write("".join(f" {struct.pack('>f', float(vv)).hex()}" for vv in vr))
            f.write("\n")
//!
//! \class LogStreamConsumerBuffer
//! \brief String buffer that forwards its content, prefixed with a timestamp and a severity prefix,
//!        to an output stream every time it is synchronized (flushed) or destroyed.
//!
class LogStreamConsumerBuffer : public std::stringbuf {
   public:
    //! \param stream    stream that receives the formatted messages
    //! \param prefix    text inserted between the timestamp and the message
    //! \param shouldLog when false, buffered content is dropped instead of written
    LogStreamConsumerBuffer(std::ostream& stream, const std::string& prefix, bool shouldLog)
        : mOutput(stream), mPrefix(prefix), mShouldLog(shouldLog) {}

    //! Takes over the target stream, prefix and log flag of \p other. Copying only the stream
    //! reference would leave mShouldLog uninitialized and the prefix empty. Content still
    //! buffered in \p other is not transferred.
    LogStreamConsumerBuffer(LogStreamConsumerBuffer&& other)
        : mOutput(other.mOutput), mPrefix(other.mPrefix), mShouldLog(other.mShouldLog) {}

    ~LogStreamConsumerBuffer() {
        // std::streambuf::pbase() gives a pointer to the beginning of the buffered part of the output sequence
        // std::streambuf::pptr() gives a pointer to the current position of the output sequence
        // if the pointer to the beginning is not equal to the pointer to the current position,
        // call putOutput() to log the output to the stream
        if (pbase() != pptr()) {
            putOutput();
        }
    }

    // synchronizes the stream buffer and returns 0 on success
    // synchronizing the stream buffer consists of inserting the buffer contents into the stream,
    // resetting the buffer and flushing the stream
    int sync() override {
        putOutput();
        return 0;
    }

    //! Writes "[MM/DD/YYYY-HH:MM:SS] <prefix><buffer>" to the target stream, then clears the
    //! buffer and flushes the stream. Does nothing when logging is disabled.
    void putOutput() {
        if (mShouldLog) {
            // prepend timestamp; it goes to the same stream as the message so both stay together
            // when the target is not std::cout
            std::time_t timestamp = std::time(nullptr);
            std::tm* tm_local = std::localtime(&timestamp);
            const char previousFill = mOutput.fill('0');
            mOutput << "[";
            mOutput << std::setw(2) << 1 + tm_local->tm_mon << "/";
            mOutput << std::setw(2) << tm_local->tm_mday << "/";
            mOutput << std::setw(4) << 1900 + tm_local->tm_year << "-";
            mOutput << std::setw(2) << tm_local->tm_hour << ":";
            mOutput << std::setw(2) << tm_local->tm_min << ":";
            mOutput << std::setw(2) << tm_local->tm_sec << "] ";
            // the fill character is sticky stream state; put back what the caller had
            mOutput.fill(previousFill);
            // std::stringbuf::str() gets the string contents of the buffer
            // insert the buffer contents pre-appended by the appropriate prefix into the stream
            mOutput << mPrefix << str();
            // set the buffer to empty
            str("");
            // flush the stream
            mOutput.flush();
        }
    }

    void setShouldLog(bool shouldLog) { mShouldLog = shouldLog; }

   private:
    std::ostream& mOutput;
    std::string mPrefix;
    bool mShouldLog;
};
//! \class Logger
//!
//! \brief Class which manages logging of TensorRT tools and samples
//!
//! \details This class provides a common interface for TensorRT tools and samples to log information to the console,
//! and supports logging two types of messages:
//!
//! - Debugging messages with an associated severity (info, warning, error, or internal error/fatal)
//! - Test pass/fail messages
//!
//! The advantage of having all samples use this class for logging as opposed to emitting directly to stdout/stderr is
//! that the logic for controlling the verbosity and formatting of sample output is centralized in one location.
//!
//! In the future, this class could be extended to support dumping test results to a file in some standard format
//! (for example, JUnit XML), and providing additional metadata (e.g. timing the duration of a test run).
//!
//! TODO: For backwards compatibility with existing samples, this class inherits directly from the nvinfer1::ILogger
//! interface, which is problematic since there isn't a clean separation between messages coming from the TensorRT
//! library and messages coming from the sample.
//!
//! In the future (once all samples are updated to use Logger::getTRTLogger() to access the ILogger) we can refactor the
//! class to eliminate the inheritance and instead make the nvinfer1::ILogger implementation a member of the Logger
//! object.
class Logger : public nvinfer1::ILogger {
   public:
    //! \param severity initial reportable severity, kWARNING when omitted
    Logger(Severity severity = Severity::kWARNING) : mReportableSeverity(severity) {}

    //!
    //! \enum TestResult
    //! \brief Represents the state of a given test
    //!
    enum class TestResult {
        kRUNNING,  //!< The test is running
        kPASSED,   //!< The test passed
        kFAILED,   //!< The test failed
        kWAIVED    //!< The test was waived
    };

    //!
    //! \brief Forward-compatible method for retrieving the nvinfer::ILogger associated with this Logger
    //! \return The nvinfer1::ILogger associated with this Logger
    //!
    //! TODO Once all samples are updated to use this method to register the logger with TensorRT,
    //! we can eliminate the inheritance of Logger from ILogger
    //!
    nvinfer1::ILogger& getTRTLogger() { return *this; }

    //!
    //! \brief Implementation of the nvinfer1::ILogger::log() virtual method
    //!
    //! Note samples should not be calling this function directly; it will eventually go away once we eliminate the
    //! inheritance from nvinfer1::ILogger
    //!
    void log(Severity severity, const char* msg) TRT_NOEXCEPT override {
        // A temporary LogStreamConsumer is built for every message; it decides from the two
        // severities whether anything is written. msg is converted to std::string unchecked.
        LogStreamConsumer(mReportableSeverity, severity) << "[TRT] " << std::string(msg) << std::endl;
    }

    //!
    //! \brief Method for controlling the verbosity of logging output
    //!
    //! \param severity The logger will only emit messages that have severity of this level or higher.
    //!
    void setReportableSeverity(Severity severity) { mReportableSeverity = severity; }

    //!
    //! \brief Opaque handle that holds logging information for a particular test
    //!
    //! This object is an opaque handle to information used by the Logger to print test results.
    //! The sample must call Logger::defineTest() in order to obtain a TestAtom that can be used
    //! with Logger::reportTest{Start,End}().
    //!
    class TestAtom {
       public:
        TestAtom(TestAtom&&) = default;

       private:
        // only Logger may construct a TestAtom or read its fields
        friend class Logger;

        TestAtom(bool started, const std::string& name, const std::string& cmdline)
            : mStarted(started), mName(name), mCmdline(cmdline) {}

        bool mStarted;         // set to true by Logger::reportTestStart()
        std::string mName;     // test name printed in the result line
        std::string mCmdline;  // command line printed after the name
    };

    //!
    //! \brief Define a test for logging
    //!
    //! \param[in] name The name of the test.  This should be a string starting with
    //!                  "TensorRT" and containing dot-separated strings containing
    //!                  the characters [A-Za-z0-9_].
    //!                  For example, "TensorRT.sample_googlenet"
    //! \param[in] cmdline The command line used to reproduce the test
    //
    //! \return a TestAtom that can be used in Logger::reportTest{Start,End}().
    //!
    static TestAtom defineTest(const std::string& name, const std::string& cmdline) {
        return TestAtom(false, name, cmdline);
    }

    //!
    //! \brief A convenience overloaded version of defineTest() that accepts an array of command-line arguments
    //!        as input
    //!
    //! \param[in] name The name of the test
    //! \param[in] argc The number of command-line arguments
    //! \param[in] argv The array of command-line arguments (given as C strings)
    //!
    //! \return a TestAtom that can be used in Logger::reportTest{Start,End}().
    static TestAtom defineTest(const std::string& name, int argc, char const* const* argv) {
        auto cmdline = genCmdlineString(argc, argv);
        return defineTest(name, cmdline);
    }

    //!
    //! \brief Report that a test has started.
    //!
    //! \pre reportTestStart() has not been called yet for the given testAtom
    //!
    //! \param[in] testAtom The handle to the test that has started
    //!
    static void reportTestStart(TestAtom& testAtom) {
        // the RUNNING line is printed first; the precondition is checked afterwards and only in
        // builds where assert() is active
        reportTestResult(testAtom, TestResult::kRUNNING);
        assert(!testAtom.mStarted);
        testAtom.mStarted = true;
    }

    //!
    //! \brief Report that a test has ended.
    //!
    //! \pre reportTestStart() has been called for the given testAtom
    //!
    //! \param[in] testAtom The handle to the test that has ended
    //! \param[in] result The result of the test. Should be one of TestResult::kPASSED,
    //!                   TestResult::kFAILED, TestResult::kWAIVED
    //!
    static void reportTestEnd(const TestAtom& testAtom, TestResult result) {
        assert(result != TestResult::kRUNNING);
        assert(testAtom.mStarted);
        reportTestResult(testAtom, result);
    }

    //! \brief Report the test as passed and return EXIT_SUCCESS
    static int reportPass(const TestAtom& testAtom) {
        reportTestEnd(testAtom, TestResult::kPASSED);
        return EXIT_SUCCESS;
    }

    //! \brief Report the test as failed and return EXIT_FAILURE
    static int reportFail(const TestAtom& testAtom) {
        reportTestEnd(testAtom, TestResult::kFAILED);
        return EXIT_FAILURE;
    }

    //! \brief Report the test as waived and return EXIT_SUCCESS
    static int reportWaive(const TestAtom& testAtom) {
        reportTestEnd(testAtom, TestResult::kWAIVED);
        return EXIT_SUCCESS;
    }

    //! \brief Report pass or fail depending on \p pass and return the matching exit code
    static int reportTest(const TestAtom& testAtom, bool pass) {
        return pass ? reportPass(testAtom) : reportFail(testAtom);
    }

    Severity getReportableSeverity() const { return mReportableSeverity; }

   private:
    //!
    //! \brief returns an appropriate string for prefixing a log message with the given severity
    //!
    // NOTE(review): not called by any member shown in this class - confirm before removing
    static const char* severityPrefix(Severity severity) {
        switch (severity) {
            case Severity::kINTERNAL_ERROR:
                return "[F] ";
            case Severity::kERROR:
                return "[E] ";
            case Severity::kWARNING:
                return "[W] ";
            case Severity::kINFO:
                return "[I] ";
            case Severity::kVERBOSE:
                return "[V] ";
            default:
                assert(0);
                return "";
        }
    }

    //!
    //! \brief returns an appropriate string for prefixing a test result message with the given result
    //!
    static const char* testResultString(TestResult result) {
        switch (result) {
            case TestResult::kRUNNING:
                return "RUNNING";
            case TestResult::kPASSED:
                return "PASSED";
            case TestResult::kFAILED:
                return "FAILED";
            case TestResult::kWAIVED:
                return "WAIVED";
            default:
                assert(0);
                return "";
        }
    }

    //!
    //! \brief returns an appropriate output stream (cout or cerr) to use with the given severity
    //!
    static std::ostream& severityOstream(Severity severity) {
        return severity >= Severity::kINFO ? std::cout : std::cerr;
    }

    //!
    //! \brief method that implements logging test results
    //!
    static void reportTestResult(const TestAtom& testAtom, TestResult result) {
        // result lines always use the kINFO stream and have the form
        // "&&&& <RESULT> <name> # <cmdline>"
        severityOstream(Severity::kINFO) << "&&&& " << testResultString(result) << " " << testAtom.mName << " # "
                                         << testAtom.mCmdline << std::endl;
    }

    //!
    //! \brief generate a command line string from the given (argc, argv) values
    //!
    static std::string genCmdlineString(int argc, char const* const* argv) {
        std::stringstream ss;
        for (int i = 0; i < argc; i++) {
            if (i > 0)
                ss << " ";
            ss << argv[i];
        }
        return ss.str();
    }

    Severity mReportableSeverity;
};
inline LogStreamConsumer LOG_WARN(const Logger& logger) { return LogStreamConsumer(logger.getReportableSeverity(), Severity::kWARNING); } //! //! \brief produces a LogStreamConsumer object that can be used to log messages of severity kERROR //! //! Example usage: //! //! LOG_ERROR(logger) << "hello world" << std::endl; //! inline LogStreamConsumer LOG_ERROR(const Logger& logger) { return LogStreamConsumer(logger.getReportableSeverity(), Severity::kERROR); } //! //! \brief produces a LogStreamConsumer object that can be used to log messages of severity kINTERNAL_ERROR // ("fatal" severity) //! //! Example usage: //! //! LOG_FATAL(logger) << "hello world" << std::endl; //! inline LogStreamConsumer LOG_FATAL(const Logger& logger) { return LogStreamConsumer(logger.getReportableSeverity(), Severity::kINTERNAL_ERROR); } } // anonymous namespace #endif // TENSORRT_LOGGING_H ================================================ FILE: squeezenet/macros.h ================================================ #pragma once #ifdef API_EXPORTS #if defined(_MSC_VER) #define API __declspec(dllexport) #else #define API __attribute__((visibility("default"))) #endif #else #if defined(_MSC_VER) #define API __declspec(dllimport) #else #define API #endif #endif // API_EXPORTS #define TRT_VERSION \ ((NV_TENSORRT_MAJOR * 1000) + (NV_TENSORRT_MINOR * 100) + (NV_TENSORRT_PATCH * 10) + NV_TENSORRT_BUILD) #if TRT_VERSION >= 8000 #define TRT_NOEXCEPT noexcept #define TRT_CONST_ENQUEUE const #else #define TRT_NOEXCEPT #define TRT_CONST_ENQUEUE #endif ================================================ FILE: squeezenet/squeezenet.cpp ================================================ #include #include #include #include #include #include #include #include #include "logging.h" #include "utils.h" // stuff we know about squeezenet static constexpr const int N = 1; static constexpr const int INPUT_H = 224; static constexpr const int INPUT_W = 224; static constexpr const int SIZES[] = {3 * INPUT_H * INPUT_W, N * 
1000}; static constexpr const char* NAMES[] = {"data", "prob"}; static constexpr const bool TRT_PREPROCESS = TRT_VERSION >= 8510 ? true : false; static constexpr const float mean[3] = {0.485f, 0.456f, 0.406f}; static constexpr const float stdv[3] = {0.229f, 0.224f, 0.225f}; static constexpr const char* WTS_PATH = "../models/squeezenet.wts"; static constexpr const char* ENGINE_PATH = "../models/squeezenet.engine"; static constexpr const char* LABELS_PATH = "../assets/imagenet1000_clsidx_to_labels.txt"; using namespace nvinfer1; using WeightMap = std::map; static Logger gLogger; ILayer* fire(INetworkDefinition* network, WeightMap& m, ITensor& input, const std::string& lname, int32_t squeeze_planes, int32_t e1x1_planes, int32_t e3x3_planes) { auto* conv1 = network->addConvolutionNd(input, squeeze_planes, DimsHW{1, 1}, m[lname + "squeeze.weight"], m[lname + "squeeze.bias"]); assert(conv1); auto* relu1 = network->addActivation(*conv1->getOutput(0), ActivationType::kRELU)->getOutput(0); std::string _c = lname + "expand1x1"; auto* conv2 = network->addConvolutionNd(*relu1, e1x1_planes, DimsHW{1, 1}, m[_c + ".weight"], m[_c + ".bias"]); assert(conv2); auto* relu2 = network->addActivation(*conv2->getOutput(0), ActivationType::kRELU); assert(relu2); _c = lname + "expand3x3"; auto* conv3 = network->addConvolutionNd(*relu1, e3x3_planes, DimsHW{3, 3}, m[_c + ".weight"], m[_c + ".bias"]); assert(conv3); conv3->setPaddingNd(DimsHW{1, 1}); auto* relu3 = network->addActivation(*conv3->getOutput(0), ActivationType::kRELU); assert(relu3); ITensor* inputTensors[] = {relu2->getOutput(0), relu3->getOutput(0)}; auto* concat = network->addConcatenation(inputTensors, 2); assert(concat); return concat; } // Creat the engine using only the API and not any parser. 
ICudaEngine* createEngine(int32_t N, IRuntime* runtime, IBuilder* builder, IBuilderConfig* config, DataType dt) { auto weightMap = loadWeights(WTS_PATH); #if TRT_VERSION >= 10000 auto* network = builder->createNetworkV2(0); #else auto* network = builder->createNetworkV2(1u << static_cast(NetworkDefinitionCreationFlag::kEXPLICIT_BATCH)); #endif ITensor* data{nullptr}; if constexpr (TRT_PREPROCESS) { #if TRT_VERSION > 8510 dt = DataType::kUINT8; #else dt = DataType::kINT8; #endif data = network->addInput(NAMES[0], dt, Dims4{N, INPUT_H, INPUT_W, 3}); auto* trans = addTransformLayer(network, *data, true, mean, stdv); data = trans->getOutput(0); } else { data = network->addInput(NAMES[0], dt, Dims4{N, 3, INPUT_H, INPUT_W}); } assert(data); auto* conv1 = network->addConvolutionNd(*data, 64, DimsHW{3, 3}, weightMap["features.0.weight"], weightMap["features.0.bias"]); assert(conv1); conv1->setStrideNd(DimsHW{2, 2}); auto* relu1 = network->addActivation(*conv1->getOutput(0), ActivationType::kRELU); assert(relu1); auto* pool1 = network->addPoolingNd(*relu1->getOutput(0), PoolingType::kMAX, DimsHW{3, 3}); assert(pool1); pool1->setStrideNd(DimsHW{2, 2}); pool1->setPaddingMode(PaddingMode::kEXPLICIT_ROUND_UP); auto* cat1 = fire(network, weightMap, *pool1->getOutput(0), "features.3.", 16, 64, 64); cat1 = fire(network, weightMap, *cat1->getOutput(0), "features.4.", 16, 64, 64); auto* pool2 = network->addPoolingNd(*cat1->getOutput(0), PoolingType::kMAX, DimsHW{3, 3}); assert(pool2); pool2->setStrideNd(DimsHW{2, 2}); pool2->setPaddingMode(PaddingMode::kEXPLICIT_ROUND_UP); // pool2->setPostPadding(DimsHW{1, 1}); cat1 = fire(network, weightMap, *pool2->getOutput(0), "features.6.", 32, 128, 128); cat1 = fire(network, weightMap, *cat1->getOutput(0), "features.7.", 32, 128, 128); auto* pool3 = network->addPoolingNd(*cat1->getOutput(0), PoolingType::kMAX, DimsHW{3, 3}); assert(pool3); pool3->setStrideNd(DimsHW{2, 2}); pool3->setPostPadding(DimsHW{1, 1}); 
pool3->setPaddingMode(PaddingMode::kEXPLICIT_ROUND_UP); cat1 = fire(network, weightMap, *pool3->getOutput(0), "features.9.", 48, 192, 192); cat1 = fire(network, weightMap, *cat1->getOutput(0), "features.10.", 48, 192, 192); cat1 = fire(network, weightMap, *cat1->getOutput(0), "features.11.", 64, 256, 256); cat1 = fire(network, weightMap, *cat1->getOutput(0), "features.12.", 64, 256, 256); // classifier auto* conv2 = network->addConvolutionNd(*cat1->getOutput(0), 1000, DimsHW{1, 1}, weightMap["classifier.1.weight"], weightMap["classifier.1.bias"]); assert(conv2); auto* relu2 = network->addActivation(*conv2->getOutput(0), ActivationType::kRELU); assert(relu2); auto* pool4 = network->addPoolingNd(*relu2->getOutput(0), PoolingType::kAVERAGE, DimsHW{14, 14}); assert(pool4); pool4->getOutput(0)->setName(NAMES[1]); network->markOutput(*pool4->getOutput(0)); // Build engine #if TRT_VERSION >= 8000 config->setMemoryPoolLimit(MemoryPoolType::kWORKSPACE, WORKSPACE_SIZE); IHostMemory* mem = builder->buildSerializedNetwork(*network, *config); auto* engine = runtime->deserializeCudaEngine(mem->data(), mem->size()); delete network; #else builder->setMaxBatchSize(N); config->setMaxWorkspaceSize(WORKSPACE_SIZE); auto* engine = builder->buildEngineWithConfig(*network, *config); network->destroy(); #endif std::cout << "build out" << std::endl; // Release host memory for (auto& mem : weightMap) { free((void*)(mem.second.values)); } return engine; } void APIToModel(int32_t N, IRuntime* runtime, IHostMemory** modelStream) { // Create builder auto* builder = createInferBuilder(gLogger); auto* config = builder->createBuilderConfig(); // Create model to populate the network, then set the outputs and create an engine auto* engine = createEngine(N, runtime, builder, config, DataType::kFLOAT); assert(engine != nullptr); // Serialize the engine (*modelStream) = engine->serialize(); #if TRT_VERSION >= 8000 delete engine; delete config; delete builder; #else engine->destroy(); config->destroy(); 
builder->destroy(); #endif } std::vector> doInference(IExecutionContext& context, void* input, int32_t batch_size) { const auto& engine = context.getEngine(); cudaStream_t stream; CHECK(cudaStreamCreate(&stream)); std::vector buffers; #if TRT_VERSION >= 8000 const int32_t nIO = engine.getNbIOTensors(); #else const int32_t nIO = engine.getNbBindings(); #endif buffers.resize(nIO); for (auto i = 0; i < nIO; ++i) { std::size_t size = 0; #if TRT_VERSION >= 8000 const auto* tensor_name = engine.getIOTensorName(i); auto s = getSize(engine.getTensorDataType(tensor_name)); size = s * batch_size * SIZES[i]; CHECK(cudaMalloc(&buffers[i], size)); if (i == 0) { CHECK(cudaMemcpyAsync(buffers[i], input, size, cudaMemcpyHostToDevice, stream)); } context.setTensorAddress(tensor_name, buffers[i]); #else const int32_t idx = engine.getBindingIndex(NAMES[i]); auto s = getSize(engine.getBindingDataType(idx)); assert(idx == i); size = s * batch_size * SIZES[i]; CHECK(cudaMalloc(&buffers[i], size)); if (i == 0) { CHECK(cudaMemcpyAsync(buffers[i], input, size, cudaMemcpyHostToDevice, stream)); } #endif } #if TRT_VERSION >= 8000 assert(context.enqueueV3(stream)); #else assert(context.enqueueV2(buffers.data(), stream, nullptr)); #endif std::vector> prob; for (int i = 1; i < nIO; ++i) { std::vector tmp(batch_size * SIZES[i], std::nan("")); std::size_t size = batch_size * SIZES[i] * sizeof(float); CHECK(cudaMemcpyAsync(tmp.data(), buffers[i], size, cudaMemcpyDeviceToHost, stream)); prob.emplace_back(tmp); } CHECK(cudaStreamSynchronize(stream)); cudaStreamDestroy(stream); for (auto i = 0; i < nIO; ++i) { CHECK(cudaFree(buffers[i])); } return prob; } int main(int argc, char** argv) { checkTrtEnv(); if (argc != 2) { std::cerr << "arguments not right!" 
<< std::endl; std::cerr << "./squeezenet -s // serialize model to plan file" << std::endl; std::cerr << "./squeezenet -d // deserialize plan file and run inference" << std::endl; return -1; } // create a model using the API directly and serialize it to a stream auto* runtime = createInferRuntime(gLogger); assert(runtime != nullptr); char* trtModelStream{nullptr}; size_t size{0}; if (std::string(argv[1]) == "-s") { IHostMemory* modelStream{nullptr}; APIToModel(1, runtime, &modelStream); assert(modelStream != nullptr); std::ofstream p(ENGINE_PATH, std::ios::binary | std::ios::trunc); if (!p) { std::cerr << "could not open plan output file" << std::endl; return -1; } p.write(reinterpret_cast(modelStream->data()), modelStream->size()); #if TRT_VERSION >= 8000 delete modelStream; #else modelStream->destroy(); #endif return 0; } else if (std::string(argv[1]) == "-d") { std::ifstream file(ENGINE_PATH, std::ios::binary); if (file.good()) { file.seekg(0, file.end); size = file.tellg(); file.seekg(0, file.beg); trtModelStream = new char[size]; assert(trtModelStream); file.read(trtModelStream, size); file.close(); } } else { return -1; } #if TRT_VERSION >= 8000 auto* engine = runtime->deserializeCudaEngine(trtModelStream, size); #else auto* engine = runtime->deserializeCudaEngine(trtModelStream, size, nullptr); #endif assert(engine != nullptr); auto* context = engine->createExecutionContext(); assert(context != nullptr); void* input = nullptr; std::vector flat_img; cv::Mat img; if constexpr (TRT_PREPROCESS) { // for simplicity, resize image on cpu side img = cv::imread("../assets/cats.jpg", cv::IMREAD_COLOR); cv::resize(img, img, cv::Size(INPUT_W, INPUT_H), 0, 0, cv::INTER_LINEAR); input = static_cast(img.data); } else { img = cv::imread("../assets/cats.jpg", cv::IMREAD_COLOR); flat_img = preprocess_img(img, true, mean, stdv, N, INPUT_H, INPUT_W); input = flat_img.data(); } for (int32_t i = 0; i < 100; ++i) { auto _start = std::chrono::system_clock::now(); auto prob = 
doInference(*context, input, N); auto _end = std::chrono::system_clock::now(); auto _time = std::chrono::duration_cast(_end - _start).count(); std::cout << "Execution time: " << _time << "us" << std::endl; for (auto vector : prob) { int idx = 0; for (auto v : vector) { std::cout << std::setprecision(4) << v << ", " << std::flush; if (++idx > 20) { std::cout << "\n====" << std::endl; break; } } } if (i == 99) { std::cout << "prediction result: " << std::endl; auto labels = loadImagenetLabelMap(LABELS_PATH); int _top = 0; for (auto& [idx, logits] : topk(prob[0], 3)) { std::cout << "Top: " << _top++ << " idx: " << idx << ", logits: " << logits << ", label: " << labels[idx] << std::endl; } } } delete[] trtModelStream; // Destroy the engine #if TRT_VERSION >= 8000 delete context; delete engine; delete runtime; #else context->destroy(); engine->destroy(); runtime->destroy(); #endif return 0; } ================================================ FILE: squeezenet/utils.h ================================================ #pragma once #include #include #include #include #include #include #include #include #include #include #include #include using namespace nvinfer1; #define WORKSPACE_SIZE (16 << 20) #define CHECK(status) \ do { \ auto ret = (status); \ if (ret != cudaSuccess) { \ std::cerr << "Cuda failure: " << ret << std::endl; \ abort(); \ } \ } while (0) static void checkTrtEnv(int device = 0) { #if TRT_VERSION < 7220 #error "TensorRT >= 7.2.2 is required for this demo." 
#endif #if TRT_VERSION < 8000 CHECK(cudaGetDevice(&device)); cudaDeviceProp prop{}; CHECK(cudaGetDeviceProperties(&prop, device)); const int sm = prop.major * 10 + prop.minor; if (sm > 86) { throw std::runtime_error("TensorRT < 8 does not support SM > 86 on this GPU."); } #endif } /** * @brief TensorRT weight files have a simple space delimited format: * [type] [size] * * @param file input weight file path * @return std::map */ static std::map loadWeights(const std::string& file) { std::cout << "Loading weights: " << file << std::endl; std::map weightMap; // Open weights file std::ifstream input(file); assert(input.is_open() && "Unable to load weight file."); // Read number of weight blobs int32_t count; input >> count; assert(count > 0 && "Invalid weight map file."); while (count--) { nvinfer1::Weights wt{nvinfer1::DataType::kFLOAT, nullptr, 0}; // Read name and type of blob std::string name; input >> name >> std::dec >> wt.count; // Load blob uint32_t* val = reinterpret_cast(malloc(sizeof(val) * wt.count)); for (uint32_t x = 0; x < wt.count; ++x) { input >> std::hex >> val[x]; } wt.values = val; weightMap[name] = wt; } return weightMap; } /** * @brief a preprocess function aligning with ImageNet preprocess in torchvision, only support 3-channel image * * @param img opencv image with BGR layout * @param bgr2rgb whether to convert BGR to RGB * @param mean subtract mean * @param std divide std * @param n batch size * @param h resize height * @param w resize width * @return std::vector contiguous flatten image data in float32 type */ static std::vector preprocess_img(cv::Mat& img, bool bgr2rgb, const float mean[3], const float std[3], int n, int h, int w) { const int c = img.channels(); const std::size_t size = c * h * w; if (c != 3) { throw std::runtime_error("this demo only supports 3 channel input image."); } if (bgr2rgb) { cv::cvtColor(img, img, cv::COLOR_BGR2RGB); } cv::resize(img, img, cv::Size(w, h), 0, 0, cv::INTER_LINEAR); img.convertTo(img, CV_32FC3, 1.f / 
255); img = (img - cv::Scalar(mean[0], mean[1], mean[2])) / cv::Scalar(std[0], std[1], std[2]); std::vector chw(n * c * h * w, 0.f); // fill all batch with the same input image for (int i = 0; i < n; ++i) { for (int y = 0; y < h; ++y) { for (int x = 0; x < w; ++x) { const cv::Vec3f v = img.at(y, x); chw[i * size + 0 * h * w + y * w + x] = v[0]; chw[i * size + 1 * h * w + y * h + x] = v[1]; chw[i * size + 2 * h * w + y * h + x] = v[2]; } } } return chw; } static std::vector> topk(const std::vector& v, int k) { if (k <= 0) return {}; k = std::min(k, v.size()); std::vector idx(v.size()); std::iota(idx.begin(), idx.end(), 0); std::partial_sort(idx.begin(), idx.begin() + k, idx.end(), [&](int a, int b) { return v[a] > v[b]; }); std::vector> out; out.reserve(k); for (int i = 0; i < k; ++i) out.emplace_back(idx[i], v[idx[i]]); return out; } static std::map loadImagenetLabelMap(const std::string& path) { std::map labels; std::ifstream in(path); if (!in.is_open()) { return labels; } std::string line; while (std::getline(in, line)) { auto colon = line.find(':'); if (colon == std::string::npos) { continue; } auto first_quote = line.find('\'', colon); if (first_quote == std::string::npos) { continue; } auto second_quote = line.find('\'', first_quote + 1); if (second_quote == std::string::npos) { continue; } int idx = std::stoi(line.substr(0, colon)); labels[idx] = line.substr(first_quote + 1, second_quote - first_quote - 1); } return labels; } static ILayer* addTransformLayer(INetworkDefinition* network, ITensor& input, bool bgr2rgb, const float mean[3], const float std[3]) { struct ScaleParams { std::array shift; std::array scale; }; static std::vector> gScaleParams; auto params = std::make_unique(); params->shift = {-mean[0] / std[0], -mean[1] / std[1], -mean[2] / std[2]}; params->scale = {1.f / (std[0] * 255.f), 1.f / (std[1] * 255.f), 1.f / (std[2] * 255.f)}; static const Weights empty{DataType::kFLOAT, nullptr, 0ll}; const Weights shift{DataType::kFLOAT, 
params->shift.data(), 3ll}; const Weights scale{DataType::kFLOAT, params->scale.data(), 3ll}; gScaleParams.emplace_back(std::move(params)); ITensor* in = &input; if (input.getType() != DataType::kFLOAT) { #if TRT_VERSION >= 8000 auto* cast = network->addCast(input, DataType::kFLOAT); assert(cast); cast->setName("Cast to FP32"); in = cast->getOutput(0); #else auto* identity = network->addIdentity(input); assert(identity); identity->setName("Convert to FP32"); identity->setOutputType(0, DataType::kFLOAT); in = identity->getOutput(0); #endif } // Convert from NHWC to NCHW auto* perm = network->addShuffle(*in); assert(perm); perm->setName("NHWC -> NCHW"); perm->setFirstTranspose(Permutation{0, 3, 1, 2}); // Convert from BGR to RGB (optional) ITensor* data{nullptr}; if (bgr2rgb) { auto add_slice = [&](int c, const char* name) -> ITensor* { auto dims = perm->getOutput(0)->getDimensions(); Dims4 start = {0, c, 0, 0}, stride = {1, 1, 1, 1}; Dims4 size = {dims.d[0], 1, dims.d[2], dims.d[3]}; auto* _slice = network->addSlice(*perm->getOutput(0), start, size, stride); _slice->setName(name); assert(_slice && _slice->getNbOutputs() == 1); auto d = _slice->getOutput(0)->getDimensions(); return _slice->getOutput(0); }; ITensor* channels[] = {add_slice(2, "R"), add_slice(1, "G"), add_slice(0, "B")}; auto* cat = network->addConcatenation(channels, 3); assert(cat); cat->setName("RGB"); cat->setAxis(1); data = cat->getOutput(0); } else { data = perm->getOutput(0); } // Normalize auto* trans = network->addScale(*data, ScaleMode::kCHANNEL, shift, scale, empty); assert(trans); trans->setName("mean & std"); #if TRT_VERSION >= 8000 trans->setChannelAxis(1); #endif return trans; } static size_t getSize(DataType dt) { switch (dt) { #if TRT_VERSION >= 8510 case DataType::kUINT8: #endif case DataType::kINT8: return sizeof(int8_t); case DataType::kFLOAT: return sizeof(float); case DataType::kHALF: return sizeof(int16_t); case DataType::kINT32: return sizeof(int32_t); default: throw 
std::runtime_error("Unsupported data type"); } } ================================================ FILE: superpoint/CMakeLists.txt ================================================ cmake_minimum_required(VERSION 2.6) project(SuperPointNet) add_definitions(-std=c++11) option(CUDA_USE_STATIC_CUDA_RUNTIME OFF) set(CMAKE_CXX_STANDARD 11) set(CMAKE_BUILD_TYPE Debug) find_package(CUDA REQUIRED) include_directories(${PROJECT_SOURCE_DIR}/include) # include and link dirs of cuda and tensorrt, you need adapt them if yours are different # cuda include_directories(/usr/local/cuda/include) link_directories(/usr/local/cuda/lib64) # tensorrt include_directories(/usr/include/x86_64-linux-gnu/) link_directories(/usr/lib/x86_64-linux-gnu/) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11 -pthread -Wall -Ofast -Wfatal-errors -D_MWAITXINTRIN_H_INCLUDED") find_package(OpenCV) include_directories(${OpenCV_INCLUDE_DIRS}) add_executable(supernet ${PROJECT_SOURCE_DIR}/supernet.cpp ${PROJECT_SOURCE_DIR}/utils.cpp) target_link_libraries(supernet nvinfer) target_link_libraries(supernet cudart) target_link_libraries(supernet ${OpenCV_LIBS}) add_definitions(-O2 -pthread) ================================================ FILE: superpoint/README.md ================================================ # SuperPoint The PyTorch implementation is from [magicleap/SuperPointPretrainedNetwork.](https://github.com/magicleap/SuperPointPretrainedNetwork) The pretrained models are from [magicleap/SuperPointPretrainedNetwork.](https://github.com/magicleap/SuperPointPretrainedNetwork) ## Config - FP16/FP32 can be selected by the macro `USE_FP16` in supernet.cpp - GPU id and batch size can be selected by the macro `DEVICE` & `BATCH_SIZE` in supernet.cpp ## How to Run 1.Generate .wts file from the baseline pytorch implementation of pretrained model. The following example described how to generate superpoint_v1.wts from pytorch implementation of superpoint_v1. 
```
git clone https://github.com/xiang-wuu/SuperPointPretrainedNetwork
cd SuperPointPretrainedNetwork
git checkout deploy
// copy tensorrtx/superpoint/gen_wts.py to here(SuperPointPretrainedNetwork)
python gen_wts.py
// a file 'superpoint_v1.wts' will be generated.
// before running gen_wts.py python script make sure you cloned private fork and checkout to deploy branch.
```

2.Put .wts file into tensorrtx/superpoint, build and run

```
cd tensorrtx/superpoint
mkdir build
cd build
cmake ..
make
./supernet -s SuperPointPretrainedNetwork/superpoint_v1.wts // serialize model to plan file i.e. 'supernet.engine'
```

## Run Demo using SuperPointPretrainedNetwork Python Script

The live demo can be run by inferring with either the TensorRT-generated engine file or the pre-trained PyTorch weight file; the `demo_superpoint.py` script is modified to choose TensorRT or PyTorch automatically based on the provided input weight file.

```
cd SuperPointPretrainedNetwork
python demo_superpoint.py assets/nyu_snippet.mp4 --cuda --weights_path tensorrtx/superpoint/build/supernet.engine
// provide absolute path to supernet.engine as input weight file
python demo_superpoint.py assets/nyu_snippet.mp4 --cuda --weights_path superpoint_v1.pth
// execute above command to infer using pytorch pre-trained weight files instead of tensorrt engine file.
```

## Output

As the results below show, there is no significant difference in the inferred output!
PyTorch TensorRT
## TODO - [ ] Optimizing post-processing using custom TensorRT layer. - [ ] Benchmark validation for speed accuracy tradeoff with [hpatches](https://github.com/hpatches/hpatches-benchmark) dataset ================================================ FILE: superpoint/gen_wts.py ================================================ import torch import struct from model import SuperPointNet model_name = "superpoint_v1" net = SuperPointNet() net.load_state_dict(torch.load("superpoint_v1.pth")) net = net.cuda() net.eval() f = open(model_name + ".wts", "w") f.write("{}\n".format(len(net.state_dict().keys()))) for k, v in net.state_dict().items(): vr = v.reshape(-1).cpu().numpy() f.write("{} {}".format(k, len(vr))) for vv in vr: f.write(" ") f.write(struct.pack(">f", float(vv)).hex()) f.write("\n") ================================================ FILE: superpoint/logging.h ================================================ /* * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
*/ #ifndef TENSORRT_LOGGING_H #define TENSORRT_LOGGING_H #include "NvInferRuntimeCommon.h" #include #include #include #include #include #include #include using Severity = nvinfer1::ILogger::Severity; class LogStreamConsumerBuffer : public std::stringbuf { public: LogStreamConsumerBuffer(std::ostream &stream, const std::string &prefix, bool shouldLog) : mOutput(stream), mPrefix(prefix), mShouldLog(shouldLog) { } LogStreamConsumerBuffer(LogStreamConsumerBuffer &&other) : mOutput(other.mOutput) { } ~LogStreamConsumerBuffer() { // std::streambuf::pbase() gives a pointer to the beginning of the buffered part of the output sequence // std::streambuf::pptr() gives a pointer to the current position of the output sequence // if the pointer to the beginning is not equal to the pointer to the current position, // call putOutput() to log the output to the stream if (pbase() != pptr()) { putOutput(); } } // synchronizes the stream buffer and returns 0 on success // synchronizing the stream buffer consists of inserting the buffer contents into the stream, // resetting the buffer and flushing the stream virtual int sync() { putOutput(); return 0; } void putOutput() { if (mShouldLog) { // prepend timestamp std::time_t timestamp = std::time(nullptr); tm *tm_local = std::localtime(×tamp); std::cout << "["; std::cout << std::setw(2) << std::setfill('0') << 1 + tm_local->tm_mon << "/"; std::cout << std::setw(2) << std::setfill('0') << tm_local->tm_mday << "/"; std::cout << std::setw(4) << std::setfill('0') << 1900 + tm_local->tm_year << "-"; std::cout << std::setw(2) << std::setfill('0') << tm_local->tm_hour << ":"; std::cout << std::setw(2) << std::setfill('0') << tm_local->tm_min << ":"; std::cout << std::setw(2) << std::setfill('0') << tm_local->tm_sec << "] "; // std::stringbuf::str() gets the string contents of the buffer // insert the buffer contents pre-appended by the appropriate prefix into the stream mOutput << mPrefix << str(); // set the buffer to empty str(""); // flush 
the stream mOutput.flush(); } } void setShouldLog(bool shouldLog) { mShouldLog = shouldLog; } private: std::ostream &mOutput; std::string mPrefix; bool mShouldLog; }; //! //! \class LogStreamConsumerBase //! \brief Convenience object used to initialize LogStreamConsumerBuffer before std::ostream in LogStreamConsumer //! class LogStreamConsumerBase { public: LogStreamConsumerBase(std::ostream &stream, const std::string &prefix, bool shouldLog) : mBuffer(stream, prefix, shouldLog) { } protected: LogStreamConsumerBuffer mBuffer; }; //! //! \class LogStreamConsumer //! \brief Convenience object used to facilitate use of C++ stream syntax when logging messages. //! Order of base classes is LogStreamConsumerBase and then std::ostream. //! This is because the LogStreamConsumerBase class is used to initialize the LogStreamConsumerBuffer member field //! in LogStreamConsumer and then the address of the buffer is passed to std::ostream. //! This is necessary to prevent the address of an uninitialized buffer from being passed to std::ostream. //! Please do not change the order of the parent classes. //! class LogStreamConsumer : protected LogStreamConsumerBase, public std::ostream { public: //! \brief Creates a LogStreamConsumer which logs messages with level severity. //! Reportable severity determines if the messages are severe enough to be logged. 
LogStreamConsumer(Severity reportableSeverity, Severity severity) : LogStreamConsumerBase(severityOstream(severity), severityPrefix(severity), severity <= reportableSeverity), std::ostream(&mBuffer) // links the stream buffer with the stream , mShouldLog(severity <= reportableSeverity), mSeverity(severity) { } LogStreamConsumer(LogStreamConsumer &&other) : LogStreamConsumerBase(severityOstream(other.mSeverity), severityPrefix(other.mSeverity), other.mShouldLog), std::ostream(&mBuffer) // links the stream buffer with the stream , mShouldLog(other.mShouldLog), mSeverity(other.mSeverity) { } void setReportableSeverity(Severity reportableSeverity) { mShouldLog = mSeverity <= reportableSeverity; mBuffer.setShouldLog(mShouldLog); } private: static std::ostream &severityOstream(Severity severity) { return severity >= Severity::kINFO ? std::cout : std::cerr; } static std::string severityPrefix(Severity severity) { switch (severity) { case Severity::kINTERNAL_ERROR: return "[F] "; case Severity::kERROR: return "[E] "; case Severity::kWARNING: return "[W] "; case Severity::kINFO: return "[I] "; case Severity::kVERBOSE: return "[V] "; default: assert(0); return ""; } } bool mShouldLog; Severity mSeverity; }; //! \class Logger //! //! \brief Class which manages logging of TensorRT tools and samples //! //! \details This class provides a common interface for TensorRT tools and samples to log information to the console, //! and supports logging two types of messages: //! //! - Debugging messages with an associated severity (info, warning, error, or internal error/fatal) //! - Test pass/fail messages //! //! The advantage of having all samples use this class for logging as opposed to emitting directly to stdout/stderr is //! that the logic for controlling the verbosity and formatting of sample output is centralized in one location. //! //! In the future, this class could be extended to support dumping test results to a file in some standard format //! 
(for example, JUnit XML), and providing additional metadata (e.g. timing the duration of a test run).
//!
//! TODO: For backwards compatibility with existing samples, this class inherits directly from the nvinfer1::ILogger
//! interface, which is problematic since there isn't a clean separation between messages coming from the TensorRT
//! library and messages coming from the sample.
//!
//! In the future (once all samples are updated to use Logger::getTRTLogger() to access the ILogger) we can refactor the
//! class to eliminate the inheritance and instead make the nvinfer1::ILogger implementation a member of the Logger
//! object.
class Logger : public nvinfer1::ILogger {
  public:
    //!
    //! \brief Construct a logger that reports messages according to the given reportable severity
    //!
    //! \param severity Initial reportable severity; defaults to Severity::kWARNING
    //!
    Logger(Severity severity = Severity::kWARNING) : mReportableSeverity(severity) { }

    //!
    //! \enum TestResult
    //! \brief Represents the state of a given test
    //!
    enum class TestResult {
        kRUNNING, //!< The test is running
        kPASSED, //!< The test passed
        kFAILED, //!< The test failed
        kWAIVED //!< The test was waived
    };

    //!
    //! \brief Forward-compatible method for retrieving the nvinfer1::ILogger associated with this Logger
    //! \return The nvinfer1::ILogger associated with this Logger
    //!
    //! TODO Once all samples are updated to use this method to register the logger with TensorRT,
    //! we can eliminate the inheritance of Logger from ILogger
    //!
    nvinfer1::ILogger &getTRTLogger() { return *this; }

    //!
    //! \brief Implementation of the nvinfer1::ILogger::log() virtual method
    //!
    //! Every message is prefixed with "[TRT] " and routed through a temporary LogStreamConsumer built from the
    //! current reportable severity and the severity of the message.
    //!
    //! Note samples should not be calling this function directly; it will eventually go away once we eliminate the
    //! inheritance from nvinfer1::ILogger
    //!
    void log(Severity severity, const char *msg) noexcept override {
        LogStreamConsumer(mReportableSeverity, severity) << "[TRT] " << std::string(msg) << std::endl;
    }

    //!
    //! \brief Method for controlling the verbosity of logging output
    //!
    //! \param severity The logger will only emit messages that have severity of this level or higher.
    //!
    void setReportableSeverity(Severity severity) { mReportableSeverity = severity; }

    //!
    //! \brief Opaque handle that holds logging information for a particular test
    //!
    //! This object is an opaque handle to information used by the Logger to print test results.
    //! The sample must call Logger::defineTest() in order to obtain a TestAtom that can be used
    //! with Logger::reportTest{Start,End}().
    //!
    class TestAtom {
      public:
        TestAtom(TestAtom &&) = default;

      private:
        // Only Logger may create a TestAtom or look at its fields.
        friend class Logger;

        TestAtom(bool started, const std::string &name, const std::string &cmdline) : mStarted(started), mName(name), mCmdline(cmdline) { }

        bool mStarted;        // set to true by Logger::reportTestStart()
        std::string mName;    // test name printed in the result line
        std::string mCmdline; // command line printed after the '#' in the result line
    };

    //!
    //! \brief Define a test for logging
    //!
    //! \param[in] name The name of the test. This should be a string starting with
    //! "TensorRT" and containing dot-separated strings containing
    //! the characters [A-Za-z0-9_].
    //! For example, "TensorRT.sample_googlenet"
    //! \param[in] cmdline The command line used to reproduce the test
    //!
    //! \return a TestAtom that can be used in Logger::reportTest{Start,End}().
    //!
    static TestAtom defineTest(const std::string &name, const std::string &cmdline) { return TestAtom(false, name, cmdline); }

    //!
    //! \brief A convenience overloaded version of defineTest() that accepts an array of command-line arguments
    //! as input
    //!
    //! \param[in] name The name of the test
    //! \param[in] argc The number of command-line arguments
    //! \param[in] argv The array of command-line arguments (given as C strings)
    //!
    //! \return a TestAtom that can be used in Logger::reportTest{Start,End}().
    //!
    static TestAtom defineTest(const std::string &name, int argc, char const *const *argv) {
        auto cmdline = genCmdlineString(argc, argv);
        return defineTest(name, cmdline);
    }

    //!
    //! \brief Report that a test has started.
    //!
    //! \pre reportTestStart() has not been called yet for the given testAtom
    //!
    //! \param[in] testAtom The handle to the test that has started
    //!
    static void reportTestStart(TestAtom &testAtom) {
        // The RUNNING line is printed before the precondition is checked, so a
        // duplicate start still prints once more before the assert fires.
        reportTestResult(testAtom, TestResult::kRUNNING);
        assert(!testAtom.mStarted);
        testAtom.mStarted = true;
    }

    //!
    //! \brief Report that a test has ended.
    //!
    //! \pre reportTestStart() has been called for the given testAtom
    //!
    //! \param[in] testAtom The handle to the test that has ended
    //! \param[in] result The result of the test. Should be one of TestResult::kPASSED,
    //! TestResult::kFAILED, TestResult::kWAIVED
    //!
    static void reportTestEnd(const TestAtom &testAtom, TestResult result) {
        assert(result != TestResult::kRUNNING);
        assert(testAtom.mStarted);
        reportTestResult(testAtom, result);
    }

    //!
    //! \brief Report the test as passed
    //! \return EXIT_SUCCESS, so the call can be used directly as a process exit code
    //!
    static int reportPass(const TestAtom &testAtom) {
        reportTestEnd(testAtom, TestResult::kPASSED);
        return EXIT_SUCCESS;
    }

    //!
    //! \brief Report the test as failed
    //! \return EXIT_FAILURE
    //!
    static int reportFail(const TestAtom &testAtom) {
        reportTestEnd(testAtom, TestResult::kFAILED);
        return EXIT_FAILURE;
    }

    //!
    //! \brief Report the test as waived
    //! \return EXIT_SUCCESS (a waived test does not fail the process)
    //!
    static int reportWaive(const TestAtom &testAtom) {
        reportTestEnd(testAtom, TestResult::kWAIVED);
        return EXIT_SUCCESS;
    }

    //!
    //! \brief Report the test as passed or failed depending on \p pass
    //! \return The value returned by reportPass() or reportFail()
    //!
    static int reportTest(const TestAtom &testAtom, bool pass) { return pass ? reportPass(testAtom) : reportFail(testAtom); }

    //!
    //! \brief Return the severity threshold currently used to filter messages
    //!
    Severity getReportableSeverity() const { return mReportableSeverity; }

  private:
    //!
    //! \brief returns an appropriate string for prefixing a log message with the given severity
    //!
    static const char *severityPrefix(Severity severity) {
        switch (severity) {
            case Severity::kINTERNAL_ERROR:
                return "[F] ";
            case Severity::kERROR:
                return "[E] ";
            case Severity::kWARNING:
                return "[W] ";
            case Severity::kINFO:
                return "[I] ";
            case Severity::kVERBOSE:
                return "[V] ";
            default:
                assert(0);
                return "";
        }
    }

    //!
    //! \brief returns an appropriate string for prefixing a test result message with the given result
    //!
    static const char *testResultString(TestResult result) {
        switch (result) {
            case TestResult::kRUNNING:
                return "RUNNING";
            case TestResult::kPASSED:
                return "PASSED";
            case TestResult::kFAILED:
                return "FAILED";
            case TestResult::kWAIVED:
                return "WAIVED";
            default:
                assert(0);
                return "";
        }
    }

    //!
    //! \brief returns an appropriate output stream (cout or cerr) to use with the given severity
    //!
    //! Severities that compare >= Severity::kINFO go to std::cout, all others to std::cerr.
    //!
    static std::ostream &severityOstream(Severity severity) { return severity >= Severity::kINFO ? std::cout : std::cerr; }

    //!
    //! \brief method that implements logging test results
    //!
    //! Emits one line of the form "&&&& <RESULT> <name> # <cmdline>" on the kINFO stream.
    //!
    static void reportTestResult(const TestAtom &testAtom, TestResult result) {
        severityOstream(Severity::kINFO) << "&&&& " << testResultString(result) << " " << testAtom.mName << " # " << testAtom.mCmdline << std::endl;
    }

    //!
    //! \brief generate a command line string from the given (argc, argv) values
    //!
    //! Arguments are joined with single spaces; no quoting or escaping is applied.
    //!
    static std::string genCmdlineString(int argc, char const *const *argv) {
        std::stringstream ss;
        for (int i = 0; i < argc; i++) {
            if (i > 0)
                ss << " ";
            ss << argv[i];
        }
        return ss.str();
    }

    Severity mReportableSeverity; // threshold handed to every LogStreamConsumer created by log()
};

namespace {

//!
//! \brief produces a LogStreamConsumer object that can be used to log messages of severity kVERBOSE
//!
//! Example usage:
//!
//! LOG_VERBOSE(logger) << "hello world" << std::endl;
//!
inline LogStreamConsumer LOG_VERBOSE(const Logger &logger) { return LogStreamConsumer(logger.getReportableSeverity(), Severity::kVERBOSE); }

//!
//! \brief produces a LogStreamConsumer object that can be used to log messages of severity kINFO
//!
//! Example usage:
//!
//! LOG_INFO(logger) << "hello world" << std::endl;
//!
inline LogStreamConsumer LOG_INFO(const Logger &logger) { return LogStreamConsumer(logger.getReportableSeverity(), Severity::kINFO); }

//!
//! \brief produces a LogStreamConsumer object that can be used to log messages of severity kWARNING
//!
//! Example usage:
//!
//! LOG_WARN(logger) << "hello world" << std::endl;
//!
inline LogStreamConsumer LOG_WARN(const Logger &logger) { return LogStreamConsumer(logger.getReportableSeverity(), Severity::kWARNING); }

//!
//! \brief produces a LogStreamConsumer object that can be used to log messages of severity kERROR
//!
//! Example usage:
//!
//! LOG_ERROR(logger) << "hello world" << std::endl;
//!
inline LogStreamConsumer LOG_ERROR(const Logger &logger) { return LogStreamConsumer(logger.getReportableSeverity(), Severity::kERROR); }

//!
//! \brief produces a LogStreamConsumer object that can be used to log messages of severity kINTERNAL_ERROR
//! ("fatal" severity)
//!
//! Example usage:
//!
//! LOG_FATAL(logger) << "hello world" << std::endl;
//!
inline LogStreamConsumer LOG_FATAL(const Logger &logger) { return LogStreamConsumer(logger.getReportableSeverity(), Severity::kINTERNAL_ERROR); }

} // anonymous namespace

#endif // TENSORRT_LOGGING_H

================================================
FILE: superpoint/supernet.cpp
================================================
#include
#include
#include
#include
#include
#include
#include
#include
#include "NvInfer.h"
#include "utils.h"
#include "cuda_runtime_api.h"
#include "logging.h"

//#define USE_FP16 // uncomment to build the engine with FP16 enabled; leave commented out for FP32
#define DEVICE 0 // GPU id
#define BATCH_SIZE 1 // currently, only BATCH=1 is supported

// Properties of the network and of its input/output blobs
static const int INPUT_H = 120;
static const int INPUT_W = 160;
const char *INPUT_BLOB_NAME = "data";
const char *OUTPUT_BLOB_NAME_1 = "semi"; // name given to the convPb output in createEngine()
const char *OUTPUT_BLOB_NAME_2 = "desc"; // name given to the convDb output in createEngine()

static Logger gLogger;

// Create the engine using only the API and not any parser.
ICudaEngine *createEngine(IBuilder *builder, IBuilderConfig *config, std::string path, DataType dt) { INetworkDefinition *network = builder->createNetworkV2(0U); // Create input tensor of shape { 3, INPUT_H, INPUT_W } with name INPUT_BLOB_NAME ITensor *data = network->addInput(INPUT_BLOB_NAME, dt, Dims3{1, INPUT_H, INPUT_W}); assert(data); std::map weightMap = loadWeights(path); IConvolutionLayer *conv1a = network->addConvolutionNd(*data, 64, DimsHW{3, 3}, weightMap["conv1a.weight"], weightMap["conv1a.bias"]); assert(conv1a); conv1a->setStrideNd(DimsHW{1, 1}); conv1a->setPaddingNd(DimsHW{1, 1}); IActivationLayer *relu1 = network->addActivation(*conv1a->getOutput(0), ActivationType::kRELU); assert(relu1); IConvolutionLayer *conv1b = network->addConvolutionNd(*relu1->getOutput(0), 64, DimsHW{3, 3}, weightMap["conv1b.weight"], weightMap["conv1b.bias"]); assert(conv1b); conv1b->setStrideNd(DimsHW{1, 1}); conv1b->setPaddingNd(DimsHW{1, 1}); IActivationLayer *relu2 = network->addActivation(*conv1b->getOutput(0), ActivationType::kRELU); assert(relu2); IPoolingLayer *pool1 = network->addPoolingNd(*relu2->getOutput(0), PoolingType::kMAX, DimsHW{2, 2}); assert(pool1); pool1->setStrideNd(DimsHW{2, 2}); IConvolutionLayer *conv2a = network->addConvolutionNd(*pool1->getOutput(0), 64, DimsHW{3, 3}, weightMap["conv2a.weight"], weightMap["conv2a.bias"]); assert(conv2a); conv2a->setStrideNd(DimsHW{1, 1}); conv2a->setPaddingNd(DimsHW{1, 1}); IActivationLayer *relu3 = network->addActivation(*conv2a->getOutput(0), ActivationType::kRELU); assert(relu3); IConvolutionLayer *conv2b = network->addConvolutionNd(*relu3->getOutput(0), 64, DimsHW{3, 3}, weightMap["conv2b.weight"], weightMap["conv2b.bias"]); assert(conv2b); conv2b->setStrideNd(DimsHW{1, 1}); conv2b->setPaddingNd(DimsHW{1, 1}); IActivationLayer *relu4 = network->addActivation(*conv2b->getOutput(0), ActivationType::kRELU); assert(relu4); IPoolingLayer *pool2 = network->addPoolingNd(*relu4->getOutput(0), PoolingType::kMAX, 
DimsHW{2, 2}); assert(pool2); pool2->setStrideNd(DimsHW{2, 2}); IConvolutionLayer *conv3a = network->addConvolutionNd(*pool2->getOutput(0), 128, DimsHW{3, 3}, weightMap["conv3a.weight"], weightMap["conv3a.bias"]); assert(conv3a); conv3a->setStrideNd(DimsHW{1, 1}); conv3a->setPaddingNd(DimsHW{1, 1}); IActivationLayer *relu44 = network->addActivation(*conv3a->getOutput(0), ActivationType::kRELU); assert(relu44); IConvolutionLayer *conv3b = network->addConvolutionNd(*relu44->getOutput(0), 128, DimsHW{3, 3}, weightMap["conv3b.weight"], weightMap["conv3b.bias"]); assert(conv3b); conv3b->setStrideNd(DimsHW{1, 1}); conv3b->setPaddingNd(DimsHW{1, 1}); IActivationLayer *relu5 = network->addActivation(*conv3b->getOutput(0), ActivationType::kRELU); assert(relu5); IPoolingLayer *pool3 = network->addPoolingNd(*relu5->getOutput(0), PoolingType::kMAX, DimsHW{2, 2}); assert(pool3); pool3->setStrideNd(DimsHW{2, 2}); IConvolutionLayer *conv4a = network->addConvolutionNd(*pool3->getOutput(0), 128, DimsHW{3, 3}, weightMap["conv4a.weight"], weightMap["conv4a.bias"]); assert(conv4a); conv4a->setStrideNd(DimsHW{1, 1}); conv4a->setPaddingNd(DimsHW{1, 1}); IActivationLayer *relu6 = network->addActivation(*conv4a->getOutput(0), ActivationType::kRELU); assert(relu6); IConvolutionLayer *conv4b = network->addConvolutionNd(*relu6->getOutput(0), 128, DimsHW{3, 3}, weightMap["conv4b.weight"], weightMap["conv4b.bias"]); assert(conv4b); conv4b->setStrideNd(DimsHW{1, 1}); conv4b->setPaddingNd(DimsHW{1, 1}); IActivationLayer *relu7 = network->addActivation(*conv4b->getOutput(0), ActivationType::kRELU); assert(relu7); IConvolutionLayer *convPa = network->addConvolutionNd(*relu7->getOutput(0), 256, DimsHW{3, 3}, weightMap["convPa.weight"], weightMap["convPa.bias"]); assert(convPa); convPa->setStrideNd(DimsHW{1, 1}); convPa->setPaddingNd(DimsHW{1, 1}); IActivationLayer *relu8 = network->addActivation(*convPa->getOutput(0), ActivationType::kRELU); assert(relu8); IConvolutionLayer *convPb = 
network->addConvolutionNd(*relu8->getOutput(0), 65, DimsHW{1, 1}, weightMap["convPb.weight"], weightMap["convPb.bias"]); assert(convPb); convPb->setStrideNd(DimsHW{1, 1}); IConvolutionLayer *convDa = network->addConvolutionNd(*relu7->getOutput(0), 256, DimsHW{3, 3}, weightMap["convDa.weight"], weightMap["convDa.bias"]); assert(convDa); convDa->setStrideNd(DimsHW{1, 1}); convDa->setPaddingNd(DimsHW{1, 1}); IActivationLayer *relu9 = network->addActivation(*convDa->getOutput(0), ActivationType::kRELU); assert(relu9); IConvolutionLayer *convDb = network->addConvolutionNd(*relu9->getOutput(0), 256, DimsHW{1, 1}, weightMap["convDb.weight"], weightMap["convDb.bias"]); assert(convDb); convDb->setStrideNd(DimsHW{1, 1}); convPb->getOutput(0)->setName(OUTPUT_BLOB_NAME_1); std::cout << "set name out1" << std::endl; network->markOutput(*convPb->getOutput(0)); convDb->getOutput(0)->setName(OUTPUT_BLOB_NAME_2); std::cout << "set name out2" << std::endl; network->markOutput(*convDb->getOutput(0)); // Build engine builder->setMaxBatchSize(BATCH_SIZE); config->setMaxWorkspaceSize(1 << 20); #ifdef USE_FP16 config->setFlag(BuilderFlag::kFP16); #endif ICudaEngine *engine = builder->buildEngineWithConfig(*network, *config); std::cout << "build out" << std::endl; // Don't need the network any more network->destroy(); // Release host memory for (auto &mem : weightMap) { free((void *)(mem.second.values)); } return engine; } // Creat the engine using only the API and not any parser. 
void APIToModel(std::string path, IHostMemory **modelStream) { // Create builder IBuilder *builder = createInferBuilder(gLogger); IBuilderConfig *config = builder->createBuilderConfig(); // Create model to populate the network, then set the outputs and create an engine ICudaEngine *engine = createEngine(builder, config, path, DataType::kFLOAT); assert(engine != nullptr); // Serialize the engine (*modelStream) = engine->serialize(); // Close everything down engine->destroy(); builder->destroy(); } int main(int argc, char **argv) { cudaSetDevice(DEVICE); // create a model using the API directly and serialize it to a stream char *trtModelStream{nullptr}; size_t size{0}; if (argc == 3 && std::string(argv[1]) == "-s") { IHostMemory *modelStream{nullptr}; APIToModel(std::string(argv[2]), &modelStream); assert(modelStream != nullptr); std::ofstream p("supernet.engine", std::ios::binary); if (!p) { std::cerr << "could not open plan output file" << std::endl; return -1; } p.write(reinterpret_cast(modelStream->data()), modelStream->size()); modelStream->destroy(); return 0; } else { std::cerr << "arguments not right!" << std::endl; std::cerr << "./supernet -s // serialize model to plan file" << std::endl; return -1; } return 0; } ================================================ FILE: superpoint/utils.cpp ================================================ #include "utils.h" #include #include // Load weights from files shared with TensorRT samples. 
// TensorRT weight files have a simple space delimited format: // [type] [size] std::map loadWeights(const std::string file) { std::cout << "Loading weights: " << file << std::endl; std::map weightMap; // Open weights file std::ifstream input(file); assert(input.is_open() && "Unable to load weight file."); // Read number of weight blobs int32_t count; input >> count; assert(count > 0 && "Invalid weight map file."); while (count--) { Weights wt{DataType::kFLOAT, nullptr, 0}; uint32_t size; // Read name and type of blob std::string name; input >> name >> std::dec >> size; wt.type = DataType::kFLOAT; // Load blob uint32_t *val = reinterpret_cast(malloc(sizeof(val) * size)); for (uint32_t x = 0, y = size; x < y; ++x) { input >> std::hex >> val[x]; } wt.values = val; wt.count = size; weightMap[name] = wt; } return weightMap; } int read_files_in_dir(const char *p_dir_name, std::vector &file_names) { DIR *p_dir = opendir(p_dir_name); if (p_dir == nullptr) { return -1; } struct dirent *p_file = nullptr; while ((p_file = readdir(p_dir)) != nullptr) { if (strcmp(p_file->d_name, ".") != 0 && strcmp(p_file->d_name, "..") != 0) { // std::string cur_file_name(p_dir_name); // cur_file_name += "/"; // cur_file_name += p_file->d_name; std::string cur_file_name(p_file->d_name); file_names.push_back(cur_file_name); } } closedir(p_dir); return 0; } void tokenize(const std::string &str, std::vector &tokens, const std::string &delimiters) { // Skip delimiters at beginning. std::string::size_type lastPos = str.find_first_not_of(delimiters, 0); // Find first non-delimiter. std::string::size_type pos = str.find_first_of(delimiters, lastPos); while (std::string::npos != pos || std::string::npos != lastPos) { // Found a token, add it to the vector. tokens.push_back(str.substr(lastPos, pos - lastPos)); // Skip delimiters. lastPos = str.find_first_not_of(delimiters, pos); // Find next non-delimiter. 
pos = str.find_first_of(delimiters, lastPos); } } ================================================ FILE: superpoint/utils.h ================================================ #pragma once #include #include "NvInfer.h" #include "cuda_runtime_api.h" #include "assert.h" #include #include #include #include #include using namespace nvinfer1; #define CHECK(status) \ do \ { \ auto ret = (status); \ if (ret != 0) \ { \ std::cout << "Cuda failure: " << ret; \ abort(); \ } \ } while (0) int read_files_in_dir(const char *p_dir_name, std::vector &file_names); std::map loadWeights(const std::string file); void tokenize(const std::string &str, std::vector &tokens, const std::string &delimiters = ","); ================================================ FILE: swin-transformer/semantic-segmentation/CMakeLists.txt ================================================ cmake_minimum_required(VERSION 3.4) project(swintransformer) set(OpenCV_DIR "D:\\opencv\\opencv346\\build") set(TENSORRT_DIR "D:\\TensorRT-7.0.0.11.Windows10.x86_64.cuda-10.2.cudnn7.6\\TensorRT-7.0.0.11") add_definitions(-std=c++11) add_definitions(-DAPI_EXPORTS) option(CUDA_USE_STATIC_CUDA_RUNTIME OFF) set(CMAKE_CXX_STANDARD 11) set(CMAKE_BUILD_TYPE Debug) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11 -Wall -Ofast -D_MWAITXINTRIN_H_INCLUDED") if(WIN32) include_directories(${PROJECT_SOURCE_DIR}/include) endif(WIN32) find_package(CUDA REQUIRED) message(STATUS " libraries: ${CUDA_LIBRARIES}") message(STATUS " include path: ${CUDA_INCLUDE_DIRS}") include_directories(${CUDA_INCLUDE_DIRS}) set(CUDA_NVCC_PLAGS ${CUDA_NVCC_PLAGS};-std=c++11; -g; -G;-gencode; arch=compute_75;code=sm_75) enable_language(CUDA) # һӺ ͻvsвҪֶcuda include_directories(${TENSORRT_DIR}\\include) link_directories(${TENSORRT_DIR}\\lib) # file(GLOB SOURCE_FILES "*.cu") # cuda_add_library(myplugins SHARED ${PROJECT_SOURCE_DIR}/yololayer.cu ${PROJECT_SOURCE_DIR}/API.h) # target_link_libraries(myplugins nvinfer cudart) # opencvϢ find_package(OpenCV QUIET NO_MODULE 
NO_DEFAULT_PATH NO_CMAKE_PATH NO_CMAKE_ENVIRONMENT_PATH NO_SYSTEM_ENVIRONMENT_PATH NO_CMAKE_PACKAGE_REGISTRY NO_CMAKE_BUILDS_PATH NO_CMAKE_SYSTEM_PATH NO_CMAKE_SYSTEM_PACKAGE_REGISTRY ) message(STATUS "OpenCV library status:") message(STATUS " version: ${OpenCV_VERSION}") message(STATUS " libraries: ${OpenCV_LIBS}") message(STATUS " include path: ${OpenCV_INCLUDE_DIRS}") include_directories(${OpenCV_INCLUDE_DIRS}) file(GLOB SOURCE_FILES "*.h" "*.cpp" "*.cu") add_executable(swintransformer ${SOURCE_FILES}) target_link_libraries(swintransformer nvinfer nvonnxparser) target_link_libraries(swintransformer cudart) target_link_libraries(swintransformer ${OpenCV_LIBS}) # if (WIN32) # message(STATUS "copy dll......: ${CMAKE_COMMAND} ${TENSORRT_DIR}") # add_custom_command(TARGET swintransformer POST_BUILD # COMMAND ${CMAKE_COMMAND} -E copy_if_different ${TENSORRT_DIR}/lib/myelin64_1.dll ./${CMAKE_BUILD_TYPE}/myelin64_1.dll # COMMAND ${CMAKE_COMMAND} -E copy_if_different ${TENSORRT_DIR}/lib/nvinfer.dll ./${CMAKE_BUILD_TYPE}/nvinfer.dll # COMMAND ${CMAKE_COMMAND} -E copy_if_different ${TENSORRT_DIR}/lib/nvinfer_plugin.dll ./${CMAKE_BUILD_TYPE}/nvinfer_plugin.dll # COMMAND ${CMAKE_COMMAND} -E copy_if_different ${TENSORRT_DIR}/lib/nvonnxparser.dll ./${CMAKE_BUILD_TYPE}/nvonnxparser.dll # COMMAND ${CMAKE_COMMAND} -E copy_if_different ${TENSORRT_DIR}/lib/nvparsers.dll ./${CMAKE_BUILD_TYPE}/nvparsers.dll # COMMAND ${CMAKE_COMMAND} -E copy_if_different ${TENSORRT_DIR}/lib/nvserialize.dll ./${CMAKE_BUILD_TYPE}/nvserialize.dll # COMMAND ${CMAKE_COMMAND} -E copy_if_different ${CUDA_TOOLKIT_ROOT_DIR}/bin/cublas64_10.dll ./${CMAKE_BUILD_TYPE}/cublas64_10.dll # ) # endif(WIN32) if(UNIX) add_definitions(-O2 -pthread) endif(UNIX) ================================================ FILE: swin-transformer/semantic-segmentation/README.md ================================================ # Swin Transform - Semantic Segmentation The Pytorch implementation is 
[microsoft/Swin-Transformer](https://github.com/microsoft/Swin-Transformer.git). Only Swin-T is supported for now; PRs adding other backbones are welcome.

## Authors

## How to Run

1. Generate the `.wts` file from the PyTorch `.pt` checkpoint, or download a `.wts` file from the model zoo.

```
git clone https://github.com/microsoft/Swin-Transformer.git
git clone https://github.com/wang-xinyu/tensorrtx.git
python gen_wts.py Swin-Transform.pt
// a file 'Swin-Transform.wts' will be generated.
```

2. Build tensorrtx/swin-transformer/semantic-segmentation and run it.

```
cd {tensorrtx}/swin-transformer/semantic-segmentation/
mkdir build
cd build
cp {microsoft}/Swin-Transformer/Swin-Transform.wts {tensorrtx}/swin-transformer/semantic-segmentation/build
cmake ..
make
sudo ./swintransformer -s [.wts] [.engine] // serialize model to plan file
sudo ./swintransformer -d [.engine] [image folder] // deserialize and run inference, the images in [image folder] will be processed.
```

## More Information

See the README on the [home page](https://github.com/wang-xinyu/tensorrtx).

================================================
FILE: swin-transformer/semantic-segmentation/UpsampleKernel.cu
================================================
#include "UpsmapleKernel.h"

/**
 * @brief Calculate the number of CUDA blocks needed for the upsample kernel.
(Cited from: 《GPU高性能编程CUDA实战》, the Chinese edition of "CUDA by Example", pp. 46-47)
 *
 * Computes ceil(total_thread_num / max_thread_num) with integer arithmetic.
 *
 * @param total_thread_num total number of CUDA threads wanted for the upsample
 * @param max_thread_num   number of threads per block (a GPU device property)
 * @return number of blocks of max_thread_num threads needed to cover total_thread_num threads
 */
int get_kernel_num(int total_thread_num, int max_thread_num)
{
    return (total_thread_num + max_thread_num - 1)/max_thread_num;
}

/**
 * @brief Return maxThreadsPerBlock of CUDA device 0.
 *
 * The status returned by cudaGetDeviceProperties() is not checked, and the device index is
 * hard-coded to 0 rather than taken from the current device.
 */
int get_max_thread_num()
{
    cudaDeviceProp prop;
    cudaGetDeviceProperties(&prop, 0);
    return prop.maxThreadsPerBlock;
}

/**
 * @brief Ratio input_size / output_size, computed in float.
 */
__host__ __forceinline__ float linear_upsampling_compute_scale(int input_size, int output_size)
{
    return float(input_size)/float(output_size) ;
}

/**
 * @brief Map a destination index to a fractional source coordinate.
 *
 * Evaluates scale * (dst_index + 0.5) - 0.5 and clamps negative results to 0.
 * The third parameter (intput_size) is not used by the body.
 */
__device__ __forceinline__ float linear_upsampling_compute_source_index(float scale, int dst_index, int intput_size)
{
    float src_idx = scale * (dst_index + 0.5)-0.5;
    return (src_idx>=0) ? src_idx : 0;
}

/**
 * @brief Flatten a (batch, channel, height, width) index into a linear offset.
 *
 * @param batch_total   number of elements per batch item (stride of batch_idx)
 * @param channel_total number of elements per channel (stride of channel_idx)
 * @param width         number of elements per row (stride of height_idx)
 * @return batch_idx * batch_total + channel_idx * channel_total + height_idx * width + width_idx
 */
__device__ __forceinline__ int get_index(const int batch_idx, const int channel_idx, const int height_idx, const int width_idx, const int batch_total, const int channel_total, const int width)
{
    int ret_idx = batch_idx * batch_total + channel_idx * channel_total + height_idx * width + width_idx;
    return ret_idx;
}

/**
 * @brief Upsampling kernel; each thread with index < n handles one output pixel position.
 *
 * The flat thread index is split into an output column (index % output_w) and an output
 * row (index / output_w), which are then mapped back to fractional source coordinates.
 *
 * @tparam T element type of the input and output buffers
 * @param n number of thread indices that do work (threads with index >= n do nothing)
 * @param input_b, input_c, input_h, input_w input shape, such as [batch, channel, height, width]
 * @param output_h, output_w output height and width
 * @param rate_h scale passed to linear_upsampling_compute_source_index() for rows
 * @param rate_w scale passed to linear_upsampling_compute_source_index() for columns
 * @param inputs source buffer
 * @param outputs destination buffer
 *
 * NOTE(review): the template parameter list on the next line appears to have been lost in
 * this copy of the file; presumably it declares the type parameter T — confirm.
 */
template
__global__ void BilinearKernel(
    const int n,
    int input_b,
    int input_c,
    int input_h,
    int input_w,
    int output_h,
    int output_w,
    const float rate_h,
    const float rate_w,
    const T* inputs,
    T* outputs)
{
    int index = threadIdx.x + blockIdx.x * blockDim.x;
    if(index < n)
    {
        // Output column / row handled by this thread
        const int w2 = index % output_w;
        const int h2 = index / output_w;

        // Fractional source row and its integer part
        const float h1r = linear_upsampling_compute_source_index(rate_h, h2, input_h);
        const int h1 = int(h1r);
        const int h1p = (h1 < input_h - 1) ?
1 : 0; const float h1lambda = h1r - h1; const float h0lambda = 1 - h1lambda; const float w1r = linear_upsampling_compute_source_index(rate_w, w2, input_w); const int w1 = int(w1r); const int w1p = (w1 < input_w - 1) ? 1 : 0; const float w1lambda = w1r - w1; const float w0lambda = 1 - w1lambda; int s_batch_total_1 = input_c * input_h * input_w; int s_channel_total_1 = input_h * input_w; int s_batch_total_2 = input_c * output_h * output_w; int s_channel_total_2 = output_h * output_w; const int batch_size = input_b; const int channel_size = input_c; for(int b_idx=0; b_idx<<< kernel_num, max_threads, 0, stream>>>(n,input_b,input_c,input_h,input_w, output_h, output_w, rate_h, rate_w, static_cast(inputs), static_cast(outputs)); return 0; } ================================================ FILE: swin-transformer/semantic-segmentation/UpsamplePlugin.cpp ================================================ #include #include "UpsmapleKernel.h" #include "UpsamplePlugin.h" #include #include using namespace nvinfer1; // Upsample plugin specific constants namespace { static const char* UPSAMPLE_PLUGIN_VERSION{"1"}; static const char* UPSAMPLE_PLUGIN_NAME{"UpsamplePlugin"}; } // Static class fields initialization PluginFieldCollection UpsamplePluginCreator::mFC{}; std::vector UpsamplePluginCreator::mPluginAttributes; REGISTER_TENSORRT_PLUGIN(UpsamplePluginCreator); template void writeToBuffer(char*& buffer, const T& val) { *reinterpret_cast(buffer) = val; buffer += sizeof(T); } // Helper function for deserializing plugin template T readFromBuffer(const char*& buffer) { T val = *reinterpret_cast(buffer); buffer += sizeof(T); return val; } UpsamplePlugin::UpsamplePlugin(const std::string name, float scale_h, float scale_w) : mLayerName(name) , mScaleFactor_h(scale_h) , mScaleFactor_w(scale_w) { mInputShape.c() = -1; mInputShape.h() = -1; mInputShape.w() = -1; mInputVolume = 0; } UpsamplePlugin::UpsamplePlugin(const std::string name, const void* data, size_t length) : mLayerName(name) { 
const char *d = static_cast(data);
    const char *a = d; // start of the buffer, used for the length check below

    // Fields are read in the same order serialize() writes them.
    mScaleFactor_h = readFromBuffer(d);
    mScaleFactor_w = readFromBuffer(d);
    mInputVolume = readFromBuffer(d);
    mInputShape.c() = readFromBuffer(d);
    mInputShape.h() = readFromBuffer(d);
    mInputShape.w() = readFromBuffer(d);

    // The whole serialized blob must have been consumed.
    assert(d == (a + length));
}

// Returns the plugin type name (UPSAMPLE_PLUGIN_NAME).
const char* UpsamplePlugin::getPluginType() const
{
    return UPSAMPLE_PLUGIN_NAME;
}

// Returns the plugin version string (UPSAMPLE_PLUGIN_VERSION).
const char* UpsamplePlugin::getPluginVersion() const
{
    return UPSAMPLE_PLUGIN_VERSION;
}

// The plugin produces exactly one output tensor.
int UpsamplePlugin::getNbOutputs() const
{
    return 1;
}

// Output dimensions for the single 3-D input: the first dimension is kept, the second and
// third are multiplied by the height / width scale factors and truncated to int.
Dims UpsamplePlugin::getOutputDimensions(int index, const Dims* inputs, int nbInputDims)
{
    assert(index == 0);
    assert(nbInputDims == 1);
    assert(inputs[0].nbDims == 3);
    return nvinfer1::DimsCHW{inputs[0].d[0],int(inputs[0].d[1]*mScaleFactor_h), int(inputs[0].d[2]*mScaleFactor_w)};
}

// Nothing to set up; always returns 0.
int UpsamplePlugin::initialize()
{
    //printf("UpsamplePlugin::initialize\n");
    return 0;
}

// Runs the upsample on `stream` by forwarding the stored configuration to UpsampleInference().
// Returns the status reported by UpsampleInference(). The workspace argument is unused.
int UpsamplePlugin::enqueue(int batchSize, const void* const* inputs, void** outputs, void*, cudaStream_t stream)
{
    //printf("UpsamplePlugin::enqueue\n");
    int status = -1;

    // Our plugin outputs only one tensor
    void* output = outputs[0];

    // Launch CUDA kernel wrapper and save its return value
    status = UpsampleInference(stream, mInputVolume, batchSize, mInputShape.c(), mInputShape.h(), mInputShape.w(), mScaleFactor_h,mScaleFactor_w, inputs[0], output);
    return status;
}

// Number of bytes written by serialize(): the two scale factors, the stored volume and
// the three components of the input shape.
size_t UpsamplePlugin::getSerializationSize() const
{
    //printf("UpsamplePlugin::getSerializationSize\n");
    return sizeof(mScaleFactor_h) + sizeof(mScaleFactor_w) + sizeof(mInputVolume) + sizeof(mInputShape.c()) + sizeof(mInputShape.h()) + sizeof(mInputShape.w());
}

// Writes the plugin state into `buffer` in the order expected by the deserializing
// constructor, then checks that exactly getSerializationSize() bytes were written.
void UpsamplePlugin::serialize(void* buffer) const
{
    //printf("UpsamplePlugin::serialize\n");
    char *d = static_cast(buffer);
    const char *a = d;
    writeToBuffer(d, mScaleFactor_h);
    writeToBuffer(d, mScaleFactor_w);
    writeToBuffer(d, mInputVolume);
    writeToBuffer(d, mInputShape.c());
    writeToBuffer(d, mInputShape.h());
    writeToBuffer(d, mInputShape.w());
    assert(d == a
+ getSerializationSize()); } void UpsamplePlugin::configureWithFormat(const Dims* inputs, int nbInputs, const Dims* outputs, int nbOutputs, DataType type, PluginFormat format, int) { assert(nbOutputs == 1); assert(type == DataType::kFLOAT); assert(format == PluginFormat::kNCHW); assert(inputs[0].nbDims == 3); size_t volume = int(inputs[0].d[1]*mScaleFactor_h) * int(inputs[0].d[2]*mScaleFactor_w); mInputVolume = volume; mInputShape.c() = inputs[0].d[0]; mInputShape.h() = inputs[0].d[1]; mInputShape.w() = inputs[0].d[2]; } bool UpsamplePlugin::supportsFormat(DataType type, PluginFormat format) const { if (type == DataType::kFLOAT && format == PluginFormat::kNCHW) return true; else return false; } void UpsamplePlugin::terminate() {} void UpsamplePlugin::destroy() { // This gets called when the network containing plugin is destroyed delete this; } IPluginV2* UpsamplePlugin::clone() const { return new UpsamplePlugin(mLayerName, mScaleFactor_h, mScaleFactor_w); } void UpsamplePlugin::setPluginNamespace(const char* libNamespace) { mNamespace = libNamespace; } const char* UpsamplePlugin::getPluginNamespace() const { return mNamespace.c_str(); } UpsamplePluginCreator::UpsamplePluginCreator() { mPluginAttributes.emplace_back(PluginField("scaleFactor", nullptr, PluginFieldType::kFLOAT32, 2)); mFC.nbFields = mPluginAttributes.size(); mFC.fields = mPluginAttributes.data(); } const char* UpsamplePluginCreator::getPluginName() const { return UPSAMPLE_PLUGIN_NAME; } const char* UpsamplePluginCreator::getPluginVersion() const { return UPSAMPLE_PLUGIN_VERSION; } const PluginFieldCollection* UpsamplePluginCreator::getFieldNames() { return &mFC; } IPluginV2* UpsamplePluginCreator::createPlugin(const char* name, const PluginFieldCollection* fc) { float scaleFactor_h = 0.f; float scaleFactor_w = 0.f; const PluginField* fields = fc->fields; assert(fc->nbFields == 1); for (int i = 0; i < fc->nbFields; i++){ if (strcmp(fields[i].name, "scaleFactor") == 0) { assert(fields[i].type == 
PluginFieldType::kFLOAT32); scaleFactor_h = *(static_cast(fields[i].data)); scaleFactor_w = *(static_cast(fields[i].data)+1); //std::cout< #include using namespace nvinfer1; class UpsamplePlugin : public IPluginV2 { public: UpsamplePlugin(const std::string name, float scale_h,float scale_w); UpsamplePlugin(const std::string name, const void* data, size_t length); // It doesn't make sense to make UpsamplePlugin without arguments, so we delete default constructor. UpsamplePlugin() = delete; int getNbOutputs() const override; Dims getOutputDimensions(int index, const Dims* inputs, int nbInputDims) override; int initialize() override; void terminate() override; size_t getWorkspaceSize(int) const override { return 0; }; int enqueue(int batchSize, const void* const* inputs, void** outputs, void* workspace, cudaStream_t stream) override; size_t getSerializationSize() const override; void serialize(void* buffer) const override; void configureWithFormat(const Dims* inputDims, int nbInputs, const Dims* outputDims, int nbOutputs, DataType type, PluginFormat format, int maxBatchSize) override; bool supportsFormat(DataType type, PluginFormat format) const override; const char* getPluginType() const override; const char* getPluginVersion() const override; void destroy() override; nvinfer1::IPluginV2* clone() const override; void setPluginNamespace(const char* pluginNamespace) override; const char* getPluginNamespace() const override; private: const std::string mLayerName; bool mAlignCorners; float mScaleFactor_h; float mScaleFactor_w; size_t mInputVolume; DimsCHW mInputShape; std::string mNamespace; }; class UpsamplePluginCreator : public IPluginCreator { public: UpsamplePluginCreator(); const char* getPluginName() const override; const char* getPluginVersion() const override; const PluginFieldCollection* getFieldNames() override; IPluginV2* createPlugin(const char* name, const PluginFieldCollection* fc) override; IPluginV2* deserializePlugin(const char* name, const void* 
serialData, size_t serialLength) override; void setPluginNamespace(const char* pluginNamespace) override; const char* getPluginNamespace() const override; private: static PluginFieldCollection mFC; static std::vector mPluginAttributes; std::string mNamespace; }; #endif ================================================ FILE: swin-transformer/semantic-segmentation/UpsmapleKernel.h ================================================ #ifndef UPSAMPLE_KERNEL_H #define UPSAMPLE_KERNEL_H #include #include "NvInfer.h" int UpsampleInference( cudaStream_t stream, int n, int input_b, int input_c, int input_h, int input_w, float scale_h, float scale_w, const void* inputs, void* outputs); #endif ================================================ FILE: swin-transformer/semantic-segmentation/common.hpp ================================================ #ifndef COMMON_HPP #define COMMON_HPP #include "layerNorm.h" #include "NvInfer.h" #include "NvInfer.h" #include "NvInferPlugin.h" #include "cuda_runtime_api.h" #include #include #include #include #include #include #include using namespace nvinfer1; #define CHECK(status) \ do\ {\ auto ret = (status);\ if (ret != 0)\ {\ std::cerr << "Cuda failure: " << ret << std::endl;\ abort();\ }\ } while (0) void mblobFromImages(cv::InputArrayOfArrays images_, cv::OutputArray blob_, cv::Size size, const cv::Scalar& mean_, const cv::Scalar& std_, bool swapRB, bool crop) { //CV_TRACE_FUNCTION(); std::vector images; images_.getMatVector(images); CV_Assert(!images.empty()); for (int i = 0; i < images.size(); i++) { cv::Size imgSize = images[i].size(); if (size == cv::Size()) size = imgSize; if (size != imgSize) { if (crop) { float resizeFactor = std::max(size.width / (float)imgSize.width, size.height / (float)imgSize.height); resize(images[i], images[i], cv::Size(), resizeFactor, resizeFactor, cv::INTER_LINEAR); cv::Rect crop(cv::Point(0.5 * (images[i].cols - size.width), 0.5 * (images[i].rows - size.height)), size); images[i] = images[i](crop); } else 
// Value-returning convenience wrapper around mblobFromImages(): forwards every
// argument unchanged and hands back the blob that mblobFromImages() filled in.
cv::Mat BlobFromImages(cv::InputArrayOfArrays images, cv::Size size,
                       const cv::Scalar& mean, const cv::Scalar& std_num,
                       bool swapRB, bool crop)
{
    cv::Mat packed;
    mblobFromImages(images, packed, size, mean, std_num, swapRB, crop);
    return packed;
}
// Layer normalisation assembled from built-in TensorRT layers (no plugin):
//   y = (x - mean(x)) / sqrt(mean((x - mean(x))^2) + 1e-5) * weight + bias
//
// m_Network : network the layers are appended to.
// weightMap : must contain "<lname>.weight" and "<lname>.bias".
// input     : tensor to normalise; the statistics are reduced over axis 1.
//             NOTE(review): the Dims2{1,1} epsilon constant implies a 2-D input -- confirm with callers.
// lname     : prefix used to look up the affine parameters.
// Returns the output tensor of the final affine (scale) layer.
//
// Every buffer handed to TensorRT through a Weights struct is only *referenced*
// by the network and is read when the engine is built, so it has to outlive
// this function. The buffers below are therefore heap allocated and
// intentionally not freed here.
ITensor* layerNorm(INetworkDefinition *m_Network, std::map<std::string, Weights> weightMap, ITensor *input, string lname)
{
    // reduceAxes is a bitmask: 2 == (1 << 1) selects axis 1; keepDimensions = true.
    auto mean = m_Network->addReduce(*input, ReduceOperation::kAVG, 2, true);
    assert(mean);

    auto sub_mean = m_Network->addElementWise(*input, *mean->getOutput(0), ElementWiseOperation::kSUB);
    assert(sub_mean);

    // (x - mean)^2, expressed as an element-wise product with itself.
    auto pow2 = m_Network->addElementWise(*sub_mean->getOutput(0), *sub_mean->getOutput(0),
                                          ElementWiseOperation::kPROD);
    assert(pow2);
    debug_print(pow2->getOutput(0), "pow2");

    // Variance: mean of the squared deviations over the same axis.
    auto pow_mean = m_Network->addReduce(*pow2->getOutput(0), ReduceOperation::kAVG, 2, true);
    assert(pow_mean);
    debug_print(pow_mean->getOutput(0), "pow_mean");

    // Epsilon lives on the heap: pointing the Weights at a stack variable would
    // leave the constant layer with a dangling pointer after we return.
    float* epsVal = reinterpret_cast<float*>(malloc(sizeof(float)));
    assert(epsVal);
    *epsVal = 1e-5f;
    Weights EPS{ DataType::kFLOAT, epsVal, 1 };
    auto eps = m_Network->addConstant(Dims2{1, 1}, EPS);
    assert(eps);

    auto add_eps = m_Network->addElementWise(*pow_mean->getOutput(0), *eps->getOutput(0),
                                             ElementWiseOperation::kSUM);
    assert(add_eps);

    auto stddev = m_Network->addUnary(*add_eps->getOutput(0), UnaryOperation::kSQRT);
    assert(stddev);

    auto div = m_Network->addElementWise(*sub_mean->getOutput(0), *stddev->getOutput(0),
                                         ElementWiseOperation::kDIV);
    assert(div);
    debug_print(div->getOutput(0), "div");

    string weightsFile = lname + ".weight";
    string biasFile = lname + ".bias";

    // The size of the last input dimension gives the element count of the power buffer.
    int d_model = input->getDimensions().d[input->getDimensions().nbDims - 1];
    cout << "d_model = " << d_model << endl;

    // Power of 1.0 everywhere: the scale layer then computes x * weight + bias.
    float* pval = reinterpret_cast<float*>(malloc(sizeof(float) * d_model));
    assert(pval);
    for (int i = 0; i < d_model; i++)
    {
        pval[i] = 1.0;
    }
    Weights norm1_power{ DataType::kFLOAT, pval, d_model };

    // addScaleNd takes (shift, scale, power): bias is the shift, weight the scale.
    auto affine = m_Network->addScaleNd(
        *div->getOutput(0),
        ScaleMode::kELEMENTWISE,
        weightMap[biasFile],
        weightMap[weightsFile],
        norm1_power, 1);
    assert(affine);

    return affine->getOutput(0);
}
Hp-shift_size && j >= Wp-shift_size) mask_param[i*Wp + j] = 5.0; else if(i >= Hp-shift_size && j= Hp-shift_size && j>=Wp-window_size && j < Wp-shift_size) mask_param[i*Wp + j] = 7.0; else if(i >= Hp-shift_size && j >= Wp-shift_size) mask_param[i*Wp + j] = 8.0; else{ cout<<" i && j not limit"<addConstant(Dims4{1,Hp,Wp,1},Mask_param); auto img_mask_shuffle = m_Network->addShuffle(*img_mask->getOutput(0)); Dims shuffle1_dims; shuffle1_dims.nbDims = 6; int dims[] = {1,Hp/window_size,window_size,Wp/window_size,window_size,1}; for(int i = 0 ; i < 6; i++) shuffle1_dims.d[i] = dims[i]; img_mask_shuffle->setReshapeDimensions(shuffle1_dims); img_mask_shuffle->setSecondTranspose(Permutation{0,1,3,2,4,5}); auto img_mask_shuffle2 = m_Network->addShuffle(*img_mask_shuffle->getOutput(0)); img_mask_shuffle2->setReshapeDimensions(Dims3{-1,1,window_size*window_size}); auto img_mask_shuffle3 = m_Network->addShuffle(*img_mask_shuffle->getOutput(0)) ; img_mask_shuffle3->setReshapeDimensions(Dims3{-1,window_size*window_size,1}); auto atten_mask = m_Network->addElementWise(*img_mask_shuffle2->getOutput(0),*img_mask_shuffle3->getOutput(0),ElementWiseOperation::kSUB); auto creator = getPluginRegistry()->getPluginCreator("fillmaskLayer_TRT", "1"); const PluginFieldCollection* pluginData = creator->getFieldNames(); IPluginV2 *pluginObj = creator->createPlugin("fillmask", pluginData); ITensor* inputTensors[] = {atten_mask->getOutput(0)}; auto fillmask = m_Network->addPluginV2(inputTensors, 1, *pluginObj); debug_print(fillmask->getOutput(0),"imgMask"); return fillmask->getOutput(0); } ITensor* trt_transform_pad(INetworkDefinition *m_Network,ITensor *input,int window_size) { int h = input->getDimensions().d[0]; int w = input->getDimensions().d[1]; int c = input->getDimensions().d[2]; int pad_h = (window_size - h%window_size)%window_size; int pad_w = (window_size - w%window_size)%window_size; ITensor* temp = input; if(pad_h != 0) { Weights pad1{DataType::kFLOAT,nullptr,pad_h*w*c}; 
// Cyclic shift of a tensor along one or more axes, built from two slices and a
// concatenation per axis.
//
// m_Network : network the slice/concat layers are appended to.
// input     : tensor to roll.
// shifts    : shift amount per entry; > 0 moves elements towards higher indices
//             (the last `shift` elements wrap to the front), < 0 moves them
//             towards lower indices, 0 leaves that axis untouched.
// dims      : axis per entry; dims[i] - 1 is used as the TensorRT axis
//             (callers in this file pass {1,2} for the first two axes).
// Returns the rolled tensor (the input itself when every shift is 0).
ITensor* trt_swinRoll(INetworkDefinition *m_Network, ITensor *input, vector<int> shifts, vector<int> dims)
{
    assert(dims.size() >= shifts.size() && "every shift needs a matching axis");

    // Rolling never changes the shape, so the input extents hold for every step.
    const Dims input_dim = input->getDimensions();
    const int nbdims = input_dim.nbDims;
    ITensor *temp = input;

    for (size_t i = 0; i < shifts.size(); i++)
    {
        const int shift = shifts[i];
        if (shift == 0)
            continue;

        const int axis = dims[i] - 1;
        assert(axis >= 0 && axis < nbdims && "roll axis out of range");
        const int extent = input_dim.d[axis];

        // The tensor is cut at `cut` into head = [0, cut) and tail = [cut, extent);
        // the result is always tail followed by head.
        const int cut = shift > 0 ? extent - shift : -shift;

        Dims start, size, stride;
        start.nbDims = nbdims;
        size.nbDims = nbdims;
        stride.nbDims = nbdims;
        for (int j = 0; j < nbdims; j++)
        {
            start.d[j] = 0;
            size.d[j] = input_dim.d[j];
            stride.d[j] = 1;
        }

        size.d[axis] = cut;
        auto head = m_Network->addSlice(*temp, start, size, stride);
        if (shift < 0)
            debug_print(head->getOutput(0), "cat1 dims : ");

        start.d[axis] = cut;
        size.d[axis] = extent - cut;
        auto tail = m_Network->addSlice(*temp, start, size, stride);
        if (shift < 0)
            debug_print(tail->getOutput(0), "cat2 dims : ");

        ITensor *pieces[] = { tail->getOutput(0), head->getOutput(0) };
        auto joined = m_Network->addConcatenation(pieces, 2);
        joined->setAxis(axis);
        temp = joined->getOutput(0);
    }
    return temp;
}
int len_fcb = fcB.count; if(!bias) { cout<getOutput(0),lname); return fc_w_mm->getOutput(0); } Dims fcBdims; fcBdims.nbDims = input->getDimensions().nbDims; if(fcBdims.nbDims == 2) { fcBdims.d[0] = 1; fcBdims.d[1] = len_fcb; } else { fcBdims.d[0] = 1; fcBdims.d[1] = 1; fcBdims.d[2] = len_fcb; } auto fc_b_constant = m_Network->addConstant(fcBdims,fcB); auto fc = m_Network->addElementWise(*fc_w_mm->getOutput(0),*fc_b_constant->getOutput(0),ElementWiseOperation::kSUM); debug_print(fc->getOutput(0),lname); return fc->getOutput(0); } ITensor* trt_trainsform_WindowAttention(INetworkDefinition *m_Network,std::map weightMap,ITensor *input, ITensor* mask,string lname,int dim, int num_heads,int window_size, int shift_size) { int b = input->getDimensions().d[0]; int n = input->getDimensions().d[1]; int c = input->getDimensions().d[2]; auto qkv = trt_swinLinear(m_Network,weightMap,input,lname+".qkv"); Dims qkv_dim; qkv_dim.nbDims = 5; int d[5] = {b,n,3,num_heads,c/num_heads}; for(int i = 0; i < 5; i++) qkv_dim.d[i] = d[i]; Permutation qkv_p; int p[5] = {2, 0, 3, 1, 4}; for(int i = 0; i < 5; i++) qkv_p.order[i] = p[i]; auto qkv_shuffle = shuffle_reshapeApermute(m_Network,qkv,qkv_dim,qkv_p,true); Dims qkvDims = qkv_shuffle->getDimensions(); Dims qstart,kstart,vstart,sizes,stride; qstart.nbDims = 5; kstart.nbDims = 5; vstart.nbDims = 5; sizes.nbDims = 5; stride.nbDims = 5; for(int i = 0; i < 5; i++) { if(i == 0) { qstart.d[0] = 0; kstart.d[0] = 1; vstart.d[0] = 2; sizes.d[0] = 1; stride.d[0] =1; } else{ qstart.d[i] = 0; kstart.d[i] = 0; vstart.d[i] = 0; sizes.d[i] = qkvDims.d[i]; stride.d[i] =1; } } auto q = m_Network->addSlice(*qkv_shuffle,qstart,sizes,stride); auto k = m_Network->addSlice(*qkv_shuffle,kstart,sizes,stride); auto v = m_Network->addSlice(*qkv_shuffle,vstart,sizes,stride); // q * s int len = 1; Weights scale_w{DataType::kFLOAT,nullptr,len}; float *scale = new float[len]; for(int i = 0 ; i < len; i++) scale[i] = 1 / sqrt(dim/num_heads); scale_w.values = scale; Dims 
scale_dim; scale_dim.nbDims = 5; for(int i = 0 ; i < 5; i++) scale_dim.d[i] = 1; auto Scale = m_Network->addConstant(scale_dim,scale_w); auto qs = m_Network->addElementWise(*q->getOutput(0),*Scale->getOutput(0),ElementWiseOperation::kPROD); auto qs_ = m_Network->addShuffle(*qs->getOutput(0)); qs_->setReshapeDimensions(Dims4{qkvDims.d[1],qkvDims.d[2],qkvDims.d[3],qkvDims.d[4]}); auto k_ = m_Network->addShuffle(*k->getOutput(0)); k_->setReshapeDimensions(Dims4{qkvDims.d[1],qkvDims.d[2],qkvDims.d[3],qkvDims.d[4]}); auto attn = m_Network->addMatrixMultiply(*qs_->getOutput(0),MatrixOperation::kNONE, *k_->getOutput(0),MatrixOperation::kTRANSPOSE); auto relatbias = m_Network->addConstant(Dims2{(2*window_size -1)*(2*window_size -1),num_heads},weightMap[lname + ".relative_position_bias_table"]); Dims r_i_dims; r_i_dims.nbDims = 1; r_i_dims.d[0] = window_size*window_size * window_size*window_size; Weights index{DataType::kINT32,nullptr,r_i_dims.d[0]}; int* idx = new int[r_i_dims.d[0]]; for (int i = 0; i < r_i_dims.d[0]; i++) { idx[i] =(int)((float*)weightMap[lname+".relative_position_index"].values)[i]; } //idx = (int*)weightMap[lname+".relative_position_index"].values; //cout<<"idx = "<<((float*)weightMap[lname+".relative_position_index"].values)[0]<addConstant(r_i_dims,index); auto relat = m_Network->addGather(*relatbias->getOutput(0),*relatidx->getOutput(0),0); auto relat_view = shuffle_reshapeApermute(m_Network,relat->getOutput(0), Dims4{1,window_size*window_size,window_size*window_size,-1}, Permutation{0,3,1,2},true); auto attn_rv = m_Network->addElementWise(*attn->getOutput(0),*relat_view,ElementWiseOperation::kSUM); ITensor *Attn_rv = attn_rv->getOutput(0); if (mask != nullptr) { Dims maskdims; maskdims.nbDims = mask->getDimensions().nbDims +1; maskdims.d[0] = mask->getDimensions().d[0]; maskdims.d[1] = 1; for(int i = 2; i< maskdims.nbDims; i++) { maskdims.d[i] = mask->getDimensions().d[i-1]; } auto maskshuffle = m_Network->addShuffle(*mask); 
maskshuffle->setReshapeDimensions(maskdims); auto attn_rnM = m_Network->addElementWise(*attn_rv->getOutput(0),*maskshuffle->getOutput(0),ElementWiseOperation::kSUM); Attn_rv = attn_rnM->getOutput(0); } auto attn_rv_s = m_Network->addSoftMax(*Attn_rv); attn_rv_s->setAxes(8); auto v_ = m_Network->addShuffle(*v->getOutput(0)); v_->setReshapeDimensions(Dims4{qkvDims.d[1],qkvDims.d[2],qkvDims.d[3],qkvDims.d[4]}); auto attn_v = m_Network->addMatrixMultiply(*attn_rv_s->getOutput(0),MatrixOperation::kNONE, *v_->getOutput(0),MatrixOperation::kNONE); auto x_reshape = shuffle_reshapeApermute(m_Network,attn_v->getOutput(0),Dims3{b,n,c},Permutation{0,2,1,3},false); auto x_linear = trt_swinLinear(m_Network,weightMap,x_reshape,lname+".proj"); return x_linear; } ITensor* trt_window_reverse(INetworkDefinition *m_Network, ITensor *input, int window_size, int H, int W) { Dims viewDims; viewDims.nbDims = 5; int d[5] = {H/window_size,W/window_size,window_size,window_size,-1}; for(int i = 0; i < 5; i++) viewDims.d[i] = d[i]; auto x_view = shuffle_reshape(m_Network,input,viewDims); auto output = shuffle_reshapeApermute(m_Network,x_view,Dims3{H,W,-1},Permutation{0,2,1,3,4},false); return output; } ITensor* gelu(INetworkDefinition *m_Network,ITensor *input) { auto creator = getPluginRegistry()->getPluginCreator("geluLayer_TRT", "1"); const PluginFieldCollection* pluginData = creator->getFieldNames(); IPluginV2 *pluginObj = creator->createPlugin("gelu", pluginData); ITensor* inputTensors[] = {input}; auto g = m_Network->addPluginV2(inputTensors, 1, *pluginObj); return g->getOutput(0); } //ITensor* adaptiveAvgPool2d(INetworkDefinition *m_Network,ITensor *input) //{ // auto creator = getPluginRegistry()->getPluginCreator("adaptiveAvgPooling_TRT", "1"); // const PluginFieldCollection* pluginData = creator->getFieldNames(); // IPluginV2 *pluginObj = creator->createPlugin("apAvgPool", pluginData); // ITensor* inputTensors[] = {input}; // auto g = m_Network->addPluginV2(inputTensors, 1, 
*pluginObj); // return g->getOutput(0); //} ITensor* trt_transform_mlp(INetworkDefinition *m_Network,std::map weightMap,ITensor *input, string lname,int dim,int mlp_ratio = 4) { // auto fc1 = m_Network->addFullyConnected(*input,dim * mlp_ratio, // weightMap[lname+".fc1.weight"],weightMap[lname+".fc1.bias"]); auto fc1 = trt_swinLinear(m_Network,weightMap,input,lname+".fc1"); auto act = gelu(m_Network,fc1); // auto fc2 = m_Network->addFullyConnected(*act,dim , // weightMap[lname+".fc2.weight"],weightMap[lname+".fc2.bias"]); auto fc2 = trt_swinLinear(m_Network,weightMap,act,lname+".fc2"); return fc2; } ITensor* blk(INetworkDefinition *m_Network,std::map weightMap,ITensor *input, ITensor* mask, string lname, int hw,int dim, int num_heads,int window_size,int shift_size,int mlp_ratio = 4) { int c = input->getDimensions().d[input->getDimensions().nbDims - 1]; auto x = input; auto norm1 = m_layerNorm(m_Network,weightMap,x,lname+".norm1"); //auto norm1 = x; auto view1 = shuffle_reshape(m_Network,norm1,Dims3{hw,hw,c}); auto pad = trt_transform_pad(m_Network,view1,window_size); int hp = pad->getDimensions().d[0]; int wp = pad->getDimensions().d[1]; ITensor* shifted_x; ITensor* atten_mask = nullptr; if(shift_size > 0) { shifted_x = trt_swinRoll(m_Network,pad,{-3,-3},{1,2}); atten_mask = mask; } else { shifted_x = pad; } auto x_windows = trt_transform_window_partition(m_Network,shifted_x,window_size); auto x_atten_windows = trt_trainsform_WindowAttention(m_Network,weightMap,x_windows,atten_mask,lname+".attn",dim,num_heads, window_size,shift_size); auto x_atten_windows_view = shuffle_reshape(m_Network,x_atten_windows,Dims4{-1,window_size,window_size,c}); shifted_x = trt_window_reverse(m_Network,x_atten_windows_view,window_size,hp,wp); if(shift_size > 0) { x = trt_swinRoll(m_Network,shifted_x,{3,3},{1,2}); } else { x = shifted_x; } if(hw < hp){ auto sss = m_Network->addSlice(*x,Dims3{0,0,0},Dims3{hw,hw,c},Dims3{1,1,1}); x = sss->getOutput(0); } x = 
shuffle_reshape(m_Network,x,Dims2{hw*hw,c}); x = m_Network->addElementWise(*x,*input,ElementWiseOperation::kSUM)->getOutput(0); auto norm2 = m_layerNorm(m_Network,weightMap,x,lname+".norm2"); //auto norm2 = x; auto mlp = trt_transform_mlp(m_Network,weightMap,norm2,lname+".mlp",dim); auto out= m_Network->addElementWise(*x,*mlp,ElementWiseOperation::kSUM)->getOutput(0); debug_print(out, "blk"); return out; } ITensor* downsample(INetworkDefinition* m_Network,std::map weightMap,ITensor *input, string lname, int hw) { int c = input->getDimensions().d[input->getDimensions().nbDims - 1]; auto x = shuffle_reshape(m_Network,input,Dims3{hw,hw,c}); auto x0 = m_Network->addSlice(*x,Dims3{0,0,0},Dims3{hw/2,hw/2,c},Dims3{2,2,1}); auto x1 = m_Network->addSlice(*x,Dims3{1,0,0},Dims3{hw/2,hw/2,c},Dims3{2,2,1}); auto x2 = m_Network->addSlice(*x,Dims3{0,1,0},Dims3{hw/2,hw/2,c},Dims3{2,2,1}); auto x3 = m_Network->addSlice(*x,Dims3{1,1,0},Dims3{hw/2,hw/2,c},Dims3{2,2,1}); ITensor* inputTensors[] = { x0->getOutput(0), x1->getOutput(0), x2->getOutput(0), x3->getOutput(0) }; auto cat = m_Network->addConcatenation(inputTensors, 4); cat->setAxis(2); auto cat_view = shuffle_reshape(m_Network,cat->getOutput(0),Dims2{-1,4*c}); auto norm = m_layerNorm(m_Network,weightMap,cat_view,lname+".norm"); //auto norm = cat_view; auto reduction = trt_swinLinear(m_Network,weightMap,norm,lname+".reduction",false); return reduction; } ITensor* addBatchNorm2d( INetworkDefinition *network, std::map weightMap, ITensor* input, const std::string& lname, float eps = 1e-5 ) { float *gamma = (float*)(weightMap[lname + ".weight"].values); float *beta = (float*)(weightMap[lname + ".bias"].values); float *mean = (float*)(weightMap[lname + ".running_mean"].values); float *var = (float*)(weightMap[lname + ".running_var"].values); int len = weightMap[lname + ".running_var"].count; float *scval = reinterpret_cast(malloc(sizeof(float) * len)); for (int i = 0; i < len; i++) { scval[i] = gamma[i] / sqrt(var[i] + eps); } 
Weights scale{ DataType::kFLOAT, scval, len }; float *shval = reinterpret_cast(malloc(sizeof(float) * len)); for (int i = 0; i < len; i++) { shval[i] = beta[i] - mean[i] * gamma[i] / sqrt(var[i] + eps); } Weights shift{ DataType::kFLOAT, shval, len }; float *pval = reinterpret_cast(malloc(sizeof(float) * len)); for (int i = 0; i < len; i++) { pval[i] = 1.0; } Weights power{ DataType::kFLOAT, pval, len }; weightMap[lname + ".scale"] = scale; weightMap[lname + ".shift"] = shift; weightMap[lname + ".power"] = power; IScaleLayer* scale_1 = network->addScale(*input, ScaleMode::kCHANNEL, shift, scale, power); assert(scale_1); return scale_1->getOutput(0); } ITensor* transform_lateral_conv(INetworkDefinition* m_Network,std::map weightMap,ITensor* input, string lname, int k = 1, int s = 1,int out_features = 512) { Weights empty{DataType::kFLOAT,nullptr,0}; auto conv = m_Network->addConvolutionNd(*input,out_features,Dims2{k,k},weightMap[lname+".conv.weight"],empty); conv->setStrideNd(Dims2{s,s}); conv->setNbGroups(1); conv->setPaddingNd(Dims2{k/2,k/2}); ITensor* bn = addBatchNorm2d(m_Network,weightMap,conv->getOutput(0),lname+".bn"); auto act = m_Network->addActivation(*bn,ActivationType::kRELU); return act->getOutput(0); } ITensor* resize(INetworkDefinition* m_Network, ITensor* input, int grid) { float scale_h = 2.0f; float scale_w = 2.0f; scale_h = 1.0*grid / input->getDimensions().d[1]; scale_w = 1.0*grid / input->getDimensions().d[2]; auto creator = getPluginRegistry()->getPluginCreator("UpsamplePlugin", "1"); PluginField pField[1]; float *s = new float[2]; s[0] = scale_h; s[1] = scale_w; pField[0].data = s; pField[0].length = 2; pField[0].type = PluginFieldType::kFLOAT32; pField[0].name = "scaleFactor"; PluginFieldCollection pluginData; pluginData.nbFields = 1; pluginData.fields = pField; IPluginV2 *pluginObj = creator->createPlugin("upSample", &pluginData); ITensor* inputTensors[] = {input}; auto upS = m_Network->addPluginV2(inputTensors, 1, *pluginObj); return 
upS->getOutput(0); } ITensor* transform_psp(INetworkDefinition* m_Network,std::map weightMap,ITensor* input, string lname, int output_Avg_Size, int out_features = 512) { int inH = input->getDimensions().d[1]; int inW = input->getDimensions().d[2]; int kH = inH / output_Avg_Size; int kW = inW / output_Avg_Size; auto avgPool = m_Network->addPoolingNd(*input,PoolingType::kAVERAGE,Dims2{kH,kW}); avgPool->setStrideNd(Dims2{kH,kW}); auto cba = transform_lateral_conv(m_Network,weightMap,avgPool->getOutput(0),lname,1,1,out_features); auto out = resize(m_Network,cba,inH); return out; } ITensor* up_Add(INetworkDefinition* m_Network,ITensor* input1,ITensor* input2) { auto in1 = resize(m_Network,input1,input2->getDimensions().d[1]); auto out = m_Network->addElementWise(*in1,*input2,ElementWiseOperation::kSUM); return out->getOutput(0); } #endif // COMMON_HPP ================================================ FILE: swin-transformer/semantic-segmentation/fillmask.cu ================================================ #include "fillmask.h" #include namespace nvinfer1 { fillmask::fillmask() { } fillmask::~fillmask() { } // create the plugin at runtime from a byte stream fillmask::fillmask(const void* data, size_t length) { const char *d = reinterpret_cast(data), *a = d; Tn::read(d, mInputSize); assert(d == a + length); } void fillmask::serialize(void* buffer) const { char* d = static_cast(buffer), *a = d; Tn::write(d, mInputSize); assert(d == a + getSerializationSize()); } size_t fillmask::getSerializationSize() const { return sizeof(mInputSize); } int fillmask::initialize() { return 0; } Dims fillmask::getOutputDimensions(int index, const Dims* inputs, int nbInputDims) { assert(nbInputDims == 1); Dims outputDims; outputDims.nbDims = inputs[0].nbDims; for (int i = 0; i < inputs[0].nbDims; i++) { outputDims.d[i] = inputs[0].d[i]; } return outputDims; } // Set plugin namespace void fillmask::setPluginNamespace(const char* pluginNamespace) { mPluginNamespace = pluginNamespace; } const 
char* fillmask::getPluginNamespace() const { return mPluginNamespace; } // Return the DataType of the plugin output at the requested index DataType fillmask::getOutputDataType(int index, const nvinfer1::DataType* inputTypes, int nbInputs) const { return DataType::kFLOAT; } // Return true if output tensor is broadcast across a batch. bool fillmask::isOutputBroadcastAcrossBatch(int outputIndex, const bool* inputIsBroadcasted, int nbInputs) const { return false; } // Return true if plugin can use input that is broadcast across batch without replication. bool fillmask::canBroadcastInputAcrossBatch(int inputIndex) const { return false; } void fillmask::configurePlugin(const PluginTensorDesc* in, int nbInput, const PluginTensorDesc* out, int nbOutput) { mInputSize = 1; for (int i = 0; i < in[0].dims.nbDims; i++) { mInputSize *= in[0].dims.d[i]; } } // Attach the plugin object to an execution context and grant the plugin the access to some context resource. void fillmask::attachToContext(cudnnContext* cudnnContext, cublasContext* cublasContext, IGpuAllocator* gpuAllocator) { } // Detach the plugin object from its execution context. 
void fillmask::detachFromContext() {} const char* fillmask::getPluginType() const { return "fillmaskLayer_TRT"; } const char* fillmask::getPluginVersion() const { return "1"; } void fillmask::destroy() { delete this; } // Clone the plugin IPluginV2IOExt* fillmask::clone() const { fillmask *p = new fillmask(); p->setPluginNamespace(mPluginNamespace); p->setInputSize(mInputSize); return p; } __global__ void fillmaskKer(const float *in, float *out, int size) { int idx = threadIdx.x + blockIdx.x * blockDim.x; if (idx >= size) return; if (in[idx] != 0.0) out[idx] = -100.0; else out[idx] = 0.0; } void fillmask::forwardGpu(const float *const * inputs, float* output, cudaStream_t stream, int batchSize) { int numElem = batchSize * mInputSize; fillmaskKer<<<(numElem + mThreadCount - 1) / mThreadCount, mThreadCount>>> (inputs[0], output, numElem); } int fillmask::enqueue(int batchSize, const void*const * inputs, void** outputs, void* workspace, cudaStream_t stream) { forwardGpu((const float *const *)inputs, (float*)outputs[0], stream, batchSize); return 0; } PluginFieldCollection fillmaskCreator::mFC{}; std::vector fillmaskCreator::mPluginAttributes; fillmaskCreator::fillmaskCreator() { mPluginAttributes.clear(); mFC.nbFields = mPluginAttributes.size(); mFC.fields = mPluginAttributes.data(); } const char* fillmaskCreator::getPluginName() const { return "fillmaskLayer_TRT"; } const char* fillmaskCreator::getPluginVersion() const { return "1"; } const PluginFieldCollection* fillmaskCreator::getFieldNames() { return &mFC; } IPluginV2IOExt* fillmaskCreator::createPlugin(const char* name, const PluginFieldCollection* fc) { fillmask* obj = new fillmask(); obj->setPluginNamespace(mNamespace.c_str()); return obj; } IPluginV2IOExt* fillmaskCreator::deserializePlugin(const char* name, const void* serialData, size_t serialLength) { // This object will be deleted when the network is destroyed, which will fillmask* obj = new fillmask(serialData, serialLength); 
        obj->setPluginNamespace(mNamespace.c_str());
        return obj;
    }
}
================================================ FILE: swin-transformer/semantic-segmentation/fillmask.h ================================================
#ifndef FILLMASK_H
#define FILLMASK_H
/* NOTE(review): the names of the three system headers below are missing in
 * this copy of the file; restore them from upstream before compiling. */
#include
#include
#include "NvInfer.h"
#include "myhpp.h"
#include
#include "utilsn.h"
namespace nvinfer1
{
    /*
     * TensorRT plugin with one input and one output of identical dimensions.
     * The CUDA kernel in fillmask.cu writes -100 for every non-zero input
     * element and 0 for every zero element.
     */
    class fillmask:public IPluginV2IOExt
    {
    public:
        /* Build-time constructor. */
        explicit fillmask();
        /* Deserializing constructor: reads back what serialize() wrote. */
        fillmask(const void* data, size_t length);

        ~fillmask();

        /* Exactly one output tensor. */
        int getNbOutputs() const override
        {
            return 1;
        }

        Dims getOutputDimensions(int index, const Dims* inputs, int nbInputDims) override;

        int initialize() override;

        /* Nothing to release. */
        virtual void terminate() override {};

        /* No scratch memory is required. */
        virtual size_t getWorkspaceSize(int maxBatchSize) const override { return 0;}

        virtual int enqueue(int batchSize, const void*const * inputs, void** outputs, void* workspace, cudaStream_t stream) override;

        virtual size_t getSerializationSize() const override;

        virtual void serialize(void* buffer) const override;

        /* Only linear-format FP32 tensors are accepted, for inputs and outputs alike. */
        bool supportsFormatCombination(int pos, const PluginTensorDesc* inOut, int nbInputs, int nbOutputs) const override {
            return inOut[pos].format == TensorFormat::kLINEAR && inOut[pos].type == DataType::kFLOAT;
        }

        const char* getPluginType() const override;

        const char* getPluginVersion() const override;

        void destroy() override;

        IPluginV2IOExt* clone() const override;

        void setPluginNamespace(const char* pluginNamespace) override;

        const char* getPluginNamespace() const override;

        DataType getOutputDataType(int index, const nvinfer1::DataType* inputTypes, int nbInputs) const override;

        bool isOutputBroadcastAcrossBatch(int outputIndex, const bool* inputIsBroadcasted, int nbInputs) const override;

        bool canBroadcastInputAcrossBatch(int inputIndex) const override;

        void attachToContext(cudnnContext* cudnnContext, cublasContext* cublasContext, IGpuAllocator* gpuAllocator) override;

        void configurePlugin(const PluginTensorDesc* in, int nbInput, const PluginTensorDesc* out, int nbOutput) override;

        void
detachFromContext() override; void setInputSize(int s) { mInputSize = s; } private: void forwardGpu(const float *const * inputs,float * output, cudaStream_t stream,int batchSize = 1); int mThreadCount = 256; int mInputSize; const char* mPluginNamespace; }; class fillmaskCreator : public IPluginCreator { public: fillmaskCreator(); ~fillmaskCreator() override = default; const char* getPluginName() const override; const char* getPluginVersion() const override; const PluginFieldCollection* getFieldNames() override; IPluginV2IOExt* createPlugin(const char* name, const PluginFieldCollection* fc) override; IPluginV2IOExt* deserializePlugin(const char* name, const void* serialData, size_t serialLength) override; void setPluginNamespace(const char* libNamespace) override { mNamespace = libNamespace; } const char* getPluginNamespace() const override { return mNamespace.c_str(); } private: std::string mNamespace; static PluginFieldCollection mFC; static std::vector mPluginAttributes; }; REGISTER_TENSORRT_PLUGIN(fillmaskCreator); }; #endif // FILLMASK_H ================================================ FILE: swin-transformer/semantic-segmentation/gelu.cu ================================================ #include "gelu.h" #include namespace nvinfer1 { gelu::gelu() { } gelu::~gelu() { } // create the plugin at runtime from a byte stream gelu::gelu(const void* data, size_t length) { const char *d = reinterpret_cast(data), *a = d; Tn::read(d, mInputSize); assert(d == a + length); } void gelu::serialize(void* buffer) const { char* d = static_cast(buffer), *a = d; Tn::write(d, mInputSize); assert(d == a + getSerializationSize()); } size_t gelu::getSerializationSize() const { return sizeof(mInputSize); } int gelu::initialize() { return 0; } Dims gelu::getOutputDimensions(int index, const Dims* inputs, int nbInputDims) { assert(nbInputDims == 1); Dims outputDims; outputDims.nbDims = inputs[0].nbDims; for (int i = 0; i < inputs[0].nbDims; i++) { outputDims.d[i] = inputs[0].d[i]; } return 
outputDims; } // Set plugin namespace void gelu::setPluginNamespace(const char* pluginNamespace) { mPluginNamespace = pluginNamespace; } const char* gelu::getPluginNamespace() const { return mPluginNamespace; } // Return the DataType of the plugin output at the requested index DataType gelu::getOutputDataType(int index, const nvinfer1::DataType* inputTypes, int nbInputs) const { return DataType::kFLOAT; } // Return true if output tensor is broadcast across a batch. bool gelu::isOutputBroadcastAcrossBatch(int outputIndex, const bool* inputIsBroadcasted, int nbInputs) const { return false; } // Return true if plugin can use input that is broadcast across batch without replication. bool gelu::canBroadcastInputAcrossBatch(int inputIndex) const { return false; } void gelu::configurePlugin(const PluginTensorDesc* in, int nbInput, const PluginTensorDesc* out, int nbOutput) { mInputSize = 1; for (int i = 0; i < in[0].dims.nbDims; i++) { mInputSize *= in[0].dims.d[i]; } } // Attach the plugin object to an execution context and grant the plugin the access to some context resource. void gelu::attachToContext(cudnnContext* cudnnContext, cublasContext* cublasContext, IGpuAllocator* gpuAllocator) { } // Detach the plugin object from its execution context. 
void gelu::detachFromContext() {} const char* gelu::getPluginType() const { return "geluLayer_TRT"; } const char* gelu::getPluginVersion() const { return "1"; } void gelu::destroy() { delete this; } // Clone the plugin IPluginV2IOExt* gelu::clone() const { gelu *p = new gelu(); p->setPluginNamespace(mPluginNamespace); p->setInputSize(mInputSize); return p; } __global__ void geluKer(const float *in, float *out, int size) { int idx = threadIdx.x + blockIdx.x * blockDim.x; if (idx >= size) return; //x * 0.5 * (1.0 + torch.erf(x / math.sqrt(2.0))) out[idx] = in[idx] * 0.5 *(1.0 + erf(in[idx]/1.4142135381698608)); } void gelu::forwardGpu(const float *const * inputs, float* output, cudaStream_t stream, int batchSize) { int numElem = batchSize * mInputSize; geluKer<<<(numElem + mThreadCount - 1) / mThreadCount, mThreadCount>>> (inputs[0], output, numElem); } int gelu::enqueue(int batchSize, const void*const * inputs, void** outputs, void* workspace, cudaStream_t stream) { forwardGpu((const float *const *)inputs, (float*)outputs[0], stream, batchSize); return 0; } PluginFieldCollection geluCreator::mFC{}; std::vector geluCreator::mPluginAttributes; geluCreator::geluCreator() { mPluginAttributes.clear(); mFC.nbFields = mPluginAttributes.size(); mFC.fields = mPluginAttributes.data(); } const char* geluCreator::getPluginName() const { return "geluLayer_TRT"; } const char* geluCreator::getPluginVersion() const { return "1"; } const PluginFieldCollection* geluCreator::getFieldNames() { return &mFC; } IPluginV2IOExt* geluCreator::createPlugin(const char* name, const PluginFieldCollection* fc) { gelu* obj = new gelu(); obj->setPluginNamespace(mNamespace.c_str()); return obj; } IPluginV2IOExt* geluCreator::deserializePlugin(const char* name, const void* serialData, size_t serialLength) { // This object will be deleted when the network is destroyed, which will gelu* obj = new gelu(serialData, serialLength); obj->setPluginNamespace(mNamespace.c_str()); return obj; } } 
================================================ FILE: swin-transformer/semantic-segmentation/gelu.h ================================================ #ifndef GELU_H #define GELU_H #include #include #include "NvInfer.h" #include "myhpp.h" #include #include "utilsn.h" #define M_PI 3.14159265358979323846 // pi namespace nvinfer1 { class gelu:public IPluginV2IOExt { public: explicit gelu(); gelu(const void* data, size_t length); ~gelu(); int getNbOutputs() const override { return 1; } Dims getOutputDimensions(int index, const Dims* inputs, int nbInputDims) override; int initialize() override; virtual void terminate() override {}; virtual size_t getWorkspaceSize(int maxBatchSize) const override { return 0;} virtual int enqueue(int batchSize, const void*const * inputs, void** outputs, void* workspace, cudaStream_t stream) override; virtual size_t getSerializationSize() const override; virtual void serialize(void* buffer) const override; bool supportsFormatCombination(int pos, const PluginTensorDesc* inOut, int nbInputs, int nbOutputs) const override { return inOut[pos].format == TensorFormat::kLINEAR && inOut[pos].type == DataType::kFLOAT; } const char* getPluginType() const override; const char* getPluginVersion() const override; void destroy() override; IPluginV2IOExt* clone() const override; void setPluginNamespace(const char* pluginNamespace) override; const char* getPluginNamespace() const override; DataType getOutputDataType(int index, const nvinfer1::DataType* inputTypes, int nbInputs) const override; bool isOutputBroadcastAcrossBatch(int outputIndex, const bool* inputIsBroadcasted, int nbInputs) const override; bool canBroadcastInputAcrossBatch(int inputIndex) const override; void attachToContext(cudnnContext* cudnnContext, cublasContext* cublasContext, IGpuAllocator* gpuAllocator) override; void configurePlugin(const PluginTensorDesc* in, int nbInput, const PluginTensorDesc* out, int nbOutput) override; void detachFromContext() override; void setInputSize(int s) 
{ mInputSize = s; } private: void forwardGpu(const float *const * inputs,float * output, cudaStream_t stream,int batchSize = 1); int mThreadCount = 256; int mInputSize; const char* mPluginNamespace; }; class geluCreator : public IPluginCreator { public: geluCreator(); ~geluCreator() override = default; const char* getPluginName() const override; const char* getPluginVersion() const override; const PluginFieldCollection* getFieldNames() override; IPluginV2IOExt* createPlugin(const char* name, const PluginFieldCollection* fc) override; IPluginV2IOExt* deserializePlugin(const char* name, const void* serialData, size_t serialLength) override; void setPluginNamespace(const char* libNamespace) override { mNamespace = libNamespace; } const char* getPluginNamespace() const override { return mNamespace.c_str(); } private: std::string mNamespace; static PluginFieldCollection mFC; static std::vector mPluginAttributes; }; REGISTER_TENSORRT_PLUGIN(geluCreator); }; #endif // GELU_H ================================================ FILE: swin-transformer/semantic-segmentation/gen_wts.py ================================================ import torch import struct import sys # Initialize pt_file = sys.argv[1] # Load model model = torch.load(pt_file, map_location=torch.device('cpu'))['model'].float() # load to FP32 model.to(device).eval() with open(pt_file.split('.')[0] + '.wts', 'w') as f: f.write('{}\n'.format(len(model.state_dict().keys()))) for k, v in model.state_dict().items(): vr = v.reshape(-1).cpu().numpy() f.write('{} {} '.format(k, len(vr))) for vv in vr: f.write(' ') f.write(struct.pack('>f',float(vv)).hex()) f.write('\n') ================================================ FILE: swin-transformer/semantic-segmentation/include/dirent.h ================================================ /* * Dirent interface for Microsoft Visual Studio * * Copyright (C) 1998-2019 Toni Ronkko * This file is part of dirent. Dirent may be freely distributed * under the MIT license. 
For all details and documentation, see * https://github.com/tronkko/dirent */ #ifndef DIRENT_H #define DIRENT_H /* Hide warnings about unreferenced local functions */ #if defined(__clang__) # pragma clang diagnostic ignored "-Wunused-function" #elif defined(_MSC_VER) # pragma warning(disable:4505) #elif defined(__GNUC__) # pragma GCC diagnostic ignored "-Wunused-function" #endif /* * Include windows.h without Windows Sockets 1.1 to prevent conflicts with * Windows Sockets 2.0. */ #ifndef WIN32_LEAN_AND_MEAN # define WIN32_LEAN_AND_MEAN #endif #include #include #include #include #include #include #include #include #include #include /* Indicates that d_type field is available in dirent structure */ #define _DIRENT_HAVE_D_TYPE /* Indicates that d_namlen field is available in dirent structure */ #define _DIRENT_HAVE_D_NAMLEN /* Entries missing from MSVC 6.0 */ #if !defined(FILE_ATTRIBUTE_DEVICE) # define FILE_ATTRIBUTE_DEVICE 0x40 #endif /* File type and permission flags for stat(), general mask */ #if !defined(S_IFMT) # define S_IFMT _S_IFMT #endif /* Directory bit */ #if !defined(S_IFDIR) # define S_IFDIR _S_IFDIR #endif /* Character device bit */ #if !defined(S_IFCHR) # define S_IFCHR _S_IFCHR #endif /* Pipe bit */ #if !defined(S_IFFIFO) # define S_IFFIFO _S_IFFIFO #endif /* Regular file bit */ #if !defined(S_IFREG) # define S_IFREG _S_IFREG #endif /* Read permission */ #if !defined(S_IREAD) # define S_IREAD _S_IREAD #endif /* Write permission */ #if !defined(S_IWRITE) # define S_IWRITE _S_IWRITE #endif /* Execute permission */ #if !defined(S_IEXEC) # define S_IEXEC _S_IEXEC #endif /* Pipe */ #if !defined(S_IFIFO) # define S_IFIFO _S_IFIFO #endif /* Block device */ #if !defined(S_IFBLK) # define S_IFBLK 0 #endif /* Link */ #if !defined(S_IFLNK) # define S_IFLNK 0 #endif /* Socket */ #if !defined(S_IFSOCK) # define S_IFSOCK 0 #endif /* Read user permission */ #if !defined(S_IRUSR) # define S_IRUSR S_IREAD #endif /* Write user permission */ #if !defined(S_IWUSR) # 
define S_IWUSR S_IWRITE #endif /* Execute user permission */ #if !defined(S_IXUSR) # define S_IXUSR 0 #endif /* Read group permission */ #if !defined(S_IRGRP) # define S_IRGRP 0 #endif /* Write group permission */ #if !defined(S_IWGRP) # define S_IWGRP 0 #endif /* Execute group permission */ #if !defined(S_IXGRP) # define S_IXGRP 0 #endif /* Read others permission */ #if !defined(S_IROTH) # define S_IROTH 0 #endif /* Write others permission */ #if !defined(S_IWOTH) # define S_IWOTH 0 #endif /* Execute others permission */ #if !defined(S_IXOTH) # define S_IXOTH 0 #endif /* Maximum length of file name */ #if !defined(PATH_MAX) # define PATH_MAX MAX_PATH #endif #if !defined(FILENAME_MAX) # define FILENAME_MAX MAX_PATH #endif #if !defined(NAME_MAX) # define NAME_MAX FILENAME_MAX #endif /* File type flags for d_type */ #define DT_UNKNOWN 0 #define DT_REG S_IFREG #define DT_DIR S_IFDIR #define DT_FIFO S_IFIFO #define DT_SOCK S_IFSOCK #define DT_CHR S_IFCHR #define DT_BLK S_IFBLK #define DT_LNK S_IFLNK /* Macros for converting between st_mode and d_type */ #define IFTODT(mode) ((mode) & S_IFMT) #define DTTOIF(type) (type) /* * File type macros. Note that block devices, sockets and links cannot be * distinguished on Windows and the macros S_ISBLK, S_ISSOCK and S_ISLNK are * only defined for compatibility. These macros should always return false * on Windows. 
*/ #if !defined(S_ISFIFO) # define S_ISFIFO(mode) (((mode) & S_IFMT) == S_IFIFO) #endif #if !defined(S_ISDIR) # define S_ISDIR(mode) (((mode) & S_IFMT) == S_IFDIR) #endif #if !defined(S_ISREG) # define S_ISREG(mode) (((mode) & S_IFMT) == S_IFREG) #endif #if !defined(S_ISLNK) # define S_ISLNK(mode) (((mode) & S_IFMT) == S_IFLNK) #endif #if !defined(S_ISSOCK) # define S_ISSOCK(mode) (((mode) & S_IFMT) == S_IFSOCK) #endif #if !defined(S_ISCHR) # define S_ISCHR(mode) (((mode) & S_IFMT) == S_IFCHR) #endif #if !defined(S_ISBLK) # define S_ISBLK(mode) (((mode) & S_IFMT) == S_IFBLK) #endif /* Return the exact length of the file name without zero terminator */ #define _D_EXACT_NAMLEN(p) ((p)->d_namlen) /* Return the maximum size of a file name */ #define _D_ALLOC_NAMLEN(p) ((PATH_MAX)+1) #ifdef __cplusplus extern "C" { #endif /* Wide-character version */ struct _wdirent { /* Always zero */ long d_ino; /* File position within stream */ long d_off; /* Structure size */ unsigned short d_reclen; /* Length of name without \0 */ size_t d_namlen; /* File type */ int d_type; /* File name */ wchar_t d_name[PATH_MAX+1]; }; typedef struct _wdirent _wdirent; struct _WDIR { /* Current directory entry */ struct _wdirent ent; /* Private file data */ WIN32_FIND_DATAW data; /* True if data is valid */ int cached; /* Win32 search handle */ HANDLE handle; /* Initial directory name */ wchar_t *patt; }; typedef struct _WDIR _WDIR; /* Multi-byte character version */ struct dirent { /* Always zero */ long d_ino; /* File position within stream */ long d_off; /* Structure size */ unsigned short d_reclen; /* Length of name without \0 */ size_t d_namlen; /* File type */ int d_type; /* File name */ char d_name[PATH_MAX+1]; }; typedef struct dirent dirent; struct DIR { struct dirent ent; struct _WDIR *wdirp; }; typedef struct DIR DIR; /* Dirent functions */ static DIR *opendir (const char *dirname); static _WDIR *_wopendir (const wchar_t *dirname); static struct dirent *readdir (DIR *dirp); static 
struct _wdirent *_wreaddir (_WDIR *dirp); static int readdir_r( DIR *dirp, struct dirent *entry, struct dirent **result); static int _wreaddir_r( _WDIR *dirp, struct _wdirent *entry, struct _wdirent **result); static int closedir (DIR *dirp); static int _wclosedir (_WDIR *dirp); static void rewinddir (DIR* dirp); static void _wrewinddir (_WDIR* dirp); static int scandir (const char *dirname, struct dirent ***namelist, int (*filter)(const struct dirent*), int (*compare)(const struct dirent**, const struct dirent**)); static int alphasort (const struct dirent **a, const struct dirent **b); static int versionsort (const struct dirent **a, const struct dirent **b); /* For compatibility with Symbian */ #define wdirent _wdirent #define WDIR _WDIR #define wopendir _wopendir #define wreaddir _wreaddir #define wclosedir _wclosedir #define wrewinddir _wrewinddir /* Internal utility functions */ static WIN32_FIND_DATAW *dirent_first (_WDIR *dirp); static WIN32_FIND_DATAW *dirent_next (_WDIR *dirp); static int dirent_mbstowcs_s( size_t *pReturnValue, wchar_t *wcstr, size_t sizeInWords, const char *mbstr, size_t count); static int dirent_wcstombs_s( size_t *pReturnValue, char *mbstr, size_t sizeInBytes, const wchar_t *wcstr, size_t count); static void dirent_set_errno (int error); /* * Open directory stream DIRNAME for read and return a pointer to the * internal working area that is used to retrieve individual directory * entries. 
*/ static _WDIR* _wopendir( const wchar_t *dirname) { _WDIR *dirp; DWORD n; wchar_t *p; /* Must have directory name */ if (dirname == NULL || dirname[0] == '\0') { dirent_set_errno (ENOENT); return NULL; } /* Allocate new _WDIR structure */ dirp = (_WDIR*) malloc (sizeof (struct _WDIR)); if (!dirp) { return NULL; } /* Reset _WDIR structure */ dirp->handle = INVALID_HANDLE_VALUE; dirp->patt = NULL; dirp->cached = 0; /* * Compute the length of full path plus zero terminator * * Note that on WinRT there's no way to convert relative paths * into absolute paths, so just assume it is an absolute path. */ #if WINAPI_FAMILY_PARTITION(WINAPI_PARTITION_DESKTOP) /* Desktop */ n = GetFullPathNameW (dirname, 0, NULL, NULL); #else /* WinRT */ n = wcslen (dirname); #endif /* Allocate room for absolute directory name and search pattern */ dirp->patt = (wchar_t*) malloc (sizeof (wchar_t) * n + 16); if (dirp->patt == NULL) { goto exit_closedir; } /* * Convert relative directory name to an absolute one. This * allows rewinddir() to function correctly even when current * working directory is changed between opendir() and rewinddir(). * * Note that on WinRT there's no way to convert relative paths * into absolute paths, so just assume it is an absolute path. */ #if WINAPI_FAMILY_PARTITION(WINAPI_PARTITION_DESKTOP) /* Desktop */ n = GetFullPathNameW (dirname, n, dirp->patt, NULL); if (n <= 0) { goto exit_closedir; } #else /* WinRT */ wcsncpy_s (dirp->patt, n+1, dirname, n); #endif /* Append search pattern \* to the directory name */ p = dirp->patt + n; switch (p[-1]) { case '\\': case '/': case ':': /* Directory ends in path separator, e.g. 
c:\temp\ */ /*NOP*/; break; default: /* Directory name doesn't end in path separator */ *p++ = '\\'; } *p++ = '*'; *p = '\0'; /* Open directory stream and retrieve the first entry */ if (!dirent_first (dirp)) { goto exit_closedir; } /* Success */ return dirp; /* Failure */ exit_closedir: _wclosedir (dirp); return NULL; } /* * Read next directory entry. * * Returns pointer to static directory entry which may be overwritten by * subsequent calls to _wreaddir(). */ static struct _wdirent* _wreaddir( _WDIR *dirp) { struct _wdirent *entry; /* * Read directory entry to buffer. We can safely ignore the return value * as entry will be set to NULL in case of error. */ (void) _wreaddir_r (dirp, &dirp->ent, &entry); /* Return pointer to statically allocated directory entry */ return entry; } /* * Read next directory entry. * * Returns zero on success. If end of directory stream is reached, then sets * result to NULL and returns zero. */ static int _wreaddir_r( _WDIR *dirp, struct _wdirent *entry, struct _wdirent **result) { WIN32_FIND_DATAW *datap; /* Read next directory entry */ datap = dirent_next (dirp); if (datap) { size_t n; DWORD attr; /* * Copy file name as wide-character string. If the file name is too * long to fit in to the destination buffer, then truncate file name * to PATH_MAX characters and zero-terminate the buffer. 
*/ n = 0; while (n < PATH_MAX && datap->cFileName[n] != 0) { entry->d_name[n] = datap->cFileName[n]; n++; } entry->d_name[n] = 0; /* Length of file name excluding zero terminator */ entry->d_namlen = n; /* File type */ attr = datap->dwFileAttributes; if ((attr & FILE_ATTRIBUTE_DEVICE) != 0) { entry->d_type = DT_CHR; } else if ((attr & FILE_ATTRIBUTE_DIRECTORY) != 0) { entry->d_type = DT_DIR; } else { entry->d_type = DT_REG; } /* Reset dummy fields */ entry->d_ino = 0; entry->d_off = 0; entry->d_reclen = sizeof (struct _wdirent); /* Set result address */ *result = entry; } else { /* Return NULL to indicate end of directory */ *result = NULL; } return /*OK*/0; } /* * Close directory stream opened by opendir() function. This invalidates the * DIR structure as well as any directory entry read previously by * _wreaddir(). */ static int _wclosedir( _WDIR *dirp) { int ok; if (dirp) { /* Release search handle */ if (dirp->handle != INVALID_HANDLE_VALUE) { FindClose (dirp->handle); } /* Release search pattern */ free (dirp->patt); /* Release directory structure */ free (dirp); ok = /*success*/0; } else { /* Invalid directory stream */ dirent_set_errno (EBADF); ok = /*failure*/-1; } return ok; } /* * Rewind directory stream such that _wreaddir() returns the very first * file name again. 
 */
static void
_wrewinddir(
    _WDIR* dirp)
{
    /* A NULL stream is silently ignored */
    if (dirp) {
        /* Release existing search handle */
        if (dirp->handle != INVALID_HANDLE_VALUE) {
            FindClose (dirp->handle);
        }

        /* Open new search handle */
        dirent_first (dirp);
    }
}

/*
 * Get first directory entry (internal).
 *
 * Starts a new search on dirp->patt.  On success the entry is stored in
 * dirp->data, marked as cached and returned; on failure NULL is returned
 * and errno is set to EACCES, ENOTDIR or ENOENT.
 */
static WIN32_FIND_DATAW*
dirent_first(
    _WDIR *dirp)
{
    WIN32_FIND_DATAW *datap;
    DWORD error;

    /* Open directory and retrieve the first entry */
    dirp->handle = FindFirstFileExW(
        dirp->patt, FindExInfoStandard, &dirp->data,
        FindExSearchNameMatch, NULL, 0);
    if (dirp->handle != INVALID_HANDLE_VALUE) {

        /* a directory entry is now waiting in memory */
        datap = &dirp->data;
        dirp->cached = 1;

    } else {

        /* Failed to open directory: no directory entry in memory */
        dirp->cached = 0;
        datap = NULL;

        /* Set error code */
        error = GetLastError ();
        switch (error) {
        case ERROR_ACCESS_DENIED:
            /* No read access to directory */
            dirent_set_errno (EACCES);
            break;

        case ERROR_DIRECTORY:
            /* Directory name is invalid */
            dirent_set_errno (ENOTDIR);
            break;

        case ERROR_PATH_NOT_FOUND:
        default:
            /* Cannot find the file */
            dirent_set_errno (ENOENT);
        }

    }
    return datap;
}

/*
 * Get next directory entry (internal).
 *
 * Returns a pointer to the entry data stored in dirp->data, or NULL when
 * the search handle is already closed or FindNextFileW() reports no further
 * entry (the handle is closed in that case).
 */
static WIN32_FIND_DATAW*
dirent_next(
    _WDIR *dirp)
{
    WIN32_FIND_DATAW *p;

    /* Get next directory entry */
    if (dirp->cached != 0) {

        /* A valid directory entry already in memory */
        p = &dirp->data;
        dirp->cached = 0;

    } else if (dirp->handle != INVALID_HANDLE_VALUE) {

        /* Get the next directory entry from stream */
        if (FindNextFileW (dirp->handle, &dirp->data) != FALSE) {
            /* Got a file */
            p = &dirp->data;
        } else {
            /* The very last entry has been processed or an error occurred */
            FindClose (dirp->handle);
            dirp->handle = INVALID_HANDLE_VALUE;
            p = NULL;
        }

    } else {

        /* End of directory stream reached */
        p = NULL;

    }

    return p;
}

/*
 * Open directory stream using plain old C-string.
 */
static DIR*
opendir(
    const char *dirname)
{
    struct DIR *dirp;

    /* Must have directory name */
    if (dirname == NULL || dirname[0] == '\0') {
        dirent_set_errno (ENOENT);
        return NULL;
    }

    /* Allocate memory for DIR structure */
    dirp = (DIR*) malloc (sizeof (struct DIR));
    if (!dirp) {
        /* Out of memory: errno is not set explicitly on this path */
        return NULL;
    }
    {
        int error;
        wchar_t wname[PATH_MAX + 1];
        size_t n;

        /* Convert directory name to wide-character string */
        error = dirent_mbstowcs_s(
            &n, wname, PATH_MAX + 1, dirname, PATH_MAX + 1);
        if (error) {
            /*
             * Cannot convert file name to wide-character string.  This
             * occurs if the string contains invalid multi-byte sequences or
             * the output buffer is too small to contain the resulting
             * string.
             */
            goto exit_free;
        }


        /* Open directory stream using wide-character name */
        dirp->wdirp = _wopendir (wname);
        if (!dirp->wdirp) {
            goto exit_free;
        }

    }

    /* Success */
    return dirp;

    /* Failure */
exit_free:
    free (dirp);
    return NULL;
}

/*
 * Read next directory entry.
 *
 * The returned pointer refers to the entry buffer embedded in dirp, so its
 * contents are overwritten by the next call on the same stream.
 */
static struct dirent*
readdir(
    DIR *dirp)
{
    struct dirent *entry;

    /*
     * Read directory entry to buffer.  We can safely ignore the return value
     * as entry will be set to NULL in case of error.
     */
    (void) readdir_r (dirp, &dirp->ent, &entry);

    /* Return pointer to the entry buffer inside dirp (NULL at end of stream) */
    return entry;
}

/*
 * Read next directory entry into caller-allocated buffer.
 *
 * Returns zero on success.  If the end of directory stream is reached, then
 * sets result to NULL and returns zero.
 */
static int
readdir_r(
    DIR *dirp,
    struct dirent *entry,
    struct dirent **result)
{
    WIN32_FIND_DATAW *datap;

    /* Read next directory entry */
    datap = dirent_next (dirp->wdirp);
    if (datap) {
        size_t n;
        int error;

        /* Attempt to convert file name to multi-byte string */
        error = dirent_wcstombs_s(
            &n, entry->d_name, PATH_MAX + 1, datap->cFileName, PATH_MAX + 1);

        /*
         * If the file name cannot be represented by a multi-byte string,
         * then attempt to use old 8+3 file name.
This allows traditional * Unix-code to access some file names despite of unicode * characters, although file names may seem unfamiliar to the user. * * Be ware that the code below cannot come up with a short file * name unless the file system provides one. At least * VirtualBox shared folders fail to do this. */ if (error && datap->cAlternateFileName[0] != '\0') { error = dirent_wcstombs_s( &n, entry->d_name, PATH_MAX + 1, datap->cAlternateFileName, PATH_MAX + 1); } if (!error) { DWORD attr; /* Length of file name excluding zero terminator */ entry->d_namlen = n - 1; /* File attributes */ attr = datap->dwFileAttributes; if ((attr & FILE_ATTRIBUTE_DEVICE) != 0) { entry->d_type = DT_CHR; } else if ((attr & FILE_ATTRIBUTE_DIRECTORY) != 0) { entry->d_type = DT_DIR; } else { entry->d_type = DT_REG; } /* Reset dummy fields */ entry->d_ino = 0; entry->d_off = 0; entry->d_reclen = sizeof (struct dirent); } else { /* * Cannot convert file name to multi-byte string so construct * an erroneous directory entry and return that. Note that * we cannot return NULL as that would stop the processing * of directory entries completely. */ entry->d_name[0] = '?'; entry->d_name[1] = '\0'; entry->d_namlen = 1; entry->d_type = DT_UNKNOWN; entry->d_ino = 0; entry->d_off = -1; entry->d_reclen = 0; } /* Return pointer to directory entry */ *result = entry; } else { /* No more directory entries */ *result = NULL; } return /*OK*/0; } /* * Close directory stream. */ static int closedir( DIR *dirp) { int ok; if (dirp) { /* Close wide-character directory stream */ ok = _wclosedir (dirp->wdirp); dirp->wdirp = NULL; /* Release multi-byte character version */ free (dirp); } else { /* Invalid directory stream */ dirent_set_errno (EBADF); ok = /*failure*/-1; } return ok; } /* * Rewind directory stream to beginning. */ static void rewinddir( DIR* dirp) { /* Rewind wide-character string directory stream */ _wrewinddir (dirp->wdirp); } /* * Scan directory for entries. 
*/ static int scandir( const char *dirname, struct dirent ***namelist, int (*filter)(const struct dirent*), int (*compare)(const struct dirent**, const struct dirent**)) { struct dirent **files = NULL; size_t size = 0; size_t allocated = 0; const size_t init_size = 1; DIR *dir = NULL; struct dirent *entry; struct dirent *tmp = NULL; size_t i; int result = 0; /* Open directory stream */ dir = opendir (dirname); if (dir) { /* Read directory entries to memory */ while (1) { /* Enlarge pointer table to make room for another pointer */ if (size >= allocated) { void *p; size_t num_entries; /* Compute number of entries in the enlarged pointer table */ if (size < init_size) { /* Allocate initial pointer table */ num_entries = init_size; } else { /* Double the size */ num_entries = size * 2; } /* Allocate first pointer table or enlarge existing table */ p = realloc (files, sizeof (void*) * num_entries); if (p != NULL) { /* Got the memory */ files = (dirent**) p; allocated = num_entries; } else { /* Out of memory */ result = -1; break; } } /* Allocate room for temporary directory entry */ if (tmp == NULL) { tmp = (struct dirent*) malloc (sizeof (struct dirent)); if (tmp == NULL) { /* Cannot allocate temporary directory entry */ result = -1; break; } } /* Read directory entry to temporary area */ if (readdir_r (dir, tmp, &entry) == /*OK*/0) { /* Did we get an entry? */ if (entry != NULL) { int pass; /* Determine whether to include the entry in result */ if (filter) { /* Let the filter function decide */ pass = filter (tmp); } else { /* No filter function, include everything */ pass = 1; } if (pass) { /* Store the temporary entry to pointer table */ files[size++] = tmp; tmp = NULL; /* Keep up with the number of files */ result++; } } else { /* * End of directory stream reached => sort entries and * exit. 
*/ qsort (files, size, sizeof (void*), (int (*) (const void*, const void*)) compare); break; } } else { /* Error reading directory entry */ result = /*Error*/ -1; break; } } } else { /* Cannot open directory */ result = /*Error*/ -1; } /* Release temporary directory entry */ free (tmp); /* Release allocated memory on error */ if (result < 0) { for (i = 0; i < size; i++) { free (files[i]); } free (files); files = NULL; } /* Close directory stream */ if (dir) { closedir (dir); } /* Pass pointer table to caller */ if (namelist) { *namelist = files; } return result; } /* Alphabetical sorting */ static int alphasort( const struct dirent **a, const struct dirent **b) { return strcoll ((*a)->d_name, (*b)->d_name); } /* Sort versions */ static int versionsort( const struct dirent **a, const struct dirent **b) { /* FIXME: implement strverscmp and use that */ return alphasort (a, b); } /* Convert multi-byte string to wide character string */ static int dirent_mbstowcs_s( size_t *pReturnValue, wchar_t *wcstr, size_t sizeInWords, const char *mbstr, size_t count) { int error; #if defined(_MSC_VER) && _MSC_VER >= 1400 /* Microsoft Visual Studio 2005 or later */ error = mbstowcs_s (pReturnValue, wcstr, sizeInWords, mbstr, count); #else /* Older Visual Studio or non-Microsoft compiler */ size_t n; /* Convert to wide-character string (or count characters) */ n = mbstowcs (wcstr, mbstr, sizeInWords); if (!wcstr || n < count) { /* Zero-terminate output buffer */ if (wcstr && sizeInWords) { if (n >= sizeInWords) { n = sizeInWords - 1; } wcstr[n] = 0; } /* Length of resulting multi-byte string WITH zero terminator */ if (pReturnValue) { *pReturnValue = n + 1; } /* Success */ error = 0; } else { /* Could not convert string */ error = 1; } #endif return error; } /* Convert wide-character string to multi-byte string */ static int dirent_wcstombs_s( size_t *pReturnValue, char *mbstr, size_t sizeInBytes, /* max size of mbstr */ const wchar_t *wcstr, size_t count) { int error; #if 
defined(_MSC_VER) && _MSC_VER >= 1400 /* Microsoft Visual Studio 2005 or later */ error = wcstombs_s (pReturnValue, mbstr, sizeInBytes, wcstr, count); #else /* Older Visual Studio or non-Microsoft compiler */ size_t n; /* Convert to multi-byte string (or count the number of bytes needed) */ n = wcstombs (mbstr, wcstr, sizeInBytes); if (!mbstr || n < count) { /* Zero-terminate output buffer */ if (mbstr && sizeInBytes) { if (n >= sizeInBytes) { n = sizeInBytes - 1; } mbstr[n] = '\0'; } /* Length of resulting multi-bytes string WITH zero-terminator */ if (pReturnValue) { *pReturnValue = n + 1; } /* Success */ error = 0; } else { /* Cannot convert string */ error = 1; } #endif return error; } /* Set errno variable */ static void dirent_set_errno( int error) { #if defined(_MSC_VER) && _MSC_VER >= 1400 /* Microsoft Visual Studio 2005 and later */ _set_errno (error); #else /* Non-Microsoft compiler or older Microsoft compiler */ errno = error; #endif } #ifdef __cplusplus } #endif #endif /*DIRENT_H*/ ================================================ FILE: swin-transformer/semantic-segmentation/layerNorm.cu ================================================ #include #include "layerNorm.h" #include "utilsn.h" #include #include namespace nvinfer1 { layernorm::layernorm() { } layernorm::~layernorm() { } layernorm::layernorm(const void* data, size_t length) { const char *d = reinterpret_cast(data), *a = d; Tn::read(d, mInputSize); Tn::read(d,Length); assert(d == a + length); } int layernorm::initialize() { return 0; } void layernorm::serialize(void* buffer) const { char* d = static_cast(buffer), *a = d; Tn::write(d, mInputSize); Tn::write(d,Length); assert(d == a + getSerializationSize()); } size_t layernorm::getSerializationSize() const { return sizeof(mInputSize) + sizeof(Length); } Dims layernorm::getOutputDimensions(int index, const Dims* inputs, int nbInputDims) { // outputDims.nbDims = inputs[0].nbDims; // outputDims.d[0] = inputs[0].d[0]; // for (int var = 1; var < 
inputs[0].nbDims; ++var) { // outputDims.d[var] = 1; // } return Dims2{inputs[0].d[0],1}; } void layernorm::setPluginNamespace(const char* pluginNamespace) { mPluginNamespace = pluginNamespace; } const char* layernorm::getPluginNamespace() const { return mPluginNamespace; } const char* layernorm::getPluginType() const { return "layerNorm_trt"; } const char* layernorm::getPluginVersion() const { return "1"; } DataType layernorm::getOutputDataType(int index, const nvinfer1::DataType* inputTypes, int nbInputs) const { return inputTypes[0] ;//== nvinfer1::DataType::kFLOAT ? nvinfer1::DataType::kFLOAT : nvinfer1::DataType::kHALF; } void layernorm::destroy() { delete this; } IPluginV2IOExt* layernorm::clone() const { layernorm *ln = new layernorm(); ln->setPluginNamespace(mPluginNamespace); ln->setInputSize(mInputSize,Length); return ln; } bool layernorm::isOutputBroadcastAcrossBatch(int outputIndex, const bool* inputIsBroadcasted, int nbInputs) const { return false; } bool layernorm::canBroadcastInputAcrossBatch(int inputIndex) const { return false; } void layernorm::attachToContext(cudnnContext* cudnnContext, cublasContext* cublasContext, IGpuAllocator* gpuAllocator) {} void layernorm::configurePlugin(const PluginTensorDesc* in, int nbInput, const PluginTensorDesc* out, int nbOutput) { int size = 1; for(int i = 0 ; i < in[0].dims.nbDims ; i++) { size *= in[0].dims.d[i]; } mInputSize = size; Length = in[0].dims.d[in[0].dims.nbDims - 1]; } void layernorm::detachFromContext() {} __device__ welford welford_update(welford a, const float *currValue, int length) { #pragma unroll for(int i = 0; i < length; i++){ a.count += 1; float delta = currValue[i] - a.mean; a.mean += delta / a.count; float delta2 = currValue[i] - a.mean; a.M2 += delta * delta2; } return a; } __device__ void mean_std(float* mean, float *std, const float *currValue,int l,int count = 0, float m = 0.0, float s = 0.0) { #pragma unroll for(int i = 0; i < l; i++){ count += 1; float delta = currValue[i] - m; m += 
delta / count; float delta2 = currValue[i] - m; s += delta * delta2; } *mean = m; *std = sqrt((s / count) + 1e-5); } __global__ void lnCudaKer(const float *in, float *mean, float *std, int size,int l) { int idx = threadIdx.x + blockIdx.x * blockDim.x; if (idx >= size) return; mean_std(&mean[idx],&std[idx],in+idx*l,l); //printf("idx = %d,mean = %f, std = %f\n",idx,mean[idx],std[idx]); } void layernorm::forwardGpu(const float *const *inputs, float *mean, float *std, cudaStream_t stream, int batchSize) { int numElem = batchSize * mInputSize/Length; lnCudaKer<<<(numElem + mThreadCount - 1) / mThreadCount, mThreadCount>>> (inputs[0], mean,std, numElem,Length); } int layernorm::enqueue(int batchSize, const void*const * inputs, void** outputs, void* workspace, cudaStream_t stream) { forwardGpu((const float *const *)inputs, (float*)outputs[0], (float*)outputs[1], stream, batchSize); return 0; } PluginFieldCollection layernormCreator::mFC{}; std::vector layernormCreator::mPluginAttributes; layernormCreator::layernormCreator() { mPluginAttributes.clear(); mFC.nbFields = mPluginAttributes.size(); mFC.fields = mPluginAttributes.data(); } const char* layernormCreator::getPluginName() const { return "layerNorm_trt"; } const char* layernormCreator::getPluginVersion() const { return "1"; } const PluginFieldCollection* layernormCreator::getFieldNames() { return &mFC; } IPluginV2IOExt* layernormCreator::createPlugin(const char* name, const PluginFieldCollection* fc) { layernorm* obj = new layernorm(); obj->setPluginNamespace(mNamespace.c_str()); return obj; } IPluginV2IOExt* layernormCreator::deserializePlugin(const char* name, const void* serialData, size_t serialLength) { layernorm* obj = new layernorm(serialData, serialLength); obj->setPluginNamespace(mNamespace.c_str()); return obj; } } ================================================ FILE: swin-transformer/semantic-segmentation/layerNorm.h ================================================ #ifndef LAYERNORM_H #define LAYERNORM_H 
#include #include #include #include #include #include #include #include using namespace std; struct welford { int count = 0; double mean = 0.f; double M2 = 0.f; }; namespace nvinfer1{ class layernorm : public IPluginV2IOExt { public: layernorm(); layernorm(const void* data, size_t length); ~layernorm(); int getNbOutputs() const override { return 2; } Dims getOutputDimensions(int index, const Dims* inputs, int nbInputDims) override; int initialize() override; virtual void terminate() override {}; virtual size_t getWorkspaceSize(int maxBatchSize) const override { return 0; } virtual int enqueue(int batchSize, const void*const * inputs, void** outputs, void* workspace, cudaStream_t stream) override; virtual size_t getSerializationSize() const override; virtual void serialize(void* buffer) const override; bool supportsFormatCombination(int pos, const PluginTensorDesc* inOut, int nbInputs, int nbOutputs) const override { return inOut[pos].format == TensorFormat::kLINEAR && inOut[pos].type == DataType::kFLOAT; } void setPluginNamespace(const char* pluginNamespace) override; const char* getPluginNamespace() const override; const char* getPluginType() const override; const char* getPluginVersion() const override; void destroy() override; IPluginV2IOExt* clone() const override; DataType getOutputDataType(int index, const nvinfer1::DataType* inputTypes, int nbInputs) const override; bool isOutputBroadcastAcrossBatch(int outputIndex, const bool* inputIsBroadcasted, int nbInputs) const override; bool canBroadcastInputAcrossBatch(int inputIndex) const override; void attachToContext( cudnnContext* cudnnContext, cublasContext* cublasContext, IGpuAllocator* gpuAllocator) override; void configurePlugin(const PluginTensorDesc* in, int nbInput, const PluginTensorDesc* out, int nbOutput) override; void detachFromContext() override; void setInputSize(int s, int l) { mInputSize = s; Length = l; } private: void forwardGpu(const float *const * inputs, float *mean, float *std, cudaStream_t 
stream, int batchSize = 1); int mThreadCount = 256; int mInputSize; int Length; Dims outputDims ; const char* mPluginNamespace; }; class layernormCreator : public IPluginCreator { public: layernormCreator(); ~layernormCreator() override = default; const char* getPluginName() const override; const char* getPluginVersion() const override; const PluginFieldCollection* getFieldNames() override; IPluginV2IOExt* createPlugin(const char* name, const PluginFieldCollection* fc) override; IPluginV2IOExt* deserializePlugin(const char* name, const void* serialData, size_t serialLength) override; void setPluginNamespace(const char* libNamespace) override { mNamespace = libNamespace; } const char* getPluginNamespace() const override { return mNamespace.c_str(); } private: std::string mNamespace; static PluginFieldCollection mFC; static std::vector mPluginAttributes; }; REGISTER_TENSORRT_PLUGIN(layernormCreator); }; #endif // LAYERNORM_H ================================================ FILE: swin-transformer/semantic-segmentation/logging.h ================================================ /* * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
*/ #ifndef TENSORRT_LOGGING_H #define TENSORRT_LOGGING_H #include "NvInferRuntimeCommon.h" #include #include #include #include #include #include #include using Severity = nvinfer1::ILogger::Severity; class LogStreamConsumerBuffer : public std::stringbuf { public: LogStreamConsumerBuffer(std::ostream& stream, const std::string& prefix, bool shouldLog) : mOutput(stream) , mPrefix(prefix) , mShouldLog(shouldLog) { } LogStreamConsumerBuffer(LogStreamConsumerBuffer&& other) : mOutput(other.mOutput) { } ~LogStreamConsumerBuffer() { // std::streambuf::pbase() gives a pointer to the beginning of the buffered part of the output sequence // std::streambuf::pptr() gives a pointer to the current position of the output sequence // if the pointer to the beginning is not equal to the pointer to the current position, // call putOutput() to log the output to the stream if (pbase() != pptr()) { putOutput(); } } // synchronizes the stream buffer and returns 0 on success // synchronizing the stream buffer consists of inserting the buffer contents into the stream, // resetting the buffer and flushing the stream virtual int sync() { putOutput(); return 0; } void putOutput() { if (mShouldLog) { // prepend timestamp std::time_t timestamp = std::time(nullptr); tm* tm_local = std::localtime(×tamp); std::cout << "["; std::cout << std::setw(2) << std::setfill('0') << 1 + tm_local->tm_mon << "/"; std::cout << std::setw(2) << std::setfill('0') << tm_local->tm_mday << "/"; std::cout << std::setw(4) << std::setfill('0') << 1900 + tm_local->tm_year << "-"; std::cout << std::setw(2) << std::setfill('0') << tm_local->tm_hour << ":"; std::cout << std::setw(2) << std::setfill('0') << tm_local->tm_min << ":"; std::cout << std::setw(2) << std::setfill('0') << tm_local->tm_sec << "] "; // std::stringbuf::str() gets the string contents of the buffer // insert the buffer contents pre-appended by the appropriate prefix into the stream mOutput << mPrefix << str(); // set the buffer to empty str(""); // flush 
the stream mOutput.flush(); } } void setShouldLog(bool shouldLog) { mShouldLog = shouldLog; } private: std::ostream& mOutput; std::string mPrefix; bool mShouldLog; }; //! //! \class LogStreamConsumerBase //! \brief Convenience object used to initialize LogStreamConsumerBuffer before std::ostream in LogStreamConsumer //! class LogStreamConsumerBase { public: LogStreamConsumerBase(std::ostream& stream, const std::string& prefix, bool shouldLog) : mBuffer(stream, prefix, shouldLog) { } protected: LogStreamConsumerBuffer mBuffer; }; //! //! \class LogStreamConsumer //! \brief Convenience object used to facilitate use of C++ stream syntax when logging messages. //! Order of base classes is LogStreamConsumerBase and then std::ostream. //! This is because the LogStreamConsumerBase class is used to initialize the LogStreamConsumerBuffer member field //! in LogStreamConsumer and then the address of the buffer is passed to std::ostream. //! This is necessary to prevent the address of an uninitialized buffer from being passed to std::ostream. //! Please do not change the order of the parent classes. //! class LogStreamConsumer : protected LogStreamConsumerBase, public std::ostream { public: //! \brief Creates a LogStreamConsumer which logs messages with level severity. //! Reportable severity determines if the messages are severe enough to be logged. 
LogStreamConsumer(Severity reportableSeverity, Severity severity) : LogStreamConsumerBase(severityOstream(severity), severityPrefix(severity), severity <= reportableSeverity) , std::ostream(&mBuffer) // links the stream buffer with the stream , mShouldLog(severity <= reportableSeverity) , mSeverity(severity) { } LogStreamConsumer(LogStreamConsumer&& other) : LogStreamConsumerBase(severityOstream(other.mSeverity), severityPrefix(other.mSeverity), other.mShouldLog) , std::ostream(&mBuffer) // links the stream buffer with the stream , mShouldLog(other.mShouldLog) , mSeverity(other.mSeverity) { } void setReportableSeverity(Severity reportableSeverity) { mShouldLog = mSeverity <= reportableSeverity; mBuffer.setShouldLog(mShouldLog); } private: static std::ostream& severityOstream(Severity severity) { return severity >= Severity::kINFO ? std::cout : std::cerr; } static std::string severityPrefix(Severity severity) { switch (severity) { case Severity::kINTERNAL_ERROR: return "[F] "; case Severity::kERROR: return "[E] "; case Severity::kWARNING: return "[W] "; case Severity::kINFO: return "[I] "; case Severity::kVERBOSE: return "[V] "; default: assert(0); return ""; } } bool mShouldLog; Severity mSeverity; }; //! \class Logger //! //! \brief Class which manages logging of TensorRT tools and samples //! //! \details This class provides a common interface for TensorRT tools and samples to log information to the console, //! and supports logging two types of messages: //! //! - Debugging messages with an associated severity (info, warning, error, or internal error/fatal) //! - Test pass/fail messages //! //! The advantage of having all samples use this class for logging as opposed to emitting directly to stdout/stderr is //! that the logic for controlling the verbosity and formatting of sample output is centralized in one location. //! //! In the future, this class could be extended to support dumping test results to a file in some standard format //! 
(for example, JUnit XML), and providing additional metadata (e.g. timing the duration of a test run). //! //! TODO: For backwards compatibility with existing samples, this class inherits directly from the nvinfer1::ILogger //! interface, which is problematic since there isn't a clean separation between messages coming from the TensorRT //! library and messages coming from the sample. //! //! In the future (once all samples are updated to use Logger::getTRTLogger() to access the ILogger) we can refactor the //! class to eliminate the inheritance and instead make the nvinfer1::ILogger implementation a member of the Logger //! object. class Logger : public nvinfer1::ILogger { public: Logger(Severity severity = Severity::kWARNING) : mReportableSeverity(severity) { } //! //! \enum TestResult //! \brief Represents the state of a given test //! enum class TestResult { kRUNNING, //!< The test is running kPASSED, //!< The test passed kFAILED, //!< The test failed kWAIVED //!< The test was waived }; //! //! \brief Forward-compatible method for retrieving the nvinfer::ILogger associated with this Logger //! \return The nvinfer1::ILogger associated with this Logger //! //! TODO Once all samples are updated to use this method to register the logger with TensorRT, //! we can eliminate the inheritance of Logger from ILogger //! nvinfer1::ILogger& getTRTLogger() { return *this; } //! //! \brief Implementation of the nvinfer1::ILogger::log() virtual method //! //! Note samples should not be calling this function directly; it will eventually go away once we eliminate the //! inheritance from nvinfer1::ILogger //! void log(Severity severity, const char* msg) override { LogStreamConsumer(mReportableSeverity, severity) << "[TRT] " << std::string(msg) << std::endl; } //! //! \brief Method for controlling the verbosity of logging output //! //! \param severity The logger will only emit messages that have severity of this level or higher. //! 
void setReportableSeverity(Severity severity) { mReportableSeverity = severity; } //! //! \brief Opaque handle that holds logging information for a particular test //! //! This object is an opaque handle to information used by the Logger to print test results. //! The sample must call Logger::defineTest() in order to obtain a TestAtom that can be used //! with Logger::reportTest{Start,End}(). //! class TestAtom { public: TestAtom(TestAtom&&) = default; private: friend class Logger; TestAtom(bool started, const std::string& name, const std::string& cmdline) : mStarted(started) , mName(name) , mCmdline(cmdline) { } bool mStarted; std::string mName; std::string mCmdline; }; //! //! \brief Define a test for logging //! //! \param[in] name The name of the test. This should be a string starting with //! "TensorRT" and containing dot-separated strings containing //! the characters [A-Za-z0-9_]. //! For example, "TensorRT.sample_googlenet" //! \param[in] cmdline The command line used to reproduce the test // //! \return a TestAtom that can be used in Logger::reportTest{Start,End}(). //! static TestAtom defineTest(const std::string& name, const std::string& cmdline) { return TestAtom(false, name, cmdline); } //! //! \brief A convenience overloaded version of defineTest() that accepts an array of command-line arguments //! as input //! //! \param[in] name The name of the test //! \param[in] argc The number of command-line arguments //! \param[in] argv The array of command-line arguments (given as C strings) //! //! \return a TestAtom that can be used in Logger::reportTest{Start,End}(). static TestAtom defineTest(const std::string& name, int argc, char const* const* argv) { auto cmdline = genCmdlineString(argc, argv); return defineTest(name, cmdline); } //! //! \brief Report that a test has started. //! //! \pre reportTestStart() has not been called yet for the given testAtom //! //! \param[in] testAtom The handle to the test that has started //! 
static void reportTestStart(TestAtom& testAtom) { reportTestResult(testAtom, TestResult::kRUNNING); assert(!testAtom.mStarted); testAtom.mStarted = true; } //! //! \brief Report that a test has ended. //! //! \pre reportTestStart() has been called for the given testAtom //! //! \param[in] testAtom The handle to the test that has ended //! \param[in] result The result of the test. Should be one of TestResult::kPASSED, //! TestResult::kFAILED, TestResult::kWAIVED //! static void reportTestEnd(const TestAtom& testAtom, TestResult result) { assert(result != TestResult::kRUNNING); assert(testAtom.mStarted); reportTestResult(testAtom, result); } static int reportPass(const TestAtom& testAtom) { reportTestEnd(testAtom, TestResult::kPASSED); return EXIT_SUCCESS; } static int reportFail(const TestAtom& testAtom) { reportTestEnd(testAtom, TestResult::kFAILED); return EXIT_FAILURE; } static int reportWaive(const TestAtom& testAtom) { reportTestEnd(testAtom, TestResult::kWAIVED); return EXIT_SUCCESS; } static int reportTest(const TestAtom& testAtom, bool pass) { return pass ? reportPass(testAtom) : reportFail(testAtom); } Severity getReportableSeverity() const { return mReportableSeverity; } private: //! //! \brief returns an appropriate string for prefixing a log message with the given severity //! static const char* severityPrefix(Severity severity) { switch (severity) { case Severity::kINTERNAL_ERROR: return "[F] "; case Severity::kERROR: return "[E] "; case Severity::kWARNING: return "[W] "; case Severity::kINFO: return "[I] "; case Severity::kVERBOSE: return "[V] "; default: assert(0); return ""; } } //! //! \brief returns an appropriate string for prefixing a test result message with the given result //! 
static const char* testResultString(TestResult result) { switch (result) { case TestResult::kRUNNING: return "RUNNING"; case TestResult::kPASSED: return "PASSED"; case TestResult::kFAILED: return "FAILED"; case TestResult::kWAIVED: return "WAIVED"; default: assert(0); return ""; } } //! //! \brief returns an appropriate output stream (cout or cerr) to use with the given severity //! static std::ostream& severityOstream(Severity severity) { return severity >= Severity::kINFO ? std::cout : std::cerr; } //! //! \brief method that implements logging test results //! static void reportTestResult(const TestAtom& testAtom, TestResult result) { severityOstream(Severity::kINFO) << "&&&& " << testResultString(result) << " " << testAtom.mName << " # " << testAtom.mCmdline << std::endl; } //! //! \brief generate a command line string from the given (argc, argv) values //! static std::string genCmdlineString(int argc, char const* const* argv) { std::stringstream ss; for (int i = 0; i < argc; i++) { if (i > 0) ss << " "; ss << argv[i]; } return ss.str(); } Severity mReportableSeverity; }; namespace { //! //! \brief produces a LogStreamConsumer object that can be used to log messages of severity kVERBOSE //! //! Example usage: //! //! LOG_VERBOSE(logger) << "hello world" << std::endl; //! inline LogStreamConsumer LOG_VERBOSE(const Logger& logger) { return LogStreamConsumer(logger.getReportableSeverity(), Severity::kVERBOSE); } //! //! \brief produces a LogStreamConsumer object that can be used to log messages of severity kINFO //! //! Example usage: //! //! LOG_INFO(logger) << "hello world" << std::endl; //! inline LogStreamConsumer LOG_INFO(const Logger& logger) { return LogStreamConsumer(logger.getReportableSeverity(), Severity::kINFO); } //! //! \brief produces a LogStreamConsumer object that can be used to log messages of severity kWARNING //! //! Example usage: //! //! LOG_WARN(logger) << "hello world" << std::endl; //! 
inline LogStreamConsumer LOG_WARN(const Logger& logger) { return LogStreamConsumer(logger.getReportableSeverity(), Severity::kWARNING); } //! //! \brief produces a LogStreamConsumer object that can be used to log messages of severity kERROR //! //! Example usage: //! //! LOG_ERROR(logger) << "hello world" << std::endl; //! inline LogStreamConsumer LOG_ERROR(const Logger& logger) { return LogStreamConsumer(logger.getReportableSeverity(), Severity::kERROR); } //! //! \brief produces a LogStreamConsumer object that can be used to log messages of severity kINTERNAL_ERROR // ("fatal" severity) //! //! Example usage: //! //! LOG_FATAL(logger) << "hello world" << std::endl; //! inline LogStreamConsumer LOG_FATAL(const Logger& logger) { return LogStreamConsumer(logger.getReportableSeverity(), Severity::kINTERNAL_ERROR); } } // anonymous namespace #endif // TENSORRT_LOGGING_H ================================================ FILE: swin-transformer/semantic-segmentation/main.cpp ================================================ #include using namespace std; ================================================ FILE: swin-transformer/semantic-segmentation/myhpp.h ================================================ #ifndef MYHPP_H #define MYHPP_H #include #include #include #include #define _USE_MATH_DEFINES #include #include #include #include #include #include #include #include #include #include #include #include #include //#include #include #include //#include #include #include #include #include #include #include #include #endif // MYHPP_H ================================================ FILE: swin-transformer/semantic-segmentation/trainsform.cpp ================================================ #include "common.hpp" #include "logging.h" #include #include #include #include #include #include #define USE_FP32 static Logger gLogger; const char *INPUT_BLOB_NAME = "data"; const char *OUTPUT_BLOB_NAME = "output"; static const int bs = 1; static const int channels = 96; static const int ch = 
3; static const int INPUT_H = 576; static const int INPUT_W = 576; static const int NUM_CLASSES = 15; static const int outputSize = 576 * 576; cudaStream_t m_cudaStream; vector m_bindings; IExecutionContext *m_context; ICudaEngine *createEngine(unsigned int maxBatchSize, IBuilder *builder, IBuilderConfig *config, DataType dt,std::string wtsPath) { INetworkDefinition *network = builder->createNetworkV2(0U); ITensor *data = network->addInput(INPUT_BLOB_NAME, dt, Dims3{ch, INPUT_H, INPUT_W}); assert(data); std::map weightMap = loadWeights(wtsPath); ITensor* conv1 = conv(network, weightMap, data, "backbone.patch_embed.proj", channels); ITensor* shuffle1 = shuffle_reshapeApermute(network, conv1, Dims2{channels, -1}, Permutation{1, 0}, true); ITensor *ln = m_layerNorm(network, weightMap, shuffle1, "backbone.patch_embed.norm"); debug_print(ln, "ln"); //layer0 ITensor *mask0 = trt_transform_imgMask(network, 147, 7, 3); ITensor *blk00 = blk(network, weightMap, ln, mask0, "backbone.layers.0.blocks.0", INPUT_H / 4, channels, 3, 7, 0); debug_print(blk00, "blk00"); ITensor *blk01 = blk(network, weightMap, blk00, mask0, "backbone.layers.0.blocks.1", INPUT_H / 4, channels, 3, 7, 3); debug_print(blk01, "blk01"); ITensor* out0 = m_layerNorm(network, weightMap, blk01, "backbone.norm0"); out0 = shuffle_reshapeApermute(network, out0, Dims3{INPUT_H / 4, INPUT_H / 4, channels}, Permutation{2, 0, 1}, true); ITensor *down_layer0 = downsample(network, weightMap, blk01, "backbone.layers.0.downsample", INPUT_H / 4); debug_print(down_layer0, "down_blk1"); //layer1 ITensor *mask1 = trt_transform_imgMask(network, 77, 7, 3); ITensor *blk10 = blk(network, weightMap, down_layer0, mask1, "backbone.layers.1.blocks.0", INPUT_H / 8, channels * 2, 6, 7, 0); debug_print(blk10, "blk10"); ITensor *blk11 = blk(network, weightMap, blk10, mask1, "backbone.layers.1.blocks.1", INPUT_H / 8, channels * 2, 6, 7, 3); debug_print(blk11, "blk11"); ITensor* out1 = m_layerNorm(network, weightMap, blk11, 
"backbone.norm1"); out1 = shuffle_reshapeApermute(network, out1, Dims3{INPUT_H / 8, INPUT_H / 8, channels * 2}, Permutation{2, 0, 1}, true); ITensor *down_layer1 = downsample(network, weightMap, blk11, "backbone.layers.1.downsample", INPUT_H / 8); debug_print(down_layer1, "down_layer1"); //layer2 ITensor *mask2 = trt_transform_imgMask(network, 42, 7, 3); ITensor *blk20 = blk(network, weightMap, down_layer1, mask2, "backbone.layers.2.blocks.0", INPUT_H / 16, channels * 4, 12, 7, 0); debug_print(blk20, "blk20"); ITensor *blk21 = blk(network, weightMap, blk20, mask2, "backbone.layers.2.blocks.1", INPUT_H / 16, channels * 4, 12, 7, 3); debug_print(blk21, "blk21"); ITensor *blk22 = blk(network, weightMap, blk21, mask2, "backbone.layers.2.blocks.2", INPUT_H / 16,channels * 4, 12, 7, 0); debug_print(blk22, "blk22"); ITensor *blk23 = blk(network, weightMap, blk22, mask2, "backbone.layers.2.blocks.3", INPUT_H / 16, channels * 4, 12, 7, 3); debug_print(blk23, "blk23"); ITensor *blk24 = blk(network, weightMap, blk23, mask2, "backbone.layers.2.blocks.4", INPUT_H / 16, channels * 4, 12, 7, 0); debug_print(blk24, "blk24"); ITensor *blk25 = blk(network, weightMap, blk24, mask2, "backbone.layers.2.blocks.5", INPUT_H / 16, channels * 4, 12, 7, 3); debug_print(blk25, "blk25"); ITensor* out2 = m_layerNorm(network, weightMap, blk25, "backbone.norm2"); out2 = shuffle_reshapeApermute(network, out2, Dims3{INPUT_H / 16, INPUT_H / 16, channels * 4}, Permutation{2, 0, 1}, true); ITensor *down_layer2 = downsample(network, weightMap, blk25, "backbone.layers.2.downsample", INPUT_H / 16); debug_print(down_layer2, "down_layer2"); //layer3 ITensor *mask3 = trt_transform_imgMask(network, 21, 7, 3); ITensor *blk30 = blk(network, weightMap, down_layer2, mask3, "backbone.layers.3.blocks.0", INPUT_H / 32, channels * 8, 24, 7, 0); debug_print(blk30, "blk30"); ITensor *blk31 = blk(network, weightMap, blk30, mask3, "backbone.layers.3.blocks.1", INPUT_H / 32, channels * 8, 24, 7, 3); debug_print(blk31, 
"blk31"); ITensor* out3 = m_layerNorm(network, weightMap, blk31, "backbone.norm3"); out3 = shuffle_reshapeApermute(network, out3, Dims3{INPUT_H / 32, INPUT_H / 32, channels * 8}, Permutation{2, 0, 1}, true); ITensor* out[4] = {out0, out1, out2, out3}; out0 = transform_lateral_conv(network, weightMap, out0, "decode_head.lateral_convs.0"); // 512,INPUT_H/4,INPUT_H/4 out1 = transform_lateral_conv(network, weightMap, out1, "decode_head.lateral_convs.1"); // 512,INPUT_H/8,INPUT_H/8 out2 = transform_lateral_conv(network, weightMap, out2, "decode_head.lateral_convs.2"); // 512,INPUT_H/16,INPUT_H/16 auto psp_out_0 = transform_psp(network, weightMap, out3, "decode_head.psp_modules.0.1", 1); auto psp_out_1 = transform_psp(network, weightMap, out3, "decode_head.psp_modules.1.1", 2); auto psp_out_2 = transform_psp(network, weightMap, out3, "decode_head.psp_modules.2.1", 3); auto psp_out_3 = transform_psp(network, weightMap, out3, "decode_head.psp_modules.3.1", 6); ITensor* psp_outs[5] = {out3, psp_out_0, psp_out_1, psp_out_2, psp_out_3}; auto PSP_outs = network->addConcatenation(psp_outs, 5); PSP_outs->setAxis(0); debug_print(PSP_outs->getOutput(0), "PSP_outs"); out3 = transform_lateral_conv(network, weightMap, PSP_outs->getOutput(0), "decode_head.bottleneck", 3, 1, 512); // 512,INPUT_H/32,INPUT_H/32 debug_print(out3, "out3"); auto laterals2 = up_Add(network, out3, out2); auto laterals1 = up_Add(network, laterals2, out1); auto laterals0 = up_Add(network, laterals1, out0); auto fpn0 = transform_lateral_conv(network, weightMap, laterals0, "decode_head.fpn_convs.0", 3, 1, 512); auto fpn1 = transform_lateral_conv(network, weightMap, laterals1, "decode_head.fpn_convs.1", 3, 1, 512); auto fpn2 = transform_lateral_conv(network, weightMap, laterals2, "decode_head.fpn_convs.2", 3, 1, 512); fpn1 = resize(network, fpn1,fpn0->getDimensions().d[1]); fpn2 = resize(network, fpn2,fpn0->getDimensions().d[1]); auto fpn3 = resize(network, out3, fpn0->getDimensions().d[1]); ITensor* fpn_outs[4] = 
{fpn0, fpn1, fpn2, fpn3}; auto FPN_outs = network->addConcatenation(fpn_outs, 4); FPN_outs->setAxis(0); debug_print(FPN_outs->getOutput(0), "FPN_outs"); auto fpn_output = transform_lateral_conv(network, weightMap, FPN_outs->getOutput(0), "decode_head.fpn_bottleneck", 3, 1, 512); debug_print(fpn_output, "fpn_output"); auto seg = network->addConvolutionNd(*fpn_output, NUM_CLASSES, Dims2{1, 1}, weightMap["decode_head.conv_seg.weight"], weightMap["decode_head.conv_seg.bias"]); seg->setStrideNd(Dims2{1, 1}); debug_print(seg->getOutput(0), "seg"); auto seg_resize = resize(network, seg->getOutput(0), INPUT_H); debug_print(seg_resize, "seg_resize"); auto output = network->addTopK(*seg_resize, TopKOperation::kMAX, 1, 0X01)->getOutput(1); debug_print(output, "output"); std::cout << "set name out" << std::endl; output->setName(OUTPUT_BLOB_NAME); network->markOutput(*output); builder->setMaxBatchSize(12); config->setMaxWorkspaceSize((1 << 30)); // 1G #ifdef USE_FP16 std::cout<< "use fp16"<setFlag(BuilderFlag::kFP16); #endif ICudaEngine *engine = builder->buildEngineWithConfig(*network, *config); std::cout << "build success!" 
<< std::endl; network->destroy(); return engine; } void APIToModel(unsigned int maxBatchSize, IHostMemory **modelStream,std::string wtsPath) { IBuilder *builder = createInferBuilder(gLogger); IBuilderConfig *config = builder->createBuilderConfig(); ICudaEngine *engine = createEngine(maxBatchSize, builder, config, DataType::kFLOAT, wtsPath); assert(engine != nullptr); (*modelStream) = engine->serialize(); engine->destroy(); builder->destroy(); } void createEng(std::string wtsPath, std::string engine_name) { char *trtModelStream{nullptr}; size_t size{0}; IHostMemory *modelStream{nullptr}; APIToModel(bs, &modelStream, wtsPath); assert(modelStream != nullptr); std::ofstream p(engine_name, std::ios::binary); if (!p) { std::cerr << "could not open plan output file" << std::endl; return; } p.write(reinterpret_cast(modelStream->data()), modelStream->size()); modelStream->destroy(); std::ifstream file(engine_name, std::ios::binary); if (file.good()) { file.seekg(0, file.end); size = file.tellg(); file.seekg(0, file.beg); trtModelStream = new char[size]; assert(trtModelStream); file.read(trtModelStream, size); file.close(); } } void inference_init(string ENGPath,ICudaEngine *m_engine) { ifstream cache(ENGPath, ios::binary); cache.seekg(0, ios::end); const int engSize = cache.tellg(); cache.seekg(0, ios::beg); void *modelMem = malloc(engSize); cache.read((char*)modelMem, engSize); cache.close(); IRuntime *runtime = nvinfer1::createInferRuntime(gLogger); m_engine = runtime->deserializeCudaEngine(modelMem, engSize); runtime->destroy(); free(modelMem); if (!m_engine) { cout << "deserialize eng error!" << endl; return; } m_context = m_engine->createExecutionContext(); if (cudaStreamCreate(&m_cudaStream) != 0) return; int bindings = m_engine->getNbBindings(); if (bindings < 2) { cout << "Error! the network have one input and one output at least!" 
<< endl; return; } cout << "1111111111111" << endl; m_bindings.resize(bindings, nullptr); CHECK(cudaMalloc(&m_bindings.at(0), bs * ch * INPUT_H * INPUT_W * sizeof(float))); CHECK(cudaMalloc(&m_bindings.at(1), bs * outputSize * 4)); } void doInference(const float *input, int *output) { cout << "do infer:" << endl; CHECK(cudaMemcpyAsync(m_bindings.at(0), input, bs * ch * INPUT_H * INPUT_W * sizeof(float), cudaMemcpyHostToDevice, m_cudaStream)); m_context->enqueue(bs, m_bindings.data(), m_cudaStream, nullptr); CHECK(cudaMemcpyAsync(output, m_bindings.at(1), bs * outputSize * 4, cudaMemcpyDeviceToHost, m_cudaStream)); cudaStreamSynchronize(m_cudaStream); } int main(int argc, char** argv) { cout << "begin" << endl; //string wts = "G:/shaj/trainsform/ktn5n6_29.511.21.8.wts"; //string eng = "G:/shaj/trainsform/trainsform.eng"; std::string argv1 = argv[1]; if (argv1 == "-s") { string wts = argv[2]; string eng = argv[3]; createEng(wts,eng); } else { string eng = argv[2]; ICudaEngine *m_engine; inference_init(eng,m_engine); vector testVal; map dataProb; vector imgs; cv::Mat img; //string pattern_dir = "G:/shaj/trainsform"; string pattern_dir = argv[3]; string pattern = pattern_dir+ "/*.bmp"; vector images_names; cv::glob(pattern, images_names, false); int i = 0; cv::Scalar Mean = cv::Scalar(123.675, 116.28, 103.53); cv::Scalar Std = cv::Scalar(58.395, 57.12, 57.375); cv::Size size = {INPUT_H,INPUT_W}; for (auto image_name: images_names) { if (i < bs) { cv::Mat Img = cv::imread(image_name, 1); testVal.push_back(Img); cout << image_name << endl; imgs.push_back(image_name); } } float *data = new float[bs * ch * INPUT_H * INPUT_W]; int *output = new int[bs * outputSize]; cv::Mat Transed_t = BlobFromImages(testVal, cv::Size{INPUT_H, INPUT_W}, Mean, Std, true, false); memcpy(data, Transed_t.data, bs * ch * INPUT_H * INPUT_W * sizeof(float)); //for(int i = 0 ; i< 20; i++){ auto start_time = std::chrono::system_clock::now(); doInference(data, output); auto end_time = 
std::chrono::system_clock::now(); float duration; duration = std::chrono::duration_cast(end_time - start_time).count(); cout << "time:" << duration << endl; //} // for(int i = 0; i < 100; i++) // cout<(h, w)[0] = out[i] * 10; dst.at(h, w)[1] = out[i] * 30; dst.at(h, w)[2] = out[i] * 40; } } //cout<destroy(); m_engine->destroy(); for (auto bindings: m_bindings) { cudaFree(bindings); } cudaFree(m_cudaStream); cout << "swin_transform" << endl; return 0; } ================================================ FILE: swin-transformer/semantic-segmentation/utilsn.h ================================================ #ifndef UTILSN_H #define UTILSN_H #include #include #include #include #include #include "myhpp.h" using namespace std; #ifndef CUDA_CHECK #define CUDA_CHECK(callstr) \ { \ cudaError_t error_code = callstr; \ if (error_code != cudaSuccess) { \ std::cerr << "CUDA error " << error_code << " at " << __FILE__ << ":" << __LINE__; \ assert(0); \ } \ } #endif namespace Tn { class Profiler : public nvinfer1::IProfiler { public: void printLayerTimes(int itrationsTimes) { float totalTime = 0; for (size_t i = 0; i < mProfile.size(); i++) { printf("%-40.40s %4.3fms\n", mProfile[i].first.c_str(), mProfile[i].second / itrationsTimes); totalTime += mProfile[i].second; } printf("Time over all layers: %4.3f\n", totalTime / itrationsTimes); } private: typedef std::pair Record; std::vector mProfile; virtual void reportLayerTime(const char* layerName, float ms) { auto record = std::find_if(mProfile.begin(), mProfile.end(), [&](const Record& r){ return r.first == layerName; }); if (record == mProfile.end()) { mProfile.push_back(std::make_pair(layerName, ms));} else record->second += ms; } }; template void write(char*& buffer, const T& val) { *reinterpret_cast(buffer) = val; buffer += sizeof(T); } template void read(const char*& buffer, T& val) { val = *reinterpret_cast(buffer); buffer += sizeof(T); } // void* copyToDevice(const void* data, size_t count) // { // void* deviceData; // 
cudaMalloc(&deviceData, count); // cudaMemcpy(deviceData, data, count, cudaMemcpyHostToDevice); // return deviceData; // } // void deserializeToDevice(const char*& hostBuffer, void*& deviceWeights, size_t size) // { // deviceWeights = copyToDevice(hostBuffer, size); // hostBuffer += size; // } // size_t type2size(nvinfer1::DataType type) { return sizeof(float); } // void convertAndCopyToBuffer(char*& buffer, const nvinfer1::Weights& weights) // { // memcpy(buffer, weights.values, weights.count * type2size(weights.type)); // buffer += weights.count * type2size(weights.type); // } } #endif // UTILSN_H ================================================ FILE: tsm/CMakeLists.txt ================================================ cmake_minimum_required(VERSION 2.6) project(TSM) add_definitions(-std=c++11) option(CUDA_USE_STATIC_CUDA_RUNTIME OFF) set(CMAKE_CXX_STANDARD 11) set(CMAKE_BUILD_TYPE Debug) include_directories(${PROJECT_SOURCE_DIR}/include) # include and link dirs of cuda and tensorrt, you need adapt them if yours are different # cuda include_directories(/usr/local/cuda/include) link_directories(/usr/local/cuda/lib64) # tensorrt include_directories(/home/ubuntu/TensorRT/include/) link_directories(/home/ubuntu/TensorRT/lib/) add_executable(tsm_r50 ${PROJECT_SOURCE_DIR}/tsm_r50.cpp) target_link_libraries(tsm_r50 nvinfer) target_link_libraries(tsm_r50 cudart) add_definitions(-O2 -pthread) ================================================ FILE: tsm/README.md ================================================ # Temporal Shift Module TSM-R50 from "TSM: Temporal Shift Module for Efficient Video Understanding" TSM is a widely used Action Recognition model. This TensorRT implementation is tested with TensorRT 5.1 and TensorRT 7.2. For the PyTorch implementation, you can refer to [open-mmlab/mmaction2](https://github.com/open-mmlab/mmaction2) or [mit-han-lab/temporal-shift-module](https://github.com/mit-han-lab/temporal-shift-module). 
More details about the shift module (which is the core of TSM) can be found in [test_shift.py](./test_shift.py). ## Tutorial + For an example, refer to [demo.sh](./demo.sh) + Requirements: Successfully installed `torch>=1.3.0, torchvision` + Step 1: Train/Download TSM-R50 checkpoints from the [official GitHub repo](https://github.com/mit-han-lab/temporal-shift-module) or [MMAction2](https://github.com/open-mmlab/mmaction2) + Supported settings: `num_segments`, `shift_div`, `num_classes`. + Fixed settings: `backbone`(ResNet50), `shift_place`(blockres), `temporal_pool`(False). + Step 2: Convert PyTorch checkpoints to TensorRT weights. ```shell python gen_wts.py /path/to/pytorch.pth --out-filename /path/to/tensorrt.wts ``` + Step 3: Test Python API. + Modify configs in `tsm_r50.py`. + Inference with `tsm_r50.py`. ```python # Supported settings BATCH_SIZE = 1 NUM_SEGMENTS = 8 INPUT_H = 224 INPUT_W = 224 OUTPUT_SIZE = 400 SHIFT_DIV = 8 ``` ```shell usage: tsm_r50.py [-h] [--tensorrt-weights TENSORRT_WEIGHTS] [--input-video INPUT_VIDEO] [--save-engine-path SAVE_ENGINE_PATH] [--load-engine-path LOAD_ENGINE_PATH] [--test-mmaction2] [--mmaction2-config MMACTION2_CONFIG] [--mmaction2-checkpoint MMACTION2_CHECKPOINT] [--test-cpp] [--cpp-result-path CPP_RESULT_PATH] optional arguments: -h, --help show this help message and exit --tensorrt-weights TENSORRT_WEIGHTS Path to TensorRT weights, which is generated by gen_weights.py --input-video INPUT_VIDEO Path to local video file --save-engine-path SAVE_ENGINE_PATH Save engine to local file --load-engine-path LOAD_ENGINE_PATH Saved engine file path --test-mmaction2 Compare TensorRT results with MMAction2 Results --mmaction2-config MMACTION2_CONFIG Path to MMAction2 config file --mmaction2-checkpoint MMACTION2_CHECKPOINT Path to MMAction2 checkpoint url or file path --test-cpp Compare Python API results with C++ API results --cpp-result-path CPP_RESULT_PATH Path to C++ API results ``` + Step 4: Test C++ API. + Modify configs in `tsm_r50.cpp`.
+ Build from source code: `mkdir build && cd build && cmake .. && make` + Generate Engine file: `./tsm_r50 -s` + Inference with the generated engine file and write predictions to a local file: `./tsm_r50 -d` + Compare results with Python API: `python tsm_r50.py --tensorrt-weights /path/to/tensorrt.wts --test-cpp --cpp-result-path /path/to/cpp-result.txt` ## TODO + [x] Python Shift module. + [x] Generate wts of official tsm and mmaction2 tsm. + [x] Python API Definition + [x] Test with mmaction2 demo + [x] Tutorial + [x] C++ API Definition ================================================ FILE: tsm/demo.sh ================================================ # Step 1: Get checkpoints from mmaction2 # https://github.com/open-mmlab/mmaction2/tree/master/configs/recognition/tsm wget https://download.openmmlab.com/mmaction/recognition/tsm/tsm_r50_1x1x8_50e_kinetics400_rgb/tsm_r50_1x1x8_50e_kinetics400_rgb_20200607-af7fb746.pth # Step 2: Convert pytorch checkpoints to TensorRT weights python gen_wts.py tsm_r50_1x1x8_50e_kinetics400_rgb_20200607-af7fb746.pth --out-filename ./tsm_r50_kinetics400_mmaction2.wts # Step 3: Test Python API. # 3.1 Skip this step since we use default settings. # 3.2 Inference # 3.2.1 Save local engine file to `./tsm_r50_kinetics400_mmaction2.trt`. python tsm_r50.py \ --tensorrt-weights ./tsm_r50_kinetics400_mmaction2.wts \ --save-engine-path ./tsm_r50_kinetics400_mmaction2.trt # 3.2.2 Predict the recognition result using a single video `demo.mp4`.
# Should print `Result class id 6`, aka `arm wrestling` # Download demo video wget https://raw.githubusercontent.com/open-mmlab/mmaction2/master/demo/demo.mp4 # # use *.wts as input # python tsm_r50.py --tensorrt-weights ./tsm_r50_kinetics400_mmaction2.wts \ # --input-video ./demo.mp4 # use engine file as input python tsm_r50.py --load-engine-path ./tsm_r50_kinetics400_mmaction2.trt \ --input-video ./demo.mp4 # 3.2.3 Optional: Compare inference result with MMAction2 TSM-R50 model # MMAction2 must be installed first; please refer to https://github.com/open-mmlab/mmaction2/blob/master/docs/install.md # pip3 install pytest-runner # pip3 install mmcv # pip3 install mmaction2 # # use *.wts as input # python tsm_r50.py \ # --tensorrt-weights ./tsm_r50_kinetics400_mmaction2.wts \ # --test-mmaction2 \ # --mmaction2-config mmaction2_tsm_r50_config.py \ # --mmaction2-checkpoint tsm_r50_1x1x8_50e_kinetics400_rgb_20200607-af7fb746.pth # # use TensorRT engine as input # python tsm_r50.py \ # --load-engine-path ./tsm_r50_kinetics400_mmaction2.trt \ # --test-mmaction2 \ # --mmaction2-config mmaction2_tsm_r50_config.py \ # --mmaction2-checkpoint tsm_r50_1x1x8_50e_kinetics400_rgb_20200607-af7fb746.pth # Step 4: Test C++ API. # 4.1 Skip this step since we use default settings. # 4.2 Build CPP mkdir build && cd build && cmake .. && make # 4.3 Generate Engine file ./tsm_r50 -s # 4.4 Get Predictions ./tsm_r50 -d # 4.5 Compare C++ Results with Python Results cd ..
python tsm_r50.py --test-cpp --tensorrt-weights ./tsm_r50_kinetics400_mmaction2.wts ================================================ FILE: tsm/gen_wts.py ================================================ import argparse import struct import torch import numpy as np def write_one_weight(writer, name, weight): assert isinstance(weight, np.ndarray) values = weight.reshape(-1) writer.write('{} {}'.format(name, len(values))) for value in values: writer.write(' ') # float to bytes to hex_string writer.write(struct.pack('>f', float(value)).hex()) writer.write('\n') def convert_name(name): return name.replace("module.", "").replace("base_model.", "").\ replace("net.", "").replace("new_fc", "fc").replace("backbone.", "").\ replace("cls_head.fc_cls", "fc").replace(".conv.", ".").\ replace("conv1.bn", "bn1").replace("conv2.bn", "bn2").\ replace("conv3.bn", "bn3").replace("downsample.bn", "downsample.1").\ replace("downsample.weight", "downsample.0.weight") def main(args): ckpt = torch.load(args.checkpoint)['state_dict'] ckpt = {k: v for k, v in ckpt.items() if 'num_batches_tracked' not in k} with open(args.out_filename, "w") as f: f.write(f"{len(ckpt)}\n") for k, v in ckpt.items(): key = convert_name(k) write_one_weight(f, key, v.cpu().numpy()) if __name__ == '__main__': parser = argparse.ArgumentParser() parser.add_argument("checkpoint", type=str, help="Path to checkpoint file") parser.add_argument("--out-filename", type=str, default="tsm_r50.wts", help="Path to converted wegiths file") args = parser.parse_args() main(args) ================================================ FILE: tsm/logging.h ================================================ /* * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
* You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ #ifndef TENSORRT_LOGGING_H #define TENSORRT_LOGGING_H #include "NvInferRuntimeCommon.h" #include #include #include #include #include #include #include using Severity = nvinfer1::ILogger::Severity; class LogStreamConsumerBuffer : public std::stringbuf { public: LogStreamConsumerBuffer(std::ostream& stream, const std::string& prefix, bool shouldLog) : mOutput(stream) , mPrefix(prefix) , mShouldLog(shouldLog) { } LogStreamConsumerBuffer(LogStreamConsumerBuffer&& other) : mOutput(other.mOutput) { } ~LogStreamConsumerBuffer() { // std::streambuf::pbase() gives a pointer to the beginning of the buffered part of the output sequence // std::streambuf::pptr() gives a pointer to the current position of the output sequence // if the pointer to the beginning is not equal to the pointer to the current position, // call putOutput() to log the output to the stream if (pbase() != pptr()) { putOutput(); } } // synchronizes the stream buffer and returns 0 on success // synchronizing the stream buffer consists of inserting the buffer contents into the stream, // resetting the buffer and flushing the stream virtual int sync() { putOutput(); return 0; } void putOutput() { if (mShouldLog) { // prepend timestamp std::time_t timestamp = std::time(nullptr); tm* tm_local = std::localtime(×tamp); std::cout << "["; std::cout << std::setw(2) << std::setfill('0') << 1 + tm_local->tm_mon << "/"; std::cout << std::setw(2) << std::setfill('0') << tm_local->tm_mday << "/"; std::cout << std::setw(4) << std::setfill('0') << 1900 + tm_local->tm_year << "-"; std::cout << std::setw(2) << 
std::setfill('0') << tm_local->tm_hour << ":"; std::cout << std::setw(2) << std::setfill('0') << tm_local->tm_min << ":"; std::cout << std::setw(2) << std::setfill('0') << tm_local->tm_sec << "] "; // std::stringbuf::str() gets the string contents of the buffer // insert the buffer contents pre-appended by the appropriate prefix into the stream mOutput << mPrefix << str(); // set the buffer to empty str(""); // flush the stream mOutput.flush(); } } void setShouldLog(bool shouldLog) { mShouldLog = shouldLog; } private: std::ostream& mOutput; std::string mPrefix; bool mShouldLog; }; //! //! \class LogStreamConsumerBase //! \brief Convenience object used to initialize LogStreamConsumerBuffer before std::ostream in LogStreamConsumer //! class LogStreamConsumerBase { public: LogStreamConsumerBase(std::ostream& stream, const std::string& prefix, bool shouldLog) : mBuffer(stream, prefix, shouldLog) { } protected: LogStreamConsumerBuffer mBuffer; }; //! //! \class LogStreamConsumer //! \brief Convenience object used to facilitate use of C++ stream syntax when logging messages. //! Order of base classes is LogStreamConsumerBase and then std::ostream. //! This is because the LogStreamConsumerBase class is used to initialize the LogStreamConsumerBuffer member field //! in LogStreamConsumer and then the address of the buffer is passed to std::ostream. //! This is necessary to prevent the address of an uninitialized buffer from being passed to std::ostream. //! Please do not change the order of the parent classes. //! class LogStreamConsumer : protected LogStreamConsumerBase, public std::ostream { public: //! \brief Creates a LogStreamConsumer which logs messages with level severity. //! Reportable severity determines if the messages are severe enough to be logged. 
LogStreamConsumer(Severity reportableSeverity, Severity severity) : LogStreamConsumerBase(severityOstream(severity), severityPrefix(severity), severity <= reportableSeverity) , std::ostream(&mBuffer) // links the stream buffer with the stream , mShouldLog(severity <= reportableSeverity) , mSeverity(severity) { } LogStreamConsumer(LogStreamConsumer&& other) : LogStreamConsumerBase(severityOstream(other.mSeverity), severityPrefix(other.mSeverity), other.mShouldLog) , std::ostream(&mBuffer) // links the stream buffer with the stream , mShouldLog(other.mShouldLog) , mSeverity(other.mSeverity) { } void setReportableSeverity(Severity reportableSeverity) { mShouldLog = mSeverity <= reportableSeverity; mBuffer.setShouldLog(mShouldLog); } private: static std::ostream& severityOstream(Severity severity) { return severity >= Severity::kINFO ? std::cout : std::cerr; } static std::string severityPrefix(Severity severity) { switch (severity) { case Severity::kINTERNAL_ERROR: return "[F] "; case Severity::kERROR: return "[E] "; case Severity::kWARNING: return "[W] "; case Severity::kINFO: return "[I] "; case Severity::kVERBOSE: return "[V] "; default: assert(0); return ""; } } bool mShouldLog; Severity mSeverity; }; //! \class Logger //! //! \brief Class which manages logging of TensorRT tools and samples //! //! \details This class provides a common interface for TensorRT tools and samples to log information to the console, //! and supports logging two types of messages: //! //! - Debugging messages with an associated severity (info, warning, error, or internal error/fatal) //! - Test pass/fail messages //! //! The advantage of having all samples use this class for logging as opposed to emitting directly to stdout/stderr is //! that the logic for controlling the verbosity and formatting of sample output is centralized in one location. //! //! In the future, this class could be extended to support dumping test results to a file in some standard format //! 
(for example, JUnit XML), and providing additional metadata (e.g. timing the duration of a test run). //! //! TODO: For backwards compatibility with existing samples, this class inherits directly from the nvinfer1::ILogger //! interface, which is problematic since there isn't a clean separation between messages coming from the TensorRT //! library and messages coming from the sample. //! //! In the future (once all samples are updated to use Logger::getTRTLogger() to access the ILogger) we can refactor the //! class to eliminate the inheritance and instead make the nvinfer1::ILogger implementation a member of the Logger //! object. class Logger : public nvinfer1::ILogger { public: Logger(Severity severity = Severity::kWARNING) : mReportableSeverity(severity) { } //! //! \enum TestResult //! \brief Represents the state of a given test //! enum class TestResult { kRUNNING, //!< The test is running kPASSED, //!< The test passed kFAILED, //!< The test failed kWAIVED //!< The test was waived }; //! //! \brief Forward-compatible method for retrieving the nvinfer::ILogger associated with this Logger //! \return The nvinfer1::ILogger associated with this Logger //! //! TODO Once all samples are updated to use this method to register the logger with TensorRT, //! we can eliminate the inheritance of Logger from ILogger //! nvinfer1::ILogger& getTRTLogger() { return *this; } //! //! \brief Implementation of the nvinfer1::ILogger::log() virtual method //! //! Note samples should not be calling this function directly; it will eventually go away once we eliminate the //! inheritance from nvinfer1::ILogger //! void log(Severity severity, const char* msg) override { LogStreamConsumer(mReportableSeverity, severity) << "[TRT] " << std::string(msg) << std::endl; } //! //! \brief Method for controlling the verbosity of logging output //! //! \param severity The logger will only emit messages that have severity of this level or higher. //! 
void setReportableSeverity(Severity severity) { mReportableSeverity = severity; } //! //! \brief Opaque handle that holds logging information for a particular test //! //! This object is an opaque handle to information used by the Logger to print test results. //! The sample must call Logger::defineTest() in order to obtain a TestAtom that can be used //! with Logger::reportTest{Start,End}(). //! class TestAtom { public: TestAtom(TestAtom&&) = default; private: friend class Logger; TestAtom(bool started, const std::string& name, const std::string& cmdline) : mStarted(started) , mName(name) , mCmdline(cmdline) { } bool mStarted; std::string mName; std::string mCmdline; }; //! //! \brief Define a test for logging //! //! \param[in] name The name of the test. This should be a string starting with //! "TensorRT" and containing dot-separated strings containing //! the characters [A-Za-z0-9_]. //! For example, "TensorRT.sample_googlenet" //! \param[in] cmdline The command line used to reproduce the test // //! \return a TestAtom that can be used in Logger::reportTest{Start,End}(). //! static TestAtom defineTest(const std::string& name, const std::string& cmdline) { return TestAtom(false, name, cmdline); } //! //! \brief A convenience overloaded version of defineTest() that accepts an array of command-line arguments //! as input //! //! \param[in] name The name of the test //! \param[in] argc The number of command-line arguments //! \param[in] argv The array of command-line arguments (given as C strings) //! //! \return a TestAtom that can be used in Logger::reportTest{Start,End}(). static TestAtom defineTest(const std::string& name, int argc, char const* const* argv) { auto cmdline = genCmdlineString(argc, argv); return defineTest(name, cmdline); } //! //! \brief Report that a test has started. //! //! \pre reportTestStart() has not been called yet for the given testAtom //! //! \param[in] testAtom The handle to the test that has started //! 
static void reportTestStart(TestAtom& testAtom) { reportTestResult(testAtom, TestResult::kRUNNING); assert(!testAtom.mStarted); testAtom.mStarted = true; } //! //! \brief Report that a test has ended. //! //! \pre reportTestStart() has been called for the given testAtom //! //! \param[in] testAtom The handle to the test that has ended //! \param[in] result The result of the test. Should be one of TestResult::kPASSED, //! TestResult::kFAILED, TestResult::kWAIVED //! static void reportTestEnd(const TestAtom& testAtom, TestResult result) { assert(result != TestResult::kRUNNING); assert(testAtom.mStarted); reportTestResult(testAtom, result); } static int reportPass(const TestAtom& testAtom) { reportTestEnd(testAtom, TestResult::kPASSED); return EXIT_SUCCESS; } static int reportFail(const TestAtom& testAtom) { reportTestEnd(testAtom, TestResult::kFAILED); return EXIT_FAILURE; } static int reportWaive(const TestAtom& testAtom) { reportTestEnd(testAtom, TestResult::kWAIVED); return EXIT_SUCCESS; } static int reportTest(const TestAtom& testAtom, bool pass) { return pass ? reportPass(testAtom) : reportFail(testAtom); } Severity getReportableSeverity() const { return mReportableSeverity; } private: //! //! \brief returns an appropriate string for prefixing a log message with the given severity //! static const char* severityPrefix(Severity severity) { switch (severity) { case Severity::kINTERNAL_ERROR: return "[F] "; case Severity::kERROR: return "[E] "; case Severity::kWARNING: return "[W] "; case Severity::kINFO: return "[I] "; case Severity::kVERBOSE: return "[V] "; default: assert(0); return ""; } } //! //! \brief returns an appropriate string for prefixing a test result message with the given result //! 
static const char* testResultString(TestResult result) { switch (result) { case TestResult::kRUNNING: return "RUNNING"; case TestResult::kPASSED: return "PASSED"; case TestResult::kFAILED: return "FAILED"; case TestResult::kWAIVED: return "WAIVED"; default: assert(0); return ""; } } //! //! \brief returns an appropriate output stream (cout or cerr) to use with the given severity //! static std::ostream& severityOstream(Severity severity) { return severity >= Severity::kINFO ? std::cout : std::cerr; } //! //! \brief method that implements logging test results //! static void reportTestResult(const TestAtom& testAtom, TestResult result) { severityOstream(Severity::kINFO) << "&&&& " << testResultString(result) << " " << testAtom.mName << " # " << testAtom.mCmdline << std::endl; } //! //! \brief generate a command line string from the given (argc, argv) values //! static std::string genCmdlineString(int argc, char const* const* argv) { std::stringstream ss; for (int i = 0; i < argc; i++) { if (i > 0) ss << " "; ss << argv[i]; } return ss.str(); } Severity mReportableSeverity; }; namespace { //! //! \brief produces a LogStreamConsumer object that can be used to log messages of severity kVERBOSE //! //! Example usage: //! //! LOG_VERBOSE(logger) << "hello world" << std::endl; //! inline LogStreamConsumer LOG_VERBOSE(const Logger& logger) { return LogStreamConsumer(logger.getReportableSeverity(), Severity::kVERBOSE); } //! //! \brief produces a LogStreamConsumer object that can be used to log messages of severity kINFO //! //! Example usage: //! //! LOG_INFO(logger) << "hello world" << std::endl; //! inline LogStreamConsumer LOG_INFO(const Logger& logger) { return LogStreamConsumer(logger.getReportableSeverity(), Severity::kINFO); } //! //! \brief produces a LogStreamConsumer object that can be used to log messages of severity kWARNING //! //! Example usage: //! //! LOG_WARN(logger) << "hello world" << std::endl; //! 
inline LogStreamConsumer LOG_WARN(const Logger& logger)
{
    return LogStreamConsumer(logger.getReportableSeverity(), Severity::kWARNING);
}

//!
//! \brief produces a LogStreamConsumer object that can be used to log messages of severity kERROR
//!
//! Example usage:
//!
//!     LOG_ERROR(logger) << "hello world" << std::endl;
//!
inline LogStreamConsumer LOG_ERROR(const Logger& logger)
{
    return LogStreamConsumer(logger.getReportableSeverity(), Severity::kERROR);
}

//!
//! \brief produces a LogStreamConsumer object that can be used to log messages of severity kINTERNAL_ERROR
//         ("fatal" severity)
//!
//! Example usage:
//!
//!     LOG_FATAL(logger) << "hello world" << std::endl;
//!
inline LogStreamConsumer LOG_FATAL(const Logger& logger)
{
    return LogStreamConsumer(logger.getReportableSeverity(), Severity::kINTERNAL_ERROR);
}

} // anonymous namespace

#endif // TENSORRT_LOGGING_H
================================================
FILE: tsm/mmaction2_tsm_r50_config.py
================================================
# model settings
model = dict(
    type='Recognizer2D',
    backbone=dict(
        type='ResNetTSM',
        pretrained='torchvision://resnet50',
        depth=50,
        norm_eval=False,
        shift_div=8),
    cls_head=dict(
        type='TSMHead',
        num_classes=400,
        in_channels=2048,
        spatial_type='avg',
        consensus=dict(type='AvgConsensus', dim=1),
        dropout_ratio=0.5,
        init_std=0.001,
        is_shift=True),
    # model training and testing settings
    train_cfg=None,
    test_cfg=dict(average_clips='prob'))
================================================
FILE: tsm/test_shift.py
================================================
import numpy as np
import pycuda.autoinit  # noqa
import pycuda.driver as cuda
import tensorrt as trt
import torch
from numpy.testing import assert_array_almost_equal

# Tensor names given to the network input and to the marked output.
INPUT_BLOB_NAME = 'input'
OUTPUT_BLOB_NAME = 'output'


def shift_mit(x, num_segments, shift_div=8):
    """Official temporal shift module.

    Code Reference: https://github.com/mit-han-lab/temporal-shift-module/blob/master/ops/temporal_shift.py  # noqa
    Cannot convert to ONNX Model.

    Args:
        x: Tensor whose ``size()`` unpacks to ``(nt, c, h, w)``; it is viewed
            as ``(nt // num_segments, num_segments, c, h, w)``.
        num_segments: Size of the segment axis in that view.
        shift_div: Divisor for the channel count; ``fold = c // shift_div``
            channels are shifted in each direction.

    Returns:
        The shifted tensor, viewed back to ``(nt, c, h, w)``.
    """
    nt, c, h, w = x.size()
    n_batch = nt // num_segments
    x = x.view(n_batch, num_segments, c, h, w)

    fold = c // shift_div
    # Start from zeros so positions that receive no data stay zero.
    out = torch.zeros_like(x)
    out[:, :-1, :fold] = x[:, 1:, :fold]  # shift left
    out[:, 1:, fold:2 * fold] = x[:, :-1, fold:2 * fold]  # shift right
    out[:, :, 2 * fold:] = x[:, :, 2 * fold:]  # not shift

    return out.view(nt, c, h, w)


def shift_mmaction2(x, num_segments, shift_div=8):
    """MMAction2 temporal shift module.

    Code Reference: https://github.com/open-mmlab/mmaction2/blob/master/mmaction/models/backbones/resnet_tsm.py  # noqa
    Could convert to ONNX Model.

    Args:
        x: Tensor whose ``size()`` unpacks to ``(n, c, h, w)``; it is viewed
            as ``(-1, num_segments, c, h * w)``.
        num_segments: Size of the segment axis in that view.
        shift_div: Divisor for the channel count; ``fold = c // shift_div``.

    Returns:
        The shifted tensor, viewed back to ``(n, c, h, w)``.
    """
    # [N, C, H, W]
    n, c, h, w = x.size()

    # [N // num_segments, num_segments, C, H*W]
    # can't use 5 dimensional array on PPL2D backend for caffe
    x = x.view(-1, num_segments, c, h * w)

    # get shift fold
    fold = c // shift_div

    # split c channel into three parts:
    # left_split, mid_split, right_split
    left_split = x[:, :, :fold, :]
    mid_split = x[:, :, fold:2 * fold, :]
    right_split = x[:, :, 2 * fold:, :]

    # can't use torch.zeros(*A.shape) or torch.zeros_like(A)
    # because array on caffe inference must be got by computing

    # shift left on num_segments channel in `left_split`
    zeros = left_split - left_split
    blank = zeros[:, :1, :, :]
    left_split = left_split[:, 1:, :, :]
    # drop the first segment and append one zero segment at the end
    left_split = torch.cat((left_split, blank), 1)

    # shift right on num_segments channel in `mid_split`
    zeros = mid_split - mid_split
    blank = zeros[:, :1, :, :]
    mid_split = mid_split[:, :-1, :, :]
    # drop the last segment and prepend one zero segment at the front
    mid_split = torch.cat((blank, mid_split), 1)

    # right_split: no shift

    # concatenate
    out = torch.cat((left_split, mid_split, right_split), 2)

    # [N, C, H, W]
    # restore the original dimension
    return out.view(n, c, h, w)


def _tensorrt_shift_module(network,
                           input,
                           num_segments=8,
                           shift_div=8,
                           input_shape=(16, 64, 32, 32)):
    """Temporal shift module implemented by TensorRT Network Definition API.

    Mirrors the slicing done by ``shift_mit``: the input is reshaped to five
    dimensions, three slices are taken along the channel axis, two of them
    are padded with a constant zero block on the segment axis, and the parts
    are concatenated and reshaped back.

    Args:
        network: Object providing ``add_shuffle``, ``add_slice``,
            ``add_constant`` and ``add_concatenation``.
        input: Input tensor of the module.
        num_segments: Size of the segment axis after the first reshape.
        shift_div: Divisor for the channel count;
            ``fold = input_shape[1] // shift_div``.
        input_shape: 4-element shape of ``input``; indices 0-3 are read.

    Returns:
        The final shuffle *layer* (not its tensor); callers read the tensor
        through ``get_output(0)``.
    """
    fold = input_shape[1] // shift_div
    batch_size = input_shape[0] // num_segments

    # reshape
    # 4-D input -> (batch_size, num_segments, C, H, W) so segments sit on axis 1
    reshape = network.add_shuffle(input)
    assert reshape
    reshape.reshape_dims = (batch_size, num_segments) + tuple(input_shape[-3:])

    # left
    # segments 1.. of the first `fold` channels, then one zero block appended
    left_split = network.add_slice(reshape.get_output(0),
                                   start=(0, 1, 0, 0, 0),
                                   shape=(batch_size, num_segments - 1, fold,
                                          input_shape[2], input_shape[3]),
                                   stride=(1, 1, 1, 1, 1))
    assert left_split
    left_split_shape = (batch_size, 1, fold, input_shape[2], input_shape[3])
    left_blank = network.add_constant(shape=left_split_shape,
                                      weights=np.zeros(left_split_shape,
                                                       np.float32))
    assert left_blank
    left = network.add_concatenation(
        [left_split.get_output(0),
         left_blank.get_output(0)])
    assert left
    left.axis = 1

    # mid
    # one zero block, then segments 0..num_segments-2 of the second `fold` channels
    mid_split_shape = (batch_size, 1, fold, input_shape[2], input_shape[3])
    mid_blank = network.add_constant(shape=mid_split_shape,
                                     weights=np.zeros(mid_split_shape,
                                                      np.float32))
    assert mid_blank
    mid_split = network.add_slice(reshape.get_output(0),
                                  start=(0, 0, fold, 0, 0),
                                  shape=(batch_size, num_segments - 1, fold,
                                         input_shape[2], input_shape[3]),
                                  stride=(1, 1, 1, 1, 1))
    assert mid_split
    mid = network.add_concatenation(
        [mid_blank.get_output(0),
         mid_split.get_output(0)])
    assert mid
    mid.axis = 1

    # right
    # remaining channels (from 2 * fold on) for every segment, not shifted
    right = network.add_slice(reshape.get_output(0),
                              start=(0, 0, 2 * fold, 0, 0),
                              shape=(batch_size, num_segments,
                                     input_shape[1] - 2 * fold, input_shape[2],
                                     input_shape[3]),
                              stride=(1, 1, 1, 1, 1))

    # concat
    # join the three channel groups back together on the channel axis (axis 2)
    concat = network.add_concatenation(
        [left.get_output(0),
         mid.get_output(0),
         right.get_output(0)])
    assert concat
    concat.axis = 2

    # reshape
    # back to the original 4-D input shape
    reshape2 = network.add_shuffle(concat.get_output(0))
    assert reshape2
    reshape2.reshape_dims = input_shape

    return reshape2


def shift_tensorrt(x, num_segments, shift_div, input_shape):
    """Test TensorRT temporal shift module.

    Builds a network containing only ``_tensorrt_shift_module``, builds an
    engine from it, runs it once on ``x`` and returns the result.

    Args:
        x: ``numpy.ndarray`` input; it is raveled and copied into a float32
            host buffer of ``trt.volume(input_shape)`` elements.
        num_segments: Forwarded to ``_tensorrt_shift_module``.
        shift_div: Forwarded to ``_tensorrt_shift_module``.
        input_shape: Shape of the network input and of the returned array.

    Returns:
        ``numpy.ndarray`` with shape ``input_shape`` holding the output.
    """
    assert isinstance(x, np.ndarray)
    gLogger = trt.Logger(trt.Logger.INFO)
    builder = trt.Builder(gLogger)
    config = builder.create_builder_config()

    # create engine
    explicit_flag = 1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)
    network = builder.create_network(explicit_flag)
    input = network.add_input(INPUT_BLOB_NAME, trt.float32, input_shape)
    assert input
    output = _tensorrt_shift_module(network,
                                    input,
                                    num_segments=num_segments,
                                    shift_div=shift_div,
                                    input_shape=input_shape)
    assert output

    # generate engine by builder/network/config
    output.get_output(0).name = OUTPUT_BLOB_NAME
    network.mark_output(output.get_output(0))
    # NOTE(review): these two limits are set on the builder while the engine
    # is built from `config` -- confirm they take effect on the TensorRT
    # version in use.
    builder.max_batch_size = 1
    builder.max_workspace_size = 1 << 20
    engine = builder.build_engine(network, config)
    del network
    # exactly one input and one output binding are expected
    assert engine.num_bindings == 2, f'{engine.num_bindings}'
    context = engine.create_execution_context()

    # buffer
    host_in = cuda.pagelocked_empty(trt.volume(input_shape), dtype=np.float32)
    np.copyto(host_in, x.ravel())
    host_out = cuda.pagelocked_empty(trt.volume(input_shape), dtype=np.float32)
    # buffers from cuda.mem_alloc, passed to the engine as bindings
    # ("devide" is presumably a typo for "device"; names left unchanged)
    devide_in = cuda.mem_alloc(host_in.nbytes)
    devide_out = cuda.mem_alloc(host_out.nbytes)
    bindings = [int(devide_in), int(devide_out)]
    stream = cuda.Stream()

    # do inference
    # copy in, execute and copy out are queued on one stream, then awaited
    cuda.memcpy_htod_async(devide_in, host_in, stream)
    context.execute_async(bindings=bindings, stream_handle=stream.handle)
    cuda.memcpy_dtoh_async(host_out, devide_out, stream)
    stream.synchronize()

    return np.array(host_out.reshape(*input_shape))


if __name__ == '__main__':
    # Compare the two PyTorch shift implementations with the TensorRT one on
    # a random input.
    INPUT_SHAPE = (16, 64, 32, 32)
    assert len(INPUT_SHAPE) == 4
    NUM_SEGMENTS = 8
    SHIFT_DIV = 8

    # inference
    inputs = np.random.rand(*INPUT_SHAPE).astype(np.float32)
    inputs_pytorch = torch.tensor(inputs)
    with torch.no_grad():
        rmit = shift_mit(inputs_pytorch, NUM_SEGMENTS, SHIFT_DIV).numpy()
        rmmaction2 = shift_mmaction2(inputs_pytorch, NUM_SEGMENTS,
                                     SHIFT_DIV).numpy()
    rtensorrt = shift_tensorrt(inputs, NUM_SEGMENTS, SHIFT_DIV, INPUT_SHAPE)

    # test results
    assert_array_almost_equal(rmit, rtensorrt)
    assert_array_almost_equal(rmmaction2, rtensorrt)
    print("Tests PASSED")
================================================
FILE: tsm/tsm_r50.cpp
================================================
#include "NvInfer.h" #include "cuda_runtime_api.h" #include "logging.h" #include #include #include #include #include #include #include #include #define CHECK(status) \ do\ {\ auto ret = (status);\ if (ret != 0)\ {\ std::cerr << "Cuda failure: " << ret << std::endl;\ abort();\ }\ } while (0) static const int INPUT_H = 224; static const int INPUT_W = 224; static const int OUTPUT_SIZE = 400; static const int NUM_SEGMENTS = 8; static const int SHIFT_DIV = 8; const char* INPUT_BLOB_NAME = "data"; const char* OUTPUT_BLOB_NAME = "prob"; const char* WEIGHTS_PATH = "../tsm_r50_kinetics400_mmaction2.wts"; const char* ENGINE_PATH = "./tsm_r50_kinetics400_mmaction2_cpp.trt"; const char* RESULT_PATH = "./result.txt"; using namespace nvinfer1; static Logger gLogger; // Load weights from files shared with TensorRT samples. // TensorRT weight files have a simple space delimited format: // [type] [size] std::map loadWeights(const std::string file) { std::cout << "Loading weights: " << file << std::endl; std::map weightMap; // Open weights file std::ifstream input(file); assert(input.is_open() && "Unable to load weight file."); // Read number of weight blobs int32_t count; input >> count; assert(count > 0 && "Invalid weight map file."); while (count--) { Weights wt{DataType::kFLOAT, nullptr, 0}; uint32_t size; // Read name and type of blob std::string name; input >> name >> std::dec >> size; wt.type = DataType::kFLOAT; // Load blob uint32_t* val = reinterpret_cast(malloc(sizeof(val) * size)); for (uint32_t x = 0, y = size; x < y; ++x) { input >> std::hex >> val[x]; } wt.values = val; wt.count = size; weightMap[name] = wt; } return weightMap; } void print(char* name, ITensor* tensor) { Dims dim = tensor->getDimensions(); std::cout << name << " " << dim.d[0] << " " << dim.d[1] << " " << dim.d[2] << " " << dim.d[3] <(malloc(sizeof(zeros) * fold*inputShape.d[2]*inputShape.d[3])); memset(zeros, 0, sizeof(zeros) * fold*inputShape.d[2]*inputShape.d[3]); Weights 
zeros_weights{DataType::kFLOAT, zeros, fold*inputShape.d[2]*inputShape.d[3]}; // left ISliceLayer* left1 = network->addSlice(input, Dims4{1, 0, 0, 0}, Dims4{numSegments - 1, fold, inputShape.d[2], inputShape.d[3]}, Dims4{1, 1, 1, 1}); IConstantLayer* left2 = network->addConstant(Dims4{1, fold, inputShape.d[2], inputShape.d[3]}, zeros_weights); ITensor* tensorsLeft[] = {left1->getOutput(0), left2->getOutput(0)}; IConcatenationLayer* left = network->addConcatenation(tensorsLeft, 2); left->setAxis(0); // mid IConstantLayer* mid1 = network->addConstant(Dims4{1, fold, inputShape.d[2], inputShape.d[3]}, zeros_weights); ISliceLayer* mid2 = network->addSlice(input, Dims4{0, fold, 0, 0}, Dims4{numSegments - 1, fold, inputShape.d[2], inputShape.d[3]}, Dims4{1, 1, 1, 1}); ITensor* tensorsMid[] = {mid1->getOutput(0), mid2->getOutput(0)}; IConcatenationLayer* mid = network->addConcatenation(tensorsMid, 2); mid->setAxis(0); // right ISliceLayer* right = network->addSlice(input, Dims4{0, 2 * fold, 0, 0}, Dims4{numSegments, inputShape.d[1] - 2 * fold, inputShape.d[2], inputShape.d[3]}, Dims4{1, 1, 1, 1}); // concatenate left/mid/right ITensor* tensors[] = {left->getOutput(0), mid->getOutput(0), right->getOutput(0)}; IConcatenationLayer* concat = network->addConcatenation(tensors, 3); concat->setAxis(1); return concat; } IScaleLayer* addBatchNorm2d(INetworkDefinition *network, std::map& weightMap, ITensor& input, std::string lname, float eps) { float *gamma = (float*)weightMap[lname + ".weight"].values; float *beta = (float*)weightMap[lname + ".bias"].values; float *mean = (float*)weightMap[lname + ".running_mean"].values; float *var = (float*)weightMap[lname + ".running_var"].values; int len = weightMap[lname + ".running_var"].count; float *scval = reinterpret_cast(malloc(sizeof(float) * len)); for (int i = 0; i < len; i++) { scval[i] = gamma[i] / sqrt(var[i] + eps); } Weights scale{DataType::kFLOAT, scval, len}; float *shval = reinterpret_cast(malloc(sizeof(float) * len)); for 
(int i = 0; i < len; i++) { shval[i] = beta[i] - mean[i] * gamma[i] / sqrt(var[i] + eps); } Weights shift{DataType::kFLOAT, shval, len}; float *pval = reinterpret_cast(malloc(sizeof(float) * len)); for (int i = 0; i < len; i++) { pval[i] = 1.0; } Weights power{DataType::kFLOAT, pval, len}; weightMap[lname + ".scale"] = scale; weightMap[lname + ".shift"] = shift; weightMap[lname + ".power"] = power; IScaleLayer* scale_1 = network->addScale(input, ScaleMode::kCHANNEL, shift, scale, power); assert(scale_1); return scale_1; } IActivationLayer* bottleneck(INetworkDefinition *network, std::map& weightMap, ITensor& input, int inch, int outch, int stride, std::string lname, Dims4 inputShape) { IConcatenationLayer* shift = addShift(network, input, inputShape, NUM_SEGMENTS, SHIFT_DIV); assert(shift); Weights emptywts{DataType::kFLOAT, nullptr, 0}; IConvolutionLayer* conv1 = network->addConvolution(*shift->getOutput(0), outch, DimsHW{1, 1}, weightMap[lname + "conv1.weight"], emptywts); IScaleLayer* bn1 = addBatchNorm2d(network, weightMap, *conv1->getOutput(0), lname + "bn1", 1e-5); IActivationLayer* relu1 = network->addActivation(*bn1->getOutput(0), ActivationType::kRELU); assert(relu1); IConvolutionLayer* conv2 = network->addConvolution(*relu1->getOutput(0), outch, DimsHW{3, 3}, weightMap[lname + "conv2.weight"], emptywts); assert(conv2); conv2->setStride(DimsHW{stride, stride}); conv2->setPadding(DimsHW{1, 1}); IScaleLayer* bn2 = addBatchNorm2d(network, weightMap, *conv2->getOutput(0), lname + "bn2", 1e-5); IActivationLayer* relu2 = network->addActivation(*bn2->getOutput(0), ActivationType::kRELU); assert(relu2); IConvolutionLayer* conv3 = network->addConvolution(*relu2->getOutput(0), outch * 4, DimsHW{1, 1}, weightMap[lname + "conv3.weight"], emptywts); assert(conv3); IScaleLayer* bn3 = addBatchNorm2d(network, weightMap, *conv3->getOutput(0), lname + "bn3", 1e-5); IElementWiseLayer* ew1; if (stride != 1 || inch != outch * 4) { IConvolutionLayer* conv4 = 
network->addConvolution(input, outch * 4, DimsHW{1, 1}, weightMap[lname + "downsample.0.weight"], emptywts); assert(conv4); conv4->setStride(DimsHW{stride, stride}); IScaleLayer* bn4 = addBatchNorm2d(network, weightMap, *conv4->getOutput(0), lname + "downsample.1", 1e-5); ew1 = network->addElementWise(*bn4->getOutput(0), *bn3->getOutput(0), ElementWiseOperation::kSUM); } else { ew1 = network->addElementWise(input, *bn3->getOutput(0), ElementWiseOperation::kSUM); } IActivationLayer* relu3 = network->addActivation(*ew1->getOutput(0), ActivationType::kRELU); assert(relu3); return relu3; } // Creat the engine using only the API and not any parser. ICudaEngine* createEngine(unsigned int maxBatchSize, IBuilder* builder, DataType dt) { INetworkDefinition* network = builder->createNetwork(); // Create input tensor of shape {NUM_SEGMENTS, 3, INPUT_H, INPUT_W } with name INPUT_BLOB_NAME ITensor* data = network->addInput(INPUT_BLOB_NAME, dt, Dims4{NUM_SEGMENTS, 3, INPUT_H, INPUT_W}); assert(data); print("input", data); std::map weightMap = loadWeights(WEIGHTS_PATH); Weights emptywts{DataType::kFLOAT, nullptr, 0}; IConvolutionLayer* conv1 = network->addConvolution(*data, 64, DimsHW{7, 7}, weightMap["conv1.weight"], emptywts); assert(conv1); conv1->setStride(DimsHW{2, 2}); conv1->setPadding(DimsHW{3, 3}); IScaleLayer* bn1 = addBatchNorm2d(network, weightMap, *conv1->getOutput(0), "bn1", 1e-5); // Add activation layer using the ReLU algorithm. IActivationLayer* relu1 = network->addActivation(*bn1->getOutput(0), ActivationType::kRELU); assert(relu1); // Add max pooling layer with stride of 2x2 and kernel size of 2x2. 
IPoolingLayer* pool1 = network->addPooling(*relu1->getOutput(0), PoolingType::kMAX, DimsHW{3, 3}); assert(pool1); pool1->setStride(DimsHW{2, 2}); pool1->setPadding(DimsHW{1, 1}); int curHeight = int(INPUT_H / 4); int curWidth = int(INPUT_W / 4); IActivationLayer* x = bottleneck(network, weightMap, *pool1->getOutput(0), 64, 64, 1, "layer1.0.", Dims4{NUM_SEGMENTS, 64, curHeight, curWidth}); x = bottleneck(network, weightMap, *x->getOutput(0), 256, 64, 1, "layer1.1.", Dims4{NUM_SEGMENTS, 256, curHeight, curWidth}); x = bottleneck(network, weightMap, *x->getOutput(0), 256, 64, 1, "layer1.2.", Dims4{NUM_SEGMENTS, 256, curHeight, curWidth}); x = bottleneck(network, weightMap, *x->getOutput(0), 256, 128, 2, "layer2.0.", Dims4{NUM_SEGMENTS, 256, curHeight, curWidth}); curHeight = int(INPUT_H / 8); curWidth = int(INPUT_W / 8); x = bottleneck(network, weightMap, *x->getOutput(0), 512, 128, 1, "layer2.1.", Dims4{NUM_SEGMENTS, 512, curHeight, curWidth}); x = bottleneck(network, weightMap, *x->getOutput(0), 512, 128, 1, "layer2.2.", Dims4{NUM_SEGMENTS, 512, curHeight, curWidth}); x = bottleneck(network, weightMap, *x->getOutput(0), 512, 128, 1, "layer2.3.", Dims4{NUM_SEGMENTS, 512, curHeight, curWidth}); x = bottleneck(network, weightMap, *x->getOutput(0), 512, 256, 2, "layer3.0.", Dims4{NUM_SEGMENTS, 512, curHeight, curWidth}); curHeight = int(INPUT_H / 16); curWidth = int(INPUT_W / 16); x = bottleneck(network, weightMap, *x->getOutput(0), 1024, 256, 1, "layer3.1.", Dims4{NUM_SEGMENTS, 1024, curHeight, curWidth}); x = bottleneck(network, weightMap, *x->getOutput(0), 1024, 256, 1, "layer3.2.", Dims4{NUM_SEGMENTS, 1024, curHeight, curWidth}); x = bottleneck(network, weightMap, *x->getOutput(0), 1024, 256, 1, "layer3.3.", Dims4{NUM_SEGMENTS, 1024, curHeight, curWidth}); x = bottleneck(network, weightMap, *x->getOutput(0), 1024, 256, 1, "layer3.4.", Dims4{NUM_SEGMENTS, 1024, curHeight, curWidth}); x = bottleneck(network, weightMap, *x->getOutput(0), 1024, 256, 1, "layer3.5.", 
Dims4{NUM_SEGMENTS, 1024, curHeight, curWidth}); x = bottleneck(network, weightMap, *x->getOutput(0), 1024, 512, 2, "layer4.0.", Dims4{NUM_SEGMENTS, 1024, curHeight, curWidth}); curHeight = int(INPUT_H / 32); curWidth = int(INPUT_W / 32); x = bottleneck(network, weightMap, *x->getOutput(0), 2048, 512, 1, "layer4.1.", Dims4{NUM_SEGMENTS, 2048, curHeight, curWidth}); x = bottleneck(network, weightMap, *x->getOutput(0), 2048, 512, 1, "layer4.2.", Dims4{NUM_SEGMENTS, 2048, curHeight, curWidth}); IPoolingLayer* pool2 = network->addPooling(*x->getOutput(0), PoolingType::kAVERAGE, DimsHW{curHeight, curWidth}); assert(pool2); pool2->setStride(DimsHW{1, 1}); IFullyConnectedLayer* fc1 = network->addFullyConnected(*pool2->getOutput(0), OUTPUT_SIZE, weightMap["fc.weight"], weightMap["fc.bias"]); assert(fc1); IReduceLayer* reduce = network->addReduce(*fc1->getOutput(0), ReduceOperation::kAVG, 1, false); assert(reduce); ISoftMaxLayer* softmax = network->addSoftMax(*reduce->getOutput(0)); assert(softmax); softmax->setAxes(1); softmax->getOutput(0)->setName(OUTPUT_BLOB_NAME); network->markOutput(*softmax->getOutput(0)); // Build engine builder->setMaxBatchSize(maxBatchSize); ICudaEngine* engine = builder->buildCudaEngine(*network); // Don't need the network any more network->destroy(); // Release host memory for (auto& mem : weightMap) { free((void*) (mem.second.values)); } return engine; } void APIToModel(unsigned int maxBatchSize, IHostMemory** modelStream) { // Create builder IBuilder* builder = createInferBuilder(gLogger); // Create model to populate the network, then set the outputs and create an engine ICudaEngine* engine = createEngine(maxBatchSize, builder, DataType::kFLOAT); assert(engine != nullptr); // Serialize the engine (*modelStream) = engine->serialize(); // Close everything down engine->destroy(); builder->destroy(); } void doInference(IExecutionContext& context, float* input, float* output, int batchSize) { const ICudaEngine& engine = context.getEngine(); // 
Pointers to input and output device buffers to pass to engine. // Engine requires exactly IEngine::getNbBindings() number of buffers. assert(engine.getNbBindings() == 2); void* buffers[2]; // In order to bind the buffers, we need to know the names of the input and output tensors. // Note that indices are guaranteed to be less than IEngine::getNbBindings() const int inputIndex = engine.getBindingIndex(INPUT_BLOB_NAME); const int outputIndex = engine.getBindingIndex(OUTPUT_BLOB_NAME); // Create GPU buffers on device CHECK(cudaMalloc(&buffers[inputIndex], batchSize * NUM_SEGMENTS * 3 * INPUT_H * INPUT_W * sizeof(float))); CHECK(cudaMalloc(&buffers[outputIndex], batchSize * OUTPUT_SIZE * sizeof(float))); // Create stream cudaStream_t stream; CHECK(cudaStreamCreate(&stream)); // DMA input batch data to device, infer on the batch asynchronously, and DMA output back to host CHECK(cudaMemcpyAsync(buffers[inputIndex], input, batchSize * NUM_SEGMENTS * 3 * INPUT_H * INPUT_W * sizeof(float), cudaMemcpyHostToDevice, stream)); context.enqueue(batchSize, buffers, stream, nullptr); CHECK(cudaMemcpyAsync(output, buffers[outputIndex], batchSize * OUTPUT_SIZE * sizeof(float), cudaMemcpyDeviceToHost, stream)); cudaStreamSynchronize(stream); // Release stream and buffers cudaStreamDestroy(stream); CHECK(cudaFree(buffers[inputIndex])); CHECK(cudaFree(buffers[outputIndex])); } int main(int argc, char** argv) { if (argc != 2) { std::cerr << "arguments not right!" 
<< std::endl; std::cerr << "./tsm_r50 -s // serialize model to plan file" << std::endl; std::cerr << "./tsm_r50 -d // deserialize plan file and run inference" << std::endl; return -1; } // create a model using the API directly and serialize it to a stream char *trtModelStream{nullptr}; size_t size{0}; if (std::string(argv[1]) == "-s") { IHostMemory* modelStream{nullptr}; APIToModel(1, &modelStream); assert(modelStream != nullptr); std::ofstream p(ENGINE_PATH, std::ios::binary); if (!p) { std::cerr << "could not open plan output file" << std::endl; return -1; } p.write(reinterpret_cast(modelStream->data()), modelStream->size()); modelStream->destroy(); return 1; } else if (std::string(argv[1]) == "-d") { std::ifstream file(ENGINE_PATH, std::ios::binary); if (file.good()) { file.seekg(0, file.end); size = file.tellg(); file.seekg(0, file.beg); trtModelStream = new char[size]; assert(trtModelStream); file.read(trtModelStream, size); file.close(); } } else { return -1; } // Subtract mean from image static float data[NUM_SEGMENTS * 3 * INPUT_H * INPUT_W]; for (int i = 0; i < NUM_SEGMENTS * 3 * INPUT_H * INPUT_W; i++) data[i] = 1.0; IRuntime* runtime = createInferRuntime(gLogger); assert(runtime != nullptr); ICudaEngine* engine = runtime->deserializeCudaEngine(trtModelStream, size, nullptr); assert(engine != nullptr); IExecutionContext* context = engine->createExecutionContext(); assert(context != nullptr); delete[] trtModelStream; // Run inference static float prob[OUTPUT_SIZE]; doInference(*context, data, prob, 1); // Destroy the engine context->destroy(); engine->destroy(); runtime->destroy(); // Print histogram of the output distribution std::cout << "\nOutput:\n\n"; for (unsigned int i = 0; i < 10; i++) { std::cout << prob[i] << ", "; } std::cout << std::endl; for (unsigned int i = 0; i < 10; i++) { std::cout << prob[OUTPUT_SIZE - 10 + i] << ", "; } std::cout << std::endl; std::fstream writer(RESULT_PATH, std::ios::out); writer << prob[0]; for(int i = 1; i < 
def load_weights(file):
    """Load a text weights file into a ``{name: float32 ndarray}`` dict.

    Expected layout: the first line holds the number of entries; each
    following line is ``<name> <count> <hex> <hex> ...`` where every ``<hex>``
    is the big-endian IEEE-754 encoding of one float32 value.

    Args:
        file: Path to the weights file.

    Returns:
        dict mapping each entry name to a 1-D ``np.float32`` array holding
        ``count`` values.

    Raises:
        AssertionError: if the file is missing, or a count does not match
            the number of entries / values actually present.
    """
    print(f"Loading weights: {file}")
    assert os.path.exists(file), f'Unable to load weight file {file}'

    with open(file, "r") as f:
        # Drop blank lines so a stray trailing newline cannot break the
        # entry-count check below.
        lines = [text for text in (line.strip() for line in f) if text]

    count = int(lines[0])
    assert count == len(lines) - 1

    unpack_be_float = struct.Struct(">f").unpack
    weight_map = {}
    for entry in lines[1:]:
        # split() without arguments also tolerates repeated spaces and tabs.
        splits = entry.split()
        name = splits[0]
        cur_count = int(splits[1])
        assert cur_count + 2 == len(splits)
        # unpack() returns a 1-tuple; take element 0 so the resulting array
        # is flat (N,) rather than (N, 1).
        values = [unpack_be_float(bytes.fromhex(word))[0]
                  for word in splits[2:]]
        weight_map[name] = np.array(values, dtype=np.float32)
    return weight_map
def bottleneck(network, weight_map, input, in_channels, out_channels, stride,
               layer_name, input_shape):
    """Append one temporal-shift bottleneck block and return its final ReLU.

    Main path: shift module -> 1x1 conv/BN/ReLU -> 3x3 conv (``stride``,
    padding 1)/BN/ReLU -> 1x1 conv to ``4 * out_channels``/BN. The shortcut is
    ``input`` itself, or a strided 1x1 conv + BN of ``input`` when the stride
    or the channel count changes. Both paths are summed and passed through a
    ReLU. Weight names are ``layer_name`` followed by ``conv1.weight``,
    ``bn1``, ..., ``downsample.0.weight``, ``downsample.1``.
    """

    def conv_bn(tensor, channels, kernel_shape, conv_key, bn_key,
                conv_stride=None, conv_padding=None):
        # Bias-free convolution immediately followed by its batch norm.
        conv = network.add_convolution(input=tensor,
                                       num_output_maps=channels,
                                       kernel_shape=kernel_shape,
                                       kernel=weight_map[layer_name + conv_key],
                                       bias=trt.Weights())
        assert conv
        if conv_stride is not None:
            conv.stride = conv_stride
        if conv_padding is not None:
            conv.padding = conv_padding
        bn = add_batch_norm_2d(network, weight_map, conv.get_output(0),
                               layer_name + bn_key, EPS)
        assert bn
        return bn

    def relu(layer):
        act = network.add_activation(layer.get_output(0),
                                     type=trt.ActivationType.RELU)
        assert act
        return act

    expanded = out_channels * 4

    # Main path; the temporal shift is applied to the block input only.
    shift = add_shift_module(network, input, input_shape, NUM_SEGMENTS,
                             SHIFT_DIV)
    assert shift
    stage1 = relu(conv_bn(shift.get_output(0), out_channels, (1, 1),
                          "conv1.weight", "bn1"))
    stage2 = relu(conv_bn(stage1.get_output(0), out_channels, (3, 3),
                          "conv2.weight", "bn2",
                          conv_stride=(stride, stride),
                          conv_padding=(1, 1)))
    main_path = conv_bn(stage2.get_output(0), expanded, (1, 1),
                        "conv3.weight", "bn3")

    # Shortcut; project the un-shifted input when its shape would not match.
    needs_projection = not (stride == 1 and in_channels == expanded)
    if needs_projection:
        projected = conv_bn(input, expanded, (1, 1),
                            "downsample.0.weight", "downsample.1",
                            conv_stride=(stride, stride))
        shortcut = projected.get_output(0)
    else:
        shortcut = input

    merged = network.add_elementwise(shortcut, main_path.get_output(0),
                                     trt.ElementWiseOperation.SUM)
    assert merged
    return relu(merged)
cur_height, cur_width)) x = bottleneck(network, weight_map, x.get_output(0), 256, 64, 1, "layer1.1.", (NUM_SEGMENTS, 256, cur_height, cur_width)) x = bottleneck(network, weight_map, x.get_output(0), 256, 64, 1, "layer1.2.", (NUM_SEGMENTS, 256, cur_height, cur_width)) x = bottleneck(network, weight_map, x.get_output(0), 256, 128, 2, "layer2.0.", (NUM_SEGMENTS, 256, cur_height, cur_width)) cur_height = INPUT_H // 8 cur_width = INPUT_W // 8 x = bottleneck(network, weight_map, x.get_output(0), 512, 128, 1, "layer2.1.", (NUM_SEGMENTS, 512, cur_height, cur_width)) x = bottleneck(network, weight_map, x.get_output(0), 512, 128, 1, "layer2.2.", (NUM_SEGMENTS, 512, cur_height, cur_width)) x = bottleneck(network, weight_map, x.get_output(0), 512, 128, 1, "layer2.3.", (NUM_SEGMENTS, 512, cur_height, cur_width)) x = bottleneck(network, weight_map, x.get_output(0), 512, 256, 2, "layer3.0.", (NUM_SEGMENTS, 512, cur_height, cur_width)) cur_height = INPUT_H // 16 cur_width = INPUT_W // 16 x = bottleneck(network, weight_map, x.get_output(0), 1024, 256, 1, "layer3.1.", (NUM_SEGMENTS, 1024, cur_height, cur_width)) x = bottleneck(network, weight_map, x.get_output(0), 1024, 256, 1, "layer3.2.", (NUM_SEGMENTS, 1024, cur_height, cur_width)) x = bottleneck(network, weight_map, x.get_output(0), 1024, 256, 1, "layer3.3.", (NUM_SEGMENTS, 1024, cur_height, cur_width)) x = bottleneck(network, weight_map, x.get_output(0), 1024, 256, 1, "layer3.4.", (NUM_SEGMENTS, 1024, cur_height, cur_width)) x = bottleneck(network, weight_map, x.get_output(0), 1024, 256, 1, "layer3.5.", (NUM_SEGMENTS, 1024, cur_height, cur_width)) x = bottleneck(network, weight_map, x.get_output(0), 1024, 512, 2, "layer4.0.", (NUM_SEGMENTS, 1024, cur_height, cur_width)) cur_height = INPUT_H // 32 cur_width = INPUT_W // 32 x = bottleneck(network, weight_map, x.get_output(0), 2048, 512, 1, "layer4.1.", (NUM_SEGMENTS, 2048, cur_height, cur_width)) x = bottleneck(network, weight_map, x.get_output(0), 2048, 512, 1, "layer4.2.", 
def main(args):
    """Build or load the TSM engine and run the requested checks.

    Depending on ``args`` this will:
      * deserialize an engine from ``args.load_engine_path`` or build one
        from ``args.tensorrt_weights`` (optionally saving it to
        ``args.save_engine_path``);
      * ``--test-mmaction2``: compare TensorRT output with MMAction2 on
        random input;
      * ``--test-cpp``: compare TensorRT output on an all-ones input with the
        result file written by the C++ implementation;
      * ``--input-video``: classify one video from NUM_SEGMENTS uniformly
        sampled frames and print the predicted class id and score.
    """
    assert not (args.save_engine_path and args.load_engine_path)

    if args.load_engine_path:
        # load from local file
        runtime = trt.Runtime(TRT_LOGGER)
        assert runtime
        with open(args.load_engine_path, "rb") as f:
            engine = runtime.deserialize_cuda_engine(f.read())
    else:
        # Create network and engine
        assert args.tensorrt_weights
        builder = trt.Builder(TRT_LOGGER)
        engine = create_engine(BATCH_SIZE, builder, trt.float32,
                               args.tensorrt_weights)
    assert engine
    assert engine.num_bindings == 2

    if args.save_engine_path is not None:
        # save engine to local file
        with open(args.save_engine_path, "wb") as f:
            f.write(engine.serialize())
        print(f"{args.save_engine_path} Generated successfully.")

    context = engine.create_execution_context()
    assert context

    # Page-locked host buffers shared by every test below.
    host_in = cuda.pagelocked_empty(BATCH_SIZE * NUM_SEGMENTS * 3 * INPUT_H *
                                    INPUT_W,
                                    dtype=np.float32)
    host_out = cuda.pagelocked_empty(BATCH_SIZE * OUTPUT_SIZE,
                                     dtype=np.float32)

    if args.test_mmaction2:
        assert args.mmaction2_config and args.mmaction2_checkpoint, \
            "MMAction2 config and checkpoint couldn't be None"

        data = np.random.randn(BATCH_SIZE, NUM_SEGMENTS, 3, INPUT_H,
                               INPUT_W).astype(np.float32)

        # TensorRT inference
        np.copyto(host_in, data.ravel())
        do_inference(context, host_in, host_out, BATCH_SIZE)

        # pytorch inference
        pytorch_results = inference_mmaction2(data, args.mmaction2_config,
                                              args.mmaction2_checkpoint)

        # test
        from numpy.testing import assert_array_almost_equal
        assert_array_almost_equal(host_out.reshape(-1),
                                  pytorch_results.reshape(-1),
                                  decimal=4)
        print("MMAction2 TEST PASSED")

    if args.test_cpp:
        assert args.cpp_result_path, "Should set --cpp-result-path"
        # The message must use `cpp_result_path`: `args.cpp_result` does not
        # exist and would raise AttributeError instead of this assertion.
        assert os.path.exists(args.cpp_result_path),\
            f"{args.cpp_result_path} doesn't exist"

        # C++ API fixed inputs
        inputs = np.ones((BATCH_SIZE, NUM_SEGMENTS, 3, INPUT_H, INPUT_W),
                         dtype=np.float32)

        # TensorRT inference
        np.copyto(host_in, inputs.ravel())
        do_inference(context, host_in, host_out, BATCH_SIZE)

        # Read cpp inference results
        with open(args.cpp_result_path, "r") as f:
            data = f.read().strip()
            cpp_results = np.array([float(d) for d in data.split(" ")
                                    ]).astype(np.float32)

        # test
        from numpy.testing import assert_array_almost_equal
        assert_array_almost_equal(host_out.reshape(-1),
                                  cpp_results.reshape(-1),
                                  decimal=4)
        print("CPP TEST PASSED")

    if args.input_video:
        # Get ONE prediction result from ONE video
        # Use demo.mp4 from MMAction2
        import cv2

        cap = cv2.VideoCapture(args.input_video)
        try:
            # get selected frame id of uniform sampling
            sample_length = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
            avg_interval = sample_length / float(NUM_SEGMENTS)
            base_offsets = np.arange(NUM_SEGMENTS) * avg_interval
            clip_offsets = (base_offsets + avg_interval / 2.0).astype(
                np.int32)

            # read frames
            frames = []
            for i in range(max(clip_offsets) + 1):
                flag, frame = cap.read()
                if i in clip_offsets:
                    assert flag, \
                        f"Failed to read frame {i} from {args.input_video}"
                    # cv2.resize takes the target size as (width, height).
                    frames.append(cv2.resize(frame, (INPUT_W, INPUT_H)))
        finally:
            cap.release()
        frames = np.array(frames)

        # preprocessing frames
        # NOTE(review): cv2 decodes frames as BGR while these mean/std values
        # look like the usual RGB-order ImageNet statistics -- confirm whether
        # a BGR->RGB conversion is expected here.
        mean = np.array([123.675, 116.28, 103.53])
        std = np.array([58.395, 57.12, 57.375])
        frames = (frames - mean) / std
        frames = frames.transpose([0, 3, 1, 2])

        # TensorRT inference
        np.copyto(host_in, frames.ravel())
        do_inference(context, host_in, host_out, BATCH_SIZE)

        # For demo.mp4, should be 6, aka arm wrestling
        class_id = np.argmax(host_out.reshape(-1))
        # Format the score itself; round() first would collapse it to 0 or 1.
        print(
            f'Result class id {class_id}, socre {host_out[class_id]:.2f}'
        )
MMAction2 checkpoint url or file path") parser.add_argument("--test-cpp", action='store_true', help="Compare Python API results with C++ API results") parser.add_argument("--cpp-result-path", type=str, default='./build/result.txt', help="Path to C++ API results") main(parser.parse_args()) ================================================ FILE: tutorials/check_fp16_int8_support.md ================================================ # Check if Your GPU Supports FP16/INT8 ## 1. check your GPU Compute Capability visit https://developer.nvidia.com/cuda-gpus#compute and check your GPU compute capability. For example, GTX1080 is 6.1, Tesla T4 is 7.5. ## 2. check the hardware-precision-matrix visit https://docs.nvidia.com/deeplearning/tensorrt/support-matrix/index.html#hardware-precision-matrix and check the matrix. For example, compute capability 6.1 supports FP32 and INT8. 7.5 supports FP32, FP16, INT8, FP16 tensor core, etc. ================================================ FILE: tutorials/faq.md ================================================ # Frequently Asked Questions (FAQ) ## 1. fatal error: NvInfer.h: No such file or directory `NvInfer.h` is one of the headers of TensorRT. If you install the tensorrt DEB package, the headers should in `/usr/include/x86_64-linux-gnu/`. If you install tensorrt TAR or ZIP file, it is recommended to manage TensorRT with modern CMake syntax, e.g. [FindTensorRT.cmake](../lenet/FindTensorRT.cmake). `dpkg -L` can print out the contents of a DEB package. ``` $ dpkg -L libnvinfer-dev /. 
/usr /usr/lib /usr/lib/x86_64-linux-gnu /usr/lib/x86_64-linux-gnu/libnvinfer_static.a /usr/lib/x86_64-linux-gnu/libmyelin_compiler_static.a /usr/lib/x86_64-linux-gnu/libmyelin_executor_static.a /usr/lib/x86_64-linux-gnu/libmyelin_pattern_library_static.a /usr/lib/x86_64-linux-gnu/libmyelin_pattern_runtime_static.a /usr/include /usr/include/x86_64-linux-gnu /usr/include/x86_64-linux-gnu/NvInfer.h /usr/include/x86_64-linux-gnu/NvInferRuntime.h /usr/include/x86_64-linux-gnu/NvInferRuntimeCommon.h /usr/include/x86_64-linux-gnu/NvInferVersion.h /usr/include/x86_64-linux-gnu/NvUtils.h /usr/share /usr/share/doc /usr/share/doc/libnvinfer-dev /usr/share/doc/libnvinfer-dev/copyright /usr/share/doc/libnvinfer-dev/changelog.Debian /usr/lib/x86_64-linux-gnu/libmyelin.so /usr/lib/x86_64-linux-gnu/libnvinfer.so ``` ## 2. fatal error: cuda_runtime_api.h: No such file or directory `cuda_runtime_api.h` is from cuda-cudart. If you hit this error, you need to find where the header is installed and adapt the `include_directories` and `link_directories` of cuda in `CMakeLists.txt`. ``` $ dpkg -L cuda-cudart-dev-10-0 /.
/usr /usr/local /usr/local/cuda-10.0 /usr/local/cuda-10.0/targets /usr/local/cuda-10.0/targets/x86_64-linux /usr/local/cuda-10.0/targets/x86_64-linux/lib /usr/local/cuda-10.0/targets/x86_64-linux/lib/libcudadevrt.a /usr/local/cuda-10.0/targets/x86_64-linux/lib/libOpenCL.so.1.1 /usr/local/cuda-10.0/targets/x86_64-linux/lib/libculibos.a /usr/local/cuda-10.0/targets/x86_64-linux/lib/libcudart_static.a /usr/local/cuda-10.0/targets/x86_64-linux/include /usr/local/cuda-10.0/targets/x86_64-linux/include/cuda_runtime_api.h /usr/local/cuda-10.0/targets/x86_64-linux/include/cudart_platform.h /usr/local/cuda-10.0/targets/x86_64-linux/include/cuda_device_runtime_api.h /usr/local/cuda-10.0/targets/x86_64-linux/include/cuda_runtime.h /usr/lib /usr/lib/pkgconfig /usr/lib/pkgconfig/cudart-10.0.pc /usr/share /usr/share/doc /usr/share/doc/cuda-cudart-dev-10-0 /usr/share/doc/cuda-cudart-dev-10-0/changelog.Debian.gz /usr/share/doc/cuda-cudart-dev-10-0/copyright /usr/local/cuda-10.0/targets/x86_64-linux/lib/libOpenCL.so /usr/local/cuda-10.0/targets/x86_64-linux/lib/libOpenCL.so.1 /usr/local/cuda-10.0/targets/x86_64-linux/lib/libcudart.so ``` ## 3. .wts not prepared or not in the right directory If the .wts file is not in the right directory, the loadWeights() function will report an error like the one shown below. By default, the .wts file should be placed in the same directory as `build`, for example `tensorrtx/yolov5/yolov5s.wts`. The .wts path is defined in `yolov5.cpp`. ``` std::map loadWeights(std::__cxx11::string): Assertion `input.is_open() && "Unable to load weight file."' failed. Aborted (core dumped) ``` ## 4. yolo -s failed, class_num not adapted If you train your own yolo model, you need to set `CLASS_NUM` in `yololayer.h`, which is `80` by default. Otherwise, you will get errors like the following. ``` [Convolution]: kernel weights has count xxx but xxx was expected void APIToModel(unsigned int, nvinfer1::IHostMemory**): Assertion `engine != nullptr' failed.
Aborted (core dumped) ``` ================================================ FILE: tutorials/from_pytorch_to_trt_stepbystep_hrnet.md ================================================ # 使用 TRT 加速网络-零 本次教程以 HRNet 分类器(HRNet-W18-C-Small-v2)为例子 code:https://github.com/HRNet/HRNet-Image-Classification paper:https://arxiv.org/abs/1908.07919 ## 1 论文网络的基本了解 无论是仅仅使用网络还是要对网络改进,首先都要对网络有一定了解。对于这种比较火的网络,网上大批详解博客,可以多去阅读,加上论文,来对网络理解。 HRNet 分类器网络看起来很简单,如下图 ![682463-20200104221712824-157549407](https://user-images.githubusercontent.com/20653176/93749152-ff957680-fc2b-11ea-883c-79046e41ace8.png) 从网络中可看到基本组件很简单:卷积和 upsample。【这里就表明网络 TRT 加速时不会有 plugin 的需求。】 参考博客: 1. https://www.cnblogs.com/darkknightzh/p/12150637.html 2. https://zhuanlan.zhihu.com/p/143385915 3. https://blog.csdn.net/weixin_37993251/article/details/88043650 4. https://blog.csdn.net/weixin_38715903/article/details/101629781?utm_medium=distribute.pc_relevant.none-task-blog-BlogCommendFromMachineLearnPai2-2.channel_param&depth_1-utm_source=dis ## 2 pytorch 代码跑通 跑通 demo 是很重要的一步。跑通后就可以一步一步跟进,看到底走了哪些层,这样心里就会有一个基本框架;然后可以生成 wts 文件;同时也可以生成 onnx 文件。 上述的**参考博客 4**中对代码有详细介绍,可以详细分析下。 建议:**对于运行环境,建议使用 anaconda 的 conda create 创建虚拟环境,这样没有一系列环境问题。** ```python conda create -n xx python=3.7 # 创建环境 activate xx # 激活 pip install xxxx # 安装包 deactivate xx # 退出环境 ``` 在生成 wts 文件时,没有必要每次都去配置`gen_wts.py`,主要是读取模型,保存模型参数。只要 demo 文件跑通就可以随时保存为 wts。 ## 3 pytorch 代码 debug 这一步骤单独拉出来是因为在 debug 的过程中,要关注经过哪些层,预处理有哪些,后处理有哪些。另外在后面搭建 TRT 网络时,还要根据 debug 过程中的一些信息来调试 trt 网络。 ## 4 网络的可视化 将 pytorch 模型保存为 onnx,可有可无。但是建议如果可以保存,就使用 onnx 来可视化网络。这样对网络架构以及每层的输入输出就会非常明了。 如果无法保存 onnx,搭建网络时,要根据 wts 来分析,比较麻烦。 另外强烈建议:**无论是否保存了 onnx,都要手动在纸上将网络再画一遍,并且将每层的输出维度标注下来,这样搭建层比较多的网络时,不会晕,并且在 debug TRT 网络时可以有效定位错误。** 在手动画网络图时,可以给每个节点“标号”,利用该“标号”在搭建 TRT 网络时,可以很清楚知道 **“哪个节点输入,经过某种操作,输出哪个节点。”** 在 onnx 图中看到几个层一定要心里有数: 比如下面红线框出的一大块实际上就是 upsample 层 ![](imgs/93747936-0ae7a280-fc2a-11ea-86c1-9f72622402b9.png) 下面的为 FC 层:
![image-20200918141448071](https://user-images.githubusercontent.com/20653176/93749177-0de39280-fc2c-11ea-8a20-b8ab0b3b940f.png) Conv+BN+Relu 层 ![image-20200918141632723](https://user-images.githubusercontent.com/20653176/93749201-189e2780-fc2c-11ea-9aad-0ac7723575c4.png) ResBlock 层 ![image-20200918141709487](https://user-images.githubusercontent.com/20653176/93749220-2358bc80-fc2c-11ea-998a-0892755dfbc0.png) 单击节点。会有详细信息,这些信息使搭建网络变得方便。 ![image-20200918141931327](https://user-images.githubusercontent.com/20653176/93749222-2489e980-fc2c-11ea-9025-c5d367efd7f9.png) 如果无法导出 onnx: 搭建网络时需要从 wts 中查看层名,各个卷积层信息需要从代码中分析。 ![image_f](https://user-images.githubusercontent.com/20653176/93750398-fd341c00-fc2d-11ea-9077-ee749b6aef41.png) ![image-20200918142959711](https://user-images.githubusercontent.com/20653176/93749484-8fd3bb80-fc2c-11ea-951d-3c1f403e521a.png) ## 5 TRT 搭建网络 搭建网络时就按照 onnx 图一层一层搭建。 几点建议: 1 要不断去查 API 的使用 https://docs.nvidia.com/deeplearning/tensorrt/api/c_api/index.html 2 利用已有的模块,不要重复造轮子 3 各个层名使用 onnx 的 id,这样在搭建网络时不会晕。,根据 onnx 的结点信息,各层之间的连接也不会出错。 ## 6 TRT 网络 debug 搭建网络过程肯定会出错,debug 是必要的手段: 1 打印每层的维度 ```c++ Dims dim = id_1083->getOutput(0)->getDimensions(); std::cout << dim[0] << " " << dim[1] << " " << dim[2] << " " << dim[3] << std::endl; ``` **一般如果出现生成 engine 就失败的情况,就从 createEngine 的第一句开始调试,并且随时关注窗口输出,如果在某一层出现大量提示信息,那么该层就会有问题,就将该层的输入 tensor 维度和输出 tensor 维度信息都打印出来,看输出的维度是否正常。** 2 打印输出 TRT 是先构建网络,然后再 enqueue 时才能得到各层的输出信息,因此若想对比每一层的输出,需要将该层设置为 output 层 ```c++ out->getOutput(0)->setName(OUTPUT_BLOB_NAME); // out可替换为任意一层 network->markOutput(*out->getOutput(0)); ``` 3 关注输入层 data 数据层的 debug 无需第 2 步的做法,直接可以查看预处理后的结果。在 debug ## 7 TRT 代码整理 这里就是将 TRT 搭建的网络,能封装函数,就封装为函数模块,增加代码可读性。 ================================================ FILE: tutorials/getting_started.md ================================================ # Getting Started with TensorRTx ## 1. 
Setup the development environment (**RECOMMENDED**) If you prefer to run everything in a docker container, check [HERE](../docker/README.md) If you prefer to install every dependencies locally, check [HERE](./install.md) ## 2. Run TensorRTx demo It is recommended to go through the [lenet5](https://github.com/wang-xinyu/tensorrtx/tree/master/lenet) or [mlp](https://github.com/wang-xinyu/tensorrtx/tree/master/mlp) first. But if you are proficient in TensorRT, please check the readme file of the model you want directly. We use "lenet5" to explain how we build DL network with TensorRT API. ### 2.1. Export lenet5 weights in pytorch 1. Clone the [wang-xinyu/pytorchx](https://github.com/wang-xinyu/pytorchx) in your machine, then enter lenet folder: ```bash pip install torch git clone https://github.com/wang-xinyu/pytorchx cd pytorchx/lenet ``` 2. Run lenet5.py to generate lenet5.pth which is the pytorch serialized model. The lenet5 arch is defined in lenet5.py. ```bash python lenet5.py ``` 3. Run inference.py to generate lenet5.wts, which is weights file for tensorrt. ```bash python inference.py ``` The terminal output would be like: ```txt the output of lenet5 is [[0.0950, 0.0998, 0.1101, 0.0975, 0.0966, 0.1097, 0.0948, 0.1056, 0.0992, 0.0917]], shape is [1, 10]. cuda device count: 2 input: torch.Size([1, 1, 32, 32]) conv1 torch.Size([1, 6, 28, 28]) pool1: torch.Size([1, 6, 14, 14]) conv2 torch.Size([1, 16, 10, 10]) pool2 torch.Size([1, 16, 5, 5]) view: torch.Size([1, 400]) fc1: torch.Size([1, 120]) lenet out: tensor([[0.0950, 0.0998, 0.1101, 0.0975, 0.0966, 0.1097, 0.0948, 0.1056, 0.0992, 0.0917]], device='cuda:0', grad_fn=) ``` ### 2.2. Run lenet5 in TensorRT Clone the wang-xinyu/tensorrtx in your machine. Enter lenet folder, copy lenet5.wts generated above, and cmake&make c++ code. And of course you should install cuda/cudnn/tensorrt first. You might need to adapt the tensorrt path in CMakeLists.txt if you install tensorrt from tar package. 
```bash git clone https://github.com/wang-xinyu/tensorrtx cd tensorrtx/lenet cp [PATH-OF-pytorchx]/pytorchx/lenet/lenet5.wts . cmake -S . -B build cd build make ``` If `make` succeeds, the executable `lenet` will be generated. Run lenet to build the tensorrt engine and serialize it to the file `lenet5.engine`. ```bash ./lenet -s ``` Deserialize the engine and run inference. ```bash ./lenet -d ``` You should see output like this, ```txt Output: 0.0949623, 0.0998472, 0.110072, 0.0975036, 0.0965564, 0.109736, 0.0947979, 0.105618, 0.099228, 0.0916792, ``` ## 3. Compare the two output The inputs to pytorch and tensorrt are the same, i.e. a [1,1,32,32] all-ones tensor, so the outputs should be the same as well; otherwise there must be something wrong. ```txt The pytorch output is 0.0950, 0.0998, 0.1101, 0.0975, 0.0966, 0.1097, 0.0948, 0.1056, 0.0992, 0.0917 The tensorrt output is 0.0949623, 0.0998472, 0.110072, 0.0975036, 0.0965564, 0.109736, 0.0947979, 0.105618, 0.099228, 0.0916792 ``` Same! exciting, isn't it? ## 4. The `.wts` content format The `.wts` is a plain text file, e.g. `lenet5.wts`, part of the contents are: ```txt 10 conv1.weight 150 be40ee1b bd20bab8 bdc4bc53 ... conv1.bias 6 bd327058 ... conv2.weight 2400 3c6f2220 3c693090 ... conv2.bias 16 bd183967 bcb1ac8a ... fc1.weight 48000 3c162c20 bd25196a ... fc1.bias 120 3d3c3d49 bc64b948 ... fc2.weight 10080 bce095a4 3d33b9dc ... fc2.bias 84 bc71eaa0 3d9b276c ... fc3.weight 840 3c252870 3d855351 ... fc3.bias 10 bdbe4bb8 3b119ee0 ... ... ``` The first line is a number indicating how many lines follow it (i.e. excluding itself). Each of the following lines is `[weight name] [value count = N] [value1] [value2], ..., [valueN]` The values are in HEX format. ## 5. Frequently Asked Questions (FAQ) check [HERE](./faq.md) for the answers of questions you may encounter.
================================================ FILE: tutorials/install.md ================================================ # Install the dependencies of tensorrtx Using docker as development environment is strongly recommended, you may check [HERE](../docker/README.md) for the deployment instructions of docker container and _ignore_ the rest of this document. If that is not an option for you, we recommend using the major LTS versions of your OS, Nvidia driver, CUDA, and so on. ## OS Ubuntu-22.04 is recommended. It is strongly recommended to use `apt` to manage packages in Ubuntu. ## Nvidia Related ### Driver You should install the nvidia driver first before anything else, go to [Ubuntu Driver Installation Guide](https://docs.nvidia.com/datacenter/tesla/driver-installation-guide/index.html#ubuntu) for more details. **NOTE**: Since version 560, the installation step is a little different than before, check [HERE](https://docs.nvidia.com/datacenter/tesla/driver-installation-guide/index.html#recent-updates) for more details. ### CUDA Go to [NVIDIA CUDA Installation Guide for Linux](https://developer.nvidia.com/cuda-10.0-download-archive) for the detailed steps. **NOTE**: - Do not forget to check [Post-installation Actions](https://docs.nvidia.com/cuda/cuda-installation-guide-linux/index.html#post-installation-actions) to set up the environment correctly. - Make your CUDA version comply with your driver version - If you want multi-version CUDA, docker is strongly recommended. ### TensorRT check [HERE](https://docs.nvidia.com/deeplearning/tensorrt/install-guide/index.html#downloading) to install TensorRT. ### (Optional) OpenCV ``` sudo apt-get update && sudo apt install libgtk-3-dev libopencv-dev ``` ## Verify installation ``` dpkg -l | grep cuda dpkg -l | grep nvinfer dpkg -l | grep opencv ``` ================================================ FILE: tutorials/measure_performance.md ================================================ # Measure performance of TensorRT ## 1.
add some variables and structures see https://github.com/NVIDIA/TensorRT/tree/master/samples/sampleNMT for more detail. ```c++ // for rcnn, you can put these code into common.hpp #include "logging.h" // rcnn/logging.h static Logger gLogger{ Logger::Severity::kINFO }; static LogStreamConsumer gLogInfo{ LOG_INFO(gLogger) }; struct SimpleProfiler : public nvinfer1::IProfiler { struct Record { float time{ 0 }; int count{ 0 }; }; virtual void reportLayerTime(const char* layerName, float ms) { mProfile[layerName].count++; mProfile[layerName].time += ms; if (std::find(mLayerNames.begin(), mLayerNames.end(), layerName) == mLayerNames.end()) { mLayerNames.push_back(layerName); } } SimpleProfiler(const char* name, const std::vector& srcProfilers = std::vector()) : mName(name) { for (const auto& srcProfiler : srcProfilers) { for (const auto& rec : srcProfiler.mProfile) { auto it = mProfile.find(rec.first); if (it == mProfile.end()) { mProfile.insert(rec); } else { it->second.time += rec.second.time; it->second.count += rec.second.count; } } } } friend std::ostream& operator<<(std::ostream& out, const SimpleProfiler& value) { out << "========== " << value.mName << " profile ==========" << std::endl; float totalTime = 0; std::string layerNameStr = "TensorRT layer name"; int maxLayerNameLength = std::max(static_cast(layerNameStr.size()), 70); for (const auto& elem : value.mProfile) { totalTime += elem.second.time; maxLayerNameLength = std::max(maxLayerNameLength, static_cast(elem.first.size())); } auto old_settings = out.flags(); auto old_precision = out.precision(); // Output header { out << std::setw(maxLayerNameLength) << layerNameStr << " "; out << std::setw(12) << "Runtime, " << "%" << " "; out << std::setw(12) << "Invocations" << " "; out << std::setw(12) << "Runtime, ms" << std::endl; } for (size_t i = 0; i < value.mLayerNames.size(); i++) { const std::string layerName = value.mLayerNames[i]; auto elem = value.mProfile.at(layerName); out << std::setw(maxLayerNameLength) 
<< layerName << " "; out << std::setw(12) << std::fixed << std::setprecision(1) << (elem.time * 100.0F / totalTime) << "%" << " "; out << std::setw(12) << elem.count << " "; out << std::setw(12) << std::fixed << std::setprecision(2) << elem.time << std::endl; } out.flags(old_settings); out.precision(old_precision); out << "========== " << value.mName << " total runtime = " << totalTime << " ms ==========" << std::endl; return out; } private: std::string mName; std::vector mLayerNames; std::map mProfile; }; ``` ## 2. set profiler for context and print the log ```c++ // you'd better set name for every layers // build engine // build context auto sp = SimpleProfiler("test"); context->setProfiler(&sp); context->enqueue(...); gLogInfo << sp << std::endl; ``` ================================================ FILE: tutorials/migration_guide.md ================================================ # Migration Guide ## Newest Migration Guide Please check [Page](https://docs.nvidia.com/deeplearning/tensorrt/migration-guide/index.html) For any archives version, please check this [Page](https://docs.nvidia.com/deeplearning/tensorrt/archives/index.html) ## (DEPRECATED) Migrating from TensorRT 4.x to 7.x **NOTE**: Both TensorRT 4.x and 7.x are **DEPRECATED** by NVIDIA officially, so this part is **outdated**. The following APIs are deprecated and replaced in TensorRT 7. 
- `DimsCHW`, replaced by `Dims3` - `addConvolution()`, replaced by `addConvolutionNd()` - `addPooling()`, replaced by `addPoolingNd()` - `addDeconvolution()`, replaced by `addDeconvolutionNd()` - `createNetwork()`, replaced by `createNetworkV2()` - `buildCudaEngine()`, replaced by `buildEngineWithConfig()` - `createPReLUPlugin()`, replaced by `addActivation()` with `ActivationType::kLEAKY_RELU` - `IPlugin` and `IPluginExt` class, replaced by `IPluginV2IOExt` or `IPluginV2DynamicExt` - Use the new `Logger` class defined in `logging.h` ================================================ FILE: tutorials/multi_GPU_processing.md ================================================ # How to Implement Multi-GPU Processing Maybe you hope to take advantage of multiple GPUs to make inference even faster. Here are a few tips to help you deal with it! Take **YOLO V4** as an example. ## 1. Make custom plugins (i.e. YOLO layer and Mish layer for YOLO V4) run asynchronously. To do this, we need to use the CudaStream parameter in the kernels of all custom layers and use asynchronous functions. For example, in function `forwardGpu()` of **yololayer.cu**, you need to make the following changes to make sure that the engine will be running on a specific CudaStream. 1) Change `cudaMemset(output + idx*outputElem, 0, sizeof(float))` to `cudaMemsetAsync(output + idx*outputElem, 0, sizeof(float), stream)` 2) Change `CalDetection<<< (yolo.width*yolo.height*batchSize + mThreadCount - 1) / mThreadCount, mThreadCount>>>(inputs[i],output, numElem, yolo.width, yolo.height, (float *)mAnchor[i], mClassCount ,outputElem)` to `CalDetection<<< (yolo.width*yolo.height*batchSize + mThreadCount - 1) / mThreadCount, mThreadCount, 0, stream>>>(inputs[i],output, numElem, yolo.width, yolo.height, (float *)mAnchor[i], mClassCount ,outputElem)` ## 2. Create an engine for each device you want to use. Maybe it is a good idea to create a struct to store the engine, context and buffer for each device individually.
For example, ``` struct Plan{ IRuntime* runtime; ICudaEngine* engine; IExecutionContext* context; void* buffers[2]; cudaStream_t stream; }; ``` And then use `cudaSetDevice()` to make each engine you create run on a specific device. Moreover, to maximize performance, make sure that the engine file you are using to deserialize is the one TensorRT optimized for this device. ## 3. Use function wisely Here are some things I learned when trying to parallelize the inference. 1) Do not use synchronous functions, like `cudaFree()`, during inference. 2) Use `cudaMallocHost()` instead of `malloc()` when allocating memory on the host side. ================================================ FILE: tutorials/run_on_windows.md ================================================ # How to Compile and Run on Windows This tutorial can be applied to any model in this repo. You only need to adapt a couple of lines. ## Environments * vs (only vs2015, vs2017 tested) * cuda * TensorRT * Cmake * opencv * dirent.h for windows, put into tensorrtx/include, download from https://github.com/tronkko/dirent ![image-20200828131208257](https://user-images.githubusercontent.com/20653176/91524367-99217f00-e931-11ea-9a13-fb420403b73b.png) ## Compile and Run ### 1.
Modify CmakeLists.txt ```cmake cmake_minimum_required(VERSION 2.6) project(yolov5) # 1 set(OpenCV_DIR "D:\\opencv\\opencv346\\build") #2 set(TRT_DIR "D:\\TensorRT-7.0.0.11.Windows10.x86_64.cuda-10.2.cudnn7.6\\TensorRT-7.0.0.11") #3 add_definitions(-std=c++11) option(CUDA_USE_STATIC_CUDA_RUNTIME OFF) set(CMAKE_CXX_STANDARD 11) set(CMAKE_BUILD_TYPE Debug) set(THREADS_PREFER_PTHREAD_FLAG ON) find_package(Threads) # setup CUDA find_package(CUDA REQUIRED) message(STATUS " libraries: ${CUDA_LIBRARIES}") message(STATUS " include path: ${CUDA_INCLUDE_DIRS}") include_directories(${CUDA_INCLUDE_DIRS}) #### enable_language(CUDA) # add this line, then no need to setup cuda path in vs #### include_directories(${PROJECT_SOURCE_DIR}/include) include_directories(${TRT_DIR}\\include) # -D_MWAITXINTRIN_H_INCLUDED for solving error: identifier "__builtin_ia32_mwaitx" is undefined set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11 -Wall -Ofast -D_MWAITXINTRIN_H_INCLUDED") # setup opencv find_package(OpenCV QUIET NO_MODULE NO_DEFAULT_PATH NO_CMAKE_PATH NO_CMAKE_ENVIRONMENT_PATH NO_SYSTEM_ENVIRONMENT_PATH NO_CMAKE_PACKAGE_REGISTRY NO_CMAKE_BUILDS_PATH NO_CMAKE_SYSTEM_PATH NO_CMAKE_SYSTEM_PACKAGE_REGISTRY ) message(STATUS "OpenCV library status:") message(STATUS " version: ${OpenCV_VERSION}") message(STATUS " libraries: ${OpenCV_LIBS}") message(STATUS " include path: ${OpenCV_INCLUDE_DIRS}") include_directories(${OpenCV_INCLUDE_DIRS}) link_directories(${TRT_DIR}\\lib) add_executable(yolov5 ${PROJECT_SOURCE_DIR}/yolov5.cpp ${PROJECT_SOURCE_DIR}/yololayer.cu ${PROJECT_SOURCE_DIR}/yololayer.h) #4 target_link_libraries(yolov5 "nvinfer" "nvinfer_plugin") #5 target_link_libraries(yolov5 ${OpenCV_LIBS}) #6 target_link_libraries(yolov5 ${CUDA_LIBRARIES}) #7 target_link_libraries(yolov5 Threads::Threads) #8 ``` Notice: 8 lines to adapt in CMakeLists.txt, marked with #1-#8 - #1 project name, set according to your project name - #2 your opencv path - #3 your tensorrt path - #4 source file needed, 
including .cpp .cu .h - #5-#8 libs needed ### 2. run cmake-gui to config the project #### 2.1 open cmake-gui and set the path ![image-20200828124434245](https://user-images.githubusercontent.com/20653176/91524158-1dbfcd80-e931-11ea-8a82-518eaf391d5a.png) #### 2.2 click **Configure** and set the envs ![image-20200828124902923](https://user-images.githubusercontent.com/20653176/91524303-75f6cf80-e931-11ea-8591-64a8a1a9292b.png) #### 2.3 click **Finish**, and wait for the `Configuring done` ![image-20200828124951872](https://user-images.githubusercontent.com/20653176/91524340-8b6bf980-e931-11ea-9ea4-141f5b94aa0a.png) #### 2.4 click **Generate** ![image-20200828125046738](https://user-images.githubusercontent.com/20653176/91524350-8eff8080-e931-11ea-9ed1-82c5af2f558f.png) #### 2.5 click **Open Project** ![image-20200828125215067](https://user-images.githubusercontent.com/20653176/91524352-9030ad80-e931-11ea-877e-dc08bfaef731.png) #### 2.6 Click **Generate -> Generate solution** ![image-20200828125402056](https://user-images.githubusercontent.com/20653176/91524356-9161da80-e931-11ea-84ba-177e12200e04.png) ### 3. run in command line cd to the path of exe (e.g. E:\LearningCodes\GithubRepo\tensorrtx\yolov5\build\Debug) ``` yolov5.exe -s // serialize model to plan file i.e. 'yolov5s.engine' yolov5.exe -d ../samples // deserialize plan file and run inference, the images in samples will be processed. ``` **Notice**: while serializing the model, the .wts should put in the parent dir of xxx.vcxproj, or just modify the .wts path in yolov5.cpp ![image-20200828125938472](https://user-images.githubusercontent.com/20653176/91524358-93c43480-e931-11ea-81b6-ae01b92e1146.png) ### 4. run in vs In vs, firstly `Set As Startup Project`, and then setup `Project ==> Properties ==> Configuration Properties ==> Debugging ==> Command Arguments` as `-s` or `-d ../yolov3-spp/samples`. Then can run or debug. 
![image-20200828130117902](https://user-images.githubusercontent.com/20653176/91524360-94f56180-e931-11ea-9873-39bed7ee19f1.png) ![image-20200828130415658](https://user-images.githubusercontent.com/20653176/91524362-96bf2500-e931-11ea-8c79-8db3a25fc135.png) ![image-20200828131516231](https://user-images.githubusercontent.com/20653176/91524370-9a52ac00-e931-11ea-8c1a-acf828fe81b4.png) **Notice**: The .dll of tensorrt and opencv should be put in the same directory with exe file. Or set environment variables in windows.(Not recommended) ================================================ FILE: ufld/CMakeLists.txt ================================================ cmake_minimum_required(VERSION 2.6) project(lane_det) add_definitions(-std=c++11) option(CUDA_USE_STATIC_CUDA_RUNTIME OFF) set(CMAKE_CXX_STANDARD 11) set(CMAKE_BUILD_TYPE Debug) # cuda directory include_directories(/usr/local/cuda/include/) link_directories(/usr/local/cuda/lib64/) # tensorrt #include_directories(/workspace/TensorRT-7.2.3.4/include/) #link_directories(/workspace/TensorRT-7.2.3.4/lib/) find_package(OpenCV) include_directories(${OpenCV_INCLUDE_DIRS}) add_executable(lane_det ${PROJECT_SOURCE_DIR}/lane_det.cpp) target_link_libraries(lane_det nvinfer) target_link_libraries(lane_det cudart) target_link_libraries(lane_det ${OpenCV_LIBS}) add_definitions(-O2 -pthread) ================================================ FILE: ufld/README.md ================================================ # Ultra-Fast-Lane-Detection(UFLD) The Pytorch implementation is [Ultra-Fast-Lane-Detection](https://github.com/cfzd/Ultra-Fast-Lane-Detection). ## How to Run ``` 1. 
generate lane.wts and lane.onnx from pytorch with tusimple_18.pth git clone https://github.com/wang-xinyu/tensorrtx.git git clone https://github.com/cfzd/Ultra-Fast-Lane-Detection.git // download its weights 'tusimple_18.pth' // copy tensorrtx/ufld/gen_wts.py into Ultra-Fast-Lane-Detection/ // ensure the file name is tusimple_18.pth and lane.wts in gen_wts.py // go to Ultra-Fast-Lane-Detection python gen_wts.py // a file 'lane.wts' will be generated. // then ( not necessary ) python pth2onnx.py //a file 'lane.onnx' will be generated. 2. build tensorrtx/ufld and run mkdir build cd build cmake .. make sudo ./lane_det -s // serialize model to plan file i.e. 'lane_det.engine' sudo ./lane_det -d PATH_TO_YOUR_IMAGE_FOLDER // deserialize plan file and run inference, the images will be processed. ``` ## More Information 1. Changed the preprocess and postprocess in tensorrtx, giving a different way to convert NHWC to NCHW in preprocess and just showing the result using opencv rather than saving the result in postprocess. 2. If there are some bugs when you run inference with a batch size larger than 1, just modify the code in preprocess or postprocess, it's not complicated. 3. Some results are stored in resluts folder.
================================================ FILE: ufld/common.hpp ================================================ #ifndef LANE_DET_COMMON_H_ #define LANE_DET_COMMON_H_ #include #include #include #include #include #include #include #include "dirent.h" #include "NvInfer.h" #include #define CHECK(status) \ do\ {\ auto ret = (status);\ if (ret != 0)\ {\ std::cerr << "Cuda failure: " << ret << std::endl;\ abort();\ }\ } while (0) using namespace nvinfer1; // TensorRT weight files have a simple space delimited format: // [type] [size] std::map loadWeights(const std::string file) { std::cout << "Loading weights: " << file << std::endl; std::map weightMap; // Open weights file std::ifstream input(file); assert(input.is_open() && "Unable to load weight file."); // Read number of weight blobs int32_t count; input >> count; assert(count > 0 && "Invalid weight map file."); while (count--) { Weights wt{DataType::kFLOAT, nullptr, 0}; uint32_t size; // Read name and type of blob std::string name; input >> name >> std::dec >> size; wt.type = DataType::kFLOAT; // Load blob uint32_t* val = reinterpret_cast(malloc(sizeof(val) * size)); for (uint32_t x = 0, y = size; x < y; ++x) { input >> std::hex >> val[x]; } wt.values = val; wt.count = size; weightMap[name] = wt; } return weightMap; } IScaleLayer* addBatchNorm2d(INetworkDefinition *network, std::map& weightMap, ITensor& input, std::string lname, float eps) { float *gamma = (float*)weightMap[lname + ".weight"].values; float *beta = (float*)weightMap[lname + ".bias"].values; float *mean = (float*)weightMap[lname + ".running_mean"].values; float *var = (float*)weightMap[lname + ".running_var"].values; int len = weightMap[lname + ".running_var"].count; float *scval = reinterpret_cast(malloc(sizeof(float) * len)); for (int i = 0; i < len; i++) { scval[i] = gamma[i] / sqrt(var[i] + eps); } Weights scale{DataType::kFLOAT, scval, len}; float *shval = reinterpret_cast(malloc(sizeof(float) * len)); for (int i = 0; i < len; i++) { 
shval[i] = beta[i] - mean[i] * gamma[i] / sqrt(var[i] + eps); } Weights shift{DataType::kFLOAT, shval, len}; float *pval = reinterpret_cast(malloc(sizeof(float) * len)); for (int i = 0; i < len; i++) { pval[i] = 1.0; } Weights power{DataType::kFLOAT, pval, len}; weightMap[lname + ".scale"] = scale; weightMap[lname + ".shift"] = shift; weightMap[lname + ".power"] = power; IScaleLayer* scale_1 = network->addScale(input, ScaleMode::kCHANNEL, shift, scale, power); assert(scale_1); return scale_1; } ILayer* convBnLeaky( INetworkDefinition *network, std::map& weightMap, ITensor& input, int outch, int ksize, int s, int p, int g, std::string lname, int i, bool use_bn = false ) { Weights emptywts{DataType::kFLOAT, nullptr, 0}; IConvolutionLayer* conv1 = network->addConvolution(input, outch, DimsHW{ ksize, ksize }, weightMap[lname + ".conv"+ std::to_string(i) + ".weight"], weightMap[lname + ".conv" + std::to_string(i)+".bias"]); assert(conv1); conv1->setStride(DimsHW{s, s}); conv1->setPadding(DimsHW{p, p}); conv1->setNbGroups(g); if (use_bn) { IScaleLayer* bn1 = addBatchNorm2d(network, weightMap, *conv1->getOutput(0), lname + ".batchnorm"+std::to_string(i), 1e-5); auto relu = network->addActivation(*bn1->getOutput(0), ActivationType::kRELU); assert(relu); return relu; } else { auto relu = network->addActivation(*conv1->getOutput(0), ActivationType::kRELU); assert(relu); return relu; } } IActivationLayer* basicBlock(INetworkDefinition *network, std::map& weightMap, ITensor& input, int inch, int outch, int stride, std::string lname) { Weights emptywts{ DataType::kFLOAT, nullptr, 0 }; IConvolutionLayer* conv1 = network->addConvolution(input, outch, DimsHW{ 3, 3 }, weightMap[lname + "conv1.weight"], emptywts); assert(conv1); conv1->setStride(DimsHW{ stride, stride }); conv1->setPadding(DimsHW{ 1, 1 }); IScaleLayer* bn1 = addBatchNorm2d(network, weightMap, *conv1->getOutput(0), lname + "bn1", 1e-5); IActivationLayer* relu1 = network->addActivation(*bn1->getOutput(0), 
ActivationType::kRELU); assert(relu1); IConvolutionLayer* conv2 = network->addConvolution(*relu1->getOutput(0), outch, DimsHW{ 3, 3 }, weightMap[lname + "conv2.weight"], emptywts); assert(conv2); conv2->setPadding(DimsHW{ 1, 1 }); IScaleLayer* bn2 = addBatchNorm2d(network, weightMap, *conv2->getOutput(0), lname + "bn2", 1e-5); IElementWiseLayer* ew1; if (inch != outch) { IConvolutionLayer* conv3 = network->addConvolution(input, outch, DimsHW{ 1, 1 }, weightMap[lname + "downsample.0.weight"], emptywts); assert(conv3); conv3->setStride(DimsHW{ stride, stride }); IScaleLayer* bn3 = addBatchNorm2d(network, weightMap, *conv3->getOutput(0), lname + "downsample.1", 1e-5); ew1 = network->addElementWise(*bn3->getOutput(0), *bn2->getOutput(0), ElementWiseOperation::kSUM); } else { ew1 = network->addElementWise(input, *bn2->getOutput(0), ElementWiseOperation::kSUM); } IActivationLayer* relu2 = network->addActivation(*ew1->getOutput(0), ActivationType::kRELU); assert(relu2); return relu2; } int read_files_in_dir(const char *p_dir_name, std::vector &file_names) { DIR *p_dir = opendir(p_dir_name); if (p_dir == nullptr) { return -1; } struct dirent* p_file = nullptr; while ((p_file = readdir(p_dir)) != nullptr) { if (strcmp(p_file->d_name, ".") != 0 && strcmp(p_file->d_name, "..") != 0) { //std::string cur_file_name(p_dir_name); //cur_file_name += "/"; //cur_file_name += p_file->d_name; std::string cur_file_name(p_file->d_name); file_names.push_back(cur_file_name); } } closedir(p_dir); return 0; } #endif ================================================ FILE: ufld/gen_wts.py ================================================ import torch import struct #import models.crnn as crnn from model.model import parsingNet # Initialize model = parsingNet(pretrained = False, backbone='18', cls_dim = (101, 56, 4), use_aux=False) device = 'cpu' # Load model state_dict = torch.load('tusimple_18.pth', map_location='cpu')['model'] model.to(device).eval() f = open('lane.wts', 'w') 
f.write('{}\n'.format(len(state_dict.keys()))) for k, v in state_dict.items(): vr = v.reshape(-1).cpu().numpy() f.write('{} {} '.format(k, len(vr))) for vv in vr: f.write(' ') f.write(struct.pack('>f',float(vv)).hex()) f.write('\n') ================================================ FILE: ufld/lane_det.cpp ================================================ #include #include #include #include #include "cuda_runtime_api.h" #include "logging.h" #include "common.hpp" #define USE_FP16 // comment out this if want to use FP32 #define DEVICE 0 // GPU id #define BATCH_SIZE 1 static const int INPUT_C = 3; static const int INPUT_H = 288; static const int INPUT_W = 800; static const int OUTPUT_C = 101; static const int OUTPUT_H = 56; static const int OUTPUT_W = 4; static const int OUTPUT_SIZE = OUTPUT_C * OUTPUT_H * OUTPUT_W; const char* INPUT_BLOB_NAME = "data"; const char* OUTPUT_BLOB_NAME = "prob"; static Logger gLogger; // Creat the engine using only the API and not any parser. ICudaEngine* createEngine(unsigned int maxBatchSize, IBuilder* builder,IBuilderConfig* builderConfig, DataType dt) { INetworkDefinition* network = builder->createNetworkV2(0U); Weights emptywts{ DataType::kFLOAT, nullptr, 0 }; ITensor* data = network->addInput(INPUT_BLOB_NAME, dt, Dims3{INPUT_C, INPUT_H, INPUT_W }); assert(data); std::map weightMap = loadWeights("../lane.wts"); #if 0 /* print layer names */ for(std::map::iterator iter = weightMap.begin(); iter != weightMap.end() ; iter++) { std::cout << iter->first << std::endl; } #endif auto conv1 = network->addConvolution(*data, 64, DimsHW{ 7, 7 }, weightMap["model.conv1.weight"], emptywts); assert(conv1); conv1->setStride(DimsHW{2, 2}); conv1->setPadding(DimsHW{3, 3}); conv1->setNbGroups(1); auto bn1 = addBatchNorm2d(network, weightMap, *conv1->getOutput(0), "model.bn1", 1e-5); auto relu0 = network->addActivation(*bn1->getOutput(0), ActivationType::kRELU); IPoolingLayer* pool0 = network->addPooling(*relu0->getOutput(0), PoolingType::kMAX, DimsHW{ 3, 
3 }); pool0->setStride( DimsHW{ 2, 2 } ); pool0->setPadding( DimsHW{ 1, 1 } ); assert(pool0); auto basic0 = basicBlock(network, weightMap, *pool0->getOutput(0), 64, 64, 1, "model.layer1.0."); auto basic1 = basicBlock(network, weightMap, *basic0->getOutput(0), 64, 64, 1, "model.layer1.1."); auto basic2_0 = basicBlock(network, weightMap, *basic1->getOutput(0), 64, 128, 2, "model.layer2.0."); auto basic2_1 = basicBlock(network, weightMap, *basic2_0->getOutput(0), 128, 128, 1, "model.layer2.1."); auto basic3_0 = basicBlock(network, weightMap, *basic2_1->getOutput(0), 128, 256, 2, "model.layer3.0."); auto basic3_1 = basicBlock(network, weightMap, *basic3_0->getOutput(0), 256, 256, 1, "model.layer3.1."); auto basic4_0 = basicBlock(network, weightMap, *basic3_1->getOutput(0), 256, 512, 2, "model.layer4.0."); auto basic4_1 = basicBlock(network, weightMap, *basic4_0->getOutput(0), 512, 512, 1, "model.layer4.1."); #if 0 /* just for debug */ Dims dims1 = basic4_1->getOutput(0)->getDimensions(); for (int i = 0; i < dims1.nbDims; i++) { std::cout << dims1.d[i] << "-" << (int)dims1.type[i] << " "; } std::cout << std::endl; #endif auto conv2 = network->addConvolution(*basic4_1->getOutput(0), 8, DimsHW{ 1, 1 }, weightMap["pool.weight"], weightMap["pool.bias"]); assert(conv2); conv2->setStride(DimsHW{1, 1}); conv2->setPadding(DimsHW{0, 0}); conv2->setNbGroups(1); IShuffleLayer* permute0 = network->addShuffle(*conv2->getOutput(0)); assert(permute0); permute0->setReshapeDimensions( Dims2{1, 1800}); auto fcwts0 = network->addConstant(nvinfer1::Dims2(2048, 1800), weightMap["cls.0.weight"]); auto matrixMultLayer0 = network->addMatrixMultiply(*permute0->getOutput(0), MatrixOperation::kNONE, *fcwts0->getOutput(0), MatrixOperation::kTRANSPOSE); assert(matrixMultLayer0 != nullptr); // Add elementwise layer for adding bias auto fcbias0 = network->addConstant(nvinfer1::Dims2(1, 2048), weightMap["cls.0.bias"]); auto addBiasLayer0 = network->addElementWise(*matrixMultLayer0->getOutput(0), 
*fcbias0->getOutput(0), nvinfer1::ElementWiseOperation::kSUM); assert(addBiasLayer0 != nullptr); auto relu = network->addActivation(*addBiasLayer0->getOutput(0), ActivationType::kRELU); auto fcwts1 = network->addConstant(nvinfer1::Dims2(22624, 2048), weightMap["cls.2.weight"]); auto matrixMultLayer1 = network->addMatrixMultiply(*relu->getOutput(0), MatrixOperation::kNONE, *fcwts1->getOutput(0), MatrixOperation::kTRANSPOSE); assert(matrixMultLayer1 != nullptr); // Add elementwise layer for adding bias auto fcbias1 = network->addConstant(nvinfer1::Dims2(1, 22624), weightMap["cls.2.bias"]); auto addBiasLayer1 = network->addElementWise(*matrixMultLayer1->getOutput(0), *fcbias1->getOutput(0), nvinfer1::ElementWiseOperation::kSUM); assert(addBiasLayer1 != nullptr); IShuffleLayer* permute1 = network->addShuffle(*addBiasLayer1->getOutput(0)); assert(permute1); permute1->setReshapeDimensions( Dims3{ 101, 56, 4 }); permute1->getOutput(0)->setName(OUTPUT_BLOB_NAME); network->markOutput(*permute1->getOutput(0)); // Build engine builder->setMaxBatchSize(maxBatchSize); builderConfig->setMaxWorkspaceSize(16 * (1 << 20));// 16MB #ifdef USE_FP16 if(builder->platformHasFastFp16()) { std::cout << "Platform supports fp16 mode and use it !!!" << std::endl; builderConfig->setFlag(BuilderFlag::kFP16); } else { std::cout << "Platform doesn't support fp16 mode so you can't use it !!!" << std::endl; } #endif std::cout << "Building engine, please wait for a while..." << std::endl; ICudaEngine* engine = builder->buildEngineWithConfig(*network, *builderConfig); std::cout << "Build engine successfully!" 
<< std::endl; // Don't need the network any more network->destroy(); // Release host memory for (auto& mem : weightMap) { free((void*)(mem.second.values)); } return engine; } void APIToModel(unsigned int maxBatchSize, IHostMemory** modelStream) { // Create builder IBuilder* builder = createInferBuilder(gLogger); IBuilderConfig* builderConfig = builder->createBuilderConfig(); // Create model to populate the network, then set the outputs and create an engine ICudaEngine* engine = createEngine(maxBatchSize, builder, builderConfig, DataType::kFLOAT); assert(engine != nullptr); // Serialize the engine (*modelStream) = engine->serialize(); // Close everything down engine->destroy(); builder->destroy(); } void doInference(IExecutionContext& context, float* input, float* output, int batchSize) { const ICudaEngine& engine = context.getEngine(); // Pointers to input and output device buffers to pass to engine. // Engine requires exactly IEngine::getNbBindings() number of buffers. assert(engine.getNbBindings() == 2); void* buffers[2]; // In order to bind the buffers, we need to know the names of the input and output tensors. 
// Note that indices are guaranteed to be less than IEngine::getNbBindings() const int inputIndex = engine.getBindingIndex(INPUT_BLOB_NAME); const int outputIndex = engine.getBindingIndex(OUTPUT_BLOB_NAME); // Create GPU buffers on device CHECK(cudaMalloc(&buffers[inputIndex], batchSize * INPUT_C * INPUT_H * INPUT_W * sizeof(float))); CHECK(cudaMalloc(&buffers[outputIndex], batchSize * OUTPUT_SIZE * sizeof(float))); // Create stream cudaStream_t stream; CHECK(cudaStreamCreate(&stream)); // DMA input batch data to device, infer on the batch asynchronously, and DMA output back to host CHECK(cudaMemcpyAsync(buffers[inputIndex], input, batchSize * INPUT_C * INPUT_H * INPUT_W * sizeof(float), cudaMemcpyHostToDevice, stream)); context.enqueue(batchSize, buffers, stream, nullptr); CHECK(cudaMemcpyAsync(output, buffers[outputIndex], batchSize * OUTPUT_SIZE * sizeof(float), cudaMemcpyDeviceToHost, stream)); cudaStreamSynchronize(stream); // Release stream and buffers cudaStreamDestroy(stream); CHECK(cudaFree(buffers[inputIndex])); CHECK(cudaFree(buffers[outputIndex])); } std::vector prepareImage(cv::Mat & img) { cv::cvtColor(img, img, cv::COLOR_BGR2RGB); cv::Mat resized; cv::resize(img, resized, cv::Size(INPUT_W, INPUT_H)); cv::Mat img_float; resized.convertTo(img_float, CV_32FC3, 1. 
/ 255.); // HWC TO CHW std::vector input_channels(INPUT_C); cv::split(img_float, input_channels); // normalize std::vector result(INPUT_H * INPUT_W * INPUT_C); auto data = result.data(); int channelLength = INPUT_H * INPUT_W; static float mean[]= {0.485, 0.456, 0.406}; static float std[] = {0.229, 0.224, 0.225}; for (int i = 0; i < INPUT_C; ++i) { cv::Mat normed_channel = (input_channels[i] - mean[i]) / std[i]; memcpy(data, normed_channel.data, channelLength * sizeof(float)); data += channelLength; } return result; } /* (101,56,4), add softmax on 101_axis and calculate Expect */ void softmax_mul(float* x, float* y, int rows, int cols, int chan) { for(int i = 0, wh = rows * cols; i < rows; i++) { for(int j = 0; j < cols; j++) { float sum = 0.0; float expect = 0.0; for(int k = 0; k < chan - 1; k++) { x[k * wh + i * cols + j] = exp(x[k * wh + i * cols + j]); sum += x[k * wh + i * cols + j]; } for(int k = 0; k < chan - 1; k++) { x[k * wh + i * cols + j] /= sum; } for(int k = 0; k < chan - 1; k++) { x[k * wh + i * cols + j] = x[k * wh + i * cols + j] * (k + 1); expect += x[k * wh + i * cols + j]; } y[i * cols + j] = expect; } } } /* (101,56,4), calculate max index on 101_axis */ void argmax(float* x, float* y, int rows, int cols, int chan) { for(int i = 0,wh = rows * cols; i < rows; i++) { for(int j = 0; j < cols; j++) { int max = -10000000; int max_ind = -1; for(int k = 0; k < chan; k++) { if(x[k * wh + i * cols + j] > max) { max = x[k * wh + i * cols + j]; max_ind = k; } } y[i * cols + j] = max_ind; } } } int main(int argc, char** argv) { cudaSetDevice(DEVICE); // create a model using the API directly and serialize it to a stream char *trtModelStream{ nullptr }; size_t size{ 0 }; if (argc == 2 && std::string(argv[1]) == "-s") { IHostMemory* modelStream{ nullptr }; APIToModel(BATCH_SIZE, &modelStream); assert(modelStream != nullptr); std::ofstream p("lane_det.engine", std::ios::binary); if (!p) { std::cerr << "could not open plan output file" << std::endl; return -1; } 
p.write(reinterpret_cast(modelStream->data()), modelStream->size()); modelStream->destroy(); return 0; } else if (argc == 3 && std::string(argv[1]) == "-d") { std::ifstream file("lane_det.engine", std::ios::binary); if (file.good()) { file.seekg(0, file.end); size = file.tellg(); file.seekg(0, file.beg); trtModelStream = new char[size]; assert(trtModelStream); file.read(trtModelStream, size); file.close(); } } else { std::cerr << "arguments not right!" << std::endl; std::cerr << "./crnn -s // serialize model to plan file" << std::endl; std::cerr << "./crnn -d ../samples // deserialize plan file and run inference" << std::endl; return -1; } /* prepare input data */ static float data[BATCH_SIZE * INPUT_C * INPUT_H * INPUT_W]; static float prob[BATCH_SIZE * OUTPUT_SIZE]; IRuntime* runtime = createInferRuntime(gLogger); assert(runtime != nullptr); ICudaEngine* engine = runtime->deserializeCudaEngine(trtModelStream, size); assert(engine != nullptr); IExecutionContext* context = engine->createExecutionContext(); assert(context != nullptr); delete[] trtModelStream; std::vector file_names; if (read_files_in_dir(argv[2], file_names) < 0) { std::cout << "read_files_in_dir failed." 
<< std::endl; return -1; } int fcount = 0; int vis_h = 720; int vis_w = 1280; int col_sample_w = 8; for (int f = 0; f < (int)file_names.size(); f++) { cv::Mat vis; fcount++; if (fcount < BATCH_SIZE && f + 1 != (int)file_names.size()) continue; for (int b = 0; b < fcount; b++) { cv::Mat img = cv::imread(std::string(argv[2]) + "/" + file_names[f - fcount + 1 + b], 1); if (img.empty()) continue; cv::resize(img, vis, cv::Size(vis_w, vis_h)); std::vector result(INPUT_C * INPUT_W * INPUT_H); result = prepareImage(img); memcpy(data, &result[0], INPUT_C * INPUT_W * INPUT_H * sizeof(float)); } // Run inference auto start = std::chrono::system_clock::now(); doInference(*context, data, prob, BATCH_SIZE); //prob: size (101, 56, 4) auto end = std::chrono::system_clock::now(); std::cout << "inference time is " << std::chrono::duration_cast(end - start).count() << " ms" << std::endl; std::vector tusimple_row_anchor { 64, 68, 72, 76, 80, 84, 88, 92, 96, 100, 104, 108, 112, 116, 120, 124, 128, 132, 136, 140, 144, 148, 152, 156, 160, 164, 168, 172, 176, 180, 184, 188, 192, 196, 200, 204, 208, 212, 216, 220, 224, 228, 232, 236, 240, 244, 248, 252, 256, 260, 264, 268, 272, 276, 280, 284 }; float max_ind[BATCH_SIZE * OUTPUT_H * OUTPUT_W]; float prob_reverse[BATCH_SIZE * OUTPUT_SIZE]; /* do out_j = out_j[:, ::-1, :] in python list*/ float expect[BATCH_SIZE * OUTPUT_H * OUTPUT_W]; for (int k = 0, wh = OUTPUT_W * OUTPUT_H; k < OUTPUT_C; k++) { for(int j = 0; j < OUTPUT_H; j ++) { for(int l = 0; l < OUTPUT_W; l++) { prob_reverse[k * wh + (OUTPUT_H - 1 - j) * OUTPUT_W + l] = prob[k * wh + j * OUTPUT_W + l]; } } } argmax(prob_reverse, max_ind, OUTPUT_H, OUTPUT_W, OUTPUT_C); /* calculate softmax and Expect */ softmax_mul(prob_reverse, expect, OUTPUT_H, OUTPUT_W, OUTPUT_C); for(int k = 0; k < OUTPUT_H; k++) { for(int j = 0; j < OUTPUT_W; j++) { max_ind[k * OUTPUT_W + j] == 100 ? 
expect[k * OUTPUT_W + j] = 0 : expect[k * OUTPUT_W + j] = expect[k * OUTPUT_W + j]; } } std::vector i_ind; for(int k = 0; k < OUTPUT_W; k++) { int ii = 0; for(int g = 0; g < OUTPUT_H; g++) { if(expect[g * OUTPUT_W + k] != 0) ii++; } if(ii > 2) { i_ind.push_back(k); } } for(int k = 0; k < OUTPUT_H; k++) { for(int ll = 0; ll < i_ind.size(); ll++) { if(expect[OUTPUT_W * k + i_ind[ll]] > 0) { cv::Point pp = { int(expect[OUTPUT_W * k + i_ind[ll]] * col_sample_w * vis_w / INPUT_W) - 1, int( vis_h * tusimple_row_anchor[OUTPUT_H - 1 - k] / INPUT_H) - 1 }; cv::circle(vis, pp, 8, CV_RGB(0, 255 ,0), 2); } } } cv::imshow("lane_vis",vis); cv::waitKey(0); } return 0; } ================================================ FILE: ufld/logging.h ================================================ #ifndef TENSORRT_LOGGING_H #define TENSORRT_LOGGING_H #include "NvInferRuntimeCommon.h" #include #include #include #include #include #include #include #include "macros.h" using Severity = nvinfer1::ILogger::Severity; class LogStreamConsumerBuffer : public std::stringbuf { public: LogStreamConsumerBuffer(std::ostream& stream, const std::string& prefix, bool shouldLog) : mOutput(stream) , mPrefix(prefix) , mShouldLog(shouldLog) { } LogStreamConsumerBuffer(LogStreamConsumerBuffer&& other) : mOutput(other.mOutput) { } ~LogStreamConsumerBuffer() { // std::streambuf::pbase() gives a pointer to the beginning of the buffered part of the output sequence // std::streambuf::pptr() gives a pointer to the current position of the output sequence // if the pointer to the beginning is not equal to the pointer to the current position, // call putOutput() to log the output to the stream if (pbase() != pptr()) { putOutput(); } } // synchronizes the stream buffer and returns 0 on success // synchronizing the stream buffer consists of inserting the buffer contents into the stream, // resetting the buffer and flushing the stream virtual int sync() { putOutput(); return 0; } void putOutput() { if (mShouldLog) { // 
prepend timestamp std::time_t timestamp = std::time(nullptr); tm* tm_local = std::localtime(×tamp); std::cout << "["; std::cout << std::setw(2) << std::setfill('0') << 1 + tm_local->tm_mon << "/"; std::cout << std::setw(2) << std::setfill('0') << tm_local->tm_mday << "/"; std::cout << std::setw(4) << std::setfill('0') << 1900 + tm_local->tm_year << "-"; std::cout << std::setw(2) << std::setfill('0') << tm_local->tm_hour << ":"; std::cout << std::setw(2) << std::setfill('0') << tm_local->tm_min << ":"; std::cout << std::setw(2) << std::setfill('0') << tm_local->tm_sec << "] "; // std::stringbuf::str() gets the string contents of the buffer // insert the buffer contents pre-appended by the appropriate prefix into the stream mOutput << mPrefix << str(); // set the buffer to empty str(""); // flush the stream mOutput.flush(); } } void setShouldLog(bool shouldLog) { mShouldLog = shouldLog; } private: std::ostream& mOutput; std::string mPrefix; bool mShouldLog; }; //! //! \class LogStreamConsumerBase //! \brief Convenience object used to initialize LogStreamConsumerBuffer before std::ostream in LogStreamConsumer //! class LogStreamConsumerBase { public: LogStreamConsumerBase(std::ostream& stream, const std::string& prefix, bool shouldLog) : mBuffer(stream, prefix, shouldLog) { } protected: LogStreamConsumerBuffer mBuffer; }; //! //! \class LogStreamConsumer //! \brief Convenience object used to facilitate use of C++ stream syntax when logging messages. //! Order of base classes is LogStreamConsumerBase and then std::ostream. //! This is because the LogStreamConsumerBase class is used to initialize the LogStreamConsumerBuffer member field //! in LogStreamConsumer and then the address of the buffer is passed to std::ostream. //! This is necessary to prevent the address of an uninitialized buffer from being passed to std::ostream. //! Please do not change the order of the parent classes. //! 
class LogStreamConsumer : protected LogStreamConsumerBase, public std::ostream { public: //! \brief Creates a LogStreamConsumer which logs messages with level severity. //! Reportable severity determines if the messages are severe enough to be logged. LogStreamConsumer(Severity reportableSeverity, Severity severity) : LogStreamConsumerBase(severityOstream(severity), severityPrefix(severity), severity <= reportableSeverity) , std::ostream(&mBuffer) // links the stream buffer with the stream , mShouldLog(severity <= reportableSeverity) , mSeverity(severity) { } LogStreamConsumer(LogStreamConsumer&& other) : LogStreamConsumerBase(severityOstream(other.mSeverity), severityPrefix(other.mSeverity), other.mShouldLog) , std::ostream(&mBuffer) // links the stream buffer with the stream , mShouldLog(other.mShouldLog) , mSeverity(other.mSeverity) { } void setReportableSeverity(Severity reportableSeverity) { mShouldLog = mSeverity <= reportableSeverity; mBuffer.setShouldLog(mShouldLog); } private: static std::ostream& severityOstream(Severity severity) { return severity >= Severity::kINFO ? std::cout : std::cerr; } static std::string severityPrefix(Severity severity) { switch (severity) { case Severity::kINTERNAL_ERROR: return "[F] "; case Severity::kERROR: return "[E] "; case Severity::kWARNING: return "[W] "; case Severity::kINFO: return "[I] "; case Severity::kVERBOSE: return "[V] "; default: assert(0); return ""; } } bool mShouldLog; Severity mSeverity; }; //! \class Logger //! //! \brief Class which manages logging of TensorRT tools and samples //! //! \details This class provides a common interface for TensorRT tools and samples to log information to the console, //! and supports logging two types of messages: //! //! - Debugging messages with an associated severity (info, warning, error, or internal error/fatal) //! - Test pass/fail messages //! //! The advantage of having all samples use this class for logging as opposed to emitting directly to stdout/stderr is //! 
that the logic for controlling the verbosity and formatting of sample output is centralized in one location.
//!
//! In the future, this class could be extended to support dumping test results to a file in some standard format
//! (for example, JUnit XML), and providing additional metadata (e.g. timing the duration of a test run).
//!
//! TODO: For backwards compatibility with existing samples, this class inherits directly from the nvinfer1::ILogger
//! interface, which is problematic since there isn't a clean separation between messages coming from the TensorRT
//! library and messages coming from the sample.
//!
//! In the future (once all samples are updated to use Logger::getTRTLogger() to access the ILogger) we can refactor the
//! class to eliminate the inheritance and instead make the nvinfer1::ILogger implementation a member of the Logger
//! object.
class Logger : public nvinfer1::ILogger {
   public:
    //! \brief Creates a logger that reports messages of the given severity or more severe (default: kWARNING).
    Logger(Severity severity = Severity::kWARNING) : mReportableSeverity(severity) {}

    //!
    //! \enum TestResult
    //! \brief Represents the state of a given test
    //!
    enum class TestResult {
        kRUNNING,  //!< The test is running
        kPASSED,   //!< The test passed
        kFAILED,   //!< The test failed
        kWAIVED    //!< The test was waived
    };

    //!
    //! \brief Forward-compatible method for retrieving the nvinfer::ILogger associated with this Logger
    //! \return The nvinfer1::ILogger associated with this Logger
    //!
    //! TODO Once all samples are updated to use this method to register the logger with TensorRT,
    //! we can eliminate the inheritance of Logger from ILogger
    //!
    nvinfer1::ILogger& getTRTLogger() { return *this; }

    //!
    //! \brief Implementation of the nvinfer1::ILogger::log() virtual method
    //!
    //! Note samples should not be calling this function directly; it will eventually go away once we eliminate the
    //! inheritance from nvinfer1::ILogger
    //!
    void log(Severity severity, const char* msg) TRT_NOEXCEPT override {
        LogStreamConsumer(mReportableSeverity, severity) << "[TRT] " << std::string(msg) << std::endl;
    }

    //!
    //! \brief Method for controlling the verbosity of logging output
    //!
    //! \param severity The logger will only emit messages that have severity of this level or higher.
    //!
    void setReportableSeverity(Severity severity) { mReportableSeverity = severity; }

    //!
    //! \brief Opaque handle that holds logging information for a particular test
    //!
    //! This object is an opaque handle to information used by the Logger to print test results.
    //! The sample must call Logger::defineTest() in order to obtain a TestAtom that can be used
    //! with Logger::reportTest{Start,End}().
    //!
    class TestAtom {
       public:
        TestAtom(TestAtom&&) = default;

       private:
        friend class Logger;

        // Only Logger (a friend) can construct a TestAtom; see Logger::defineTest().
        TestAtom(bool started, const std::string& name, const std::string& cmdline)
            : mStarted(started), mName(name), mCmdline(cmdline) {}

        bool mStarted;
        std::string mName;
        std::string mCmdline;
    };

    //!
    //! \brief Define a test for logging
    //!
    //! \param[in] name The name of the test.  This should be a string starting with
    //!                  "TensorRT" and containing dot-separated strings containing
    //!                  the characters [A-Za-z0-9_].
    //!                  For example, "TensorRT.sample_googlenet"
    //! \param[in] cmdline The command line used to reproduce the test
    //
    //! \return a TestAtom that can be used in Logger::reportTest{Start,End}().
    //!
    static TestAtom defineTest(const std::string& name, const std::string& cmdline) {
        return TestAtom(false, name, cmdline);
    }

    //!
    //! \brief A convenience overloaded version of defineTest() that accepts an array of command-line arguments
    //!        as input
    //!
    //! \param[in] name The name of the test
    //! \param[in] argc The number of command-line arguments
    //! \param[in] argv The array of command-line arguments (given as C strings)
    //!
    //! \return a TestAtom that can be used in Logger::reportTest{Start,End}().
    static TestAtom defineTest(const std::string& name, int argc, char const* const* argv) {
        auto cmdline = genCmdlineString(argc, argv);
        return defineTest(name, cmdline);
    }

    //!
    //! \brief Report that a test has started.
    //!
    //! \pre reportTestStart() has not been called yet for the given testAtom
    //!
    //! \param[in] testAtom The handle to the test that has started
    //!
    static void reportTestStart(TestAtom& testAtom) {
        reportTestResult(testAtom, TestResult::kRUNNING);
        assert(!testAtom.mStarted);
        testAtom.mStarted = true;
    }

    //!
    //! \brief Report that a test has ended.
    //!
    //! \pre reportTestStart() has been called for the given testAtom
    //!
    //! \param[in] testAtom The handle to the test that has ended
    //! \param[in] result The result of the test. Should be one of TestResult::kPASSED,
    //!                   TestResult::kFAILED, TestResult::kWAIVED
    //!
    static void reportTestEnd(const TestAtom& testAtom, TestResult result) {
        assert(result != TestResult::kRUNNING);
        assert(testAtom.mStarted);
        reportTestResult(testAtom, result);
    }

    //! \brief Reports kPASSED for the test and returns EXIT_SUCCESS.
    static int reportPass(const TestAtom& testAtom) {
        reportTestEnd(testAtom, TestResult::kPASSED);
        return EXIT_SUCCESS;
    }

    //! \brief Reports kFAILED for the test and returns EXIT_FAILURE.
    static int reportFail(const TestAtom& testAtom) {
        reportTestEnd(testAtom, TestResult::kFAILED);
        return EXIT_FAILURE;
    }

    //! \brief Reports kWAIVED for the test and returns EXIT_SUCCESS.
    static int reportWaive(const TestAtom& testAtom) {
        reportTestEnd(testAtom, TestResult::kWAIVED);
        return EXIT_SUCCESS;
    }

    //! \brief Reports a pass or a fail depending on `pass` and returns the matching exit code.
    static int reportTest(const TestAtom& testAtom, bool pass) {
        return pass ? reportPass(testAtom) : reportFail(testAtom);
    }

    Severity getReportableSeverity() const { return mReportableSeverity; }

   private:
    //!
    //! \brief returns an appropriate string for prefixing a log message with the given severity
    //!
    static const char* severityPrefix(Severity severity) {
        switch (severity) {
            case Severity::kINTERNAL_ERROR:
                return "[F] ";
            case Severity::kERROR:
                return "[E] ";
            case Severity::kWARNING:
                return "[W] ";
            case Severity::kINFO:
                return "[I] ";
            case Severity::kVERBOSE:
                return "[V] ";
            default:
                assert(0);
                return "";
        }
    }

    //!
    //! \brief returns an appropriate string for prefixing a test result message with the given result
    //!
    static const char* testResultString(TestResult result) {
        switch (result) {
            case TestResult::kRUNNING:
                return "RUNNING";
            case TestResult::kPASSED:
                return "PASSED";
            case TestResult::kFAILED:
                return "FAILED";
            case TestResult::kWAIVED:
                return "WAIVED";
            default:
                assert(0);
                return "";
        }
    }

    //!
    //! \brief returns an appropriate output stream (cout or cerr) to use with the given severity
    //!
    static std::ostream& severityOstream(Severity severity) {
        return severity >= Severity::kINFO ? std::cout : std::cerr;
    }

    //!
    //! \brief method that implements logging test results
    //!
    static void reportTestResult(const TestAtom& testAtom, TestResult result) {
        severityOstream(Severity::kINFO) << "&&&& " << testResultString(result) << " " << testAtom.mName << " # "
                                         << testAtom.mCmdline << std::endl;
    }

    //!
    //! \brief generate a command line string from the given (argc, argv) values
    //!
    static std::string genCmdlineString(int argc, char const* const* argv) {
        std::stringstream ss;
        for (int i = 0; i < argc; i++) {
            if (i > 0)
                ss << " ";
            ss << argv[i];
        }
        return ss.str();
    }

    Severity mReportableSeverity;
};

namespace {

//!
//! \brief produces a LogStreamConsumer object that can be used to log messages of severity kVERBOSE
//!
//! Example usage:
//!
//!     LOG_VERBOSE(logger) << "hello world" << std::endl;
//!
inline LogStreamConsumer LOG_VERBOSE(const Logger& logger) {
    return LogStreamConsumer(logger.getReportableSeverity(), Severity::kVERBOSE);
}

//!
//! \brief produces a LogStreamConsumer object that can be used to log messages of severity kINFO
//!
//! Example usage:
//!
//!     LOG_INFO(logger) << "hello world" << std::endl;
//!
inline LogStreamConsumer LOG_INFO(const Logger& logger) {
    return LogStreamConsumer(logger.getReportableSeverity(), Severity::kINFO);
}

//!
//! \brief produces a LogStreamConsumer object that can be used to log messages of severity kWARNING
//!
//! Example usage:
//!
//!
LOG_WARN(logger) << "hello world" << std::endl;
//!
inline LogStreamConsumer LOG_WARN(const Logger& logger) {
    return LogStreamConsumer(logger.getReportableSeverity(), Severity::kWARNING);
}

//!
//! \brief produces a LogStreamConsumer object that can be used to log messages of severity kERROR
//!
//! Example usage:
//!
//!     LOG_ERROR(logger) << "hello world" << std::endl;
//!
inline LogStreamConsumer LOG_ERROR(const Logger& logger) {
    return LogStreamConsumer(logger.getReportableSeverity(), Severity::kERROR);
}

//!
//! \brief produces a LogStreamConsumer object that can be used to log messages of severity kINTERNAL_ERROR
//         ("fatal" severity)
//!
//! Example usage:
//!
//!     LOG_FATAL(logger) << "hello world" << std::endl;
//!
inline LogStreamConsumer LOG_FATAL(const Logger& logger) {
    return LogStreamConsumer(logger.getReportableSeverity(), Severity::kINTERNAL_ERROR);
}

} // anonymous namespace

#endif // TENSORRT_LOGGING_H

================================================
FILE: ufld/macros.h
================================================
#ifndef __MACROS_H
#define __MACROS_H

// API marks symbols for export when API_EXPORTS is defined; otherwise it marks
// them for import on MSVC and expands to nothing elsewhere.
#ifdef API_EXPORTS
#if defined(_MSC_VER)
#define API __declspec(dllexport)
#else
#define API __attribute__((visibility("default")))
#endif
#else

#if defined(_MSC_VER)
#define API __declspec(dllimport)
#else
#define API
#endif
#endif // API_EXPORTS

// From TensorRT 8 on, TRT_NOEXCEPT expands to noexcept and TRT_CONST_ENQUEUE to const;
// for older versions both expand to nothing.
// NOTE(review): NV_TENSORRT_MAJOR must already be defined by a TensorRT header included
// before this file, otherwise the test silently evaluates to false - confirm include order.
#if NV_TENSORRT_MAJOR >= 8
#define TRT_NOEXCEPT noexcept
#define TRT_CONST_ENQUEUE const
#else
#define TRT_NOEXCEPT
#define TRT_CONST_ENQUEUE
#endif

#endif // __MACROS_H

================================================
FILE: ufld/pth2onnx.py
================================================
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchvision import datasets, transforms
import torch.onnx as torch_onnx
from model.model import parsingNet

# Checkpoint to convert.
MODELPATH = "tusimple_18.pth"

# Backbone '18' with a (101, 56, 4) classification grid and no auxiliary head.
net = parsingNet(pretrained = False, backbone='18', cls_dim = (101, 56, 4), use_aux=False).cuda()

state_dict = torch.load(MODELPATH,
map_location='cpu')['model'] net.train(False) x = torch.randn(1, 3, 288, 800).cuda() torch_onnx.export(net, x, "lane.onnx", verbose=True, input_names=["input"], output_names=["output"],opset_version=11) ================================================ FILE: unet/CMakeLists.txt ================================================ cmake_minimum_required(VERSION 2.6) project(unet) add_definitions(-std=c++11) option(CUDA_USE_STATIC_CUDA_RUNTIME OFF) set(CMAKE_CXX_STANDARD 11) set(CMAKE_BUILD_TYPE Debug) # cuda directory include_directories(/usr/local/cuda/include/) link_directories(/usr/local/cuda/lib64/) # tensorrt include_directories(/workspace/TensorRT-7.2.3.4/include/) link_directories(/workspace/TensorRT-7.2.3.4/lib/) # opencv library find_package(OpenCV) include_directories(${OpenCV_INCLUDE_DIRS}) # link library and add exec file add_executable(unet ${PROJECT_SOURCE_DIR}/unet.cpp) target_link_libraries(unet nvinfer) target_link_libraries(unet cudart) target_link_libraries(unet ${OpenCV_LIBS}) add_definitions(-O2 -pthread) ================================================ FILE: unet/README.md ================================================ # UNet Pytorch model from [Pytorch-UNet](https://github.com/milesial/Pytorch-UNet). ## Contributors ## Requirements Now TensorRT 8.x is supported and you can use it. The key cause of the previous bug is the pooling layer Stride setting problem. ## Build and Run 1. Generate .wts ``` cp {path-of-tensorrtx}/unet/gen_wts.py Pytorch-UNet/ cd Pytorch-UNet/ wget https://github.com/milesial/Pytorch-UNet/releases/download/v3.0/unet_carvana_scale0.5_epoch2.pth python gen_wts.py unet_carvana_scale0.5_epoch2.pth ``` 2. Generate TensorRT engine ``` cd tensorrtx/unet/ mkdir build cd build cmake .. make cp {path-of-Pytorch-UNet}/unet.wts . ./unet -s ``` 3. Run inference ``` wget https://raw.githubusercontent.com/wang-xinyu/tensorrtx/f60dcc7bec28846cd973fc95ac829c4e57a11395/unet/samples/0cdf5b5d0ce1_01.jpg ./unet -d 0cdf5b5d0ce1_01.jpg ``` 4. 
Check result.jpg

# Benchmark Pytorch | TensorRT FP32 | TensorRT FP16 ---- | ----- | ------ 816x672 | 816x672 | 816x672 58ms | 43ms (batchsize 8) | 14ms (batchsize 8) ## More Information See the readme in [home page.](https://github.com/wang-xinyu/tensorrtx) ================================================ FILE: unet/common.hpp ================================================ #ifndef UNET_COMMON_H_ #define UNET_COMMON_H_ #include #include #include #include #include #include "NvInfer.h" #define CHECK(status) \ do\ {\ auto ret = (status);\ if (ret != 0)\ {\ std::cerr << "Cuda failure: " << ret << std::endl;\ abort();\ }\ } while (0) using namespace nvinfer1; // TensorRT weight files have a simple space delimited format: // [type] [size] std::map loadWeights(const std::string file) { std::cout << "Loading weights: " << file << std::endl; std::map weightMap; // Open weights file std::ifstream input(file); assert(input.is_open() && "Unable to load weight file. please check if the .wts file path is right!!!!!!"); // Read number of weight blobs int32_t count; input >> count; assert(count > 0 && "Invalid weight map file."); while (count--) { Weights wt{DataType::kFLOAT, nullptr, 0}; uint32_t size; // Read name and type of blob std::string name; input >> name >> std::dec >> size; wt.type = DataType::kFLOAT; // Load blob uint32_t* val = reinterpret_cast(malloc(sizeof(val) * size)); for (uint32_t x = 0, y = size; x < y; ++x) { input >> std::hex >> val[x]; } wt.values = val; wt.count = size; weightMap[name] = wt; } return weightMap; } IScaleLayer* addBatchNorm2d(INetworkDefinition *network, std::map& weightMap, ITensor& input, std::string lname, float eps) { float *gamma = (float*)weightMap[lname + ".weight"].values; float *beta = (float*)weightMap[lname + ".bias"].values; float *mean = (float*)weightMap[lname + ".running_mean"].values; float *var = (float*)weightMap[lname + ".running_var"].values; int len = weightMap[lname + ".running_var"].count; float *scval = 
reinterpret_cast(malloc(sizeof(float) * len)); for (int i = 0; i < len; i++) { scval[i] = gamma[i] / sqrt(var[i] + eps); } Weights scale{DataType::kFLOAT, scval, len}; float *shval = reinterpret_cast(malloc(sizeof(float) * len)); for (int i = 0; i < len; i++) { shval[i] = beta[i] - mean[i] * gamma[i] / sqrt(var[i] + eps); } Weights shift{DataType::kFLOAT, shval, len}; float *pval = reinterpret_cast(malloc(sizeof(float) * len)); for (int i = 0; i < len; i++) { pval[i] = 1.0; } Weights power{DataType::kFLOAT, pval, len}; weightMap[lname + ".scale"] = scale; weightMap[lname + ".shift"] = shift; weightMap[lname + ".power"] = power; IScaleLayer* scale_1 = network->addScale(input, ScaleMode::kCHANNEL, shift, scale, power); assert(scale_1); return scale_1; } #endif ================================================ FILE: unet/gen_wts.py ================================================ import torch import sys import struct def main(): device = torch.device('cpu') state_dict = torch.load(sys.argv[1], map_location=device) f = open("unet.wts", 'w') f.write("{}\n".format(len(state_dict.keys()))) for k, v in state_dict.items(): print('key: ', k) print('value: ', v.shape) vr = v.reshape(-1).cpu().numpy() f.write("{} {}".format(k, len(vr))) for vv in vr: f.write(" ") f.write(struct.pack(">f", float(vv)).hex()) f.write("\n") f.close() if __name__ == '__main__': main() ================================================ FILE: unet/logging.h ================================================ /* * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
* See the License for the specific language governing permissions and * limitations under the License. */ #ifndef TENSORRT_LOGGING_H #define TENSORRT_LOGGING_H #include "NvInferRuntimeCommon.h" #include #include #include #include #include #include #include #include "macros.h" using Severity = nvinfer1::ILogger::Severity; class LogStreamConsumerBuffer : public std::stringbuf { public: LogStreamConsumerBuffer(std::ostream& stream, const std::string& prefix, bool shouldLog) : mOutput(stream) , mPrefix(prefix) , mShouldLog(shouldLog) { } LogStreamConsumerBuffer(LogStreamConsumerBuffer&& other) : mOutput(other.mOutput) { } ~LogStreamConsumerBuffer() { // std::streambuf::pbase() gives a pointer to the beginning of the buffered part of the output sequence // std::streambuf::pptr() gives a pointer to the current position of the output sequence // if the pointer to the beginning is not equal to the pointer to the current position, // call putOutput() to log the output to the stream if (pbase() != pptr()) { putOutput(); } } // synchronizes the stream buffer and returns 0 on success // synchronizing the stream buffer consists of inserting the buffer contents into the stream, // resetting the buffer and flushing the stream virtual int sync() { putOutput(); return 0; } void putOutput() { if (mShouldLog) { // prepend timestamp std::time_t timestamp = std::time(nullptr); tm* tm_local = std::localtime(×tamp); std::cout << "["; std::cout << std::setw(2) << std::setfill('0') << 1 + tm_local->tm_mon << "/"; std::cout << std::setw(2) << std::setfill('0') << tm_local->tm_mday << "/"; std::cout << std::setw(4) << std::setfill('0') << 1900 + tm_local->tm_year << "-"; std::cout << std::setw(2) << std::setfill('0') << tm_local->tm_hour << ":"; std::cout << std::setw(2) << std::setfill('0') << tm_local->tm_min << ":"; std::cout << std::setw(2) << std::setfill('0') << tm_local->tm_sec << "] "; // std::stringbuf::str() gets the string contents of the buffer // insert the buffer contents 
pre-appended by the appropriate prefix into the stream mOutput << mPrefix << str(); // set the buffer to empty str(""); // flush the stream mOutput.flush(); } } void setShouldLog(bool shouldLog) { mShouldLog = shouldLog; } private: std::ostream& mOutput; std::string mPrefix; bool mShouldLog; }; //! //! \class LogStreamConsumerBase //! \brief Convenience object used to initialize LogStreamConsumerBuffer before std::ostream in LogStreamConsumer //! class LogStreamConsumerBase { public: LogStreamConsumerBase(std::ostream& stream, const std::string& prefix, bool shouldLog) : mBuffer(stream, prefix, shouldLog) { } protected: LogStreamConsumerBuffer mBuffer; }; //! //! \class LogStreamConsumer //! \brief Convenience object used to facilitate use of C++ stream syntax when logging messages. //! Order of base classes is LogStreamConsumerBase and then std::ostream. //! This is because the LogStreamConsumerBase class is used to initialize the LogStreamConsumerBuffer member field //! in LogStreamConsumer and then the address of the buffer is passed to std::ostream. //! This is necessary to prevent the address of an uninitialized buffer from being passed to std::ostream. //! Please do not change the order of the parent classes. //! class LogStreamConsumer : protected LogStreamConsumerBase, public std::ostream { public: //! \brief Creates a LogStreamConsumer which logs messages with level severity. //! Reportable severity determines if the messages are severe enough to be logged. 
LogStreamConsumer(Severity reportableSeverity, Severity severity) : LogStreamConsumerBase(severityOstream(severity), severityPrefix(severity), severity <= reportableSeverity) , std::ostream(&mBuffer) // links the stream buffer with the stream , mShouldLog(severity <= reportableSeverity) , mSeverity(severity) { } LogStreamConsumer(LogStreamConsumer&& other) : LogStreamConsumerBase(severityOstream(other.mSeverity), severityPrefix(other.mSeverity), other.mShouldLog) , std::ostream(&mBuffer) // links the stream buffer with the stream , mShouldLog(other.mShouldLog) , mSeverity(other.mSeverity) { } void setReportableSeverity(Severity reportableSeverity) { mShouldLog = mSeverity <= reportableSeverity; mBuffer.setShouldLog(mShouldLog); } private: static std::ostream& severityOstream(Severity severity) { return severity >= Severity::kINFO ? std::cout : std::cerr; } static std::string severityPrefix(Severity severity) { switch (severity) { case Severity::kINTERNAL_ERROR: return "[F] "; case Severity::kERROR: return "[E] "; case Severity::kWARNING: return "[W] "; case Severity::kINFO: return "[I] "; case Severity::kVERBOSE: return "[V] "; default: assert(0); return ""; } } bool mShouldLog; Severity mSeverity; }; //! \class Logger //! //! \brief Class which manages logging of TensorRT tools and samples //! //! \details This class provides a common interface for TensorRT tools and samples to log information to the console, //! and supports logging two types of messages: //! //! - Debugging messages with an associated severity (info, warning, error, or internal error/fatal) //! - Test pass/fail messages //! //! The advantage of having all samples use this class for logging as opposed to emitting directly to stdout/stderr is //! that the logic for controlling the verbosity and formatting of sample output is centralized in one location. //! //! In the future, this class could be extended to support dumping test results to a file in some standard format //! 
(for example, JUnit XML), and providing additional metadata (e.g. timing the duration of a test run). //! //! TODO: For backwards compatibility with existing samples, this class inherits directly from the nvinfer1::ILogger //! interface, which is problematic since there isn't a clean separation between messages coming from the TensorRT //! library and messages coming from the sample. //! //! In the future (once all samples are updated to use Logger::getTRTLogger() to access the ILogger) we can refactor the //! class to eliminate the inheritance and instead make the nvinfer1::ILogger implementation a member of the Logger //! object. class Logger : public nvinfer1::ILogger { public: Logger(Severity severity = Severity::kWARNING) : mReportableSeverity(severity) { } //! //! \enum TestResult //! \brief Represents the state of a given test //! enum class TestResult { kRUNNING, //!< The test is running kPASSED, //!< The test passed kFAILED, //!< The test failed kWAIVED //!< The test was waived }; //! //! \brief Forward-compatible method for retrieving the nvinfer::ILogger associated with this Logger //! \return The nvinfer1::ILogger associated with this Logger //! //! TODO Once all samples are updated to use this method to register the logger with TensorRT, //! we can eliminate the inheritance of Logger from ILogger //! nvinfer1::ILogger& getTRTLogger() { return *this; } //! //! \brief Implementation of the nvinfer1::ILogger::log() virtual method //! //! Note samples should not be calling this function directly; it will eventually go away once we eliminate the //! inheritance from nvinfer1::ILogger //! void log(Severity severity, const char* msg) TRT_NOEXCEPT override { LogStreamConsumer(mReportableSeverity, severity) << "[TRT] " << std::string(msg) << std::endl; } //! //! \brief Method for controlling the verbosity of logging output //! //! \param severity The logger will only emit messages that have severity of this level or higher. //! 
void setReportableSeverity(Severity severity) { mReportableSeverity = severity; } //! //! \brief Opaque handle that holds logging information for a particular test //! //! This object is an opaque handle to information used by the Logger to print test results. //! The sample must call Logger::defineTest() in order to obtain a TestAtom that can be used //! with Logger::reportTest{Start,End}(). //! class TestAtom { public: TestAtom(TestAtom&&) = default; private: friend class Logger; TestAtom(bool started, const std::string& name, const std::string& cmdline) : mStarted(started) , mName(name) , mCmdline(cmdline) { } bool mStarted; std::string mName; std::string mCmdline; }; //! //! \brief Define a test for logging //! //! \param[in] name The name of the test. This should be a string starting with //! "TensorRT" and containing dot-separated strings containing //! the characters [A-Za-z0-9_]. //! For example, "TensorRT.sample_googlenet" //! \param[in] cmdline The command line used to reproduce the test // //! \return a TestAtom that can be used in Logger::reportTest{Start,End}(). //! static TestAtom defineTest(const std::string& name, const std::string& cmdline) { return TestAtom(false, name, cmdline); } //! //! \brief A convenience overloaded version of defineTest() that accepts an array of command-line arguments //! as input //! //! \param[in] name The name of the test //! \param[in] argc The number of command-line arguments //! \param[in] argv The array of command-line arguments (given as C strings) //! //! \return a TestAtom that can be used in Logger::reportTest{Start,End}(). static TestAtom defineTest(const std::string& name, int argc, char const* const* argv) { auto cmdline = genCmdlineString(argc, argv); return defineTest(name, cmdline); } //! //! \brief Report that a test has started. //! //! \pre reportTestStart() has not been called yet for the given testAtom //! //! \param[in] testAtom The handle to the test that has started //! 
static void reportTestStart(TestAtom& testAtom) { reportTestResult(testAtom, TestResult::kRUNNING); assert(!testAtom.mStarted); testAtom.mStarted = true; } //! //! \brief Report that a test has ended. //! //! \pre reportTestStart() has been called for the given testAtom //! //! \param[in] testAtom The handle to the test that has ended //! \param[in] result The result of the test. Should be one of TestResult::kPASSED, //! TestResult::kFAILED, TestResult::kWAIVED //! static void reportTestEnd(const TestAtom& testAtom, TestResult result) { assert(result != TestResult::kRUNNING); assert(testAtom.mStarted); reportTestResult(testAtom, result); } static int reportPass(const TestAtom& testAtom) { reportTestEnd(testAtom, TestResult::kPASSED); return EXIT_SUCCESS; } static int reportFail(const TestAtom& testAtom) { reportTestEnd(testAtom, TestResult::kFAILED); return EXIT_FAILURE; } static int reportWaive(const TestAtom& testAtom) { reportTestEnd(testAtom, TestResult::kWAIVED); return EXIT_SUCCESS; } static int reportTest(const TestAtom& testAtom, bool pass) { return pass ? reportPass(testAtom) : reportFail(testAtom); } Severity getReportableSeverity() const { return mReportableSeverity; } private: //! //! \brief returns an appropriate string for prefixing a log message with the given severity //! static const char* severityPrefix(Severity severity) { switch (severity) { case Severity::kINTERNAL_ERROR: return "[F] "; case Severity::kERROR: return "[E] "; case Severity::kWARNING: return "[W] "; case Severity::kINFO: return "[I] "; case Severity::kVERBOSE: return "[V] "; default: assert(0); return ""; } } //! //! \brief returns an appropriate string for prefixing a test result message with the given result //! 
static const char* testResultString(TestResult result) { switch (result) { case TestResult::kRUNNING: return "RUNNING"; case TestResult::kPASSED: return "PASSED"; case TestResult::kFAILED: return "FAILED"; case TestResult::kWAIVED: return "WAIVED"; default: assert(0); return ""; } } //! //! \brief returns an appropriate output stream (cout or cerr) to use with the given severity //! static std::ostream& severityOstream(Severity severity) { return severity >= Severity::kINFO ? std::cout : std::cerr; } //! //! \brief method that implements logging test results //! static void reportTestResult(const TestAtom& testAtom, TestResult result) { severityOstream(Severity::kINFO) << "&&&& " << testResultString(result) << " " << testAtom.mName << " # " << testAtom.mCmdline << std::endl; } //! //! \brief generate a command line string from the given (argc, argv) values //! static std::string genCmdlineString(int argc, char const* const* argv) { std::stringstream ss; for (int i = 0; i < argc; i++) { if (i > 0) ss << " "; ss << argv[i]; } return ss.str(); } Severity mReportableSeverity; }; namespace { //! //! \brief produces a LogStreamConsumer object that can be used to log messages of severity kVERBOSE //! //! Example usage: //! //! LOG_VERBOSE(logger) << "hello world" << std::endl; //! inline LogStreamConsumer LOG_VERBOSE(const Logger& logger) { return LogStreamConsumer(logger.getReportableSeverity(), Severity::kVERBOSE); } //! //! \brief produces a LogStreamConsumer object that can be used to log messages of severity kINFO //! //! Example usage: //! //! LOG_INFO(logger) << "hello world" << std::endl; //! inline LogStreamConsumer LOG_INFO(const Logger& logger) { return LogStreamConsumer(logger.getReportableSeverity(), Severity::kINFO); } //! //! \brief produces a LogStreamConsumer object that can be used to log messages of severity kWARNING //! //! Example usage: //! //! LOG_WARN(logger) << "hello world" << std::endl; //! 
inline LogStreamConsumer LOG_WARN(const Logger& logger) { return LogStreamConsumer(logger.getReportableSeverity(), Severity::kWARNING); } //! //! \brief produces a LogStreamConsumer object that can be used to log messages of severity kERROR //! //! Example usage: //! //! LOG_ERROR(logger) << "hello world" << std::endl; //! inline LogStreamConsumer LOG_ERROR(const Logger& logger) { return LogStreamConsumer(logger.getReportableSeverity(), Severity::kERROR); } //! //! \brief produces a LogStreamConsumer object that can be used to log messages of severity kINTERNAL_ERROR // ("fatal" severity) //! //! Example usage: //! //! LOG_FATAL(logger) << "hello world" << std::endl; //! inline LogStreamConsumer LOG_FATAL(const Logger& logger) { return LogStreamConsumer(logger.getReportableSeverity(), Severity::kINTERNAL_ERROR); } } // anonymous namespace #endif // TENSORRT_LOGGING_H ================================================ FILE: unet/macros.h ================================================ #ifndef __MACROS_H #define __MACROS_H #ifdef API_EXPORTS #if defined(_MSC_VER) #define API __declspec(dllexport) #else #define API __attribute__((visibility("default"))) #endif #else #if defined(_MSC_VER) #define API __declspec(dllimport) #else #define API #endif #endif // API_EXPORTS #if NV_TENSORRT_MAJOR >= 8 #define TRT_NOEXCEPT noexcept #define TRT_CONST_ENQUEUE const #else #define TRT_NOEXCEPT #define TRT_CONST_ENQUEUE #endif #endif // __MACROS_H ================================================ FILE: unet/unet.cpp ================================================ #include #include #include "cuda_runtime_api.h" #include "logging.h" #include "common.hpp" #define DEVICE 0 #define USE_FP32 // USE_FP32 or USE_FP16 #define CONF_THRESH 0.5 #define BATCH_SIZE 1 #define cls 2 #define BILINEAR false // stuff we know about the network and the input/output blobs static const int INPUT_H = 640; static const int INPUT_W = 959; static const int OUTPUT_SIZE = INPUT_H * INPUT_W * cls; const char* 
INPUT_BLOB_NAME = "data"; const char* OUTPUT_BLOB_NAME = "prob"; static Logger gLogger; using namespace nvinfer1; ILayer* doubleConv(INetworkDefinition* network, std::map& weightMap, ITensor& input, int outch, int ksize, std::string lname, int midch) { Weights emptywts{ DataType::kFLOAT, nullptr, 0 }; IConvolutionLayer* conv1 = network->addConvolutionNd(input, midch, DimsHW{ ksize, ksize }, weightMap[lname + ".double_conv.0.weight"], emptywts); conv1->setStrideNd(DimsHW{ 1, 1 }); conv1->setPaddingNd(DimsHW{ 1, 1 }); conv1->setNbGroups(1); IScaleLayer* bn1 = addBatchNorm2d(network, weightMap, *conv1->getOutput(0), lname + ".double_conv.1", 0); IActivationLayer* relu1 = network->addActivation(*bn1->getOutput(0), ActivationType::kLEAKY_RELU); IConvolutionLayer* conv2 = network->addConvolutionNd(*relu1->getOutput(0), outch, DimsHW{ 3, 3 }, weightMap[lname + ".double_conv.3.weight"], emptywts); conv2->setStrideNd(DimsHW{ 1, 1 }); conv2->setPaddingNd(DimsHW{ 1, 1 }); conv2->setNbGroups(1); IScaleLayer* bn2 = addBatchNorm2d(network, weightMap, *conv2->getOutput(0), lname + ".double_conv.4", 0); IActivationLayer* relu2 = network->addActivation(*bn2->getOutput(0), ActivationType::kLEAKY_RELU); assert(relu2); return relu2; } ILayer* down(INetworkDefinition* network, std::map& weightMap, ITensor& input, int outch, int p, std::string lname) { IPoolingLayer* pool1 = network->addPoolingNd(input, PoolingType::kMAX, DimsHW{ 2, 2 }); pool1->setStrideNd(DimsHW{ 2, 2 }); assert(pool1); ILayer* dcov1 = doubleConv(network, weightMap, *pool1->getOutput(0), outch, 3, lname + ".maxpool_conv.1", outch); assert(dcov1); return dcov1; } ILayer* up(INetworkDefinition* network, std::map& weightMap, ITensor& input1, ITensor& input2, int resize, int outch, int midch, std::string lname) { if (BILINEAR) { // add upsample bilinear IResizeLayer* deconv1 = network->addResize(input1); auto outdims = input2.getDimensions(); deconv1->setOutputDimensions(outdims); 
deconv1->setResizeMode(ResizeMode::kLINEAR); deconv1->setAlignCorners(true); int diffx = input2.getDimensions().d[1] - deconv1->getOutput(0)->getDimensions().d[1]; int diffy = input2.getDimensions().d[2] - deconv1->getOutput(0)->getDimensions().d[2]; ILayer* pad1 = network->addPaddingNd(*deconv1->getOutput(0), DimsHW{ diffx / 2, diffy / 2 }, DimsHW{ diffx - (diffx / 2), diffy - (diffy / 2) }); // dcov1->setPaddingNd(DimsHW{diffx / 2, diffx - diffx / 2},DimsHW{diffy / 2, diffy - diffy / 2}); ITensor* inputTensors[] = { &input2,pad1->getOutput(0) }; auto cat = network->addConcatenation(inputTensors, 2); assert(cat); if (midch == 64) { ILayer* dcov1 = doubleConv(network, weightMap, *cat->getOutput(0), outch, 3, lname + ".conv", outch); assert(dcov1); return dcov1; } else { int midch1 = outch / 2; ILayer* dcov1 = doubleConv(network, weightMap, *cat->getOutput(0), midch1, 3, lname + ".conv", outch); assert(dcov1); return dcov1; } } else { IDeconvolutionLayer* deconv1 = network->addDeconvolutionNd(input1, resize, DimsHW{ 2, 2 }, weightMap[lname + ".up.weight"], weightMap[lname + ".up.bias"]); deconv1->setStrideNd(DimsHW{ 2, 2 }); deconv1->setNbGroups(1); int diffx = input2.getDimensions().d[1] - deconv1->getOutput(0)->getDimensions().d[1]; int diffy = input2.getDimensions().d[2] - deconv1->getOutput(0)->getDimensions().d[2]; ILayer* pad1 = network->addPaddingNd(*deconv1->getOutput(0), DimsHW{ diffx / 2, diffy / 2 }, DimsHW{ diffx - (diffx / 2), diffy - (diffy / 2) }); // dcov1->setPaddingNd(DimsHW{diffx / 2, diffx - diffx / 2},DimsHW{diffy / 2, diffy - diffy / 2}); ITensor* inputTensors[] = { &input2,pad1->getOutput(0) }; auto cat = network->addConcatenation(inputTensors, 2); assert(cat); ILayer* dcov1 = doubleConv(network, weightMap, *cat->getOutput(0), midch, 3, lname + ".conv", outch); assert(dcov1); return dcov1; } } ILayer* outConv(INetworkDefinition* network, std::map& weightMap, ITensor& input, int outch, std::string lname) { // Weights emptywts{DataType::kFLOAT, 
nullptr, 0}; IConvolutionLayer* conv1 = network->addConvolutionNd(input, cls, DimsHW{ 1, 1 }, weightMap[lname + ".conv.weight"], weightMap[lname + ".conv.bias"]); assert(conv1); conv1->setStrideNd(DimsHW{ 1, 1 }); conv1->setPaddingNd(DimsHW{ 0, 0 }); conv1->setNbGroups(1); return conv1; } ICudaEngine* createEngine(unsigned int maxBatchSize, IBuilder* builder, IBuilderConfig* config, DataType dt, std::string wts_path) { INetworkDefinition* network = builder->createNetworkV2(0U); // Create input tensor of shape {3, INPUT_H, INPUT_W} with name INPUT_BLOB_NAME ITensor* data = network->addInput(INPUT_BLOB_NAME, dt, Dims3{ 3, INPUT_H, INPUT_W }); assert(data); std::map weightMap = loadWeights(wts_path); Weights emptywts{ DataType::kFLOAT, nullptr, 0 }; // build network auto x1 = doubleConv(network, weightMap, *data, 64, 3, "inc", 64); auto x2 = down(network, weightMap, *x1->getOutput(0), 128, 1, "down1"); auto x3 = down(network, weightMap, *x2->getOutput(0), 256, 1, "down2"); auto x4 = down(network, weightMap, *x3->getOutput(0), 512, 1, "down3"); auto channel = 512; if (!BILINEAR) { channel = 1024; } auto x5 = down(network, weightMap, *x4->getOutput(0), channel, 1, "down4"); ILayer* x6 = up(network, weightMap, *x5->getOutput(0), *x4->getOutput(0), 512, 512, 512, "up1"); ILayer* x7 = up(network, weightMap, *x6->getOutput(0), *x3->getOutput(0), 256, 256, 256, "up2"); ILayer* x8 = up(network, weightMap, *x7->getOutput(0), *x2->getOutput(0), 128, 128, 128, "up3"); ILayer* x9 = up(network, weightMap, *x8->getOutput(0), *x1->getOutput(0), 64, 64, 64, "up4"); ILayer* x10 = outConv(network, weightMap, *x9->getOutput(0), OUTPUT_SIZE, "outc"); x10->getOutput(0)->setName(OUTPUT_BLOB_NAME); network->markOutput(*x10->getOutput(0)); // Build engine builder->setMaxBatchSize(maxBatchSize); config->setMaxWorkspaceSize(16 * (1 << 20)); // 16MB #ifdef USE_FP16 config->setFlag(BuilderFlag::kFP16); #endif std::cout << "Building engine, please wait for a while..." 
<< std::endl; ICudaEngine* engine = builder->buildEngineWithConfig(*network, *config); std::cout << "Build engine successfully!" << std::endl; // Don't need the network any more network->destroy(); // Release host memory for (auto& mem : weightMap) { free((void*)(mem.second.values)); } return engine; } void APIToModel(unsigned int maxBatchSize, IHostMemory** model_stream, std::string wts_path) { // Create builder IBuilder* builder = createInferBuilder(gLogger); IBuilderConfig* config = builder->createBuilderConfig(); // Create model to populate the network, then set the outputs and create an engine ICudaEngine* engine = createEngine(maxBatchSize, builder, config, DataType::kFLOAT, wts_path); assert(engine != nullptr); // Serialize the engine (*model_stream) = engine->serialize(); // Close everything down engine->destroy(); builder->destroy(); } void doInference(IExecutionContext& context, float* input, float* output, int batchSize) { const ICudaEngine& engine = context.getEngine(); // Pointers to input and output device buffers to pass to engine. // Engine requires exactly IEngine::getNbBindings() number of buffers. assert(engine.getNbBindings() == 2); void* buffers[2]; // In order to bind the buffers, we need to know the names of the input and output tensors. 
// Note that indices are guaranteed to be less than IEngine::getNbBindings() const int inputIndex = engine.getBindingIndex(INPUT_BLOB_NAME); const int outputIndex = engine.getBindingIndex(OUTPUT_BLOB_NAME); // Create GPU buffers on device CHECK(cudaMalloc(&buffers[inputIndex], batchSize * 3 * INPUT_H * INPUT_W * sizeof(float))); CHECK(cudaMalloc(&buffers[outputIndex], batchSize * OUTPUT_SIZE * sizeof(float))); // Create stream cudaStream_t stream; CHECK(cudaStreamCreate(&stream)); // DMA input batch data to device, infer on the batch asynchronously, and DMA output back to host CHECK(cudaMemcpyAsync(buffers[inputIndex], input, batchSize * 3 * INPUT_H * INPUT_W * sizeof(float), cudaMemcpyHostToDevice, stream)); context.enqueue(batchSize, buffers, stream, nullptr); CHECK(cudaMemcpyAsync(output, buffers[outputIndex], batchSize * OUTPUT_SIZE * sizeof(float), cudaMemcpyDeviceToHost, stream)); cudaStreamSynchronize(stream); // Release stream and buffers cudaStreamDestroy(stream); CHECK(cudaFree(buffers[inputIndex])); CHECK(cudaFree(buffers[outputIndex])); } int main(int argc, char** argv) { cudaSetDevice(DEVICE); char* trt_model_stream = nullptr; size_t size = 0; std::string engine_name = "unet.engine"; std::string wts_path = "unet.wts"; if (argc == 2 && std::string(argv[1]) == "-s") { // Create a TensorRT model and serialize it to a file IHostMemory* model_stream{ nullptr }; APIToModel(BATCH_SIZE, &model_stream, wts_path); assert(model_stream != nullptr); std::ofstream p(engine_name, std::ios::binary); if (!p) { std::cerr << "could not open plan output file" << std::endl; return -1; } p.write(reinterpret_cast(model_stream->data()), model_stream->size()); model_stream->destroy(); return 0; } else if (argc == 3 && std::string(argv[1]) == "-d") { // Load engine file std::ifstream file(engine_name, std::ios::binary); if (file.good()) { file.seekg(0, file.end); size = file.tellg(); file.seekg(0, file.beg); trt_model_stream = new char[size]; assert(trt_model_stream); 
file.read(trt_model_stream, size); file.close(); } } else { std::cerr << "arguments not right!" << std::endl; std::cerr << "./unet -s // serialize model to plan file" << std::endl; std::cerr << "./unet -d ../samples // deserialize plan file and run inference" << std::endl; return -1; } // Prepare input output data static float data[BATCH_SIZE * 3 * INPUT_H * INPUT_W]; static float prob[BATCH_SIZE * OUTPUT_SIZE]; // Deserialize engine IRuntime* runtime = createInferRuntime(gLogger); assert(runtime != nullptr); ICudaEngine* engine = runtime->deserializeCudaEngine(trt_model_stream, size); assert(engine != nullptr); IExecutionContext* context = engine->createExecutionContext(); assert(context != nullptr); delete[] trt_model_stream; cv::Mat img = cv::imread(argv[2]); // Preprocess cv::resize(img, img, cv::Size(INPUT_W, INPUT_H)); for (int i = 0; i < INPUT_H * INPUT_W; i++) { data[i] = (img.at(i)[2]) / 255.0; data[i + INPUT_H * INPUT_W] = (img.at(i)[1]) / 255.0; data[i + 2 * INPUT_H * INPUT_W] = (img.at(i)[0]) / 255.0; } // Run inference auto start = std::chrono::system_clock::now(); doInference(*context, data, prob, BATCH_SIZE); auto end = std::chrono::system_clock::now(); std::cout << std::chrono::duration_cast(end - start).count() << "ms" << std::endl; // Postprocess cv::Mat result = cv::Mat::zeros(INPUT_H, INPUT_W, CV_8UC3); for (int i = 0; i < INPUT_H * INPUT_W; i++) { float fmax = prob[i]; int index = 0; for (int j = 1; j < cls; j++) { if (prob[i + j * INPUT_H * INPUT_W] > fmax) { index = j; fmax = prob[i + j * INPUT_H * INPUT_W]; } } if (index == 1) { result.at(i) = cv::Vec3b(255, 255, 255); } } cv::imwrite("result.jpg", result); // Destroy the engine context->destroy(); engine->destroy(); runtime->destroy(); return 0; } ================================================ FILE: vgg/CMakeLists.txt ================================================ cmake_minimum_required(VERSION 2.6) project(vgg) add_definitions(-std=c++11) option(CUDA_USE_STATIC_CUDA_RUNTIME OFF) 
set(CMAKE_CXX_STANDARD 11) set(CMAKE_BUILD_TYPE Debug) include_directories(${PROJECT_SOURCE_DIR}/include) # include and link dirs of cuda and tensorrt, you need adapt them if yours are different # cuda include_directories(/usr/local/cuda/include) link_directories(/usr/local/cuda/lib64) # tensorrt include_directories(/usr/include/x86_64-linux-gnu/) link_directories(/usr/lib/x86_64-linux-gnu/) add_executable(vgg ${PROJECT_SOURCE_DIR}/vgg11.cpp) target_link_libraries(vgg nvinfer) target_link_libraries(vgg cudart) add_definitions(-O2 -pthread) ================================================ FILE: vgg/README.md ================================================ # vgg VGG 11-layer model (configuration "A") from "Very Deep Convolutional Networks For Large-Scale Image Recognition" For the Pytorch implementation, you can refer to [pytorchx/vgg](https://github.com/wang-xinyu/pytorchx/tree/master/vgg) VGG's architecture is simple, just some conv, relu, maxpool, and fc layers. ``` // 1. generate vgg.wts from [pytorchx/vgg](https://github.com/wang-xinyu/pytorchx/tree/master/vgg) // 2. put vgg.wts into tensorrtx/vgg // 3. build and run cd tensorrtx/vgg mkdir build cd build cmake .. make sudo ./vgg -s // serialize model to plan file i.e. 'vgg.engine' sudo ./vgg -d // deserialize plan file and run inference // 4. see if the output is same as pytorchx/vgg ``` ================================================ FILE: vgg/logging.h ================================================ /* * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
* See the License for the specific language governing permissions and * limitations under the License. */ #ifndef TENSORRT_LOGGING_H #define TENSORRT_LOGGING_H #include "NvInferRuntimeCommon.h" #include #include #include #include #include #include #include using Severity = nvinfer1::ILogger::Severity; class LogStreamConsumerBuffer : public std::stringbuf { public: LogStreamConsumerBuffer(std::ostream& stream, const std::string& prefix, bool shouldLog) : mOutput(stream) , mPrefix(prefix) , mShouldLog(shouldLog) { } LogStreamConsumerBuffer(LogStreamConsumerBuffer&& other) : mOutput(other.mOutput) { } ~LogStreamConsumerBuffer() { // std::streambuf::pbase() gives a pointer to the beginning of the buffered part of the output sequence // std::streambuf::pptr() gives a pointer to the current position of the output sequence // if the pointer to the beginning is not equal to the pointer to the current position, // call putOutput() to log the output to the stream if (pbase() != pptr()) { putOutput(); } } // synchronizes the stream buffer and returns 0 on success // synchronizing the stream buffer consists of inserting the buffer contents into the stream, // resetting the buffer and flushing the stream virtual int sync() { putOutput(); return 0; } void putOutput() { if (mShouldLog) { // prepend timestamp std::time_t timestamp = std::time(nullptr); tm* tm_local = std::localtime(×tamp); std::cout << "["; std::cout << std::setw(2) << std::setfill('0') << 1 + tm_local->tm_mon << "/"; std::cout << std::setw(2) << std::setfill('0') << tm_local->tm_mday << "/"; std::cout << std::setw(4) << std::setfill('0') << 1900 + tm_local->tm_year << "-"; std::cout << std::setw(2) << std::setfill('0') << tm_local->tm_hour << ":"; std::cout << std::setw(2) << std::setfill('0') << tm_local->tm_min << ":"; std::cout << std::setw(2) << std::setfill('0') << tm_local->tm_sec << "] "; // std::stringbuf::str() gets the string contents of the buffer // insert the buffer contents pre-appended by the 
appropriate prefix into the stream mOutput << mPrefix << str(); // set the buffer to empty str(""); // flush the stream mOutput.flush(); } } void setShouldLog(bool shouldLog) { mShouldLog = shouldLog; } private: std::ostream& mOutput; std::string mPrefix; bool mShouldLog; }; //! //! \class LogStreamConsumerBase //! \brief Convenience object used to initialize LogStreamConsumerBuffer before std::ostream in LogStreamConsumer //! class LogStreamConsumerBase { public: LogStreamConsumerBase(std::ostream& stream, const std::string& prefix, bool shouldLog) : mBuffer(stream, prefix, shouldLog) { } protected: LogStreamConsumerBuffer mBuffer; }; //! //! \class LogStreamConsumer //! \brief Convenience object used to facilitate use of C++ stream syntax when logging messages. //! Order of base classes is LogStreamConsumerBase and then std::ostream. //! This is because the LogStreamConsumerBase class is used to initialize the LogStreamConsumerBuffer member field //! in LogStreamConsumer and then the address of the buffer is passed to std::ostream. //! This is necessary to prevent the address of an uninitialized buffer from being passed to std::ostream. //! Please do not change the order of the parent classes. //! class LogStreamConsumer : protected LogStreamConsumerBase, public std::ostream { public: //! \brief Creates a LogStreamConsumer which logs messages with level severity. //! Reportable severity determines if the messages are severe enough to be logged. 
LogStreamConsumer(Severity reportableSeverity, Severity severity) : LogStreamConsumerBase(severityOstream(severity), severityPrefix(severity), severity <= reportableSeverity) , std::ostream(&mBuffer) // links the stream buffer with the stream , mShouldLog(severity <= reportableSeverity) , mSeverity(severity) { } LogStreamConsumer(LogStreamConsumer&& other) : LogStreamConsumerBase(severityOstream(other.mSeverity), severityPrefix(other.mSeverity), other.mShouldLog) , std::ostream(&mBuffer) // links the stream buffer with the stream , mShouldLog(other.mShouldLog) , mSeverity(other.mSeverity) { } void setReportableSeverity(Severity reportableSeverity) { mShouldLog = mSeverity <= reportableSeverity; mBuffer.setShouldLog(mShouldLog); } private: static std::ostream& severityOstream(Severity severity) { return severity >= Severity::kINFO ? std::cout : std::cerr; } static std::string severityPrefix(Severity severity) { switch (severity) { case Severity::kINTERNAL_ERROR: return "[F] "; case Severity::kERROR: return "[E] "; case Severity::kWARNING: return "[W] "; case Severity::kINFO: return "[I] "; case Severity::kVERBOSE: return "[V] "; default: assert(0); return ""; } } bool mShouldLog; Severity mSeverity; }; //! \class Logger //! //! \brief Class which manages logging of TensorRT tools and samples //! //! \details This class provides a common interface for TensorRT tools and samples to log information to the console, //! and supports logging two types of messages: //! //! - Debugging messages with an associated severity (info, warning, error, or internal error/fatal) //! - Test pass/fail messages //! //! The advantage of having all samples use this class for logging as opposed to emitting directly to stdout/stderr is //! that the logic for controlling the verbosity and formatting of sample output is centralized in one location. //! //! In the future, this class could be extended to support dumping test results to a file in some standard format //! 
(for example, JUnit XML), and providing additional metadata (e.g. timing the duration of a test run). //! //! TODO: For backwards compatibility with existing samples, this class inherits directly from the nvinfer1::ILogger //! interface, which is problematic since there isn't a clean separation between messages coming from the TensorRT //! library and messages coming from the sample. //! //! In the future (once all samples are updated to use Logger::getTRTLogger() to access the ILogger) we can refactor the //! class to eliminate the inheritance and instead make the nvinfer1::ILogger implementation a member of the Logger //! object. class Logger : public nvinfer1::ILogger { public: Logger(Severity severity = Severity::kWARNING) : mReportableSeverity(severity) { } //! //! \enum TestResult //! \brief Represents the state of a given test //! enum class TestResult { kRUNNING, //!< The test is running kPASSED, //!< The test passed kFAILED, //!< The test failed kWAIVED //!< The test was waived }; //! //! \brief Forward-compatible method for retrieving the nvinfer::ILogger associated with this Logger //! \return The nvinfer1::ILogger associated with this Logger //! //! TODO Once all samples are updated to use this method to register the logger with TensorRT, //! we can eliminate the inheritance of Logger from ILogger //! nvinfer1::ILogger& getTRTLogger() { return *this; } //! //! \brief Implementation of the nvinfer1::ILogger::log() virtual method //! //! Note samples should not be calling this function directly; it will eventually go away once we eliminate the //! inheritance from nvinfer1::ILogger //! void log(Severity severity, const char* msg) override { LogStreamConsumer(mReportableSeverity, severity) << "[TRT] " << std::string(msg) << std::endl; } //! //! \brief Method for controlling the verbosity of logging output //! //! \param severity The logger will only emit messages that have severity of this level or higher. //! 
void setReportableSeverity(Severity severity) { mReportableSeverity = severity; } //! //! \brief Opaque handle that holds logging information for a particular test //! //! This object is an opaque handle to information used by the Logger to print test results. //! The sample must call Logger::defineTest() in order to obtain a TestAtom that can be used //! with Logger::reportTest{Start,End}(). //! class TestAtom { public: TestAtom(TestAtom&&) = default; private: friend class Logger; TestAtom(bool started, const std::string& name, const std::string& cmdline) : mStarted(started) , mName(name) , mCmdline(cmdline) { } bool mStarted; std::string mName; std::string mCmdline; }; //! //! \brief Define a test for logging //! //! \param[in] name The name of the test. This should be a string starting with //! "TensorRT" and containing dot-separated strings containing //! the characters [A-Za-z0-9_]. //! For example, "TensorRT.sample_googlenet" //! \param[in] cmdline The command line used to reproduce the test // //! \return a TestAtom that can be used in Logger::reportTest{Start,End}(). //! static TestAtom defineTest(const std::string& name, const std::string& cmdline) { return TestAtom(false, name, cmdline); } //! //! \brief A convenience overloaded version of defineTest() that accepts an array of command-line arguments //! as input //! //! \param[in] name The name of the test //! \param[in] argc The number of command-line arguments //! \param[in] argv The array of command-line arguments (given as C strings) //! //! \return a TestAtom that can be used in Logger::reportTest{Start,End}(). static TestAtom defineTest(const std::string& name, int argc, char const* const* argv) { auto cmdline = genCmdlineString(argc, argv); return defineTest(name, cmdline); } //! //! \brief Report that a test has started. //! //! \pre reportTestStart() has not been called yet for the given testAtom //! //! \param[in] testAtom The handle to the test that has started //! 
static void reportTestStart(TestAtom& testAtom) { reportTestResult(testAtom, TestResult::kRUNNING); assert(!testAtom.mStarted); testAtom.mStarted = true; } //! //! \brief Report that a test has ended. //! //! \pre reportTestStart() has been called for the given testAtom //! //! \param[in] testAtom The handle to the test that has ended //! \param[in] result The result of the test. Should be one of TestResult::kPASSED, //! TestResult::kFAILED, TestResult::kWAIVED //! static void reportTestEnd(const TestAtom& testAtom, TestResult result) { assert(result != TestResult::kRUNNING); assert(testAtom.mStarted); reportTestResult(testAtom, result); } static int reportPass(const TestAtom& testAtom) { reportTestEnd(testAtom, TestResult::kPASSED); return EXIT_SUCCESS; } static int reportFail(const TestAtom& testAtom) { reportTestEnd(testAtom, TestResult::kFAILED); return EXIT_FAILURE; } static int reportWaive(const TestAtom& testAtom) { reportTestEnd(testAtom, TestResult::kWAIVED); return EXIT_SUCCESS; } static int reportTest(const TestAtom& testAtom, bool pass) { return pass ? reportPass(testAtom) : reportFail(testAtom); } Severity getReportableSeverity() const { return mReportableSeverity; } private: //! //! \brief returns an appropriate string for prefixing a log message with the given severity //! static const char* severityPrefix(Severity severity) { switch (severity) { case Severity::kINTERNAL_ERROR: return "[F] "; case Severity::kERROR: return "[E] "; case Severity::kWARNING: return "[W] "; case Severity::kINFO: return "[I] "; case Severity::kVERBOSE: return "[V] "; default: assert(0); return ""; } } //! //! \brief returns an appropriate string for prefixing a test result message with the given result //! 
static const char* testResultString(TestResult result) { switch (result) { case TestResult::kRUNNING: return "RUNNING"; case TestResult::kPASSED: return "PASSED"; case TestResult::kFAILED: return "FAILED"; case TestResult::kWAIVED: return "WAIVED"; default: assert(0); return ""; } } //! //! \brief returns an appropriate output stream (cout or cerr) to use with the given severity //! static std::ostream& severityOstream(Severity severity) { return severity >= Severity::kINFO ? std::cout : std::cerr; } //! //! \brief method that implements logging test results //! static void reportTestResult(const TestAtom& testAtom, TestResult result) { severityOstream(Severity::kINFO) << "&&&& " << testResultString(result) << " " << testAtom.mName << " # " << testAtom.mCmdline << std::endl; } //! //! \brief generate a command line string from the given (argc, argv) values //! static std::string genCmdlineString(int argc, char const* const* argv) { std::stringstream ss; for (int i = 0; i < argc; i++) { if (i > 0) ss << " "; ss << argv[i]; } return ss.str(); } Severity mReportableSeverity; }; namespace { //! //! \brief produces a LogStreamConsumer object that can be used to log messages of severity kVERBOSE //! //! Example usage: //! //! LOG_VERBOSE(logger) << "hello world" << std::endl; //! inline LogStreamConsumer LOG_VERBOSE(const Logger& logger) { return LogStreamConsumer(logger.getReportableSeverity(), Severity::kVERBOSE); } //! //! \brief produces a LogStreamConsumer object that can be used to log messages of severity kINFO //! //! Example usage: //! //! LOG_INFO(logger) << "hello world" << std::endl; //! inline LogStreamConsumer LOG_INFO(const Logger& logger) { return LogStreamConsumer(logger.getReportableSeverity(), Severity::kINFO); } //! //! \brief produces a LogStreamConsumer object that can be used to log messages of severity kWARNING //! //! Example usage: //! //! LOG_WARN(logger) << "hello world" << std::endl; //! 
inline LogStreamConsumer LOG_WARN(const Logger& logger) { return LogStreamConsumer(logger.getReportableSeverity(), Severity::kWARNING); } //! //! \brief produces a LogStreamConsumer object that can be used to log messages of severity kERROR //! //! Example usage: //! //! LOG_ERROR(logger) << "hello world" << std::endl; //! inline LogStreamConsumer LOG_ERROR(const Logger& logger) { return LogStreamConsumer(logger.getReportableSeverity(), Severity::kERROR); } //! //! \brief produces a LogStreamConsumer object that can be used to log messages of severity kINTERNAL_ERROR // ("fatal" severity) //! //! Example usage: //! //! LOG_FATAL(logger) << "hello world" << std::endl; //! inline LogStreamConsumer LOG_FATAL(const Logger& logger) { return LogStreamConsumer(logger.getReportableSeverity(), Severity::kINTERNAL_ERROR); } } // anonymous namespace #endif // TENSORRT_LOGGING_H ================================================ FILE: vgg/vgg11.cpp ================================================ #include "NvInfer.h" #include "cuda_runtime_api.h" #include #include #include #include #include #include #include "logging.h" #define CHECK(status) \ do\ {\ auto ret = (status);\ if (ret != 0)\ {\ std::cerr << "Cuda failure: " << ret << std::endl;\ abort();\ }\ } while (0) // stuff we know about the network and the input/output blobs static const int INPUT_H = 224; static const int INPUT_W = 224; static const int OUTPUT_SIZE = 1000; const char* INPUT_BLOB_NAME = "data"; const char* OUTPUT_BLOB_NAME = "prob"; using namespace nvinfer1; static Logger gLogger; // Load weights from files shared with TensorRT samples. 
// TensorRT weight files have a simple space delimited format: // [type] [size] std::map loadWeights(const std::string file) { std::cout << "Loading weights: " << file << std::endl; std::map weightMap; // Open weights file std::ifstream input(file); assert(input.is_open() && "Unable to load weight file."); // Read number of weight blobs int32_t count; input >> count; assert(count > 0 && "Invalid weight map file."); while (count--) { Weights wt{DataType::kFLOAT, nullptr, 0}; uint32_t size; // Read name and type of blob std::string name; input >> name >> std::dec >> size; wt.type = DataType::kFLOAT; // Load blob uint32_t* val = reinterpret_cast(malloc(sizeof(val) * size)); for (uint32_t x = 0, y = size; x < y; ++x) { input >> std::hex >> val[x]; } wt.values = val; wt.count = size; weightMap[name] = wt; } return weightMap; } // Creat the engine using only the API and not any parser. ICudaEngine* createEngine(unsigned int maxBatchSize, IBuilder* builder, IBuilderConfig* config, DataType dt) { INetworkDefinition* network = builder->createNetworkV2(0U); // Create input tensor of shape { 3, INPUT_H, INPUT_W } with name INPUT_BLOB_NAME ITensor* data = network->addInput(INPUT_BLOB_NAME, dt, Dims3{3, INPUT_H, INPUT_W}); assert(data); std::map weightMap = loadWeights("../vgg.wts"); Weights emptywts{DataType::kFLOAT, nullptr, 0}; IConvolutionLayer* conv1 = network->addConvolutionNd(*data, 64, DimsHW{3, 3}, weightMap["features.0.weight"], weightMap["features.0.bias"]); assert(conv1); conv1->setPaddingNd(DimsHW{1, 1}); IActivationLayer* relu1 = network->addActivation(*conv1->getOutput(0), ActivationType::kRELU); assert(relu1); IPoolingLayer* pool1 = network->addPoolingNd(*relu1->getOutput(0), PoolingType::kMAX, DimsHW{2, 2}); assert(pool1); pool1->setStrideNd(DimsHW{2, 2}); conv1 = network->addConvolutionNd(*pool1->getOutput(0), 128, DimsHW{3, 3}, weightMap["features.3.weight"], weightMap["features.3.bias"]); conv1->setPaddingNd(DimsHW{1, 1}); relu1 = 
network->addActivation(*conv1->getOutput(0), ActivationType::kRELU); pool1 = network->addPoolingNd(*relu1->getOutput(0), PoolingType::kMAX, DimsHW{2, 2}); pool1->setStrideNd(DimsHW{2, 2}); conv1 = network->addConvolutionNd(*pool1->getOutput(0), 256, DimsHW{3, 3}, weightMap["features.6.weight"], weightMap["features.6.bias"]); conv1->setPaddingNd(DimsHW{1, 1}); relu1 = network->addActivation(*conv1->getOutput(0), ActivationType::kRELU); conv1 = network->addConvolutionNd(*relu1->getOutput(0), 256, DimsHW{3, 3}, weightMap["features.8.weight"], weightMap["features.8.bias"]); conv1->setPaddingNd(DimsHW{1, 1}); relu1 = network->addActivation(*conv1->getOutput(0), ActivationType::kRELU); pool1 = network->addPoolingNd(*relu1->getOutput(0), PoolingType::kMAX, DimsHW{2, 2}); pool1->setStrideNd(DimsHW{2, 2}); conv1 = network->addConvolutionNd(*pool1->getOutput(0), 512, DimsHW{3, 3}, weightMap["features.11.weight"], weightMap["features.11.bias"]); conv1->setPaddingNd(DimsHW{1, 1}); relu1 = network->addActivation(*conv1->getOutput(0), ActivationType::kRELU); conv1 = network->addConvolutionNd(*relu1->getOutput(0), 512, DimsHW{3, 3}, weightMap["features.13.weight"], weightMap["features.13.bias"]); conv1->setPaddingNd(DimsHW{1, 1}); relu1 = network->addActivation(*conv1->getOutput(0), ActivationType::kRELU); pool1 = network->addPoolingNd(*relu1->getOutput(0), PoolingType::kMAX, DimsHW{2, 2}); pool1->setStrideNd(DimsHW{2, 2}); conv1 = network->addConvolutionNd(*pool1->getOutput(0), 512, DimsHW{3, 3}, weightMap["features.16.weight"], weightMap["features.16.bias"]); conv1->setPaddingNd(DimsHW{1, 1}); relu1 = network->addActivation(*conv1->getOutput(0), ActivationType::kRELU); conv1 = network->addConvolutionNd(*relu1->getOutput(0), 512, DimsHW{3, 3}, weightMap["features.18.weight"], weightMap["features.18.bias"]); conv1->setPaddingNd(DimsHW{1, 1}); relu1 = network->addActivation(*conv1->getOutput(0), ActivationType::kRELU); pool1 = network->addPoolingNd(*relu1->getOutput(0), 
PoolingType::kMAX, DimsHW{2, 2}); pool1->setStrideNd(DimsHW{2, 2}); IFullyConnectedLayer* fc1 = network->addFullyConnected(*pool1->getOutput(0), 4096, weightMap["classifier.0.weight"], weightMap["classifier.0.bias"]); assert(fc1); relu1 = network->addActivation(*fc1->getOutput(0), ActivationType::kRELU); fc1 = network->addFullyConnected(*relu1->getOutput(0), 4096, weightMap["classifier.3.weight"], weightMap["classifier.3.bias"]); relu1 = network->addActivation(*fc1->getOutput(0), ActivationType::kRELU); fc1 = network->addFullyConnected(*relu1->getOutput(0), 1000, weightMap["classifier.6.weight"], weightMap["classifier.6.bias"]); fc1->getOutput(0)->setName(OUTPUT_BLOB_NAME); std::cout << "set name out" << std::endl; network->markOutput(*fc1->getOutput(0)); // Build engine builder->setMaxBatchSize(maxBatchSize); config->setMaxWorkspaceSize(1 << 20); ICudaEngine* engine = builder->buildEngineWithConfig(*network, *config); std::cout << "build out" << std::endl; // Don't need the network any more network->destroy(); // Release host memory for (auto& mem : weightMap) { free((void*) (mem.second.values)); } return engine; } void APIToModel(unsigned int maxBatchSize, IHostMemory** modelStream) { // Create builder IBuilder* builder = createInferBuilder(gLogger); IBuilderConfig* config = builder->createBuilderConfig(); // Create model to populate the network, then set the outputs and create an engine ICudaEngine* engine = createEngine(maxBatchSize, builder, config, DataType::kFLOAT); assert(engine != nullptr); // Serialize the engine (*modelStream) = engine->serialize(); // Close everything down engine->destroy(); builder->destroy(); config->destroy(); } void doInference(IExecutionContext& context, float* input, float* output, int batchSize) { const ICudaEngine& engine = context.getEngine(); // Pointers to input and output device buffers to pass to engine. // Engine requires exactly IEngine::getNbBindings() number of buffers. 
assert(engine.getNbBindings() == 2); void* buffers[2]; // In order to bind the buffers, we need to know the names of the input and output tensors. // Note that indices are guaranteed to be less than IEngine::getNbBindings() const int inputIndex = engine.getBindingIndex(INPUT_BLOB_NAME); const int outputIndex = engine.getBindingIndex(OUTPUT_BLOB_NAME); // Create GPU buffers on device CHECK(cudaMalloc(&buffers[inputIndex], batchSize * 3 * INPUT_H * INPUT_W * sizeof(float))); CHECK(cudaMalloc(&buffers[outputIndex], batchSize * OUTPUT_SIZE * sizeof(float))); // Create stream cudaStream_t stream; CHECK(cudaStreamCreate(&stream)); // DMA input batch data to device, infer on the batch asynchronously, and DMA output back to host CHECK(cudaMemcpyAsync(buffers[inputIndex], input, batchSize * 3 * INPUT_H * INPUT_W * sizeof(float), cudaMemcpyHostToDevice, stream)); context.enqueue(batchSize, buffers, stream, nullptr); CHECK(cudaMemcpyAsync(output, buffers[outputIndex], batchSize * OUTPUT_SIZE * sizeof(float), cudaMemcpyDeviceToHost, stream)); cudaStreamSynchronize(stream); // Release stream and buffers cudaStreamDestroy(stream); CHECK(cudaFree(buffers[inputIndex])); CHECK(cudaFree(buffers[outputIndex])); } int main(int argc, char** argv) { if (argc != 2) { std::cerr << "arguments not right!" 
<< std::endl; std::cerr << "./vgg -s // serialize model to plan file" << std::endl; std::cerr << "./vgg -d // deserialize plan file and run inference" << std::endl; return -1; } // create a model using the API directly and serialize it to a stream char *trtModelStream{nullptr}; size_t size{0}; if (std::string(argv[1]) == "-s") { IHostMemory* modelStream{nullptr}; APIToModel(1, &modelStream); assert(modelStream != nullptr); std::ofstream p("vgg.engine", std::ios::binary); if (!p) { std::cerr << "could not open plan output file" << std::endl; return -1; } p.write(reinterpret_cast(modelStream->data()), modelStream->size()); modelStream->destroy(); return 1; } else if (std::string(argv[1]) == "-d") { std::ifstream file("vgg.engine", std::ios::binary); if (file.good()) { file.seekg(0, file.end); size = file.tellg(); file.seekg(0, file.beg); trtModelStream = new char[size]; assert(trtModelStream); file.read(trtModelStream, size); file.close(); } } else { return -1; } static float data[3 * INPUT_H * INPUT_W]; for (int i = 0; i < 3 * INPUT_H * INPUT_W; i++) data[i] = 1; IRuntime* runtime = createInferRuntime(gLogger); assert(runtime != nullptr); ICudaEngine* engine = runtime->deserializeCudaEngine(trtModelStream, size); assert(engine != nullptr); IExecutionContext* context = engine->createExecutionContext(); assert(context != nullptr); delete[] trtModelStream; // Run inference static float prob[OUTPUT_SIZE]; for (int i = 0; i < 10; i++) { auto start = std::chrono::system_clock::now(); doInference(*context, data, prob, 1); auto end = std::chrono::system_clock::now(); std::cout << std::chrono::duration_cast(end - start).count() << "ms" << std::endl; } // Destroy the engine context->destroy(); engine->destroy(); runtime->destroy(); // Print histogram of the output distribution std::cout << "\nOutput:\n\n"; for (unsigned int i = 0; i < OUTPUT_SIZE; i++) { std::cout << prob[i] << ", "; if (i % 10 == 0) std::cout << i / 10 << std::endl; } std::cout << std::endl; return 0; } 
================================================
FILE: vit/CMakeLists.txt
================================================
cmake_minimum_required(VERSION 3.17.0)

project(
  vit
  VERSION 0.1
  LANGUAGES C CXX CUDA)

# Default GPU architectures; only applied when the user did not choose any.
if(NOT DEFINED CMAKE_CUDA_ARCHITECTURES)
  set(CMAKE_CUDA_ARCHITECTURES
      80
      86
      89
      90
      100
      120)
endif()

set(CMAKE_CXX_STANDARD 20)
set(CMAKE_CXX_STANDARD_REQUIRED ON)
set(CMAKE_CUDA_STANDARD 17)
set(CMAKE_CUDA_STANDARD_REQUIRED ON)
set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
set(CMAKE_INCLUDE_CURRENT_DIR TRUE)
option(CUDA_USE_STATIC_CUDA_RUNTIME "Use static cudaruntime library" OFF)

find_package(Threads REQUIRED)
find_package(CUDAToolkit REQUIRED)
find_package(OpenCV REQUIRED)

# FindTensorRT.cmake defines the TensorRT::TensorRT target; skip it when a
# parent project already provided that target.
if(NOT TARGET TensorRT::TensorRT)
  include(FindTensorRT.cmake)
else()
  message("TensorRT has been found, skipping for ${PROJECT_NAME}")
endif()

add_executable(${PROJECT_NAME} "${PROJECT_NAME}.cc" "cuda_allocator.cc"
                               "profiler.cc")

target_include_directories(${PROJECT_NAME} PRIVATE ${OpenCV_INCLUDE_DIRS})

target_link_libraries(
  ${PROJECT_NAME} PUBLIC Threads::Threads CUDA::cudart CUDA::cuda_driver
                         TensorRT::TensorRT ${OpenCV_LIBS})

if(WIN32)
  # Static MSVC runtime: "MultiThreaded" for release-like configurations and
  # "MultiThreadedDebug" for Debug.
  set_target_properties(
    ${PROJECT_NAME} PROPERTIES MSVC_RUNTIME_LIBRARY
                               "MultiThreaded$<$<CONFIG:Debug>:Debug>")
endif()

================================================
FILE: vit/FindTensorRT.cmake
================================================
cmake_minimum_required(VERSION 3.17.0)

# _guess_path(<var_name> <required_files> <candidate>...)
#
# Sets <var_name> in the caller's scope to the list of candidate directories
# that exist and contain every entry of the ';'-separated <required_files>.
# Stops the configure step with FATAL_ERROR when no candidate qualifies.
function(_guess_path var_name required_files)
  set(_result "")

  foreach(path_entry IN LISTS ARGN)
    if(NOT EXISTS "${path_entry}")
      message(DEBUG "skip non-existing path '${path_entry}'")
      continue()
    endif()

    set(_ok TRUE)
    foreach(required_file IN LISTS required_files)
      if(NOT EXISTS "${path_entry}/${required_file}")
        set(_ok FALSE)
        message(DEBUG "'${path_entry}' missing '${required_file}'")
        break()
      endif()
    endforeach()

    if(_ok)
      list(APPEND _result "${path_entry}")
      message(DEBUG "accept '${path_entry}'")
    else()
      message(DEBUG "reject '${path_entry}'")
    endif()
  endforeach()

  if(_result STREQUAL "")
    message(
      FATAL_ERROR
        "_guess_path(${var_name}) failed: no valid path found. required_files='${required_files}' candidates='${ARGN}'"
    )
  endif()

  set(${var_name}
      "${_result}"
      PARENT_SCOPE)
endfunction()

# add library
add_library(TensorRT IMPORTED INTERFACE)
add_library(TensorRT::TensorRT ALIAS TensorRT)

set(TRT_VERSION
    CACHE
      STRING
      "TensorRT version, e.g. \"8.6.1.6\" or \"8.6.1.6+cuda12.0.1.011\", \"8.6.1.6.Windows10.x86_64.cuda-12.0\" etc"
)

# Every expansion below is quoted: an empty value would otherwise expand to no
# argument at all and silently change the meaning of the command.
if(NOT "${TRT_VERSION}" STREQUAL "" AND NOT "$ENV{TRT_VERSION}" STREQUAL "")
  message(
    WARNING
      "TRT_VERSION defined by cmake and environment variable both, using the later one"
  )
endif()

if(NOT "$ENV{TRT_VERSION}" STREQUAL "")
  set(TRT_VERSION "$ENV{TRT_VERSION}")
endif()

# Fail with an actionable message instead of the argument-count error that
# string(REGEX MATCH ...) raises when it is handed an empty, unquoted value.
if("${TRT_VERSION}" STREQUAL "")
  message(
    FATAL_ERROR
      "TRT_VERSION is empty: pass -DTRT_VERSION=<version> or set the TRT_VERSION environment variable"
  )
endif()

string(REGEX MATCH "([0-9]+)" _match "${TRT_VERSION}")
set(TRT_MAJOR_VERSION "${_match}")
unset(_match)

if("${TRT_MAJOR_VERSION}" STREQUAL "")
  message(
    FATAL_ERROR "Cannot parse a major version out of TRT_VERSION='${TRT_VERSION}'")
endif()

if(WIN32)
  set(TensorRT_DIR "C:/Program Files/TensorRT-${TRT_VERSION}")
  if(NOT EXISTS "${TensorRT_DIR}")
    message(FATAL_ERROR "TensorRT_DIR=${TensorRT_DIR} does not exist!")
  endif()

  if(TRT_MAJOR_VERSION GREATER_EQUAL 10)
    set(_modules nvinfer_10 nvinfer_plugin_10 nvinfer_vc_plugin_10
                 nvinfer_dispatch_10 nvinfer_lean_10)
    message(DEBUG "Using ${_modules}")
  else()
    set(_modules nvinfer nvinfer_plugin nvinfer_vc_plugin nvinfer_dispatch
                 nvinfer_lean)
  endif()

  set(TensorRT_LIBRARY_DIR "${TensorRT_DIR}/lib")
  set(TensorRT_INCLUDE_DIR "${TensorRT_DIR}/include")
elseif(UNIX)
  string(TOLOWER "${CMAKE_SYSTEM_PROCESSOR}" _trt_arch)
  set(_trt_include_candidates)
  if(_trt_arch MATCHES "^(aarch64|arm64|arch64)$")
    set(_trt_include_candidates
        "/usr/include/aarch64-linux-gnu" "/usr/include"
        "/usr/local/cuda/targets/aarch64-linux/include")
    set(_trt_library_candidates
        "/usr/local/tensorrt/targets/aarch64-linux-gnu/lib"
        "/usr/lib/aarch64-linux-gnu" "/usr/lib/aarch64-linux-gnu/tegra"
        "/usr/lib")
  elseif(_trt_arch MATCHES "^(x86_64|amd64)$")
    set(_trt_include_candidates
        "/usr/local/tensorrt/targets/x86_64-linux-gnu/include"
        "/usr/include/x86_64-linux-gnu" "/usr/include")
    set(_trt_library_candidates
        "/usr/local/tensorrt/targets/x86_64-linux-gnu/lib"
        "/usr/lib/x86_64-linux-gnu" "/usr/lib")
  else()
    message(FATAL_ERROR "Unknown architecture")
  endif()

  set(_modules nvinfer nvinfer_plugin)
  if(TRT_MAJOR_VERSION GREATER_EQUAL 8)
    list(APPEND _modules nvinfer_vc_plugin nvinfer_dispatch nvinfer_lean)
  endif()

  _guess_path(TensorRT_LIBRARY_DIR "libnvinfer.so;libnvinfer_plugin.so"
              ${_trt_library_candidates})
  message(STATUS "TensorRT libraries: ${TensorRT_LIBRARY_DIR}")
  _guess_path(TensorRT_INCLUDE_DIR "NvInfer.h" ${_trt_include_candidates})
  message(STATUS "TensorRT includes: ${TensorRT_INCLUDE_DIR}")
endif()

# Resolve every module to a full library path. A module that is not installed
# is skipped with a warning instead of putting a "-NOTFOUND" entry on the link
# line; at least one library has to be found.
set(TensorRT_LIBRARIES "")
foreach(lib IN LISTS _modules)
  find_library(
    TensorRT_${lib}_LIBRARY
    NAMES ${lib}
    HINTS ${TensorRT_LIBRARY_DIR})
  if(TensorRT_${lib}_LIBRARY)
    list(APPEND TensorRT_LIBRARIES ${TensorRT_${lib}_LIBRARY})
  else()
    message(WARNING "TensorRT library '${lib}' not found, skipping it")
  endif()
endforeach()

if(TensorRT_LIBRARIES STREQUAL "")
  message(
    FATAL_ERROR "No TensorRT library found in '${TensorRT_LIBRARY_DIR}'")
endif()

target_link_libraries(TensorRT INTERFACE ${TensorRT_LIBRARIES})

message(STATUS "Found TensorRT libs: ${TensorRT_LIBRARIES}")

set_target_properties(
  TensorRT
  PROPERTIES C_STANDARD 17
             CXX_STANDARD 17
             POSITION_INDEPENDENT_CODE ON
             SKIP_BUILD_RPATH TRUE
             BUILD_WITH_INSTALL_RPATH TRUE
             INSTALL_RPATH "$ORIGIN"
             INTERFACE_INCLUDE_DIRECTORIES "${TensorRT_INCLUDE_DIR}")

unset(TRT_MAJOR_VERSION)
unset(_modules)
unset(_trt_include_candidates)
unset(_trt_library_candidates)
unset(_trt_arch)

================================================
FILE: vit/README.md
================================================
# Vision Transformers (ViT) ## 1. Overview This is a handwritten TensorRT implementation of the Vision Transformers[arxiv.org.2010.11929](https://arxiv.org/abs/2010.11929) paper. **Note**: - Swi-GeLU activation layer is supported since TensorRT **10.0**+ SDK, we can use an approximation as TensorRT does, check below for details. ## 2.
Details ### 2.1 Features - Support TensorRT SDK 8.5.1+ ~ 10.15.1+ - Support Windows11 OS - Support native or self-implemented Swi-GeLU - Support native or self-implemented multihead self-attention - Support a dummy profiler by default - Support a dummy output allocator by default - Use optimization profile by default ### 2.2 Current limitations - cannot use `IAttention` with TensorRT SDK 10.14 ~ 10.15 because of bugs in TensorRT - TensorRT < 8 is not supported because some ops are not implemented in cuDNN - SM < 86, TensorRT < 10, CUDA < 12 cases are _NOT_ fully tested yet ### 2.3 Usage 1. use `gen_wts.py` to generate `.wts` file. ```bash python gen_wts.py ``` 2. build C++ code ```bash pushd tensorrtx/vit cmake -S . -B build -G Ninja --fresh cmake --build build ``` 3. serialize `.wts` model to engine file. ```bash ./build/vit -s ``` 4. run inference ```bash ./build/vit -d ``` On **RTX 4080, TensorRT 10.15.1 SDK**, the output looks like: ```bash ... ==== 1880us -1.125, 0.4623, -0.1215, -0.007384, -0.004307, -0.7021, -0.748, 0.2031, -0.4862, -0.008939, -1.151, -0.408, -0.3259, 0.2202, 0.04537, -2.008, -0.2832, 0.04394, 0.5326, 0.1724, 0.5655, ==== prediction result: Top: 0 idx: 285, logits: 8.262, label: Egyptian cat Top: 1 idx: 281, logits: 7.872, label: tabby, tabby cat Top: 2 idx: 282, logits: 6.477, label: tiger cat ========== VisionTransformerProfiler ========== TensorRT layer name Runtime, % Invocations Runtime, ms Reformatting CopyNode for Input Tensor 0 to patch embedding 3.2% 20 0.95 patch embedding 1.5% 20 0.45 Reformatting CopyNode for Input Tensor 0 to {ForeignNode[(Unnamed Layer* 3) [Constant]...(Unnamed Layer* 518) [ElementWise]]} 0.2% 20 0.06 __myl_ReshTran_myl3_0 0.8% 20 0.24 __myl_ConcAddCastMeanSubMulMeanAddSqrtDivMulCastMulAdd_myl3_1 0.3% 20 0.08 vit.encoder.layer.0.attentionvalue+vit.encoder.layer.0.attentionkey+vit.encoder.layer.0.attentionquery_myl3_2 1.4% 20 0.40 __myl_TranReshMove_myl3_3 0.2% 20 0.06 __myl_TranReshMove_myl3_4 0.2% 20 0.07
__myl_TranReshMove_myl3_5 0.2% 20 0.06 _gemm_mha_v2_myl3_6 0.5% 20 0.14 __myl_MoveReshTran_myl3_7 0.2% 20 0.06 ... ========== VisionTransformerProfiler total runtime = 29.67 ms ========== ``` as is shown above, we successfully triggered the internal MHA fused kernel fusion pass inside TensorRT (i.e., **"Myelin"** or **"myl"** in short), especially the MHA fused kernel: `_gemm_mha_v2_myl3_6`. ## 3. transformer details `ViTLayer()` builds one ViT encoder block (Transformer encoder layer) using TensorRT primitives. The implementation corresponds to a **Pre-LayerNorm** Transformer layer (typical for ViT), including: - LayerNorm before attention - Multi-Head Self-Attention (MHSA): QKV projections → scaled dot-product attention → output projection - Residual connection - LayerNorm after attention - Feed-Forward Network (FFN / MLP): dense → GeLU → dense - Residual connection The function returns the final residual output tensor. ### 3.1 Notation and Tensor Shapes Let the input tensor (TensorRT `input`) be: $$ \mathbf{X} \in \mathbb{R}^{N \times L \times D} $$ Where: - (N): batch size (represented by `N` in your code) - (L): sequence length (number of tokens; dynamic in code via `-1`) - (D): hidden size, fixed at 768 in this implementation The attention head configuration: $$ H = \tt{param.head\_num}, \qquad d = \frac{D}{H} $$ ### 3.2 Weight shapes (conceptual) For a standard Transformer block: - Q/K/V projection weights: $$ \mathbf{W}_Q, \mathbf{W}_K, \mathbf{W}_V \in \mathbb{R}^{D \times D} $$ - Q/K/V biases (**NOTE**:Not used by native nvidia interface): $$ \mathbf{b}_Q, \mathbf{b}_K, \mathbf{b}_V \in \mathbb{R}^{D} $$ - Output projection: $$ \mathbf{W}_O \in \mathbb{R}^{D \times D}, \quad \mathbf{b}_O \in \mathbb{R}^{D} $$ - FFN (MLP) with expansion ratio 4: $$ \mathbf{W}_1 \in \mathbb{R}^{D \times 4D}, \ \mathbf{b}_1 \in \mathbb{R}^{4D} $$ $$ \mathbf{W}_2 \in \mathbb{R}^{4D \times D}, \ \mathbf{b}_2 \in \mathbb{R}^{D} $$ Here ($4 D = 3072$). 
### 3.3 High-Level Block Structure _Pre-LN Transformer Encoder Layer_ implements the following canonical computation: $$ \begin{aligned} \mathbf{X}' &= \mathrm{LN}_1(\mathbf{X}) \\ \mathbf{A} &= \mathrm{MHSA}(\mathbf{X}') \\ \mathbf{Y} &= \mathbf{X} + \mathbf{A} \\ \mathbf{Y}' &= \mathrm{LN}_2(\mathbf{Y}) \\ \mathbf{F} &= \mathrm{FFN}(\mathbf{Y}') \\ \mathbf{Z} &= \mathbf{Y} + \mathbf{F} \end{aligned} $$ The function returns ($\mathbf{Z}$). ### 3.4 LayerNorm Definition LayerNorm is applied over the **last dimension** (D) (hidden size), independently for each ($(n, \ell)$) position. For a token vector ($\mathbf{x} \in \mathbb{R}^{D}$): $$ \mathrm{LN}(\mathbf{x}) = \gamma \odot \frac{\mathbf{x} - \mu}{\sqrt{\sigma^2 + \varepsilon}} + \beta $$ Where: $$ \mu = \frac{1}{D}\sum_{i=1}^{D} x_i, \qquad \sigma^2 = \frac{1}{D}\sum_{i=1}^{D}(x_i - \mu)^2 $$ - ($\gamma$) corresponds to `.weight` - ($\beta$) corresponds to `.bias` - ($\varepsilon = \tt{param.lnorm\_eps}$) ### 3.5 QKV Projections (Code Section 2.1) #### 3.5.1 Linear projections Let: $$ \mathbf{X}' = \mathrm{LN}_1(\mathbf{X}) $$ Compute: $$ \begin{aligned} \mathbf{Q} &= \mathbf{X}' \mathbf{W}_Q^\top + \mathbf{b}_Q \\ \mathbf{K} &= \mathbf{X}' \mathbf{W}_K^\top + \mathbf{b}_K \\ \mathbf{V} &= \mathbf{X}' \mathbf{W}_V^\top + \mathbf{b}_V \end{aligned} \qquad \mathbf{Q},\mathbf{K},\mathbf{V} \in \mathbb{R}^{N \times L \times D} $$ #### 3.5.2 Multi-Head Reshape + Transpose (Shuffle Layers) Multi-head attention splits the hidden dimension (D) into (H) heads of size (d). #### 3.5.3 Reshape and transpose Starting from: $$ \mathbf{Q} \in \mathbb{R}^{N \times L \times D} $$ Reshape: $$ \mathbf{Q}_r \in \mathbb{R}^{N \times L \times H \times d} $$ Transpose (swap axes to put heads first): $$ \mathbf{Q}_h \in \mathbb{R}^{N \times H \times L \times d} $$ Same for ($\mathbf{K}$) and ($\mathbf{V}$).
Code: ```cpp q_s->setReshapeDimensions(Dims4{N, -1, H, d}); q_s->setSecondTranspose({0, 2, 1, 3}); // (N,H,L,d) ``` #### 3.5.4 SDPA (Scaled Dot-Product Attention) For each batch (n) and head (h), define: $$ \mathbf{Q}^{(n,h)} \in \mathbb{R}^{L \times d}, \quad \mathbf{K}^{(n,h)} \in \mathbb{R}^{L \times d}, \quad \mathbf{V}^{(n,h)} \in \mathbb{R}^{L \times d} $$ #### 3.5.5 Attention logits ($QK^\top$) $$ \mathbf{S}^{(n,h)} = \mathbf{Q}^{(n,h)} \left(\mathbf{K}^{(n,h)}\right)^\top \in \mathbb{R}^{L \times L} $$ In tensor form: $$ \mathbf{S} \in \mathbb{R}^{N \times H \times L \times L} $$ Code: ```cpp qk = MatMul(q_s, NONE, k_s, TRANSPOSE); // (N,H,L,d) x (N,H,d,L) -> (N,H,L,L) ``` #### 3.5.6 Scaling Scaled dot-product uses: $$ \alpha = \frac{1}{\sqrt{d}} $$ $$ \tilde{\mathbf{S}} = \alpha \mathbf{S} $$ Code: ```cpp scale_val = 1/sqrt(d); attn_qk = qk * scale; // ElementWise PROD ``` #### 3.5.7 Softmax normalization Softmax is applied on the **last dimension** (keys index), for each query position, So: $$ \mathbf{P} \in \mathbb{R}^{N \times H \times L \times L} $$ Code: ```cpp qk_softmax = SoftMax(attn_qk); qk_softmax->setAxes(1U << (nbDims-1)); // last axis ``` #### 3.5.8 Weighted sum of values Each head output: $$ \mathbf{O}^{(n,h)} = \mathbf{P}^{(n,h)} \mathbf{V}^{(n,h)} \in \mathbb{R}^{L \times d} $$ Thus: $$ \mathbf{O} \in \mathbb{R}^{N \times H \times L \times d} $$ Code: ```cpp attn_qkv = MatMul(qk_softmax, NONE, v_s, NONE); // (N,H,L,L)x(N,H,L,d)->(N,H,L,d) ``` ### 3.6 Merge Heads + Output Projection #### 3.6.1 Merge heads Transpose back: $$ \mathbf{O} \in \mathbb{R}^{N \times H \times L \times d} \ \xrightarrow{\text{transpose}} \mathbb{R}^{N \times L \times H \times d} $$ Then reshape: $$ \mathbb{R}^{N \times L \times (H\cdot d)} = \mathbb{R}^{N \times L \times D} $$ Code: ```cpp attn_out->setFirstTranspose({0, 2, 1, 3}); // (N,L,H,d) attn_out->setReshapeDimensions(Dims3{N, -1, 768}); // (N,L,D) ``` #### 3.6.2 Output projection $$ \mathbf{A} = 
\mathbf{O}_{\text{merged}} \mathbf{W}_O^\top + \mathbf{b}_O \quad\in\mathbb{R}^{N \times L \times D} $$ Code: ```cpp attn_fcw = MatMul(attn_out, out_proj_w^T); attn_fcb = attn_fcw + out_proj_b; ``` ### 3.7 Residual Connection After Attention $$ \mathbf{Y} = \mathbf{X} + \mathbf{A} \quad\in\mathbb{R}^{N \times L \times D} $$ Code: ```cpp attn_residual = input + attn_fcb; ``` This identity path is crucial for gradient flow and stability; at inference time it preserves a “direct” signal path even if attention becomes sharp or noisy. ### 3.8 Post-Attention LayerNorm $$ \mathbf{Y}' = \mathrm{LN}_2(\mathbf{Y}) $$ Code: ```cpp post_lnorm = Normalization(attn_residual, post_ln_scale, post_ln_bias) ``` ### 3.9 Feed-Forward Network (FFN / MLP) ViT uses a 2-layer MLP with expansion ratio 4 and GeLU activation. #### 3.9.1 First dense layer (expand to 3072) $$ \mathbf{H} = \mathbf{Y}' \mathbf{W}_1^\top + \mathbf{b}_1 \quad\in\mathbb{R}^{N \times L \times 4D} $$ Code: ```cpp inter0 = MatMul(post_lnorm, iw^T); // iw shape conceptually (4D, D) inter1 = inter0 + ib; ``` #### 3.9.2 GeLU activation $$ \mathrm{GeLU}(x) = x \Phi(x) $$ Where ($\Phi$) is the standard normal CDF. Common tanh approximation (widely used in implementations): $$ \mathrm{GeLU}(x) \approx \frac {x\times \bigg(1+\tanh\Big(\sqrt\frac{2}{\pi}\times (x+0.044715\times x^3)\Big)\bigg)} {2} $$ Code calls: ```cpp inter_act = addGeLU(net, inter1); ``` #### 3.9.3 Second dense layer (project back to 768) $$ \mathbf{F} = \mathrm{GeLU}(\mathbf{H}) \mathbf{W}_2^\top + \mathbf{b}_2 \quad\in\mathbb{R}^{N \times L \times D} $$ Code: ```cpp out0 = MatMul(inter_act, ow^T); // ow conceptually (D, 4D) out1 = out0 + ob; ``` ### 3.10 Final Residual Connection $$ \mathbf{Z} = \mathbf{Y} + \mathbf{F} \quad\in\mathbb{R}^{N \times L \times D} $$ Code: ```cpp output_residual = out1 + attn_residual; return output_residual; ``` ## 4.
Compact Step-by-Step Shape Trace Below is a shape trace aligned with the main operations (assuming dynamic (L)): Input $$ \mathbf{X}: (N, L, 768) $$ Pre-LN $$ \mathbf{X}': (N, L, 768) $$ Q/K/V projections $$ \mathbf{Q},\mathbf{K},\mathbf{V}: (N, L, 768) $$ Reshape + transpose to heads $$ \mathbf{Q}\_h,\mathbf{K}\_h,\mathbf{V}\_h: (N, H, L, d) $$ Attention logits $$ \mathbf{S}: (N, H, L, L) $$ Softmax weights $$ \mathbf{P}: (N, H, L, L) $$ Head outputs $$ \mathbf{O}: (N, H, L, d) $$ Merge heads $$ \mathbf{O}\_{\text{merged}}: (N, L, 768) $$ Output projection $$ \mathbf{A}: (N, L, 768) $$ Residual $$ \mathbf{Y}: (N, L, 768) $$ Post-LN $$ \mathbf{Y}': (N, L, 768) $$ FFN expand $$ \mathbf{H}: (N, L, 3072) $$ FFN project $$ \mathbf{F}: (N, L, 768) $$ Final residual $$ \mathbf{Z}: (N, L, 768) $$

================================================
FILE: vit/cuda_allocator.cc
================================================
#include "cuda_allocator.h"
// NOTE(review): the five header names below were missing from this copy (bare
// `#include` directives). They are inferred from the symbols this file uses
// (cu*/cuda* calls, std::size_t, std::unique_ptr, std::lock_guard) - confirm
// against the original file.
#include <cuda.h>
#include <cuda_runtime_api.h>
#include <cstddef>
#include <memory>
#include <mutex>
#include "macros.h"
#include "utils.h"

namespace {
// Minimum CUDA versions, encoded as 1000 * major + 10 * minor.
constexpr int kCudaVersionAsyncMin = 11020;  // gate for OutputAllocKind::kCudaMallocAsync
constexpr int kCudaVersionCuMemMin = 12000;  // gate for OutputAllocKind::kCuMem
}  // namespace

// Book-keeping for one output buffer handed out by the allocator.
struct CudaOutputAllocator::Allocation {
    void* ptr{nullptr};
    std::size_t size{0};
    OutputAllocKind kind{OutputAllocKind::kCudaMallocManaged};
    CUmemGenericAllocationHandle handle{};
    CUdeviceptr addr{};
    std::size_t mapped_size{0};
};

// Returns the CUDA runtime version, or 0 when the query fails.
static auto getCudaRuntimeVersion() -> int {
    int version = 0;
    if (cudaRuntimeGetVersion(&version) != cudaSuccess) {
        return 0;
    }
    return version;
}

// Returns the CUDA driver version, or 0 when the query fails.
static auto getCudaDriverVersion() -> int {
    int version = 0;
    if (cudaDriverGetVersion(&version) != cudaSuccess) {
        return 0;
    }
    return version;
}

// Factory: selects `device`, then picks the allocation strategy from the
// installed CUDA versions (kCuMem, else kCudaMallocAsync, else managed memory).
std::unique_ptr<CudaOutputAllocator> CudaOutputAllocator::Create(cudaStream_t stream, int device) {
    CHECK(cudaSetDevice(device));
    const int rt = getCudaRuntimeVersion();
    const int drv = getCudaDriverVersion();

    OutputAllocKind kind = OutputAllocKind::kCudaMallocManaged;
    if (rt >= kCudaVersionCuMemMin && drv >= kCudaVersionCuMemMin) {
        kind = OutputAllocKind::kCuMem;
    } else if (rt >= kCudaVersionAsyncMin && drv >= kCudaVersionAsyncMin) {
        // The driver is checked as well, like for kCuMem above: a new enough
        // runtime on top of an older driver must fall back to managed memory.
        kind = OutputAllocKind::kCudaMallocAsync;
    }
    return std::make_unique<CudaOutputAllocator>(stream, kind, device);
}

CudaOutputAllocator::CudaOutputAllocator(cudaStream_t stream, OutputAllocKind kind, int device)
    : stream_(stream), kind_(kind), device_(device) {}

// Releases every buffer that is still tracked.
CudaOutputAllocator::~CudaOutputAllocator() {
    std::lock_guard lock(mutex_);
    for (auto& entry : allocations_) {
        release(entry.first, entry.second);
    }
}

#if TRT_VERSION < 10000
// Returns a buffer of at least `size` bytes for `tensorName`, reusing the
// cached buffer when it is large enough. Returns nullptr for a null name or
// when the allocation fails. `alignment` is ignored.
// NOLINTNEXTLINE(bugprone-easily-swappable-parameters)
void* CudaOutputAllocator::reallocateOutput(const char* tensorName, void* currentMemory, uint64_t size,
                                            uint64_t alignment) TRT_NOEXCEPT {
    (void)alignment;
    if (!tensorName) {
        return nullptr;
    }

    std::lock_guard lock(mutex_);
    auto& alloc = allocations_[tensorName];
    if (alloc.ptr && size <= alloc.size) {
        return alloc.ptr;
    }

    if (alloc.ptr) {
        release(tensorName, alloc);
        // Forget the released buffer right away so that a failed allocation
        // below cannot leave a stale pointer cached for the next call.
        alloc = Allocation{};
    } else if (currentMemory != nullptr && size == 0) {
        return currentMemory;
    }

    Allocation fresh = allocate(static_cast<std::size_t>(size));
    if (!fresh.ptr) {
        return nullptr;
    }
    alloc = fresh;
    return alloc.ptr;
}
#else
// Returns a buffer of at least `size` bytes for `tensorName`, reusing the
// cached buffer when it is large enough. A non-null `stream` replaces the
// stream stored in the allocator. Returns nullptr for a null name or when the
// allocation fails. `alignment` is ignored.
// NOLINTNEXTLINE(bugprone-easily-swappable-parameters)
void* CudaOutputAllocator::reallocateOutputAsync(const char* tensorName, void* currentMemory, uint64_t size,
                                                 uint64_t alignment, cudaStream_t stream) TRT_NOEXCEPT {
    (void)alignment;
    if (!tensorName) {
        return nullptr;
    }

    // Take the lock before touching stream_: allocate() reads it, so updating
    // it outside the critical section would race with a concurrent call.
    std::lock_guard lock(mutex_);
    if (stream != nullptr) {
        stream_ = stream;
    }

    auto& alloc = allocations_[tensorName];
    if (alloc.ptr && size <= alloc.size) {
        return alloc.ptr;
    }

    if (alloc.ptr) {
        release(tensorName, alloc);
        // Forget the released buffer right away so that a failed allocation
        // below cannot leave a stale pointer cached for the next call.
        alloc = Allocation{};
    } else if (currentMemory != nullptr && size == 0) {
        return currentMemory;
    }

    Allocation fresh = allocate(static_cast<std::size_t>(size));
    if (!fresh.ptr) {
        return nullptr;
    }
    alloc = fresh;
    return alloc.ptr;
}
#endif

// Shape notifications are not used by this allocator.
void CudaOutputAllocator::notifyShape(const char* /*tensorName*/, nvinfer1::Dims const& /*dims*/) TRT_NOEXCEPT {}

CudaOutputAllocator::Allocation
CudaOutputAllocator::allocate(std::size_t size) { Allocation alloc{}; if (size == 0) { return alloc; } if (kind_ == OutputAllocKind::kCudaMallocAsync) { void* ptr = nullptr; if (cudaMallocAsync(&ptr, size, stream_) != cudaSuccess) { return alloc; } alloc.ptr = ptr; alloc.size = size; alloc.kind = OutputAllocKind::kCudaMallocAsync; return alloc; } if (kind_ == OutputAllocKind::kCudaMallocManaged) { void* ptr = nullptr; if (cudaMallocManaged(&ptr, size, cudaMemAttachGlobal) != cudaSuccess) { return alloc; } alloc.ptr = ptr; alloc.size = size; alloc.kind = OutputAllocKind::kCudaMallocManaged; return alloc; } if (cudaSetDevice(device_) != cudaSuccess) { return alloc; } if (cuInit(0) != CUDA_SUCCESS) { return alloc; } CUmemAllocationProp prop{}; prop.type = CU_MEM_ALLOCATION_TYPE_PINNED; prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE; prop.location.id = device_; std::size_t granularity = 0; if (cuMemGetAllocationGranularity(&granularity, &prop, CU_MEM_ALLOC_GRANULARITY_MINIMUM) != CUDA_SUCCESS) { return alloc; } const std::size_t alloc_size = ((size + granularity - 1) / granularity) * granularity; CUmemGenericAllocationHandle handle{}; if (cuMemCreate(&handle, alloc_size, &prop, 0) != CUDA_SUCCESS) { return alloc; } CUdeviceptr addr = 0; if (cuMemAddressReserve(&addr, alloc_size, 0, 0, 0) != CUDA_SUCCESS) { cuMemRelease(handle); return alloc; } if (cuMemMap(addr, alloc_size, 0, handle, 0) != CUDA_SUCCESS) { cuMemAddressFree(addr, alloc_size); cuMemRelease(handle); return alloc; } CUmemAccessDesc access_desc{}; access_desc.location = prop.location; access_desc.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE; if (cuMemSetAccess(addr, alloc_size, &access_desc, 1) != CUDA_SUCCESS) { cuMemUnmap(addr, alloc_size); cuMemAddressFree(addr, alloc_size); cuMemRelease(handle); return alloc; } static_assert(sizeof(void*) == sizeof(CUdeviceptr)); alloc.ptr = reinterpret_cast(addr); // NOLINT(performance-no-int-to-ptr) alloc.size = size; alloc.kind = OutputAllocKind::kCuMem; alloc.handle 
= handle; alloc.addr = addr; alloc.mapped_size = alloc_size; return alloc; } void CudaOutputAllocator::release(const std::string& /*tensorName*/, Allocation& alloc) { if (!alloc.ptr) { return; } if (alloc.kind == OutputAllocKind::kCudaMallocAsync) { cudaFreeAsync(alloc.ptr, stream_); } else if (alloc.kind == OutputAllocKind::kCudaMallocManaged) { cudaFree(alloc.ptr); } else if (alloc.kind == OutputAllocKind::kCuMem) { cuMemUnmap(alloc.addr, alloc.mapped_size); cuMemRelease(alloc.handle); cuMemAddressFree(alloc.addr, alloc.mapped_size); } alloc = Allocation{}; } void* CudaOutputAllocator::getBuffer(const std::string& tensorName) const { std::lock_guard lock(mutex_); auto it = allocations_.find(tensorName); if (it == allocations_.end()) { return nullptr; } return it->second.ptr; } std::size_t CudaOutputAllocator::getSize(const std::string& tensorName) const { std::lock_guard lock(mutex_); auto it = allocations_.find(tensorName); if (it == allocations_.end()) { return 0; } return it->second.size; } OutputAllocKind CudaOutputAllocator::kind() const { return kind_; } ================================================ FILE: vit/cuda_allocator.h ================================================ #pragma once #include #include #include #include #include #include #include #include "macros.h" enum class OutputAllocKind : std::uint8_t { kCudaMallocAsync, kCudaMallocManaged, kCuMem }; class CudaOutputAllocator final : public nvinfer1::IOutputAllocator { public: static std::unique_ptr Create(cudaStream_t stream, int device = 0); explicit CudaOutputAllocator(cudaStream_t stream, OutputAllocKind kind, int device = 0); ~CudaOutputAllocator() override; #if TRT_VERSION < 10000 void* reallocateOutput(const char* tensorName, void* currentMemory, uint64_t size, uint64_t alignment) TRT_NOEXCEPT override; #else void* reallocateOutputAsync(const char* tensorName, void* currentMemory, uint64_t size, uint64_t alignment, cudaStream_t stream) TRT_NOEXCEPT override; #endif void notifyShape(const 
def read_imagenet_labels() -> dict[int, str]:
    """Read the ImageNet-1k class-index to label table.

    Every useful line is split at its first ``": "`` into an integer index and
    the remainder; the label is that remainder without its first character and
    its last three characters (``v[1:-3]``, i.e. an opening quote and a
    trailing quote, comma and newline). Lines without ``": "`` are skipped.

    Returns:
        dict[int, str]: labels dict; the first occurrence of an index wins
    """
    clsid2label: dict[int, str] = {}
    with open("../assets/imagenet1000_clsidx_to_labels.txt", "r") as f:
        for line in f:
            # partition splits at the first separator only, so a label that
            # itself contains ": " cannot break the unpacking
            k, sep, v = line.partition(": ")
            if not sep:
                continue  # blank or malformed line
            clsid2label.setdefault(int(k), v[1:-3])
    return clsid2label


USE_HF_PREPROCESS = False

if __name__ == "__main__":
    hub_model_id = "google/vit-base-patch16-224"
    config = AutoConfig.from_pretrained(hub_model_id)
    config._attn_implementation = "eager"
    model = AutoModelForImageClassification.from_pretrained(
        hub_model_id,
        ignore_mismatched_sizes=False,
        config=config,
    )
    model.eval()

    image_path = "../assets/cats.jpg"
    img = cv2.imread(image_path, cv2.IMREAD_COLOR)
    if img is None:
        # cv2.imread reports a missing/unreadable file by returning None
        raise FileNotFoundError(f"cannot read image: {image_path}")
    # NOTE(review): cv2.imread yields BGR and no channel swap happens in either
    # branch below -- confirm this matches the preprocessing on the C++ side.

    if USE_HF_PREPROCESS:
        image_processor = AutoImageProcessor.from_pretrained(hub_model_id)
        img = image_processor(img, return_tensors="pt")
        img = img["pixel_values"]
    else:
        # interpolation must be passed by keyword: the third positional
        # parameter of cv2.resize is `dst`, not the interpolation flag
        img = cv2.resize(img, (224, 224), interpolation=cv2.INTER_LINEAR)
        # float32 mean/std keep the whole expression (and the tensor built from
        # it) in float32 instead of silently promoting it to float64
        mean = np.array([0.5, 0.5, 0.5], dtype=np.float32)
        std = np.array([0.5, 0.5, 0.5], dtype=np.float32)
        img = (img.astype(np.float32) / 255.0 - mean) / std
        # HWC -> CHW, then add the batch axis
        img = torch.from_numpy(np.transpose(img, (2, 0, 1))[None, ...])

    # inference only: no autograd graph needed
    with torch.no_grad():
        output = model(img)

    labels = read_imagenet_labels()
    for i, j in enumerate(torch.topk(output.logits[0], k=3).indices):
        print(f"Top: {i} is {labels[int(j)]}")

    # .wts layout: first line is the tensor count, then one line per tensor:
    # "<name> <element count>" followed by each value as the hex of its
    # big-endian float32 encoding
    state_dict = model.state_dict()
    with open("../models/vit.wts", "w") as f:
        f.write("{}\n".format(len(state_dict)))
        for k, v in state_dict.items():
            print("key: ", k)
            print("value: ", v.shape)
            vr = v.reshape(-1).cpu().numpy()
            f.write("{} {}".format(k, len(vr)))
            f.write("".join(" " + struct.pack(">f", float(vv)).hex() for vv in vr))
            f.write("\n")
//!
//! \class LogStreamConsumerBuffer
//! \brief String buffer that, when synchronized or destroyed with pending text, writes that text
//!        to the wrapped stream as "[MM/DD/YYYY-HH:MM:SS] <prefix><text>".
//!
//! Nothing is written while logging is disabled; pending text is then discarded.
//!
class LogStreamConsumerBuffer : public std::stringbuf {
   public:
    LogStreamConsumerBuffer(std::ostream& stream, std::string prefix, bool shouldLog)
            : mOutput(stream), mPrefix(std::move(prefix)), mShouldLog(shouldLog) {}

    //! Takes over the target stream, prefix and enabled flag of \p other (not its buffered text).
    LogStreamConsumerBuffer(LogStreamConsumerBuffer&& other) noexcept
            : mOutput(other.mOutput), mPrefix(std::move(other.mPrefix)), mShouldLog(other.mShouldLog) {}

    ~LogStreamConsumerBuffer() override {
        // std::streambuf::pbase() gives a pointer to the beginning of the buffered part of the output sequence
        // std::streambuf::pptr() gives a pointer to the current position of the output sequence
        // if the pointer to the beginning is not equal to the pointer to the current position,
        // call putOutput() to log the output to the stream
        if (pbase() != pptr()) {
            putOutput();
        }
    }

    // synchronizes the stream buffer and returns 0 on success
    // synchronizing the stream buffer consists of inserting the buffer contents into the stream,
    // resetting the buffer and flushing the stream
    int sync() override {
        putOutput();
        return 0;
    }

    void putOutput() {
        if (mShouldLog) {
            // prepend timestamp; it goes to the same stream as the message so both stay together
            // when stdout and stderr are redirected separately, and std::put_time leaves the
            // stream's fill character untouched
            std::time_t timestamp = std::time(nullptr);
            const std::tm* tm_local = std::localtime(&timestamp);
            if (tm_local != nullptr) {
                mOutput << "[" << std::put_time(tm_local, "%m/%d/%Y-%H:%M:%S") << "] ";
            }
            // std::stringbuf::str() gets the string contents of the buffer
            // insert the buffer contents pre-appended by the appropriate prefix into the stream
            mOutput << mPrefix << str();
            // flush the stream
            mOutput.flush();
        }
        // set the buffer to empty, also when logging is disabled, so suppressed text neither
        // accumulates nor shows up later if logging gets enabled
        str("");
    }

    void setShouldLog(bool shouldLog) { mShouldLog = shouldLog; }

   private:
    std::ostream& mOutput;
    std::string mPrefix;
    bool mShouldLog;
};
LogStreamConsumer(Severity reportableSeverity, Severity severity) : LogStreamConsumerBase(severityOstream(severity), severityPrefix(severity), severity <= reportableSeverity), std::ostream(&mBuffer) // links the stream buffer with the stream , mShouldLog(severity <= reportableSeverity), mSeverity(severity) {} LogStreamConsumer(LogStreamConsumer&& other) noexcept : LogStreamConsumerBase(severityOstream(other.mSeverity), severityPrefix(other.mSeverity), other.mShouldLog), std::ostream(&mBuffer) // links the stream buffer with the stream , mShouldLog(other.mShouldLog), mSeverity(other.mSeverity) {} void setReportableSeverity(Severity reportableSeverity) { mShouldLog = mSeverity <= reportableSeverity; mBuffer.setShouldLog(mShouldLog); } private: static std::ostream& severityOstream(Severity severity) { return severity >= Severity::kINFO ? std::cout : std::cerr; } static std::string severityPrefix(Severity severity) { switch (severity) { case Severity::kINTERNAL_ERROR: return "[F] "; case Severity::kERROR: return "[E] "; case Severity::kWARNING: return "[W] "; case Severity::kINFO: return "[I] "; case Severity::kVERBOSE: return "[V] "; default: assert(0); return ""; } } bool mShouldLog; Severity mSeverity; }; //! \class Logger //! //! \brief Class which manages logging of TensorRT tools and samples //! //! \details This class provides a common interface for TensorRT tools and samples to log information to the console, //! and supports logging two types of messages: //! //! - Debugging messages with an associated severity (info, warning, error, or internal error/fatal) //! - Test pass/fail messages //! //! The advantage of having all samples use this class for logging as opposed to emitting directly to stdout/stderr is //! that the logic for controlling the verbosity and formatting of sample output is centralized in one location. //! //! In the future, this class could be extended to support dumping test results to a file in some standard format //! 
(for example, JUnit XML), and providing additional metadata (e.g. timing the duration of a test run). //! //! TODO: For backwards compatibility with existing samples, this class inherits directly from the nvinfer1::ILogger //! interface, which is problematic since there isn't a clean separation between messages coming from the TensorRT //! library and messages coming from the sample. //! //! In the future (once all samples are updated to use Logger::getTRTLogger() to access the ILogger) we can refactor the //! class to eliminate the inheritance and instead make the nvinfer1::ILogger implementation a member of the Logger //! object. class Logger : public nvinfer1::ILogger { private: struct TestInfo; public: Logger(Severity severity = Severity::kWARNING) : mReportableSeverity(severity) {} //! //! \enum TestResult //! \brief Represents the state of a given test //! enum class TestResult : std::uint8_t { kRUNNING, //!< The test is running kPASSED, //!< The test passed kFAILED, //!< The test failed kWAIVED //!< The test was waived }; //! //! \brief Forward-compatible method for retrieving the nvinfer::ILogger associated with this Logger //! \return The nvinfer1::ILogger associated with this Logger //! //! TODO Once all samples are updated to use this method to register the logger with TensorRT, //! we can eliminate the inheritance of Logger from ILogger //! nvinfer1::ILogger& getTRTLogger() { return *this; } //! //! \brief Implementation of the nvinfer1::ILogger::log() virtual method //! //! Note samples should not be calling this function directly; it will eventually go away once we eliminate the //! inheritance from nvinfer1::ILogger //! void log(Severity severity, const char* msg) TRT_NOEXCEPT override { LogStreamConsumer(mReportableSeverity, severity) << "[TRT] " << std::string(msg) << '\n'; } //! //! \brief Method for controlling the verbosity of logging output //! //! \param severity The logger will only emit messages that have severity of this level or higher. 
//! void setReportableSeverity(Severity severity) { mReportableSeverity = severity; } //! //! \brief Opaque handle that holds logging information for a particular test //! //! This object is an opaque handle to information used by the Logger to print test results. //! The sample must call Logger::defineTest() in order to obtain a TestAtom that can be used //! with Logger::reportTest{Start,End}(). //! class TestAtom { public: TestAtom(TestAtom&&) = default; private: friend class Logger; TestAtom(bool started, TestInfo info) : mStarted(started), mName(std::move(info.name)), mCmdline(std::move(info.cmdline)) {} bool mStarted; std::string mName; std::string mCmdline; }; //! //! \brief Define a test for logging //! //! \param[in] name The name of the test. This should be a string starting with //! "TensorRT" and containing dot-separated strings containing //! the characters [A-Za-z0-9_]. //! For example, "TensorRT.sample_googlenet" //! \param[in] cmdline The command line used to reproduce the test // //! \return a TestAtom that can be used in Logger::reportTest{Start,End}(). //! static TestAtom defineTest(const std::string& name, const std::string& cmdline) { return TestAtom{false, TestInfo{name, cmdline}}; } //! //! \brief A convenience overloaded version of defineTest() that accepts an array of command-line arguments //! as input //! //! \param[in] name The name of the test //! \param[in] argc The number of command-line arguments //! \param[in] argv The array of command-line arguments (given as C strings) //! //! \return a TestAtom that can be used in Logger::reportTest{Start,End}(). static TestAtom defineTest(const std::string& name, int argc, char const* const* argv) { auto cmdline = genCmdlineString(argc, argv); return defineTest(name, cmdline); } //! //! \brief Report that a test has started. //! //! \pre reportTestStart() has not been called yet for the given testAtom //! //! \param[in] testAtom The handle to the test that has started //! 
static void reportTestStart(TestAtom& testAtom) { reportTestResult(testAtom, TestResult::kRUNNING); assert(!testAtom.mStarted); testAtom.mStarted = true; } //! //! \brief Report that a test has ended. //! //! \pre reportTestStart() has been called for the given testAtom //! //! \param[in] testAtom The handle to the test that has ended //! \param[in] result The result of the test. Should be one of TestResult::kPASSED, //! TestResult::kFAILED, TestResult::kWAIVED //! static void reportTestEnd(const TestAtom& testAtom, TestResult result) { assert(result != TestResult::kRUNNING); assert(testAtom.mStarted); reportTestResult(testAtom, result); } static int reportPass(const TestAtom& testAtom) { reportTestEnd(testAtom, TestResult::kPASSED); return EXIT_SUCCESS; } static int reportFail(const TestAtom& testAtom) { reportTestEnd(testAtom, TestResult::kFAILED); return EXIT_FAILURE; } static int reportWaive(const TestAtom& testAtom) { reportTestEnd(testAtom, TestResult::kWAIVED); return EXIT_SUCCESS; } static int reportTest(const TestAtom& testAtom, bool pass) { return pass ? reportPass(testAtom) : reportFail(testAtom); } [[nodiscard]] Severity getReportableSeverity() const { return mReportableSeverity; } private: struct TestInfo { std::string name; std::string cmdline; }; //! //! \brief returns an appropriate string for prefixing a log message with the given severity //! static const char* severityPrefix(Severity severity) { switch (severity) { case Severity::kINTERNAL_ERROR: return "[F] "; case Severity::kERROR: return "[E] "; case Severity::kWARNING: return "[W] "; case Severity::kINFO: return "[I] "; case Severity::kVERBOSE: return "[V] "; default: assert(0); return ""; } } //! //! \brief returns an appropriate string for prefixing a test result message with the given result //! 
//!
//! \brief joins the first argc entries of argv into one string, separated by single spaces
//!
static std::string genCmdlineString(int argc, char const* const* argv) {
    std::stringstream joined;
    const char* separator = "";
    int index = 0;
    while (index < argc) {
        joined << separator << argv[index];
        // every argument after the first is preceded by one blank
        separator = " ";
        ++index;
    }
    return joined.str();
}
inline LogStreamConsumer LOG_WARN(const Logger& logger) { return LogStreamConsumer{logger.getReportableSeverity(), Severity::kWARNING}; } //! //! \brief produces a LogStreamConsumer object that can be used to log messages of severity kERROR //! //! Example usage: //! //! LOG_ERROR(logger) << "hello world" << std::endl; //! inline LogStreamConsumer LOG_ERROR(const Logger& logger) { return LogStreamConsumer{logger.getReportableSeverity(), Severity::kERROR}; } //! //! \brief produces a LogStreamConsumer object that can be used to log messages of severity kINTERNAL_ERROR // ("fatal" severity) //! //! Example usage: //! //! LOG_FATAL(logger) << "hello world" << std::endl; //! inline LogStreamConsumer LOG_FATAL(const Logger& logger) { return LogStreamConsumer{logger.getReportableSeverity(), Severity::kINTERNAL_ERROR}; } } // anonymous namespace #endif // TENSORRT_LOGGING_H ================================================ FILE: vit/macros.h ================================================ #pragma once #include #ifdef API_EXPORTS #if defined(_MSC_VER) #define API __declspec(dllexport) #else #define API __attribute__((visibility("default"))) #endif #else #if defined(_MSC_VER) #define API __declspec(dllimport) #else #define API #endif #endif // API_EXPORTS #define TRT_VERSION \ ((NV_TENSORRT_MAJOR * 1000) + (NV_TENSORRT_MINOR * 100) + (NV_TENSORRT_PATCH * 10) + NV_TENSORRT_BUILD) #if TRT_VERSION < 7220 #error "TensorRT >= 7.2.2 is required for this demo." 
#endif #if TRT_VERSION >= 8000 #define TRT_NOEXCEPT noexcept #define TRT_CONST_ENQUEUE const #else #define TRT_NOEXCEPT #define TRT_CONST_ENQUEUE #endif ================================================ FILE: vit/profiler.cc ================================================ #include "profiler.h" #include #include #include #include void Profiler::reportLayerTime(const char* layerName, float ms) noexcept { mProfile[layerName].count++; mProfile[layerName].time += ms; if (std::find(mLayerNames.begin(), mLayerNames.end(), layerName) == mLayerNames.end()) { mLayerNames.emplace_back(layerName); } } Profiler::Profiler(const char* name, const std::vector& srcProfilers) : mName(name) { for (const auto& srcProfiler : srcProfilers) { for (const auto& rec : srcProfiler.mProfile) { auto it = mProfile.find(rec.first); if (it == mProfile.end()) { mProfile.insert(rec); } else { it->second.time += rec.second.time; it->second.count += rec.second.count; } } } } std::ostream& operator<<(std::ostream& out, const Profiler& value) { out << "========== " << value.mName << " ==========\n"; float totalTime = 0; std::string layerNameStr = "TensorRT layer name"; int maxLayerNameLength = std::max(static_cast(layerNameStr.size()), 70); for (const auto& elem : value.mProfile) { totalTime += elem.second.time; maxLayerNameLength = std::max(maxLayerNameLength, static_cast(elem.first.size())); } auto old_settings = out.flags(); auto old_precision = out.precision(); // Output header { out << std::setfill(' ') << std::setw(maxLayerNameLength) << layerNameStr << " "; out << std::setw(12) << "Runtime, " << "%" << " "; out << std::setw(12) << "Invocations" << " "; out << std::setw(12) << "Runtime, ms\n"; } for (size_t i = 0; i < value.mLayerNames.size(); i++) { const std::string layerName = value.mLayerNames[i]; auto elem = value.mProfile.at(layerName); out << std::setw(maxLayerNameLength) << layerName << " "; out << std::setw(12) << std::fixed << std::setprecision(1) << (elem.time * 100.0F / totalTime) << 
"%" << " "; out << std::setw(12) << elem.count << " "; out << std::setw(12) << std::fixed << std::setprecision(2) << elem.time << "\n"; } out.flags(old_settings); out.precision(old_precision); out << "========== " << value.mName << " total runtime = " << totalTime << " ms ==========\n"; return out; } ================================================ FILE: vit/profiler.h ================================================ #include #include #include #include #include class Profiler final : public nvinfer1::IProfiler { public: struct Record { float time{0}; int count{0}; }; Profiler(const char* name, const std::vector& srcProfilers = std::vector()); void reportLayerTime(const char* layerName, float ms) noexcept override; friend std::ostream& operator<<(std::ostream& out, const Profiler& value); private: std::string mName; std::vector mLayerNames; std::map mProfile; }; ================================================ FILE: vit/utils.h ================================================ #pragma once #include #include #include #include #include #include #include #include #include #include #include #include #include "macros.h" constexpr const std::size_t WORKSPACE_SIZE = 16 << 20; namespace { #define CHECK(status) \ do { \ auto ret = (status); \ if (ret != cudaSuccess) { \ std::cerr << "Cuda failure: " << ret << "\n"; \ std::abort(); \ } \ } while (0) static void checkTrtEnv(int device = 0) { #if TRT_VERSION < 8000 CHECK(cudaGetDevice(&device)); cudaDeviceProp prop{}; CHECK(cudaGetDeviceProperties(&prop, device)); const int sm = prop.major * 10 + prop.minor; if (sm > 86) { std::cerr << "TensorRT < 8 does not support SM > 86 on this GPU."; std::abort(); } #endif } /** * @brief TensorRT weight files have a simple space delimited format: * [type] [size] * * @param file input weight file path * @return std::map */ static auto loadWeights(const std::string& file) { std::cout << "Loading weights: " << file << "\n"; std::map weightMap; // Open weights file std::ifstream input(file); 
assert(input.is_open() && "Unable to load weight file."); // Read number of weight blobs int32_t count; input >> count; assert(count > 0 && "Invalid weight map file."); while (count--) { nvinfer1::Weights wt{.type = nvinfer1::DataType::kFLOAT, .values = nullptr, .count = 0}; // Read name and type of blob std::string name; input >> name >> std::dec >> wt.count; // Load blob auto* val = new uint32_t[wt.count]; input >> std::hex; for (auto x = 0ll; x < wt.count; ++x) { input >> val[x]; } wt.values = val; weightMap[name] = wt; } return weightMap; } /** * @brief a preprocess function aligning with ImageNet preprocess in torchvision, only support 3-channel image * * @param img opencv image with BGR layout * @param bgr2rgb whether to convert BGR to RGB * @param mean_std subtract mean, then divide std * @param n batch size * @param h resize height * @param w resize width * @return std::vector contiguous flatten image data in fp16 type (CHW) */ static auto preprocess_img(cv::Mat& img, bool bgr2rgb, const std::array& mean, const std::array& std, int64_t n, int32_t h, int32_t w) { const auto c = img.channels(); const auto size = c * h * w; if (c != 3) { std::cerr << "this demo only supports 3 channel input image.\n"; std::abort(); } if (bgr2rgb) { cv::cvtColor(img, img, cv::COLOR_BGR2RGB); } cv::resize(img, img, cv::Size(w, h), 0, 0, cv::INTER_LINEAR); // Keep preprocessing in fp32 on CPU for correctness, then pack to fp16 CHW for TensorRT input. 
/**
 * @brief Parse a label file whose lines look like `<index>: '<label>',` into index -> label.
 *
 * For every line the text before the first ':' is read as the integer index, and the label is
 * the text between the first two single quotes after that colon. Lines that lack a colon or a
 * quote pair, or whose index is not a valid int, are skipped.
 *
 * @param path label file path
 * @return std::map<int, std::string>; empty when the file cannot be opened
 */
static auto loadImagenetLabelMap(const std::string& path) {
    std::map<int, std::string> labels;
    std::ifstream in(path);
    if (!in.is_open()) {
        return labels;
    }
    std::string line;
    while (std::getline(in, line)) {
        const auto colon = line.find(':');
        if (colon == std::string::npos) {
            continue;
        }
        const auto first_quote = line.find('\'', colon);
        if (first_quote == std::string::npos) {
            continue;
        }
        const auto second_quote = line.find('\'', first_quote + 1);
        if (second_quote == std::string::npos) {
            continue;
        }
        int idx = 0;
        try {
            idx = std::stoi(line.substr(0, colon));
        } catch (const std::exception&) {
            // std::stoi throws for a non-numeric or out-of-range index; treat the line like any
            // other malformed line instead of terminating the program
            continue;
        }
        labels[idx] = line.substr(first_quote + 1, second_quote - first_quote - 1);
    }
    return labels;
}
/**
 * @brief Size in bytes of one element of the given TensorRT data type.
 *
 * @param t data type to query
 * @return 4 for kFLOAT and kINT32, 2 for kHALF, 1 for kINT8 (and for kBOOL / kUINT8 where the
 *         TensorRT version checks below compile those labels in)
 *
 * Any other type prints a message to stderr and aborts the process.
 */
static auto bytesPerElement(DataType t) -> std::size_t {
    switch (t) {
        case DataType::kFLOAT:
            return 4;
        case DataType::kHALF:
            return 2;
        case DataType::kINT32:
            return 4;
            // the guarded labels below fall through to the shared one-byte return
#if TRT_VERSION >= 8000
        case DataType::kBOOL:
#endif
#if TRT_VERSION >= 8500
        case DataType::kUINT8:
#endif
        case DataType::kINT8:
            return 1;
        default:
            std::cerr << "Unsupported TensorRT DataType\n";
            std::abort();
    }
}
float _half = 0.5f; static float _one = 1.0f; static float _sqrt_2_div_pi = std::sqrt(2.0f / M_PI); static float _coeff = 0.044715f; auto* _w_half = net->addConstant(scalarDims, Weights{DataType::kFLOAT, &_half, 1}); auto* _w_one = net->addConstant(scalarDims, Weights{DataType::kFLOAT, &_one, 1}); auto* _w_sqrt_2_div_pi = net->addConstant(scalarDims, Weights{DataType::kFLOAT, &_sqrt_2_div_pi, 1}); auto* _w_coeff = net->addConstant(scalarDims, Weights{DataType::kFLOAT, &_coeff, 1}); auto* _x2 = net->addElementWise(input, input, E::kPROD); auto* x3_0 = net->addElementWise(*_x2->getOutput(0), input, E::kPROD); auto* x3_1 = net->addElementWise(*x3_0->getOutput(0), *_w_coeff->getOutput(0), E::kPROD); auto* x3_2 = net->addElementWise(input, *x3_1->getOutput(0), E::kSUM); auto* scaled = net->addElementWise(*x3_2->getOutput(0), *_w_sqrt_2_div_pi->getOutput(0), E::kPROD); auto* t = net->addActivation(*scaled->getOutput(0), ActivationType::kTANH); auto* one_plus = net->addElementWise(*t->getOutput(0), *_w_one->getOutput(0), E::kSUM); auto* half_x = net->addElementWise(input, *_w_half->getOutput(0), E::kPROD); return net->addElementWise(*half_x->getOutput(0), *one_plus->getOutput(0), E::kPROD); #else // erf approximation return net->addActivation(input, ActivationType::kGELU_ERF); #endif } static auto addLinearNorm(INetworkDefinition* net, ITensor& input, ITensor& scale, ITensor& bias, uint32_t axesMask) noexcept -> ILayer* { #if TRT_VERSION >= 11500 auto* ln = net->addNormalizationV2(input, scale, bias, axesMask); #else auto* ln = net->addNormalization(input, scale, bias, axesMask); #endif ln->setEpsilon(1e-12f); return ln; } auto ViTLayer(INetworkDefinition* net, WeightMap& w, ITensor& input, const ViTParam& param) -> ITensor* { std::string name = "vit.encoder.layer." 
+ std::to_string(param.index); auto attn_name = name + ".attention"; int64_t attn_head_size = 768LL / param.head_num; auto* qw = net->addConstant(Dims3{1, 768, 768}, w.at(attn_name + ".attention.query.weight")); auto* kw = net->addConstant(Dims3{1, 768, 768}, w.at(attn_name + ".attention.key.weight")); auto* vw = net->addConstant(Dims3{1, 768, 768}, w.at(attn_name + ".attention.value.weight")); /* 1. layer norm before attention */ auto pre_ln_name = name + ".layernorm_before"; auto dims = input.getDimensions(); uint32_t axes = 1U << static_cast(dims.nbDims - 1); auto* ln_scale = net->addConstant(Dims3{1, 1, dims.d[dims.nbDims - 1]}, w[pre_ln_name + ".weight"]); auto* ln_bias = net->addConstant(Dims3{1, 1, dims.d[dims.nbDims - 1]}, w[pre_ln_name + ".bias"]); auto* pre_lnorm = addLinearNorm(net, input, *ln_scale->getOutput(0), *ln_bias->getOutput(0), axes); /** 2. multi-head self-attention */ auto* qb = net->addConstant(Dims3{1, 1, 768}, w.at(attn_name + ".attention.query.bias")); auto* kb = net->addConstant(Dims3{1, 1, 768}, w.at(attn_name + ".attention.key.bias")); auto* vb = net->addConstant(Dims3{1, 1, 768}, w.at(attn_name + ".attention.value.bias")); auto* _lno = pre_lnorm->getOutput(0); // 2.1 Q, K attention matmul auto* _q_attn = net->addMatrixMultiply(*_lno, M::kNONE, *qw->getOutput(0), M::kTRANSPOSE); auto* _k_attn = net->addMatrixMultiply(*_lno, M::kNONE, *kw->getOutput(0), M::kTRANSPOSE); auto* _v_attn = net->addMatrixMultiply(*_lno, M::kNONE, *vw->getOutput(0), M::kTRANSPOSE); _q_attn->setName((attn_name + "query").c_str()); _k_attn->setName((attn_name + "key").c_str()); _v_attn->setName((attn_name + "value").c_str()); auto* q_attn = net->addElementWise(*_q_attn->getOutput(0), *qb->getOutput(0), E::kSUM); auto* k_attn = net->addElementWise(*_k_attn->getOutput(0), *kb->getOutput(0), E::kSUM); auto* v_attn = net->addElementWise(*_v_attn->getOutput(0), *vb->getOutput(0), E::kSUM); auto* q_s = net->addShuffle(*q_attn->getOutput(0)); auto* k_s = 
net->addShuffle(*k_attn->getOutput(0)); auto* v_s = net->addShuffle(*v_attn->getOutput(0)); q_s->setReshapeDimensions(Dims4{0, 0, param.head_num, attn_head_size}); q_s->setSecondTranspose({0, 2, 1, 3}); k_s->setReshapeDimensions(Dims4{0, 0, param.head_num, attn_head_size}); k_s->setSecondTranspose({0, 2, 1, 3}); v_s->setReshapeDimensions(Dims4{0, 0, param.head_num, attn_head_size}); v_s->setSecondTranspose({0, 2, 1, 3}); // 2.2 Q, K scaling (and softmax / fused attention) const float scale_f = 1.0f / std::sqrt(static_cast(attn_head_size)); if (input.getType() == DataType::kHALF) { auto* scale_val = new half[1]; scale_val[0] = __float2half(scale_f); w[attn_name + ".scale"] = Weights{.type = DataType::kHALF, .values = scale_val, .count = 1}; } else { auto* scale_val = new uint32_t[1]; std::memcpy(scale_val, &scale_f, sizeof(float)); w[attn_name + ".scale"] = Weights{.type = DataType::kFLOAT, .values = scale_val, .count = 1}; } auto* qk_scale_w = net->addConstant(Dims4{1, 1, 1, 1}, w.at(attn_name + ".scale")); // 2.3 QKV attention output and reshape #if TRT_VERSION >= 11400 && TRT_VERSION < 11500 gLogger.log(Severity::kWARNING, "IAttention is available in TensorRT 10.14.1 SDK but have bugs, use 10.15.1+ to enable native fused " "kernel"); #endif #if TRT_VERSION >= 11500 using ANO = AttentionNormalizationOp; auto* q_scaled = net->addElementWise(*q_s->getOutput(0), *qk_scale_w->getOutput(0), E::kPROD)->getOutput(0); auto* attn = net->addAttention(*q_scaled, *k_s->getOutput(0), *v_s->getOutput(0), ANO::kSOFTMAX, false); assert(attn != nullptr); auto status = attn->setDecomposable(false); assert(status); auto* attn_out = net->addShuffle(*attn->getOutput(0)); #else auto* qk = net->addMatrixMultiply(*q_s->getOutput(0), M::kNONE, *k_s->getOutput(0), M::kTRANSPOSE); auto* attn_qk = net->addElementWise(*qk->getOutput(0), *qk_scale_w->getOutput(0), E::kPROD); auto* qk_softmax = net->addSoftMax(*attn_qk->getOutput(0)); qk_softmax->setAxes(1U << 
static_cast(attn_qk->getOutput(0)->getDimensions().nbDims - 1)); auto* attn_qkv = net->addMatrixMultiply(*qk_softmax->getOutput(0), M::kNONE, *v_s->getOutput(0), M::kNONE); attn_qkv->setName((attn_name + ".attn_qkv").c_str()); auto* attn_out = net->addShuffle(*attn_qkv->getOutput(0)); #endif attn_out->setFirstTranspose({0, 2, 1, 3}); attn_out->setReshapeDimensions(Dims3{0, 0, 768}); // 2.4 attention output projection auto* out_proj_w = net->addConstant(Dims3{1, 768, 768}, w.at(attn_name + ".output.dense.weight"))->getOutput(0); auto* out_proj_b = net->addConstant(Dims3{1, 1, 768}, w.at(attn_name + ".output.dense.bias"))->getOutput(0); auto* attn_fcw = net->addMatrixMultiply(*attn_out->getOutput(0), M::kNONE, *out_proj_w, M::kTRANSPOSE); auto* attn_fcb = net->addElementWise(*attn_fcw->getOutput(0), *out_proj_b, E::kSUM); attn_fcb->setName((attn_name + ".out_proj").c_str()); /** 3. attention and hidden state residual connection */ auto* attn_residual = net->addElementWise(input, *attn_fcb->getOutput(0), E::kSUM); attn_residual->setName((name + "attn_residual").c_str()); /** 4. layer norm after attention */ auto post_ln_name = name + ".layernorm_after"; ln_scale = net->addConstant(Dims3{1, 1, dims.d[dims.nbDims - 1]}, w[post_ln_name + ".weight"]); ln_bias = net->addConstant(Dims3{1, 1, dims.d[dims.nbDims - 1]}, w[post_ln_name + ".bias"]); auto* _res = attn_residual->getOutput(0); axes = 1U << static_cast(_res->getDimensions().nbDims - 1); auto* post_lnorm = addLinearNorm(net, *_res, *ln_scale->getOutput(0), *ln_bias->getOutput(0), axes); /** 6. 
intermediate (feed-forward) layer and activation */ auto intermediate_name = name + ".intermediate.dense"; std::cout << "Building: " << intermediate_name << "\n"; auto* iw = net->addConstant(Dims3{1, 3072, 768}, w[intermediate_name + ".weight"]); auto* ib = net->addConstant(Dims3{1, 1, 3072}, w[intermediate_name + ".bias"]); ib->setName((intermediate_name + ".bias").c_str()); auto* inter0 = net->addMatrixMultiply(*post_lnorm->getOutput(0), M::kNONE, *iw->getOutput(0), M::kTRANSPOSE); auto* inter1 = net->addElementWise(*inter0->getOutput(0), *ib->getOutput(0), E::kSUM); auto* inter_act = addGeLU(net, *inter1->getOutput(0)); /** 7. output projection */ auto output_name = name + ".output.dense"; std::cout << "Building: " << output_name << "\n"; auto* ow = net->addConstant(Dims3{1, 768, 3072}, w[output_name + ".weight"]); auto* ob = net->addConstant(Dims3{1, 1, 768}, w[output_name + ".bias"]); ob->setName((output_name + ".bias").c_str()); auto* out0 = net->addMatrixMultiply(*inter_act->getOutput(0), M::kNONE, *ow->getOutput(0), M::kTRANSPOSE); auto* out1 = net->addElementWise(*out0->getOutput(0), *ob->getOutput(0), E::kSUM); /** 8. residual */ auto* output_residual = net->addElementWise(*out1->getOutput(0), *attn_residual->getOutput(0), E::kSUM); output_residual->setName((name + ".output_residual").c_str()); return output_residual->getOutput(0); } // Creat the engine using only the API without any parser. auto createEngine(int64_t N, IRuntime* runtime, IBuilder* builder, IBuilderConfig* config, DataType dt) -> ICudaEngine* { WeightMap w = loadWeights(WTS_PATH); if (dt == DataType::kHALF) { convertWeightMapToHalf(w); } #if TRT_VERSION >= 10000 auto* net = builder->createNetworkV2(1U << static_cast(NDCF::kSTRONGLY_TYPED)); #else auto* net = builder->createNetworkV2(1U << static_cast(NDCF::kEXPLICIT_BATCH)); #endif // 1. 
patch embedding ITensor* data = net->addInput(NAMES[0], dt, Dims4{-1, 3, INPUT_H, INPUT_W}); std::string name = "vit.embeddings.patch_embeddings.projection."; auto* embed = net->addConvolutionNd(*data, 768, DimsHW{16, 16}, w[name + "weight"], w[name + "bias"]); embed->setName("patch embedding"); embed->setStrideNd(DimsHW{16, 16}); auto* s = net->addShuffle(*embed->getOutput(0)); s->setReshapeDimensions(Dims3{0, 768, 14LL * 14}); s->setSecondTranspose({0, 2, 1}); // 2. add cls token and position embedding auto* cls_token = net->addConstant(Dims3{1, 1, 768}, w["vit.embeddings.cls_token"]); auto* pos_embed = net->addConstant(Dims3{1, 197, 768}, w["vit.embeddings.position_embeddings"]); const std::array _cat = {cls_token->getOutput(0), s->getOutput(0)}; auto* cat = net->addConcatenation(_cat.data(), 2); cat->setAxis(1); cat->setName("cat_clstoken_embed"); auto* pos_added = net->addElementWise(*cat->getOutput(0), *pos_embed->getOutput(0), ElementWiseOperation::kSUM); pos_added->setName("position_embed"); // 3. transformer encoder layers ITensor* input = pos_added->getOutput(0); for (auto i = 0u; i < 12; i++) { auto* vit = ViTLayer(net, w, *input, {.index = i, .head_num = 12, .lnorm_eps = 1e-12f}); input = vit; } // 4. layer norm after transformer encoder auto* ln_scale = net->addConstant(Dims3{1, 1, 768}, w["vit.layernorm.weight"]); auto* ln_bias = net->addConstant(Dims3{1, 1, 768}, w["vit.layernorm.bias"]); uint32_t axes = 1U << static_cast(input->getDimensions().nbDims - 1); auto* post_lnorm = addLinearNorm(net, *input, *ln_scale->getOutput(0), *ln_bias->getOutput(0), axes); // 6. 
classifier head auto* slice = net->addSlice(*post_lnorm->getOutput(0), Dims3{0, 0, 0}, Dims3{N, 1, 768}, Dims3{1, 1, 1}); auto* shuffle = net->addShuffle(*slice->getOutput(0)); shuffle->setReshapeDimensions(Dims2{N, 768}); auto* cls_w = net->addConstant(DimsHW{1000, 768}, w["classifier.weight"]); auto* cls_b = net->addConstant(DimsHW{1, 1000}, w["classifier.bias"]); auto* cls_0 = net->addMatrixMultiply(*shuffle->getOutput(0), M::kNONE, *cls_w->getOutput(0), M::kTRANSPOSE); auto* cls_1 = net->addElementWise(*cls_b->getOutput(0), *cls_0->getOutput(0), E::kSUM); net->markOutput(*cls_1->getOutput(0)); Dims4 _min{1, 3, INPUT_H, INPUT_W}, _opt{N, 3, INPUT_H, INPUT_W}, _max{2 * N, 3, INPUT_H, INPUT_W}; #if TRT_VERSION >= 8000 config->setMemoryPoolLimit(MemoryPoolType::kWORKSPACE, WORKSPACE_SIZE); config->setHardwareCompatibilityLevel(HardwareCompatibilityLevel::kAMPERE_PLUS); auto* profile = builder->createOptimizationProfile(); profile->setDimensions(NAMES[0], OptProfileSelector::kMIN, _min); profile->setDimensions(NAMES[0], OptProfileSelector::kOPT, _opt); profile->setDimensions(NAMES[0], OptProfileSelector::kMAX, _max); config->addOptimizationProfile(profile); IHostMemory* mem = builder->buildSerializedNetwork(*net, *config); ICudaEngine* engine = runtime->deserializeCudaEngine(mem->data(), mem->size()); delete net; #else builder->setMaxBatchSize(N); config->setMaxWorkspaceSize(WORKSPACE_SIZE); ICudaEngine* engine = builder->buildEngineWithConfig(*net, *config); net->destroy(); #endif std::cout << "build finished\n"; // Release host memory for (auto& mem : w) { if (mem.second.values == nullptr) { continue; } if (mem.second.type == DataType::kHALF) { delete[] reinterpret_cast(mem.second.values); } else { // loadWeights() allocates with new uint32_t[] delete[] reinterpret_cast(mem.second.values); } } return engine; } std::vector> doInference(IExecutionContext& context, __half* input, std::size_t batchSize) { const ICudaEngine& engine = context.getEngine(); cudaStream_t 
stream; CHECK(cudaStreamCreate(&stream)); std::vector buffers; #if TRT_VERSION >= 10000 auto allocator = CudaOutputAllocator::Create(stream); #endif #if TRT_VERSION >= 8000 const int32_t nIO = engine.getNbIOTensors(); #else const int32_t nIO = engine.getNbBindings(); #endif buffers.resize(nIO, nullptr); for (auto i = 0; i < nIO; ++i) { #if TRT_VERSION >= 8000 // TensorRT 8+ use name based SDK auto* tensor_name = engine.getIOTensorName(i); const auto dtype = engine.getTensorDataType(tensor_name); std::size_t size = batchSize * SIZES[i] * bytesPerElement(dtype); #if TRT_VERSION >= 10000 // TensorRT 10+ use outuput allocator if (i == 0) { CHECK(cudaMalloc(&buffers[i], size)); CHECK(cudaMemcpyAsync(buffers[i], input, size, cudaMemcpyHostToDevice, stream)); context.setTensorAddress(tensor_name, buffers[i]); } else { context.setOutputAllocator(tensor_name, allocator.get()); } #else if (i != 0) { CHECK(cudaMalloc(&buffers[i], size)); } else { CHECK(cudaMalloc(&buffers[i], size)); CHECK(cudaMemcpyAsync(buffers[i], input, size, cudaMemcpyHostToDevice, stream)); } context.setTensorAddress(tensor_name, buffers[i]); #endif #else std::size_t size = batchSize * SIZES[i] * sizeof(float); const int32_t idx = engine.getBindingIndex(NAMES[i]); assert(idx == i); CHECK(cudaMalloc(&buffers[i], size)); if (i == 0) { CHECK(cudaMemcpyAsync(buffers[i], input, size, cudaMemcpyHostToDevice, stream)); } #endif } #if TRT_VERSION >= 8000 assert(context.enqueueV3(stream)); #else assert(context.enqueueV2(buffers.data(), stream, nullptr)); #endif std::vector> prob; for (int i = 1; i < nIO; ++i) { #if TRT_VERSION >= 10000 auto* tensor_name = engine.getIOTensorName(i); const auto dtype = engine.getTensorDataType(tensor_name); std::size_t size = batchSize * SIZES[i] * bytesPerElement(dtype); void* out_ptr = allocator->getBuffer(tensor_name); // D2H data transfer if (dtype == DataType::kHALF) { std::vector<__half> tmp_h(batchSize * SIZES[i]); CHECK(cudaMemcpyAsync(tmp_h.data(), out_ptr, size, 
cudaMemcpyDeviceToHost, stream)); CHECK(cudaStreamSynchronize(stream)); std::vector tmp(batchSize * SIZES[i]); for (std::size_t j = 0; j < tmp.size(); ++j) { tmp[j] = __half2float(tmp_h[j]); } prob.emplace_back(std::move(tmp)); } else { std::vector tmp(batchSize * SIZES[i], std::nanf("")); CHECK(cudaMemcpyAsync(tmp.data(), out_ptr, size, cudaMemcpyDeviceToHost, stream)); prob.emplace_back(std::move(tmp)); } #else std::vector tmp(batchSize * SIZES[i], std::nanf("")); std::size_t size = batchSize * SIZES[i] * sizeof(float); CHECK(cudaMemcpyAsync(tmp.data(), buffers[i], size, cudaMemcpyDeviceToHost, stream)); prob.emplace_back(std::move(tmp)); #endif } CHECK(cudaStreamSynchronize(stream)); for (auto& buffer : buffers) { if (buffer != nullptr) { CHECK(cudaFree(buffer)); } } #if TRT_VERSION >= 10000 allocator.reset(); #endif CHECK(cudaStreamDestroy(stream)); return prob; } void APIToModel(int32_t N, IRuntime* runtime, IHostMemory** modelStream) { IBuilder* builder = createInferBuilder(gLogger); IBuilderConfig* config = builder->createBuilderConfig(); ICudaEngine* engine = createEngine(N, runtime, builder, config, DataType::kHALF); assert(engine != nullptr); (*modelStream) = engine->serialize(); #if TRT_VERSION >= 8000 delete engine; delete config; delete builder; #else engine->destroy(); config->destroy(); builder->destroy(); #endif } auto main(int argc, char** argv) -> int { std::cout << "TensorRT version: " << TRT_VERSION << "\n"; if (argc != 2) { std::cerr << "arguments not right!\n"; std::cerr << "./vit -s // serialize model to plan file\n"; std::cerr << "./vit -d // deserialize plan file and run inference\n"; return 1; } #ifndef NDEBUG gLogger.setReportableSeverity(nvinfer1::ILogger::Severity::kVERBOSE); #endif IRuntime* runtime = createInferRuntime(gLogger); assert(runtime != nullptr); char* trtModelStream{nullptr}; std::streamsize size{0}; if (std::string(argv[1]) == "-s") { IHostMemory* modelStream{nullptr}; APIToModel(N, runtime, &modelStream); 
assert(modelStream != nullptr); std::ofstream p(ENGINE_PATH, std::ios::binary | std::ios::trunc); if (!p) { std::cerr << "could not open plan output file\n"; return -1; } if (modelStream->size() > static_cast(std::numeric_limits::max())) { std::cerr << "this model is too large to serialize\n"; return -1; } const auto* data_ptr = reinterpret_cast(modelStream->data()); auto data_size = static_cast(modelStream->size()); p.write(data_ptr, data_size); #if TRT_VERSION >= 8000 delete modelStream; #else modelStream->destroy(); #endif return 0; } else if (std::string(argv[1]) == "-d") { std::ifstream file(ENGINE_PATH, std::ios::binary); if (file.good()) { file.seekg(0, file.end); size = file.tellg(); file.seekg(0, file.beg); trtModelStream = new char[size]; assert(trtModelStream); file.read(trtModelStream, size); file.close(); } else { std::cerr << "read engine file error!\n"; return -1; } #if TRT_VERSION >= 8000 ICudaEngine* engine = runtime->deserializeCudaEngine(trtModelStream, size); #else ICudaEngine* engine = runtime->deserializeCudaEngine(trtModelStream, size, nullptr); #endif assert(engine != nullptr); auto* context = engine->createExecutionContext(); assert(context != nullptr); // VIT use default BGR order auto img = cv::imread("../assets/cats.jpg", cv::IMREAD_COLOR); auto input = preprocess_img(img, false, mean, stdv, N, INPUT_H, INPUT_W); Profiler profiler("VisionTransformerProfiler"); // Warmup: run a few iterations without profiling. 
for (int i = 0; i < 5; ++i) { (void)doInference(*context, input.data(), N); } // Profiled runs context->setProfiler(&profiler); for (int i = 0; i < 20; ++i) { auto start = std::chrono::system_clock::now(); auto prob = doInference(*context, input.data(), N); auto end = std::chrono::system_clock::now(); auto period = std::chrono::duration_cast(end - start); std::cout << period.count() << "us\n"; for (const auto& vector : prob) { int idx = 0; for (auto v : vector) { std::cout << std::setprecision(4) << v << ", " << std::flush; if (++idx > 20) { std::cout << "\n====\n"; break; } } } if (i == 19) { std::cout << "prediction result: \n"; auto labels = loadImagenetLabelMap(LABELS_PATH); int _top = 0; for (auto& [idx, logits] : topk(prob[0], 3)) { std::cout << "Top: " << _top++ << " idx: " << idx << ", logits: " << logits << ", label: " << labels[idx] << "\n"; } std::cout << profiler << "\n"; } } return 0; } return 0; } ================================================ FILE: yolo11/CMakeLists.txt ================================================ cmake_minimum_required(VERSION 3.10) project(yolov11) add_definitions(-std=c++11) add_definitions(-DAPI_EXPORTS) set(CMAKE_CXX_STANDARD 11) set(CMAKE_BUILD_TYPE Debug) set(CMAKE_CUDA_COMPILER /usr/local/cuda/bin/nvcc) enable_language(CUDA) include_directories(${PROJECT_SOURCE_DIR}/include) include_directories(${PROJECT_SOURCE_DIR}/plugin) # include and link dirs of cuda and tensorrt, you need adapt them if yours are different if(CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64") message("embed_platform on") include_directories(/usr/local/cuda/targets/aarch64-linux/include) link_directories(/usr/local/cuda/targets/aarch64-linux/lib) else() message("embed_platform off") # cuda include_directories(/usr/local/cuda/include) link_directories(/usr/local/cuda/lib64) # tensorrt include_directories(/workspace/shared/TensorRT-8.6.1.6/include) link_directories(/workspace/shared/TensorRT-8.6.1.6/lib) endif() add_library(myplugins SHARED 
${PROJECT_SOURCE_DIR}/plugin/yololayer.cu) target_link_libraries(myplugins nvinfer cudart) find_package(OpenCV) include_directories(${OpenCV_INCLUDE_DIRS}) file(GLOB_RECURSE SRCS ${PROJECT_SOURCE_DIR}/src/*.cpp ${PROJECT_SOURCE_DIR}/src/*.cu) add_executable(yolo11_det ${PROJECT_SOURCE_DIR}/yolo11_det.cpp ${SRCS}) target_link_libraries(yolo11_det nvinfer) target_link_libraries(yolo11_det cudart) target_link_libraries(yolo11_det myplugins) target_link_libraries(yolo11_det ${OpenCV_LIBS}) add_executable(yolo11_cls ${PROJECT_SOURCE_DIR}/yolo11_cls.cpp ${SRCS}) target_link_libraries(yolo11_cls nvinfer) target_link_libraries(yolo11_cls cudart) target_link_libraries(yolo11_cls myplugins) target_link_libraries(yolo11_cls ${OpenCV_LIBS}) add_executable(yolo11_seg ${PROJECT_SOURCE_DIR}/yolo11_seg.cpp ${SRCS}) target_link_libraries(yolo11_seg nvinfer) target_link_libraries(yolo11_seg cudart) target_link_libraries(yolo11_seg myplugins) target_link_libraries(yolo11_seg ${OpenCV_LIBS}) add_executable(yolo11_pose ${PROJECT_SOURCE_DIR}/yolo11_pose.cpp ${SRCS}) target_link_libraries(yolo11_pose nvinfer) target_link_libraries(yolo11_pose cudart) target_link_libraries(yolo11_pose myplugins) target_link_libraries(yolo11_pose ${OpenCV_LIBS}) add_executable(yolo11_obb ${PROJECT_SOURCE_DIR}/yolo11_obb.cpp ${SRCS}) target_link_libraries(yolo11_obb nvinfer) target_link_libraries(yolo11_obb cudart) target_link_libraries(yolo11_obb myplugins) target_link_libraries(yolo11_obb ${OpenCV_LIBS}) ================================================ FILE: yolo11/gen_wts.py ================================================ import sys # noqa: F401 import argparse import os import struct import torch def parse_args(): parser = argparse.ArgumentParser(description='Convert .pt file to .wts') parser.add_argument('-w', '--weights', required=True, help='Input weights (.pt) file path (required)') parser.add_argument( '-o', '--output', help='Output (.wts) file path (optional)') parser.add_argument( '-t', 
'--type', type=str, default='detect', choices=['detect', 'cls', 'seg', 'pose', 'obb'], help='determines the model is detection/classification') args = parser.parse_args() if not os.path.isfile(args.weights): raise SystemExit('Invalid input file') if not args.output: args.output = os.path.splitext(args.weights)[0] + '.wts' elif os.path.isdir(args.output): args.output = os.path.join( args.output, os.path.splitext(os.path.basename(args.weights))[0] + '.wts') return args.weights, args.output, args.type pt_file, wts_file, m_type = parse_args() print(f'Generating .wts for {m_type} model') # Load model print(f'Loading {pt_file}') # Initialize device = 'cpu' # Load model model = torch.load(pt_file, map_location=device, weights_only=False) # Load FP32 weights model = model['ema' if model.get('ema') else 'model'].float() if m_type in ['detect', 'seg', 'pose', 'obb']: anchor_grid = model.model[-1].anchors * model.model[-1].stride[..., None, None] delattr(model.model[-1], 'anchors') model.to(device).eval() with open(wts_file, 'w') as f: f.write('{}\n'.format(len(model.state_dict().keys()))) for k, v in model.state_dict().items(): vr = v.reshape(-1).cpu().numpy() f.write('{} {} '.format(k, len(vr))) for vv in vr: f.write(' ') f.write(struct.pack('>f', float(vv)).hex()) f.write('\n') ================================================ FILE: yolo11/include/block.h ================================================ #pragma once #include #include #include #include "NvInfer.h" std::map loadWeights(const std::string file); nvinfer1::IScaleLayer* addBatchNorm2d(nvinfer1::INetworkDefinition* network, std::map weightMap, nvinfer1::ITensor& input, std::string lname, float eps); nvinfer1::IElementWiseLayer* convBnSiLU(nvinfer1::INetworkDefinition* network, std::map weightMap, nvinfer1::ITensor& input, int ch, std::vector k, int s, std::string lname); nvinfer1::IElementWiseLayer* C2F(nvinfer1::INetworkDefinition* network, std::map weightMap, nvinfer1::ITensor& input, int c1, int c2, int n, bool 
shortcut, float e, std::string lname); nvinfer1::IElementWiseLayer* C2(nvinfer1::INetworkDefinition* network, std::map& weightMap, nvinfer1::ITensor& input, int c1, int c2, int n, bool shortcut, float e, std::string lname); nvinfer1::IElementWiseLayer* SPPF(nvinfer1::INetworkDefinition* network, std::map weightMap, nvinfer1::ITensor& input, int c1, int c2, int k, std::string lname); nvinfer1::IShuffleLayer* DFL(nvinfer1::INetworkDefinition* network, std::map weightMap, nvinfer1::ITensor& input, int ch, int grid, int k, int s, int p, std::string lname); nvinfer1::IPluginV2Layer* addYoLoLayer(nvinfer1::INetworkDefinition* network, std::vector dets, const int* px_arry, int px_arry_num, bool is_segmentation, bool is_pose, bool is_obb); nvinfer1::IElementWiseLayer* C3K2(nvinfer1::INetworkDefinition* network, std::map& weightMap, nvinfer1::ITensor& input, int c1, int c2, int n, bool c3k, bool shortcut, float e, std::string lname); nvinfer1::ILayer* C2PSA(nvinfer1::INetworkDefinition* network, std::map& weightMap, nvinfer1::ITensor& input, int c1, int c2, int n, float e, std::string lname); nvinfer1::ILayer* DWConv(nvinfer1::INetworkDefinition* network, std::map weightMap, nvinfer1::ITensor& input, int ch, std::vector k, int s, std::string lname); ================================================ FILE: yolo11/include/calibrator.h ================================================ #ifndef ENTROPY_CALIBRATOR_H #define ENTROPY_CALIBRATOR_H #include #include #include #include "macros.h" //! \class Int8EntropyCalibrator2 //! //! \brief Implements Entropy calibrator 2. //! CalibrationAlgoType is kENTROPY_CALIBRATION_2. //! 
class Int8EntropyCalibrator2 : public nvinfer1::IInt8EntropyCalibrator2 { public: Int8EntropyCalibrator2(int batchsize, int input_w, int input_h, const char* img_dir, const char* calib_table_name, const char* input_blob_name, bool read_cache = true); virtual ~Int8EntropyCalibrator2(); int getBatchSize() const TRT_NOEXCEPT override; bool getBatch(void* bindings[], const char* names[], int nbBindings) TRT_NOEXCEPT override; const void* readCalibrationCache(size_t& length) TRT_NOEXCEPT override; void writeCalibrationCache(const void* cache, size_t length) TRT_NOEXCEPT override; private: int batchsize_; int input_w_; int input_h_; int img_idx_; std::string img_dir_; std::vector img_files_; size_t input_count_; std::string calib_table_name_; const char* input_blob_name_; bool read_cache_; void* device_input_; std::vector calib_cache_; }; #endif // ENTROPY_CALIBRATOR_H ================================================ FILE: yolo11/include/config.h ================================================ #define USE_FP16 // #define USE_FP32 // #define USE_INT8 const static char* kInputTensorName = "images"; const static char* kOutputTensorName = "output"; const static char* kProtoTensorName = "proto"; const static int kNumClass = 80; const static int kPoseNumClass = 1; const static int kNumberOfPoints = 17; // number of keypoints total // obb model's number of classes constexpr static int kObbNumClass = 15; const static int kObbNe = 1; // number of extra parameters const static int kBatchSize = 1; const static int kGpuId = 0; const static int kInputH = 640; const static int kInputW = 640; const static int kObbInputH = 1024; const static int kObbInputW = 1024; const static float kNmsThresh = 0.45f; const static float kConfThresh = 0.5f; const static float kConfThreshKeypoints = 0.5f; // keypoints confidence const static int kMaxInputImageSize = 3000 * 3000; const static int kMaxNumOutputBbox = 1000; //Quantization input image folder path const static char* kInputQuantizationFolder 
= "./coco_calib"; // Classfication model's number of classes constexpr static int kClsNumClass = 1000; // Classfication model's input shape constexpr static int kClsInputH = 224; constexpr static int kClsInputW = 224; ================================================ FILE: yolo11/include/cuda_utils.h ================================================ #ifndef TRTX_CUDA_UTILS_H_ #define TRTX_CUDA_UTILS_H_ #include #ifndef CUDA_CHECK #define CUDA_CHECK(callstr) \ { \ cudaError_t error_code = callstr; \ if (error_code != cudaSuccess) { \ std::cerr << "CUDA error " << error_code << " at " << __FILE__ << ":" << __LINE__; \ assert(0); \ } \ } #endif // CUDA_CHECK #endif // TRTX_CUDA_UTILS_H_ ================================================ FILE: yolo11/include/logging.h ================================================ /* * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
*/ #ifndef TENSORRT_LOGGING_H #define TENSORRT_LOGGING_H #include #include #include #include #include #include #include #include "NvInferRuntimeCommon.h" #include "macros.h" using Severity = nvinfer1::ILogger::Severity; class LogStreamConsumerBuffer : public std::stringbuf { public: LogStreamConsumerBuffer(std::ostream& stream, const std::string& prefix, bool shouldLog) : mOutput(stream), mPrefix(prefix), mShouldLog(shouldLog) {} LogStreamConsumerBuffer(LogStreamConsumerBuffer&& other) : mOutput(other.mOutput) {} ~LogStreamConsumerBuffer() { // std::streambuf::pbase() gives a pointer to the beginning of the buffered part of the output sequence // std::streambuf::pptr() gives a pointer to the current position of the output sequence // if the pointer to the beginning is not equal to the pointer to the current position, // call putOutput() to log the output to the stream if (pbase() != pptr()) { putOutput(); } } // synchronizes the stream buffer and returns 0 on success // synchronizing the stream buffer consists of inserting the buffer contents into the stream, // resetting the buffer and flushing the stream virtual int sync() { putOutput(); return 0; } void putOutput() { if (mShouldLog) { // prepend timestamp std::time_t timestamp = std::time(nullptr); tm* tm_local = std::localtime(×tamp); std::cout << "["; std::cout << std::setw(2) << std::setfill('0') << 1 + tm_local->tm_mon << "/"; std::cout << std::setw(2) << std::setfill('0') << tm_local->tm_mday << "/"; std::cout << std::setw(4) << std::setfill('0') << 1900 + tm_local->tm_year << "-"; std::cout << std::setw(2) << std::setfill('0') << tm_local->tm_hour << ":"; std::cout << std::setw(2) << std::setfill('0') << tm_local->tm_min << ":"; std::cout << std::setw(2) << std::setfill('0') << tm_local->tm_sec << "] "; // std::stringbuf::str() gets the string contents of the buffer // insert the buffer contents pre-appended by the appropriate prefix into the stream mOutput << mPrefix << str(); // set the buffer to empty 
str(""); // flush the stream mOutput.flush(); } } void setShouldLog(bool shouldLog) { mShouldLog = shouldLog; } private: std::ostream& mOutput; std::string mPrefix; bool mShouldLog; }; //! //! \class LogStreamConsumerBase //! \brief Convenience object used to initialize LogStreamConsumerBuffer before std::ostream in LogStreamConsumer //! class LogStreamConsumerBase { public: LogStreamConsumerBase(std::ostream& stream, const std::string& prefix, bool shouldLog) : mBuffer(stream, prefix, shouldLog) {} protected: LogStreamConsumerBuffer mBuffer; }; //! //! \class LogStreamConsumer //! \brief Convenience object used to facilitate use of C++ stream syntax when logging messages. //! Order of base classes is LogStreamConsumerBase and then std::ostream. //! This is because the LogStreamConsumerBase class is used to initialize the LogStreamConsumerBuffer member field //! in LogStreamConsumer and then the address of the buffer is passed to std::ostream. //! This is necessary to prevent the address of an uninitialized buffer from being passed to std::ostream. //! Please do not change the order of the parent classes. //! class LogStreamConsumer : protected LogStreamConsumerBase, public std::ostream { public: //! \brief Creates a LogStreamConsumer which logs messages with level severity. //! Reportable severity determines if the messages are severe enough to be logged. 
LogStreamConsumer(Severity reportableSeverity, Severity severity) : LogStreamConsumerBase(severityOstream(severity), severityPrefix(severity), severity <= reportableSeverity), std::ostream(&mBuffer) // links the stream buffer with the stream , mShouldLog(severity <= reportableSeverity), mSeverity(severity) {} LogStreamConsumer(LogStreamConsumer&& other) : LogStreamConsumerBase(severityOstream(other.mSeverity), severityPrefix(other.mSeverity), other.mShouldLog), std::ostream(&mBuffer) // links the stream buffer with the stream , mShouldLog(other.mShouldLog), mSeverity(other.mSeverity) {} void setReportableSeverity(Severity reportableSeverity) { mShouldLog = mSeverity <= reportableSeverity; mBuffer.setShouldLog(mShouldLog); } private: static std::ostream& severityOstream(Severity severity) { return severity >= Severity::kINFO ? std::cout : std::cerr; } static std::string severityPrefix(Severity severity) { switch (severity) { case Severity::kINTERNAL_ERROR: return "[F] "; case Severity::kERROR: return "[E] "; case Severity::kWARNING: return "[W] "; case Severity::kINFO: return "[I] "; case Severity::kVERBOSE: return "[V] "; default: assert(0); return ""; } } bool mShouldLog; Severity mSeverity; }; //! \class Logger //! //! \brief Class which manages logging of TensorRT tools and samples //! //! \details This class provides a common interface for TensorRT tools and samples to log information to the console, //! and supports logging two types of messages: //! //! - Debugging messages with an associated severity (info, warning, error, or internal error/fatal) //! - Test pass/fail messages //! //! The advantage of having all samples use this class for logging as opposed to emitting directly to stdout/stderr is //! that the logic for controlling the verbosity and formatting of sample output is centralized in one location. //! //! In the future, this class could be extended to support dumping test results to a file in some standard format //! 
(for example, JUnit XML), and providing additional metadata (e.g. timing the duration of a test run). //! //! TODO: For backwards compatibility with existing samples, this class inherits directly from the nvinfer1::ILogger //! interface, which is problematic since there isn't a clean separation between messages coming from the TensorRT //! library and messages coming from the sample. //! //! In the future (once all samples are updated to use Logger::getTRTLogger() to access the ILogger) we can refactor the //! class to eliminate the inheritance and instead make the nvinfer1::ILogger implementation a member of the Logger //! object. class Logger : public nvinfer1::ILogger { public: Logger(Severity severity = Severity::kWARNING) : mReportableSeverity(severity) {} //! //! \enum TestResult //! \brief Represents the state of a given test //! enum class TestResult { kRUNNING, //!< The test is running kPASSED, //!< The test passed kFAILED, //!< The test failed kWAIVED //!< The test was waived }; //! //! \brief Forward-compatible method for retrieving the nvinfer::ILogger associated with this Logger //! \return The nvinfer1::ILogger associated with this Logger //! //! TODO Once all samples are updated to use this method to register the logger with TensorRT, //! we can eliminate the inheritance of Logger from ILogger //! nvinfer1::ILogger& getTRTLogger() { return *this; } //! //! \brief Implementation of the nvinfer1::ILogger::log() virtual method //! //! Note samples should not be calling this function directly; it will eventually go away once we eliminate the //! inheritance from nvinfer1::ILogger //! void log(Severity severity, const char* msg) TRT_NOEXCEPT override { LogStreamConsumer(mReportableSeverity, severity) << "[TRT] " << std::string(msg) << std::endl; } //! //! \brief Method for controlling the verbosity of logging output //! //! \param severity The logger will only emit messages that have severity of this level or higher. //! 
void setReportableSeverity(Severity severity) { mReportableSeverity = severity; } //! //! \brief Opaque handle that holds logging information for a particular test //! //! This object is an opaque handle to information used by the Logger to print test results. //! The sample must call Logger::defineTest() in order to obtain a TestAtom that can be used //! with Logger::reportTest{Start,End}(). //! class TestAtom { public: TestAtom(TestAtom&&) = default; private: friend class Logger; TestAtom(bool started, const std::string& name, const std::string& cmdline) : mStarted(started), mName(name), mCmdline(cmdline) {} bool mStarted; std::string mName; std::string mCmdline; }; //! //! \brief Define a test for logging //! //! \param[in] name The name of the test. This should be a string starting with //! "TensorRT" and containing dot-separated strings containing //! the characters [A-Za-z0-9_]. //! For example, "TensorRT.sample_googlenet" //! \param[in] cmdline The command line used to reproduce the test // //! \return a TestAtom that can be used in Logger::reportTest{Start,End}(). //! static TestAtom defineTest(const std::string& name, const std::string& cmdline) { return TestAtom(false, name, cmdline); } //! //! \brief A convenience overloaded version of defineTest() that accepts an array of command-line arguments //! as input //! //! \param[in] name The name of the test //! \param[in] argc The number of command-line arguments //! \param[in] argv The array of command-line arguments (given as C strings) //! //! \return a TestAtom that can be used in Logger::reportTest{Start,End}(). static TestAtom defineTest(const std::string& name, int argc, char const* const* argv) { auto cmdline = genCmdlineString(argc, argv); return defineTest(name, cmdline); } //! //! \brief Report that a test has started. //! //! \pre reportTestStart() has not been called yet for the given testAtom //! //! \param[in] testAtom The handle to the test that has started //! 
static void reportTestStart(TestAtom& testAtom) { reportTestResult(testAtom, TestResult::kRUNNING); assert(!testAtom.mStarted); testAtom.mStarted = true; } //! //! \brief Report that a test has ended. //! //! \pre reportTestStart() has been called for the given testAtom //! //! \param[in] testAtom The handle to the test that has ended //! \param[in] result The result of the test. Should be one of TestResult::kPASSED, //! TestResult::kFAILED, TestResult::kWAIVED //! static void reportTestEnd(const TestAtom& testAtom, TestResult result) { assert(result != TestResult::kRUNNING); assert(testAtom.mStarted); reportTestResult(testAtom, result); } static int reportPass(const TestAtom& testAtom) { reportTestEnd(testAtom, TestResult::kPASSED); return EXIT_SUCCESS; } static int reportFail(const TestAtom& testAtom) { reportTestEnd(testAtom, TestResult::kFAILED); return EXIT_FAILURE; } static int reportWaive(const TestAtom& testAtom) { reportTestEnd(testAtom, TestResult::kWAIVED); return EXIT_SUCCESS; } static int reportTest(const TestAtom& testAtom, bool pass) { return pass ? reportPass(testAtom) : reportFail(testAtom); } Severity getReportableSeverity() const { return mReportableSeverity; } private: //! //! \brief returns an appropriate string for prefixing a log message with the given severity //! static const char* severityPrefix(Severity severity) { switch (severity) { case Severity::kINTERNAL_ERROR: return "[F] "; case Severity::kERROR: return "[E] "; case Severity::kWARNING: return "[W] "; case Severity::kINFO: return "[I] "; case Severity::kVERBOSE: return "[V] "; default: assert(0); return ""; } } //! //! \brief returns an appropriate string for prefixing a test result message with the given result //! 
static const char* testResultString(TestResult result) { switch (result) { case TestResult::kRUNNING: return "RUNNING"; case TestResult::kPASSED: return "PASSED"; case TestResult::kFAILED: return "FAILED"; case TestResult::kWAIVED: return "WAIVED"; default: assert(0); return ""; } } //! //! \brief returns an appropriate output stream (cout or cerr) to use with the given severity //! static std::ostream& severityOstream(Severity severity) { return severity >= Severity::kINFO ? std::cout : std::cerr; } //! //! \brief method that implements logging test results //! static void reportTestResult(const TestAtom& testAtom, TestResult result) { severityOstream(Severity::kINFO) << "&&&& " << testResultString(result) << " " << testAtom.mName << " # " << testAtom.mCmdline << std::endl; } //! //! \brief generate a command line string from the given (argc, argv) values //! static std::string genCmdlineString(int argc, char const* const* argv) { std::stringstream ss; for (int i = 0; i < argc; i++) { if (i > 0) ss << " "; ss << argv[i]; } return ss.str(); } Severity mReportableSeverity; }; namespace { //! //! \brief produces a LogStreamConsumer object that can be used to log messages of severity kVERBOSE //! //! Example usage: //! //! LOG_VERBOSE(logger) << "hello world" << std::endl; //! inline LogStreamConsumer LOG_VERBOSE(const Logger& logger) { return LogStreamConsumer(logger.getReportableSeverity(), Severity::kVERBOSE); } //! //! \brief produces a LogStreamConsumer object that can be used to log messages of severity kINFO //! //! Example usage: //! //! LOG_INFO(logger) << "hello world" << std::endl; //! inline LogStreamConsumer LOG_INFO(const Logger& logger) { return LogStreamConsumer(logger.getReportableSeverity(), Severity::kINFO); } //! //! \brief produces a LogStreamConsumer object that can be used to log messages of severity kWARNING //! //! Example usage: //! //! LOG_WARN(logger) << "hello world" << std::endl; //! 
inline LogStreamConsumer LOG_WARN(const Logger& logger) { return LogStreamConsumer(logger.getReportableSeverity(), Severity::kWARNING); } //! //! \brief produces a LogStreamConsumer object that can be used to log messages of severity kERROR //! //! Example usage: //! //! LOG_ERROR(logger) << "hello world" << std::endl; //! inline LogStreamConsumer LOG_ERROR(const Logger& logger) { return LogStreamConsumer(logger.getReportableSeverity(), Severity::kERROR); } //! //! \brief produces a LogStreamConsumer object that can be used to log messages of severity kINTERNAL_ERROR // ("fatal" severity) //! //! Example usage: //! //! LOG_FATAL(logger) << "hello world" << std::endl; //! inline LogStreamConsumer LOG_FATAL(const Logger& logger) { return LogStreamConsumer(logger.getReportableSeverity(), Severity::kINTERNAL_ERROR); } } // anonymous namespace #endif // TENSORRT_LOGGING_H ================================================ FILE: yolo11/include/macros.h ================================================ #ifndef __MACROS_H #define __MACROS_H #include "NvInfer.h" #ifdef API_EXPORTS #if defined(_MSC_VER) #define API __declspec(dllexport) #else #define API __attribute__((visibility("default"))) #endif #else #if defined(_MSC_VER) #define API __declspec(dllimport) #else #define API #endif #endif // API_EXPORTS #if NV_TENSORRT_MAJOR >= 8 #define TRT_NOEXCEPT noexcept #define TRT_CONST_ENQUEUE const #else #define TRT_NOEXCEPT #define TRT_CONST_ENQUEUE #endif #endif // __MACROS_H ================================================ FILE: yolo11/include/model.h ================================================ #pragma once #include #include #include "NvInfer.h" nvinfer1::IHostMemory* buildEngineYolo11Cls(nvinfer1::IBuilder* builder, nvinfer1::IBuilderConfig* config, nvinfer1::DataType dt, const std::string& wts_path, float& gd, float& gw, std::string& type, int max_channels); nvinfer1::IHostMemory* buildEngineYolo11Det(nvinfer1::IBuilder* builder, nvinfer1::IBuilderConfig* config, 
nvinfer1::DataType dt, const std::string& wts_path, float& gd, float& gw, int& max_channels, std::string& type); nvinfer1::IHostMemory* buildEngineYolo11Seg(nvinfer1::IBuilder* builder, nvinfer1::IBuilderConfig* config, nvinfer1::DataType dt, const std::string& wts_path, float& gd, float& gw, int& max_channels, std::string& type); nvinfer1::IHostMemory* buildEngineYolo11Pose(nvinfer1::IBuilder* builder, nvinfer1::IBuilderConfig* config, nvinfer1::DataType dt, const std::string& wts_path, float& gd, float& gw, int& max_channels, std::string& type); nvinfer1::IHostMemory* buildEngineYolo11Obb(nvinfer1::IBuilder* builder, nvinfer1::IBuilderConfig* config, nvinfer1::DataType dt, const std::string& wts_path, float& gd, float& gw, int& max_channels, std::string& type); ================================================ FILE: yolo11/include/postprocess.h ================================================ #pragma once #include #include "NvInfer.h" #include "types.h" // Preprocessing functions cv::Rect get_rect(cv::Mat& img, float bbox[4]); // Processing functions void batch_process(std::vector>& res_batch, const float* decode_ptr_host, int batch_size, int bbox_element, const std::vector& img_batch); void batch_process_obb(std::vector>& res_batch, const float* decode_ptr_host, int batch_size, int bbox_element, const std::vector& img_batch); void process_decode_ptr_host(std::vector& res, const float* decode_ptr_host, int bbox_element, cv::Mat& img, int count); void process_decode_ptr_host_obb(std::vector& res, const float* decode_ptr_host, int bbox_element, cv::Mat& img, int count); // NMS functions void nms(std::vector& res, float* output, float conf_thresh, float nms_thresh = 0.5); void batch_nms(std::vector>& batch_res, float* output, int batch_size, int output_size, float conf_thresh, float nms_thresh = 0.5); void nms_obb(std::vector& res, float* output, float conf_thresh, float nms_thresh = 0.5); void batch_nms_obb(std::vector>& batch_res, float* output, int batch_size, int 
output_size, float conf_thresh, float nms_thresh = 0.5); // CUDA-related functions void cuda_decode(float* predict, int num_bboxes, float confidence_threshold, float* parray, int max_objects, cudaStream_t stream); void cuda_nms(float* parray, float nms_threshold, int max_objects, cudaStream_t stream); void cuda_decode_obb(float* predict, int num_bboxes, float confidence_threshold, float* parray, int max_objects, cudaStream_t stream); void cuda_nms_obb(float* parray, float nms_threshold, int max_objects, cudaStream_t stream); // Drawing functions void draw_bbox(std::vector& img_batch, std::vector>& res_batch); void draw_bbox_obb(std::vector& img_batch, std::vector>& res_batch); void draw_bbox_keypoints_line(std::vector& img_batch, std::vector>& res_batch); void draw_mask_bbox(cv::Mat& img, std::vector& dets, std::vector& masks, std::unordered_map& labels_map); ================================================ FILE: yolo11/include/preprocess.h ================================================ #pragma once #include #include #include "NvInfer.h" #include "types.h" void cuda_preprocess_init(int max_image_size); void cuda_preprocess_destroy(); void cuda_preprocess(uint8_t* src, int src_width, int src_height, float* dst, int dst_width, int dst_height, cudaStream_t stream); void cuda_batch_preprocess(std::vector& img_batch, float* dst, int dst_width, int dst_height, cudaStream_t stream); ================================================ FILE: yolo11/include/types.h ================================================ #pragma once #include "config.h" struct alignas(float) Detection { //center_x center_y w h float bbox[4]; float conf; // bbox_conf * cls_conf float class_id; float mask[32]; float keypoints[kNumberOfPoints * 3]; // 17*3 keypoints float angle; // obb angle }; struct AffineMatrix { float value[6]; }; const int bbox_element = sizeof(AffineMatrix) / sizeof(float) + 1; // left, top, right, bottom, confidence, class, keepflag 
================================================ FILE: yolo11/include/utils.h ================================================ #pragma once #include #include #include static inline cv::Mat preprocess_img(cv::Mat& img, int input_w, int input_h) { int w, h, x, y; float r_w = input_w / (img.cols * 1.0); float r_h = input_h / (img.rows * 1.0); if (r_h > r_w) { w = input_w; h = r_w * img.rows; x = 0; y = (input_h - h) / 2; } else { w = r_h * img.cols; h = input_h; x = (input_w - w) / 2; y = 0; } cv::Mat re(h, w, CV_8UC3); cv::resize(img, re, re.size(), 0, 0, cv::INTER_LINEAR); cv::Mat out(input_h, input_w, CV_8UC3, cv::Scalar(128, 128, 128)); re.copyTo(out(cv::Rect(x, y, re.cols, re.rows))); return out; } static inline int read_files_in_dir(const char* p_dir_name, std::vector& file_names) { DIR* p_dir = opendir(p_dir_name); if (p_dir == nullptr) { return -1; } struct dirent* p_file = nullptr; while ((p_file = readdir(p_dir)) != nullptr) { if (strcmp(p_file->d_name, ".") != 0 && strcmp(p_file->d_name, "..") != 0) { //std::string cur_file_name(p_dir_name); //cur_file_name += "/"; //cur_file_name += p_file->d_name; std::string cur_file_name(p_file->d_name); // std::cout << "Found file: " << cur_file_name << std::endl; file_names.push_back(cur_file_name); } } closedir(p_dir); return 0; } // Function to trim leading and trailing whitespace from a string static inline std::string trim_leading_whitespace(const std::string& str) { size_t first = str.find_first_not_of(' '); if (std::string::npos == first) { return str; } size_t last = str.find_last_not_of(' '); return str.substr(first, (last - first + 1)); } // Src: https://stackoverflow.com/questions/16605967 static inline std::string to_string_with_precision(const float a_value, const int n = 2) { std::ostringstream out; out.precision(n); out << std::fixed << a_value; return out.str(); } static inline int read_labels(const std::string labels_filename, std::unordered_map& labels_map) { std::ifstream file(labels_filename); // 
Read each line of the file std::string line; int index = 0; while (std::getline(file, line)) { // Strip the line of any leading or trailing whitespace line = trim_leading_whitespace(line); // Add the stripped line to the labels_map, using the loop index as the key labels_map[index] = line; index++; } // Close the file file.close(); return 0; } ================================================ FILE: yolo11/plugin/yololayer.cu ================================================ #include #include #include #include #include "cuda_utils.h" #include "types.h" #include "yololayer.h" namespace Tn { template void write(char*& buffer, const T& val) { *reinterpret_cast(buffer) = val; buffer += sizeof(T); } template void read(const char*& buffer, T& val) { val = *reinterpret_cast(buffer); buffer += sizeof(T); } } // namespace Tn __device__ float sigmoid(float x) { return 1.0f / (1.0f + exp(-x)); } namespace nvinfer1 { YoloLayerPlugin::YoloLayerPlugin(int classCount, int numberofpoints, float confthreshkeypoints, int netWidth, int netHeight, int maxOut, bool is_segmentation, bool is_pose, bool is_obb, const int* strides, int stridesLength) { mClassCount = classCount; mNumberofpoints = numberofpoints; mConfthreshkeypoints = confthreshkeypoints; mYoloV8NetWidth = netWidth; mYoloV8netHeight = netHeight; mMaxOutObject = maxOut; mStridesLength = stridesLength; mStrides = new int[stridesLength]; memcpy(mStrides, strides, stridesLength * sizeof(int)); is_segmentation_ = is_segmentation; is_pose_ = is_pose; is_obb_ = is_obb; } YoloLayerPlugin::~YoloLayerPlugin() { if (mStrides != nullptr) { delete[] mStrides; mStrides = nullptr; } } YoloLayerPlugin::YoloLayerPlugin(const void* data, size_t length) { using namespace Tn; const char *d = reinterpret_cast(data), *a = d; read(d, mClassCount); read(d, mNumberofpoints); read(d, mConfthreshkeypoints); read(d, mThreadCount); read(d, mYoloV8NetWidth); read(d, mYoloV8netHeight); read(d, mMaxOutObject); read(d, mStridesLength); mStrides = new 
int[mStridesLength]; for (int i = 0; i < mStridesLength; ++i) { read(d, mStrides[i]); } read(d, is_segmentation_); read(d, is_pose_); read(d, is_obb_); assert(d == a + length); } void YoloLayerPlugin::serialize(void* buffer) const TRT_NOEXCEPT { using namespace Tn; char *d = static_cast(buffer), *a = d; write(d, mClassCount); write(d, mNumberofpoints); write(d, mConfthreshkeypoints); write(d, mThreadCount); write(d, mYoloV8NetWidth); write(d, mYoloV8netHeight); write(d, mMaxOutObject); write(d, mStridesLength); for (int i = 0; i < mStridesLength; ++i) { write(d, mStrides[i]); } write(d, is_segmentation_); write(d, is_pose_); write(d, is_obb_); assert(d == a + getSerializationSize()); } size_t YoloLayerPlugin::getSerializationSize() const TRT_NOEXCEPT { return sizeof(mClassCount) + sizeof(mNumberofpoints) + sizeof(mConfthreshkeypoints) + sizeof(mThreadCount) + sizeof(mYoloV8netHeight) + sizeof(mYoloV8NetWidth) + sizeof(mMaxOutObject) + sizeof(mStridesLength) + sizeof(int) * mStridesLength + sizeof(is_segmentation_) + sizeof(is_pose_) + sizeof(is_obb_); } int YoloLayerPlugin::initialize() TRT_NOEXCEPT { return 0; } nvinfer1::Dims YoloLayerPlugin::getOutputDimensions(int index, const nvinfer1::Dims* inputs, int nbInputDims) TRT_NOEXCEPT { int total_size = mMaxOutObject * sizeof(Detection) / sizeof(float); return nvinfer1::Dims3(total_size + 1, 1, 1); } void YoloLayerPlugin::setPluginNamespace(const char* pluginNamespace) TRT_NOEXCEPT { mPluginNamespace = pluginNamespace; } const char* YoloLayerPlugin::getPluginNamespace() const TRT_NOEXCEPT { return mPluginNamespace; } nvinfer1::DataType YoloLayerPlugin::getOutputDataType(int index, const nvinfer1::DataType* inputTypes, int nbInputs) const TRT_NOEXCEPT { return nvinfer1::DataType::kFLOAT; } bool YoloLayerPlugin::isOutputBroadcastAcrossBatch(int outputIndex, const bool* inputIsBroadcasted, int nbInputs) const TRT_NOEXCEPT { return false; } bool YoloLayerPlugin::canBroadcastInputAcrossBatch(int inputIndex) const 
TRT_NOEXCEPT { return false; } void YoloLayerPlugin::configurePlugin(nvinfer1::PluginTensorDesc const* in, int nbInput, nvinfer1::PluginTensorDesc const* out, int nbOutput) TRT_NOEXCEPT{}; void YoloLayerPlugin::attachToContext(cudnnContext* cudnnContext, cublasContext* cublasContext, IGpuAllocator* gpuAllocator) TRT_NOEXCEPT{}; void YoloLayerPlugin::detachFromContext() TRT_NOEXCEPT {} const char* YoloLayerPlugin::getPluginType() const TRT_NOEXCEPT { return "YoloLayer_TRT"; } const char* YoloLayerPlugin::getPluginVersion() const TRT_NOEXCEPT { return "1"; } void YoloLayerPlugin::destroy() TRT_NOEXCEPT { delete this; } nvinfer1::IPluginV2IOExt* YoloLayerPlugin::clone() const TRT_NOEXCEPT { YoloLayerPlugin* p = new YoloLayerPlugin(mClassCount, mNumberofpoints, mConfthreshkeypoints, mYoloV8NetWidth, mYoloV8netHeight, mMaxOutObject, is_segmentation_, is_pose_, is_obb_, mStrides, mStridesLength); p->setPluginNamespace(mPluginNamespace); return p; } int YoloLayerPlugin::enqueue(int batchSize, const void* TRT_CONST_ENQUEUE* inputs, void* const* outputs, void* workspace, cudaStream_t stream) TRT_NOEXCEPT { forwardGpu((const float* const*)inputs, (float*)outputs[0], stream, mYoloV8netHeight, mYoloV8NetWidth, batchSize); return 0; } __device__ float Logist(float data) { return 1.0f / (1.0f + expf(-data)); }; __global__ void CalDetection(const float* input, float* output, int numElements, int maxoutobject, const int grid_h, int grid_w, const int stride, int classes, int nk, float confkeypoints, int outputElem, bool is_segmentation, bool is_pose, bool is_obb) { int idx = threadIdx.x + blockDim.x * blockIdx.x; if (idx >= numElements) return; const int N_kpts = nk; int total_grid = grid_h * grid_w; int info_len = 4 + classes + (is_segmentation ? 32 : 0) + (is_pose ? N_kpts * 3 : 0) + (is_obb ? 
1 : 0); int batchIdx = idx / total_grid; int elemIdx = idx % total_grid; const float* curInput = input + batchIdx * total_grid * info_len; int outputIdx = batchIdx * outputElem; int class_id = 0; float max_cls_prob = 0.0; for (int i = 4; i < 4 + classes; i++) { float p = Logist(curInput[elemIdx + i * total_grid]); if (p > max_cls_prob) { max_cls_prob = p; class_id = i - 4; } } if (max_cls_prob < 0.1) return; int count = (int)atomicAdd(output + outputIdx, 1); if (count >= maxoutobject) return; char* data = (char*)(output + outputIdx) + sizeof(float) + count * sizeof(Detection); Detection* det = (Detection*)(data); int row = elemIdx / grid_w; int col = elemIdx % grid_w; det->conf = max_cls_prob; det->class_id = class_id; det->bbox[0] = (col + 0.5f - curInput[elemIdx + 0 * total_grid]) * stride; det->bbox[1] = (row + 0.5f - curInput[elemIdx + 1 * total_grid]) * stride; det->bbox[2] = (col + 0.5f + curInput[elemIdx + 2 * total_grid]) * stride; det->bbox[3] = (row + 0.5f + curInput[elemIdx + 3 * total_grid]) * stride; if (is_segmentation) { for (int k = 0; k < 32; ++k) { det->mask[k] = curInput[elemIdx + (4 + classes + (is_pose ? N_kpts * 3 : 0) + (is_obb ? 1 : 0) + k) * total_grid]; } } if (is_pose) { for (int kpt = 0; kpt < N_kpts; kpt++) { int kpt_x_idx = (4 + classes + (is_segmentation ? 32 : 0) + (is_obb ? 1 : 0) + kpt * 3) * total_grid; int kpt_y_idx = (4 + classes + (is_segmentation ? 32 : 0) + (is_obb ? 1 : 0) + kpt * 3 + 1) * total_grid; int kpt_conf_idx = (4 + classes + (is_segmentation ? 32 : 0) + (is_obb ? 
1 : 0) + kpt * 3 + 2) * total_grid; float kpt_confidence = sigmoid(curInput[elemIdx + kpt_conf_idx]); float kpt_x = (curInput[elemIdx + kpt_x_idx] * 2.0 + col) * stride; float kpt_y = (curInput[elemIdx + kpt_y_idx] * 2.0 + row) * stride; bool is_within_bbox = kpt_x >= det->bbox[0] && kpt_x <= det->bbox[2] && kpt_y >= det->bbox[1] && kpt_y <= det->bbox[3]; if (kpt_confidence < confkeypoints || !is_within_bbox) { det->keypoints[kpt * 3] = -1; det->keypoints[kpt * 3 + 1] = -1; det->keypoints[kpt * 3 + 2] = -1; } else { det->keypoints[kpt * 3] = kpt_x; det->keypoints[kpt * 3 + 1] = kpt_y; det->keypoints[kpt * 3 + 2] = kpt_confidence; } } } if (is_obb) { double pi = CV_PI; auto angle_inx = curInput[elemIdx + (4 + classes + (is_segmentation ? 32 : 0) + (is_pose ? N_kpts * 3 : 0) + 0) * total_grid]; auto angle = (sigmoid(angle_inx) - 0.25f) * pi; auto cos1 = cos(angle); auto sin1 = sin(angle); auto xf = (curInput[elemIdx + 2 * total_grid] - curInput[elemIdx + 0 * total_grid]) / 2; auto yf = (curInput[elemIdx + 3 * total_grid] - curInput[elemIdx + 1 * total_grid]) / 2; auto x = xf * cos1 - yf * sin1; auto y = xf * sin1 + yf * cos1; float cx = (col + 0.5f + x) * stride; float cy = (row + 0.5f + y) * stride; float w1 = (curInput[elemIdx + 0 * total_grid] + curInput[elemIdx + 2 * total_grid]) * stride; float h1 = (curInput[elemIdx + 1 * total_grid] + curInput[elemIdx + 3 * total_grid]) * stride; det->bbox[0] = cx; det->bbox[1] = cy; det->bbox[2] = w1; det->bbox[3] = h1; det->angle = angle; } } void YoloLayerPlugin::forwardGpu(const float* const* inputs, float* output, cudaStream_t stream, int mYoloV8netHeight, int mYoloV8NetWidth, int batchSize) { int outputElem = 1 + mMaxOutObject * sizeof(Detection) / sizeof(float); cudaMemsetAsync(output, 0, sizeof(float), stream); for (int idx = 0; idx < batchSize; ++idx) { CUDA_CHECK(cudaMemsetAsync(output + idx * outputElem, 0, sizeof(float), stream)); } int numElem = 0; // const int maxGrids = mStridesLength; // int grids[maxGrids][2]; 
// for (int i = 0; i < maxGrids; ++i) { // grids[i][0] = mYoloV8netHeight / mStrides[i]; // grids[i][1] = mYoloV8NetWidth / mStrides[i]; // } int maxGrids = mStridesLength; int flatGridsLen = 2 * maxGrids; int* flatGrids = new int[flatGridsLen]; for (int i = 0; i < maxGrids; ++i) { flatGrids[2 * i] = mYoloV8netHeight / mStrides[i]; flatGrids[2 * i + 1] = mYoloV8NetWidth / mStrides[i]; } for (unsigned int i = 0; i < maxGrids; i++) { // Access the elements of the original 2D array from the flattened 1D array int grid_h = flatGrids[2 * i]; // Corresponds to the access of grids[i][0] int grid_w = flatGrids[2 * i + 1]; // Corresponds to the access of grids[i][1] int stride = mStrides[i]; numElem = grid_h * grid_w * batchSize; // Calculate the total number of elements if (numElem < mThreadCount) // Adjust the thread count if needed mThreadCount = numElem; // The CUDA kernel call remains unchanged CalDetection<<<(numElem + mThreadCount - 1) / mThreadCount, mThreadCount, 0, stream>>>( inputs[i], output, numElem, mMaxOutObject, grid_h, grid_w, stride, mClassCount, mNumberofpoints, mConfthreshkeypoints, outputElem, is_segmentation_, is_pose_, is_obb_); } delete[] flatGrids; } PluginFieldCollection YoloPluginCreator::mFC{}; std::vector YoloPluginCreator::mPluginAttributes; YoloPluginCreator::YoloPluginCreator() { mPluginAttributes.clear(); mFC.nbFields = mPluginAttributes.size(); mFC.fields = mPluginAttributes.data(); } const char* YoloPluginCreator::getPluginName() const TRT_NOEXCEPT { return "YoloLayer_TRT"; } const char* YoloPluginCreator::getPluginVersion() const TRT_NOEXCEPT { return "1"; } const PluginFieldCollection* YoloPluginCreator::getFieldNames() TRT_NOEXCEPT { return &mFC; } IPluginV2IOExt* YoloPluginCreator::createPlugin(const char* name, const PluginFieldCollection* fc) TRT_NOEXCEPT { assert(fc->nbFields == 1); assert(strcmp(fc->fields[0].name, "combinedInfo") == 0); const int* combinedInfo = static_cast(fc->fields[0].data); int netinfo_count = 9; int 
class_count = combinedInfo[0]; int numberofpoints = combinedInfo[1]; float confthreshkeypoints = combinedInfo[2]; int input_w = combinedInfo[3]; int input_h = combinedInfo[4]; int max_output_object_count = combinedInfo[5]; bool is_segmentation = combinedInfo[6]; bool is_pose = combinedInfo[7]; bool is_obb = combinedInfo[8]; const int* px_arry = combinedInfo + netinfo_count; int px_arry_length = fc->fields[0].length - netinfo_count; YoloLayerPlugin* obj = new YoloLayerPlugin(class_count, numberofpoints, confthreshkeypoints, input_w, input_h, max_output_object_count, is_segmentation, is_pose, is_obb, px_arry, px_arry_length); obj->setPluginNamespace(mNamespace.c_str()); return obj; } IPluginV2IOExt* YoloPluginCreator::deserializePlugin(const char* name, const void* serialData, size_t serialLength) TRT_NOEXCEPT { // This object will be deleted when the network is destroyed, which will // call YoloLayerPlugin::destroy() YoloLayerPlugin* obj = new YoloLayerPlugin(serialData, serialLength); obj->setPluginNamespace(mNamespace.c_str()); return obj; } } // namespace nvinfer1 ================================================ FILE: yolo11/plugin/yololayer.h ================================================ #pragma once #include #include #include #include "NvInfer.h" #include "macros.h" namespace nvinfer1 { class API YoloLayerPlugin : public IPluginV2IOExt { public: YoloLayerPlugin(int classCount, int numberofpoints, float confthreshkeypoints, int netWidth, int netHeight, int maxOut, bool is_segmentation, bool is_pose, bool is_obb, const int* strides, int stridesLength); YoloLayerPlugin(const void* data, size_t length); ~YoloLayerPlugin(); int getNbOutputs() const TRT_NOEXCEPT override { return 1; } nvinfer1::Dims getOutputDimensions(int index, const nvinfer1::Dims* inputs, int nbInputDims) TRT_NOEXCEPT override; int initialize() TRT_NOEXCEPT override; virtual void terminate() TRT_NOEXCEPT override {} virtual size_t getWorkspaceSize(int maxBatchSize) const TRT_NOEXCEPT override 
{ return 0; } virtual int enqueue(int batchSize, const void* const* inputs, void* TRT_CONST_ENQUEUE* outputs, void* workspace, cudaStream_t stream) TRT_NOEXCEPT override; virtual size_t getSerializationSize() const TRT_NOEXCEPT override; virtual void serialize(void* buffer) const TRT_NOEXCEPT override; bool supportsFormatCombination(int pos, const PluginTensorDesc* inOut, int nbInputs, int nbOutputs) const TRT_NOEXCEPT override { return inOut[pos].format == TensorFormat::kLINEAR && inOut[pos].type == DataType::kFLOAT; } const char* getPluginType() const TRT_NOEXCEPT override; const char* getPluginVersion() const TRT_NOEXCEPT override; void destroy() TRT_NOEXCEPT override; IPluginV2IOExt* clone() const TRT_NOEXCEPT override; void setPluginNamespace(const char* pluginNamespace) TRT_NOEXCEPT override; const char* getPluginNamespace() const TRT_NOEXCEPT override; nvinfer1::DataType getOutputDataType(int32_t index, nvinfer1::DataType const* inputTypes, int32_t nbInputs) const TRT_NOEXCEPT; bool isOutputBroadcastAcrossBatch(int outputIndex, const bool* inputIsBroadcasted, int nbInputs) const TRT_NOEXCEPT override; bool canBroadcastInputAcrossBatch(int inputIndex) const TRT_NOEXCEPT override; void attachToContext(cudnnContext* cudnnContext, cublasContext* cublasContext, IGpuAllocator* gpuAllocator) TRT_NOEXCEPT override; void configurePlugin(PluginTensorDesc const* in, int32_t nbInput, PluginTensorDesc const* out, int32_t nbOutput) TRT_NOEXCEPT override; void detachFromContext() TRT_NOEXCEPT override; private: void forwardGpu(const float* const* inputs, float* output, cudaStream_t stream, int mYoloV8netHeight, int mYoloV8NetWidth, int batchSize); int mThreadCount = 256; const char* mPluginNamespace; int mClassCount; int mNumberofpoints; float mConfthreshkeypoints; int mYoloV8NetWidth; int mYoloV8netHeight; int mMaxOutObject; bool is_segmentation_; bool is_pose_; bool is_obb_; int* mStrides; int mStridesLength; }; class API YoloPluginCreator : public IPluginCreator { 
public: YoloPluginCreator(); ~YoloPluginCreator() override = default; const char* getPluginName() const TRT_NOEXCEPT override; const char* getPluginVersion() const TRT_NOEXCEPT override; const nvinfer1::PluginFieldCollection* getFieldNames() TRT_NOEXCEPT override; nvinfer1::IPluginV2IOExt* createPlugin(const char* name, const nvinfer1::PluginFieldCollection* fc) TRT_NOEXCEPT override; nvinfer1::IPluginV2IOExt* deserializePlugin(const char* name, const void* serialData, size_t serialLength) TRT_NOEXCEPT override; void setPluginNamespace(const char* libNamespace) TRT_NOEXCEPT override { mNamespace = libNamespace; } const char* getPluginNamespace() const TRT_NOEXCEPT override { return mNamespace.c_str(); } private: std::string mNamespace; static PluginFieldCollection mFC; static std::vector mPluginAttributes; }; REGISTER_TENSORRT_PLUGIN(YoloPluginCreator); } // namespace nvinfer1 ================================================ FILE: yolo11/readme.md ================================================ ## Introduction Yolo11 model supports TensorRT-8. Training code [link](https://github.com/ultralytics/ultralytics/archive/refs/tags/v8.3.38.zip) ## Environment * cuda 11.8 * cudnn 8.9.1.23 * tensorrt 8.6.1.6 * opencv 4.8.0 * ultralytics 8.3.0 ## Support * [x] YOLO11-det support FP32/FP16/INT8 and Python/C++ API * [x] YOLO11-cls support FP32/FP16/INT8 and Python/C++ API * [x] YOLO11-seg support FP32/FP16/INT8 and Python/C++ API * [x] YOLO11-pose support FP32/FP16/INT8 and Python/C++ API * [x] YOLO11-obb support FP32/FP16/INT8 and Python/C++ API ## Config * Choose the YOLO11 sub-model n/s/m/l/x from command line arguments. * Other configs please check [src/config.h](src/config.h) ## Build and Run 1. 
Generate the .wts file from the PyTorch .pt weights, or download the .wts from the model zoo ```shell # Download ultralytics wget https://github.com/ultralytics/ultralytics/archive/refs/tags/v8.3.0.zip -O ultralytics-8.3.0.zip # Unzip ultralytics unzip ultralytics-8.3.0.zip cd ultralytics-8.3.0 # Download models wget https://github.com/ultralytics/assets/releases/download/v8.3.0/yolo11n.pt -O yolo11n.pt wget https://github.com/ultralytics/assets/releases/download/v8.3.0/yolo11n-cls.pt -O yolo11n-cls.pt wget https://github.com/ultralytics/assets/releases/download/v8.3.0/yolo11n-seg.pt -O yolo11n-seg.pt wget https://github.com/ultralytics/assets/releases/download/v8.3.0/yolo11n-pose.pt -O yolo11n-pose.pt wget https://github.com/ultralytics/assets/releases/download/v8.3.0/yolo11n-obb.pt -O yolo11n-obb.pt # Generate .wts cp [PATH-TO-TENSORRTX]/yolo11/gen_wts.py . python gen_wts.py -w yolo11n.pt -o yolo11n.wts -t detect python gen_wts.py -w yolo11n-cls.pt -o yolo11n-cls.wts -t cls python gen_wts.py -w yolo11n-seg.pt -o yolo11n-seg.wts -t seg python gen_wts.py -w yolo11n-pose.pt -o yolo11n-pose.wts -t pose python gen_wts.py -w yolo11n-obb.pt -o yolo11n-obb.wts -t obb # The corresponding .wts files (e.g. 'yolo11n.wts') will be generated. ``` 2. Build tensorrtx/yolo11 and run ```shell cd [PATH-TO-TENSORRTX]/yolo11 mkdir build cd build cmake .. make ``` ### Detection ```shell cp [PATH-TO-ultralytics]/yolo11n.wts . # Build and serialize TensorRT engine ./yolo11_det -s yolo11n.wts yolo11n.engine [n/s/m/l/x] # Run inference ./yolo11_det -d yolo11n.engine ../images [c/g] # results saved in build directory ``` ### Classification ```shell cp [PATH-TO-ultralytics]/yolo11n-cls.wts .
# Build and serialize TensorRT engine ./yolo11_cls -s yolo11n-cls.wts yolo11n-cls.engine [n/s/m/l/x] # Download ImageNet labels wget https://raw.githubusercontent.com/joannzhang00/ImageNet-dataset-classes-labels/main/imagenet_classes.txt # Run inference ./yolo11_cls -d yolo11n-cls.engine ../images ``` ### Segmentation ```shell cp [PATH-TO-ultralytics]/yolo11n-seg.wts . # Build and serialize TensorRT engine ./yolo11_seg -s yolo11n-seg.wts yolo11n-seg.engine [n/s/m/l/x] # Download the labels file wget -O coco.txt https://raw.githubusercontent.com/amikelive/coco-labels/master/coco-labels-2014_2017.txt # Run inference ./yolo11_seg -d yolo11n-seg.engine ../images c coco.txt ``` ### Pose ```shell cp [PATH-TO-ultralytics]/yolo11n-pose.wts . # Build and serialize TensorRT engine ./yolo11_pose -s yolo11n-pose.wts yolo11n-pose.engine [n/s/m/l/x] # Run inference ./yolo11_pose -d yolo11n-pose.engine ../images ``` ### Obb ```shell cp [PATH-TO-ultralytics]/yolo11n-obb.wts . # Build and serialize TensorRT engine ./yolo11_obb -s yolo11n-obb.wts yolo11n-obb.engine [n/s/m/l/x] # Download the image wget -O P0015.png https://github.com/mpj1234/YOLO11-series-TensorRT8/releases/download/images/P0015.png mv P0015.png ../images # Run inference ./yolo11_obb -d yolo11n-obb.engine ../images ``` 3. Optional: load and run the TensorRT model in Python ```shell # Install python-tensorrt, pycuda, etc. # Ensure yolo11n.engine and libmyplugins.so have been built python yolo11_det_trt.py ./build/yolo11n.engine ./build/libmyplugins.so # FAQ: on Windows, pycuda._driver.LogicError may be raised # FAQ: on Linux, a segmentation fault may occur # In either case, add the following code to the py file: # import pycuda.autoinit # import pycuda.driver as cuda ``` ## INT8 Quantization 1. Prepare calibration images; you can randomly select about 1000 images from your training set.
For coco, you can also download my calibration images `coco_calib` from [GoogleDrive](https://drive.google.com/drive/folders/1s7jE9DtOngZMzJC1uL307J2MiaGwdRSI?usp=sharing) or [BaiduPan](https://pan.baidu.com/s/1GOm_-JobpyLMAqZWCDUhKg) pwd: a9wh 2. unzip it in yolo11/build 3. set the macro `USE_INT8` in src/config.h and make again 4. serialize the model and test ## More Information See the readme in [home page.](https://github.com/wang-xinyu/tensorrtx) ================================================ FILE: yolo11/src/block.cpp ================================================ #include "block.h" #include #include #include #include #include "config.h" #include "model.h" #include "yololayer.h" std::map loadWeights(const std::string file) { std::cout << "Loading weights: " << file << std::endl; std::map WeightMap; std::ifstream input(file); assert(input.is_open() && "Unable to load weight file. please check if the .wts file path is right!!!!!!"); int32_t count; input >> count; assert(count > 0 && "Invalid weight map file."); while (count--) { nvinfer1::Weights wt{nvinfer1::DataType::kFLOAT, nullptr, 0}; uint32_t size; std::string name; input >> name >> std::dec >> size; wt.type = nvinfer1::DataType::kFLOAT; uint32_t* val = reinterpret_cast(malloc(sizeof(val) * size)); for (uint32_t x = 0, y = size; x < y; x++) { input >> std::hex >> val[x]; } wt.values = val; wt.count = size; WeightMap[name] = wt; } return WeightMap; } nvinfer1::IScaleLayer* addBatchNorm2d(nvinfer1::INetworkDefinition* network, std::map weightMap, nvinfer1::ITensor& input, std::string lname, float eps) { float* gamma = (float*)weightMap[lname + ".weight"].values; float* beta = (float*)weightMap[lname + ".bias"].values; float* mean = (float*)weightMap[lname + ".running_mean"].values; float* var = (float*)weightMap[lname + ".running_var"].values; int len = weightMap[lname + ".running_var"].count; float* scval = reinterpret_cast(malloc(sizeof(float) * len)); for (int i = 0; i < len; i++) { scval[i] = 
gamma[i] / sqrt(var[i] + eps); } nvinfer1::Weights scale{nvinfer1::DataType::kFLOAT, scval, len}; float* shval = reinterpret_cast(malloc(sizeof(float) * len)); for (int i = 0; i < len; i++) { shval[i] = beta[i] - mean[i] * gamma[i] / sqrt(var[i] + eps); } nvinfer1::Weights shift{nvinfer1::DataType::kFLOAT, shval, len}; float* pval = reinterpret_cast(malloc(sizeof(float) * len)); for (int i = 0; i < len; i++) { pval[i] = 1.0; } nvinfer1::Weights power{nvinfer1::DataType::kFLOAT, pval, len}; weightMap[lname + ".scale"] = scale; weightMap[lname + ".shift"] = shift; weightMap[lname + ".power"] = power; nvinfer1::IScaleLayer* output = network->addScale(input, nvinfer1::ScaleMode::kCHANNEL, shift, scale, power); assert(output); return output; } nvinfer1::IElementWiseLayer* convBnSiLU(nvinfer1::INetworkDefinition* network, std::map weightMap, nvinfer1::ITensor& input, int ch, std::vector k, int s, std::string lname) { nvinfer1::Weights bias_empty{nvinfer1::DataType::kFLOAT, nullptr, 0}; nvinfer1::IConvolutionLayer* conv = network->addConvolutionNd(input, ch, nvinfer1::DimsHW{k[0], k[1]}, weightMap[lname + ".conv.weight"], bias_empty); assert(conv); conv->setStrideNd(nvinfer1::DimsHW{s, s}); // auto pad int p0 = k[0] / 2; int p1 = k[1] / 2; conv->setPaddingNd(nvinfer1::DimsHW{p0, p1}); nvinfer1::IScaleLayer* bn = addBatchNorm2d(network, weightMap, *conv->getOutput(0), lname + ".bn", 1e-3); nvinfer1::IActivationLayer* sigmoid = network->addActivation(*bn->getOutput(0), nvinfer1::ActivationType::kSIGMOID); nvinfer1::IElementWiseLayer* ew = network->addElementWise(*bn->getOutput(0), *sigmoid->getOutput(0), nvinfer1::ElementWiseOperation::kPROD); assert(ew); return ew; } static nvinfer1::ILayer* bottleneck(nvinfer1::INetworkDefinition* network, std::map weightMap, nvinfer1::ITensor& input, int c1, int c2, bool shortcut, std::vector k1, std::vector k2, float e, std::string lname) { int c_ = (int)((float)c2 * e); nvinfer1::IElementWiseLayer* conv1 = convBnSiLU(network, 
weightMap, input, c_, k1, 1, lname + ".cv1"); nvinfer1::IElementWiseLayer* conv2 = convBnSiLU(network, weightMap, *conv1->getOutput(0), c2, k2, 1, lname + ".cv2"); if (shortcut && c1 == c2) { nvinfer1::IElementWiseLayer* ew = network->addElementWise(input, *conv2->getOutput(0), nvinfer1::ElementWiseOperation::kSUM); return ew; } return conv2; } nvinfer1::IElementWiseLayer* SPPF(nvinfer1::INetworkDefinition* network, std::map weightMap, nvinfer1::ITensor& input, int c1, int c2, int k, std::string lname) { int c_ = c1 / 2; nvinfer1::IElementWiseLayer* conv1 = convBnSiLU(network, weightMap, input, c_, {1, 1}, 1, lname + ".cv1"); nvinfer1::IPoolingLayer* pool1 = network->addPoolingNd(*conv1->getOutput(0), nvinfer1::PoolingType::kMAX, nvinfer1::DimsHW{k, k}); pool1->setStrideNd(nvinfer1::DimsHW{1, 1}); pool1->setPaddingNd(nvinfer1::DimsHW{k / 2, k / 2}); nvinfer1::IPoolingLayer* pool2 = network->addPoolingNd(*pool1->getOutput(0), nvinfer1::PoolingType::kMAX, nvinfer1::DimsHW{k, k}); pool2->setStrideNd(nvinfer1::DimsHW{1, 1}); pool2->setPaddingNd(nvinfer1::DimsHW{k / 2, k / 2}); nvinfer1::IPoolingLayer* pool3 = network->addPoolingNd(*pool2->getOutput(0), nvinfer1::PoolingType::kMAX, nvinfer1::DimsHW{k, k}); pool3->setStrideNd(nvinfer1::DimsHW{1, 1}); pool3->setPaddingNd(nvinfer1::DimsHW{k / 2, k / 2}); nvinfer1::ITensor* inputTensors[] = {conv1->getOutput(0), pool1->getOutput(0), pool2->getOutput(0), pool3->getOutput(0)}; nvinfer1::IConcatenationLayer* cat = network->addConcatenation(inputTensors, 4); nvinfer1::IElementWiseLayer* conv2 = convBnSiLU(network, weightMap, *cat->getOutput(0), c2, {1, 1}, 1, lname + ".cv2"); return conv2; } nvinfer1::IShuffleLayer* DFL(nvinfer1::INetworkDefinition* network, std::map weightMap, nvinfer1::ITensor& input, int ch, int grid, int k, int s, int p, std::string lname) { nvinfer1::IShuffleLayer* shuffle1 = network->addShuffle(input); shuffle1->setReshapeDimensions(nvinfer1::Dims4{kBatchSize, 4, 16, grid}); 
shuffle1->setSecondTranspose(nvinfer1::Permutation{0, 2, 1, 3}); nvinfer1::ISoftMaxLayer* softmax = network->addSoftMax(*shuffle1->getOutput(0)); softmax->setAxes(1 << 1); nvinfer1::Weights bias_empty{nvinfer1::DataType::kFLOAT, nullptr, 0}; nvinfer1::IConvolutionLayer* conv = network->addConvolutionNd(*softmax->getOutput(0), 1, nvinfer1::DimsHW{1, 1}, weightMap[lname], bias_empty); conv->setStrideNd(nvinfer1::DimsHW{s, s}); conv->setPaddingNd(nvinfer1::DimsHW{p, p}); nvinfer1::IShuffleLayer* shuffle2 = network->addShuffle(*conv->getOutput(0)); shuffle2->setReshapeDimensions(nvinfer1::Dims3{kBatchSize, 4, grid}); return shuffle2; } nvinfer1::IPluginV2Layer* addYoLoLayer(nvinfer1::INetworkDefinition* network, std::vector dets, const int* px_arry, int px_arry_num, bool is_segmentation, bool is_pose, bool is_obb) { auto creator = getPluginRegistry()->getPluginCreator("YoloLayer_TRT", "1"); const int netinfo_count = 9; // Assuming the first 5 elements are for netinfo as per existing code. const int total_count = netinfo_count + px_arry_num; // Total number of elements for netinfo and px_arry combined. std::vector combinedInfo(total_count); int class_num = kNumClass; if (is_pose) class_num = kPoseNumClass; else if (is_obb) class_num = kObbNumClass; int input_w = kInputW; if (is_obb) input_w = kObbInputW; int input_h = kInputH; if (is_obb) input_h = kObbInputH; // Fill in the first 5 elements as per existing netinfo. combinedInfo[0] = class_num; combinedInfo[1] = kNumberOfPoints; combinedInfo[2] = kConfThreshKeypoints; combinedInfo[3] = input_w; combinedInfo[4] = input_h; combinedInfo[5] = kMaxNumOutputBbox; combinedInfo[6] = is_segmentation; combinedInfo[7] = is_pose; combinedInfo[8] = is_obb; // Copy the contents of px_arry into the combinedInfo vector after the initial 5 elements. std::copy(px_arry, px_arry + px_arry_num, combinedInfo.begin() + netinfo_count); // Now let's create the PluginField object to hold this combined information. 
nvinfer1::PluginField pluginField; pluginField.name = "combinedInfo"; // This can be any name that the plugin will recognize pluginField.data = combinedInfo.data(); pluginField.type = nvinfer1::PluginFieldType::kINT32; pluginField.length = combinedInfo.size(); // Create the PluginFieldCollection to hold the PluginField object. nvinfer1::PluginFieldCollection pluginFieldCollection; pluginFieldCollection.nbFields = 1; // We have just one field, but it's a combined array pluginFieldCollection.fields = &pluginField; // Create the plugin object using the PluginFieldCollection. nvinfer1::IPluginV2* pluginObject = creator->createPlugin("yololayer", &pluginFieldCollection); // We assume that the plugin is to be added onto the network. // Prepare input tensors for the YOLO Layer. std::vector inputTensors; for (auto det : dets) { inputTensors.push_back(det->getOutput(0)); // Assuming each IConcatenationLayer has one output tensor. } // Add the plugin to the network using the prepared input tensors. nvinfer1::IPluginV2Layer* yoloLayer = network->addPluginV2(inputTensors.data(), inputTensors.size(), *pluginObject); return yoloLayer; // Return the added YOLO layer. } static nvinfer1::ILayer* C3k(nvinfer1::INetworkDefinition* network, std::map weightMap, nvinfer1::ITensor& input, int c1, int c2, int n, bool shortcut, std::vector k1, std::vector k2, float e, std::string lname) { int c_ = (int)((float)c2 * e); auto cv1 = convBnSiLU(network, weightMap, input, c_, {1, 1}, 1, lname + ".cv1"); auto cv2 = convBnSiLU(network, weightMap, input, c_, {1, 1}, 1, lname + ".cv2"); nvinfer1::ITensor* y1 = cv1->getOutput(0); for (int i = 0; i < n; i++) { auto b = bottleneck(network, weightMap, *y1, c_, c_, shortcut, k1, k2, 1.0, lname + ".m." 
+ std::to_string(i)); y1 = b->getOutput(0); } nvinfer1::ITensor* inputTensors[] = {y1, cv2->getOutput(0)}; auto cat = network->addConcatenation(inputTensors, 2); auto cv3 = convBnSiLU(network, weightMap, *cat->getOutput(0), c2, {1, 1}, 1, lname + ".cv3"); return cv3; } nvinfer1::IElementWiseLayer* C3K2(nvinfer1::INetworkDefinition* network, std::map& weightMap, nvinfer1::ITensor& input, int c1, int c2, int n, bool c3k, bool shortcut, float e, std::string lname) { int c_ = (float)c2 * e; nvinfer1::IElementWiseLayer* conv1 = convBnSiLU(network, weightMap, input, 2 * c_, {1, 1}, 1, lname + ".cv1"); nvinfer1::Dims d = conv1->getOutput(0)->getDimensions(); nvinfer1::ISliceLayer* split1 = network->addSlice(*conv1->getOutput(0), nvinfer1::Dims4{0, 0, 0, 0}, nvinfer1::Dims4{d.d[0], d.d[1] / 2, d.d[2], d.d[3]}, nvinfer1::Dims4{1, 1, 1, 1}); nvinfer1::ISliceLayer* split2 = network->addSlice(*conv1->getOutput(0), nvinfer1::Dims4{0, d.d[1] / 2, 0, 0}, nvinfer1::Dims4{d.d[0], d.d[1] / 2, d.d[2], d.d[3]}, nvinfer1::Dims4{1, 1, 1, 1}); nvinfer1::ITensor* inputTensor0[] = {split1->getOutput(0), split2->getOutput(0)}; nvinfer1::IConcatenationLayer* cat = network->addConcatenation(inputTensor0, 2); nvinfer1::ITensor* y1 = split2->getOutput(0); for (int i = 0; i < n; i++) { nvinfer1::ILayer* b; if (c3k) { b = C3k(network, weightMap, *y1, c_, c_, 2, shortcut, {3, 3}, {3, 3}, 0.5, lname + ".m." + std::to_string(i)); } else { b = bottleneck(network, weightMap, *y1, c_, c_, shortcut, {3, 3}, {3, 3}, 0.5, lname + ".m." 
+ std::to_string(i)); } y1 = b->getOutput(0); nvinfer1::ITensor* inputTensors[] = {cat->getOutput(0), b->getOutput(0)}; cat = network->addConcatenation(inputTensors, 2); } nvinfer1::IElementWiseLayer* conv2 = convBnSiLU(network, weightMap, *cat->getOutput(0), c2, {1, 1}, 1, lname + ".cv2"); return conv2; } static nvinfer1::ILayer* convBn(nvinfer1::INetworkDefinition* network, std::map weightMap, nvinfer1::ITensor& input, int ch, int k, int s, std::string lname, int g = 1) { nvinfer1::Weights bias_empty{nvinfer1::DataType::kFLOAT, nullptr, 0}; nvinfer1::IConvolutionLayer* conv = network->addConvolutionNd(input, ch, nvinfer1::DimsHW{k, k}, weightMap[lname + ".conv.weight"], bias_empty); assert(conv); conv->setStrideNd(nvinfer1::DimsHW{s, s}); int p = k / 2; conv->setPaddingNd(nvinfer1::DimsHW{p, p}); conv->setNbGroups(g); nvinfer1::IScaleLayer* bn = addBatchNorm2d(network, weightMap, *conv->getOutput(0), lname + ".bn", 1e-3); return bn; } static nvinfer1::ILayer* Attention(nvinfer1::INetworkDefinition* network, std::map weightMap, nvinfer1::ITensor& input, int dim, int num_heads, float attn_ratio, std::string lname) { int head_dim = dim / num_heads; int key_dim = head_dim * attn_ratio; float scale = pow(key_dim, -0.5); int nh_kd = key_dim * num_heads; int h = dim + nh_kd * 2; auto d = input.getDimensions(); int B = d.d[0]; int H = d.d[2]; int W = d.d[3]; int N = H * W; auto* qkv = convBn(network, weightMap, input, h, 1, 1, lname + ".qkv"); // qkv.view(B, self.num_heads, -1, N) auto shuffle = network->addShuffle(*qkv->getOutput(0)); shuffle->setReshapeDimensions(nvinfer1::Dims4{B, num_heads, -1, N}); // q, k, v = .split([self.key_dim, self.key_dim, self.head_dim], dim=2) auto d1 = shuffle->getOutput(0)->getDimensions(); auto q = network->addSlice(*shuffle->getOutput(0), nvinfer1::Dims4{0, 0, 0, 0}, nvinfer1::Dims4{d1.d[0], d1.d[1], key_dim, d1.d[3]}, nvinfer1::Dims4{1, 1, 1, 1}); auto k = network->addSlice(*shuffle->getOutput(0), nvinfer1::Dims4{0, 0, key_dim, 0}, 
nvinfer1::Dims4{d1.d[0], d1.d[1], key_dim, d1.d[3]}, nvinfer1::Dims4{1, 1, 1, 1}); auto v = network->addSlice(*shuffle->getOutput(0), nvinfer1::Dims4{0, 0, key_dim * 2, 0}, nvinfer1::Dims4{d1.d[0], d1.d[1], head_dim, d1.d[3]}, nvinfer1::Dims4{1, 1, 1, 1}); // attn = ((q.transpose(-2, -1) @ k) * self.scale) auto qT = network->addShuffle(*q->getOutput(0)); qT->setFirstTranspose(nvinfer1::Permutation{0, 1, 3, 2}); auto matmul = network->addMatrixMultiply(*qT->getOutput(0), nvinfer1::MatrixOperation::kNONE, *k->getOutput(0), nvinfer1::MatrixOperation::kNONE); // There are not many memory leaks, and I will change it when I have time float* scale_val = reinterpret_cast(malloc(sizeof(float) * 1)); scale_val[0] = scale; nvinfer1::Weights s_w{nvinfer1::DataType::kFLOAT, scale_val, 1}; float* shift_val = reinterpret_cast(malloc(sizeof(float) * 1)); shift_val[0] = 0; nvinfer1::Weights sh_w{nvinfer1::DataType::kFLOAT, shift_val, 1}; float* power_val = reinterpret_cast(malloc(sizeof(float) * 1)); power_val[0] = 1; nvinfer1::Weights p_w{nvinfer1::DataType::kFLOAT, power_val, 1}; nvinfer1::IScaleLayer* scaleLayer = network->addScale(*matmul->getOutput(0), nvinfer1::ScaleMode::kUNIFORM, sh_w, s_w, p_w); // attn = attn.softmax(dim=-1) nvinfer1::ISoftMaxLayer* softmax = network->addSoftMax(*scaleLayer->getOutput(0)); softmax->setAxes(1 << 3); // x = (v @ attn.transpose(-2, -1)).view(B, -1, H, W) + self.pe(v.reshape(B, -1, H, W)) auto attnT = network->addShuffle(*softmax->getOutput(0)); attnT->setFirstTranspose(nvinfer1::Permutation{0, 1, 3, 2}); auto matmul2 = network->addMatrixMultiply(*v->getOutput(0), nvinfer1::MatrixOperation::kNONE, *attnT->getOutput(0), nvinfer1::MatrixOperation::kNONE); auto reshape = network->addShuffle(*matmul2->getOutput(0)); reshape->setReshapeDimensions(nvinfer1::Dims4{B, -1, H, W}); auto v_reshape = network->addShuffle(*v->getOutput(0)); v_reshape->setReshapeDimensions(nvinfer1::Dims4{B, -1, H, W}); // self.pe = Conv(dim, dim, 3, 1, g=dim, act=False) 
auto pe = convBn(network, weightMap, *v_reshape->getOutput(0), dim, 3, 1, lname + ".pe", dim); auto sum = network->addElementWise(*reshape->getOutput(0), *pe->getOutput(0), nvinfer1::ElementWiseOperation::kSUM); // x = self.proj(x) // self.proj = Conv(dim, dim, 1, act=False) auto proj = convBn(network, weightMap, *sum->getOutput(0), dim, 1, 1, lname + ".proj"); return proj; } static nvinfer1::ILayer* PSABlock(nvinfer1::INetworkDefinition* network, std::map weightMap, nvinfer1::ITensor& input, int dim, float attn_ratio, int num_heads, bool shortcut, std::string lname) { // x = x + self.attn(x) if self.add else self.attn(x) auto attn = Attention(network, weightMap, input, dim, num_heads, attn_ratio, lname + ".attn"); nvinfer1::ILayer* shortcut_layer = nullptr; if (shortcut) { shortcut_layer = network->addElementWise(input, *attn->getOutput(0), nvinfer1::ElementWiseOperation::kSUM); } else { shortcut_layer = attn; } // self.ffn = nn.Sequential(Conv(c, c * 2, 1), Conv(c * 2, c, 1, act=False)) // x = x + self.ffn(x) if self.add else self.ffn(x) auto ffn0 = convBnSiLU(network, weightMap, *shortcut_layer->getOutput(0), dim * 2, {1, 1}, 1, lname + ".ffn.0"); auto ffn1 = convBn(network, weightMap, *ffn0->getOutput(0), dim, 1, 1, lname + ".ffn.1"); if (shortcut) { return network->addElementWise(*shortcut_layer->getOutput(0), *ffn1->getOutput(0), nvinfer1::ElementWiseOperation::kSUM); } else { return ffn1; } } nvinfer1::ILayer* C2PSA(nvinfer1::INetworkDefinition* network, std::map& weightMap, nvinfer1::ITensor& input, int c1, int c2, int n, float e, std::string lname) { assert(network != nullptr); int c = c1 * e; // cv1 branch nvinfer1::IElementWiseLayer* conv1 = convBnSiLU(network, weightMap, input, 2 * c, {1, 1}, 1, lname + ".cv1"); nvinfer1::ITensor* cv1_out = conv1->getOutput(0); // Split the output of cv1 into two tensors nvinfer1::Dims dims = cv1_out->getDimensions(); nvinfer1::ISliceLayer* split1 = network->addSlice(*cv1_out, nvinfer1::Dims4{0, 0, 0, 0}, 
nvinfer1::Dims4{dims.d[0], dims.d[1] / 2, dims.d[2], dims.d[3]}, nvinfer1::Dims4{1, 1, 1, 1}); nvinfer1::ISliceLayer* split2 = network->addSlice(*cv1_out, nvinfer1::Dims4{0, dims.d[1] / 2, 0, 0}, nvinfer1::Dims4{dims.d[0], dims.d[1] / 2, dims.d[2], dims.d[3]}, nvinfer1::Dims4{1, 1, 1, 1}); // Create y1 bottleneck sequence nvinfer1::ITensor* y = split2->getOutput(0); for (int i = 0; i < n; ++i) { auto* bottleneck_layer = PSABlock(network, weightMap, *y, c, 0.5, c / 64, true, lname + ".m." + std::to_string(i)); y = bottleneck_layer->getOutput(0); // update 'y1' to be the output of the current bottleneck } // Concatenate y1 with the second split of cv1 nvinfer1::ITensor* concatInputs[2] = {split1->getOutput(0), y}; nvinfer1::IConcatenationLayer* cat = network->addConcatenation(concatInputs, 2); // cv2 to produce the final output nvinfer1::IElementWiseLayer* conv2 = convBnSiLU(network, weightMap, *cat->getOutput(0), c2, {1, 1}, 1, lname + ".cv2"); return conv2; } nvinfer1::ILayer* DWConv(nvinfer1::INetworkDefinition* network, std::map weightMap, nvinfer1::ITensor& input, int ch, std::vector k, int s, std::string lname) { nvinfer1::Weights bias_empty{nvinfer1::DataType::kFLOAT, nullptr, 0}; nvinfer1::IConvolutionLayer* conv = network->addConvolutionNd(input, ch, nvinfer1::DimsHW{k[0], k[1]}, weightMap[lname + ".conv.weight"], bias_empty); assert(conv); conv->setStrideNd(nvinfer1::DimsHW{s, s}); conv->setNbGroups(ch); // auto pad int p0 = k[0] / 2; int p1 = k[1] / 2; conv->setPaddingNd(nvinfer1::DimsHW{p0, p1}); nvinfer1::IScaleLayer* bn = addBatchNorm2d(network, weightMap, *conv->getOutput(0), lname + ".bn", 1e-3); nvinfer1::IActivationLayer* sigmoid = network->addActivation(*bn->getOutput(0), nvinfer1::ActivationType::kSIGMOID); nvinfer1::IElementWiseLayer* ew = network->addElementWise(*bn->getOutput(0), *sigmoid->getOutput(0), nvinfer1::ElementWiseOperation::kPROD); assert(ew); return ew; } ================================================ FILE: 
yolo11/src/calibrator.cpp ================================================ #include "calibrator.h" #include #include #include #include #include "cuda_utils.h" #include "utils.h" Int8EntropyCalibrator2::Int8EntropyCalibrator2(int batchsize, int input_w, int input_h, const char* img_dir, const char* calib_table_name, const char* input_blob_name, bool read_cache) : batchsize_(batchsize), input_w_(input_w), input_h_(input_h), img_idx_(0), img_dir_(img_dir), calib_table_name_(calib_table_name), input_blob_name_(input_blob_name), read_cache_(read_cache) { input_count_ = 3 * input_w * input_h * batchsize; CUDA_CHECK(cudaMalloc(&device_input_, input_count_ * sizeof(float))); read_files_in_dir(img_dir, img_files_); } Int8EntropyCalibrator2::~Int8EntropyCalibrator2() { CUDA_CHECK(cudaFree(device_input_)); } int Int8EntropyCalibrator2::getBatchSize() const TRT_NOEXCEPT { return batchsize_; } bool Int8EntropyCalibrator2::getBatch(void* bindings[], const char* names[], int nbBindings) TRT_NOEXCEPT { if (img_idx_ + batchsize_ > (int)img_files_.size()) { return false; } std::vector input_imgs_; for (int i = img_idx_; i < img_idx_ + batchsize_; i++) { std::cout << img_files_[i] << " " << i << std::endl; cv::Mat temp = cv::imread(img_dir_ + "/" + img_files_[i]); if (temp.empty()) { std::cerr << "Fatal error: image cannot open!" 
<< std::endl; return false; } cv::Mat pr_img = preprocess_img(temp, input_w_, input_h_); input_imgs_.push_back(pr_img); } img_idx_ += batchsize_; cv::Mat blob = cv::dnn::blobFromImages(input_imgs_, 1.0 / 255.0, cv::Size(input_w_, input_h_), cv::Scalar(0, 0, 0), true, false); CUDA_CHECK(cudaMemcpy(device_input_, blob.ptr(0), input_count_ * sizeof(float), cudaMemcpyHostToDevice)); assert(!strcmp(names[0], input_blob_name_)); bindings[0] = device_input_; return true; } const void* Int8EntropyCalibrator2::readCalibrationCache(size_t& length) TRT_NOEXCEPT { std::cout << "reading calib cache: " << calib_table_name_ << std::endl; calib_cache_.clear(); std::ifstream input(calib_table_name_, std::ios::binary); input >> std::noskipws; if (read_cache_ && input.good()) { std::copy(std::istream_iterator(input), std::istream_iterator(), std::back_inserter(calib_cache_)); } length = calib_cache_.size(); return length ? calib_cache_.data() : nullptr; } void Int8EntropyCalibrator2::writeCalibrationCache(const void* cache, size_t length) TRT_NOEXCEPT { std::cout << "writing calib cache: " << calib_table_name_ << " size: " << length << std::endl; std::ofstream output(calib_table_name_, std::ios::binary); output.write(reinterpret_cast(cache), length); } ================================================ FILE: yolo11/src/model.cpp ================================================ #include #include #include "block.h" #include "calibrator.h" #include "config.h" #include "model.h" static int get_width(int x, float gw, int max_channels, int divisor = 8) { auto channel = std::min(x, max_channels); channel = int(ceil((channel * gw) / divisor)) * divisor; return channel; } static int get_depth(int x, float gd) { if (x == 1) return 1; int r = round(x * gd); if (x * gd - int(x * gd) == 0.5 && (int(x * gd) % 2) == 0) --r; return std::max(r, 1); } void calculateStrides(nvinfer1::IElementWiseLayer* conv_layers[], int size, int reference_size, int strides[]) { for (int i = 0; i < size; ++i) { 
nvinfer1::ILayer* layer = conv_layers[i]; nvinfer1::Dims dims = layer->getOutput(0)->getDimensions(); int feature_map_size = dims.d[2]; strides[i] = reference_size / feature_map_size; } } nvinfer1::IHostMemory* buildEngineYolo11Cls(nvinfer1::IBuilder* builder, nvinfer1::IBuilderConfig* config, nvinfer1::DataType dt, const std::string& wts_path, float& gd, float& gw, std::string& type, int max_channels) { std::map weightMap = loadWeights(wts_path); // nvinfer1::INetworkDefinition *network = builder->createNetworkV2(0U); nvinfer1::INetworkDefinition* network = builder->createNetworkV2( 1U << static_cast(nvinfer1::NetworkDefinitionCreationFlag::kEXPLICIT_BATCH)); // ****************************************** YOLO11 INPUT ********************************************** nvinfer1::ITensor* data = network->addInput(kInputTensorName, dt, nvinfer1::Dims4{kBatchSize, 3, kClsInputH, kClsInputW}); assert(data); // ***************************************** YOLO11 BACKBONE ******************************************** nvinfer1::IElementWiseLayer* conv0 = convBnSiLU(network, weightMap, *data, get_width(64, gw, max_channels), {3, 3}, 2, "model.0"); nvinfer1::IElementWiseLayer* conv1 = convBnSiLU(network, weightMap, *conv0->getOutput(0), get_width(128, gw, max_channels), {3, 3}, 2, "model.1"); bool c3k = false; if (type == "m" || type == "l" || type == "x") { c3k = true; } nvinfer1::IElementWiseLayer* conv2 = C3K2(network, weightMap, *conv1->getOutput(0), get_width(128, gw, max_channels), get_width(256, gw, max_channels), get_depth(2, gd), c3k, true, 0.25, "model.2"); nvinfer1::IElementWiseLayer* conv3 = convBnSiLU(network, weightMap, *conv2->getOutput(0), get_width(256, gw, max_channels), {3, 3}, 2, "model.3"); // 22466 nvinfer1::IElementWiseLayer* conv4 = C3K2(network, weightMap, *conv3->getOutput(0), get_width(256, gw, max_channels), get_width(512, gw, max_channels), get_depth(2, gd), c3k, true, 0.25, "model.4"); nvinfer1::IElementWiseLayer* conv5 = convBnSiLU(network, weightMap, 
*conv4->getOutput(0), get_width(512, gw, max_channels), {3, 3}, 2, "model.5"); // 22466 nvinfer1::IElementWiseLayer* conv6 = C3K2(network, weightMap, *conv5->getOutput(0), get_width(512, gw, max_channels), get_width(512, gw, max_channels), get_depth(2, gd), true, true, 0.5, "model.6"); nvinfer1::IElementWiseLayer* conv7 = convBnSiLU(network, weightMap, *conv6->getOutput(0), get_width(1024, gw, max_channels), {3, 3}, 2, "model.7"); // 11233 nvinfer1::IElementWiseLayer* conv8 = C3K2(network, weightMap, *conv7->getOutput(0), get_width(1024, gw, max_channels), get_width(1024, gw, max_channels), get_depth(2, gd), true, true, 0.5, "model.8"); auto* conv9 = C2PSA(network, weightMap, *conv8->getOutput(0), get_width(1024, gw, max_channels), get_width(1024, gw, max_channels), get_depth(2, gd), 0.5, "model.9"); // ********************************************* YOLO11 HEAD ********************************************* auto conv_class = convBnSiLU(network, weightMap, *conv9->getOutput(0), 1280, {1, 1}, 1, "model.10.conv"); // Adjusted code nvinfer1::Dims dims = conv_class->getOutput(0)->getDimensions(); // Obtain the dimensions of the output of conv_class assert(dims.nbDims == 4); // Make sure there are exactly 3 dimensions (channels, height, width) nvinfer1::IPoolingLayer* pool2 = network->addPoolingNd(*conv_class->getOutput(0), nvinfer1::PoolingType::kAVERAGE, nvinfer1::DimsHW{dims.d[2], dims.d[3]}); assert(pool2); // Fully connected layer declaration auto shuffle_0 = network->addShuffle(*pool2->getOutput(0)); shuffle_0->setReshapeDimensions(nvinfer1::Dims2{kBatchSize, 1280}); auto linear_weight = weightMap["model.10.linear.weight"]; auto constant_weight = network->addConstant(nvinfer1::Dims2{kClsNumClass, 1280}, linear_weight); auto constant_bias = network->addConstant(nvinfer1::Dims2{kBatchSize, kClsNumClass}, weightMap["model.10.linear.bias"]); auto linear_matrix_multipy = network->addMatrixMultiply(*shuffle_0->getOutput(0), nvinfer1::MatrixOperation::kNONE, 
*constant_weight->getOutput(0), nvinfer1::MatrixOperation::kTRANSPOSE); auto yolo = network->addElementWise(*linear_matrix_multipy->getOutput(0), *constant_bias->getOutput(0), nvinfer1::ElementWiseOperation::kSUM); assert(yolo); // Set the name for the output tensor and mark it as network output yolo->getOutput(0)->setName(kOutputTensorName); network->markOutput(*yolo->getOutput(0)); // Set the maximum batch size and workspace size config->setMemoryPoolLimit(nvinfer1::MemoryPoolType::kWORKSPACE, 16 * (1 << 20)); // Configuration according to the precision mode being used #if defined(USE_FP16) config->setFlag(nvinfer1::BuilderFlag::kFP16); #elif defined(USE_INT8) std::cout << "Your platform supports int8: " << (builder->platformHasFastInt8() ? "true" : "false") << std::endl; assert(builder->platformHasFastInt8()); config->setFlag(nvinfer1::BuilderFlag::kINT8); auto* calibrator = new Int8EntropyCalibrator2(1, kClsInputW, kClsInputH, kInputQuantizationFolder, "int8calib.table", kInputTensorName); config->setInt8Calibrator(calibrator); #endif // Begin building the engine; this may take a while std::cout << "Building engine, please wait for a while..." << std::endl; nvinfer1::IHostMemory* serialized_model = builder->buildSerializedNetwork(*network, *config); std::cout << "Build engine successfully!" 
<< std::endl; // Cleanup the network definition and allocated weights delete network; for (auto& mem : weightMap) { free((void*)(mem.second.values)); } return serialized_model; } nvinfer1::IHostMemory* buildEngineYolo11Det(nvinfer1::IBuilder* builder, nvinfer1::IBuilderConfig* config, nvinfer1::DataType dt, const std::string& wts_path, float& gd, float& gw, int& max_channels, std::string& type) { std::map weightMap = loadWeights(wts_path); // nvinfer1::INetworkDefinition *network = builder->createNetworkV2(0U); nvinfer1::INetworkDefinition* network = builder->createNetworkV2( 1U << static_cast(nvinfer1::NetworkDefinitionCreationFlag::kEXPLICIT_BATCH)); /******************************************************************************************************* ****************************************** YOLO11 INPUT ********************************************** *******************************************************************************************************/ nvinfer1::ITensor* data = network->addInput(kInputTensorName, dt, nvinfer1::Dims4{kBatchSize, 3, kInputH, kInputW}); assert(data); /******************************************************************************************************* ***************************************** YOLO11 BACKBONE ******************************************** *******************************************************************************************************/ nvinfer1::IElementWiseLayer* conv0 = convBnSiLU(network, weightMap, *data, get_width(64, gw, max_channels), {3, 3}, 2, "model.0"); nvinfer1::IElementWiseLayer* conv1 = convBnSiLU(network, weightMap, *conv0->getOutput(0), get_width(128, gw, max_channels), {3, 3}, 2, "model.1"); // 11233 bool c3k = false; if (type == "m" || type == "l" || type == "x") { c3k = true; } nvinfer1::IElementWiseLayer* conv2 = C3K2(network, weightMap, *conv1->getOutput(0), get_width(128, gw, max_channels), get_width(256, gw, max_channels), get_depth(2, gd), c3k, true, 0.25, "model.2"); 
nvinfer1::IElementWiseLayer* conv3 = convBnSiLU(network, weightMap, *conv2->getOutput(0), get_width(256, gw, max_channels), {3, 3}, 2, "model.3"); // 22466 nvinfer1::IElementWiseLayer* conv4 = C3K2(network, weightMap, *conv3->getOutput(0), get_width(256, gw, max_channels), get_width(512, gw, max_channels), get_depth(2, gd), c3k, true, 0.25, "model.4"); nvinfer1::IElementWiseLayer* conv5 = convBnSiLU(network, weightMap, *conv4->getOutput(0), get_width(512, gw, max_channels), {3, 3}, 2, "model.5"); // 22466 nvinfer1::IElementWiseLayer* conv6 = C3K2(network, weightMap, *conv5->getOutput(0), get_width(512, gw, max_channels), get_width(512, gw, max_channels), get_depth(2, gd), true, true, 0.5, "model.6"); nvinfer1::IElementWiseLayer* conv7 = convBnSiLU(network, weightMap, *conv6->getOutput(0), get_width(1024, gw, max_channels), {3, 3}, 2, "model.7"); // 11233 nvinfer1::IElementWiseLayer* conv8 = C3K2(network, weightMap, *conv7->getOutput(0), get_width(1024, gw, max_channels), get_width(1024, gw, max_channels), get_depth(2, gd), true, true, 0.5, "model.8"); nvinfer1::IElementWiseLayer* conv9 = SPPF(network, weightMap, *conv8->getOutput(0), get_width(1024, gw, max_channels), get_width(1024, gw, max_channels), 5, "model.9"); auto* conv10 = C2PSA(network, weightMap, *conv9->getOutput(0), get_width(1024, gw, max_channels), get_width(1024, gw, max_channels), get_depth(2, gd), 0.5, "model.10"); /******************************************************************************************************* ********************************************* YOLO11 HEAD ******************************************** *******************************************************************************************************/ float scale[] = {1.0, 1.0, 2.0, 2.0}; nvinfer1::IResizeLayer* upsample11 = network->addResize(*conv10->getOutput(0)); assert(upsample11); upsample11->setResizeMode(nvinfer1::InterpolationMode::kNEAREST); upsample11->setScales(scale, 4); nvinfer1::ITensor* inputTensor12[] = 
{upsample11->getOutput(0), conv6->getOutput(0)}; nvinfer1::IConcatenationLayer* cat12 = network->addConcatenation(inputTensor12, 2); nvinfer1::IElementWiseLayer* conv13 = C3K2(network, weightMap, *cat12->getOutput(0), get_width(512, gw, max_channels), get_width(512, gw, max_channels), get_depth(2, gd), c3k, true, 0.5, "model.13"); nvinfer1::IResizeLayer* upsample14 = network->addResize(*conv13->getOutput(0)); assert(upsample14); upsample14->setResizeMode(nvinfer1::InterpolationMode::kNEAREST); upsample14->setScales(scale, 4); nvinfer1::ITensor* inputTensor15[] = {upsample14->getOutput(0), conv4->getOutput(0)}; nvinfer1::IConcatenationLayer* cat15 = network->addConcatenation(inputTensor15, 2); nvinfer1::IElementWiseLayer* conv16 = C3K2(network, weightMap, *cat15->getOutput(0), get_width(256, gw, max_channels), get_width(256, gw, max_channels), get_depth(2, gd), c3k, true, 0.5, "model.16"); nvinfer1::IElementWiseLayer* conv17 = convBnSiLU(network, weightMap, *conv16->getOutput(0), get_width(256, gw, max_channels), {3, 3}, 2, "model.17"); nvinfer1::ITensor* inputTensor18[] = {conv17->getOutput(0), conv13->getOutput(0)}; nvinfer1::IConcatenationLayer* cat18 = network->addConcatenation(inputTensor18, 2); nvinfer1::IElementWiseLayer* conv19 = C3K2(network, weightMap, *cat18->getOutput(0), get_width(512, gw, max_channels), get_width(512, gw, max_channels), get_depth(2, gd), c3k, true, 0.5, "model.19"); nvinfer1::IElementWiseLayer* conv20 = convBnSiLU(network, weightMap, *conv19->getOutput(0), get_width(512, gw, max_channels), {3, 3}, 2, "model.20"); nvinfer1::ITensor* inputTensor21[] = {conv20->getOutput(0), conv10->getOutput(0)}; nvinfer1::IConcatenationLayer* cat21 = network->addConcatenation(inputTensor21, 2); nvinfer1::IElementWiseLayer* conv22 = C3K2(network, weightMap, *cat21->getOutput(0), get_width(1024, gw, max_channels), get_width(1024, gw, max_channels), get_depth(2, gd), true, true, 0.5, "model.22"); 
/******************************************************************************************************* ********************************************* YOLO11 OUTPUT ****************************************** *******************************************************************************************************/ // c2, c3 = max((16, ch[0] // 4, self.reg_max * 4)), max(ch[0], min(self.nc, 100)) # channels int c2 = std::max(std::max(16, get_width(256, gw, max_channels) / 4), 16 * 4); int c3 = std::max(get_width(256, gw, max_channels), std::min(kNumClass, 100)); // output0 nvinfer1::IElementWiseLayer* conv23_cv2_0_0 = convBnSiLU(network, weightMap, *conv16->getOutput(0), c2, {3, 3}, 1, "model.23.cv2.0.0"); nvinfer1::IElementWiseLayer* conv23_cv2_0_1 = convBnSiLU(network, weightMap, *conv23_cv2_0_0->getOutput(0), c2, {3, 3}, 1, "model.23.cv2.0.1"); nvinfer1::IConvolutionLayer* conv23_cv2_0_2 = network->addConvolutionNd(*conv23_cv2_0_1->getOutput(0), 64, nvinfer1::DimsHW{1, 1}, weightMap["model.23.cv2.0.2.weight"], weightMap["model.23.cv2.0.2.bias"]); conv23_cv2_0_2->setStrideNd(nvinfer1::DimsHW{1, 1}); conv23_cv2_0_2->setPaddingNd(nvinfer1::DimsHW{0, 0}); auto* conv23_cv3_0_0_0 = DWConv(network, weightMap, *conv16->getOutput(0), get_width(256, gw, max_channels), {3, 3}, 1, "model.23.cv3.0.0.0"); auto* conv23_cv3_0_0_1 = convBnSiLU(network, weightMap, *conv23_cv3_0_0_0->getOutput(0), c3, {1, 1}, 1, "model.23.cv3.0.0.1"); auto* conv23_cv3_0_1_0 = DWConv(network, weightMap, *conv23_cv3_0_0_1->getOutput(0), c3, {3, 3}, 1, "model.23.cv3.0.1.0"); auto* conv23_cv3_0_1_1 = convBnSiLU(network, weightMap, *conv23_cv3_0_1_0->getOutput(0), c3, {1, 1}, 1, "model.23.cv3.0.1.1"); nvinfer1::IConvolutionLayer* conv23_cv3_0_2 = network->addConvolutionNd(*conv23_cv3_0_1_1->getOutput(0), kNumClass, nvinfer1::DimsHW{1, 1}, weightMap["model.23.cv3.0.2.weight"], weightMap["model.23.cv3.0.2.bias"]); conv23_cv3_0_2->setStrideNd(nvinfer1::DimsHW{1, 1}); 
conv23_cv3_0_2->setPaddingNd(nvinfer1::DimsHW{0, 0}); nvinfer1::ITensor* inputTensor23_0[] = {conv23_cv2_0_2->getOutput(0), conv23_cv3_0_2->getOutput(0)}; nvinfer1::IConcatenationLayer* cat23_0 = network->addConcatenation(inputTensor23_0, 2); // output1 nvinfer1::IElementWiseLayer* conv23_cv2_1_0 = convBnSiLU(network, weightMap, *conv19->getOutput(0), c2, {3, 3}, 1, "model.23.cv2.1.0"); nvinfer1::IElementWiseLayer* conv23_cv2_1_1 = convBnSiLU(network, weightMap, *conv23_cv2_1_0->getOutput(0), c2, {3, 3}, 1, "model.23.cv2.1.1"); nvinfer1::IConvolutionLayer* conv23_cv2_1_2 = network->addConvolutionNd(*conv23_cv2_1_1->getOutput(0), 64, nvinfer1::DimsHW{1, 1}, weightMap["model.23.cv2.1.2.weight"], weightMap["model.23.cv2.1.2.bias"]); conv23_cv2_1_2->setStrideNd(nvinfer1::DimsHW{1, 1}); conv23_cv2_1_2->setPaddingNd(nvinfer1::DimsHW{0, 0}); auto* conv23_cv3_1_0_0 = DWConv(network, weightMap, *conv19->getOutput(0), get_width(512, gw, max_channels), {3, 3}, 1, "model.23.cv3.1.0.0"); auto* conv23_cv3_1_0_1 = convBnSiLU(network, weightMap, *conv23_cv3_1_0_0->getOutput(0), c3, {1, 1}, 1, "model.23.cv3.1.0.1"); auto* conv23_cv3_1_1_0 = DWConv(network, weightMap, *conv23_cv3_1_0_1->getOutput(0), c3, {3, 3}, 1, "model.23.cv3.1.1.0"); auto* conv23_cv3_1_1_1 = convBnSiLU(network, weightMap, *conv23_cv3_1_1_0->getOutput(0), c3, {1, 1}, 1, "model.23.cv3.1.1.1"); nvinfer1::IConvolutionLayer* conv23_cv3_1_2 = network->addConvolutionNd(*conv23_cv3_1_1_1->getOutput(0), kNumClass, nvinfer1::DimsHW{1, 1}, weightMap["model.23.cv3.1.2.weight"], weightMap["model.23.cv3.1.2.bias"]); conv23_cv3_1_2->setStrideNd(nvinfer1::DimsHW{1, 1}); conv23_cv3_1_2->setPaddingNd(nvinfer1::DimsHW{0, 0}); nvinfer1::ITensor* inputTensor23_1[] = {conv23_cv2_1_2->getOutput(0), conv23_cv3_1_2->getOutput(0)}; nvinfer1::IConcatenationLayer* cat23_1 = network->addConcatenation(inputTensor23_1, 2); // output2 nvinfer1::IElementWiseLayer* conv23_cv2_2_0 = convBnSiLU(network, weightMap, *conv22->getOutput(0), c2, {3, 
3}, 1, "model.23.cv2.2.0"); nvinfer1::IElementWiseLayer* conv23_cv2_2_1 = convBnSiLU(network, weightMap, *conv23_cv2_2_0->getOutput(0), c2, {3, 3}, 1, "model.23.cv2.2.1"); nvinfer1::IConvolutionLayer* conv23_cv2_2_2 = network->addConvolutionNd(*conv23_cv2_2_1->getOutput(0), 64, nvinfer1::DimsHW{1, 1}, weightMap["model.23.cv2.2.2.weight"], weightMap["model.23.cv2.2.2.bias"]); conv23_cv2_2_2->setStrideNd(nvinfer1::DimsHW{1, 1}); conv23_cv2_2_2->setPaddingNd(nvinfer1::DimsHW{0, 0}); auto* conv23_cv3_2_0_0 = DWConv(network, weightMap, *conv22->getOutput(0), get_width(1024, gw, max_channels), {3, 3}, 1, "model.23.cv3.2.0.0"); auto* conv23_cv3_2_0_1 = convBnSiLU(network, weightMap, *conv23_cv3_2_0_0->getOutput(0), c3, {1, 1}, 1, "model.23.cv3.2.0.1"); auto* conv23_cv3_2_1_0 = DWConv(network, weightMap, *conv23_cv3_2_0_1->getOutput(0), c3, {3, 3}, 1, "model.23.cv3.2.1.0"); auto* conv23_cv3_2_1_1 = convBnSiLU(network, weightMap, *conv23_cv3_2_1_0->getOutput(0), c3, {1, 1}, 1, "model.23.cv3.2.1.1"); nvinfer1::IConvolutionLayer* conv23_cv3_2_2 = network->addConvolutionNd(*conv23_cv3_2_1_1->getOutput(0), kNumClass, nvinfer1::DimsHW{1, 1}, weightMap["model.23.cv3.2.2.weight"], weightMap["model.23.cv3.2.2.bias"]); conv23_cv3_2_2->setStrideNd(nvinfer1::DimsHW{1, 1}); conv23_cv3_2_2->setPaddingNd(nvinfer1::DimsHW{0, 0}); nvinfer1::ITensor* inputTensor23_2[] = {conv23_cv2_2_2->getOutput(0), conv23_cv3_2_2->getOutput(0)}; nvinfer1::IConcatenationLayer* cat23_2 = network->addConcatenation(inputTensor23_2, 2); /******************************************************************************************************* ********************************************* YOLO11 DETECT ****************************************** *******************************************************************************************************/ nvinfer1::IElementWiseLayer* conv_layers[] = {conv3, conv5, conv7}; int strides[sizeof(conv_layers) / sizeof(conv_layers[0])]; calculateStrides(conv_layers, 
sizeof(conv_layers) / sizeof(conv_layers[0]), kInputH, strides); int stridesLength = sizeof(strides) / sizeof(int); nvinfer1::IShuffleLayer* shuffle23_0 = network->addShuffle(*cat23_0->getOutput(0)); shuffle23_0->setReshapeDimensions( nvinfer1::Dims3{kBatchSize, 64 + kNumClass, (kInputH / strides[0]) * (kInputW / strides[0])}); nvinfer1::ISliceLayer* split23_0_0 = network->addSlice( *shuffle23_0->getOutput(0), nvinfer1::Dims3{0, 0, 0}, nvinfer1::Dims3{kBatchSize, 64, (kInputH / strides[0]) * (kInputW / strides[0])}, nvinfer1::Dims3{1, 1, 1}); nvinfer1::ISliceLayer* split23_0_1 = network->addSlice(*shuffle23_0->getOutput(0), nvinfer1::Dims3{0, 64, 0}, nvinfer1::Dims3{kBatchSize, kNumClass, (kInputH / strides[0]) * (kInputW / strides[0])}, nvinfer1::Dims3{1, 1, 1}); nvinfer1::IShuffleLayer* dfl23_0 = DFL(network, weightMap, *split23_0_0->getOutput(0), 4, (kInputH / strides[0]) * (kInputW / strides[0]), 1, 1, 0, "model.23.dfl.conv.weight"); nvinfer1::ITensor* inputTensor22_dfl_0[] = {dfl23_0->getOutput(0), split23_0_1->getOutput(0)}; nvinfer1::IConcatenationLayer* cat22_dfl_0 = network->addConcatenation(inputTensor22_dfl_0, 2); cat22_dfl_0->setAxis(1); nvinfer1::IShuffleLayer* shuffle23_1 = network->addShuffle(*cat23_1->getOutput(0)); shuffle23_1->setReshapeDimensions( nvinfer1::Dims3{kBatchSize, 64 + kNumClass, (kInputH / strides[1]) * (kInputW / strides[1])}); nvinfer1::ISliceLayer* split23_1_0 = network->addSlice( *shuffle23_1->getOutput(0), nvinfer1::Dims3{0, 0, 0}, nvinfer1::Dims3{kBatchSize, 64, (kInputH / strides[1]) * (kInputW / strides[1])}, nvinfer1::Dims3{1, 1, 1}); nvinfer1::ISliceLayer* split23_1_1 = network->addSlice(*shuffle23_1->getOutput(0), nvinfer1::Dims3{0, 64, 0}, nvinfer1::Dims3{kBatchSize, kNumClass, (kInputH / strides[1]) * (kInputW / strides[1])}, nvinfer1::Dims3{1, 1, 1}); nvinfer1::IShuffleLayer* dfl23_1 = DFL(network, weightMap, *split23_1_0->getOutput(0), 4, (kInputH / strides[1]) * (kInputW / strides[1]), 1, 1, 0, 
"model.23.dfl.conv.weight"); nvinfer1::ITensor* inputTensor22_dfl_1[] = {dfl23_1->getOutput(0), split23_1_1->getOutput(0)}; nvinfer1::IConcatenationLayer* cat22_dfl_1 = network->addConcatenation(inputTensor22_dfl_1, 2); cat22_dfl_1->setAxis(1); nvinfer1::IShuffleLayer* shuffle23_2 = network->addShuffle(*cat23_2->getOutput(0)); shuffle23_2->setReshapeDimensions( nvinfer1::Dims3{kBatchSize, 64 + kNumClass, (kInputH / strides[2]) * (kInputW / strides[2])}); nvinfer1::ISliceLayer* split23_2_0 = network->addSlice( *shuffle23_2->getOutput(0), nvinfer1::Dims3{0, 0, 0}, nvinfer1::Dims3{kBatchSize, 64, (kInputH / strides[2]) * (kInputW / strides[2])}, nvinfer1::Dims3{1, 1, 1}); nvinfer1::ISliceLayer* split23_2_1 = network->addSlice(*shuffle23_2->getOutput(0), nvinfer1::Dims3{0, 64, 0}, nvinfer1::Dims3{kBatchSize, kNumClass, (kInputH / strides[2]) * (kInputW / strides[2])}, nvinfer1::Dims3{1, 1, 1}); nvinfer1::IShuffleLayer* dfl23_2 = DFL(network, weightMap, *split23_2_0->getOutput(0), 4, (kInputH / strides[2]) * (kInputW / strides[2]), 1, 1, 0, "model.23.dfl.conv.weight"); nvinfer1::ITensor* inputTensor22_dfl_2[] = {dfl23_2->getOutput(0), split23_2_1->getOutput(0)}; nvinfer1::IConcatenationLayer* cat22_dfl_2 = network->addConcatenation(inputTensor22_dfl_2, 2); cat22_dfl_2->setAxis(1); nvinfer1::IPluginV2Layer* yolo = addYoLoLayer(network, std::vector{cat22_dfl_0, cat22_dfl_1, cat22_dfl_2}, strides, stridesLength, false, false, false); yolo->getOutput(0)->setName(kOutputTensorName); network->markOutput(*yolo->getOutput(0)); config->setMemoryPoolLimit(nvinfer1::MemoryPoolType::kWORKSPACE, 16 * (1 << 20)); #if defined(USE_FP16) config->setFlag(nvinfer1::BuilderFlag::kFP16); #elif defined(USE_INT8) std::cout << "Your platform support int8: " << (builder->platformHasFastInt8() ? 
"true" : "false") << std::endl; assert(builder->platformHasFastInt8()); config->setFlag(nvinfer1::BuilderFlag::kINT8); auto* calibrator = new Int8EntropyCalibrator2(1, kInputW, kInputH, kInputQuantizationFolder, "int8calib.table", kInputTensorName); config->setInt8Calibrator(calibrator); #endif std::cout << "Building engine, please wait for a while..." << std::endl; nvinfer1::IHostMemory* serialized_model = builder->buildSerializedNetwork(*network, *config); std::cout << "Build engine successfully!" << std::endl; delete network; for (auto& mem : weightMap) { free((void*)(mem.second.values)); } return serialized_model; } static nvinfer1::IElementWiseLayer* convBnSiLUProto(nvinfer1::INetworkDefinition* network, std::map weightMap, nvinfer1::ITensor& input, int ch, int k, int s, int p, std::string lname) { nvinfer1::Weights bias_empty{nvinfer1::DataType::kFLOAT, nullptr, 0}; nvinfer1::IConvolutionLayer* conv = network->addConvolutionNd(input, ch, nvinfer1::DimsHW{k, k}, weightMap[lname + ".conv.weight"], bias_empty); assert(conv); conv->setStrideNd(nvinfer1::DimsHW{s, s}); conv->setPaddingNd(nvinfer1::DimsHW{p, p}); conv->setName((lname + ".conv").c_str()); nvinfer1::IScaleLayer* bn = addBatchNorm2d(network, weightMap, *conv->getOutput(0), lname + ".bn", 1e-3); bn->setName((lname + ".bn").c_str()); // This concat operator is not used for calculation, in order to prevent the operator fusion unrealized error when int8 is quantized. // Error Code 10: Internal Error (Could not find any implementation for node // model.22.proto.cv3.conv + model.22.proto.cv3.sigmoid + PWN(PWN((Unnamed Layer* 353) [Activation]), PWN(model.22.proto.cv3.silu)).) 
#if defined(USE_INT8) nvinfer1::ITensor* inputTensors[] = {bn->getOutput(0)}; auto concat = network->addConcatenation(inputTensors, 1); nvinfer1::IActivationLayer* sigmoid = network->addActivation(*concat->getOutput(0), nvinfer1::ActivationType::kSIGMOID); assert(sigmoid); bn->setName((lname + ".sigmoid").c_str()); nvinfer1::IElementWiseLayer* ew = network->addElementWise(*concat->getOutput(0), *sigmoid->getOutput(0), nvinfer1::ElementWiseOperation::kPROD); assert(ew); ew->setName((lname + ".silu").c_str()); #else nvinfer1::IActivationLayer* sigmoid = network->addActivation(*bn->getOutput(0), nvinfer1::ActivationType::kSIGMOID); assert(sigmoid); bn->setName((lname + ".sigmoid").c_str()); nvinfer1::IElementWiseLayer* ew = network->addElementWise(*bn->getOutput(0), *sigmoid->getOutput(0), nvinfer1::ElementWiseOperation::kPROD); assert(ew); ew->setName((lname + ".silu").c_str()); #endif return ew; } static nvinfer1::IElementWiseLayer* Proto(nvinfer1::INetworkDefinition* network, std::map& weightMap, nvinfer1::ITensor& input, std::string lname, float gw, int max_channels) { int mid_channel = get_width(256, gw, max_channels); auto cv1 = convBnSiLU(network, weightMap, input, mid_channel, {3, 3}, 1, "model.23.proto.cv1"); // float *convTranpsose_bais = (float *) weightMap["model.23.proto.upsample.bias"].values; // int convTranpsose_bais_len = weightMap["model.23.proto.upsample.bias"].count; // nvinfer1::Weights bias{nvinfer1::DataType::kFLOAT, convTranpsose_bais, convTranpsose_bais_len}; auto convTranpsose = network->addDeconvolutionNd(*cv1->getOutput(0), mid_channel, nvinfer1::DimsHW{2, 2}, weightMap["model.23.proto.upsample.weight"], weightMap["model.23.proto.upsample.bias"]); assert(convTranpsose); convTranpsose->setStrideNd(nvinfer1::DimsHW{2, 2}); convTranpsose->setPadding(nvinfer1::DimsHW{0, 0}); auto cv2 = convBnSiLU(network, weightMap, *convTranpsose->getOutput(0), mid_channel, {3, 3}, 1, "model.23.proto.cv2"); auto cv3 = convBnSiLUProto(network, weightMap, 
*cv2->getOutput(0), 32, 1, 1, 0, "model.23.proto.cv3"); assert(cv3); return cv3; } static nvinfer1::IShuffleLayer* cv4_conv_combined(nvinfer1::INetworkDefinition* network, std::map& weightMap, nvinfer1::ITensor& input, std::string lname, int grid_shape, float gw, const std::string& algo_type, int max_channels) { int nm_nk = 0; int c4 = 0; if (algo_type == "seg") { nm_nk = 32; c4 = std::max(get_width(256, gw, max_channels) / 4, nm_nk); } else if (algo_type == "pose") { nm_nk = kNumberOfPoints * 3; c4 = std::max(get_width(256, gw, max_channels) / 4, kNumberOfPoints * 3); } else if (algo_type == "obb") { nm_nk = kObbNe; c4 = std::max(get_width(256, gw, max_channels) / 4, nm_nk); } else { std::cerr << "Unknown algo type: " << algo_type << std::endl; return nullptr; } auto cv0 = convBnSiLU(network, weightMap, input, c4, {3, 3}, 1, lname + ".0"); auto cv1 = convBnSiLU(network, weightMap, *cv0->getOutput(0), c4, {3, 3}, 1, lname + ".1"); float* cv2_bais_value = (float*)weightMap[lname + ".2" + ".bias"].values; int cv2_bais_len = weightMap[lname + ".2" + ".bias"].count; nvinfer1::Weights cv2_bais{nvinfer1::DataType::kFLOAT, cv2_bais_value, cv2_bais_len}; auto cv2 = network->addConvolutionNd(*cv1->getOutput(0), nm_nk, nvinfer1::DimsHW{1, 1}, weightMap[lname + ".2" + ".weight"], cv2_bais); cv2->setStrideNd(nvinfer1::DimsHW{1, 1}); nvinfer1::IShuffleLayer* cv2_shuffle = network->addShuffle(*cv2->getOutput(0)); cv2_shuffle->setReshapeDimensions(nvinfer1::Dims3{kBatchSize, nm_nk, grid_shape}); return cv2_shuffle; } nvinfer1::IHostMemory* buildEngineYolo11Seg(nvinfer1::IBuilder* builder, nvinfer1::IBuilderConfig* config, nvinfer1::DataType dt, const std::string& wts_path, float& gd, float& gw, int& max_channels, std::string& type) { std::map weightMap = loadWeights(wts_path); // nvinfer1::INetworkDefinition *network = builder->createNetworkV2(0U); nvinfer1::INetworkDefinition* network = builder->createNetworkV2( 1U << 
static_cast(nvinfer1::NetworkDefinitionCreationFlag::kEXPLICIT_BATCH)); /******************************************************************************************************* ****************************************** YOLO11 INPUT ********************************************** *******************************************************************************************************/ nvinfer1::ITensor* data = network->addInput(kInputTensorName, dt, nvinfer1::Dims4{kBatchSize, 3, kInputH, kInputW}); assert(data); /******************************************************************************************************* ***************************************** YOLO11 BACKBONE ******************************************** *******************************************************************************************************/ nvinfer1::IElementWiseLayer* conv0 = convBnSiLU(network, weightMap, *data, get_width(64, gw, max_channels), {3, 3}, 2, "model.0"); nvinfer1::IElementWiseLayer* conv1 = convBnSiLU(network, weightMap, *conv0->getOutput(0), get_width(128, gw, max_channels), {3, 3}, 2, "model.1"); bool c3k = false; if (type == "m" || type == "l" || type == "x") { c3k = true; } nvinfer1::IElementWiseLayer* conv2 = C3K2(network, weightMap, *conv1->getOutput(0), get_width(128, gw, max_channels), get_width(256, gw, max_channels), get_depth(2, gd), c3k, true, 0.25, "model.2"); nvinfer1::IElementWiseLayer* conv3 = convBnSiLU(network, weightMap, *conv2->getOutput(0), get_width(256, gw, max_channels), {3, 3}, 2, "model.3"); // 22466 nvinfer1::IElementWiseLayer* conv4 = C3K2(network, weightMap, *conv3->getOutput(0), get_width(256, gw, max_channels), get_width(512, gw, max_channels), get_depth(2, gd), c3k, true, 0.25, "model.4"); nvinfer1::IElementWiseLayer* conv5 = convBnSiLU(network, weightMap, *conv4->getOutput(0), get_width(512, gw, max_channels), {3, 3}, 2, "model.5"); // 22466 nvinfer1::IElementWiseLayer* conv6 = C3K2(network, weightMap, *conv5->getOutput(0), get_width(512, 
gw, max_channels), get_width(512, gw, max_channels), get_depth(2, gd), true, true, 0.5, "model.6"); nvinfer1::IElementWiseLayer* conv7 = convBnSiLU(network, weightMap, *conv6->getOutput(0), get_width(1024, gw, max_channels), {3, 3}, 2, "model.7"); // 11233 nvinfer1::IElementWiseLayer* conv8 = C3K2(network, weightMap, *conv7->getOutput(0), get_width(1024, gw, max_channels), get_width(1024, gw, max_channels), get_depth(2, gd), true, true, 0.5, "model.8"); nvinfer1::IElementWiseLayer* conv9 = SPPF(network, weightMap, *conv8->getOutput(0), get_width(1024, gw, max_channels), get_width(1024, gw, max_channels), 5, "model.9"); auto* conv10 = C2PSA(network, weightMap, *conv9->getOutput(0), get_width(1024, gw, max_channels), get_width(1024, gw, max_channels), get_depth(2, gd), 0.5, "model.10"); /******************************************************************************************************* ********************************************* YOLO11 HEAD ******************************************** *******************************************************************************************************/ float scale[] = {1.0, 1.0, 2.0, 2.0}; nvinfer1::IResizeLayer* upsample11 = network->addResize(*conv10->getOutput(0)); assert(upsample11); upsample11->setResizeMode(nvinfer1::InterpolationMode::kNEAREST); upsample11->setScales(scale, 4); nvinfer1::ITensor* inputTensor12[] = {upsample11->getOutput(0), conv6->getOutput(0)}; nvinfer1::IConcatenationLayer* cat12 = network->addConcatenation(inputTensor12, 2); nvinfer1::IElementWiseLayer* conv13 = C3K2(network, weightMap, *cat12->getOutput(0), get_width(512, gw, max_channels), get_width(512, gw, max_channels), get_depth(2, gd), c3k, true, 0.5, "model.13"); nvinfer1::IResizeLayer* upsample14 = network->addResize(*conv13->getOutput(0)); assert(upsample14); upsample14->setResizeMode(nvinfer1::InterpolationMode::kNEAREST); upsample14->setScales(scale, 4); nvinfer1::ITensor* inputTensor15[] = {upsample14->getOutput(0), 
conv4->getOutput(0)}; nvinfer1::IConcatenationLayer* cat15 = network->addConcatenation(inputTensor15, 2); nvinfer1::IElementWiseLayer* conv16 = C3K2(network, weightMap, *cat15->getOutput(0), get_width(256, gw, max_channels), get_width(256, gw, max_channels), get_depth(2, gd), c3k, true, 0.5, "model.16"); nvinfer1::IElementWiseLayer* conv17 = convBnSiLU(network, weightMap, *conv16->getOutput(0), get_width(256, gw, max_channels), {3, 3}, 2, "model.17"); nvinfer1::ITensor* inputTensor18[] = {conv17->getOutput(0), conv13->getOutput(0)}; nvinfer1::IConcatenationLayer* cat18 = network->addConcatenation(inputTensor18, 2); nvinfer1::IElementWiseLayer* conv19 = C3K2(network, weightMap, *cat18->getOutput(0), get_width(512, gw, max_channels), get_width(512, gw, max_channels), get_depth(2, gd), c3k, true, 0.5, "model.19"); nvinfer1::IElementWiseLayer* conv20 = convBnSiLU(network, weightMap, *conv19->getOutput(0), get_width(512, gw, max_channels), {3, 3}, 2, "model.20"); nvinfer1::ITensor* inputTensor21[] = {conv20->getOutput(0), conv10->getOutput(0)}; nvinfer1::IConcatenationLayer* cat21 = network->addConcatenation(inputTensor21, 2); nvinfer1::IElementWiseLayer* conv22 = C3K2(network, weightMap, *cat21->getOutput(0), get_width(1024, gw, max_channels), get_width(1024, gw, max_channels), get_depth(2, gd), true, true, 0.5, "model.22"); /******************************************************************************************************* ********************************************* YOLO11 OUTPUT ****************************************** *******************************************************************************************************/ // c2, c3 = max((16, ch[0] // 4, self.reg_max * 4)), max(ch[0], min(self.nc, 100)) # channels int c2 = std::max(std::max(16, get_width(256, gw, max_channels) / 4), 16 * 4); int c3 = std::max(get_width(256, gw, max_channels), std::min(kNumClass, 100)); // output0 nvinfer1::IElementWiseLayer* conv23_cv2_0_0 = convBnSiLU(network, weightMap, 
*conv16->getOutput(0), c2, {3, 3}, 1, "model.23.cv2.0.0"); nvinfer1::IElementWiseLayer* conv23_cv2_0_1 = convBnSiLU(network, weightMap, *conv23_cv2_0_0->getOutput(0), c2, {3, 3}, 1, "model.23.cv2.0.1"); nvinfer1::IConvolutionLayer* conv23_cv2_0_2 = network->addConvolutionNd(*conv23_cv2_0_1->getOutput(0), 64, nvinfer1::DimsHW{1, 1}, weightMap["model.23.cv2.0.2.weight"], weightMap["model.23.cv2.0.2.bias"]); conv23_cv2_0_2->setStrideNd(nvinfer1::DimsHW{1, 1}); conv23_cv2_0_2->setPaddingNd(nvinfer1::DimsHW{0, 0}); auto* conv23_cv3_0_0_0 = DWConv(network, weightMap, *conv16->getOutput(0), get_width(256, gw, max_channels), {3, 3}, 1, "model.23.cv3.0.0.0"); auto* conv23_cv3_0_0_1 = convBnSiLU(network, weightMap, *conv23_cv3_0_0_0->getOutput(0), c3, {1, 1}, 1, "model.23.cv3.0.0.1"); auto* conv23_cv3_0_1_0 = DWConv(network, weightMap, *conv23_cv3_0_0_1->getOutput(0), c3, {3, 3}, 1, "model.23.cv3.0.1.0"); auto* conv23_cv3_0_1_1 = convBnSiLU(network, weightMap, *conv23_cv3_0_1_0->getOutput(0), c3, {1, 1}, 1, "model.23.cv3.0.1.1"); nvinfer1::IConvolutionLayer* conv23_cv3_0_2 = network->addConvolutionNd(*conv23_cv3_0_1_1->getOutput(0), kNumClass, nvinfer1::DimsHW{1, 1}, weightMap["model.23.cv3.0.2.weight"], weightMap["model.23.cv3.0.2.bias"]); conv23_cv3_0_2->setStrideNd(nvinfer1::DimsHW{1, 1}); conv23_cv3_0_2->setPaddingNd(nvinfer1::DimsHW{0, 0}); nvinfer1::ITensor* inputTensor23_0[] = {conv23_cv2_0_2->getOutput(0), conv23_cv3_0_2->getOutput(0)}; nvinfer1::IConcatenationLayer* cat23_0 = network->addConcatenation(inputTensor23_0, 2); // output1 nvinfer1::IElementWiseLayer* conv23_cv2_1_0 = convBnSiLU(network, weightMap, *conv19->getOutput(0), c2, {3, 3}, 1, "model.23.cv2.1.0"); nvinfer1::IElementWiseLayer* conv23_cv2_1_1 = convBnSiLU(network, weightMap, *conv23_cv2_1_0->getOutput(0), c2, {3, 3}, 1, "model.23.cv2.1.1"); nvinfer1::IConvolutionLayer* conv23_cv2_1_2 = network->addConvolutionNd(*conv23_cv2_1_1->getOutput(0), 64, nvinfer1::DimsHW{1, 1}, 
weightMap["model.23.cv2.1.2.weight"], weightMap["model.23.cv2.1.2.bias"]); conv23_cv2_1_2->setStrideNd(nvinfer1::DimsHW{1, 1}); conv23_cv2_1_2->setPaddingNd(nvinfer1::DimsHW{0, 0}); auto* conv23_cv3_1_0_0 = DWConv(network, weightMap, *conv19->getOutput(0), get_width(512, gw, max_channels), {3, 3}, 1, "model.23.cv3.1.0.0"); auto* conv23_cv3_1_0_1 = convBnSiLU(network, weightMap, *conv23_cv3_1_0_0->getOutput(0), c3, {1, 1}, 1, "model.23.cv3.1.0.1"); auto* conv23_cv3_1_1_0 = DWConv(network, weightMap, *conv23_cv3_1_0_1->getOutput(0), c3, {3, 3}, 1, "model.23.cv3.1.1.0"); auto* conv23_cv3_1_1_1 = convBnSiLU(network, weightMap, *conv23_cv3_1_1_0->getOutput(0), c3, {1, 1}, 1, "model.23.cv3.1.1.1"); nvinfer1::IConvolutionLayer* conv23_cv3_1_2 = network->addConvolutionNd(*conv23_cv3_1_1_1->getOutput(0), kNumClass, nvinfer1::DimsHW{1, 1}, weightMap["model.23.cv3.1.2.weight"], weightMap["model.23.cv3.1.2.bias"]); conv23_cv3_1_2->setStrideNd(nvinfer1::DimsHW{1, 1}); conv23_cv3_1_2->setPaddingNd(nvinfer1::DimsHW{0, 0}); nvinfer1::ITensor* inputTensor23_1[] = {conv23_cv2_1_2->getOutput(0), conv23_cv3_1_2->getOutput(0)}; nvinfer1::IConcatenationLayer* cat23_1 = network->addConcatenation(inputTensor23_1, 2); // output2 nvinfer1::IElementWiseLayer* conv23_cv2_2_0 = convBnSiLU(network, weightMap, *conv22->getOutput(0), c2, {3, 3}, 1, "model.23.cv2.2.0"); nvinfer1::IElementWiseLayer* conv23_cv2_2_1 = convBnSiLU(network, weightMap, *conv23_cv2_2_0->getOutput(0), c2, {3, 3}, 1, "model.23.cv2.2.1"); nvinfer1::IConvolutionLayer* conv23_cv2_2_2 = network->addConvolutionNd(*conv23_cv2_2_1->getOutput(0), 64, nvinfer1::DimsHW{1, 1}, weightMap["model.23.cv2.2.2.weight"], weightMap["model.23.cv2.2.2.bias"]); conv23_cv2_2_2->setStrideNd(nvinfer1::DimsHW{1, 1}); conv23_cv2_2_2->setPaddingNd(nvinfer1::DimsHW{0, 0}); auto* conv23_cv3_2_0_0 = DWConv(network, weightMap, *conv22->getOutput(0), get_width(1024, gw, max_channels), {3, 3}, 1, "model.23.cv3.2.0.0"); auto* conv23_cv3_2_0_1 = 
convBnSiLU(network, weightMap, *conv23_cv3_2_0_0->getOutput(0), c3, {1, 1}, 1, "model.23.cv3.2.0.1"); auto* conv23_cv3_2_1_0 = DWConv(network, weightMap, *conv23_cv3_2_0_1->getOutput(0), c3, {3, 3}, 1, "model.23.cv3.2.1.0"); auto* conv23_cv3_2_1_1 = convBnSiLU(network, weightMap, *conv23_cv3_2_1_0->getOutput(0), c3, {1, 1}, 1, "model.23.cv3.2.1.1"); nvinfer1::IConvolutionLayer* conv23_cv3_2_2 = network->addConvolutionNd(*conv23_cv3_2_1_1->getOutput(0), kNumClass, nvinfer1::DimsHW{1, 1}, weightMap["model.23.cv3.2.2.weight"], weightMap["model.23.cv3.2.2.bias"]); conv23_cv3_2_2->setStrideNd(nvinfer1::DimsHW{1, 1}); conv23_cv3_2_2->setPaddingNd(nvinfer1::DimsHW{0, 0}); nvinfer1::ITensor* inputTensor23_2[] = {conv23_cv2_2_2->getOutput(0), conv23_cv3_2_2->getOutput(0)}; nvinfer1::IConcatenationLayer* cat23_2 = network->addConcatenation(inputTensor23_2, 2); /******************************************************************************************************* ********************************************* YOLO11 DETECT ****************************************** *******************************************************************************************************/ nvinfer1::IElementWiseLayer* conv_layers[] = {conv3, conv5, conv7}; int strides[sizeof(conv_layers) / sizeof(conv_layers[0])]; calculateStrides(conv_layers, sizeof(conv_layers) / sizeof(conv_layers[0]), kInputH, strides); int stridesLength = sizeof(strides) / sizeof(int); nvinfer1::IShuffleLayer* shuffle23_0 = network->addShuffle(*cat23_0->getOutput(0)); shuffle23_0->setReshapeDimensions( nvinfer1::Dims3{kBatchSize, 64 + kNumClass, (kInputH / strides[0]) * (kInputW / strides[0])}); nvinfer1::ISliceLayer* split23_0_0 = network->addSlice( *shuffle23_0->getOutput(0), nvinfer1::Dims3{0, 0, 0}, nvinfer1::Dims3{kBatchSize, 64, (kInputH / strides[0]) * (kInputW / strides[0])}, nvinfer1::Dims3{1, 1, 1}); nvinfer1::ISliceLayer* split23_0_1 = network->addSlice(*shuffle23_0->getOutput(0), nvinfer1::Dims3{0, 64, 0}, 
nvinfer1::Dims3{kBatchSize, kNumClass, (kInputH / strides[0]) * (kInputW / strides[0])}, nvinfer1::Dims3{1, 1, 1}); nvinfer1::IShuffleLayer* dfl23_0 = DFL(network, weightMap, *split23_0_0->getOutput(0), 4, (kInputH / strides[0]) * (kInputW / strides[0]), 1, 1, 0, "model.23.dfl.conv.weight"); nvinfer1::IShuffleLayer* shuffle23_1 = network->addShuffle(*cat23_1->getOutput(0)); shuffle23_1->setReshapeDimensions( nvinfer1::Dims3{kBatchSize, 64 + kNumClass, (kInputH / strides[1]) * (kInputW / strides[1])}); nvinfer1::ISliceLayer* split23_1_0 = network->addSlice( *shuffle23_1->getOutput(0), nvinfer1::Dims3{0, 0, 0}, nvinfer1::Dims3{kBatchSize, 64, (kInputH / strides[1]) * (kInputW / strides[1])}, nvinfer1::Dims3{1, 1, 1}); nvinfer1::ISliceLayer* split23_1_1 = network->addSlice(*shuffle23_1->getOutput(0), nvinfer1::Dims3{0, 64, 0}, nvinfer1::Dims3{kBatchSize, kNumClass, (kInputH / strides[1]) * (kInputW / strides[1])}, nvinfer1::Dims3{1, 1, 1}); nvinfer1::IShuffleLayer* dfl23_1 = DFL(network, weightMap, *split23_1_0->getOutput(0), 4, (kInputH / strides[1]) * (kInputW / strides[1]), 1, 1, 0, "model.23.dfl.conv.weight"); nvinfer1::IShuffleLayer* shuffle23_2 = network->addShuffle(*cat23_2->getOutput(0)); shuffle23_2->setReshapeDimensions( nvinfer1::Dims3{kBatchSize, 64 + kNumClass, (kInputH / strides[2]) * (kInputW / strides[2])}); nvinfer1::ISliceLayer* split23_2_0 = network->addSlice( *shuffle23_2->getOutput(0), nvinfer1::Dims3{0, 0, 0}, nvinfer1::Dims3{kBatchSize, 64, (kInputH / strides[2]) * (kInputW / strides[2])}, nvinfer1::Dims3{1, 1, 1}); nvinfer1::ISliceLayer* split23_2_1 = network->addSlice(*shuffle23_2->getOutput(0), nvinfer1::Dims3{0, 64, 0}, nvinfer1::Dims3{kBatchSize, kNumClass, (kInputH / strides[2]) * (kInputW / strides[2])}, nvinfer1::Dims3{1, 1, 1}); nvinfer1::IShuffleLayer* dfl23_2 = DFL(network, weightMap, *split23_2_0->getOutput(0), 4, (kInputH / strides[2]) * (kInputW / strides[2]), 1, 1, 0, "model.23.dfl.conv.weight"); // det0 auto proto_coef_0 = 
cv4_conv_combined(network, weightMap, *conv16->getOutput(0), "model.23.cv4.0", (kInputH / strides[0]) * (kInputW / strides[0]), gw, "seg", max_channels); nvinfer1::ITensor* inputTensor23_dfl_0[] = {dfl23_0->getOutput(0), split23_0_1->getOutput(0), proto_coef_0->getOutput(0)}; nvinfer1::IConcatenationLayer* cat23_dfl_0 = network->addConcatenation(inputTensor23_dfl_0, 3); cat23_dfl_0->setAxis(1); // det1 auto proto_coef_1 = cv4_conv_combined(network, weightMap, *conv19->getOutput(0), "model.23.cv4.1", (kInputH / strides[1]) * (kInputW / strides[1]), gw, "seg", max_channels); nvinfer1::ITensor* inputTensor23_dfl_1[] = {dfl23_1->getOutput(0), split23_1_1->getOutput(0), proto_coef_1->getOutput(0)}; nvinfer1::IConcatenationLayer* cat23_dfl_1 = network->addConcatenation(inputTensor23_dfl_1, 3); cat23_dfl_1->setAxis(1); // det2 auto proto_coef_2 = cv4_conv_combined(network, weightMap, *conv22->getOutput(0), "model.23.cv4.2", (kInputH / strides[2]) * (kInputW / strides[2]), gw, "seg", max_channels); nvinfer1::ITensor* inputTensor23_dfl_2[] = {dfl23_2->getOutput(0), split23_2_1->getOutput(0), proto_coef_2->getOutput(0)}; nvinfer1::IConcatenationLayer* cat23_dfl_2 = network->addConcatenation(inputTensor23_dfl_2, 3); cat23_dfl_2->setAxis(1); nvinfer1::IPluginV2Layer* yolo = addYoLoLayer(network, std::vector{cat23_dfl_0, cat23_dfl_1, cat23_dfl_2}, strides, stridesLength, true, false, false); yolo->getOutput(0)->setName(kOutputTensorName); network->markOutput(*yolo->getOutput(0)); auto proto = Proto(network, weightMap, *conv16->getOutput(0), "model.23.proto", gw, max_channels); proto->getOutput(0)->setName(kProtoTensorName); network->markOutput(*proto->getOutput(0)); config->setMemoryPoolLimit(nvinfer1::MemoryPoolType::kWORKSPACE, 16 * (1 << 20)); #if defined(USE_FP16) config->setFlag(nvinfer1::BuilderFlag::kFP16); #elif defined(USE_INT8) std::cout << "Your platform support int8: " << (builder->platformHasFastInt8() ? 
"true" : "false") << std::endl; assert(builder->platformHasFastInt8()); config->setFlag(nvinfer1::BuilderFlag::kINT8); auto* calibrator = new Int8EntropyCalibrator2(1, kInputW, kInputH, kInputQuantizationFolder, "int8calib.table", kInputTensorName); config->setInt8Calibrator(calibrator); #endif std::cout << "Building engine, please wait for a while..." << std::endl; nvinfer1::IHostMemory* serialized_model = builder->buildSerializedNetwork(*network, *config); std::cout << "Build engine successfully!" << std::endl; delete network; for (auto& mem : weightMap) { free((void*)(mem.second.values)); } return serialized_model; } nvinfer1::IHostMemory* buildEngineYolo11Pose(nvinfer1::IBuilder* builder, nvinfer1::IBuilderConfig* config, nvinfer1::DataType dt, const std::string& wts_path, float& gd, float& gw, int& max_channels, std::string& type) { std::map weightMap = loadWeights(wts_path); // nvinfer1::INetworkDefinition *network = builder->createNetworkV2(0U); nvinfer1::INetworkDefinition* network = builder->createNetworkV2( 1U << static_cast(nvinfer1::NetworkDefinitionCreationFlag::kEXPLICIT_BATCH)); /******************************************************************************************************* ****************************************** YOLO11 INPUT ********************************************** *******************************************************************************************************/ nvinfer1::ITensor* data = network->addInput(kInputTensorName, dt, nvinfer1::Dims4{kBatchSize, 3, kInputH, kInputW}); assert(data); /******************************************************************************************************* ***************************************** YOLO11 BACKBONE ******************************************** *******************************************************************************************************/ nvinfer1::IElementWiseLayer* conv0 = convBnSiLU(network, weightMap, *data, get_width(64, gw, max_channels), {3, 3}, 2, "model.0"); 
nvinfer1::IElementWiseLayer* conv1 = convBnSiLU(network, weightMap, *conv0->getOutput(0), get_width(128, gw, max_channels), {3, 3}, 2, "model.1"); bool c3k = false; if (type == "m" || type == "l" || type == "x") { c3k = true; } nvinfer1::IElementWiseLayer* conv2 = C3K2(network, weightMap, *conv1->getOutput(0), get_width(128, gw, max_channels), get_width(256, gw, max_channels), get_depth(2, gd), c3k, true, 0.25, "model.2"); nvinfer1::IElementWiseLayer* conv3 = convBnSiLU(network, weightMap, *conv2->getOutput(0), get_width(256, gw, max_channels), {3, 3}, 2, "model.3"); // 22466 nvinfer1::IElementWiseLayer* conv4 = C3K2(network, weightMap, *conv3->getOutput(0), get_width(256, gw, max_channels), get_width(512, gw, max_channels), get_depth(2, gd), c3k, true, 0.25, "model.4"); nvinfer1::IElementWiseLayer* conv5 = convBnSiLU(network, weightMap, *conv4->getOutput(0), get_width(512, gw, max_channels), {3, 3}, 2, "model.5"); // 22466 nvinfer1::IElementWiseLayer* conv6 = C3K2(network, weightMap, *conv5->getOutput(0), get_width(512, gw, max_channels), get_width(512, gw, max_channels), get_depth(2, gd), true, true, 0.5, "model.6"); nvinfer1::IElementWiseLayer* conv7 = convBnSiLU(network, weightMap, *conv6->getOutput(0), get_width(1024, gw, max_channels), {3, 3}, 2, "model.7"); // 11233 nvinfer1::IElementWiseLayer* conv8 = C3K2(network, weightMap, *conv7->getOutput(0), get_width(1024, gw, max_channels), get_width(1024, gw, max_channels), get_depth(2, gd), true, true, 0.5, "model.8"); nvinfer1::IElementWiseLayer* conv9 = SPPF(network, weightMap, *conv8->getOutput(0), get_width(1024, gw, max_channels), get_width(1024, gw, max_channels), 5, "model.9"); auto* conv10 = C2PSA(network, weightMap, *conv9->getOutput(0), get_width(1024, gw, max_channels), get_width(1024, gw, max_channels), get_depth(2, gd), 0.5, "model.10"); /******************************************************************************************************* ********************************************* YOLO11 HEAD 
******************************************** *******************************************************************************************************/ float scale[] = {1.0, 1.0, 2.0, 2.0}; nvinfer1::IResizeLayer* upsample11 = network->addResize(*conv10->getOutput(0)); assert(upsample11); upsample11->setResizeMode(nvinfer1::InterpolationMode::kNEAREST); upsample11->setScales(scale, 4); nvinfer1::ITensor* inputTensor12[] = {upsample11->getOutput(0), conv6->getOutput(0)}; nvinfer1::IConcatenationLayer* cat12 = network->addConcatenation(inputTensor12, 2); nvinfer1::IElementWiseLayer* conv13 = C3K2(network, weightMap, *cat12->getOutput(0), get_width(512, gw, max_channels), get_width(512, gw, max_channels), get_depth(2, gd), c3k, true, 0.5, "model.13"); nvinfer1::IResizeLayer* upsample14 = network->addResize(*conv13->getOutput(0)); assert(upsample14); upsample14->setResizeMode(nvinfer1::InterpolationMode::kNEAREST); upsample14->setScales(scale, 4); nvinfer1::ITensor* inputTensor15[] = {upsample14->getOutput(0), conv4->getOutput(0)}; nvinfer1::IConcatenationLayer* cat15 = network->addConcatenation(inputTensor15, 2); nvinfer1::IElementWiseLayer* conv16 = C3K2(network, weightMap, *cat15->getOutput(0), get_width(256, gw, max_channels), get_width(256, gw, max_channels), get_depth(2, gd), c3k, true, 0.5, "model.16"); nvinfer1::IElementWiseLayer* conv17 = convBnSiLU(network, weightMap, *conv16->getOutput(0), get_width(256, gw, max_channels), {3, 3}, 2, "model.17"); nvinfer1::ITensor* inputTensor18[] = {conv17->getOutput(0), conv13->getOutput(0)}; nvinfer1::IConcatenationLayer* cat18 = network->addConcatenation(inputTensor18, 2); nvinfer1::IElementWiseLayer* conv19 = C3K2(network, weightMap, *cat18->getOutput(0), get_width(512, gw, max_channels), get_width(512, gw, max_channels), get_depth(2, gd), c3k, true, 0.5, "model.19"); nvinfer1::IElementWiseLayer* conv20 = convBnSiLU(network, weightMap, *conv19->getOutput(0), get_width(512, gw, max_channels), {3, 3}, 2, "model.20"); 
nvinfer1::ITensor* inputTensor21[] = {conv20->getOutput(0), conv10->getOutput(0)}; nvinfer1::IConcatenationLayer* cat21 = network->addConcatenation(inputTensor21, 2); nvinfer1::IElementWiseLayer* conv22 = C3K2(network, weightMap, *cat21->getOutput(0), get_width(1024, gw, max_channels), get_width(1024, gw, max_channels), get_depth(2, gd), true, true, 0.5, "model.22"); /******************************************************************************************************* ********************************************* YOLO11 OUTPUT ****************************************** *******************************************************************************************************/ // c2, c3 = max((16, ch[0] // 4, self.reg_max * 4)), max(ch[0], min(self.nc, 100)) # channels int c2 = std::max(std::max(16, get_width(256, gw, max_channels) / 4), 16 * 4); int c3 = std::max(get_width(256, gw, max_channels), std::min(kPoseNumClass, 100)); // output0 nvinfer1::IElementWiseLayer* conv23_cv2_0_0 = convBnSiLU(network, weightMap, *conv16->getOutput(0), c2, {3, 3}, 1, "model.23.cv2.0.0"); nvinfer1::IElementWiseLayer* conv23_cv2_0_1 = convBnSiLU(network, weightMap, *conv23_cv2_0_0->getOutput(0), c2, {3, 3}, 1, "model.23.cv2.0.1"); nvinfer1::IConvolutionLayer* conv23_cv2_0_2 = network->addConvolutionNd(*conv23_cv2_0_1->getOutput(0), 64, nvinfer1::DimsHW{1, 1}, weightMap["model.23.cv2.0.2.weight"], weightMap["model.23.cv2.0.2.bias"]); conv23_cv2_0_2->setStrideNd(nvinfer1::DimsHW{1, 1}); conv23_cv2_0_2->setPaddingNd(nvinfer1::DimsHW{0, 0}); auto* conv23_cv3_0_0_0 = DWConv(network, weightMap, *conv16->getOutput(0), get_width(256, gw, max_channels), {3, 3}, 1, "model.23.cv3.0.0.0"); auto* conv23_cv3_0_0_1 = convBnSiLU(network, weightMap, *conv23_cv3_0_0_0->getOutput(0), c3, {1, 1}, 1, "model.23.cv3.0.0.1"); auto* conv23_cv3_0_1_0 = DWConv(network, weightMap, *conv23_cv3_0_0_1->getOutput(0), c3, {3, 3}, 1, "model.23.cv3.0.1.0"); auto* conv23_cv3_0_1_1 = convBnSiLU(network, weightMap, 
*conv23_cv3_0_1_0->getOutput(0), c3, {1, 1}, 1, "model.23.cv3.0.1.1"); nvinfer1::IConvolutionLayer* conv23_cv3_0_2 = network->addConvolutionNd(*conv23_cv3_0_1_1->getOutput(0), kPoseNumClass, nvinfer1::DimsHW{1, 1}, weightMap["model.23.cv3.0.2.weight"], weightMap["model.23.cv3.0.2.bias"]); conv23_cv3_0_2->setStrideNd(nvinfer1::DimsHW{1, 1}); conv23_cv3_0_2->setPaddingNd(nvinfer1::DimsHW{0, 0}); nvinfer1::ITensor* inputTensor23_0[] = {conv23_cv2_0_2->getOutput(0), conv23_cv3_0_2->getOutput(0)}; nvinfer1::IConcatenationLayer* cat23_0 = network->addConcatenation(inputTensor23_0, 2); // output1 nvinfer1::IElementWiseLayer* conv23_cv2_1_0 = convBnSiLU(network, weightMap, *conv19->getOutput(0), c2, {3, 3}, 1, "model.23.cv2.1.0"); nvinfer1::IElementWiseLayer* conv23_cv2_1_1 = convBnSiLU(network, weightMap, *conv23_cv2_1_0->getOutput(0), c2, {3, 3}, 1, "model.23.cv2.1.1"); nvinfer1::IConvolutionLayer* conv23_cv2_1_2 = network->addConvolutionNd(*conv23_cv2_1_1->getOutput(0), 64, nvinfer1::DimsHW{1, 1}, weightMap["model.23.cv2.1.2.weight"], weightMap["model.23.cv2.1.2.bias"]); conv23_cv2_1_2->setStrideNd(nvinfer1::DimsHW{1, 1}); conv23_cv2_1_2->setPaddingNd(nvinfer1::DimsHW{0, 0}); auto* conv23_cv3_1_0_0 = DWConv(network, weightMap, *conv19->getOutput(0), get_width(512, gw, max_channels), {3, 3}, 1, "model.23.cv3.1.0.0"); auto* conv23_cv3_1_0_1 = convBnSiLU(network, weightMap, *conv23_cv3_1_0_0->getOutput(0), c3, {1, 1}, 1, "model.23.cv3.1.0.1"); auto* conv23_cv3_1_1_0 = DWConv(network, weightMap, *conv23_cv3_1_0_1->getOutput(0), c3, {3, 3}, 1, "model.23.cv3.1.1.0"); auto* conv23_cv3_1_1_1 = convBnSiLU(network, weightMap, *conv23_cv3_1_1_0->getOutput(0), c3, {1, 1}, 1, "model.23.cv3.1.1.1"); nvinfer1::IConvolutionLayer* conv23_cv3_1_2 = network->addConvolutionNd(*conv23_cv3_1_1_1->getOutput(0), kPoseNumClass, nvinfer1::DimsHW{1, 1}, weightMap["model.23.cv3.1.2.weight"], weightMap["model.23.cv3.1.2.bias"]); conv23_cv3_1_2->setStrideNd(nvinfer1::DimsHW{1, 1}); 
conv23_cv3_1_2->setPaddingNd(nvinfer1::DimsHW{0, 0}); nvinfer1::ITensor* inputTensor23_1[] = {conv23_cv2_1_2->getOutput(0), conv23_cv3_1_2->getOutput(0)}; nvinfer1::IConcatenationLayer* cat23_1 = network->addConcatenation(inputTensor23_1, 2); // output2 nvinfer1::IElementWiseLayer* conv23_cv2_2_0 = convBnSiLU(network, weightMap, *conv22->getOutput(0), c2, {3, 3}, 1, "model.23.cv2.2.0"); nvinfer1::IElementWiseLayer* conv23_cv2_2_1 = convBnSiLU(network, weightMap, *conv23_cv2_2_0->getOutput(0), c2, {3, 3}, 1, "model.23.cv2.2.1"); nvinfer1::IConvolutionLayer* conv23_cv2_2_2 = network->addConvolutionNd(*conv23_cv2_2_1->getOutput(0), 64, nvinfer1::DimsHW{1, 1}, weightMap["model.23.cv2.2.2.weight"], weightMap["model.23.cv2.2.2.bias"]); conv23_cv2_2_2->setStrideNd(nvinfer1::DimsHW{1, 1}); conv23_cv2_2_2->setPaddingNd(nvinfer1::DimsHW{0, 0}); auto* conv23_cv3_2_0_0 = DWConv(network, weightMap, *conv22->getOutput(0), get_width(1024, gw, max_channels), {3, 3}, 1, "model.23.cv3.2.0.0"); auto* conv23_cv3_2_0_1 = convBnSiLU(network, weightMap, *conv23_cv3_2_0_0->getOutput(0), c3, {1, 1}, 1, "model.23.cv3.2.0.1"); auto* conv23_cv3_2_1_0 = DWConv(network, weightMap, *conv23_cv3_2_0_1->getOutput(0), c3, {3, 3}, 1, "model.23.cv3.2.1.0"); auto* conv23_cv3_2_1_1 = convBnSiLU(network, weightMap, *conv23_cv3_2_1_0->getOutput(0), c3, {1, 1}, 1, "model.23.cv3.2.1.1"); nvinfer1::IConvolutionLayer* conv23_cv3_2_2 = network->addConvolutionNd(*conv23_cv3_2_1_1->getOutput(0), kPoseNumClass, nvinfer1::DimsHW{1, 1}, weightMap["model.23.cv3.2.2.weight"], weightMap["model.23.cv3.2.2.bias"]); conv23_cv3_2_2->setStrideNd(nvinfer1::DimsHW{1, 1}); conv23_cv3_2_2->setPaddingNd(nvinfer1::DimsHW{0, 0}); nvinfer1::ITensor* inputTensor23_2[] = {conv23_cv2_2_2->getOutput(0), conv23_cv3_2_2->getOutput(0)}; nvinfer1::IConcatenationLayer* cat23_2 = network->addConcatenation(inputTensor23_2, 2); /******************************************************************************************************* 
********************************************* YOLO11 DETECT ****************************************** *******************************************************************************************************/ nvinfer1::IElementWiseLayer* conv_layers[] = {conv3, conv5, conv7}; int strides[sizeof(conv_layers) / sizeof(conv_layers[0])]; calculateStrides(conv_layers, sizeof(conv_layers) / sizeof(conv_layers[0]), kInputH, strides); int stridesLength = sizeof(strides) / sizeof(int); /**************************************************************************************P3****************************************************************************************************************************************/ nvinfer1::IShuffleLayer* shuffle23_0 = network->addShuffle(*cat23_0->getOutput(0)); shuffle23_0->setReshapeDimensions( nvinfer1::Dims3{kBatchSize, 64 + kPoseNumClass, (kInputH / strides[0]) * (kInputW / strides[0])}); nvinfer1::ISliceLayer* split23_0_0 = network->addSlice( *shuffle23_0->getOutput(0), nvinfer1::Dims3{0, 0, 0}, nvinfer1::Dims3{kBatchSize, 64, (kInputH / strides[0]) * (kInputW / strides[0])}, nvinfer1::Dims3{1, 1, 1}); nvinfer1::ISliceLayer* split23_0_1 = network->addSlice( *shuffle23_0->getOutput(0), nvinfer1::Dims3{0, 64, 0}, nvinfer1::Dims3{kBatchSize, kPoseNumClass, (kInputH / strides[0]) * (kInputW / strides[0])}, nvinfer1::Dims3{1, 1, 1}); nvinfer1::IShuffleLayer* dfl23_0 = DFL(network, weightMap, *split23_0_0->getOutput(0), 4, (kInputH / strides[0]) * (kInputW / strides[0]), 1, 1, 0, "model.23.dfl.conv.weight"); // det0 auto shuffle_conv16 = cv4_conv_combined(network, weightMap, *conv16->getOutput(0), "model.23.cv4.0", (kInputH / strides[0]) * (kInputW / strides[0]), gw, "pose", max_channels); nvinfer1::ITensor* inputTensor23_dfl_0[] = {dfl23_0->getOutput(0), split23_0_1->getOutput(0), shuffle_conv16->getOutput(0)}; nvinfer1::IConcatenationLayer* cat23_dfl_0 = network->addConcatenation(inputTensor23_dfl_0, 3); cat23_dfl_0->setAxis(1); 
/********************************************************************************************P4**********************************************************************************************************************************/ nvinfer1::IShuffleLayer* shuffle23_1 = network->addShuffle(*cat23_1->getOutput(0)); shuffle23_1->setReshapeDimensions( nvinfer1::Dims3{kBatchSize, 64 + kPoseNumClass, (kInputH / strides[1]) * (kInputW / strides[1])}); nvinfer1::ISliceLayer* split23_1_0 = network->addSlice( *shuffle23_1->getOutput(0), nvinfer1::Dims3{0, 0, 0}, nvinfer1::Dims3{kBatchSize, 64, (kInputH / strides[1]) * (kInputW / strides[1])}, nvinfer1::Dims3{1, 1, 1}); nvinfer1::ISliceLayer* split23_1_1 = network->addSlice( *shuffle23_1->getOutput(0), nvinfer1::Dims3{0, 64, 0}, nvinfer1::Dims3{kBatchSize, kPoseNumClass, (kInputH / strides[1]) * (kInputW / strides[1])}, nvinfer1::Dims3{1, 1, 1}); nvinfer1::IShuffleLayer* dfl23_1 = DFL(network, weightMap, *split23_1_0->getOutput(0), 4, (kInputH / strides[1]) * (kInputW / strides[1]), 1, 1, 0, "model.23.dfl.conv.weight"); // det1 auto shuffle_conv19 = cv4_conv_combined(network, weightMap, *conv19->getOutput(0), "model.23.cv4.1", (kInputH / strides[1]) * (kInputW / strides[1]), gw, "pose", max_channels); nvinfer1::ITensor* inputTensor23_dfl_1[] = {dfl23_1->getOutput(0), split23_1_1->getOutput(0), shuffle_conv19->getOutput(0)}; nvinfer1::IConcatenationLayer* cat23_dfl_1 = network->addConcatenation(inputTensor23_dfl_1, 3); cat23_dfl_1->setAxis(1); /********************************************************************************************P5**********************************************************************************************************************************/ nvinfer1::IShuffleLayer* shuffle23_2 = network->addShuffle(*cat23_2->getOutput(0)); shuffle23_2->setReshapeDimensions( nvinfer1::Dims3{kBatchSize, 64 + kPoseNumClass, (kInputH / strides[2]) * (kInputW / strides[2])}); nvinfer1::ISliceLayer* split23_2_0 = 
network->addSlice( *shuffle23_2->getOutput(0), nvinfer1::Dims3{0, 0, 0}, nvinfer1::Dims3{kBatchSize, 64, (kInputH / strides[2]) * (kInputW / strides[2])}, nvinfer1::Dims3{1, 1, 1}); nvinfer1::ISliceLayer* split23_2_1 = network->addSlice( *shuffle23_2->getOutput(0), nvinfer1::Dims3{0, 64, 0}, nvinfer1::Dims3{kBatchSize, kPoseNumClass, (kInputH / strides[2]) * (kInputW / strides[2])}, nvinfer1::Dims3{1, 1, 1}); nvinfer1::IShuffleLayer* dfl23_2 = DFL(network, weightMap, *split23_2_0->getOutput(0), 4, (kInputH / strides[2]) * (kInputW / strides[2]), 1, 1, 0, "model.23.dfl.conv.weight"); // det2 auto shuffle_conv22 = cv4_conv_combined(network, weightMap, *conv22->getOutput(0), "model.23.cv4.2", (kInputH / strides[2]) * (kInputW / strides[2]), gw, "pose", max_channels); nvinfer1::ITensor* inputTensor23_dfl_2[] = {dfl23_2->getOutput(0), split23_2_1->getOutput(0), shuffle_conv22->getOutput(0)}; nvinfer1::IConcatenationLayer* cat23_dfl_2 = network->addConcatenation(inputTensor23_dfl_2, 3); cat23_dfl_2->setAxis(1); nvinfer1::IPluginV2Layer* yolo = addYoLoLayer(network, std::vector{cat23_dfl_0, cat23_dfl_1, cat23_dfl_2}, strides, stridesLength, false, true, false); yolo->getOutput(0)->setName(kOutputTensorName); network->markOutput(*yolo->getOutput(0)); config->setMemoryPoolLimit(nvinfer1::MemoryPoolType::kWORKSPACE, 16 * (1 << 20)); #if defined(USE_FP16) config->setFlag(nvinfer1::BuilderFlag::kFP16); #elif defined(USE_INT8) std::cout << "Your platform support int8: " << (builder->platformHasFastInt8() ? "true" : "false") << std::endl; assert(builder->platformHasFastInt8()); config->setFlag(nvinfer1::BuilderFlag::kINT8); auto* calibrator = new Int8EntropyCalibrator2(1, kInputW, kInputH, kInputQuantizationFolder, "int8calib.table", kInputTensorName); config->setInt8Calibrator(calibrator); #endif std::cout << "Building engine, please wait for a while..." 
<< std::endl; nvinfer1::IHostMemory* serialized_model = builder->buildSerializedNetwork(*network, *config); std::cout << "Build engine successfully!" << std::endl; delete network; for (auto& mem : weightMap) { free((void*)(mem.second.values)); } return serialized_model; } nvinfer1::IHostMemory* buildEngineYolo11Obb(nvinfer1::IBuilder* builder, nvinfer1::IBuilderConfig* config, nvinfer1::DataType dt, const std::string& wts_path, float& gd, float& gw, int& max_channels, std::string& type) { std::map weightMap = loadWeights(wts_path); // nvinfer1::INetworkDefinition *network = builder->createNetworkV2(0U); nvinfer1::INetworkDefinition* network = builder->createNetworkV2( 1U << static_cast(nvinfer1::NetworkDefinitionCreationFlag::kEXPLICIT_BATCH)); /******************************************************************************************************* ****************************************** YOLO11 INPUT ********************************************** *******************************************************************************************************/ nvinfer1::ITensor* data = network->addInput(kInputTensorName, dt, nvinfer1::Dims4{kBatchSize, 3, kObbInputH, kObbInputW}); assert(data); /******************************************************************************************************* ***************************************** YOLO11 BACKBONE ******************************************** *******************************************************************************************************/ nvinfer1::IElementWiseLayer* conv0 = convBnSiLU(network, weightMap, *data, get_width(64, gw, max_channels), {3, 3}, 2, "model.0"); nvinfer1::IElementWiseLayer* conv1 = convBnSiLU(network, weightMap, *conv0->getOutput(0), get_width(128, gw, max_channels), {3, 3}, 2, "model.1"); // 11233 bool c3k = false; if (type == "m" || type == "l" || type == "x") { c3k = true; } nvinfer1::IElementWiseLayer* conv2 = C3K2(network, weightMap, *conv1->getOutput(0), get_width(128, gw, 
max_channels), get_width(256, gw, max_channels), get_depth(2, gd), c3k, true, 0.25, "model.2"); nvinfer1::IElementWiseLayer* conv3 = convBnSiLU(network, weightMap, *conv2->getOutput(0), get_width(256, gw, max_channels), {3, 3}, 2, "model.3"); // 22466 nvinfer1::IElementWiseLayer* conv4 = C3K2(network, weightMap, *conv3->getOutput(0), get_width(256, gw, max_channels), get_width(512, gw, max_channels), get_depth(2, gd), c3k, true, 0.25, "model.4"); nvinfer1::IElementWiseLayer* conv5 = convBnSiLU(network, weightMap, *conv4->getOutput(0), get_width(512, gw, max_channels), {3, 3}, 2, "model.5"); // 22466 nvinfer1::IElementWiseLayer* conv6 = C3K2(network, weightMap, *conv5->getOutput(0), get_width(512, gw, max_channels), get_width(512, gw, max_channels), get_depth(2, gd), true, true, 0.5, "model.6"); nvinfer1::IElementWiseLayer* conv7 = convBnSiLU(network, weightMap, *conv6->getOutput(0), get_width(1024, gw, max_channels), {3, 3}, 2, "model.7"); // 11233 nvinfer1::IElementWiseLayer* conv8 = C3K2(network, weightMap, *conv7->getOutput(0), get_width(1024, gw, max_channels), get_width(1024, gw, max_channels), get_depth(2, gd), true, true, 0.5, "model.8"); nvinfer1::IElementWiseLayer* conv9 = SPPF(network, weightMap, *conv8->getOutput(0), get_width(1024, gw, max_channels), get_width(1024, gw, max_channels), 5, "model.9"); auto* conv10 = C2PSA(network, weightMap, *conv9->getOutput(0), get_width(1024, gw, max_channels), get_width(1024, gw, max_channels), get_depth(2, gd), 0.5, "model.10"); /******************************************************************************************************* ********************************************* YOLO11 HEAD ******************************************** *******************************************************************************************************/ float scale[] = {1.0, 1.0, 2.0, 2.0}; nvinfer1::IResizeLayer* upsample11 = network->addResize(*conv10->getOutput(0)); assert(upsample11); 
upsample11->setResizeMode(nvinfer1::InterpolationMode::kNEAREST); upsample11->setScales(scale, 4); nvinfer1::ITensor* inputTensor12[] = {upsample11->getOutput(0), conv6->getOutput(0)}; nvinfer1::IConcatenationLayer* cat12 = network->addConcatenation(inputTensor12, 2); nvinfer1::IElementWiseLayer* conv13 = C3K2(network, weightMap, *cat12->getOutput(0), get_width(512, gw, max_channels), get_width(512, gw, max_channels), get_depth(2, gd), c3k, true, 0.5, "model.13"); nvinfer1::IResizeLayer* upsample14 = network->addResize(*conv13->getOutput(0)); assert(upsample14); upsample14->setResizeMode(nvinfer1::InterpolationMode::kNEAREST); upsample14->setScales(scale, 4); nvinfer1::ITensor* inputTensor15[] = {upsample14->getOutput(0), conv4->getOutput(0)}; nvinfer1::IConcatenationLayer* cat15 = network->addConcatenation(inputTensor15, 2); nvinfer1::IElementWiseLayer* conv16 = C3K2(network, weightMap, *cat15->getOutput(0), get_width(256, gw, max_channels), get_width(256, gw, max_channels), get_depth(2, gd), c3k, true, 0.5, "model.16"); nvinfer1::IElementWiseLayer* conv17 = convBnSiLU(network, weightMap, *conv16->getOutput(0), get_width(256, gw, max_channels), {3, 3}, 2, "model.17"); nvinfer1::ITensor* inputTensor18[] = {conv17->getOutput(0), conv13->getOutput(0)}; nvinfer1::IConcatenationLayer* cat18 = network->addConcatenation(inputTensor18, 2); nvinfer1::IElementWiseLayer* conv19 = C3K2(network, weightMap, *cat18->getOutput(0), get_width(512, gw, max_channels), get_width(512, gw, max_channels), get_depth(2, gd), c3k, true, 0.5, "model.19"); nvinfer1::IElementWiseLayer* conv20 = convBnSiLU(network, weightMap, *conv19->getOutput(0), get_width(512, gw, max_channels), {3, 3}, 2, "model.20"); nvinfer1::ITensor* inputTensor21[] = {conv20->getOutput(0), conv10->getOutput(0)}; nvinfer1::IConcatenationLayer* cat21 = network->addConcatenation(inputTensor21, 2); nvinfer1::IElementWiseLayer* conv22 = C3K2(network, weightMap, *cat21->getOutput(0), get_width(1024, gw, max_channels), 
get_width(1024, gw, max_channels), get_depth(2, gd), true, true, 0.5, "model.22"); /******************************************************************************************************* ********************************************* YOLO11 OUTPUT ****************************************** *******************************************************************************************************/ // c2, c3 = max((16, ch[0] // 4, self.reg_max * 4)), max(ch[0], min(self.nc, 100)) # channels // c4 = max(ch[0] // 4, self.ne) int c2 = std::max(std::max(16, get_width(256, gw, max_channels) / 4), 16 * 4); int c3 = std::max(get_width(256, gw, max_channels), std::min(kObbNumClass, 100)); int c4 = std::max(get_width(256, gw, max_channels) / 4, kObbNe); // output0 nvinfer1::IElementWiseLayer* conv23_cv2_0_0 = convBnSiLU(network, weightMap, *conv16->getOutput(0), c2, {3, 3}, 1, "model.23.cv2.0.0"); nvinfer1::IElementWiseLayer* conv23_cv2_0_1 = convBnSiLU(network, weightMap, *conv23_cv2_0_0->getOutput(0), c2, {3, 3}, 1, "model.23.cv2.0.1"); nvinfer1::IConvolutionLayer* conv23_cv2_0_2 = network->addConvolutionNd(*conv23_cv2_0_1->getOutput(0), 64, nvinfer1::DimsHW{1, 1}, weightMap["model.23.cv2.0.2.weight"], weightMap["model.23.cv2.0.2.bias"]); conv23_cv2_0_2->setStrideNd(nvinfer1::DimsHW{1, 1}); conv23_cv2_0_2->setPaddingNd(nvinfer1::DimsHW{0, 0}); auto* conv23_cv3_0_0_0 = DWConv(network, weightMap, *conv16->getOutput(0), get_width(256, gw, max_channels), {3, 3}, 1, "model.23.cv3.0.0.0"); auto* conv23_cv3_0_0_1 = convBnSiLU(network, weightMap, *conv23_cv3_0_0_0->getOutput(0), c3, {1, 1}, 1, "model.23.cv3.0.0.1"); auto* conv23_cv3_0_1_0 = DWConv(network, weightMap, *conv23_cv3_0_0_1->getOutput(0), c3, {3, 3}, 1, "model.23.cv3.0.1.0"); auto* conv23_cv3_0_1_1 = convBnSiLU(network, weightMap, *conv23_cv3_0_1_0->getOutput(0), c3, {1, 1}, 1, "model.23.cv3.0.1.1"); nvinfer1::IConvolutionLayer* conv23_cv3_0_2 = network->addConvolutionNd(*conv23_cv3_0_1_1->getOutput(0), kObbNumClass, 
nvinfer1::DimsHW{1, 1}, weightMap["model.23.cv3.0.2.weight"], weightMap["model.23.cv3.0.2.bias"]); conv23_cv3_0_2->setStrideNd(nvinfer1::DimsHW{1, 1}); conv23_cv3_0_2->setPaddingNd(nvinfer1::DimsHW{0, 0}); nvinfer1::ITensor* inputTensor23_0[] = {conv23_cv2_0_2->getOutput(0), conv23_cv3_0_2->getOutput(0)}; nvinfer1::IConcatenationLayer* cat23_0 = network->addConcatenation(inputTensor23_0, 2); // output1 nvinfer1::IElementWiseLayer* conv23_cv2_1_0 = convBnSiLU(network, weightMap, *conv19->getOutput(0), c2, {3, 3}, 1, "model.23.cv2.1.0"); nvinfer1::IElementWiseLayer* conv23_cv2_1_1 = convBnSiLU(network, weightMap, *conv23_cv2_1_0->getOutput(0), c2, {3, 3}, 1, "model.23.cv2.1.1"); nvinfer1::IConvolutionLayer* conv23_cv2_1_2 = network->addConvolutionNd(*conv23_cv2_1_1->getOutput(0), 64, nvinfer1::DimsHW{1, 1}, weightMap["model.23.cv2.1.2.weight"], weightMap["model.23.cv2.1.2.bias"]); conv23_cv2_1_2->setStrideNd(nvinfer1::DimsHW{1, 1}); conv23_cv2_1_2->setPaddingNd(nvinfer1::DimsHW{0, 0}); auto* conv23_cv3_1_0_0 = DWConv(network, weightMap, *conv19->getOutput(0), get_width(512, gw, max_channels), {3, 3}, 1, "model.23.cv3.1.0.0"); auto* conv23_cv3_1_0_1 = convBnSiLU(network, weightMap, *conv23_cv3_1_0_0->getOutput(0), c3, {1, 1}, 1, "model.23.cv3.1.0.1"); auto* conv23_cv3_1_1_0 = DWConv(network, weightMap, *conv23_cv3_1_0_1->getOutput(0), c3, {3, 3}, 1, "model.23.cv3.1.1.0"); auto* conv23_cv3_1_1_1 = convBnSiLU(network, weightMap, *conv23_cv3_1_1_0->getOutput(0), c3, {1, 1}, 1, "model.23.cv3.1.1.1"); nvinfer1::IConvolutionLayer* conv23_cv3_1_2 = network->addConvolutionNd(*conv23_cv3_1_1_1->getOutput(0), kObbNumClass, nvinfer1::DimsHW{1, 1}, weightMap["model.23.cv3.1.2.weight"], weightMap["model.23.cv3.1.2.bias"]); conv23_cv3_1_2->setStrideNd(nvinfer1::DimsHW{1, 1}); conv23_cv3_1_2->setPaddingNd(nvinfer1::DimsHW{0, 0}); nvinfer1::ITensor* inputTensor23_1[] = {conv23_cv2_1_2->getOutput(0), conv23_cv3_1_2->getOutput(0)}; nvinfer1::IConcatenationLayer* cat23_1 = 
network->addConcatenation(inputTensor23_1, 2); // output2 nvinfer1::IElementWiseLayer* conv23_cv2_2_0 = convBnSiLU(network, weightMap, *conv22->getOutput(0), c2, {3, 3}, 1, "model.23.cv2.2.0"); nvinfer1::IElementWiseLayer* conv23_cv2_2_1 = convBnSiLU(network, weightMap, *conv23_cv2_2_0->getOutput(0), c2, {3, 3}, 1, "model.23.cv2.2.1"); nvinfer1::IConvolutionLayer* conv23_cv2_2_2 = network->addConvolutionNd(*conv23_cv2_2_1->getOutput(0), 64, nvinfer1::DimsHW{1, 1}, weightMap["model.23.cv2.2.2.weight"], weightMap["model.23.cv2.2.2.bias"]); conv23_cv2_2_2->setStrideNd(nvinfer1::DimsHW{1, 1}); conv23_cv2_2_2->setPaddingNd(nvinfer1::DimsHW{0, 0}); auto* conv23_cv3_2_0_0 = DWConv(network, weightMap, *conv22->getOutput(0), get_width(1024, gw, max_channels), {3, 3}, 1, "model.23.cv3.2.0.0"); auto* conv23_cv3_2_0_1 = convBnSiLU(network, weightMap, *conv23_cv3_2_0_0->getOutput(0), c3, {1, 1}, 1, "model.23.cv3.2.0.1"); auto* conv23_cv3_2_1_0 = DWConv(network, weightMap, *conv23_cv3_2_0_1->getOutput(0), c3, {3, 3}, 1, "model.23.cv3.2.1.0"); auto* conv23_cv3_2_1_1 = convBnSiLU(network, weightMap, *conv23_cv3_2_1_0->getOutput(0), c3, {1, 1}, 1, "model.23.cv3.2.1.1"); nvinfer1::IConvolutionLayer* conv23_cv3_2_2 = network->addConvolutionNd(*conv23_cv3_2_1_1->getOutput(0), kObbNumClass, nvinfer1::DimsHW{1, 1}, weightMap["model.23.cv3.2.2.weight"], weightMap["model.23.cv3.2.2.bias"]); conv23_cv3_2_2->setStrideNd(nvinfer1::DimsHW{1, 1}); conv23_cv3_2_2->setPaddingNd(nvinfer1::DimsHW{0, 0}); nvinfer1::ITensor* inputTensor23_2[] = {conv23_cv2_2_2->getOutput(0), conv23_cv3_2_2->getOutput(0)}; nvinfer1::IConcatenationLayer* cat23_2 = network->addConcatenation(inputTensor23_2, 2); /******************************************************************************************************* ********************************************* YOLO11 DETECT ****************************************** *******************************************************************************************************/ 
nvinfer1::IElementWiseLayer* conv_layers[] = {conv3, conv5, conv7}; int strides[sizeof(conv_layers) / sizeof(conv_layers[0])]; calculateStrides(conv_layers, sizeof(conv_layers) / sizeof(conv_layers[0]), kObbInputH, strides); int stridesLength = sizeof(strides) / sizeof(int); nvinfer1::IShuffleLayer* shuffle23_0 = network->addShuffle(*cat23_0->getOutput(0)); shuffle23_0->setReshapeDimensions( nvinfer1::Dims3{kBatchSize, 64 + kObbNumClass, (kObbInputH / strides[0]) * (kObbInputW / strides[0])}); nvinfer1::ISliceLayer* split23_0_0 = network->addSlice(*shuffle23_0->getOutput(0), nvinfer1::Dims3{0, 0, 0}, nvinfer1::Dims3{kBatchSize, 64, (kObbInputH / strides[0]) * (kObbInputW / strides[0])}, nvinfer1::Dims3{1, 1, 1}); nvinfer1::ISliceLayer* split23_0_1 = network->addSlice( *shuffle23_0->getOutput(0), nvinfer1::Dims3{0, 64, 0}, nvinfer1::Dims3{kBatchSize, kObbNumClass, (kObbInputH / strides[0]) * (kObbInputW / strides[0])}, nvinfer1::Dims3{1, 1, 1}); nvinfer1::IShuffleLayer* dfl23_0 = DFL(network, weightMap, *split23_0_0->getOutput(0), 4, (kObbInputH / strides[0]) * (kObbInputW / strides[0]), 1, 1, 0, "model.23.dfl.conv.weight"); nvinfer1::IShuffleLayer* shuffle23_1 = network->addShuffle(*cat23_1->getOutput(0)); shuffle23_1->setReshapeDimensions( nvinfer1::Dims3{kBatchSize, 64 + kObbNumClass, (kObbInputH / strides[1]) * (kObbInputW / strides[1])}); nvinfer1::ISliceLayer* split23_1_0 = network->addSlice(*shuffle23_1->getOutput(0), nvinfer1::Dims3{0, 0, 0}, nvinfer1::Dims3{kBatchSize, 64, (kObbInputH / strides[1]) * (kObbInputW / strides[1])}, nvinfer1::Dims3{1, 1, 1}); nvinfer1::ISliceLayer* split23_1_1 = network->addSlice( *shuffle23_1->getOutput(0), nvinfer1::Dims3{0, 64, 0}, nvinfer1::Dims3{kBatchSize, kObbNumClass, (kObbInputH / strides[1]) * (kObbInputW / strides[1])}, nvinfer1::Dims3{1, 1, 1}); nvinfer1::IShuffleLayer* dfl23_1 = DFL(network, weightMap, *split23_1_0->getOutput(0), 4, (kObbInputH / strides[1]) * (kObbInputW / strides[1]), 1, 1, 0, 
"model.23.dfl.conv.weight"); nvinfer1::IShuffleLayer* shuffle23_2 = network->addShuffle(*cat23_2->getOutput(0)); shuffle23_2->setReshapeDimensions( nvinfer1::Dims3{kBatchSize, 64 + kObbNumClass, (kObbInputH / strides[2]) * (kObbInputW / strides[2])}); nvinfer1::ISliceLayer* split23_2_0 = network->addSlice(*shuffle23_2->getOutput(0), nvinfer1::Dims3{0, 0, 0}, nvinfer1::Dims3{kBatchSize, 64, (kObbInputH / strides[2]) * (kObbInputW / strides[2])}, nvinfer1::Dims3{1, 1, 1}); nvinfer1::ISliceLayer* split23_2_1 = network->addSlice( *shuffle23_2->getOutput(0), nvinfer1::Dims3{0, 64, 0}, nvinfer1::Dims3{kBatchSize, kObbNumClass, (kObbInputH / strides[2]) * (kObbInputW / strides[2])}, nvinfer1::Dims3{1, 1, 1}); nvinfer1::IShuffleLayer* dfl23_2 = DFL(network, weightMap, *split23_2_0->getOutput(0), 4, (kObbInputH / strides[2]) * (kObbInputW / strides[2]), 1, 1, 0, "model.23.dfl.conv.weight"); // det0 auto shuffle_conv16 = cv4_conv_combined(network, weightMap, *conv16->getOutput(0), "model.23.cv4.0", (kObbInputH / strides[0]) * (kObbInputW / strides[0]), gw, "obb", max_channels); nvinfer1::ITensor* inputTensor23_dfl_0[] = {dfl23_0->getOutput(0), split23_0_1->getOutput(0), shuffle_conv16->getOutput(0)}; nvinfer1::IConcatenationLayer* cat23_dfl_0 = network->addConcatenation(inputTensor23_dfl_0, 3); cat23_dfl_0->setAxis(1); // det1 auto shuffle_conv19 = cv4_conv_combined(network, weightMap, *conv19->getOutput(0), "model.23.cv4.1", (kObbInputH / strides[1]) * (kObbInputW / strides[1]), gw, "obb", max_channels); nvinfer1::ITensor* inputTensor23_dfl_1[] = {dfl23_1->getOutput(0), split23_1_1->getOutput(0), shuffle_conv19->getOutput(0)}; nvinfer1::IConcatenationLayer* cat23_dfl_1 = network->addConcatenation(inputTensor23_dfl_1, 3); cat23_dfl_1->setAxis(1); // det2 auto shuffle_conv22 = cv4_conv_combined(network, weightMap, *conv22->getOutput(0), "model.23.cv4.2", (kObbInputH / strides[2]) * (kObbInputW / strides[2]), gw, "obb", max_channels); nvinfer1::ITensor* inputTensor23_dfl_2[] = 
{dfl23_2->getOutput(0), split23_2_1->getOutput(0), shuffle_conv22->getOutput(0)}; nvinfer1::IConcatenationLayer* cat23_dfl_2 = network->addConcatenation(inputTensor23_dfl_2, 3); cat23_dfl_2->setAxis(1); // yolo layer nvinfer1::IPluginV2Layer* yolo = addYoLoLayer(network, std::vector{cat23_dfl_0, cat23_dfl_1, cat23_dfl_2}, strides, stridesLength, false, false, true); yolo->getOutput(0)->setName(kOutputTensorName); network->markOutput(*yolo->getOutput(0)); config->setMemoryPoolLimit(nvinfer1::MemoryPoolType::kWORKSPACE, 16 * (1 << 20)); #if defined(USE_FP16) config->setFlag(nvinfer1::BuilderFlag::kFP16); #elif defined(USE_INT8) std::cout << "Your platform support int8: " << (builder->platformHasFastInt8() ? "true" : "false") << std::endl; assert(builder->platformHasFastInt8()); config->setFlag(nvinfer1::BuilderFlag::kINT8); auto* calibrator = new Int8EntropyCalibrator2(1, kObbInputW, kObbInputH, kInputQuantizationFolder, "int8calib.table", kInputTensorName); config->setInt8Calibrator(calibrator); #endif std::cout << "Building engine, please wait for a while..." << std::endl; nvinfer1::IHostMemory* serialized_model = builder->buildSerializedNetwork(*network, *config); std::cout << "Build engine successfully!" 
<< std::endl; delete network; for (auto& mem : weightMap) { free((void*)(mem.second.values)); } return serialized_model; } ================================================ FILE: yolo11/src/postprocess.cpp ================================================ #include "postprocess.h" #include "utils.h" cv::Rect get_rect(cv::Mat& img, float bbox[4]) { float l, r, t, b; float r_w = kInputW / (img.cols * 1.0); float r_h = kInputH / (img.rows * 1.0); if (r_h > r_w) { l = bbox[0]; r = bbox[2]; t = bbox[1] - (kInputH - r_w * img.rows) / 2; b = bbox[3] - (kInputH - r_w * img.rows) / 2; l = l / r_w; r = r / r_w; t = t / r_w; b = b / r_w; } else { l = bbox[0] - (kInputW - r_h * img.cols) / 2; r = bbox[2] - (kInputW - r_h * img.cols) / 2; t = bbox[1]; b = bbox[3]; l = l / r_h; r = r / r_h; t = t / r_h; b = b / r_h; } l = std::max(0.0f, l); t = std::max(0.0f, t); int width = std::max(0, std::min(int(round(r - l)), img.cols - int(round(l)))); int height = std::max(0, std::min(int(round(b - t)), img.rows - int(round(t)))); return cv::Rect(int(round(l)), int(round(t)), width, height); } cv::Rect get_rect_obb(cv::Mat& img, float bbox[4]) { float l, r, t, b; float r_w = kObbInputW / (img.cols * 1.0); float r_h = kObbInputH / (img.rows * 1.0); if (r_h > r_w) { l = bbox[0]; r = bbox[2]; t = bbox[1] - (kObbInputH - r_w * img.rows) / 2; b = bbox[3] - (kObbInputH - r_w * img.rows) / 2; l = l / r_w; r = r / r_w; t = t / r_w; b = b / r_w; } else { l = bbox[0] - (kObbInputW - r_h * img.cols) / 2; r = bbox[2] - (kObbInputW - r_h * img.cols) / 2; t = bbox[1]; b = bbox[3]; l = l / r_h; r = r / r_h; t = t / r_h; b = b / r_h; } l = std::max(0.0f, l); t = std::max(0.0f, t); int width = std::max(0, std::min(int(round(r - l)), img.cols - int(round(l)))); int height = std::max(0, std::min(int(round(b - t)), img.rows - int(round(t)))); return cv::Rect(int(round(l)), int(round(t)), width, height); } cv::Rect get_rect_adapt_landmark(cv::Mat& img, float bbox[4], float lmk[kNumberOfPoints * 3]) { float l, r, 
t, b; float r_w = kInputW / (img.cols * 1.0); float r_h = kInputH / (img.rows * 1.0); if (r_h > r_w) { l = bbox[0] / r_w; r = bbox[2] / r_w; t = (bbox[1] - (kInputH - r_w * img.rows) / 2) / r_w; b = (bbox[3] - (kInputH - r_w * img.rows) / 2) / r_w; for (int i = 0; i < kNumberOfPoints * 3; i += 3) { lmk[i] /= r_w; lmk[i + 1] = (lmk[i + 1] - (kInputH - r_w * img.rows) / 2) / r_w; // lmk[i + 2] } } else { l = (bbox[0] - (kInputW - r_h * img.cols) / 2) / r_h; r = (bbox[2] - (kInputW - r_h * img.cols) / 2) / r_h; t = bbox[1] / r_h; b = bbox[3] / r_h; for (int i = 0; i < kNumberOfPoints * 3; i += 3) { lmk[i] = (lmk[i] - (kInputW - r_h * img.cols) / 2) / r_h; lmk[i + 1] /= r_h; // lmk[i + 2] } } l = std::max(0.0f, l); t = std::max(0.0f, t); int width = std::max(0, std::min(int(round(r - l)), img.cols - int(round(l)))); int height = std::max(0, std::min(int(round(b - t)), img.rows - int(round(t)))); return cv::Rect(int(round(l)), int(round(t)), width, height); } static float iou(float lbox[4], float rbox[4]) { float interBox[] = { (std::max)(lbox[0], rbox[0]), (std::min)(lbox[2], rbox[2]), (std::max)(lbox[1], rbox[1]), (std::min)(lbox[3], rbox[3]), }; if (interBox[2] > interBox[3] || interBox[0] > interBox[1]) return 0.0f; float interBoxS = (interBox[1] - interBox[0]) * (interBox[3] - interBox[2]); float unionBoxS = (lbox[2] - lbox[0]) * (lbox[3] - lbox[1]) + (rbox[2] - rbox[0]) * (rbox[3] - rbox[1]) - interBoxS; return interBoxS / unionBoxS; } static bool cmp(const Detection& a, const Detection& b) { if (a.conf == b.conf) { return a.bbox[0] < b.bbox[0]; } return a.conf > b.conf; } void nms(std::vector& res, float* output, float conf_thresh, float nms_thresh) { int det_size = sizeof(Detection) / sizeof(float); std::map> m; for (int i = 0; i < output[0]; i++) { if (output[1 + det_size * i + 4] <= conf_thresh || isnan(output[1 + det_size * i + 4])) continue; Detection det; memcpy(&det, &output[1 + det_size * i], det_size * sizeof(float)); if (m.count(det.class_id) == 0) 
m.emplace(det.class_id, std::vector()); m[det.class_id].push_back(det); } for (auto it = m.begin(); it != m.end(); it++) { auto& dets = it->second; std::sort(dets.begin(), dets.end(), cmp); for (size_t m = 0; m < dets.size(); ++m) { auto& item = dets[m]; res.push_back(item); for (size_t n = m + 1; n < dets.size(); ++n) { if (iou(item.bbox, dets[n].bbox) > nms_thresh) { dets.erase(dets.begin() + n); --n; } } } } } void batch_nms(std::vector>& res_batch, float* output, int batch_size, int output_size, float conf_thresh, float nms_thresh) { res_batch.resize(batch_size); for (int i = 0; i < batch_size; i++) { nms(res_batch[i], &output[i * output_size], conf_thresh, nms_thresh); } } void process_decode_ptr_host(std::vector& res, const float* decode_ptr_host, int bbox_element, cv::Mat& img, int count) { Detection det; for (int i = 0; i < count; i++) { int basic_pos = 1 + i * bbox_element; int keep_flag = decode_ptr_host[basic_pos + 6]; if (keep_flag == 1) { det.bbox[0] = decode_ptr_host[basic_pos + 0]; det.bbox[1] = decode_ptr_host[basic_pos + 1]; det.bbox[2] = decode_ptr_host[basic_pos + 2]; det.bbox[3] = decode_ptr_host[basic_pos + 3]; det.conf = decode_ptr_host[basic_pos + 4]; det.class_id = decode_ptr_host[basic_pos + 5]; res.push_back(det); } } } void batch_process(std::vector>& res_batch, const float* decode_ptr_host, int batch_size, int bbox_element, const std::vector& img_batch) { res_batch.resize(batch_size); int count = static_cast(*decode_ptr_host); count = std::min(count, kMaxNumOutputBbox); for (int i = 0; i < batch_size; i++) { auto& img = const_cast(img_batch[i]); process_decode_ptr_host(res_batch[i], &decode_ptr_host[i * count], bbox_element, img, count); } } void draw_bbox(std::vector& img_batch, std::vector>& res_batch) { for (size_t i = 0; i < img_batch.size(); i++) { auto& res = res_batch[i]; cv::Mat img = img_batch[i]; for (size_t j = 0; j < res.size(); j++) { cv::Rect r = get_rect(img, res[j].bbox); cv::rectangle(img, r, cv::Scalar(0x27, 0xC1, 
0x36), 2); cv::putText(img, std::to_string((int)res[j].class_id), cv::Point(r.x, r.y - 1), cv::FONT_HERSHEY_PLAIN, 1.2, cv::Scalar(0xFF, 0xFF, 0xFF), 2); } } } void draw_bbox_keypoints_line(std::vector& img_batch, std::vector>& res_batch) { const std::vector> skeleton_pairs = { {0, 1}, {0, 2}, {0, 5}, {0, 6}, {1, 2}, {1, 3}, {2, 4}, {5, 6}, {5, 7}, {5, 11}, {6, 8}, {6, 12}, {7, 9}, {8, 10}, {11, 12}, {11, 13}, {12, 14}, {13, 15}, {14, 16}}; for (size_t i = 0; i < img_batch.size(); i++) { auto& res = res_batch[i]; cv::Mat img = img_batch[i]; for (size_t j = 0; j < res.size(); j++) { cv::Rect r = get_rect_adapt_landmark(img, res[j].bbox, res[j].keypoints); cv::rectangle(img, r, cv::Scalar(0x27, 0xC1, 0x36), 2); cv::putText(img, std::to_string((int)res[j].class_id), cv::Point(r.x, r.y - 1), cv::FONT_HERSHEY_PLAIN, 1.2, cv::Scalar(0xFF, 0xFF, 0xFF), 2); for (int k = 0; k < kNumberOfPoints * 3; k += 3) { if (res[j].keypoints[k + 2] > 0.5) { cv::circle(img, cv::Point((int)res[j].keypoints[k], (int)res[j].keypoints[k + 1]), 3, cv::Scalar(0, 0x27, 0xC1), -1); } } for (const auto& bone : skeleton_pairs) { int kp1_idx = bone.first * 3; int kp2_idx = bone.second * 3; if (res[j].keypoints[kp1_idx + 2] > 0.5 && res[j].keypoints[kp2_idx + 2] > 0.5) { cv::Point p1((int)res[j].keypoints[kp1_idx], (int)res[j].keypoints[kp1_idx + 1]); cv::Point p2((int)res[j].keypoints[kp2_idx], (int)res[j].keypoints[kp2_idx + 1]); cv::line(img, p1, p2, cv::Scalar(0, 0x27, 0xC1), 2); } } } } } cv::Mat scale_mask(cv::Mat mask, cv::Mat img) { int x, y, w, h; float r_w = kInputW / (img.cols * 1.0); float r_h = kInputH / (img.rows * 1.0); if (r_h > r_w) { w = kInputW; h = r_w * img.rows; x = 0; y = (kInputH - h) / 2; } else { w = r_h * img.cols; h = kInputH; x = (kInputW - w) / 2; y = 0; } cv::Rect r(x, y, w, h); cv::Mat res; cv::resize(mask(r), res, img.size()); return res; } void draw_mask_bbox(cv::Mat& img, std::vector& dets, std::vector& masks, std::unordered_map& labels_map) { static std::vector 
colors = {0xFF3838, 0xFF9D97, 0xFF701F, 0xFFB21D, 0xCFD231, 0x48F90A, 0x92CC17, 0x3DDB86, 0x1A9334, 0x00D4BB, 0x2C99A8, 0x00C2FF, 0x344593, 0x6473FF, 0x0018EC, 0x8438FF, 0x520085, 0xCB38FF, 0xFF95C8, 0xFF37C7}; for (size_t i = 0; i < dets.size(); i++) { cv::Mat img_mask = scale_mask(masks[i], img); auto color = colors[(int)dets[i].class_id % colors.size()]; auto bgr = cv::Scalar(color & 0xFF, color >> 8 & 0xFF, color >> 16 & 0xFF); cv::Rect r = get_rect(img, dets[i].bbox); for (int x = r.x; x < r.x + r.width; x++) { for (int y = r.y; y < r.y + r.height; y++) { float val = img_mask.at(y, x); if (val <= 0.5) continue; img.at(y, x)[0] = img.at(y, x)[0] / 2 + bgr[0] / 2; img.at(y, x)[1] = img.at(y, x)[1] / 2 + bgr[1] / 2; img.at(y, x)[2] = img.at(y, x)[2] / 2 + bgr[2] / 2; } } cv::rectangle(img, r, bgr, 2); // Get the size of the text cv::Size textSize = cv::getTextSize(labels_map[(int)dets[i].class_id] + " " + to_string_with_precision(dets[i].conf), cv::FONT_HERSHEY_PLAIN, 1.2, 2, NULL); // Set the top left corner of the rectangle cv::Point topLeft(r.x, r.y - textSize.height); // Set the bottom right corner of the rectangle cv::Point bottomRight(r.x + textSize.width, r.y + textSize.height); // Set the thickness of the rectangle lines int lineThickness = 2; // Draw the rectangle on the image cv::rectangle(img, topLeft, bottomRight, bgr, -1); cv::putText(img, labels_map[(int)dets[i].class_id] + " " + to_string_with_precision(dets[i].conf), cv::Point(r.x, r.y + 4), cv::FONT_HERSHEY_PLAIN, 1.2, cv::Scalar::all(0xFF), 2); } } void process_decode_ptr_host_obb(std::vector& res, const float* decode_ptr_host, int bbox_element, cv::Mat& img, int count) { Detection det; for (int i = 0; i < count; i++) { int basic_pos = 1 + i * bbox_element; int keep_flag = decode_ptr_host[basic_pos + 6]; if (keep_flag == 1) { det.bbox[0] = decode_ptr_host[basic_pos + 0]; det.bbox[1] = decode_ptr_host[basic_pos + 1]; det.bbox[2] = decode_ptr_host[basic_pos + 2]; det.bbox[3] = 
decode_ptr_host[basic_pos + 3]; det.conf = decode_ptr_host[basic_pos + 4]; det.class_id = decode_ptr_host[basic_pos + 5]; det.angle = decode_ptr_host[basic_pos + 7]; res.push_back(det); } } } void batch_process_obb(std::vector>& res_batch, const float* decode_ptr_host, int batch_size, int bbox_element, const std::vector& img_batch) { res_batch.resize(batch_size); int count = static_cast(*decode_ptr_host); count = std::min(count, kMaxNumOutputBbox); for (int i = 0; i < batch_size; i++) { auto& img = const_cast(img_batch[i]); process_decode_ptr_host_obb(res_batch[i], &decode_ptr_host[i * count], bbox_element, img, count); } } std::tuple convariance_matrix(Detection res) { float w = res.bbox[2]; float h = res.bbox[3]; float a = w * w / 12.0; float b = h * h / 12.0; float c = res.angle; float cos_r = std::cos(c); float sin_r = std::sin(c); float cos_r2 = cos_r * cos_r; float sin_r2 = sin_r * sin_r; float a_val = a * cos_r2 + b * sin_r2; float b_val = a * sin_r2 + b * cos_r2; float c_val = (a - b) * cos_r * sin_r; return std::make_tuple(a_val, b_val, c_val); } static float probiou(const Detection& res1, const Detection& res2, float eps = 1e-7) { // Calculate the prob iou between oriented bounding boxes, https://arxiv.org/pdf/2106.06072v1.pdf. 
float a1, b1, c1, a2, b2, c2; std::tuple matrix1 = {a1, b1, c1}; std::tuple matrix2 = {a2, b2, c2}; matrix1 = convariance_matrix(res1); matrix2 = convariance_matrix(res2); a1 = std::get<0>(matrix1); b1 = std::get<1>(matrix1); c1 = std::get<2>(matrix1); a2 = std::get<0>(matrix2); b2 = std::get<1>(matrix2); c2 = std::get<2>(matrix2); float x1 = res1.bbox[0], y1 = res1.bbox[1]; float x2 = res2.bbox[0], y2 = res2.bbox[1]; float t1 = ((a1 + a2) * std::pow(y1 - y2, 2) + (b1 + b2) * std::pow(x1 - x2, 2)) / ((a1 + a2) * (b1 + b2) - std::pow(c1 + c2, 2) + eps); float t2 = ((c1 + c2) * (x2 - x1) * (y1 - y2)) / ((a1 + a2) * (b1 + b2) - std::pow(c1 + c2, 2) + eps); float t3 = std::log( ((a1 + a2) * (b1 + b2) - std::pow(c1 + c2, 2)) / (4 * std::sqrt(std::max(a1 * b1 - c1 * c1, 0.0f)) * std::sqrt(std::max(a2 * b2 - c2 * c2, 0.0f)) + eps) + eps); float bd = 0.25f * t1 + 0.5f * t2 + 0.5f * t3; bd = std::max(std::min(bd, 100.0f), eps); float hd = std::sqrt(1.0 - std::exp(-bd) + eps); return 1 - hd; } void nms_obb(std::vector& res, float* output, float conf_thresh, float nms_thresh) { int det_size = sizeof(Detection) / sizeof(float); std::map> m; for (int i = 0; i < output[0]; i++) { if (output[1 + det_size * i + 4] <= conf_thresh) continue; Detection det; memcpy(&det, &output[1 + det_size * i], det_size * sizeof(float)); if (m.count(det.class_id) == 0) m.emplace(det.class_id, std::vector()); m[det.class_id].push_back(det); } for (auto it = m.begin(); it != m.end(); it++) { auto& dets = it->second; std::sort(dets.begin(), dets.end(), cmp); for (size_t m = 0; m < dets.size(); ++m) { auto& item = dets[m]; res.push_back(item); for (size_t n = m + 1; n < dets.size(); ++n) { if (probiou(item, dets[n]) >= nms_thresh) { dets.erase(dets.begin() + n); --n; } } } } } void batch_nms_obb(std::vector>& res_batch, float* output, int batch_size, int output_size, float conf_thresh, float nms_thresh) { res_batch.resize(batch_size); for (int i = 0; i < batch_size; i++) { nms_obb(res_batch[i], 
&output[i * output_size], conf_thresh, nms_thresh); } } static std::vector get_corner(cv::Mat& img, const Detection& box) { float cos_value, sin_value; // Calculate center point and width/height float x1 = box.bbox[0]; float y1 = box.bbox[1]; float w = box.bbox[2]; float h = box.bbox[3]; float angle = box.angle * 180.0f / CV_PI; // Convert radians to degrees // Print original angle std::cout << "Original angle: " << angle << std::endl; // Swap width and height if height is greater than or equal to width if (h >= w) { std::swap(w, h); angle = fmod(angle + 90.0f, 180.0f); // Adjust angle to be within [0, 180) } // Ensure the angle is between 0 and 180 degrees if (angle < 0) { angle += 360.0f; // Convert to positive value } if (angle > 180.0f) { angle -= 180.0f; // Subtract 180 from angles greater than 180 } // Print adjusted angle std::cout << "Adjusted angle: " << angle << std::endl; // Convert to normal angle value float normal_angle = fmod(angle, 180.0f); if (normal_angle < 0) { normal_angle += 180.0f; // Ensure it's a positive value } // Print normal angle value std::cout << "Normal angle: " << normal_angle << std::endl; cos_value = std::cos(angle * CV_PI / 180.0f); // Convert to radians sin_value = std::sin(angle * CV_PI / 180.0f); // Calculate each corner point float l = x1 - w / 2; // Left boundary float r = x1 + w / 2; // Right boundary float t = y1 - h / 2; // Top boundary float b = y1 + h / 2; // Bottom boundary // Use get_rect function to scale the coordinates float bbox[4] = {l, t, r, b}; cv::Rect rect = get_rect_obb(img, bbox); float x_ = (rect.x + rect.x + rect.width) / 2; // Center x float y_ = (rect.y + rect.y + rect.height) / 2; // Center y float width = rect.width; // Width float height = rect.height; // Height // Calculate each corner point std::vector corner_points(4); float vec1x = width / 2 * cos_value; float vec1y = width / 2 * sin_value; float vec2x = -height / 2 * sin_value; float vec2y = height / 2 * cos_value; corner_points[0] = 
cv::Point(int(round(x_ + vec1x + vec2x)), int(round(y_ + vec1y + vec2y))); // Top-left corner corner_points[1] = cv::Point(int(round(x_ + vec1x - vec2x)), int(round(y_ + vec1y - vec2y))); // Top-right corner corner_points[2] = cv::Point(int(round(x_ - vec1x - vec2x)), int(round(y_ - vec1y - vec2y))); // Bottom-right corner corner_points[3] = cv::Point(int(round(x_ - vec1x + vec2x)), int(round(y_ - vec1y + vec2y))); // Bottom-left corner // Check and adjust corner points to ensure the rectangle is parallel to image boundaries for (auto& point : corner_points) { point.x = std::max(0, std::min(point.x, img.cols - 1)); point.y = std::max(0, std::min(point.y, img.rows - 1)); } return corner_points; } void draw_bbox_obb(std::vector& img_batch, std::vector>& res_batch) { static std::vector colors = {0xFF3838, 0xFF9D97, 0xFF701F, 0xFFB21D, 0xCFD231, 0x48F90A, 0x92CC17, 0x3DDB86, 0x1A9334, 0x00D4BB, 0x2C99A8, 0x00C2FF, 0x344593, 0x6473FF, 0x0018EC, 0x8438FF, 0x520085, 0xCB38FF, 0xFF95C8, 0xFF37C7}; for (size_t i = 0; i < img_batch.size(); i++) { auto& res = res_batch[i]; auto& img = img_batch[i]; for (auto& obj : res) { auto color = colors[(int)obj.class_id % colors.size()]; auto bgr = cv::Scalar(color & 0xFF, color >> 8 & 0xFF, color >> 16 & 0xFF); auto corner_points = get_corner(img, obj); cv::polylines(img, std::vector>{corner_points}, true, bgr, 1); auto text = (std::to_string((int)(obj.class_id)) + ":" + to_string_with_precision(obj.conf)); cv::Size textsize = cv::getTextSize(text, 0, 0.3, 1, nullptr); int width = textsize.width; int height = textsize.height; bool outside = (corner_points[0].y - height >= 3) ? true : false; cv::Point p1(corner_points[0].x, corner_points[0].y), p2; p2.x = corner_points[0].x + width; if (outside) { p2.y = corner_points[0].y - height - 3; } else { p2.y = corner_points[0].y + height + 3; } cv::rectangle(img, p1, p2, bgr, -1, cv::LINE_AA); cv::putText( img, text, cv::Point(corner_points[0].x, (outside ? 
corner_points[0].y - 2 : corner_points[0].y + height + 2)), 0, 0.3, cv::Scalar::all(255), 1, cv::LINE_AA); } } } ================================================ FILE: yolo11/src/postprocess.cu ================================================ // // Created by lindsay on 23-7-17. // #include "postprocess.h" #include "types.h" static __global__ void decode_kernel_obb(float* predict, int num_bboxes, float confidence_threshold, float* parray, int max_objects) { float count = predict[0]; int position = (blockDim.x * blockIdx.x + threadIdx.x); if (position >= count) return; float* pitem = predict + 1 + position * (sizeof(Detection) / sizeof(float)); int index = atomicAdd(parray, 1); if (index >= max_objects) return; float confidence = pitem[4]; if (confidence < confidence_threshold) return; //[center_x center_y w h conf class_id mask[32] keypoints[51] angle] float cx = pitem[0]; float cy = pitem[1]; float width = pitem[2]; float height = pitem[3]; float label = pitem[5]; float angle = pitem[89]; float* pout_item = parray + 1 + index * bbox_element; *pout_item++ = cx; *pout_item++ = cy; *pout_item++ = width; *pout_item++ = height; *pout_item++ = confidence; *pout_item++ = label; *pout_item++ = 1; // 1 = keep, 0 = ignore *pout_item++ = angle; } static __global__ void decode_kernel(float* predict, int num_bboxes, float confidence_threshold, float* parray, int max_objects) { float count = predict[0]; int position = (blockDim.x * blockIdx.x + threadIdx.x); if (position >= count) return; float* pitem = predict + 1 + position * (sizeof(Detection) / sizeof(float)); int index = atomicAdd(parray, 1); if (index >= max_objects) return; float confidence = pitem[4]; if (confidence < confidence_threshold) return; float left = pitem[0]; float top = pitem[1]; float right = pitem[2]; float bottom = pitem[3]; float label = pitem[5]; float* pout_item = parray + 1 + index * bbox_element; *pout_item++ = left; *pout_item++ = top; *pout_item++ = right; *pout_item++ = bottom; *pout_item++ = 
confidence; *pout_item++ = label; *pout_item++ = 1; // 1 = keep, 0 = ignore } static __device__ float box_iou(float aleft, float atop, float aright, float abottom, float bleft, float btop, float bright, float bbottom) { float cleft = max(aleft, bleft); float ctop = max(atop, btop); float cright = min(aright, bright); float cbottom = min(abottom, bbottom); float c_area = max(cright - cleft, 0.0f) * max(cbottom - ctop, 0.0f); if (c_area == 0.0f) return 0.0f; float a_area = max(0.0f, aright - aleft) * max(0.0f, abottom - atop); float b_area = max(0.0f, bright - bleft) * max(0.0f, bbottom - btop); return c_area / (a_area + b_area - c_area); } static __global__ void nms_kernel(float* bboxes, int max_objects, float threshold) { int position = (blockDim.x * blockIdx.x + threadIdx.x); int count = min(static_cast(bboxes[0]), max_objects); if (position >= count) return; float* pcurrent = bboxes + 1 + position * bbox_element; for (int i = 0; i < count; ++i) { float* pitem = bboxes + 1 + i * bbox_element; if (i == position || pcurrent[5] != pitem[5]) continue; if (pitem[4] >= pcurrent[4]) { if (pitem[4] == pcurrent[4] && i < position) continue; float iou = box_iou(pcurrent[0], pcurrent[1], pcurrent[2], pcurrent[3], pitem[0], pitem[1], pitem[2], pitem[3]); if (iou > threshold) { pcurrent[6] = 0; return; } } } } static __device__ void convariance_matrix(float w, float h, float r, float& a, float& b, float& c) { float a_val = w * w / 12.0f; float b_val = h * h / 12.0f; float cos_r = cosf(r); float sin_r = sinf(r); a = a_val * cos_r * cos_r + b_val * sin_r * sin_r; b = a_val * sin_r * sin_r + b_val * cos_r * cos_r; c = (a_val - b_val) * sin_r * cos_r; } static __device__ float box_probiou(float cx1, float cy1, float w1, float h1, float r1, float cx2, float cy2, float w2, float h2, float r2, float eps = 1e-7) { // Calculate the prob iou between oriented bounding boxes, https://arxiv.org/pdf/2106.06072v1.pdf. 
float a1, b1, c1, a2, b2, c2; convariance_matrix(w1, h1, r1, a1, b1, c1); convariance_matrix(w2, h2, r2, a2, b2, c2); float t1 = ((a1 + a2) * powf(cy1 - cy2, 2) + (b1 + b2) * powf(cx1 - cx2, 2)) / ((a1 + a2) * (b1 + b2) - powf(c1 + c2, 2) + eps); float t2 = ((c1 + c2) * (cx2 - cx1) * (cy1 - cy2)) / ((a1 + a2) * (b1 + b2) - powf(c1 + c2, 2) + eps); float t3 = logf(((a1 + a2) * (b1 + b2) - powf(c1 + c2, 2)) / (4 * sqrtf(fmaxf(a1 * b1 - c1 * c1, 0.0f)) * sqrtf(fmaxf(a2 * b2 - c2 * c2, 0.0f)) + eps) + eps); float bd = 0.25f * t1 + 0.5f * t2 + 0.5f * t3; bd = fmaxf(fminf(bd, 100.0f), eps); float hd = sqrtf(1.0f - expf(-bd) + eps); return 1 - hd; } static __global__ void nms_kernel_obb(float* bboxes, int max_objects, float threshold) { int position = (blockDim.x * blockIdx.x + threadIdx.x); int count = min(static_cast(bboxes[0]), max_objects); if (position >= count) return; float* pcurrent = bboxes + 1 + position * bbox_element; for (int i = 0; i < count; ++i) { float* pitem = bboxes + 1 + i * bbox_element; if (i == position || pcurrent[5] != pitem[5]) continue; if (pitem[4] >= pcurrent[4]) { if (pitem[4] == pcurrent[4] && i < position) continue; float iou = box_probiou(pcurrent[0], pcurrent[1], pcurrent[2], pcurrent[3], pcurrent[7], pitem[0], pitem[1], pitem[2], pitem[3], pitem[7]); if (iou > threshold) { pcurrent[6] = 0; return; } } } } void cuda_decode(float* predict, int num_bboxes, float confidence_threshold, float* parray, int max_objects, cudaStream_t stream) { int block = 256; int grid = ceil(num_bboxes / (float)block); decode_kernel<<>>((float*)predict, num_bboxes, confidence_threshold, parray, max_objects); } void cuda_nms(float* parray, float nms_threshold, int max_objects, cudaStream_t stream) { int block = max_objects < 256 ? 
max_objects : 256; int grid = ceil(max_objects / (float)block); nms_kernel<<>>(parray, max_objects, nms_threshold); } void cuda_decode_obb(float* predict, int num_bboxes, float confidence_threshold, float* parray, int max_objects, cudaStream_t stream) { int block = 256; int grid = ceil(num_bboxes / (float)block); decode_kernel_obb<<>>((float*)predict, num_bboxes, confidence_threshold, parray, max_objects); } void cuda_nms_obb(float* parray, float nms_threshold, int max_objects, cudaStream_t stream) { int block = max_objects < 256 ? max_objects : 256; int grid = ceil(max_objects / (float)block); nms_kernel_obb<<>>(parray, max_objects, nms_threshold); } ================================================ FILE: yolo11/src/preprocess.cu ================================================ #include "cuda_utils.h" #include "preprocess.h" static uint8_t* img_buffer_host = nullptr; static uint8_t* img_buffer_device = nullptr; __global__ void warpaffine_kernel(uint8_t* src, int src_line_size, int src_width, int src_height, float* dst, int dst_width, int dst_height, uint8_t const_value_st, AffineMatrix d2s, int edge) { int position = blockDim.x * blockIdx.x + threadIdx.x; if (position >= edge) return; float m_x1 = d2s.value[0]; float m_y1 = d2s.value[1]; float m_z1 = d2s.value[2]; float m_x2 = d2s.value[3]; float m_y2 = d2s.value[4]; float m_z2 = d2s.value[5]; int dx = position % dst_width; int dy = position / dst_width; float src_x = m_x1 * dx + m_y1 * dy + m_z1 + 0.5f; float src_y = m_x2 * dx + m_y2 * dy + m_z2 + 0.5f; float c0, c1, c2; if (src_x <= -1 || src_x >= src_width || src_y <= -1 || src_y >= src_height) { // out of range c0 = const_value_st; c1 = const_value_st; c2 = const_value_st; } else { int y_low = floorf(src_y); int x_low = floorf(src_x); int y_high = y_low + 1; int x_high = x_low + 1; uint8_t const_value[] = {const_value_st, const_value_st, const_value_st}; float ly = src_y - y_low; float lx = src_x - x_low; float hy = 1 - ly; float hx = 1 - lx; float w1 = hy * 
hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx; uint8_t* v1 = const_value; uint8_t* v2 = const_value; uint8_t* v3 = const_value; uint8_t* v4 = const_value; if (y_low >= 0) { if (x_low >= 0) v1 = src + y_low * src_line_size + x_low * 3; if (x_high < src_width) v2 = src + y_low * src_line_size + x_high * 3; } if (y_high < src_height) { if (x_low >= 0) v3 = src + y_high * src_line_size + x_low * 3; if (x_high < src_width) v4 = src + y_high * src_line_size + x_high * 3; } c0 = w1 * v1[0] + w2 * v2[0] + w3 * v3[0] + w4 * v4[0]; c1 = w1 * v1[1] + w2 * v2[1] + w3 * v3[1] + w4 * v4[1]; c2 = w1 * v1[2] + w2 * v2[2] + w3 * v3[2] + w4 * v4[2]; } // bgr to rgb float t = c2; c2 = c0; c0 = t; // normalization c0 = c0 / 255.0f; c1 = c1 / 255.0f; c2 = c2 / 255.0f; // rgbrgbrgb to rrrgggbbb int area = dst_width * dst_height; float* pdst_c0 = dst + dy * dst_width + dx; float* pdst_c1 = pdst_c0 + area; float* pdst_c2 = pdst_c1 + area; *pdst_c0 = c0; *pdst_c1 = c1; *pdst_c2 = c2; } void cuda_preprocess(uint8_t* src, int src_width, int src_height, float* dst, int dst_width, int dst_height, cudaStream_t stream) { int img_size = src_width * src_height * 3; // copy data to pinned memory memcpy(img_buffer_host, src, img_size); // copy data to device memory CUDA_CHECK(cudaMemcpyAsync(img_buffer_device, img_buffer_host, img_size, cudaMemcpyHostToDevice, stream)); AffineMatrix s2d, d2s; float scale = std::min(dst_height / (float)src_height, dst_width / (float)src_width); s2d.value[0] = scale; s2d.value[1] = 0; s2d.value[2] = -scale * src_width * 0.5 + dst_width * 0.5; s2d.value[3] = 0; s2d.value[4] = scale; s2d.value[5] = -scale * src_height * 0.5 + dst_height * 0.5; cv::Mat m2x3_s2d(2, 3, CV_32F, s2d.value); cv::Mat m2x3_d2s(2, 3, CV_32F, d2s.value); cv::invertAffineTransform(m2x3_s2d, m2x3_d2s); memcpy(d2s.value, m2x3_d2s.ptr(0), sizeof(d2s.value)); int jobs = dst_height * dst_width; int threads = 256; int blocks = ceil(jobs / (float)threads); warpaffine_kernel<<>>(img_buffer_device, 
src_width * 3, src_width, src_height, dst, dst_width, dst_height, 128, d2s, jobs); } void cuda_batch_preprocess(std::vector& img_batch, float* dst, int dst_width, int dst_height, cudaStream_t stream) { int dst_size = dst_width * dst_height * 3; for (size_t i = 0; i < img_batch.size(); i++) { cuda_preprocess(img_batch[i].ptr(), img_batch[i].cols, img_batch[i].rows, &dst[dst_size * i], dst_width, dst_height, stream); CUDA_CHECK(cudaStreamSynchronize(stream)); } } void cuda_preprocess_init(int max_image_size) { // prepare input data in pinned memory CUDA_CHECK(cudaMallocHost((void**)&img_buffer_host, max_image_size * 3)); // prepare input data in device memory CUDA_CHECK(cudaMalloc((void**)&img_buffer_device, max_image_size * 3)); } void cuda_preprocess_destroy() { CUDA_CHECK(cudaFree(img_buffer_device)); CUDA_CHECK(cudaFreeHost(img_buffer_host)); } ================================================ FILE: yolo11/yolo11_cls.cpp ================================================ #include "calibrator.h" #include "config.h" #include "cuda_utils.h" #include "logging.h" #include "model.h" #include "utils.h" #include #include #include #include #include using namespace nvinfer1; static Logger gLogger; const static int kOutputSize = kClsNumClass; void batch_preprocess(std::vector& imgs, float* output, int dst_width = 224, int dst_height = 224) { for (size_t b = 0; b < imgs.size(); b++) { int h = imgs[b].rows; int w = imgs[b].cols; int m = std::min(h, w); int top = (h - m) / 2; int left = (w - m) / 2; cv::Mat img = imgs[b](cv::Rect(left, top, m, m)); cv::resize(img, img, cv::Size(dst_width, dst_height), 0, 0, cv::INTER_LINEAR); cv::cvtColor(img, img, cv::COLOR_BGR2RGB); img.convertTo(img, CV_32F, 1 / 255.0); std::vector channels(3); cv::split(img, channels); // CHW format for (int c = 0; c < 3; ++c) { int i = 0; for (int row = 0; row < dst_height; ++row) { for (int col = 0; col < dst_width; ++col) { output[b * 3 * dst_height * dst_width + c * dst_height * dst_width + i] = 
channels[c].at(row, col); ++i; } } } } } std::vector softmax(float* prob, int n) { std::vector res; float sum = 0.0f; float t; for (int i = 0; i < n; i++) { t = expf(prob[i]); res.push_back(t); sum += t; } for (int i = 0; i < n; i++) { res[i] /= sum; } return res; } std::vector topk(const std::vector& vec, int k) { std::vector topk_index; std::vector vec_index(vec.size()); std::iota(vec_index.begin(), vec_index.end(), 0); std::sort(vec_index.begin(), vec_index.end(), [&vec](size_t index_1, size_t index_2) { return vec[index_1] > vec[index_2]; }); int k_num = std::min(vec.size(), k); for (int i = 0; i < k_num; ++i) { topk_index.push_back(vec_index[i]); } return topk_index; } std::vector read_classes(std::string file_name) { std::vector classes; std::ifstream ifs(file_name, std::ios::in); if (!ifs.is_open()) { std::cerr << file_name << " is not found, pls refer to README and download it." << std::endl; assert(0); } std::string s; while (std::getline(ifs, s)) { classes.push_back(s); } ifs.close(); return classes; } bool parse_args(int argc, char** argv, std::string& wts, std::string& engine, float& gd, float& gw, std::string& img_dir, std::string& type, int& max_channels) { if (argc < 4) return false; if (std::string(argv[1]) == "-s" && (argc == 5)) { wts = std::string(argv[2]); engine = std::string(argv[3]); auto net = std::string(argv[4]); if (net[0] == 'n') { gd = 0.50; gw = 0.25; max_channels = 1024; type = "n"; } else if (net[0] == 's') { gd = 0.50; gw = 0.50; max_channels = 1024; type = "s"; } else if (net[0] == 'm') { gd = 0.50; gw = 1.00; max_channels = 512; type = "m"; } else if (net[0] == 'l') { gd = 1.0; gw = 1.0; max_channels = 512; type = "l"; } else if (net[0] == 'x') { gd = 1.0; gw = 1.50; max_channels = 512; type = "x"; } else { return false; } } else if (std::string(argv[1]) == "-d" && argc == 4) { engine = std::string(argv[2]); img_dir = std::string(argv[3]); } else { return false; } return true; } void prepare_buffers(ICudaEngine* engine, float** 
gpu_input_buffer, float** gpu_output_buffer, float** cpu_input_buffer, float** output_buffer_host) { assert(engine->getNbBindings() == 2); // In order to bind the buffers, we need to know the names of the input and output tensors. // Note that indices are guaranteed to be less than IEngine::getNbBindings() const int inputIndex = engine->getBindingIndex(kInputTensorName); const int outputIndex = engine->getBindingIndex(kOutputTensorName); assert(inputIndex == 0); assert(outputIndex == 1); // Create GPU buffers on device CUDA_CHECK(cudaMalloc((void**)gpu_input_buffer, kBatchSize * 3 * kClsInputH * kClsInputW * sizeof(float))); CUDA_CHECK(cudaMalloc((void**)gpu_output_buffer, kBatchSize * kOutputSize * sizeof(float))); *cpu_input_buffer = new float[kBatchSize * 3 * kClsInputH * kClsInputW]; *output_buffer_host = new float[kBatchSize * kOutputSize]; } void infer(IExecutionContext& context, cudaStream_t& stream, void** buffers, float* input, float* output, int batchSize) { CUDA_CHECK(cudaMemcpyAsync(buffers[0], input, batchSize * 3 * kClsInputH * kClsInputW * sizeof(float), cudaMemcpyHostToDevice, stream)); context.enqueueV2(buffers, stream, nullptr); CUDA_CHECK(cudaMemcpyAsync(output, buffers[1], batchSize * kOutputSize * sizeof(float), cudaMemcpyDeviceToHost, stream)); cudaStreamSynchronize(stream); } void serialize_engine(float& gd, float& gw, std::string& wts_name, std::string& engine_name, std::string& type, int max_channels) { // Create builder IBuilder* builder = createInferBuilder(gLogger); IBuilderConfig* config = builder->createBuilderConfig(); // Create model to populate the network, then set the outputs and create an engine IHostMemory* serialized_engine = nullptr; //engine = buildEngineYolo11Cls(max_batchsize, builder, config, DataType::kFLOAT, gd, gw, wts_name); serialized_engine = buildEngineYolo11Cls(builder, config, DataType::kFLOAT, wts_name, gd, gw, type, max_channels); assert(serialized_engine); // Save engine to file std::ofstream p(engine_name, 
std::ios::binary); if (!p) { std::cerr << "Could not open plan output file" << std::endl; assert(false); } p.write(reinterpret_cast(serialized_engine->data()), serialized_engine->size()); // Close everything down delete serialized_engine; delete config; delete builder; } void deserialize_engine(std::string& engine_name, IRuntime** runtime, ICudaEngine** engine, IExecutionContext** context) { std::ifstream file(engine_name, std::ios::binary); if (!file.good()) { std::cerr << "read " << engine_name << " error!" << std::endl; assert(false); } size_t size = 0; file.seekg(0, file.end); size = file.tellg(); file.seekg(0, file.beg); char* serialized_engine = new char[size]; assert(serialized_engine); file.read(serialized_engine, size); file.close(); *runtime = createInferRuntime(gLogger); assert(*runtime); *engine = (*runtime)->deserializeCudaEngine(serialized_engine, size); assert(*engine); *context = (*engine)->createExecutionContext(); assert(*context); delete[] serialized_engine; } int main(int argc, char** argv) { // yolo11_cls -s ../models/yolo11n-cls.wts ../models/yolo11n-cls.fp32.trt n // yolo11_cls -d ../models/yolo11n-cls.fp32.trt ../images cudaSetDevice(kGpuId); std::string wts_name; std::string engine_name; float gd = 0.0f, gw = 0.0f; std::string img_dir; std::string type; int max_channels = 0; if (!parse_args(argc, argv, wts_name, engine_name, gd, gw, img_dir, type, max_channels)) { std::cerr << "arguments not right!" 
<< std::endl; std::cerr << "./yolo11_cls -s [.wts] [.engine] [n/s/m/l/x] // serialize model to plan file" << std::endl; std::cerr << "./yolo11_cls -d [.engine] ../images // deserialize plan file and run inference" << std::endl; return -1; } // Create a model using the API directly and serialize it to a file if (!wts_name.empty()) { serialize_engine(gd, gw, wts_name, engine_name, type, max_channels); return 0; } // Deserialize the engine from file IRuntime* runtime = nullptr; ICudaEngine* engine = nullptr; IExecutionContext* context = nullptr; deserialize_engine(engine_name, &runtime, &engine, &context); cudaStream_t stream; CUDA_CHECK(cudaStreamCreate(&stream)); // Prepare cpu and gpu buffers float* device_buffers[2]; float* cpu_input_buffer = nullptr; float* output_buffer_host = nullptr; prepare_buffers(engine, &device_buffers[0], &device_buffers[1], &cpu_input_buffer, &output_buffer_host); // Read images from directory std::vector file_names; if (read_files_in_dir(img_dir.c_str(), file_names) < 0) { std::cerr << "read_files_in_dir failed." 
<< std::endl; return -1; } // Read imagenet labels auto classes = read_classes("imagenet_classes.txt"); // batch predict for (size_t i = 0; i < file_names.size(); i += kBatchSize) { // Get a batch of images std::vector img_batch; std::vector img_name_batch; for (size_t j = i; j < i + kBatchSize && j < file_names.size(); j++) { cv::Mat img = cv::imread(img_dir + "/" + file_names[j]); img_batch.push_back(img); img_name_batch.push_back(file_names[j]); } // Preprocess batch_preprocess(img_batch, cpu_input_buffer); // Run inference auto start = std::chrono::system_clock::now(); infer(*context, stream, (void**)device_buffers, cpu_input_buffer, output_buffer_host, kBatchSize); auto end = std::chrono::system_clock::now(); std::cout << "inference time: " << std::chrono::duration_cast(end - start).count() << "ms" << std::endl; // Postprocess and get top-k result for (size_t b = 0; b < img_name_batch.size(); b++) { float* p = &output_buffer_host[b * kOutputSize]; auto res = softmax(p, kOutputSize); auto topk_idx = topk(res, 3); std::cout << img_name_batch[b] << std::endl; for (auto idx : topk_idx) { std::cout << " " << classes[idx] << " " << res[idx] << std::endl; } } } // Release stream and buffers cudaStreamDestroy(stream); CUDA_CHECK(cudaFree(device_buffers[0])); CUDA_CHECK(cudaFree(device_buffers[1])); delete[] cpu_input_buffer; delete[] output_buffer_host; // Destroy the engine delete context; delete engine; delete runtime; return 0; } ================================================ FILE: yolo11/yolo11_cls_trt.py ================================================ """ An example that uses TensorRT's Python api to make inferences. 
""" import os import shutil import sys import threading import time import cv2 import numpy as np import torch import pycuda.autoinit # noqa: F401 import pycuda.driver as cuda import tensorrt as trt def get_img_path_batches(batch_size, img_dir): ret = [] batch = [] for root, dirs, files in os.walk(img_dir): for name in files: if len(batch) == batch_size: ret.append(batch) batch = [] batch.append(os.path.join(root, name)) if len(batch) > 0: ret.append(batch) return ret with open("imagenet_classes.txt") as f: classes = [line.strip() for line in f.readlines()] class YoLo11TRT(object): """ description: A YOLO11 class that warps TensorRT ops, preprocess and postprocess ops. """ def __init__(self, engine_file_path): # Create a Context on this device, self.ctx = cuda.Device(0).make_context() stream = cuda.Stream() TRT_LOGGER = trt.Logger(trt.Logger.INFO) runtime = trt.Runtime(TRT_LOGGER) # Deserialize the engine from file with open(engine_file_path, "rb") as f: engine = runtime.deserialize_cuda_engine(f.read()) context = engine.create_execution_context() host_inputs = [] cuda_inputs = [] host_outputs = [] cuda_outputs = [] bindings = [] self.mean = (0.485, 0.456, 0.406) self.std = (0.229, 0.224, 0.225) for binding in engine: print('binding:', binding, engine.get_binding_shape(binding)) self.batch_size = engine.get_binding_shape(binding)[0] size = trt.volume(engine.get_binding_shape( binding)) * engine.max_batch_size dtype = trt.nptype(engine.get_binding_dtype(binding)) # Allocate host and device buffers host_mem = cuda.pagelocked_empty(size, dtype) cuda_mem = cuda.mem_alloc(host_mem.nbytes) # Append the device buffer to device bindings. bindings.append(int(cuda_mem)) # Append to the appropriate list. 
if engine.binding_is_input(binding): self.input_w = engine.get_binding_shape(binding)[-1] self.input_h = engine.get_binding_shape(binding)[-2] host_inputs.append(host_mem) cuda_inputs.append(cuda_mem) else: host_outputs.append(host_mem) cuda_outputs.append(cuda_mem) # Store self.stream = stream self.context = context self.host_inputs = host_inputs self.cuda_inputs = cuda_inputs self.host_outputs = host_outputs self.cuda_outputs = cuda_outputs self.bindings = bindings def infer(self, raw_image_generator): threading.Thread.__init__(self) # Make self the active context, pushing it on top of the context stack. self.ctx.push() # Restore stream = self.stream context = self.context host_inputs = self.host_inputs cuda_inputs = self.cuda_inputs host_outputs = self.host_outputs cuda_outputs = self.cuda_outputs bindings = self.bindings # Do image preprocess batch_image_raw = [] batch_input_image = np.empty( shape=[self.batch_size, 3, self.input_h, self.input_w]) for i, image_raw in enumerate(raw_image_generator): batch_image_raw.append(image_raw) input_image = self.preprocess_cls_image(image_raw) np.copyto(batch_input_image[i], input_image) batch_input_image = np.ascontiguousarray(batch_input_image) # Copy input image to host buffer np.copyto(host_inputs[0], batch_input_image.ravel()) start = time.time() # Transfer input data to the GPU. cuda.memcpy_htod_async(cuda_inputs[0], host_inputs[0], stream) # Run inference. context.execute_async_v2(bindings=bindings, stream_handle=stream.handle) # Transfer predictions back from the GPU. cuda.memcpy_dtoh_async(host_outputs[0], cuda_outputs[0], stream) # Synchronize the stream stream.synchronize() end = time.time() # Remove any context from the top of the context stack, deactivating it. 
self.ctx.pop() # Here we use the first row of output in that batch_size = 1 output = host_outputs[0] # Do postprocess for i in range(self.batch_size): classes_ls, predicted_conf_ls, category_id_ls = self.postprocess_cls( output) cv2.putText(batch_image_raw[i], str( classes_ls), (10, 50), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 1, cv2.LINE_AA) print(classes_ls, predicted_conf_ls) return batch_image_raw, end - start def destroy(self): # Remove any context from the top of the context stack, deactivating it. self.ctx.pop() def get_raw_image(self, image_path_batch): """ description: Read an image from image path """ for img_path in image_path_batch: yield cv2.imread(img_path) def get_raw_image_zeros(self, image_path_batch=None): """ description: Ready data for warmup """ for _ in range(self.batch_size): yield np.zeros([self.input_h, self.input_w, 3], dtype=np.uint8) def preprocess_cls_image(self, raw_bgr_image, dst_width=224, dst_height=224): """ description: Convert BGR image to RGB, crop the center square frame, resize it to target size, normalize to [0,1], transform to NCHW format. 
param: raw_bgr_image: numpy array, raw BGR image dst_width: int, target image width dst_height: int, target image height return: image: the processed image image_raw: the original image h: original height w: original width """ image_raw = raw_bgr_image h, w, c = image_raw.shape # Crop the center square frame m = min(h, w) top = (h - m) // 2 left = (w - m) // 2 image = raw_bgr_image[top:top + m, left:left + m] # Resize the image with target size while maintaining ratio image = cv2.resize(image, (dst_width, dst_height), interpolation=cv2.INTER_LINEAR) # Convert BGR to RGB image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB) # Normalize to [0,1] image = image.astype(np.float32) / 255.0 # HWC to CHW format image = image.transpose(2, 0, 1) # CHW to NCHW format (add batch dimension) image = np.expand_dims(image, axis=0) # Convert the image to row-major order, also known as "C order" image = np.ascontiguousarray(image) batch_data = np.expand_dims(image, axis=0) return batch_data def postprocess_cls(self, output_data): classes_ls = [] predicted_conf_ls = [] category_id_ls = [] output_data = output_data.reshape(self.batch_size, -1) output_data = torch.Tensor(output_data) p = torch.nn.functional.softmax(output_data, dim=1) score, index = torch.topk(p, 3) for ind in range(index.shape[0]): input_category_id = index[ind][0].item() # 716 category_id_ls.append(input_category_id) predicted_confidence = score[ind][0].item() predicted_conf_ls.append(predicted_confidence) classes_ls.append(classes[input_category_id]) return classes_ls, predicted_conf_ls, category_id_ls class inferThread(threading.Thread): def __init__(self, yolo11_wrapper, image_path_batch): threading.Thread.__init__(self) self.yolo11_wrapper = yolo11_wrapper self.image_path_batch = image_path_batch def run(self): batch_image_raw, use_time = self.yolo11_wrapper.infer( self.yolo11_wrapper.get_raw_image(self.image_path_batch)) for i, img_path in enumerate(self.image_path_batch): parent, filename = os.path.split(img_path) 
save_name = os.path.join('output', filename) # Save image cv2.imwrite(save_name, batch_image_raw[i]) print('input->{}, time->{:.2f}ms, saving into output/'.format( self.image_path_batch, use_time * 1000)) class warmUpThread(threading.Thread): def __init__(self, yolo11_wrapper): threading.Thread.__init__(self) self.yolo11_wrapper = yolo11_wrapper def run(self): batch_image_raw, use_time = self.yolo11_wrapper.infer( self.yolo11_wrapper.get_raw_image_zeros()) print( 'warm_up->{}, time->{:.2f}ms'.format(batch_image_raw[0].shape, use_time * 1000)) if __name__ == "__main__": # load custom plugin and engine engine_file_path = "./yolo11x-cls-fp32.engine" if len(sys.argv) > 1: engine_file_path = sys.argv[1] if os.path.exists('output/'): shutil.rmtree('output/') os.makedirs('output/') # a YoLo11TRT instance yolo11_wrapper = YoLo11TRT(engine_file_path) try: print('batch size is', yolo11_wrapper.batch_size) image_dir = "images/" image_path_batches = get_img_path_batches( yolo11_wrapper.batch_size, image_dir) for i in range(10): # create a new thread to do warm_up thread1 = warmUpThread(yolo11_wrapper) thread1.start() thread1.join() for batch in image_path_batches: # create a new thread to do inference thread1 = inferThread(yolo11_wrapper, batch) thread1.start() thread1.join() finally: # destroy the instance yolo11_wrapper.destroy() ================================================ FILE: yolo11/yolo11_det.cpp ================================================ #include #include #include #include "cuda_utils.h" #include "logging.h" #include "model.h" #include "postprocess.h" #include "preprocess.h" #include "utils.h" Logger gLogger; using namespace nvinfer1; const int kOutputSize = kMaxNumOutputBbox * sizeof(Detection) / sizeof(float) + 1; void serialize_engine(std::string& wts_name, std::string& engine_name, float& gd, float& gw, int& max_channels, std::string& type) { IBuilder* builder = createInferBuilder(gLogger); IBuilderConfig* config = builder->createBuilderConfig(); 
IHostMemory* serialized_engine = nullptr; serialized_engine = buildEngineYolo11Det(builder, config, DataType::kFLOAT, wts_name, gd, gw, max_channels, type); assert(serialized_engine); std::ofstream p(engine_name, std::ios::binary); if (!p) { std::cout << "could not open plan output file" << std::endl; assert(false); } p.write(reinterpret_cast(serialized_engine->data()), serialized_engine->size()); delete serialized_engine; delete config; delete builder; } void deserialize_engine(std::string& engine_name, IRuntime** runtime, ICudaEngine** engine, IExecutionContext** context) { std::ifstream file(engine_name, std::ios::binary); if (!file.good()) { std::cerr << "read " << engine_name << " error!" << std::endl; assert(false); } size_t size = 0; file.seekg(0, file.end); size = file.tellg(); file.seekg(0, file.beg); char* serialized_engine = new char[size]; assert(serialized_engine); file.read(serialized_engine, size); file.close(); *runtime = createInferRuntime(gLogger); assert(*runtime); *engine = (*runtime)->deserializeCudaEngine(serialized_engine, size); assert(*engine); *context = (*engine)->createExecutionContext(); assert(*context); delete[] serialized_engine; } void prepare_buffer(ICudaEngine* engine, float** input_buffer_device, float** output_buffer_device, float** output_buffer_host, float** decode_ptr_host, float** decode_ptr_device, std::string cuda_post_process) { assert(engine->getNbBindings() == 2); // In order to bind the buffers, we need to know the names of the input and output tensors. 
// Note that indices are guaranteed to be less than IEngine::getNbBindings() const int inputIndex = engine->getBindingIndex(kInputTensorName); const int outputIndex = engine->getBindingIndex(kOutputTensorName); assert(inputIndex == 0); assert(outputIndex == 1); // Create GPU buffers on device CUDA_CHECK(cudaMalloc((void**)input_buffer_device, kBatchSize * 3 * kInputH * kInputW * sizeof(float))); CUDA_CHECK(cudaMalloc((void**)output_buffer_device, kBatchSize * kOutputSize * sizeof(float))); if (cuda_post_process == "c") { *output_buffer_host = new float[kBatchSize * kOutputSize]; } else if (cuda_post_process == "g") { if (kBatchSize > 1) { std::cerr << "Do not yet support GPU post processing for multiple batches" << std::endl; exit(0); } // Allocate memory for decode_ptr_host and copy to device *decode_ptr_host = new float[1 + kMaxNumOutputBbox * bbox_element]; CUDA_CHECK(cudaMalloc((void**)decode_ptr_device, sizeof(float) * (1 + kMaxNumOutputBbox * bbox_element))); } } void infer(IExecutionContext& context, cudaStream_t& stream, void** buffers, float* output, int batchsize, float* decode_ptr_host, float* decode_ptr_device, int model_bboxes, std::string cuda_post_process) { // infer on the batch asynchronously, and DMA output back to host auto start = std::chrono::system_clock::now(); context.enqueueV2(buffers, stream, nullptr); if (cuda_post_process == "c") { CUDA_CHECK(cudaMemcpyAsync(output, buffers[1], batchsize * kOutputSize * sizeof(float), cudaMemcpyDeviceToHost, stream)); auto end = std::chrono::system_clock::now(); std::cout << "inference time: " << std::chrono::duration_cast(end - start).count() << "ms" << std::endl; } else if (cuda_post_process == "g") { CUDA_CHECK( cudaMemsetAsync(decode_ptr_device, 0, sizeof(float) * (1 + kMaxNumOutputBbox * bbox_element), stream)); cuda_decode((float*)buffers[1], model_bboxes, kConfThresh, decode_ptr_device, kMaxNumOutputBbox, stream); cuda_nms(decode_ptr_device, kNmsThresh, kMaxNumOutputBbox, stream); //cuda nms 
CUDA_CHECK(cudaMemcpyAsync(decode_ptr_host, decode_ptr_device, sizeof(float) * (1 + kMaxNumOutputBbox * bbox_element), cudaMemcpyDeviceToHost, stream)); auto end = std::chrono::system_clock::now(); std::cout << "inference and gpu postprocess time: " << std::chrono::duration_cast(end - start).count() << "ms" << std::endl; } CUDA_CHECK(cudaStreamSynchronize(stream)); } bool parse_args(int argc, char** argv, std::string& wts, std::string& engine, std::string& img_dir, std::string& type, std::string& cuda_post_process, float& gd, float& gw, int& max_channels) { if (argc < 4) return false; if (std::string(argv[1]) == "-s" && (argc == 5)) { wts = std::string(argv[2]); engine = std::string(argv[3]); auto sub_type = std::string(argv[4]); if (sub_type[0] == 'n') { gd = 0.50; gw = 0.25; max_channels = 1024; type = "n"; } else if (sub_type[0] == 's') { gd = 0.50; gw = 0.50; max_channels = 1024; type = "s"; } else if (sub_type[0] == 'm') { gd = 0.50; gw = 1.00; max_channels = 512; type = "m"; } else if (sub_type[0] == 'l') { gd = 1.0; gw = 1.0; max_channels = 512; type = "l"; } else if (sub_type[0] == 'x') { gd = 1.0; gw = 1.50; max_channels = 512; type = "x"; } else { return false; } } else if (std::string(argv[1]) == "-d" && argc == 5) { engine = std::string(argv[2]); img_dir = std::string(argv[3]); cuda_post_process = std::string(argv[4]); } else { return false; } return true; } int main(int argc, char** argv) { // yolo11_det -s ../models/yolo11n.wts ../models/yolo11n.fp32.trt n // yolo11_det -d ../models/yolo11n.fp32.trt ../images c cudaSetDevice(kGpuId); std::string wts_name; std::string engine_name; std::string img_dir; std::string cuda_post_process; std::string type; int model_bboxes; float gd = 0, gw = 0; int max_channels = 0; if (!parse_args(argc, argv, wts_name, engine_name, img_dir, type, cuda_post_process, gd, gw, max_channels)) { std::cerr << "Arguments not right!" 
<< std::endl; std::cerr << "./yolo11_det -s [.wts] [.engine] [n/s/m/l/x] // serialize model to " "plan file" << std::endl; std::cerr << "./yolo11_det -d [.engine] ../images [c/g]// deserialize plan file and run inference" << std::endl; return -1; } // Create a model using the API directly and serialize it to a file if (!wts_name.empty()) { serialize_engine(wts_name, engine_name, gd, gw, max_channels, type); return 0; } // Deserialize the engine from file IRuntime* runtime = nullptr; ICudaEngine* engine = nullptr; IExecutionContext* context = nullptr; deserialize_engine(engine_name, &runtime, &engine, &context); cudaStream_t stream; CUDA_CHECK(cudaStreamCreate(&stream)); cuda_preprocess_init(kMaxInputImageSize); auto out_dims = engine->getBindingDimensions(1); model_bboxes = out_dims.d[0]; // Prepare cpu and gpu buffers float* device_buffers[2]; float* output_buffer_host = nullptr; float* decode_ptr_host = nullptr; float* decode_ptr_device = nullptr; // Read images from directory std::vector file_names; if (read_files_in_dir(img_dir.c_str(), file_names) < 0) { std::cerr << "read_files_in_dir failed." 
<< std::endl; return -1; } prepare_buffer(engine, &device_buffers[0], &device_buffers[1], &output_buffer_host, &decode_ptr_host, &decode_ptr_device, cuda_post_process); // batch predict for (size_t i = 0; i < file_names.size(); i += kBatchSize) { // Get a batch of images std::vector img_batch; std::vector img_name_batch; for (size_t j = i; j < i + kBatchSize && j < file_names.size(); j++) { cv::Mat img = cv::imread(img_dir + "/" + file_names[j]); img_batch.push_back(img); img_name_batch.push_back(file_names[j]); } // Preprocess cuda_batch_preprocess(img_batch, device_buffers[0], kInputW, kInputH, stream); // Run inference infer(*context, stream, (void**)device_buffers, output_buffer_host, kBatchSize, decode_ptr_host, decode_ptr_device, model_bboxes, cuda_post_process); // 保存output_buffer_host的前100个值,一行一个 // std::ofstream out("../models/output.txt"); // for (int j = 0; j < 100; j++) { // out << output_buffer_host[j] << std::endl; // } // out.close(); std::vector> res_batch; if (cuda_post_process == "c") { // NMS batch_nms(res_batch, output_buffer_host, img_batch.size(), kOutputSize, kConfThresh, kNmsThresh); } else if (cuda_post_process == "g") { //Process gpu decode and nms results batch_process(res_batch, decode_ptr_host, img_batch.size(), bbox_element, img_batch); } // Draw bounding boxes draw_bbox(img_batch, res_batch); // Save images for (size_t j = 0; j < img_batch.size(); j++) { cv::imwrite("_" + img_name_batch[j], img_batch[j]); } } // Release stream and buffers cudaStreamDestroy(stream); CUDA_CHECK(cudaFree(device_buffers[0])); CUDA_CHECK(cudaFree(device_buffers[1])); CUDA_CHECK(cudaFree(decode_ptr_device)); delete[] decode_ptr_host; delete[] output_buffer_host; cuda_preprocess_destroy(); // Destroy the engine delete context; delete engine; delete runtime; // Print histogram of the output distribution //std::cout << "\nOutput:\n\n"; //for (unsigned int i = 0; i < kOutputSize; i++) //{ // std::cout << prob[i] << ", "; // if (i % 10 == 0) std::cout << 
class YoLo11TRT(object):
    """
    description: A YOLO11 detector that wraps the TensorRT engine together with
                 its pre-processing and post-processing steps.
    """

    # Column of a detection row that holds the class id
    # (row layout starts with x1, y1, x2, y2, conf, cls_id).
    CLASS_ID_COLUMN = 5

    def __init__(self, engine_file_path):
        """
        description: Deserialize a TensorRT engine and allocate one page-locked host
                     buffer and one device buffer per binding.
        param:
            engine_file_path: str, path of the serialized engine
        """
        # Create a Context on this device,
        self.ctx = cuda.Device(0).make_context()
        try:
            stream = cuda.Stream()
            TRT_LOGGER = trt.Logger(trt.Logger.INFO)
            runtime = trt.Runtime(TRT_LOGGER)

            # Deserialize the engine from file
            with open(engine_file_path, "rb") as f:
                engine = runtime.deserialize_cuda_engine(f.read())
            context = engine.create_execution_context()

            host_inputs = []
            cuda_inputs = []
            host_outputs = []
            cuda_outputs = []
            bindings = []

            for binding in engine:
                shape = engine.get_binding_shape(binding)
                print('bingding:', binding, shape)
                # The first dimension of every binding is taken as the batch size.
                self.batch_size = shape[0]
                size = trt.volume(shape) * engine.max_batch_size
                dtype = trt.nptype(engine.get_binding_dtype(binding))
                # Allocate host and device buffers
                host_mem = cuda.pagelocked_empty(size, dtype)
                cuda_mem = cuda.mem_alloc(host_mem.nbytes)
                # Append the device buffer to device bindings.
                bindings.append(int(cuda_mem))
                # Append to the appropriate list.
                if engine.binding_is_input(binding):
                    self.input_w = shape[-1]
                    self.input_h = shape[-2]
                    host_inputs.append(host_mem)
                    cuda_inputs.append(cuda_mem)
                else:
                    host_outputs.append(host_mem)
                    cuda_outputs.append(cuda_mem)
        except BaseException:
            # The context created above is still on the stack; without this pop a
            # failed construction (missing engine file, bad plan, ...) would leave
            # it active with nobody left to call destroy().
            self.ctx.pop()
            raise

        # Store
        self.stream = stream
        self.context = context
        self.engine = engine
        self.host_inputs = host_inputs
        self.cuda_inputs = cuda_inputs
        self.host_outputs = host_outputs
        self.cuda_outputs = cuda_outputs
        self.bindings = bindings
        # Length of the whole first output buffer (all images of the batch).
        self.det_output_length = host_outputs[0].shape[0]

    def infer(self, raw_image_generator):
        """
        description: Run one batch through the engine and draw the detections on the
                     original images. Labels are looked up in the module-level
                     `categories` list.
        param:
            raw_image_generator: iterable yielding at most `batch_size` BGR images
        return:
            batch_image_raw: list of the input images with boxes drawn on them
            use_time:        seconds spent on host->device copy, inference and
                             device->host copy
        """
        stream = self.stream
        context = self.context
        host_inputs = self.host_inputs
        cuda_inputs = self.cuda_inputs
        host_outputs = self.host_outputs
        cuda_outputs = self.cuda_outputs
        bindings = self.bindings

        batch_image_raw = []
        batch_origin_h = []
        batch_origin_w = []

        # Make self the active context, pushing it on top of the context stack.
        self.ctx.push()
        try:
            # Zero-filled so that the unused slots of a partial batch hold defined data.
            batch_input_image = np.zeros(
                shape=[self.batch_size, 3, self.input_h, self.input_w], dtype=np.float32
            )
            for i, image_raw in enumerate(raw_image_generator):
                if i >= self.batch_size:
                    raise ValueError(
                        "received more than batch_size={} images".format(self.batch_size)
                    )
                input_image, image_raw, origin_h, origin_w = self.preprocess_image(image_raw)
                batch_image_raw.append(image_raw)
                batch_origin_h.append(origin_h)
                batch_origin_w.append(origin_w)
                np.copyto(batch_input_image[i], input_image)
            batch_input_image = np.ascontiguousarray(batch_input_image)

            # Copy input image to host buffer
            np.copyto(host_inputs[0], batch_input_image.ravel())
            start = time.time()
            # Transfer input data to the GPU.
            cuda.memcpy_htod_async(cuda_inputs[0], host_inputs[0], stream)
            # Run inference.
            context.execute_async_v2(bindings=bindings, stream_handle=stream.handle)
            # Transfer predictions back from the GPU.
            cuda.memcpy_dtoh_async(host_outputs[0], cuda_outputs[0], stream)
            # Synchronize the stream
            stream.synchronize()
            end = time.time()
        finally:
            # Always deactivate the context, even when preprocessing or inference failed.
            self.ctx.pop()

        output = host_outputs[0]
        # det_output_length covers the whole buffer; one image owns an equal share.
        # NOTE(review): assumes the output buffer stores batch_size records back to
        # back -- identical to the previous behaviour when batch_size == 1.
        per_image_length = self.det_output_length // self.batch_size
        # Only post-process the images that were actually supplied: the final batch
        # of a directory may be smaller than batch_size.
        for i, image in enumerate(batch_image_raw):
            result_boxes, result_scores, result_classid = self.post_process(
                output[i * per_image_length: (i + 1) * per_image_length],
                batch_origin_h[i], batch_origin_w[i]
            )
            # Draw rectangles and labels on the original image
            for box, score, class_id in zip(result_boxes, result_scores, result_classid):
                plot_one_box(
                    box,
                    image,
                    label="{}:{:.2f}".format(categories[int(class_id)], score),
                )
        return batch_image_raw, end - start

    def destroy(self):
        """
        description: Pop the CUDA context that __init__ left on the context stack.
        """
        # Remove any context from the top of the context stack, deactivating it.
        self.ctx.pop()

    def get_raw_image(self, image_path_batch):
        """
        description: Read an image from image path
        param:
            image_path_batch: iterable of image paths
        return:
            generator yielding the result of cv2.imread for every path
        """
        for img_path in image_path_batch:
            yield cv2.imread(img_path)

    def get_raw_image_zeros(self, image_path_batch=None):
        """
        description: Ready data for warmup: yields batch_size black images of the
                     network input size. `image_path_batch` is ignored.
        """
        for _ in range(self.batch_size):
            yield np.zeros([self.input_h, self.input_w, 3], dtype=np.uint8)

    def preprocess_image(self, raw_bgr_image):
        """
        description: Convert BGR image to RGB,
                     resize and pad it to target size, normalize to [0,1],
                     transform to NCHW format.
        param:
            raw_bgr_image: HxWx3 BGR image (e.g. the result of cv2.imread)
        return:
            image:  the processed image, shape (1, 3, input_h, input_w), float32
            image_raw: the original image
            h: original height
            w: original width
        raise:
            ValueError: if raw_bgr_image is None (cv2.imread could not read the file)
        """
        if raw_bgr_image is None:
            raise ValueError("preprocess_image received None: the image could not be read")
        image_raw = raw_bgr_image
        h, w, c = image_raw.shape
        image = cv2.cvtColor(image_raw, cv2.COLOR_BGR2RGB)
        # Calculate width and height and paddings
        r_w = self.input_w / w
        r_h = self.input_h / h
        if r_h > r_w:
            tw = self.input_w
            # Keep at least one pixel so extreme aspect ratios do not produce an empty resize.
            th = max(1, int(r_w * h))
            tx1 = tx2 = 0
            ty1 = int((self.input_h - th) / 2)
            ty2 = self.input_h - th - ty1
        else:
            tw = max(1, int(r_h * w))
            th = self.input_h
            tx1 = int((self.input_w - tw) / 2)
            tx2 = self.input_w - tw - tx1
            ty1 = ty2 = 0
        # Resize the image with long side while maintaining ratio
        image = cv2.resize(image, (tw, th))
        # Pad the short side with (128,128,128)
        image = cv2.copyMakeBorder(
            image, ty1, ty2, tx1, tx2, cv2.BORDER_CONSTANT, None, (128, 128, 128)
        )
        image = image.astype(np.float32)
        # Normalize to [0,1]
        image /= 255.0
        # HWC to CHW format:
        image = np.transpose(image, [2, 0, 1])
        # CHW to NCHW format
        image = np.expand_dims(image, axis=0)
        # Convert the image to row-major order, also known as "C order":
        image = np.ascontiguousarray(image)
        return image, image_raw, h, w

    def xywh2xyxy(self, origin_h, origin_w, x):
        """
        description: Map boxes from the letterboxed network input back to the original
                     image. Despite the name no centre/size conversion is done here:
                     columns 0 and 2 are treated as x coordinates, columns 1 and 3 as
                     y coordinates; the padding offset is removed and the result is
                     divided by the resize ratio.
        param:
            origin_h:   height of original image
            origin_w:   width of original image
            x:          A boxes numpy, each row is a box in network-input coordinates
        return:
            y:          A boxes numpy, each row is a box in original-image coordinates
        """
        y = np.zeros_like(x)
        r_w = self.input_w / origin_w
        r_h = self.input_h / origin_h
        if r_h > r_w:
            # Width is the limiting side: padding was added above and below.
            pad = (self.input_h - r_w * origin_h) / 2
            y[:, 0] = x[:, 0]
            y[:, 2] = x[:, 2]
            y[:, 1] = x[:, 1] - pad
            y[:, 3] = x[:, 3] - pad
            y /= r_w
        else:
            # Height is the limiting side: padding was added left and right.
            pad = (self.input_w - r_h * origin_w) / 2
            y[:, 0] = x[:, 0] - pad
            y[:, 2] = x[:, 2] - pad
            y[:, 1] = x[:, 1]
            y[:, 3] = x[:, 3]
            y /= r_h

        return y

    def post_process(self, output, origin_h, origin_w):
        """
        description: postprocess the prediction
        param:
            output:     flat numpy array: the detection count followed by fixed-size
                        records of DET_NUM + SEG_NUM + POSE_NUM + OBB_NUM values each
            origin_h:   height of original image
            origin_w:   width of original image
        return:
            result_boxes: final boxes, a boxes numpy, each row is a box [x1, y1, x2, y2]
            result_scores: final scores, a numpy, each element is the score corresponding to box
            result_classid: final classid, a numpy, each element is the classid corresponding to box
        """
        num_values_per_detection = DET_NUM + SEG_NUM + POSE_NUM + OBB_NUM
        # Get the num of boxes detected
        num = int(output[0])
        # Reshape to a two dimensional ndarray and keep only the valid records
        pred = np.reshape(output[1:], (-1, num_values_per_detection))[:num, :]
        # Do nms
        boxes = self.non_max_suppression(pred, origin_h, origin_w, conf_thres=CONF_THRESH, nms_thres=IOU_THRESHOLD)
        result_boxes = boxes[:, :4] if len(boxes) else np.array([])
        result_scores = boxes[:, 4] if len(boxes) else np.array([])
        result_classid = boxes[:, 5] if len(boxes) else np.array([])
        return result_boxes, result_scores, result_classid

    def bbox_iou(self, box1, box2, x1y1x2y2=True):
        """
        description: compute the IoU of two bounding boxes (pixel-inclusive, i.e. a
                     box from x1 to x2 is x2 - x1 + 1 wide)
        param:
            box1: A box coordinate (can be (x1, y1, x2, y2) or (x, y, w, h))
            box2: A box coordinate (can be (x1, y1, x2, y2) or (x, y, w, h))
            x1y1x2y2: select the coordinate format
        return:
            iou: computed iou
        """
        if not x1y1x2y2:
            # Transform from center and width to exact coordinates
            b1_x1, b1_x2 = box1[:, 0] - box1[:, 2] / 2, box1[:, 0] + box1[:, 2] / 2
            b1_y1, b1_y2 = box1[:, 1] - box1[:, 3] / 2, box1[:, 1] + box1[:, 3] / 2
            b2_x1, b2_x2 = box2[:, 0] - box2[:, 2] / 2, box2[:, 0] + box2[:, 2] / 2
            b2_y1, b2_y2 = box2[:, 1] - box2[:, 3] / 2, box2[:, 1] + box2[:, 3] / 2
        else:
            # Get the coordinates of bounding boxes
            b1_x1, b1_y1, b1_x2, b1_y2 = box1[:, 0], box1[:, 1], box1[:, 2], box1[:, 3]
            b2_x1, b2_y1, b2_x2, b2_y2 = box2[:, 0], box2[:, 1], box2[:, 2], box2[:, 3]

        # Get the coordinates of the intersection rectangle
        inter_rect_x1 = np.maximum(b1_x1, b2_x1)
        inter_rect_y1 = np.maximum(b1_y1, b2_y1)
        inter_rect_x2 = np.minimum(b1_x2, b2_x2)
        inter_rect_y2 = np.minimum(b1_y2, b2_y2)
        # Intersection area
        inter_area = (np.clip(inter_rect_x2 - inter_rect_x1 + 1, 0, None)
                      * np.clip(inter_rect_y2 - inter_rect_y1 + 1, 0, None))
        # Union Area
        b1_area = (b1_x2 - b1_x1 + 1) * (b1_y2 - b1_y1 + 1)
        b2_area = (b2_x2 - b2_x1 + 1) * (b2_y2 - b2_y1 + 1)

        # The small epsilon keeps the division defined for degenerate boxes.
        iou = inter_area / (b1_area + b2_area - inter_area + 1e-16)

        return iou

    def non_max_suppression(self, prediction, origin_h, origin_w, conf_thres=0.5, nms_thres=0.4):
        """
        description: Removes detections with lower object confidence score than 'conf_thres' and performs
        Non-Maximum Suppression to further filter detections.
        param:
            prediction: detections, each row starts with (x1, y1, x2, y2, conf, cls_id);
                        any further columns are carried through untouched
            origin_h: original image height
            origin_w: original image width
            conf_thres: a confidence threshold to filter detections
            nms_thres: a iou threshold to filter detections
        return:
            boxes: output after nms, rows in the same layout as `prediction`,
                   with the box mapped to original-image coordinates
        """
        # Get the boxes that score > CONF_THRESH (boolean indexing copies, so the
        # caller's array is not modified below)
        boxes = prediction[prediction[:, 4] >= conf_thres]
        # Map the boxes from network-input coordinates to the original image
        boxes[:, :4] = self.xywh2xyxy(origin_h, origin_w, boxes[:, :4])
        # clip the coordinates
        boxes[:, 0] = np.clip(boxes[:, 0], 0, origin_w - 1)
        boxes[:, 2] = np.clip(boxes[:, 2], 0, origin_w - 1)
        boxes[:, 1] = np.clip(boxes[:, 1], 0, origin_h - 1)
        boxes[:, 3] = np.clip(boxes[:, 3], 0, origin_h - 1)
        # Object confidence
        confs = boxes[:, 4]
        # Sort by the confs
        boxes = boxes[np.argsort(-confs)]
        # Perform non-maximum suppression
        cls_col = self.CLASS_ID_COLUMN
        keep_boxes = []
        while boxes.shape[0]:
            large_overlap = self.bbox_iou(np.expand_dims(boxes[0, :4], 0), boxes[:, :4]) > nms_thres
            # Compare the class-id column, not the last column: rows carry extra
            # values after cls_id, so the last column is not the label.
            label_match = boxes[0, cls_col] == boxes[:, cls_col]
            # Indices of boxes with lower confidence scores, large IOUs and matching labels
            invalid = large_overlap & label_match
            keep_boxes += [boxes[0]]
            boxes = boxes[~invalid]
        boxes = np.stack(keep_boxes, 0) if len(keep_boxes) else np.array([])
        return boxes
yolo11_wrapper): threading.Thread.__init__(self) self.yolo11_wrapper = yolo11_wrapper def run(self): batch_image_raw, use_time = self.yolo11_wrapper.infer(self.yolo11_wrapper.get_raw_image_zeros()) print('warm_up->{}, time->{:.2f}ms'.format(batch_image_raw[0].shape, use_time * 1000)) if __name__ == "__main__": # load custom plugin and engine PLUGIN_LIBRARY = "build/libmyplugins.so" engine_file_path = "yolo11s.engine" if len(sys.argv) > 1: engine_file_path = sys.argv[1] if len(sys.argv) > 2: PLUGIN_LIBRARY = sys.argv[2] ctypes.CDLL(PLUGIN_LIBRARY) # load coco labels categories = ["person", "bicycle", "car", "motorcycle", "airplane", "bus", "train", "truck", "boat", "traffic light", "fire hydrant", "stop sign", "parking meter", "bench", "bird", "cat", "dog", "horse", "sheep", "cow", "elephant", "bear", "zebra", "giraffe", "backpack", "umbrella", "handbag", "tie", "suitcase", "frisbee", "skis", "snowboard", "sports ball", "kite", "baseball bat", "baseball glove", "skateboard", "surfboard", "tennis racket", "bottle", "wine glass", "cup", "fork", "knife", "spoon", "bowl", "banana", "apple", "sandwich", "orange", "broccoli", "carrot", "hot dog", "pizza", "donut", "cake", "chair", "couch", "potted plant", "bed", "dining table", "toilet", "tv", "laptop", "mouse", "remote", "keyboard", "cell phone", "microwave", "oven", "toaster", "sink", "refrigerator", "book", "clock", "vase", "scissors", "teddy bear", "hair drier", "toothbrush"] if os.path.exists('output/'): shutil.rmtree('output/') os.makedirs('output/') # a YoLo11TRT instance yolo11_wrapper = YoLo11TRT(engine_file_path) try: print('batch size is', yolo11_wrapper.batch_size) image_dir = "images/" image_path_batches = get_img_path_batches(yolo11_wrapper.batch_size, image_dir) for i in range(10): # create a new thread to do warm_up thread1 = warmUpThread(yolo11_wrapper) thread1.start() thread1.join() for batch in image_path_batches: # create a new thread to do inference thread1 = inferThread(yolo11_wrapper, batch) 
thread1.start() thread1.join() finally: # destroy the instance yolo11_wrapper.destroy() ================================================ FILE: yolo11/yolo11_obb.cpp ================================================ #include #include #include #include "cuda_utils.h" #include "logging.h" #include "model.h" #include "postprocess.h" #include "preprocess.h" #include "utils.h" Logger gLogger; using namespace nvinfer1; const int kOutputSize = kMaxNumOutputBbox * sizeof(Detection) / sizeof(float) + 1; void serialize_engine(std::string& wts_name, std::string& engine_name, std::string& type, float& gd, float& gw, int& max_channels) { IBuilder* builder = createInferBuilder(gLogger); IBuilderConfig* config = builder->createBuilderConfig(); IHostMemory* serialized_engine = nullptr; serialized_engine = buildEngineYolo11Obb(builder, config, DataType::kFLOAT, wts_name, gd, gw, max_channels, type); assert(serialized_engine); std::ofstream p(engine_name, std::ios::binary); if (!p) { std::cout << "could not open plan output file" << std::endl; assert(false); } p.write(reinterpret_cast(serialized_engine->data()), serialized_engine->size()); delete serialized_engine; delete config; delete builder; } void deserialize_engine(std::string& engine_name, IRuntime** runtime, ICudaEngine** engine, IExecutionContext** context) { std::ifstream file(engine_name, std::ios::binary); if (!file.good()) { std::cerr << "read " << engine_name << " error!" 
<< std::endl; assert(false); } size_t size = 0; file.seekg(0, file.end); size = file.tellg(); file.seekg(0, file.beg); char* serialized_engine = new char[size]; assert(serialized_engine); file.read(serialized_engine, size); file.close(); *runtime = createInferRuntime(gLogger); assert(*runtime); *engine = (*runtime)->deserializeCudaEngine(serialized_engine, size); assert(*engine); *context = (*engine)->createExecutionContext(); assert(*context); delete[] serialized_engine; } void prepare_buffer(ICudaEngine* engine, float** input_buffer_device, float** output_buffer_device, float** output_buffer_host, float** decode_ptr_host, float** decode_ptr_device, std::string cuda_post_process) { assert(engine->getNbBindings() == 2); // In order to bind the buffers, we need to know the names of the input and output tensors. // Note that indices are guaranteed to be less than IEngine::getNbBindings() const int inputIndex = engine->getBindingIndex(kInputTensorName); const int outputIndex = engine->getBindingIndex(kOutputTensorName); assert(inputIndex == 0); assert(outputIndex == 1); // Create GPU buffers on device CUDA_CHECK(cudaMalloc((void**)input_buffer_device, kBatchSize * 3 * kObbInputH * kObbInputW * sizeof(float))); CUDA_CHECK(cudaMalloc((void**)output_buffer_device, kBatchSize * kOutputSize * sizeof(float))); if (cuda_post_process == "c") { *output_buffer_host = new float[kBatchSize * kOutputSize]; } else if (cuda_post_process == "g") { if (kBatchSize > 1) { std::cerr << "Do not yet support GPU post processing for multiple batches" << std::endl; exit(0); } // Allocate memory for decode_ptr_host and copy to device *decode_ptr_host = new float[1 + kMaxNumOutputBbox * bbox_element]; CUDA_CHECK(cudaMalloc((void**)decode_ptr_device, sizeof(float) * (1 + kMaxNumOutputBbox * bbox_element))); } } void infer(IExecutionContext& context, cudaStream_t& stream, void** buffers, float* output, int batchsize, float* decode_ptr_host, float* decode_ptr_device, int model_bboxes, std::string 
cuda_post_process) { // infer on the batch asynchronously, and DMA output back to host auto start = std::chrono::system_clock::now(); context.enqueueV2(buffers, stream, nullptr); if (cuda_post_process == "c") { CUDA_CHECK(cudaMemcpyAsync(output, buffers[1], batchsize * kOutputSize * sizeof(float), cudaMemcpyDeviceToHost, stream)); auto end = std::chrono::system_clock::now(); std::cout << "inference time: " << std::chrono::duration_cast(end - start).count() << "ms" << std::endl; } else if (cuda_post_process == "g") { CUDA_CHECK( cudaMemsetAsync(decode_ptr_device, 0, sizeof(float) * (1 + kMaxNumOutputBbox * bbox_element), stream)); cuda_decode_obb((float*)buffers[1], model_bboxes, kConfThresh, decode_ptr_device, kMaxNumOutputBbox, stream); cuda_nms_obb(decode_ptr_device, kNmsThresh, kMaxNumOutputBbox, stream); //cuda nms CUDA_CHECK(cudaMemcpyAsync(decode_ptr_host, decode_ptr_device, sizeof(float) * (1 + kMaxNumOutputBbox * bbox_element), cudaMemcpyDeviceToHost, stream)); auto end = std::chrono::system_clock::now(); std::cout << "inference and gpu postprocess time: " << std::chrono::duration_cast(end - start).count() << "ms" << std::endl; } CUDA_CHECK(cudaStreamSynchronize(stream)); } bool parse_args(int argc, char** argv, std::string& wts, std::string& engine, std::string& img_dir, std::string& type, std::string& cuda_post_process, float& gd, float& gw, int& max_channels) { if (argc < 4) return false; if (std::string(argv[1]) == "-s" && argc == 5) { wts = std::string(argv[2]); engine = std::string(argv[3]); std::string sub_type = std::string(argv[4]); if (sub_type[0] == 'n') { gd = 0.50; gw = 0.25; max_channels = 1024; type = "n"; } else if (sub_type[0] == 's') { gd = 0.50; gw = 0.50; max_channels = 1024; type = "s"; } else if (sub_type[0] == 'm') { gd = 0.50; gw = 1.00; max_channels = 512; type = "m"; } else if (sub_type[0] == 'l') { gd = 1.0; gw = 1.0; max_channels = 512; type = "l"; } else if (sub_type[0] == 'x') { gd = 1.0; gw = 1.50; max_channels = 512; type = 
"x"; } else { return false; } } else if (std::string(argv[1]) == "-d" && argc == 5) { engine = std::string(argv[2]); img_dir = std::string(argv[3]); cuda_post_process = std::string(argv[4]); } else { return false; } return true; } int main(int argc, char** argv) { // yolo11_obb -s ../models/yolo11n-obb.wts ../models/yolo11n-obb.fp32.trt n // yolo11_obb -d ../models/yolo11n-obb.fp32.trt ../images c cudaSetDevice(kGpuId); std::string wts_name; std::string engine_name; std::string img_dir; std::string type; std::string cuda_post_process; int model_bboxes; float gd = 0.0f, gw = 0.0f; int max_channels = 0; if (!parse_args(argc, argv, wts_name, engine_name, img_dir, type, cuda_post_process, gd, gw, max_channels)) { std::cerr << "Arguments not right!" << std::endl; std::cerr << "./yolo11_obb -s [.wts] [.engine] [n/s/m/l/x] // serialize model to plan file" << std::endl; std::cerr << "./yolo11_obb -d [.engine] ../images [c/g]// deserialize plan file and run inference" << std::endl; return -1; } // Create a model using the API directly and serialize it to a file if (!wts_name.empty()) { serialize_engine(wts_name, engine_name, type, gd, gw, max_channels); return 0; } // Deserialize the engine from file IRuntime* runtime = nullptr; ICudaEngine* engine = nullptr; IExecutionContext* context = nullptr; deserialize_engine(engine_name, &runtime, &engine, &context); cudaStream_t stream; CUDA_CHECK(cudaStreamCreate(&stream)); cuda_preprocess_init(kMaxInputImageSize); auto out_dims = engine->getBindingDimensions(1); model_bboxes = out_dims.d[0]; // Prepare cpu and gpu buffers float* device_buffers[2]; float* output_buffer_host = nullptr; float* decode_ptr_host = nullptr; float* decode_ptr_device = nullptr; // Read images from directory std::vector file_names; if (read_files_in_dir(img_dir.c_str(), file_names) < 0) { std::cerr << "read_files_in_dir failed." 
<< std::endl; return -1; } prepare_buffer(engine, &device_buffers[0], &device_buffers[1], &output_buffer_host, &decode_ptr_host, &decode_ptr_device, cuda_post_process); // batch predict for (size_t i = 0; i < file_names.size(); i += kBatchSize) { // Get a batch of images std::vector img_batch; std::vector img_name_batch; for (size_t j = i; j < i + kBatchSize && j < file_names.size(); j++) { cv::Mat img = cv::imread(img_dir + "/" + file_names[j]); img_batch.push_back(img); img_name_batch.push_back(file_names[j]); } // Preprocess cuda_batch_preprocess(img_batch, device_buffers[0], kObbInputW, kObbInputH, stream); // Run inference infer(*context, stream, (void**)device_buffers, output_buffer_host, kBatchSize, decode_ptr_host, decode_ptr_device, model_bboxes, cuda_post_process); std::vector> res_batch; if (cuda_post_process == "c") { // NMS batch_nms_obb(res_batch, output_buffer_host, img_batch.size(), kOutputSize, kConfThresh, kNmsThresh); } else if (cuda_post_process == "g") { //Process gpu decode and nms results // batch_process_obb(res_batch, decode_ptr_host, img_batch.size(), bbox_element, img_batch); // todo seg in gpu std::cerr << "obb_postprocess is not support in gpu right now" << std::endl; } // Draw bounding boxes draw_bbox_obb(img_batch, res_batch); // Save images for (size_t j = 0; j < img_batch.size(); j++) { cv::imwrite("_" + img_name_batch[j], img_batch[j]); } } // Release stream and buffers cudaStreamDestroy(stream); CUDA_CHECK(cudaFree(device_buffers[0])); CUDA_CHECK(cudaFree(device_buffers[1])); CUDA_CHECK(cudaFree(decode_ptr_device)); delete[] decode_ptr_host; delete[] output_buffer_host; cuda_preprocess_destroy(); // Destroy the engine delete context; delete engine; delete runtime; // Print histogram of the output distribution // std::cout << "\nOutput:\n\n"; // for (unsigned int i = 0; i < kOutputSize; i++) //{ // std::cout << prob[i] << ", "; // if (i % 10 == 0) std::cout << std::endl; //} // std::cout << std::endl; return 0; } 
def get_img_path_batches(batch_size, img_dir):
    """
    description: Collect every file below img_dir and group the paths into batches.
    param:
        batch_size: number of paths per batch
        img_dir:    directory that is walked recursively
    return:
        list of lists of paths; only the last batch may hold fewer than
        batch_size entries
    """
    every_path = (
        os.path.join(folder, filename)
        for folder, _subdirs, filenames in os.walk(img_dir)
        for filename in filenames
    )
    batches = []
    current = []
    for path in every_path:
        # Start a new batch once the running one is full.
        if len(current) == batch_size:
            batches.append(current)
            current = []
        current.append(path)
    if current:
        batches.append(current)
    return batches
def plot_one_box(x, img, color=None, label=None, line_thickness=None):
    """
    description: Plots one rotated bounding box on image img
    param:
        x:      a Detection object; its corner points are obtained from get_corner
        img:    an opencv image object, drawn on in place
        color:  color to draw the box, such as (0,255,0); a random color is
                used when omitted
        label:  str
        line_thickness: int
    return:
        no return
    """
    tl = line_thickness or round(0.002 * (img.shape[0] + img.shape[1]) / 2) + 1
    # cv2 cannot draw with color=None, so fall back to a random color.
    color = color or [np.random.randint(0, 255) for _ in range(3)]

    # Get four corner points. cv2.polylines needs 32-bit integer points; a plain
    # astype(int) gives 64-bit integers on most platforms.
    corners = get_corner(img, x).astype(np.int32)

    # Draw the rotated rectangle
    cv2.polylines(img, [corners], isClosed=True, color=color, thickness=tl, lineType=cv2.LINE_AA)

    if label:
        tf = max(tl - 1, 1)  # font thickness
        # Use first corner point for label placement (plain ints for cv2)
        p1 = (int(corners[0][0]), int(corners[0][1]))
        w, h = cv2.getTextSize(label, 0, fontScale=tl / 3, thickness=tf)[0]
        # Put the label above the corner when there is room, otherwise below it.
        outside = p1[1] - h >= 3
        p2 = (p1[0] + w, p1[1] - h - 3 if outside else p1[1] + h + 3)
        cv2.rectangle(img, p1, p2, color, -1, cv2.LINE_AA)  # filled
        cv2.putText(
            img,
            label,
            (p1[0], p1[1] - 2 if outside else p1[1] + h + 2),
            0,
            tl / 3,
            [225, 255, 255],
            thickness=tf,
            lineType=cv2.LINE_AA
        )
def infer(self, raw_image_generator):
    """
    description: Preprocess a batch of images, run the TensorRT engine on it and draw the
                 oriented detections on the original images.
    param:
        raw_image_generator: iterable yielding BGR images; it may yield fewer than
                             self.batch_size images (last, partial batch)
    return:
        batch_image_raw: list of the images that were received, annotated in place
        use_time: seconds spent on the host->device copy, the inference and the
                  device->host copy
    """
    # NOTE: the former threading.Thread.__init__(self) call was removed. This method
    # does not use any thread state; the call looks like a leftover from the thread
    # helper classes of this script.
    batch_image_raw = []
    batch_origin_h = []
    batch_origin_w = []

    # Make self the active context, pushing it on top of the context stack.
    self.ctx.push()
    try:
        stream = self.stream
        host_input = self.host_inputs[0]
        host_output = self.host_outputs[0]

        # Do image preprocess. The buffer is zero-filled so the unused slots of a
        # partial batch carry defined values to the GPU.
        batch_input_image = np.zeros(shape=[self.batch_size, 3, self.input_h, self.input_w])
        for i, image_raw in enumerate(raw_image_generator):
            input_image, image_raw, origin_h, origin_w = self.preprocess_image(image_raw)
            batch_image_raw.append(image_raw)
            batch_origin_h.append(origin_h)
            batch_origin_w.append(origin_w)
            np.copyto(batch_input_image[i], input_image)
        batch_input_image = np.ascontiguousarray(batch_input_image)

        # Copy input image to host buffer
        np.copyto(host_input, batch_input_image.ravel())
        start = time.time()
        # Transfer input data to the GPU.
        cuda.memcpy_htod_async(self.cuda_inputs[0], host_input, stream)
        # Run inference.
        self.context.execute_async_v2(bindings=self.bindings, stream_handle=stream.handle)
        # Transfer predictions back from the GPU.
        cuda.memcpy_dtoh_async(host_output, self.cuda_outputs[0], stream)
        # Synchronize the stream
        stream.synchronize()
        end = time.time()
    finally:
        # Remove the context from the top of the context stack even when
        # preprocessing or inference raised, so the stack stays balanced.
        self.ctx.pop()

    # det_output_length is the length of the whole output buffer (set in __init__ from
    # host_outputs[0].shape[0]); one image owns an equal share of it.
    # NOTE(review): assumes the buffer holds exactly batch_size equally sized records.
    per_image_length = self.det_output_length // self.batch_size

    # Do postprocess only for the images that were actually received; iterating over
    # self.batch_size would index past the end of the lists for a partial batch.
    for i, image in enumerate(batch_image_raw):
        keep = self.post_process(
            host_output[i * per_image_length: (i + 1) * per_image_length],
            batch_origin_h[i], batch_origin_w[i]
        )
        # Draw rectangles and labels on the original image
        for det in keep:
            class_id = int(det.class_id)
            # Seeding with the class id gives every class a stable color.
            np.random.seed(class_id)
            color = [np.random.randint(0, 255) for _ in range(3)]
            plot_one_box(
                det,
                image,
                label="{}:{:.2f}".format(categories[class_id], det.score),
                color=color,
                line_thickness=1
            )
    return batch_image_raw, end - start
def xywh2xyxy(self, origin_h, origin_w, x):
    """
    description: Map box coordinates from the letterboxed network input back to the
                 original image: the padding offset is subtracted on the padded axis
                 and all values are divided by the resize ratio.
    param:
        origin_h:   height of original image
        origin_w:   width of original image
        x:          A boxes numpy, columns 0 and 2 are treated as x coordinates,
                    columns 1 and 3 as y coordinates
    return:
        y:          A boxes numpy shaped like x; columns beyond the first four stay zero
    """
    ratio_w = self.input_w / origin_w
    ratio_h = self.input_h / origin_h
    mapped = np.zeros_like(x)

    if ratio_h > ratio_w:
        # Width is the limiting side: padding was added above and below.
        pad_y = (self.input_h - ratio_w * origin_h) / 2
        mapped[:, [0, 2]] = x[:, [0, 2]]
        mapped[:, [1, 3]] = x[:, [1, 3]] - pad_y
        mapped /= ratio_w
        return mapped

    # Height is the limiting side: padding was added left and right.
    pad_x = (self.input_w - ratio_h * origin_w) / 2
    mapped[:, [0, 2]] = x[:, [0, 2]] - pad_x
    mapped[:, [1, 3]] = x[:, [1, 3]]
    mapped /= ratio_h
    return mapped
def post_process(self, output, origin_h, origin_w):
    """
    description: Decode the raw output of one image, drop low-confidence detections and
                 run per-class NMS based on the prob IoU of the rotated boxes.
    param:
        output:     A numpy likes [num_boxes,cx,cy,w,h,conf,cls_id,...,angle
                    cx,cy,w,h,conf,cls_id,...,angle ...]
        origin_h:   height of original image (not used by this method)
        origin_w:   width of original image (not used by this method)
    return:
        keep:       list of Detection objects that survived NMS; an empty list when
                    nothing passes the confidence threshold
    """
    num_values_per_detection = DET_NUM + SEG_NUM + POSE_NUM + OBB_NUM
    # Column of the rotation angle, derived from the record layout. It replaces the
    # literal 89 used before.
    # NOTE(review): equals 89 for DET_NUM=6, SEG_NUM=32, POSE_NUM=51 - confirm against
    # the constants defined at the top of this file.
    angle_index = DET_NUM + SEG_NUM + POSE_NUM

    # Get the num of boxes detected
    num = int(output[0])
    # Reshape to a two dimensional ndarray
    pred = np.reshape(output[1:], (-1, num_values_per_detection))[:num, :]
    # Filter by confidence threshold
    pred = pred[pred[:, 4] >= CONF_THRESH]
    if len(pred) == 0:
        return []

    # Group the detections by class id, NMS is done independently per class.
    dets_by_class = {}
    for row in pred:
        class_id = int(row[5])
        dets_by_class.setdefault(class_id, []).append(
            Detection(row[:4], row[4], class_id, row[angle_index]))

    res = []
    for dets in dets_by_class.values():
        dets = sorted(dets, key=lambda det: det.score, reverse=True)
        for m, item in enumerate(dets):
            # A score of 0.0 marks a detection suppressed by a better one.
            if item.score == 0.0:
                continue
            res.append(item)
            for other in dets[m + 1:]:
                if other.score == 0.0:
                    continue
                if self.probiou(item, other) > IOU_THRESHOLD:
                    other.score = 0.0

    # Kept as in the previous implementation: strictly greater than the threshold.
    return [det for det in res if det.score > CONF_THRESH]
if __name__ == "__main__":
    # Command line: [engine file] [plugin library]; both are optional.
    cli_args = sys.argv[1:]
    engine_file_path = cli_args[0] if len(cli_args) > 0 else "yolo11n-obb.engine"
    PLUGIN_LIBRARY = cli_args[1] if len(cli_args) > 1 else "./build/libmyplugins.so"

    # The custom plugin library is loaded before the engine is created.
    ctypes.CDLL(PLUGIN_LIBRARY)

    # DOTAV 1.5 labels; read as a module-level name by the drawing code.
    categories = ["plane", "ship", "storage tank", "baseball diamond", "tennis court",
                  "basketball court", "ground track field", "harbor", "bridge", "large vehicle",
                  "small vehicle", "helicopter", "roundabout", "soccer ball field",
                  "swimming pool", "container crane"]

    # Start from an empty output directory.
    if os.path.exists('output/'):
        shutil.rmtree('output/')
    os.makedirs('output/')

    # a YoLo11TRT instance
    yolo11_wrapper = YoLo11TRT(engine_file_path)
    try:
        print('batch size is', yolo11_wrapper.batch_size)

        image_dir = "images/"
        image_path_batches = get_img_path_batches(yolo11_wrapper.batch_size, image_dir)

        # Ten warm-up runs, each one in its own thread that is joined right away.
        for _ in range(10):
            warm_up_worker = warmUpThread(yolo11_wrapper)
            warm_up_worker.start()
            warm_up_worker.join()

        # One inference thread per batch, also executed one after the other.
        for batch in image_path_batches:
            infer_worker = inferThread(yolo11_wrapper, batch)
            infer_worker.start()
            infer_worker.join()
    finally:
        # destroy the instance
        yolo11_wrapper.destroy()
gw, int& max_channels) { IBuilder* builder = createInferBuilder(gLogger); IBuilderConfig* config = builder->createBuilderConfig(); IHostMemory* serialized_engine = nullptr; serialized_engine = buildEngineYolo11Pose(builder, config, DataType::kFLOAT, wts_name, gd, gw, max_channels, type); assert(serialized_engine); std::ofstream p(engine_name, std::ios::binary); if (!p) { std::cout << "could not open plan output file" << std::endl; assert(false); } p.write(reinterpret_cast(serialized_engine->data()), serialized_engine->size()); delete serialized_engine; delete config; delete builder; } void deserialize_engine(std::string& engine_name, IRuntime** runtime, ICudaEngine** engine, IExecutionContext** context) { std::ifstream file(engine_name, std::ios::binary); if (!file.good()) { std::cerr << "read " << engine_name << " error!" << std::endl; assert(false); } size_t size = 0; file.seekg(0, file.end); size = file.tellg(); file.seekg(0, file.beg); char* serialized_engine = new char[size]; assert(serialized_engine); file.read(serialized_engine, size); file.close(); *runtime = createInferRuntime(gLogger); assert(*runtime); *engine = (*runtime)->deserializeCudaEngine(serialized_engine, size); assert(*engine); *context = (*engine)->createExecutionContext(); assert(*context); delete[] serialized_engine; } void prepare_buffer(ICudaEngine* engine, float** input_buffer_device, float** output_buffer_device, float** output_buffer_host, float** decode_ptr_host, float** decode_ptr_device, std::string cuda_post_process) { assert(engine->getNbBindings() == 2); // In order to bind the buffers, we need to know the names of the input and output tensors. 
// Note that indices are guaranteed to be less than IEngine::getNbBindings() const int inputIndex = engine->getBindingIndex(kInputTensorName); const int outputIndex = engine->getBindingIndex(kOutputTensorName); assert(inputIndex == 0); assert(outputIndex == 1); // Create GPU buffers on device CUDA_CHECK(cudaMalloc((void**)input_buffer_device, kBatchSize * 3 * kInputH * kInputW * sizeof(float))); CUDA_CHECK(cudaMalloc((void**)output_buffer_device, kBatchSize * kOutputSize * sizeof(float))); if (cuda_post_process == "c") { *output_buffer_host = new float[kBatchSize * kOutputSize]; } else if (cuda_post_process == "g") { if (kBatchSize > 1) { std::cerr << "Do not yet support GPU post processing for multiple batches" << std::endl; exit(0); } // Allocate memory for decode_ptr_host and copy to device *decode_ptr_host = new float[1 + kMaxNumOutputBbox * bbox_element]; CUDA_CHECK(cudaMalloc((void**)decode_ptr_device, sizeof(float) * (1 + kMaxNumOutputBbox * bbox_element))); } } void infer(IExecutionContext& context, cudaStream_t& stream, void** buffers, float* output, int batchsize, float* decode_ptr_host, float* decode_ptr_device, int model_bboxes, std::string cuda_post_process) { // infer on the batch asynchronously, and DMA output back to host auto start = std::chrono::system_clock::now(); context.enqueueV2(buffers, stream, nullptr); if (cuda_post_process == "c") { CUDA_CHECK(cudaMemcpyAsync(output, buffers[1], batchsize * kOutputSize * sizeof(float), cudaMemcpyDeviceToHost, stream)); auto end = std::chrono::system_clock::now(); std::cout << "inference time: " << std::chrono::duration_cast(end - start).count() << "ms" << std::endl; } else if (cuda_post_process == "g") { CUDA_CHECK( cudaMemsetAsync(decode_ptr_device, 0, sizeof(float) * (1 + kMaxNumOutputBbox * bbox_element), stream)); cuda_decode((float*)buffers[1], model_bboxes, kConfThresh, decode_ptr_device, kMaxNumOutputBbox, stream); cuda_nms(decode_ptr_device, kNmsThresh, kMaxNumOutputBbox, stream); //cuda nms 
CUDA_CHECK(cudaMemcpyAsync(decode_ptr_host, decode_ptr_device, sizeof(float) * (1 + kMaxNumOutputBbox * bbox_element), cudaMemcpyDeviceToHost, stream)); auto end = std::chrono::system_clock::now(); std::cout << "inference and gpu postprocess time: " << std::chrono::duration_cast(end - start).count() << "ms" << std::endl; } CUDA_CHECK(cudaStreamSynchronize(stream)); } bool parse_args(int argc, char** argv, std::string& wts, std::string& engine, std::string& img_dir, std::string& type, std::string& cuda_post_process, float& gd, float& gw, int& max_channels) { if (argc < 4) return false; if (std::string(argv[1]) == "-s" && (argc == 5 || argc == 7)) { wts = std::string(argv[2]); engine = std::string(argv[3]); auto sub_type = std::string(argv[4]); if (sub_type[0] == 'n') { gd = 0.50; gw = 0.25; max_channels = 1024; type = "n"; } else if (sub_type[0] == 's') { gd = 0.50; gw = 0.50; max_channels = 1024; type = "s"; } else if (sub_type[0] == 'm') { gd = 0.50; gw = 1.00; max_channels = 512; type = "m"; } else if (sub_type[0] == 'l') { gd = 1.0; gw = 1.0; max_channels = 512; type = "l"; } else if (sub_type[0] == 'x') { gd = 1.0; gw = 1.50; max_channels = 512; type = "x"; } else { return false; } } else if (std::string(argv[1]) == "-d" && argc == 5) { engine = std::string(argv[2]); img_dir = std::string(argv[3]); cuda_post_process = std::string(argv[4]); } else { return false; } return true; } int main(int argc, char** argv) { // yolo11_pose -s ../models/yolo11n-pose.wts ../models/yolo11n-pose.fp32.trt n // yolo11_pose -d ../models/yolo11n-pose.fp32.trt ../images c cudaSetDevice(kGpuId); std::string wts_name; std::string engine_name; std::string img_dir; std::string type; std::string cuda_post_process; int model_bboxes; float gd = 0.0f, gw = 0.0f; int max_channels = 0; if (!parse_args(argc, argv, wts_name, engine_name, img_dir, type, cuda_post_process, gd, gw, max_channels)) { std::cerr << "Arguments not right!" 
<< std::endl; std::cerr << "./yolo11_pose -s [.wts] [.engine] [n/s/m/l/x] // serialize model to " "plan file" << std::endl; std::cerr << "./yolo11_pose -d [.engine] ../images [c/g]// deserialize plan file and run inference" << std::endl; return -1; } // Create a model using the API directly and serialize it to a file if (!wts_name.empty()) { serialize_engine(wts_name, engine_name, type, gd, gw, max_channels); return 0; } // Deserialize the engine from file IRuntime* runtime = nullptr; ICudaEngine* engine = nullptr; IExecutionContext* context = nullptr; deserialize_engine(engine_name, &runtime, &engine, &context); cudaStream_t stream; CUDA_CHECK(cudaStreamCreate(&stream)); cuda_preprocess_init(kMaxInputImageSize); auto out_dims = engine->getBindingDimensions(1); model_bboxes = out_dims.d[0]; // Prepare cpu and gpu buffers float* device_buffers[2]; float* output_buffer_host = nullptr; float* decode_ptr_host = nullptr; float* decode_ptr_device = nullptr; // Read images from directory std::vector file_names; if (read_files_in_dir(img_dir.c_str(), file_names) < 0) { std::cerr << "read_files_in_dir failed." 
<< std::endl; return -1; } prepare_buffer(engine, &device_buffers[0], &device_buffers[1], &output_buffer_host, &decode_ptr_host, &decode_ptr_device, cuda_post_process); // batch predict for (size_t i = 0; i < file_names.size(); i += kBatchSize) { // Get a batch of images std::vector img_batch; std::vector img_name_batch; for (size_t j = i; j < i + kBatchSize && j < file_names.size(); j++) { cv::Mat img = cv::imread(img_dir + "/" + file_names[j]); img_batch.push_back(img); img_name_batch.push_back(file_names[j]); } // Preprocess cuda_batch_preprocess(img_batch, device_buffers[0], kInputW, kInputH, stream); // Run inference infer(*context, stream, (void**)device_buffers, output_buffer_host, kBatchSize, decode_ptr_host, decode_ptr_device, model_bboxes, cuda_post_process); std::vector> res_batch; if (cuda_post_process == "c") { // NMS batch_nms(res_batch, output_buffer_host, img_batch.size(), kOutputSize, kConfThresh, kNmsThresh); } else if (cuda_post_process == "g") { // Process gpu decode and nms results // todo pose in gpu std::cerr << "pose_postprocess is not support in gpu right now" << std::endl; } // Draw bounding boxes draw_bbox_keypoints_line(img_batch, res_batch); // Save images for (size_t j = 0; j < img_batch.size(); j++) { cv::imwrite("_" + img_name_batch[j], img_batch[j]); } } // Release stream and buffers cudaStreamDestroy(stream); CUDA_CHECK(cudaFree(device_buffers[0])); CUDA_CHECK(cudaFree(device_buffers[1])); CUDA_CHECK(cudaFree(decode_ptr_device)); delete[] decode_ptr_host; delete[] output_buffer_host; cuda_preprocess_destroy(); // Destroy the engine delete context; delete engine; delete runtime; // Print histogram of the output distribution //std::cout << "\nOutput:\n\n"; //for (unsigned int i = 0; i < kOutputSize; i++) //{ // std::cout << prob[i] << ", "; // if (i % 10 == 0) std::cout << std::endl; //} //std::cout << std::endl; return 0; } ================================================ FILE: yolo11/yolo11_pose_trt.py 
def get_img_path_batches(batch_size, img_dir):
    """
    description: Walk img_dir recursively and group the paths of all files found into
                 consecutive batches.
    param:
        batch_size: int, number of paths per batch
        img_dir:    str, directory to walk
    return:
        a list of batches, each one a list of file paths; only the last batch may
        hold fewer than batch_size paths
    """
    all_paths = (
        os.path.join(dirpath, filename)
        for dirpath, _subdirs, filenames in os.walk(img_dir)
        for filename in filenames
    )

    batches = []
    current = []
    for path in all_paths:
        # A full batch is flushed right before the next path is stored.
        if len(current) == batch_size:
            batches.append(current)
            current = []
        current.append(path)

    # Whatever is left forms the final, possibly shorter, batch.
    if len(current) > 0:
        batches.append(current)
    return batches
def infer(self, raw_image_generator):
    """
    description: Preprocess a batch of images, run the TensorRT engine on it and draw the
                 detected boxes and pose skeletons on the original images.
    param:
        raw_image_generator: iterable yielding BGR images; it may yield fewer than
                             self.batch_size images (last, partial batch)
    return:
        batch_image_raw: list of the images that were received, annotated in place
        use_time: seconds spent on the host->device copy, the inference and the
                  device->host copy
    """
    # NOTE: the former threading.Thread.__init__(self) call was removed. This class
    # derives from object and this method does not use any thread state.
    batch_image_raw = []
    batch_origin_h = []
    batch_origin_w = []

    # Make self the active context, pushing it on top of the context stack.
    self.ctx.push()
    try:
        stream = self.stream
        host_input = self.host_inputs[0]
        host_output = self.host_outputs[0]

        # Do image preprocess. The buffer is zero-filled so the unused slots of a
        # partial batch carry defined values to the GPU.
        batch_input_image = np.zeros(shape=[self.batch_size, 3, self.input_h, self.input_w])
        for i, image_raw in enumerate(raw_image_generator):
            input_image, image_raw, origin_h, origin_w = self.preprocess_image(image_raw)
            batch_image_raw.append(image_raw)
            batch_origin_h.append(origin_h)
            batch_origin_w.append(origin_w)
            np.copyto(batch_input_image[i], input_image)
        batch_input_image = np.ascontiguousarray(batch_input_image)

        # Copy input image to host buffer
        np.copyto(host_input, batch_input_image.ravel())
        start = time.time()
        # Transfer input data to the GPU.
        cuda.memcpy_htod_async(self.cuda_inputs[0], host_input, stream)
        # Run inference.
        self.context.execute_async_v2(bindings=self.bindings, stream_handle=stream.handle)
        # Transfer predictions back from the GPU.
        cuda.memcpy_dtoh_async(host_output, self.cuda_outputs[0], stream)
        # Synchronize the stream
        stream.synchronize()
        end = time.time()
    finally:
        # Remove the context from the top of the context stack even when
        # preprocessing or inference raised, so the stack stays balanced.
        self.ctx.pop()

    # det_output_size is the length of the whole output buffer (set in __init__ from
    # host_outputs[0].shape[0]); one image owns an equal share of it.
    # NOTE(review): assumes the buffer holds exactly batch_size equally sized records.
    per_image_size = self.det_output_size // self.batch_size

    # Do postprocess only for the images that were actually received; iterating over
    # self.batch_size would index past the end of the lists for a partial batch.
    for i, image in enumerate(batch_image_raw):
        result_boxes, result_scores, result_classid, keypoints = self.post_process(
            host_output[i * per_image_size: (i + 1) * per_image_size],
            batch_origin_h[i], batch_origin_w[i]
        )
        # Draw rectangles and labels on the original image
        for j in range(len(result_boxes)):
            plot_one_box(
                result_boxes[j],
                image,
                label="{}:{:.2f}".format(
                    categories[int(result_classid[j])], result_scores[j]
                ),
            )

            # Each keypoint is stored as (x, y, confidence); keypoints with a
            # non-positive confidence are kept as None so indices stay aligned.
            person_keypoints = keypoints[j]
            points = []
            for k in range(len(person_keypoints) // 3):
                x = person_keypoints[k * 3]
                y = person_keypoints[k * 3 + 1]
                confidence = person_keypoints[k * 3 + 2]
                points.append((int(x), int(y)) if confidence > 0 else None)

            # Draw the skeleton lines between the keypoint index pairs
            for part_a, part_b in keypoint_pairs:
                if points[part_a] and points[part_b]:
                    cv2.line(image, points[part_a], points[part_b], (0, 255, 0), 2)
    return batch_image_raw, end - start
def xywh2xyxy_with_keypoints(self, origin_h, origin_w, boxes, keypoints):
    """
    description: Map boxes and pose keypoints from the letterboxed network input back to
                 the original image. The padding offset is subtracted on the padded axis
                 and the coordinates are divided by the resize ratio. Implemented with
                 array slicing instead of per-element Python loops.
    param:
        origin_h:   height of original image
        origin_w:   width of original image
        boxes:      numpy array, one row per box; columns 0 and 2 are treated as x
                    coordinates, columns 1 and 3 as y coordinates
        keypoints:  numpy array, one row per box, laid out as repeated
                    (x, y, confidence) triples
    return:
        box_array:      array shaped like boxes with the mapped coordinates
        keypoint_array: array shaped like keypoints; x and y are mapped, the
                        confidence values are copied unchanged
    """
    box_array = np.zeros_like(boxes)
    keypoint_array = np.zeros_like(keypoints)
    n = len(boxes)
    if n == 0:
        return box_array, keypoint_array

    r_w = self.input_w / origin_w
    r_h = self.input_h / origin_h
    # The letterbox geometry is the same for every box, so it is resolved once.
    if r_h > r_w:
        # Width is the limiting side: padding was added above and below.
        scale = r_w
        pad_x = 0.0
        pad_y = (self.input_h - r_w * origin_h) / 2
    else:
        # Height is the limiting side: padding was added left and right.
        scale = r_h
        pad_x = (self.input_w - r_h * origin_w) / 2
        pad_y = 0.0

    box_array[:, 0] = (boxes[:, 0] - pad_x) / scale
    box_array[:, 2] = (boxes[:, 2] - pad_x) / scale
    box_array[:, 1] = (boxes[:, 1] - pad_y) / scale
    box_array[:, 3] = (boxes[:, 3] - pad_y) / scale

    # Strided slices pick the x, y and confidence entry of every triple.
    keypoint_array[:n, 0::3] = (keypoints[:n, 0::3] - pad_x) / scale
    keypoint_array[:n, 1::3] = (keypoints[:n, 1::3] - pad_y) / scale
    keypoint_array[:n, 2::3] = keypoints[:n, 2::3]
    return box_array, keypoint_array
def non_max_suppression(self, prediction, origin_h, origin_w, conf_thres=0.5, nms_thres=0.4):
    """
    description: Keep the detections whose confidence reaches 'conf_thres', map their
                 boxes and keypoints to the original image, clip the boxes to the image
                 and run class-wise Non-Maximum Suppression.
    param:
        prediction: detections, one row each; columns 0-3 box, column 4 conf,
                    column 5 cls_id, keypoints in the POSE_NUM columns before the last one
        origin_h: original image height
        origin_w: original image width
        conf_thres: a confidence threshold to filter detections
        nms_thres: a iou threshold to filter detections
    return:
        the surviving rows stacked into one array, or an empty array when none is left
    """
    keypoint_cols = slice(-POSE_NUM - 1, -1)

    # Confidence filter
    candidates = prediction[prediction[:, 4] >= conf_thres]

    # Map boxes and keypoints back to the original image; copies are handed over so
    # the filtered rows themselves are not touched by the helper.
    mapped_boxes, mapped_keypoints = self.xywh2xyxy_with_keypoints(
        origin_h, origin_w, np.copy(candidates[:, :4]), np.copy(candidates[:, keypoint_cols]))
    detections = np.copy(candidates)
    detections[:, :4] = mapped_boxes
    detections[:, keypoint_cols] = mapped_keypoints

    # Clip the box coordinates to the image
    for column, upper in ((0, origin_w - 1), (2, origin_w - 1), (1, origin_h - 1), (3, origin_h - 1)):
        detections[:, column] = np.clip(detections[:, column], 0, upper)

    # Highest confidence first
    detections = detections[np.argsort(-detections[:, 4])]

    # Greedy NMS: take the best remaining row, then drop every row of the same class
    # that overlaps it by more than nms_thres.
    kept = []
    while detections.shape[0]:
        best = detections[0]
        overlapping = self.bbox_iou(np.expand_dims(best[:4], 0), detections[:, :4]) > nms_thres
        same_class = best[5] == detections[:, 5]
        kept.append(best)
        detections = detections[~(overlapping & same_class)]

    if not kept:
        return np.array([])
    return np.stack(kept, 0)
class warmUpThread(threading.Thread):
    """
    description: Thread that runs one warm-up inference of the wrapper on the images
                 returned by its get_raw_image_zeros() and prints shape and timing.
    """

    def __init__(self, yolo11_wrapper):
        super().__init__()
        # Object providing infer() and get_raw_image_zeros()
        self.yolo11_wrapper = yolo11_wrapper

    def run(self):
        wrapper = self.yolo11_wrapper
        warmup_images = wrapper.get_raw_image_zeros()
        batch_image_raw, use_time = wrapper.infer(warmup_images)
        first_shape = batch_image_raw[0].shape
        elapsed_ms = use_time * 1000
        print('warm_up->{}, time->{:.2f}ms'.format(first_shape, elapsed_ms))
4); static cv::Rect get_downscale_rect(float bbox[4], float scale) { float left = bbox[0]; float top = bbox[1]; float right = bbox[0] + bbox[2]; float bottom = bbox[1] + bbox[3]; left = left < 0 ? 0 : left; top = top < 0 ? 0 : top; right = right > kInputW ? kInputW : right; bottom = bottom > kInputH ? kInputH : bottom; left /= scale; top /= scale; right /= scale; bottom /= scale; return cv::Rect(int(left), int(top), int(right - left), int(bottom - top)); } std::vector process_mask(const float* proto, int proto_size, std::vector& dets) { std::vector masks; for (size_t i = 0; i < dets.size(); i++) { cv::Mat mask_mat = cv::Mat::zeros(kInputH / 4, kInputW / 4, CV_32FC1); auto r = get_downscale_rect(dets[i].bbox, 4); for (int x = r.x; x < r.x + r.width; x++) { for (int y = r.y; y < r.y + r.height; y++) { float e = 0.0f; for (int j = 0; j < 32; j++) { e += dets[i].mask[j] * proto[j * proto_size / 32 + y * mask_mat.cols + x]; } e = 1.0f / (1.0f + expf(-e)); mask_mat.at(y, x) = e; } } cv::resize(mask_mat, mask_mat, cv::Size(kInputW, kInputH)); masks.push_back(mask_mat); } return masks; } void serialize_engine(std::string& wts_name, std::string& engine_name, std::string& type, float& gd, float& gw, int& max_channels) { IBuilder* builder = createInferBuilder(gLogger); IBuilderConfig* config = builder->createBuilderConfig(); IHostMemory* serialized_engine = nullptr; serialized_engine = buildEngineYolo11Seg(builder, config, DataType::kFLOAT, wts_name, gd, gw, max_channels, type); assert(serialized_engine); std::ofstream p(engine_name, std::ios::binary); if (!p) { std::cout << "could not open plan output file" << std::endl; assert(false); } p.write(reinterpret_cast(serialized_engine->data()), serialized_engine->size()); delete serialized_engine; delete config; delete builder; } void deserialize_engine(std::string& engine_name, IRuntime** runtime, ICudaEngine** engine, IExecutionContext** context) { std::ifstream file(engine_name, std::ios::binary); if (!file.good()) { std::cerr 
<< "read " << engine_name << " error!" << std::endl; assert(false); } size_t size = 0; file.seekg(0, file.end); size = file.tellg(); file.seekg(0, file.beg); char* serialized_engine = new char[size]; assert(serialized_engine); file.read(serialized_engine, size); file.close(); *runtime = createInferRuntime(gLogger); assert(*runtime); *engine = (*runtime)->deserializeCudaEngine(serialized_engine, size); assert(*engine); *context = (*engine)->createExecutionContext(); assert(*context); delete[] serialized_engine; } void prepare_buffer(ICudaEngine* engine, float** input_buffer_device, float** output_buffer_device, float** output_seg_buffer_device, float** output_buffer_host, float** output_seg_buffer_host, float** decode_ptr_host, float** decode_ptr_device, std::string cuda_post_process) { assert(engine->getNbBindings() == 3); // In order to bind the buffers, we need to know the names of the input and output tensors. // Note that indices are guaranteed to be less than IEngine::getNbBindings() const int inputIndex = engine->getBindingIndex(kInputTensorName); const int outputIndex = engine->getBindingIndex(kOutputTensorName); const int outputIndex_seg = engine->getBindingIndex(kProtoTensorName); assert(inputIndex == 0); assert(outputIndex == 1); assert(outputIndex_seg == 2); // Create GPU buffers on device CUDA_CHECK(cudaMalloc((void**)input_buffer_device, kBatchSize * 3 * kInputH * kInputW * sizeof(float))); CUDA_CHECK(cudaMalloc((void**)output_buffer_device, kBatchSize * kOutputSize * sizeof(float))); CUDA_CHECK(cudaMalloc((void**)output_seg_buffer_device, kBatchSize * kOutputSegSize * sizeof(float))); if (cuda_post_process == "c") { *output_buffer_host = new float[kBatchSize * kOutputSize]; *output_seg_buffer_host = new float[kBatchSize * kOutputSegSize]; } else if (cuda_post_process == "g") { if (kBatchSize > 1) { std::cerr << "Do not yet support GPU post processing for multiple batches" << std::endl; exit(0); } // Allocate memory for decode_ptr_host and copy to 
// Parses the command line of the yolo11_seg tool.
//
//   -s <wts> <engine> <n|s|m|l|x>            serialize: fills wts, engine, type, gd, gw, max_channels
//   -d <engine> <img_dir> <c|g> <labels>     deserialize: fills engine, img_dir, cuda_post_process,
//                                            labels_filename
//
// Returns true when one of the two forms matched, false otherwise. In the
// serialize form only the first character of the model type is inspected.
bool parse_args(int argc, char** argv, std::string& wts, std::string& engine, std::string& img_dir, std::string& type,
                std::string& cuda_post_process, std::string& labels_filename, float& gd, float& gw,
                int& max_channels) {
    if (argc < 4) {
        return false;
    }
    const std::string mode(argv[1]);

    if (mode == "-s" && argc == 5) {
        // Depth multiple, width multiple and channel cap per model size.
        struct Variant {
            char tag;
            float depth;
            float width;
            int channels;
        };
        static const Variant kVariants[] = {
                {'n', 0.50f, 0.25f, 1024}, {'s', 0.50f, 0.50f, 1024}, {'m', 0.50f, 1.00f, 512},
                {'l', 1.00f, 1.00f, 512},  {'x', 1.00f, 1.50f, 512},
        };

        wts = std::string(argv[2]);
        engine = std::string(argv[3]);
        const std::string sub_type(argv[4]);
        const char wanted = sub_type[0];
        for (const Variant& variant : kVariants) {
            if (variant.tag != wanted) {
                continue;
            }
            gd = variant.depth;
            gw = variant.width;
            max_channels = variant.channels;
            type = std::string(1, variant.tag);
            return true;
        }
        return false;  // unknown model size
    }

    if (mode == "-d" && argc == 6) {
        engine = std::string(argv[2]);
        img_dir = std::string(argv[3]);
        cuda_post_process = std::string(argv[4]);
        labels_filename = std::string(argv[5]);
        return true;
    }

    return false;
}
<< std::endl; std::cerr << "./yolo11_seg -s [.wts] [.engine] [n/s/m/l/x] // serialize model to plan file" << std::endl; std::cerr << "./yolo11_seg -d [.engine] ../images [c/g] coco_file// deserialize plan file and run inference" << std::endl; return -1; } // Create a model using the API directly and serialize it to a file if (!wts_name.empty()) { serialize_engine(wts_name, engine_name, type, gd, gw, max_channels); return 0; } // Deserialize the engine from file IRuntime* runtime = nullptr; ICudaEngine* engine = nullptr; IExecutionContext* context = nullptr; deserialize_engine(engine_name, &runtime, &engine, &context); cudaStream_t stream; CUDA_CHECK(cudaStreamCreate(&stream)); cuda_preprocess_init(kMaxInputImageSize); auto out_dims = engine->getBindingDimensions(1); model_bboxes = out_dims.d[0]; // Prepare cpu and gpu buffers float* device_buffers[3]; float* output_buffer_host = nullptr; float* output_seg_buffer_host = nullptr; float* decode_ptr_host = nullptr; float* decode_ptr_device = nullptr; // Read images from directory std::vector file_names; if (read_files_in_dir(img_dir.c_str(), file_names) < 0) { std::cerr << "read_files_in_dir failed." 
<< std::endl; return -1; } std::unordered_map labels_map; read_labels(labels_filename, labels_map); assert(kNumClass == labels_map.size()); prepare_buffer(engine, &device_buffers[0], &device_buffers[1], &device_buffers[2], &output_buffer_host, &output_seg_buffer_host, &decode_ptr_host, &decode_ptr_device, cuda_post_process); // // batch predict for (size_t i = 0; i < file_names.size(); i += kBatchSize) { // Get a batch of images std::vector img_batch; std::vector img_name_batch; for (size_t j = i; j < i + kBatchSize && j < file_names.size(); j++) { cv::Mat img = cv::imread(img_dir + "/" + file_names[j]); img_batch.push_back(img); img_name_batch.push_back(file_names[j]); } // Preprocess cuda_batch_preprocess(img_batch, device_buffers[0], kInputW, kInputH, stream); // Run inference infer(*context, stream, (void**)device_buffers, output_buffer_host, output_seg_buffer_host, kBatchSize, decode_ptr_host, decode_ptr_device, model_bboxes, cuda_post_process); std::vector> res_batch; if (cuda_post_process == "c") { // NMS batch_nms(res_batch, output_buffer_host, img_batch.size(), kOutputSize, kConfThresh, kNmsThresh); for (size_t b = 0; b < img_batch.size(); b++) { auto& res = res_batch[b]; cv::Mat img = img_batch[b]; auto masks = process_mask(&output_seg_buffer_host[b * kOutputSegSize], kOutputSegSize, res); draw_mask_bbox(img, res, masks, labels_map); cv::imwrite("_" + img_name_batch[b], img); } } else if (cuda_post_process == "g") { // Process gpu decode and nms results // batch_process(res_batch, decode_ptr_host, img_batch.size(), bbox_element, img_batch); // todo seg in gpu std::cerr << "seg_postprocess is not support in gpu right now" << std::endl; } } // Release stream and buffers cudaStreamDestroy(stream); CUDA_CHECK(cudaFree(device_buffers[0])); CUDA_CHECK(cudaFree(device_buffers[1])); CUDA_CHECK(cudaFree(device_buffers[2])); CUDA_CHECK(cudaFree(decode_ptr_device)); delete[] decode_ptr_host; delete[] output_buffer_host; delete[] output_seg_buffer_host; 
def get_img_path_batches(batch_size, img_dir):
    """
    description: Recursively collect every file below img_dir and group the paths
                 into batches of at most batch_size entries.
    param:
        batch_size: int, maximum number of paths per batch
        img_dir:    str, root directory that is walked recursively
    return:
        ret: list of batches, each batch being a list of file paths; the last
             batch may hold fewer than batch_size paths
    """
    ret = []
    batch = []
    for root, dirs, files in os.walk(img_dir):
        # os.walk yields entries in arbitrary, file-system dependent order.
        # Sorting `dirs` in place fixes the traversal order and sorting the file
        # names fixes the order inside each directory, so batches are reproducible.
        dirs.sort()
        for name in sorted(files):
            if len(batch) == batch_size:
                ret.append(batch)
                batch = []
            batch.append(os.path.join(root, name))
    if batch:
        ret.append(batch)
    return ret
def __init__(self, engine_file_path):
    """
    description: Create a CUDA context, deserialize the TensorRT engine and
                 allocate one host/device buffer pair per engine binding.
    param:
        engine_file_path: str, path of the serialized engine file
    """
    # Create a Context on this device,
    self.ctx = cuda.Device(0).make_context()
    stream = cuda.Stream()
    TRT_LOGGER = trt.Logger(trt.Logger.INFO)
    runtime = trt.Runtime(TRT_LOGGER)

    # Deserialize the engine from file
    with open(engine_file_path, "rb") as f:
        engine = runtime.deserialize_cuda_engine(f.read())
    context = engine.create_execution_context()

    host_inputs = []
    cuda_inputs = []
    host_outputs = []
    cuda_outputs = []
    bindings = []

    for binding in engine:
        binding_shape = engine.get_binding_shape(binding)
        print('bingding:', binding, binding_shape)
        # The leading dimension of each binding is taken as the batch size
        self.batch_size = binding_shape[0]
        size = trt.volume(binding_shape) * engine.max_batch_size
        dtype = trt.nptype(engine.get_binding_dtype(binding))
        # Allocate host and device buffers
        host_mem = cuda.pagelocked_empty(size, dtype)
        cuda_mem = cuda.mem_alloc(host_mem.nbytes)
        # Append the device buffer to device bindings.
        bindings.append(int(cuda_mem))
        # Append to the appropriate list.
        if engine.binding_is_input(binding):
            self.input_w = binding_shape[-1]
            self.input_h = binding_shape[-2]
            host_inputs.append(host_mem)
            cuda_inputs.append(cuda_mem)
        else:
            host_outputs.append(host_mem)
            cuda_outputs.append(cuda_mem)

    # Store
    self.stream = stream
    self.context = context
    self.engine = engine
    self.host_inputs = host_inputs
    self.cuda_inputs = cuda_inputs
    self.host_outputs = host_outputs
    self.cuda_outputs = cuda_outputs
    self.bindings = bindings

    # Data length
    self.det_output_length = host_outputs[0].shape[0]
    self.seg_output_length = host_outputs[1].shape[0]
    # The prototype masks are produced at a quarter of the input resolution
    self.seg_w = self.input_w // 4
    self.seg_h = self.input_h // 4
    # Channel count = total length / (height * width). The previous code divided
    # by seg_w * seg_w, which is only correct for square network inputs.
    self.seg_c = self.seg_output_length // (self.seg_h * self.seg_w)
    self.det_row_output_length = self.seg_c + DET_NUM + POSE_NUM + OBB_NUM

    # Draw mask
    self.colors_obj = Colors()
def get_raw_image_zeros(self, image_path_batch=None):
    """
    description: Generate warm-up data: batch_size all-black uint8 images of
                 shape (input_h, input_w, 3). image_path_batch is not used.
    """
    frame_shape = [self.input_h, self.input_w, 3]
    produced = 0
    for produced in range(self.batch_size):
        # A fresh array is created for every frame so that callers never share buffers
        yield np.zeros(frame_shape, dtype=np.uint8)
def xywh2xyxy(self, origin_h, origin_w, x):
    """
    description: Map nx4 boxes from the letterboxed network input back onto the
                 original image. Columns 0 and 2 are handled as x coordinates and
                 columns 1 and 3 as y coordinates: the padding offset of the
                 letterboxed axis is subtracted and all four columns are divided
                 by the resize ratio. No centre/size conversion is performed here.
    param:
        origin_h:   height of original image
        origin_w:   width of original image
        x:          A boxes numpy, each row holds four coordinates in network input space
    return:
        y:          A boxes numpy of the same shape/dtype, in original image coordinates
    """
    ratio_w = self.input_w / origin_w
    ratio_h = self.input_h / origin_h
    if ratio_h > ratio_w:
        # Width is the limiting side: the image was padded at top and bottom
        ratio = ratio_w
        pad_x = 0
        pad_y = (self.input_h - ratio_w * origin_h) / 2
    else:
        # Height is the limiting side: the image was padded at left and right
        ratio = ratio_h
        pad_x = (self.input_w - ratio_h * origin_w) / 2
        pad_y = 0

    y = np.zeros_like(x)
    for col, pad in ((0, pad_x), (1, pad_y), (2, pad_x), (3, pad_y)):
        y[:, col] = x[:, col] - pad
    y /= ratio
    return y
def bbox_iou(self, box1, box2, x1y1x2y2=True):
    """
    description: compute the IoU between box1 and box2 (numpy broadcasting
                 applies, e.g. one box against many). Widths and heights are
                 computed with a +1 pixel-inclusive convention.
    param:
        box1: nx4 array, rows are (x1, y1, x2, y2) or (center_x, center_y, w, h)
        box2: nx4 array, same layout as box1
        x1y1x2y2: True if the rows are corner coordinates, False for center/size
    return:
        iou: computed iou
    """

    def corners(boxes):
        # Return the four corner columns (x1, y1, x2, y2) of an nx4 array
        if x1y1x2y2:
            return boxes[:, 0], boxes[:, 1], boxes[:, 2], boxes[:, 3]
        left, right = boxes[:, 0] - boxes[:, 2] / 2, boxes[:, 0] + boxes[:, 2] / 2
        top, bottom = boxes[:, 1] - boxes[:, 3] / 2, boxes[:, 1] + boxes[:, 3] / 2
        return left, top, right, bottom

    a_x1, a_y1, a_x2, a_y2 = corners(box1)
    b_x1, b_y1, b_x2, b_y2 = corners(box2)

    # Overlap extent along each axis, clipped at zero for disjoint boxes
    overlap_w = np.clip(np.minimum(a_x2, b_x2) - np.maximum(a_x1, b_x1) + 1, 0, None)
    overlap_h = np.clip(np.minimum(a_y2, b_y2) - np.maximum(a_y1, b_y1) + 1, 0, None)
    inter_area = overlap_w * overlap_h

    area_a = (a_x2 - a_x1 + 1) * (a_y2 - a_y1 + 1)
    area_b = (b_x2 - b_x1 + 1) * (b_y2 - b_y1 + 1)

    # The small epsilon guards against a zero denominator
    return inter_area / (area_a + area_b - inter_area + 1e-16)
def process_mask(self, output_proto_mask, result_proto_coef, result_boxes, ih, iw):
    """
    description: Build one binary mask per detection from the prototype masks
                 and the per-detection mask coefficients.
    param:
        output_proto_mask: prototype mask buffer, reshaped here to
                           (seg_c, seg_h, seg_w), e.g. (32, 160, 160) for 640x640 input
        result_proto_coef: prototype mask coefficients (n, seg_c), n represents n results
        result_boxes     : (n, 4) boxes [x1, y1, x2, y2] in original image coordinates
        ih: rows of original image
        iw: cols of original image
    return:
        mask_result: (n, ih, iw), 1 inside the thresholded mask within each box, 0 elsewhere;
                     shape (0, ih, iw) when there is no detection
    """
    # Nothing detected: return an empty stack instead of failing in the matmul below
    if len(result_boxes) == 0 or len(result_proto_coef) == 0:
        return np.zeros((0, ih, iw))

    result_proto_masks = output_proto_mask.reshape(self.seg_c, self.seg_h, self.seg_w)
    c, mh, mw = result_proto_masks.shape
    # Linear combination of the prototypes per detection, squashed to (0, 1)
    logits = result_proto_coef @ result_proto_masks.astype(np.float32).reshape(c, -1)
    masks = self.sigmoid(logits).reshape(-1, mh, mw)

    mask_result = []
    for mask, box in zip(masks, result_boxes):
        mask_s = np.zeros((ih, iw))
        # Bring the low-resolution mask to the size of the original image
        crop_mask = self.scale_mask(mask, ih, iw)
        x1, y1, x2, y2 = (int(v) for v in box[:4])
        # Keep the mask only inside its bounding box, binarised at 0.5
        mask_s[y1:y2, x1:x2] = crop_mask[y1:y2, x1:x2] >= 0.5
        mask_result.append(mask_s)
    return np.array(mask_result)
class Colors:
    """
    description: Fixed palette of 20 colors. Calling the object with an index
                 returns the matching color, wrapping around the palette.
    """

    def __init__(self):
        hexs = ('FF3838', 'FF9D97', 'FF701F', 'FFB21D', 'CFD231', '48F90A', '92CC17', '3DDB86', '1A9334', '00D4BB',
                '2C99A8', '00C2FF', '344593', '6473FF', '0018EC', '8438FF', '520085', 'CB38FF', 'FF95C8', 'FF37C7')
        palette = []
        for code in hexs:
            palette.append(self.hex2rgb(f'#{code}'))
        self.palette = palette
        self.n = len(palette)

    def __call__(self, i, bgr=False):
        """
        param:
            i:   index of the color; taken modulo the palette size
            bgr: return the channels as (b, g, r) instead of (r, g, b)
        """
        rgb = self.palette[int(i) % self.n]
        if bgr:
            return (rgb[2], rgb[1], rgb[0])
        return rgb

    @staticmethod
    def hex2rgb(h):
        """Convert a '#RRGGBB' string into an (r, g, b) tuple of ints."""
        red = int(h[1:3], 16)
        green = int(h[3:5], 16)
        blue = int(h[5:7], 16)
        return (red, green, blue)
ctypes.CDLL(PLUGIN_LIBRARY) # load coco labels categories = ["person", "bicycle", "car", "motorcycle", "airplane", "bus", "train", "truck", "boat", "traffic light", "fire hydrant", "stop sign", "parking meter", "bench", "bird", "cat", "dog", "horse", "sheep", "cow", "elephant", "bear", "zebra", "giraffe", "backpack", "umbrella", "handbag", "tie", "suitcase", "frisbee", "skis", "snowboard", "sports ball", "kite", "baseball bat", "baseball glove", "skateboard", "surfboard", "tennis racket", "bottle", "wine glass", "cup", "fork", "knife", "spoon", "bowl", "banana", "apple", "sandwich", "orange", "broccoli", "carrot", "hot dog", "pizza", "donut", "cake", "chair", "couch", "potted plant", "bed", "dining table", "toilet", "tv", "laptop", "mouse", "remote", "keyboard", "cell phone", "microwave", "oven", "toaster", "sink", "refrigerator", "book", "clock", "vase", "scissors", "teddy bear", "hair drier", "toothbrush"] if os.path.exists('output/'): shutil.rmtree('output/') os.makedirs('output/') # a YoLo11TRT instance yolo11_wrapper = YoLo11TRT(engine_file_path) try: print('batch size is', yolo11_wrapper.batch_size) image_dir = "images/" image_path_batches = get_img_path_batches(yolo11_wrapper.batch_size, image_dir) for i in range(10): # create a new thread to do warm_up thread1 = warmUpThread(yolo11_wrapper) thread1.start() thread1.join() for batch in image_path_batches: # create a new thread to do inference thread1 = inferThread(yolo11_wrapper, batch) thread1.start() thread1.join() finally: # destroy the instance yolo11_wrapper.destroy() ================================================ FILE: yolo11_tripy/.gitignore ================================================ imagenet_classes.txt *.JPEG *.pt ================================================ FILE: yolo11_tripy/README.md ================================================ # YOLO11 Tripy This example implements a YOLO11 classifier model using [Tripy](https://nvidia.github.io/TensorRT-Incubator/). 
## Running The Example

Run the following commands from the [`yolo11_tripy`](./) directory:

1. Install Dependencies:

    ```bash
    python3 -m pip install -r requirements.txt
    ```

2. Download the ImageNet classes file:

    ```bash
    wget https://raw.githubusercontent.com/joannzhang00/ImageNet-dataset-classes-labels/main/imagenet_classes.txt
    ```

3. [*Optional*] Download some images:

    ```bash
    wget https://raw.githubusercontent.com/EliSchwartz/imagenet-sample-images/master/n01558993_robin.JPEG
    wget https://raw.githubusercontent.com/EliSchwartz/imagenet-sample-images/master/n04389033_tank.JPEG
    ```

    You can skip this step if you already have images you'd like to classify.

4. Build the model:

    ```bash
    python3 compile_classifier.py
    ```

    You can configure various aspects of the model when you compile.
    Run `python3 compile_classifier.py -h` for details.

5. Run inference:

    ```bash
    python3 classify.py n01558993_robin.JPEG n04389033_tank.JPEG
    ```

    The `classify.py` script allows you to pass one or more image file paths on the command line.
    The images are batched and classified in a single forward pass.
def preprocess(image):
    """
    Turn a BGR image (H, W, C) into a normalized NCHW float32 batch of one.

    The image is center-cropped to a square, resized to IMAGE_W x IMAGE_H,
    converted to RGB, scaled to [0, 1] and laid out as (1, C, IMAGE_H, IMAGE_W).

    Raises:
        ValueError: if `image` is None, which is what `cv2.imread` returns for
            a path it cannot read.
    """
    if image is None:
        raise ValueError("Could not read image: cv2.imread returned None (missing or unsupported file?)")

    h, w, _ = image.shape

    # Crop the center square frame
    m = min(h, w)
    top = (h - m) // 2
    left = (w - m) // 2
    image = image[top:top + m, left:left + m]

    # Resize the square crop to the target size. cv2.resize expects dsize as
    # (width, height); the previous (IMAGE_H, IMAGE_W) order was only correct
    # when both constants are equal.
    image = cv2.resize(image, (IMAGE_W, IMAGE_H), interpolation=cv2.INTER_LINEAR)

    # Convert BGR to RGB
    image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

    # Normalize to [0,1]
    image = image.astype(np.float32) / 255.0

    # HWC to CHW format
    image = image.transpose(2, 0, 1)

    # CHW to NCHW format (add batch dimension)
    image = np.expand_dims(image, axis=0)

    # Convert the image to row-major order, also known as "C order"
    image = np.ascontiguousarray(image)

    return image
provided." f"\nPlease recompile the model with a larger maximum batch size using the " f"`--max-images` argument in `compile_classifier.py`." ) images = [preprocess(load_image(path)) for path in args.images] batch = tp.Tensor(np.concatenate(images, axis=0)) # Warm up the model: _, _ = model(tp.zeros_like(batch, dtype=dtype).eval()) # Cast the input based on the model type. # Note that the result will be in GPU memory, so we don't need an explicit copy. batch = tp.cast(batch, dtype).eval() start = time.perf_counter() batch_scores, batch_preds = model(batch) end = time.perf_counter() print(f"Inference + Postprocessing took: {(end - start) * 1000:.3f} ms") # Copy the scores back to CPU memory and convert to numpy: batch_scores = np.from_dlpack(tp.copy(batch_scores, device=tp.device("cpu"))) batch_preds = np.from_dlpack(tp.copy(batch_preds, device=tp.device("cpu"))) for path, scores, preds in zip(args.images, batch_scores, batch_preds): print(f"Top {len(preds)} predictions for:", path) for idx, (score, pred) in enumerate(zip(scores, preds)): print(f" {idx + 1}. 
def download_weights(model_variant, directory):
    """
    Downloads the pretrained YOLO11 classifier checkpoint for a model variant.

    If the checkpoint is already present in `directory`, nothing is downloaded.
    Otherwise the file is streamed to a temporary `.part` file and only moved to
    its final name once the download completed, so that an interrupted download
    is never mistaken for a valid cached checkpoint on the next run.

    Args:
        model_variant: Variant suffix used in the checkpoint name (e.g. "n", "s", "m", "l", "x").
        directory: Directory in which the checkpoint is stored. Created if missing.

    Returns:
        The path to the checkpoint file.

    Raises:
        requests.HTTPError: If the server responds with an error status.
        requests.RequestException: On connection problems or timeouts.
    """
    filename = f"yolo11{model_variant}-cls.pt"
    out_path = os.path.join(directory, filename)
    if os.path.exists(out_path):
        print(f"Checkpoint already exists at: {out_path}, skipping download.")
        return out_path

    URL = f"https://github.com/ultralytics/assets/releases/download/v8.3.0/{filename}"
    tmp_path = out_path + ".part"

    try:
        # The timeout bounds connection setup and each read, so a stalled server cannot
        # hang the script forever. The context manager releases the connection.
        with requests.get(URL, stream=True, timeout=30) as response:
            response.raise_for_status()

            total_size = int(response.headers.get("content-length", 0))

            os.makedirs(directory, exist_ok=True)
            with open(tmp_path, "wb") as f, tqdm(
                desc=f"Downloading checkpoint: {filename}",
                total=total_size,
                unit="B",
                unit_scale=True,
                unit_divisor=1024,
            ) as progress_bar:
                for chunk in response.iter_content(chunk_size=8192):
                    f.write(chunk)
                    progress_bar.update(len(chunk))

        # Publish the checkpoint under its final name only after a complete download.
        os.replace(tmp_path, out_path)
    finally:
        # After a successful `os.replace` the temporary file no longer exists, so
        # this only removes partial downloads left behind by a failure.
        if os.path.exists(tmp_path):
            os.remove(tmp_path)

    return out_path
def main():
    """
    Compiles a YOLO11 classifier, together with its postprocessing, into a Tripy executable.

    Parses the command-line options, builds the requested `Yolo11Cls` variant, loads the
    PyTorch checkpoint weights into it (downloading them first if necessary), compiles the
    model plus softmax/top-3 postprocessing for a dynamic batch dimension and saves the
    resulting executable to disk.
    """
    parser = argparse.ArgumentParser(description="Compiles a YOLO11 classifier model with Tripy.")
    parser.add_argument(
        "--model-variant",
        help="Model variant (n, s, m, l, x)",
        default="n",
        choices=["n", "s", "m", "l", "x"],
    )
    parser.add_argument(
        "-o",
        "--output",
        help="Where to save the Tripy executable",
        # Resolved against this script's directory, like `--checkpoints-dir` below and the
        # default `--model-path` of `classify.py`, so the default location does not depend
        # on the current working directory.
        default=os.path.join(CURDIR, "yolo11-cls.tpymodel"),
    )
    parser.add_argument(
        "--checkpoints-dir",
        help="Where to save PyTorch checkpoints",
        default=os.path.join(CURDIR, "checkpoints"),
    )
    parser.add_argument(
        "--max-images",
        help="Maximum number of images the model will be able to classify at once, i.e. the maximum batch size.",
        default=10,
        type=int,
    )
    parser.add_argument(
        "--dtype",
        help="Data type to use for inference",
        default="float16",
        choices=["float32", "float16"],
    )

    args, _ = parser.parse_known_args()

    # The batch dimension is compiled for the range [1, max_images]; anything below 1
    # would describe an empty/inverted range, so reject it up front with a clear message.
    if args.max_images < 1:
        parser.error(f"--max-images must be at least 1, but got: {args.max_images}")

    config = get_model_config(args.model_variant)
    dtype = getattr(tp, args.dtype)
    model = Yolo11Cls(**config, dtype=dtype)

    weights_path = download_weights(args.model_variant, args.checkpoints_dir)
    model.load_state_dict(load_weights(weights_path, dtype))

    # We compile not only the classifier itself, but also accelerate the postprocessing:
    def infer(batch):
        out = model(batch)
        out = tp.softmax(out, dim=1)
        batch_scores, batch_preds = tp.topk(out, 3, dim=-1)
        return batch_scores, batch_preds

    print("Compiling YOLO11 classifier + postprocessing. This may take a few moments...")
    executable = tp.compile(
        infer,
        args=[
            tp.InputInfo(
                [
                    # Support a range of batch sizes from 1 to `max_images`, optimizing for the midpoint:
                    (1, (args.max_images + 1) // 2, args.max_images),
                    IMAGE_C,
                    IMAGE_H,
                    IMAGE_W,
                ],
                dtype=dtype,
            ),
        ],
    )

    print(f"Saving compiled executable to: {args.output}")
    executable.save(args.output)
kernel_dims=(1, 1), stride=(1, 1), dtype=dtype, ) self.cv2 = ConvBnSilu( in_channels, expanded_out_channels, kernel_dims=(1, 1), stride=(1, 1), dtype=dtype, ) self.m = tp.Sequential( *[ Bottleneck( expanded_out_channels, expanded_out_channels, shortcut, kernel_dims1, kernel_dims2, 1.0, dtype=dtype, ) for _ in range(num_layers) ] ) self.cv3 = ConvBnSilu( 2 * expanded_out_channels, out_channels, kernel_dims=(1, 1), stride=(1, 1), dtype=dtype, ) def forward(self, x): out1 = self.cv1(x) out2 = self.cv2(x) out1 = self.m(out1) out = tp.concatenate((out1, out2), dim=1) out = self.cv3(out) return out class C3K2(tp.Module): def __init__( self, in_channels, out_channels, num_layers, use_c3k, shortcut, expansion_ratio, dtype, ): super().__init__() expanded_out_channels = int(out_channels * expansion_ratio) self.cv1 = ConvBnSilu( in_channels, 2 * expanded_out_channels, kernel_dims=(1, 1), stride=(1, 1), dtype=dtype, ) self.m = tp.Sequential( *[ ( C3k( expanded_out_channels, expanded_out_channels, 2, shortcut, (3, 3), (3, 3), 0.5, dtype=dtype, ) if use_c3k else Bottleneck( expanded_out_channels, expanded_out_channels, shortcut, (3, 3), (3, 3), 0.5, dtype=dtype, ) ) for _ in range(num_layers) ] ) # Number of input channels to CV2 is the output channels of CV1 plus all # output channels from the layers in `m`. cv2_in_channels = (2 * expanded_out_channels) + (expanded_out_channels * num_layers) self.cv2 = ConvBnSilu(cv2_in_channels, out_channels, (1, 1), (1, 1), dtype=dtype) def forward(self, x): x = self.cv1(x) _, m_inp = tp.split(x, 2, dim=1) cat = x # We manually iterate over the Sequential module here since we need to access the intermediate outputs. 
class Attention(tp.Module):
    """
    Multi-head self-attention applied across the spatial positions of a feature map.

    A single 1x1 `ConvBn` produces the queries, keys and values for all heads. The
    attention output is summed with the result of a depthwise 3x3 `ConvBn` applied to
    the values (`pe`, presumably a positional encoding) and then passed through a
    final 1x1 `ConvBn` projection.

    Args:
        dim: Number of input and output channels.
        num_heads: Number of attention heads.
        attn_ratio: Ratio of the per-head key size to the per-head channel count
            (`dim // num_heads`).
        dtype: Data type of the layer parameters.
    """

    def __init__(self, dim, num_heads, attn_ratio, dtype):
        super().__init__()
        self.dim = dim
        self.num_heads = num_heads
        self.key_dim = int((dim // num_heads) * attn_ratio)
        self.scale = self.key_dim**-0.5

        # The fused projection outputs `dim` channels plus `key_dim` channels per head
        # twice; `forward` splits them per head into query, key and value.
        qkv_channels = dim + 2 * (self.key_dim * num_heads)
        self.qkv = ConvBn(dim, qkv_channels, (1, 1), (1, 1), dtype=dtype)
        # One group per channel, i.e. a depthwise convolution.
        self.pe = ConvBn(dim, dim, (3, 3), (1, 1), dtype=dtype, num_groups=dim)
        self.proj = ConvBn(dim, dim, (1, 1), (1, 1), dtype=dtype)

    def forward(self, x):
        batch, _, height, width = x.shape
        num_positions = height * width

        # Flatten the spatial dimensions and separate the heads before splitting the
        # fused projection into its query, key and value parts.
        qkv = tp.reshape(self.qkv(x), (batch, self.num_heads, -1, num_positions))
        query, key, value = tp.split(qkv, [self.key_dim, self.key_dim, self.key_dim * 2], dim=2)

        # Scaled dot-product attention over the flattened spatial positions.
        scores = (tp.transpose(query, 2, 3) @ key) * self.scale
        weights = tp.softmax(scores, dim=3)
        attended = value @ tp.transpose(weights, 2, 3)
        attended = tp.reshape(attended, (batch, -1, height, width))

        # NOTE(review): this reshape assumes the values of all heads add up to `dim`
        # channels (true when `attn_ratio` is 0.5) - confirm before using other ratios.
        positional = self.pe(tp.reshape(value, (batch, self.dim, height, width)))

        return self.proj(attended + positional)
def get_depth(d, gd):
    """
    Scales a block repeat count by the model's depth multiple.

    Args:
        d: The unscaled number of repeats.
        gd: The depth multiple of the model variant.

    Returns:
        `d` unchanged if it is 1, otherwise `d * gd` rounded to the nearest integer
        (ties go to the even neighbour) and clamped to a minimum of 1.
    """
    if d == 1:
        return d

    # Python's built-in `round` already rounds ties to the nearest even integer.
    # The manual "round even ties down" correction is only needed in languages whose
    # `round` goes half away from zero (e.g. C++); applying it on top of Python's
    # `round` corrects twice and yields a result that is one too small
    # (for example 2.5 -> 1 instead of 2).
    return max(round(d * gd), 1)
get_width(128, gw, max_channels), (3, 3), (2, 2), dtype=dtype, ), C3K2( get_width(128, gw, max_channels), get_width(256, gw, max_channels), get_depth(2, gd), use_c3k, True, 0.25, dtype=dtype, ), ConvBnSilu( get_width(256, gw, max_channels), get_width(256, gw, max_channels), (3, 3), (2, 2), dtype=dtype, ), C3K2( get_width(256, gw, max_channels), get_width(512, gw, max_channels), get_depth(2, gd), use_c3k, True, 0.25, dtype=dtype, ), ConvBnSilu( get_width(512, gw, max_channels), get_width(512, gw, max_channels), (3, 3), (2, 2), dtype=dtype, ), C3K2( get_width(512, gw, max_channels), get_width(512, gw, max_channels), get_depth(2, gd), True, True, 0.5, dtype=dtype, ), ConvBnSilu( get_width(512, gw, max_channels), get_width(1024, gw, max_channels), (3, 3), (2, 2), dtype=dtype, ), C3K2( get_width(1024, gw, max_channels), get_width(1024, gw, max_channels), get_depth(2, gd), True, True, 0.5, dtype=dtype, ), C2PSA( get_width(1024, gw, max_channels), get_width(1024, gw, max_channels), get_depth(2, gd), 0.5, dtype=dtype, ), Yolo11Head(get_width(1024, gw, max_channels), dtype=dtype), ) def forward(self, x): x = self.model(x) return x ================================================ FILE: yolo11_tripy/requirements.txt ================================================ -f https://nvidia.github.io/TensorRT-Incubator/packages.html nvtripy>=0.1.1 opencv-python-headless numpy torch ================================================ FILE: yolo26/.clang-format ================================================ # Google C/C++ Code Style settings (with 4-space) # Refered to https://github.com/kehanXue/google-style-clang-format/blob/master/.clang-format Language: Cpp BasedOnStyle: Google AccessModifierOffset: -1 AlignAfterOpenBracket: Align AlignConsecutiveAssignments: None AlignOperands: Align AllowAllArgumentsOnNextLine: true AllowAllConstructorInitializersOnNextLine: true AllowAllParametersOfDeclarationOnNextLine: false AllowShortBlocksOnASingleLine: Empty AllowShortCaseLabelsOnASingleLine: 
false AllowShortFunctionsOnASingleLine: Inline AllowShortIfStatementsOnASingleLine: Never # To avoid conflict, set this "Never" and each "if statement" should include brace when coding AllowShortLambdasOnASingleLine: Inline AllowShortLoopsOnASingleLine: false AlwaysBreakAfterReturnType: None AlwaysBreakTemplateDeclarations: Yes BinPackArguments: true BreakBeforeBraces: Custom BraceWrapping: AfterCaseLabel: false AfterClass: false AfterStruct: false AfterControlStatement: Never AfterEnum: false AfterFunction: false AfterNamespace: false AfterUnion: false AfterExternBlock: false BeforeCatch: false BeforeElse: false BeforeLambdaBody: false IndentBraces: false SplitEmptyFunction: false SplitEmptyRecord: false SplitEmptyNamespace: false BreakBeforeBinaryOperators: None BreakBeforeTernaryOperators: true BreakConstructorInitializers: BeforeColon BreakInheritanceList: BeforeColon ColumnLimit: 120 CompactNamespaces: false ContinuationIndentWidth: 8 Cpp11BracedListStyle: true DerivePointerAlignment: false # Make sure the * or & align on the left EmptyLineBeforeAccessModifier: LogicalBlock FixNamespaceComments: true IncludeBlocks: Preserve IndentCaseLabels: true IndentPPDirectives: None IndentWidth: 4 KeepEmptyLinesAtTheStartOfBlocks: true MaxEmptyLinesToKeep: 1 NamespaceIndentation: None ObjCSpaceAfterProperty: false ObjCSpaceBeforeProtocolList: true PointerAlignment: Left ReflowComments: false # SeparateDefinitionBlocks: Always # Only support since clang-format 14 SpaceAfterCStyleCast: false SpaceAfterLogicalNot: false SpaceAfterTemplateKeyword: true SpaceBeforeAssignmentOperators: true SpaceBeforeCpp11BracedList: false SpaceBeforeCtorInitializerColon: true SpaceBeforeInheritanceColon: true SpaceBeforeParens: ControlStatements SpaceBeforeRangeBasedForLoopColon: true SpaceBeforeSquareBrackets: false SpaceInEmptyParentheses: false SpacesBeforeTrailingComments: 2 SpacesInAngles: false SpacesInCStyleCastParentheses: false SpacesInContainerLiterals: false SpacesInParentheses: 
false SpacesInSquareBrackets: false Standard: c++11 TabWidth: 8 UseTab: Never ================================================ FILE: yolo26/.gitignore ================================================ **/build/** **/models/** **/*.onnx **/*.engine **/*.pt ================================================ FILE: yolo26/CMakeLists.txt ================================================ cmake_minimum_required(VERSION 3.10) project(yolo26) add_definitions(-std=c++11) add_definitions(-DAPI_EXPORTS) set(CMAKE_CXX_STANDARD 11) set(CMAKE_BUILD_TYPE Debug) set(CMAKE_CUDA_COMPILER /usr/local/cuda/bin/nvcc) enable_language(CUDA) include_directories(${PROJECT_SOURCE_DIR}/include) include_directories(${PROJECT_SOURCE_DIR}/plugin) # include and link dirs of cuda and tensorrt, you need adapt them if yours are different if(CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64") message("embed_platform on") include_directories(/usr/local/cuda/targets/aarch64-linux/include) link_directories(/usr/local/cuda/targets/aarch64-linux/lib) else() message("embed_platform off") # cuda include_directories(/usr/local/cuda/include) link_directories(/usr/local/cuda/lib64) # tensorrt include_directories(/workspace/shared/TensorRT-8.6.3/include) link_directories(/workspace/shared/TensorRT-8.6.3/lib) endif() add_library(yololayerplugins SHARED ${PROJECT_SOURCE_DIR}/plugin/yololayer.cu) target_link_libraries(yololayerplugins nvinfer cudart) find_package(OpenCV) include_directories(${OpenCV_INCLUDE_DIRS}) file(GLOB_RECURSE SRCS ${PROJECT_SOURCE_DIR}/src/*.cpp ${PROJECT_SOURCE_DIR}/src/*.cu) add_executable(yolo26_det ${PROJECT_SOURCE_DIR}/yolo26_det.cpp ${SRCS}) target_link_libraries(yolo26_det nvinfer) target_link_libraries(yolo26_det cudart) target_link_libraries(yolo26_det yololayerplugins) target_link_libraries(yolo26_det ${OpenCV_LIBS}) add_executable(yolo26_obb ${PROJECT_SOURCE_DIR}/yolo26_obb.cpp ${SRCS}) target_link_libraries(yolo26_obb nvinfer) target_link_libraries(yolo26_obb cudart) 
target_link_libraries(yolo26_obb yololayerplugins) target_link_libraries(yolo26_obb ${OpenCV_LIBS}) add_executable(yolo26_cls ${PROJECT_SOURCE_DIR}/yolo26_cls.cpp ${SRCS}) target_link_libraries(yolo26_cls nvinfer) target_link_libraries(yolo26_cls cudart) target_link_libraries(yolo26_cls yololayerplugins) target_link_libraries(yolo26_cls ${OpenCV_LIBS}) ================================================ FILE: yolo26/README.md ================================================ ## Introduction Yolo26 model supports TensorRT-8. Training code [link](https://github.com/ultralytics/ultralytics/archive/refs/tags/v8.4.0.zip) ## Environment * cuda 12.4 * cudnn 9.1.0.70 * tensorrt 8.6.3 * opencv 4.8.0 * ultralytics 8.4.0 ## Support * [✅] Yolo26n-det, Yolo26s-det, Yolo26m-det, Yolo26l-det, Yolo26sx-det, support FP32/FP16 and C++ API * [✅] Yolo26n-obb, Yolo26s-obb, Yolo26m-obb, Yolo26l-obb, Yolo26sx-obb, support FP32/FP16 and C++ API * [✅] Yolo26n-cls, Yolo26s-cls, Yolo26m-cls, Yolo26l-cls, Yolo26sx-cls, support FP32/FP16 and C++ API ## COMING FEATURES * [⏳] Windows OS Support * [⏳] Support Batched Inputs * [⏳] Support Quantization * [⏳] Yolo26-cls models * [⏳] Yolo26-pose models * [⏳] Yolo26-seg models ## Config * Choose the YOLO26 sub-model n/s/m/l/x from command line arguments. * Other configs please check [include/config.h](include/config.h) ## Build and Run 1. generate .wts from pytorch with .pt, or download .wts from model zoo ```shell # Download ultralytics wget https://github.com/ultralytics/ultralytics/archive/refs/tags/v8.4.4.zip -O ultralytics-8.4.4.zip # Unzip ultralytics unzip ultralytics-8.4.4.zip cd ultralytics-8.4.4 # Download models For Detection wget https://github.com/ultralytics/assets/releases/download/v8.4.0/yolo26n.pt -O yolo26n.pt # to download other models, replace 'yolo26n.pt' with 'yolo26s.pt', 'yolo26m.pt', 'yolo26l.pt' or 'yolo26x.pt' # Generate .wts cp [PATH-TO-MAIN-FOLDER]/gen_wts.py . 
python gen_wts.py -w yolo26n.pt -o yolo26n.wts -t detect # A file 'yolo26n.wts' will be generated. # Download models for Obb wget https://github.com/ultralytics/assets/releases/download/v8.4.0/yolo26n-obb.pt -O yolo26n-obb.pt # to download other models, replace 'yolo26n-obb.pt' with 'yolo26s-obb.pt', 'yolo26m-obb.pt', 'yolo26l-obb.pt' or 'yolo26x-obb.pt' # Generate .wts cp [PATH-TO-MAIN-FOLDER]/gen_wts.py . python gen_wts.py -w yolo26n-obb.pt -o yolo26n-obb.wts -t obb # A file 'yolo26n-obb.wts' will be generated. # Download models for Cls wget https://github.com/ultralytics/assets/releases/download/v8.4.0/yolo26n-cls.pt -O yolo26n-cls.pt # to download other models, replace 'yolo26n-cls.pt' with 'yolo26s-cls.pt', 'yolo26m-cls.pt', 'yolo26l-cls.pt' or 'yolo26x-cls.pt' # Generate .wts cp [PATH-TO-MAIN-FOLDER]/gen_wts.py . python gen_wts.py -w yolo26n-cls.pt -o yolo26n-cls.wts -t cls # A file 'yolo26n-cls.wts' will be generated. ``` 2. build and run ```shell cd [PATH-TO-MAIN-FOLDER] mkdir build cd build cmake .. make ``` ### Detection ```shell cp [PATH-TO-ultralytics]/yolo26n.wts . # Build and serialize TensorRT engine ./yolo26_det -s yolo26n.wts yolo26n.engine [n/s/m/l/x] # Run inference ./yolo26_det -d yolo26n.engine ../images # results saved in build directory ``` ### Obb ```shell cp [PATH-TO-ultralytics]/yolo26n-obb.wts . # Build and serialize TensorRT engine ./yolo26_obb -s yolo26n-obb.wts yolo26n-obb.engine [n/s/m/l/x] # Run inference ./yolo26_obb -d yolo26n-obb.engine ../images # results saved in build directory ``` ### Cls ```shell # Generate the classification labels file (imagenet_classes.txt) in the build folder, or download it: # wget https://raw.githubusercontent.com/joannzhang00/ImageNet-dataset-classes-labels/main/imagenet_classes.txt cp [PATH-TO-ultralytics]/yolo26n-cls.wts . 
def parse_args():
    """
    Parse the command line of the .pt -> .wts converter.

    The output path defaults to the weights path with its extension replaced by
    '.wts'. If the given output path is an existing directory, the file is placed
    inside it, named after the weights file.

    Returns:
        A tuple `(weights_path, output_path, model_type)`.

    Raises:
        SystemExit: If the weights path does not point to an existing file
            (or if argparse rejects the command line).
    """
    cli = argparse.ArgumentParser(description='Convert .pt file to .wts')
    cli.add_argument('-w', '--weights', required=True,
                     help='Input weights (.pt) file path (required)')
    cli.add_argument('-o', '--output',
                     help='Output (.wts) file path (optional)')
    cli.add_argument('-t', '--type', type=str, default='detect',
                     choices=['detect', 'cls', 'seg', 'pose', 'obb'],
                     help='determines the model is detection/classification')
    opts = cli.parse_args()

    weights = opts.weights
    if not os.path.isfile(weights):
        raise SystemExit('Invalid input file')

    output = opts.output
    if not output:
        # No output given: write next to the input file.
        output = os.path.splitext(weights)[0] + '.wts'
    elif os.path.isdir(output):
        # Output is a directory: keep the input's base name inside it.
        stem = os.path.splitext(os.path.basename(weights))[0]
        output = os.path.join(output, stem + '.wts')

    return weights, output, opts.type
f.write(struct.pack('>f', float(vv)).hex()) f.write('\n') ================================================ FILE: yolo26/include/block.h ================================================ #pragma once #include #include #include #include "NvInfer.h" using namespace std; std::map loadWeights(const std::string file); nvinfer1::IScaleLayer* addBatchNorm2d(nvinfer1::INetworkDefinition* network, std::map weightMap, nvinfer1::ITensor& input, std::string lname, float eps); nvinfer1::IElementWiseLayer* convBnSiLU(nvinfer1::INetworkDefinition* network, std::map weightMap, nvinfer1::ITensor& input, int ch, std::vector k, int s, std::string lname, int g = 1); nvinfer1::IElementWiseLayer* C3K2(nvinfer1::INetworkDefinition* network, std::map& weightMap, nvinfer1::ITensor& input, int c1, int c2, int n, bool c3k, bool shortcut, bool atnn, float e, std::string lname); nvinfer1::IElementWiseLayer* SPPF(nvinfer1::INetworkDefinition* network, std::map weightMap, nvinfer1::ITensor& input, int c1, int c2, int k, bool shortcut, std::string lname); nvinfer1::IElementWiseLayer* C2PSA(nvinfer1::INetworkDefinition* network, std::map& weightMap, nvinfer1::ITensor& input, int c1, int c2, int n, float e, std::string lname); nvinfer1::ILayer* DWConv(nvinfer1::INetworkDefinition* network, std::map weightMap, nvinfer1::ITensor& input, int ch, std::vector k, int s, std::string lname); nvinfer1::ILayer* conv(nvinfer1::INetworkDefinition* network, std::map weightMap, nvinfer1::ITensor& input, int ch, std::vector k, int s, std::string lname, int g = 1, bool act = true); nvinfer1::IPluginV2Layer* addYoloLayer(nvinfer1::INetworkDefinition* network, nvinfer1::ITensor& input, const std::vector& strides, const std::vector& fm_sizes, int stridesLength, bool is_detection, bool is_segmentation, bool is_pose, bool is_obb, int anchorCount); ================================================ FILE: yolo26/include/config.h ================================================ #define USE_FP16 // #define USE_FP32 // #define 
USE_INT8 const static char* kInputTensorName = "images"; const static char* kOutputTensorName = "output"; const static char* kProtoTensorName = "proto"; const static int kNumClass = 80; const static int kPoseNumClass = 1; const static int kNumberOfPoints = 17; // number of keypoints total // obb model's number of classes constexpr static int kObbNumClass = 15; const static int kObbNe = 1; // number of extra parameters const static int kBatchSize = 1; const static int kGpuId = 0; const static int kInputH = 640; const static int kInputW = 640; const static int kObbInputH = 1024; const static int kObbInputW = 1024; const static float kNmsThresh = 0.45f; const static float kConfThresh = 0.3f; const static float kConfThreshKeypoints = 0.5f; // keypoints confidence const static int kMaxInputImageSize = 3000 * 3000; const static int kMaxNumOutputBbox = 300; // Quantization input image folder path const static char* kInputQuantizationFolder = "./coco_calib"; // Classfication model's number of classes constexpr static int kClsNumClass = 1000; // Classfication model's input shape constexpr static int kClsInputH = 224; constexpr static int kClsInputW = 224; ================================================ FILE: yolo26/include/cuda_utils.h ================================================ #ifndef TRTX_CUDA_UTILS_H_ #define TRTX_CUDA_UTILS_H_ #include #ifndef CUDA_CHECK #define CUDA_CHECK(callstr) \ { \ cudaError_t error_code = callstr; \ if (error_code != cudaSuccess) { \ std::cerr << "CUDA error " << error_code << " at " << __FILE__ << ":" << __LINE__; \ assert(0); \ } \ } #endif // CUDA_CHECK #endif // TRTX_CUDA_UTILS_H_ ================================================ FILE: yolo26/include/logging.h ================================================ /* * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
//!
//! \class LogStreamConsumerBuffer
//! \brief String buffer that forwards its contents to an output stream, prefixed with a timestamp and a
//!        severity prefix, whenever it is synchronized (flushed) or destroyed with pending data.
//!
class LogStreamConsumerBuffer : public std::stringbuf {
   public:
    //! \param stream    The stream that receives the formatted log output
    //! \param prefix    Text inserted between the timestamp and the message
    //! \param shouldLog Whether buffered messages are emitted at all
    LogStreamConsumerBuffer(std::ostream& stream, const std::string& prefix, bool shouldLog)
            : mOutput(stream), mPrefix(prefix), mShouldLog(shouldLog) {}

    // Copy the prefix and the logging flag as well: leaving them uninitialized would make the moved-to
    // buffer lose its prefix and read an indeterminate `mShouldLog`.
    LogStreamConsumerBuffer(LogStreamConsumerBuffer&& other)
            : mOutput(other.mOutput), mPrefix(other.mPrefix), mShouldLog(other.mShouldLog) {}

    ~LogStreamConsumerBuffer() {
        // std::streambuf::pbase() gives a pointer to the beginning of the buffered part of the output sequence
        // std::streambuf::pptr() gives a pointer to the current position of the output sequence
        // if the pointer to the beginning is not equal to the pointer to the current position,
        // call putOutput() to log the output to the stream
        if (pbase() != pptr()) {
            putOutput();
        }
    }

    // synchronizes the stream buffer and returns 0 on success
    // synchronizing the stream buffer consists of inserting the buffer contents into the stream,
    // resetting the buffer and flushing the stream
    int sync() override {
        putOutput();
        return 0;
    }

    //! \brief Writes "[MM/DD/YYYY-HH:MM:SS] <prefix><buffered text>" to the output stream, then clears the
    //!        buffer and flushes the stream. Does nothing when logging is disabled.
    void putOutput() {
        if (mShouldLog) {
            // prepend timestamp
            std::time_t timestamp = std::time(nullptr);
            std::tm* tm_local = std::localtime(&timestamp);
            // Write the timestamp to the same stream as the message so that both end up together
            // (e.g. on std::cerr for errors), and restore the fill character afterwards so that the
            // zero padding does not leak into later output on that stream.
            const char previousFill = mOutput.fill('0');
            mOutput << "[";
            mOutput << std::setw(2) << 1 + tm_local->tm_mon << "/";
            mOutput << std::setw(2) << tm_local->tm_mday << "/";
            mOutput << std::setw(4) << 1900 + tm_local->tm_year << "-";
            mOutput << std::setw(2) << tm_local->tm_hour << ":";
            mOutput << std::setw(2) << tm_local->tm_min << ":";
            mOutput << std::setw(2) << tm_local->tm_sec << "] ";
            mOutput.fill(previousFill);
            // std::stringbuf::str() gets the string contents of the buffer
            // insert the buffer contents pre-appended by the appropriate prefix into the stream
            mOutput << mPrefix << str();
            // set the buffer to empty
            str("");
            // flush the stream
            mOutput.flush();
        }
    }

    void setShouldLog(bool shouldLog) { mShouldLog = shouldLog; }

   private:
    std::ostream& mOutput;
    std::string mPrefix;
    bool mShouldLog;
};
LogStreamConsumer(Severity reportableSeverity, Severity severity) : LogStreamConsumerBase(severityOstream(severity), severityPrefix(severity), severity <= reportableSeverity), std::ostream(&mBuffer) // links the stream buffer with the stream , mShouldLog(severity <= reportableSeverity), mSeverity(severity) {} LogStreamConsumer(LogStreamConsumer&& other) : LogStreamConsumerBase(severityOstream(other.mSeverity), severityPrefix(other.mSeverity), other.mShouldLog), std::ostream(&mBuffer) // links the stream buffer with the stream , mShouldLog(other.mShouldLog), mSeverity(other.mSeverity) {} void setReportableSeverity(Severity reportableSeverity) { mShouldLog = mSeverity <= reportableSeverity; mBuffer.setShouldLog(mShouldLog); } private: static std::ostream& severityOstream(Severity severity) { return severity >= Severity::kINFO ? std::cout : std::cerr; } static std::string severityPrefix(Severity severity) { switch (severity) { case Severity::kINTERNAL_ERROR: return "[F] "; case Severity::kERROR: return "[E] "; case Severity::kWARNING: return "[W] "; case Severity::kINFO: return "[I] "; case Severity::kVERBOSE: return "[V] "; default: assert(0); return ""; } } bool mShouldLog; Severity mSeverity; }; //! \class Logger //! //! \brief Class which manages logging of TensorRT tools and samples //! //! \details This class provides a common interface for TensorRT tools and samples to log information to the console, //! and supports logging two types of messages: //! //! - Debugging messages with an associated severity (info, warning, error, or internal error/fatal) //! - Test pass/fail messages //! //! The advantage of having all samples use this class for logging as opposed to emitting directly to stdout/stderr is //! that the logic for controlling the verbosity and formatting of sample output is centralized in one location. //! //! In the future, this class could be extended to support dumping test results to a file in some standard format //! 
(for example, JUnit XML), and providing additional metadata (e.g. timing the duration of a test run). //! //! TODO: For backwards compatibility with existing samples, this class inherits directly from the nvinfer1::ILogger //! interface, which is problematic since there isn't a clean separation between messages coming from the TensorRT //! library and messages coming from the sample. //! //! In the future (once all samples are updated to use Logger::getTRTLogger() to access the ILogger) we can refactor the //! class to eliminate the inheritance and instead make the nvinfer1::ILogger implementation a member of the Logger //! object. class Logger : public nvinfer1::ILogger { public: Logger(Severity severity = Severity::kWARNING) : mReportableSeverity(severity) {} //! //! \enum TestResult //! \brief Represents the state of a given test //! enum class TestResult { kRUNNING, //!< The test is running kPASSED, //!< The test passed kFAILED, //!< The test failed kWAIVED //!< The test was waived }; //! //! \brief Forward-compatible method for retrieving the nvinfer::ILogger associated with this Logger //! \return The nvinfer1::ILogger associated with this Logger //! //! TODO Once all samples are updated to use this method to register the logger with TensorRT, //! we can eliminate the inheritance of Logger from ILogger //! nvinfer1::ILogger& getTRTLogger() { return *this; } //! //! \brief Implementation of the nvinfer1::ILogger::log() virtual method //! //! Note samples should not be calling this function directly; it will eventually go away once we eliminate the //! inheritance from nvinfer1::ILogger //! void log(Severity severity, const char* msg) TRT_NOEXCEPT override { LogStreamConsumer(mReportableSeverity, severity) << "[TRT] " << std::string(msg) << std::endl; } //! //! \brief Method for controlling the verbosity of logging output //! //! \param severity The logger will only emit messages that have severity of this level or higher. //! 
void setReportableSeverity(Severity severity) { mReportableSeverity = severity; } //! //! \brief Opaque handle that holds logging information for a particular test //! //! This object is an opaque handle to information used by the Logger to print test results. //! The sample must call Logger::defineTest() in order to obtain a TestAtom that can be used //! with Logger::reportTest{Start,End}(). //! class TestAtom { public: TestAtom(TestAtom&&) = default; private: friend class Logger; TestAtom(bool started, const std::string& name, const std::string& cmdline) : mStarted(started), mName(name), mCmdline(cmdline) {} bool mStarted; std::string mName; std::string mCmdline; }; //! //! \brief Define a test for logging //! //! \param[in] name The name of the test. This should be a string starting with //! "TensorRT" and containing dot-separated strings containing //! the characters [A-Za-z0-9_]. //! For example, "TensorRT.sample_googlenet" //! \param[in] cmdline The command line used to reproduce the test // //! \return a TestAtom that can be used in Logger::reportTest{Start,End}(). //! static TestAtom defineTest(const std::string& name, const std::string& cmdline) { return TestAtom(false, name, cmdline); } //! //! \brief A convenience overloaded version of defineTest() that accepts an array of command-line arguments //! as input //! //! \param[in] name The name of the test //! \param[in] argc The number of command-line arguments //! \param[in] argv The array of command-line arguments (given as C strings) //! //! \return a TestAtom that can be used in Logger::reportTest{Start,End}(). static TestAtom defineTest(const std::string& name, int argc, char const* const* argv) { auto cmdline = genCmdlineString(argc, argv); return defineTest(name, cmdline); } //! //! \brief Report that a test has started. //! //! \pre reportTestStart() has not been called yet for the given testAtom //! //! \param[in] testAtom The handle to the test that has started //! 
static void reportTestStart(TestAtom& testAtom) { reportTestResult(testAtom, TestResult::kRUNNING); assert(!testAtom.mStarted); testAtom.mStarted = true; } //! //! \brief Report that a test has ended. //! //! \pre reportTestStart() has been called for the given testAtom //! //! \param[in] testAtom The handle to the test that has ended //! \param[in] result The result of the test. Should be one of TestResult::kPASSED, //! TestResult::kFAILED, TestResult::kWAIVED //! static void reportTestEnd(const TestAtom& testAtom, TestResult result) { assert(result != TestResult::kRUNNING); assert(testAtom.mStarted); reportTestResult(testAtom, result); } static int reportPass(const TestAtom& testAtom) { reportTestEnd(testAtom, TestResult::kPASSED); return EXIT_SUCCESS; } static int reportFail(const TestAtom& testAtom) { reportTestEnd(testAtom, TestResult::kFAILED); return EXIT_FAILURE; } static int reportWaive(const TestAtom& testAtom) { reportTestEnd(testAtom, TestResult::kWAIVED); return EXIT_SUCCESS; } static int reportTest(const TestAtom& testAtom, bool pass) { return pass ? reportPass(testAtom) : reportFail(testAtom); } Severity getReportableSeverity() const { return mReportableSeverity; } private: //! //! \brief returns an appropriate string for prefixing a log message with the given severity //! static const char* severityPrefix(Severity severity) { switch (severity) { case Severity::kINTERNAL_ERROR: return "[F] "; case Severity::kERROR: return "[E] "; case Severity::kWARNING: return "[W] "; case Severity::kINFO: return "[I] "; case Severity::kVERBOSE: return "[V] "; default: assert(0); return ""; } } //! //! \brief returns an appropriate string for prefixing a test result message with the given result //! 
static const char* testResultString(TestResult result) { switch (result) { case TestResult::kRUNNING: return "RUNNING"; case TestResult::kPASSED: return "PASSED"; case TestResult::kFAILED: return "FAILED"; case TestResult::kWAIVED: return "WAIVED"; default: assert(0); return ""; } } //! //! \brief returns an appropriate output stream (cout or cerr) to use with the given severity //! static std::ostream& severityOstream(Severity severity) { return severity >= Severity::kINFO ? std::cout : std::cerr; } //! //! \brief method that implements logging test results //! static void reportTestResult(const TestAtom& testAtom, TestResult result) { severityOstream(Severity::kINFO) << "&&&& " << testResultString(result) << " " << testAtom.mName << " # " << testAtom.mCmdline << std::endl; } //! //! \brief generate a command line string from the given (argc, argv) values //! static std::string genCmdlineString(int argc, char const* const* argv) { std::stringstream ss; for (int i = 0; i < argc; i++) { if (i > 0) ss << " "; ss << argv[i]; } return ss.str(); } Severity mReportableSeverity; }; namespace { //! //! \brief produces a LogStreamConsumer object that can be used to log messages of severity kVERBOSE //! //! Example usage: //! //! LOG_VERBOSE(logger) << "hello world" << std::endl; //! inline LogStreamConsumer LOG_VERBOSE(const Logger& logger) { return LogStreamConsumer(logger.getReportableSeverity(), Severity::kVERBOSE); } //! //! \brief produces a LogStreamConsumer object that can be used to log messages of severity kINFO //! //! Example usage: //! //! LOG_INFO(logger) << "hello world" << std::endl; //! inline LogStreamConsumer LOG_INFO(const Logger& logger) { return LogStreamConsumer(logger.getReportableSeverity(), Severity::kINFO); } //! //! \brief produces a LogStreamConsumer object that can be used to log messages of severity kWARNING //! //! Example usage: //! //! LOG_WARN(logger) << "hello world" << std::endl; //! 
inline LogStreamConsumer LOG_WARN(const Logger& logger) { return LogStreamConsumer(logger.getReportableSeverity(), Severity::kWARNING); } //! //! \brief produces a LogStreamConsumer object that can be used to log messages of severity kERROR //! //! Example usage: //! //! LOG_ERROR(logger) << "hello world" << std::endl; //! inline LogStreamConsumer LOG_ERROR(const Logger& logger) { return LogStreamConsumer(logger.getReportableSeverity(), Severity::kERROR); } //! //! \brief produces a LogStreamConsumer object that can be used to log messages of severity kINTERNAL_ERROR // ("fatal" severity) //! //! Example usage: //! //! LOG_FATAL(logger) << "hello world" << std::endl; //! inline LogStreamConsumer LOG_FATAL(const Logger& logger) { return LogStreamConsumer(logger.getReportableSeverity(), Severity::kINTERNAL_ERROR); } } // anonymous namespace #endif // TENSORRT_LOGGING_H ================================================ FILE: yolo26/include/macros.h ================================================ #ifndef __MACROS_H #define __MACROS_H #include "NvInfer.h" #ifdef API_EXPORTS #if defined(_MSC_VER) #define API __declspec(dllexport) #else #define API __attribute__((visibility("default"))) #endif #else #if defined(_MSC_VER) #define API __declspec(dllimport) #else #define API #endif #endif // API_EXPORTS #if NV_TENSORRT_MAJOR >= 8 #define TRT_NOEXCEPT noexcept #define TRT_CONST_ENQUEUE const #else #define TRT_NOEXCEPT #define TRT_CONST_ENQUEUE #endif #endif // __MACROS_H ================================================ FILE: yolo26/include/model.h ================================================ #pragma once #include #include #include "NvInfer.h" nvinfer1::IHostMemory* buildEngineYolo26Det(nvinfer1::IBuilder* builder, nvinfer1::IBuilderConfig* config, nvinfer1::DataType dt, const std::string& wts_path, float& gd, float& gw, int& max_channels, std::string& type); nvinfer1::IHostMemory* buildEngineYolo26Obb(nvinfer1::IBuilder* builder, nvinfer1::IBuilderConfig* config, 
nvinfer1::DataType dt, const std::string& wts_path, float& gd, float& gw, int& max_channels, std::string& type); nvinfer1::IHostMemory* buildEngineYolo26Cls(nvinfer1::IBuilder* builder, nvinfer1::IBuilderConfig* config, nvinfer1::DataType dt, const std::string& wts_path, float& gd, float& gw, int& max_channels, std::string& type); ================================================ FILE: yolo26/include/postprocess.h ================================================ #pragma once #include #include "NvInfer.h" #include "types.h" // Preprocessing functions cv::Rect get_rect(cv::Mat& img, float bbox[4]); // NMS functions void decode(std::vector& res, float* output); void batch_decode(std::vector>& res_batch, float* output, int batch_size, int output_size); void decode_obb(std::vector& res, float* output); void batch_decode_obb(std::vector>& batch_res, float* output, int batch_size, int output_size); // Drawing functions void draw_bbox(std::vector& img_batch, std::vector>& res_batch); void draw_bbox_obb(std::vector& img_batch, std::vector>& res_batch); void draw_bbox_keypoints_line(std::vector& img_batch, std::vector>& res_batch); void draw_mask_bbox(cv::Mat& img, std::vector& dets, std::vector& masks, std::unordered_map& labels_map); ================================================ FILE: yolo26/include/preprocess.h ================================================ #pragma once #include #include #include "NvInfer.h" #include "types.h" void cuda_preprocess_init(int max_image_size); void cuda_preprocess_destroy(); void cuda_preprocess(uint8_t* src, int src_width, int src_height, float* dst, int dst_width, int dst_height, cudaStream_t stream); void cuda_batch_preprocess(std::vector& img_batch, float* dst, int dst_width, int dst_height, cudaStream_t stream); ================================================ FILE: yolo26/include/types.h ================================================ #pragma once #include "config.h" struct alignas(float) Detection { // center_x center_y w h float 
// List the entries of directory `p_dir_name`.
//
// Every entry name except "." and ".." is appended to `file_names` as a bare
// name (no directory prefix), in the order readdir() yields them.
//
// Returns 0 on success, or -1 if the directory cannot be opened (in which case
// `file_names` is left untouched).
static inline int read_files_in_dir(const char* p_dir_name, std::vector<std::string>& file_names) {
    DIR* p_dir = opendir(p_dir_name);
    if (p_dir == nullptr) {
        return -1;
    }

    for (struct dirent* entry = readdir(p_dir); entry != nullptr; entry = readdir(p_dir)) {
        const char* entry_name = entry->d_name;
        // Skip the self/parent pseudo-entries.
        if (strcmp(entry_name, ".") == 0 || strcmp(entry_name, "..") == 0) {
            continue;
        }
        file_names.push_back(std::string(entry_name));
    }

    closedir(p_dir);
    return 0;
}
// Return a copy of `str` with leading AND trailing whitespace removed
// (the name is historical and kept for existing callers).
//
// Whitespace means space, tab, CR, LF, vertical tab and form feed, so a label
// file saved with CRLF line endings does not leave a stray '\r' on each label.
// A string that is empty or consists only of whitespace yields an empty string.
static inline std::string trim_leading_whitespace(const std::string& str) {
    static const char* const kWhitespace = " \t\r\n\v\f";

    const size_t first = str.find_first_not_of(kWhitespace);
    if (first == std::string::npos) {
        // Nothing but whitespace: the trimmed result is empty.
        return std::string();
    }
    const size_t last = str.find_last_not_of(kWhitespace);
    return str.substr(first, last - first + 1);
}
namespace Tn {

// Append the raw bytes of `val` at `buffer` and advance `buffer` past them.
//
// The copy is done byte by byte rather than through a `T*` cast, so it is valid
// for any buffer position regardless of T's alignment requirement.
// T is expected to be a trivially copyable type (ints, bools, floats here).
template <typename T>
void write(char*& buffer, const T& val) {
    const char* src = reinterpret_cast<const char*>(&val);
    for (size_t i = 0; i < sizeof(T); ++i) {
        buffer[i] = src[i];
    }
    buffer += sizeof(T);
}

// Read sizeof(T) raw bytes from `buffer` into `val` and advance `buffer` past them.
// Counterpart of write(); the same alignment-independence applies.
template <typename T>
void read(const char*& buffer, T& val) {
    char* dst = reinterpret_cast<char*>(&val);
    for (size_t i = 0; i < sizeof(T); ++i) {
        dst[i] = buffer[i];
    }
    buffer += sizeof(T);
}

}  // namespace Tn
for (int i = 0; i < mStridesLength; ++i) { std::cout << mStrides[i] << " "; } std::cout << std::endl; */ } YoloLayerPlugin::~YoloLayerPlugin() {} YoloLayerPlugin::YoloLayerPlugin(const void* data, size_t length) { using namespace Tn; const char *d = reinterpret_cast(data), *a = d; read(d, mClassCount); read(d, mNumberOfPoints); read(d, mThreadCount); read(d, mMaxDetections); read(d, mIsDetection); read(d, mIsSegmentation); read(d, mIsPose); read(d, mIsObb); read(d, mAnchorCount); assert(d == a + length); } void YoloLayerPlugin::serialize(void* buffer) const TRT_NOEXCEPT { using namespace Tn; char *d = static_cast(buffer), *a = d; write(d, mClassCount); write(d, mNumberOfPoints); write(d, mThreadCount); write(d, mMaxDetections); write(d, mIsDetection); write(d, mIsSegmentation); write(d, mIsPose); write(d, mIsObb); write(d, mAnchorCount); assert(d == a + getSerializationSize()); } size_t YoloLayerPlugin::getSerializationSize() const TRT_NOEXCEPT { return sizeof(mClassCount) + sizeof(mNumberOfPoints) + sizeof(mThreadCount) + sizeof(mMaxDetections) + sizeof(mIsDetection) + sizeof(mIsSegmentation) + sizeof(mIsPose) + sizeof(mIsObb) + sizeof(mAnchorCount); } int YoloLayerPlugin::initialize() TRT_NOEXCEPT { return 0; } nvinfer1::Dims YoloLayerPlugin::getOutputDimensions(int index, const nvinfer1::Dims* inputs, int nbInputDims) TRT_NOEXCEPT { int total_size = mMaxDetections * sizeof(Detection) / sizeof(float); return nvinfer1::Dims3(total_size + 1, 1, 1); } void YoloLayerPlugin::setPluginNamespace(const char* pluginNamespace) TRT_NOEXCEPT { mPluginNamespace = pluginNamespace; } const char* YoloLayerPlugin::getPluginNamespace() const TRT_NOEXCEPT { return mPluginNamespace; } nvinfer1::DataType YoloLayerPlugin::getOutputDataType(int index, const nvinfer1::DataType* inputTypes, int nbInputs) const TRT_NOEXCEPT { return nvinfer1::DataType::kFLOAT; } bool YoloLayerPlugin::isOutputBroadcastAcrossBatch(int outputIndex, const bool* inputIsBroadcasted, int nbInputs) const 
TRT_NOEXCEPT { return false; } bool YoloLayerPlugin::canBroadcastInputAcrossBatch(int inputIndex) const TRT_NOEXCEPT { return false; } void YoloLayerPlugin::configurePlugin(nvinfer1::PluginTensorDesc const* in, int32_t nbInput, nvinfer1::PluginTensorDesc const* out, int32_t nbOutput) TRT_NOEXCEPT {} void YoloLayerPlugin::attachToContext(cudnnContext* cudnnContext, cublasContext* cublasContext, IGpuAllocator* gpuAllocator) TRT_NOEXCEPT {} void YoloLayerPlugin::detachFromContext() TRT_NOEXCEPT {} const char* YoloLayerPlugin::getPluginType() const TRT_NOEXCEPT { return "YoloLayer_TRT"; } const char* YoloLayerPlugin::getPluginVersion() const TRT_NOEXCEPT { return "1"; } void YoloLayerPlugin::destroy() TRT_NOEXCEPT { delete this; } nvinfer1::IPluginV2IOExt* YoloLayerPlugin::clone() const TRT_NOEXCEPT { YoloLayerPlugin* p = new YoloLayerPlugin(mClassCount, mNumberOfPoints, mMaxDetections, mIsDetection, mIsSegmentation, mIsPose, mIsObb, mAnchorCount); p->setPluginNamespace(mPluginNamespace); return p; } int YoloLayerPlugin::enqueue(int batchSize, const void* const* inputs, void* const* outputs, void* workspace, cudaStream_t stream) TRT_NOEXCEPT { gatherKernelLauncher(reinterpret_cast(inputs), reinterpret_cast(outputs[0]), stream, batchSize); return 0; } __device__ float Logist(float data) { return 1.f / (1.f + expf(-data)); } __global__ void gatherKernel(const float* input, float* output, int num_elements, int max_out_object, int class_count, int nk, int output_elem, bool is_detection, bool is_segmentation, bool is_pose, bool is_obb) { int idx = blockIdx.x * blockDim.x + threadIdx.x; if (idx >= num_elements) return; int outputIdx = 0 * output_elem; // TODO: ADD BATCH SUPPORT HERE int anchor_size = -1; float angle = 0.0f; if (is_detection) { anchor_size = 4 + class_count; } else if (is_obb) { anchor_size = 5 + class_count; angle = input[idx * (anchor_size) + 4 + class_count]; } float xmin = input[idx * (anchor_size) + 0]; float ymin = input[idx * (anchor_size) + 1]; float 
xmax = input[idx * (anchor_size) + 2]; float ymax = input[idx * (anchor_size) + 3]; float score = 0.0f; int class_id = -1; for (int c = 0; c < class_count; c++) { float conf = input[idx * (anchor_size) + 4 + c]; if (conf > score) { score = conf; class_id = c; } } if (score < d_confThreshold) { return; } int count = (int)atomicAdd(output + outputIdx, 1); if (count >= max_out_object) { return; } int det_size = sizeof(Detection) / sizeof(float); Detection* det = (Detection*)(output + outputIdx + 1 + count * det_size); /* float scale = fminf(640.0f / 1080.0f, 640.0f / 608.0f); // TODO: GET FROM PARAMETERS WITH SCALE! float offset_x = -scale * 1080.0f / 2.0f + 640.0f / 2.0f; // TODO: GET FROM PARAMETERS WITH OFFSET! float offset_y = -scale * 608.0f / 2.0f + 640.0f / 2.0f; // TODO: GET FROM PARAMETERS WITH OFFSET! det->conf = score; det->class_id = 1; // TODO: ADD CLASS ID HERE det->bbox[0] = (xmin - offset_x) / scale; det->bbox[1] = (ymin - offset_y) / scale; det->bbox[2] = (xmax - offset_x) / scale; det->bbox[3] = (ymax - offset_y) / scale; */ det->conf = score; det->class_id = class_id; det->bbox[0] = xmin; det->bbox[1] = ymin; det->bbox[2] = xmax; det->bbox[3] = ymax; if (is_obb) { det->angle = angle; } // TODO: ADD KEYPOINTS, SEGMENTATION, OBB HERE } void YoloLayerPlugin::gatherKernelLauncher(const float* const* inputs, float* outputs, cudaStream_t stream, int batchSize) { // TODO: ADD BATCH SUPPORT, CURRENTLY ONLY BATCH=1 IS SUPPORTED // TODO: ADD SEGMENTATION, POSE, OBB SUPPORT // TODO: num_elem = batch_size * anchor_num const float* input = inputs[0]; int outputElem = mMaxDetections * sizeof(Detection) / sizeof(float) + 1; int num_elem = mAnchorCount; // Use anchor count from model configuration dim3 blockSize(mThreadCount); dim3 gridSize((num_elem + mThreadCount - 1) / mThreadCount); cudaMemsetAsync(outputs, 0, batchSize * outputElem * sizeof(float), stream); // TODO: adjust for batch size gatherKernel<<>>(input, outputs, num_elem, mMaxDetections, mClassCount, 
mNumberOfPoints, outputElem, mIsDetection, mIsSegmentation, mIsPose, mIsObb); } PluginFieldCollection YoloLayerPluginCreator::mFC{}; std::vector YoloLayerPluginCreator::mPluginAttributes; YoloLayerPluginCreator::YoloLayerPluginCreator() { mPluginAttributes.clear(); mFC.nbFields = mPluginAttributes.size(); mFC.fields = mPluginAttributes.data(); } const char* YoloLayerPluginCreator::getPluginName() const TRT_NOEXCEPT { return "YoloLayer_TRT"; } const char* YoloLayerPluginCreator::getPluginVersion() const TRT_NOEXCEPT { return "1"; } const PluginFieldCollection* YoloLayerPluginCreator::getFieldNames() TRT_NOEXCEPT { return &mFC; } IPluginV2IOExt* YoloLayerPluginCreator::createPlugin(const char* name, const PluginFieldCollection* fc) TRT_NOEXCEPT { assert(fc->nbFields == 1); assert(strcmp(fc->fields[0].name, "combinedInfo") == 0); const int* combinedInfo = static_cast(fc->fields[0].data); int net_info_count = fc->fields[0].length; int class_count = combinedInfo[0]; int number_of_points = combinedInfo[1]; int max_detections = combinedInfo[2]; bool is_detection = combinedInfo[3]; bool is_segmentation = combinedInfo[4]; bool is_pose = combinedInfo[5]; bool is_obb = combinedInfo[6]; int anchor_count = combinedInfo[7]; YoloLayerPlugin* plugin = new YoloLayerPlugin(class_count, number_of_points, max_detections, is_detection, is_segmentation, is_pose, is_obb, anchor_count); plugin->setPluginNamespace(mNamespace.c_str()); return plugin; } IPluginV2IOExt* YoloLayerPluginCreator::deserializePlugin(const char* name, const void* serialData, size_t serialLength) TRT_NOEXCEPT { YoloLayerPlugin* plugin = new YoloLayerPlugin(serialData, serialLength); plugin->setPluginNamespace(mNamespace.c_str()); return plugin; } } // namespace nvinfer1 ================================================ FILE: yolo26/plugin/yololayer.h ================================================ #pragma once #include #include #include "NvInfer.h" #include "macros.h" namespace nvinfer1 { void 
setPluginDeviceParams(float confThreshold); class API YoloLayerPlugin : public IPluginV2IOExt { public: YoloLayerPlugin(int classCount, int numberOfPoints, int maxDetections, bool isDetection, bool isSegmentation, bool isPose, bool isObb, int anchor_count); YoloLayerPlugin(const void* data, size_t length); ~YoloLayerPlugin(); int getNbOutputs() const TRT_NOEXCEPT override { return 1; } nvinfer1::Dims getOutputDimensions(int index, const nvinfer1::Dims* inputs, int nbInputDims) TRT_NOEXCEPT override; int initialize() TRT_NOEXCEPT override; virtual void terminate() TRT_NOEXCEPT override {} virtual size_t getWorkspaceSize(int maxBatchSize) const TRT_NOEXCEPT override { return 0; } virtual int enqueue(int batchSize, const void* const* inputs, void* const* outputs, void* workspace, cudaStream_t stream) TRT_NOEXCEPT override; virtual size_t getSerializationSize() const TRT_NOEXCEPT override; virtual void serialize(void* buffer) const TRT_NOEXCEPT override; bool supportsFormatCombination(int pos, const PluginTensorDesc* inOut, int nbInputs, int nbOutputs) const TRT_NOEXCEPT override { return inOut[pos].type == nvinfer1::DataType::kFLOAT && inOut[pos].format == nvinfer1::TensorFormat::kLINEAR; } const char* getPluginType() const TRT_NOEXCEPT override; const char* getPluginVersion() const TRT_NOEXCEPT override; void destroy() TRT_NOEXCEPT override; IPluginV2IOExt* clone() const TRT_NOEXCEPT override; void setPluginNamespace(const char* pluginNamespace) TRT_NOEXCEPT override; const char* getPluginNamespace() const TRT_NOEXCEPT override; nvinfer1::DataType getOutputDataType(int32_t index, nvinfer1::DataType const* inputTypes, int32_t nbInputs) const TRT_NOEXCEPT override; bool isOutputBroadcastAcrossBatch(int outputIndex, const bool* inputIsBroadcasted, int nbInputs) const TRT_NOEXCEPT override; bool canBroadcastInputAcrossBatch(int inputIndex) const TRT_NOEXCEPT override; void attachToContext(cudnnContext* cudnnContext, cublasContext* cublasContext, IGpuAllocator* 
gpuAllocator) TRT_NOEXCEPT override; void configurePlugin(PluginTensorDesc const* in, int32_t nbInput, PluginTensorDesc const* out, int32_t nbOutput) TRT_NOEXCEPT override; void detachFromContext() TRT_NOEXCEPT override; private: void gatherKernelLauncher(const float* const* inputs, float* outputs, cudaStream_t stream, int batchSize); int mThreadCount = 256; const char* mPluginNamespace = ""; int mClassCount; int mNumberOfPoints; int mMaxDetections; bool mIsDetection; bool mIsSegmentation; bool mIsPose; bool mIsObb; int mAnchorCount; }; class API YoloLayerPluginCreator : public IPluginCreator { public: YoloLayerPluginCreator(); const char* getPluginName() const TRT_NOEXCEPT override; const char* getPluginVersion() const TRT_NOEXCEPT override; const PluginFieldCollection* getFieldNames() TRT_NOEXCEPT override; IPluginV2IOExt* createPlugin(const char* name, const PluginFieldCollection* fc) TRT_NOEXCEPT override; IPluginV2IOExt* deserializePlugin(const char* name, const void* serialData, size_t serialLength) TRT_NOEXCEPT override; void setPluginNamespace(const char* pluginNamespace) TRT_NOEXCEPT override { mNamespace = pluginNamespace; } const char* getPluginNamespace() const TRT_NOEXCEPT override { return mNamespace.c_str(); } private: std::string mNamespace; static PluginFieldCollection mFC; static std::vector mPluginAttributes; }; REGISTER_TENSORRT_PLUGIN(YoloLayerPluginCreator); } // namespace nvinfer1 ================================================ FILE: yolo26/src/block.cpp ================================================ #include "block.h" #include #include #include #include #include "config.h" #include "model.h" #include "yololayer.h" std::map loadWeights(const std::string file) { std::cout << "Loading weights: " << file << std::endl; std::map WeightMap; std::ifstream input(file); assert(input.is_open() && "Unable to load weight file. 
please check if the .wts file path is right!!!!!!"); int32_t count; input >> count; assert(count > 0 && "Invalid weight map file."); while (count--) { nvinfer1::Weights wt{nvinfer1::DataType::kFLOAT, nullptr, 0}; uint32_t size; std::string name; input >> name >> std::dec >> size; wt.type = nvinfer1::DataType::kFLOAT; //uint32_t* val = reinterpret_cast(malloc(sizeof(val) * size)); uint32_t* val = reinterpret_cast(malloc(sizeof(uint32_t) * size)); for (uint32_t x = 0, y = size; x < y; x++) { input >> std::hex >> val[x]; } wt.values = val; wt.count = size; WeightMap[name] = wt; } return WeightMap; } nvinfer1::IScaleLayer* addBatchNorm2d(nvinfer1::INetworkDefinition* network, std::map weightMap, nvinfer1::ITensor& input, std::string lname, float eps) { float* gamma = (float*)weightMap[lname + ".weight"].values; float* beta = (float*)weightMap[lname + ".bias"].values; float* mean = (float*)weightMap[lname + ".running_mean"].values; float* var = (float*)weightMap[lname + ".running_var"].values; int len = weightMap[lname + ".running_var"].count; float* scval = reinterpret_cast(malloc(sizeof(float) * len)); for (int i = 0; i < len; i++) { scval[i] = gamma[i] / sqrt(var[i] + eps); } nvinfer1::Weights scale{nvinfer1::DataType::kFLOAT, scval, len}; float* shval = reinterpret_cast(malloc(sizeof(float) * len)); for (int i = 0; i < len; i++) { shval[i] = beta[i] - mean[i] * gamma[i] / sqrt(var[i] + eps); } nvinfer1::Weights shift{nvinfer1::DataType::kFLOAT, shval, len}; float* pval = reinterpret_cast(malloc(sizeof(float) * len)); for (int i = 0; i < len; i++) { pval[i] = 1.0; } nvinfer1::Weights power{nvinfer1::DataType::kFLOAT, pval, len}; weightMap[lname + ".scale"] = scale; weightMap[lname + ".shift"] = shift; weightMap[lname + ".power"] = power; nvinfer1::IScaleLayer* output = network->addScale(input, nvinfer1::ScaleMode::kCHANNEL, shift, scale, power); assert(output); output->setName(lname.c_str()); return output; } nvinfer1::IElementWiseLayer* 
// Add a bias-free convolution followed by batch norm and, optionally, SiLU.
//
//   network   network being built
//   weightMap weights keyed by name; reads "<lname>.conv.weight" and the
//             "<lname>.bn.*" entries consumed by addBatchNorm2d
//   input     input tensor
//   ch        number of output channels
//   k         kernel size as {kh, kw}; padding is k/2 per dimension
//   s         stride, applied to both dimensions
//   lname     weight-name prefix, also used to name the created layers
//   g         number of convolution groups
//   act       when false the batch-norm layer is returned; when true SiLU is
//             appended as x * sigmoid(x) and the product layer is returned
nvinfer1::ILayer* conv(nvinfer1::INetworkDefinition* network, std::map<std::string, nvinfer1::Weights> weightMap,
                       nvinfer1::ITensor& input, int ch, std::vector<int> k, int s, std::string lname, int g,
                       bool act) {
    assert(k.size() >= 2 && "kernel size must be given as {kh, kw}");

    nvinfer1::Weights bias_empty{nvinfer1::DataType::kFLOAT, nullptr, 0};
    // Named convLayer so the local does not shadow this function.
    nvinfer1::IConvolutionLayer* convLayer = network->addConvolutionNd(
            input, ch, nvinfer1::DimsHW{k[0], k[1]}, weightMap[lname + ".conv.weight"], bias_empty);
    assert(convLayer);
    convLayer->setStrideNd(nvinfer1::DimsHW{s, s});
    // auto pad
    const int p0 = k[0] / 2;
    const int p1 = k[1] / 2;
    convLayer->setPaddingNd(nvinfer1::DimsHW{p0, p1});
    convLayer->setNbGroups(g);
    convLayer->setName((lname + "/conv/Conv").c_str());

    nvinfer1::IScaleLayer* bn = addBatchNorm2d(network, weightMap, *convLayer->getOutput(0), lname + ".bn", 1e-3);
    if (!act)
        return bn;

    // SiLU(x) = x * sigmoid(x)
    nvinfer1::IActivationLayer* sigmoid =
            network->addActivation(*bn->getOutput(0), nvinfer1::ActivationType::kSIGMOID);
    assert(sigmoid);
    sigmoid->setName((lname + "/act/Sigmoid").c_str());

    nvinfer1::IElementWiseLayer* ew = network->addElementWise(*bn->getOutput(0), *sigmoid->getOutput(0),
                                                              nvinfer1::ElementWiseOperation::kPROD);
    assert(ew);
    ew->setName((lname + "/act/Mul").c_str());
    return ew;
}
qkv = convBn(network, weightMap, input, h, 1, 1, lname + ".qkv"); // qkv.view(B, self.num_heads, -1, N) auto shuffle = network->addShuffle(*qkv->getOutput(0)); shuffle->setReshapeDimensions(nvinfer1::Dims4{B, num_heads, -1, N}); // q, k, v = .split([self.key_dim, self.key_dim, self.head_dim], dim=2) auto d1 = shuffle->getOutput(0)->getDimensions(); auto q = network->addSlice(*shuffle->getOutput(0), nvinfer1::Dims4{0, 0, 0, 0}, nvinfer1::Dims4{d1.d[0], d1.d[1], key_dim, d1.d[3]}, nvinfer1::Dims4{1, 1, 1, 1}); auto k = network->addSlice(*shuffle->getOutput(0), nvinfer1::Dims4{0, 0, key_dim, 0}, nvinfer1::Dims4{d1.d[0], d1.d[1], key_dim, d1.d[3]}, nvinfer1::Dims4{1, 1, 1, 1}); auto v = network->addSlice(*shuffle->getOutput(0), nvinfer1::Dims4{0, 0, key_dim * 2, 0}, nvinfer1::Dims4{d1.d[0], d1.d[1], head_dim, d1.d[3]}, nvinfer1::Dims4{1, 1, 1, 1}); // attn = ((q.transpose(-2, -1) @ k) * self.scale) auto qT = network->addShuffle(*q->getOutput(0)); qT->setFirstTranspose(nvinfer1::Permutation{0, 1, 3, 2}); auto matmul = network->addMatrixMultiply(*qT->getOutput(0), nvinfer1::MatrixOperation::kNONE, *k->getOutput(0), nvinfer1::MatrixOperation::kNONE); // There are not many memory leaks, and I will change it when I have time float* scale_val = reinterpret_cast(malloc(sizeof(float) * 1)); scale_val[0] = scale; nvinfer1::Weights s_w{nvinfer1::DataType::kFLOAT, scale_val, 1}; float* shift_val = reinterpret_cast(malloc(sizeof(float) * 1)); shift_val[0] = 0; nvinfer1::Weights sh_w{nvinfer1::DataType::kFLOAT, shift_val, 1}; float* power_val = reinterpret_cast(malloc(sizeof(float) * 1)); power_val[0] = 1; nvinfer1::Weights p_w{nvinfer1::DataType::kFLOAT, power_val, 1}; nvinfer1::IScaleLayer* scaleLayer = network->addScale(*matmul->getOutput(0), nvinfer1::ScaleMode::kUNIFORM, sh_w, s_w, p_w); // attn = attn.softmax(dim=-1) nvinfer1::ISoftMaxLayer* softmax = network->addSoftMax(*scaleLayer->getOutput(0)); softmax->setAxes(1 << 3); // x = (v @ attn.transpose(-2, -1)).view(B, -1, H, 
W) + self.pe(v.reshape(B, -1, H, W)) auto attnT = network->addShuffle(*softmax->getOutput(0)); attnT->setFirstTranspose(nvinfer1::Permutation{0, 1, 3, 2}); auto matmul2 = network->addMatrixMultiply(*v->getOutput(0), nvinfer1::MatrixOperation::kNONE, *attnT->getOutput(0), nvinfer1::MatrixOperation::kNONE); auto reshape = network->addShuffle(*matmul2->getOutput(0)); reshape->setReshapeDimensions(nvinfer1::Dims4{B, -1, H, W}); auto v_reshape = network->addShuffle(*v->getOutput(0)); v_reshape->setReshapeDimensions(nvinfer1::Dims4{B, -1, H, W}); // self.pe = Conv(dim, dim, 3, 1, g=dim, act=False) auto pe = convBn(network, weightMap, *v_reshape->getOutput(0), dim, 3, 1, lname + ".pe", dim); auto sum = network->addElementWise(*reshape->getOutput(0), *pe->getOutput(0), nvinfer1::ElementWiseOperation::kSUM); // x = self.proj(x) // self.proj = Conv(dim, dim, 1, act=False) auto proj = convBn(network, weightMap, *sum->getOutput(0), dim, 1, 1, lname + ".proj"); return proj; } static nvinfer1::ILayer* PSABlock(nvinfer1::INetworkDefinition* network, std::map weightMap, nvinfer1::ITensor& input, int dim, float attn_ratio, int num_heads, bool shortcut, std::string lname) { auto attn = Attention(network, weightMap, input, dim, num_heads, attn_ratio, lname + ".attn"); nvinfer1::ILayer* shortcut_layer = nullptr; if (shortcut) { shortcut_layer = network->addElementWise(input, *attn->getOutput(0), nvinfer1::ElementWiseOperation::kSUM); } else { shortcut_layer = attn; } // self.ffn = nn.Sequential(Conv(c, c * 2, 1), Conv(c * 2, c, 1, act=False)) // x = x + self.ffn(x) if self.add else self.ffn(x) auto ffn0 = convBnSiLU(network, weightMap, *shortcut_layer->getOutput(0), dim * 2, {1, 1}, 1, lname + ".ffn.0"); auto ffn1 = convBn(network, weightMap, *ffn0->getOutput(0), dim, 1, 1, lname + ".ffn.1"); if (shortcut) { return network->addElementWise(*shortcut_layer->getOutput(0), *ffn1->getOutput(0), nvinfer1::ElementWiseOperation::kSUM); } else { return ffn1; } } static nvinfer1::ILayer* 
C3k(nvinfer1::INetworkDefinition* network, std::map weightMap, nvinfer1::ITensor& input, int c1, int c2, int n,
    bool shortcut, std::vector k1, std::vector k2, float e, std::string lname) {
    // CSP-style block: ".cv1" feeds a chain of n bottlenecks, ".cv2" is a parallel 1x1 branch, and ".cv3"
    // fuses the concatenation of both back to c2 channels. Both branches are int(c2 * e) channels wide.
    const int hidden = (int)((float)c2 * e);
    nvinfer1::IElementWiseLayer* chain_head = convBnSiLU(network, weightMap, input, hidden, {1, 1}, 1, lname + ".cv1");
    nvinfer1::IElementWiseLayer* side = convBnSiLU(network, weightMap, input, hidden, {1, 1}, 1, lname + ".cv2");

    // Thread the running tensor through the bottlenecks ".m.0" ... ".m.<n-1>".
    nvinfer1::ITensor* running = chain_head->getOutput(0);
    int idx = 0;
    while (idx < n) {
        const std::string stage_name = lname + ".m." + std::to_string(idx);
        nvinfer1::ILayer* stage =
                bottleneck(network, weightMap, *running, hidden, hidden, shortcut, k1, k2, 1.0, stage_name);
        running = stage->getOutput(0);
        ++idx;
    }

    nvinfer1::ITensor* fused_inputs[] = {running, side->getOutput(0)};
    nvinfer1::IConcatenationLayer* fused = network->addConcatenation(fused_inputs, 2);
    fused->setName((lname + ".cat").c_str());

    return convBnSiLU(network, weightMap, *fused->getOutput(0), c2, {1, 1}, 1, lname + ".cv3");
}

// C3K2: ".cv1" widens the input to 2 * int(c2 * e) channels, which are sliced into two halves along
// dimension 1. The second half is pushed through n stages; every stage output is appended to a growing
// concatenation that starts as both halves. ".cv2" maps the final concatenation to c2 channels.
// Each stage ".m.<i>" is chosen by the flags:
//   attn : bottleneck ".0" followed by PSABlock ".1" (heads = max(1, hidden / 64))
//   c3k  : C3k with two inner bottlenecks
//   else : a single bottleneck
nvinfer1::IElementWiseLayer* C3K2(nvinfer1::INetworkDefinition* network, std::map& weightMap,
                                  nvinfer1::ITensor& input, int c1, int c2, int n, bool c3k, bool shortcut, bool attn,
                                  float e, std::string lname) {
    const int hidden = (int)((float)c2 * e);

    nvinfer1::IElementWiseLayer* stem = convBnSiLU(network, weightMap, input, 2 * hidden, {1, 1}, 1, lname + ".cv1");
    nvinfer1::ITensor* stem_out = stem->getOutput(0);
    const nvinfer1::Dims shape = stem_out->getDimensions();
    const auto half = shape.d[1] / 2;

    nvinfer1::ISliceLayer* first_half =
            network->addSlice(*stem_out, nvinfer1::Dims4{0, 0, 0, 0},
                              nvinfer1::Dims4{shape.d[0], half, shape.d[2], shape.d[3]}, nvinfer1::Dims4{1, 1, 1, 1});
    first_half->setName((lname + ".split1").c_str());
    nvinfer1::ISliceLayer* second_half =
            network->addSlice(*stem_out, nvinfer1::Dims4{0, half, 0, 0},
                              nvinfer1::Dims4{shape.d[0], half, shape.d[2], shape.d[3]}, nvinfer1::Dims4{1, 1, 1, 1});
    second_half->setName((lname + ".split2").c_str());

    nvinfer1::ITensor* halves[] = {first_half->getOutput(0), second_half->getOutput(0)};
    nvinfer1::IConcatenationLayer* cat = network->addConcatenation(halves, 2);
    cat->setName((lname + ".cat0").c_str());

    nvinfer1::ITensor* running = second_half->getOutput(0);
    for (int stage_idx = 0; stage_idx < n; ++stage_idx) {
        const std::string stage_name = lname + ".m." + std::to_string(stage_idx);

        nvinfer1::ILayer* stage = nullptr;
        if (attn) {
            nvinfer1::ILayer* pre =
                    bottleneck(network, weightMap, *running, hidden, hidden, shortcut, {3, 3}, {3, 3}, 0.5,
                               stage_name + ".0");
            stage = PSABlock(network, weightMap, *pre->getOutput(0), hidden, 0.5, max(1, hidden / 64), shortcut,
                             stage_name + ".1");
        } else if (c3k) {
            stage = C3k(network, weightMap, *running, hidden, hidden, 2, shortcut, {3, 3}, {3, 3}, 0.5, stage_name);
        } else {
            stage = bottleneck(network, weightMap, *running, hidden, hidden, shortcut, {3, 3}, {3, 3}, 0.5,
                               stage_name);
        }
        running = stage->getOutput(0);

        // Grow the concatenation: everything gathered so far plus this stage's output.
        nvinfer1::ITensor* grown[] = {cat->getOutput(0), stage->getOutput(0)};
        cat = network->addConcatenation(grown, 2);
        cat->setName((lname + ".cat" + std::to_string(stage_idx + 1)).c_str());
    }

    return convBnSiLU(network, weightMap, *cat->getOutput(0), c2, {1, 1}, 1, lname + ".cv2");
}

nvinfer1::IElementWiseLayer* SPPF(nvinfer1::INetworkDefinition* network, std::map weightMap,
                                  nvinfer1::ITensor& input, int c1, int c2, int k, bool shortcut, std::string lname) {
    int c_ = c1 / 2;
    nvinfer1::ILayer* conv1 = conv(network, weightMap, input, c_, {1, 1}, 1, lname + ".cv1", 1, false);
    nvinfer1::IPoolingLayer* pool1 =
            network->addPoolingNd(*conv1->getOutput(0), nvinfer1::PoolingType::kMAX, nvinfer1::DimsHW{k, k});
    pool1->setStrideNd(nvinfer1::DimsHW{1, 1});
    pool1->setPaddingNd(nvinfer1::DimsHW{k / 2, k / 2});
    nvinfer1::IPoolingLayer* pool2 =
            network->addPoolingNd(*pool1->getOutput(0), nvinfer1::PoolingType::kMAX, nvinfer1::DimsHW{k, k});
    pool2->setStrideNd(nvinfer1::DimsHW{1, 1});
    pool2->setPaddingNd(nvinfer1::DimsHW{k / 2, k / 2});
    nvinfer1::IPoolingLayer* pool3 =
            network->addPoolingNd(*pool2->getOutput(0), nvinfer1::PoolingType::kMAX, nvinfer1::DimsHW{k, k});
    pool3->setStrideNd(nvinfer1::DimsHW{1, 1});
    pool3->setPaddingNd(nvinfer1::DimsHW{k / 2, k / 2});
    nvinfer1::ITensor* inputTensors[] = {conv1->getOutput(0), pool1->getOutput(0), pool2->getOutput(0),
pool3->getOutput(0)}; nvinfer1::IConcatenationLayer* cat = network->addConcatenation(inputTensors, 4); nvinfer1::IElementWiseLayer* conv2 = convBnSiLU(network, weightMap, *cat->getOutput(0), c2, {1, 1}, 1, lname + ".cv2"); if (shortcut && (c1 == c2)) { nvinfer1::IElementWiseLayer* sum = network->addElementWise(input, *conv2->getOutput(0), nvinfer1::ElementWiseOperation::kSUM); return sum; } else { return conv2; } } nvinfer1::IElementWiseLayer* C2PSA(nvinfer1::INetworkDefinition* network, std::map& weightMap, nvinfer1::ITensor& input, int c1, int c2, int n, float e, std::string lname) { int c = c2 * e; // cv1 branch nvinfer1::IElementWiseLayer* conv1 = convBnSiLU(network, weightMap, input, 2 * c, {1, 1}, 1, lname + ".cv1"); nvinfer1::ITensor* cv1_out = conv1->getOutput(0); // Split the output of cv1 into two tensors nvinfer1::Dims dims = cv1_out->getDimensions(); nvinfer1::ISliceLayer* split1 = network->addSlice(*cv1_out, nvinfer1::Dims4{0, 0, 0, 0}, nvinfer1::Dims4{dims.d[0], dims.d[1] / 2, dims.d[2], dims.d[3]}, nvinfer1::Dims4{1, 1, 1, 1}); nvinfer1::ISliceLayer* split2 = network->addSlice(*cv1_out, nvinfer1::Dims4{0, dims.d[1] / 2, 0, 0}, nvinfer1::Dims4{dims.d[0], dims.d[1] / 2, dims.d[2], dims.d[3]}, nvinfer1::Dims4{1, 1, 1, 1}); // Create y1 bottleneck sequence nvinfer1::ITensor* y = split2->getOutput(0); for (int i = 0; i < n; ++i) { auto* bottleneck_layer = PSABlock(network, weightMap, *y, c, 0.5, c / 64, true, lname + ".m." 
+ std::to_string(i)); y = bottleneck_layer->getOutput(0); // update 'y1' to be the output of the current bottleneck } // Concatenate y1 with the second split of cv1 nvinfer1::ITensor* concatInputs[2] = {split1->getOutput(0), y}; nvinfer1::IConcatenationLayer* cat = network->addConcatenation(concatInputs, 2); // cv2 to produce the final output nvinfer1::IElementWiseLayer* conv2 = convBnSiLU(network, weightMap, *cat->getOutput(0), c1, {1, 1}, 1, lname + ".cv2"); return conv2; } nvinfer1::ILayer* DWConv(nvinfer1::INetworkDefinition* network, std::map weightMap, nvinfer1::ITensor& input, int ch, std::vector k, int s, std::string lname) { nvinfer1::Weights bias_empty{nvinfer1::DataType::kFLOAT, nullptr, 0}; nvinfer1::IConvolutionLayer* conv = network->addConvolutionNd(input, ch, nvinfer1::DimsHW{k[0], k[1]}, weightMap[lname + ".conv.weight"], bias_empty); assert(conv); conv->setStrideNd(nvinfer1::DimsHW{s, s}); conv->setNbGroups(ch); // auto pad int p0 = k[0] / 2; int p1 = k[1] / 2; conv->setPaddingNd(nvinfer1::DimsHW{p0, p1}); nvinfer1::IScaleLayer* bn = addBatchNorm2d(network, weightMap, *conv->getOutput(0), lname + ".bn", 1e-3); nvinfer1::IActivationLayer* sigmoid = network->addActivation(*bn->getOutput(0), nvinfer1::ActivationType::kSIGMOID); nvinfer1::IElementWiseLayer* ew = network->addElementWise(*bn->getOutput(0), *sigmoid->getOutput(0), nvinfer1::ElementWiseOperation::kPROD); assert(ew); return ew; } nvinfer1::IPluginV2Layer* addYoloLayer(nvinfer1::INetworkDefinition* network, nvinfer1::ITensor& input, const std::vector& strides, const std::vector& fm_sizes, int stridesLength, bool is_detection, bool is_segmentation, bool is_pose, bool is_obb, int anchorCount) { auto creator = getPluginRegistry()->getPluginCreator("YoloLayer_TRT", "1"); const int netinfo_count = 8; const int total_count = netinfo_count + stridesLength; int class_num = kNumClass; if (is_pose) { class_num = kPoseNumClass; } if (is_obb) { class_num = kObbNumClass; } std::vector 
combinedInfo(total_count); combinedInfo[0] = class_num; combinedInfo[1] = kNumberOfPoints; combinedInfo[2] = kMaxNumOutputBbox; combinedInfo[3] = is_detection; combinedInfo[4] = is_segmentation; combinedInfo[5] = is_pose; combinedInfo[6] = is_obb; combinedInfo[7] = anchorCount; nvinfer1::PluginField pluginField; pluginField.name = "combinedInfo"; pluginField.data = combinedInfo.data(); pluginField.type = nvinfer1::PluginFieldType::kINT32; pluginField.length = combinedInfo.size(); nvinfer1::PluginFieldCollection pluginFieldCollection; pluginFieldCollection.nbFields = 1; pluginFieldCollection.fields = &pluginField; nvinfer1::IPluginV2* pluginObject = creator->createPlugin("yololayer", &pluginFieldCollection); // Use the single input tensor instead of multiple detection heads nvinfer1::ITensor* inputTensors[] = {&input}; nvinfer1::IPluginV2Layer* yololayer = network->addPluginV2(inputTensors, 1, *pluginObject); return yololayer; } ================================================ FILE: yolo26/src/model.cpp ================================================ #include #include #include "block.h" // #include "calibrator.h" #include "config.h" #include "model.h" static int get_width(int x, float gw, int max_channels, int divisor = 8) { auto channel = std::min(x, max_channels); channel = int(ceil((channel * gw) / divisor)) * divisor; return channel; } static int get_depth(int x, float gd) { if (x == 1) return 1; int r = round(x * gd); if (x * gd - int(x * gd) == 0.5 && (int(x * gd) % 2) == 0) --r; return std::max(r, 1); } void calculateStrides(nvinfer1::IElementWiseLayer* conv_layers[], int size, int reference_size, int strides[]) { for (int i = 0; i < size; ++i) { nvinfer1::ILayer* layer = conv_layers[i]; nvinfer1::Dims dims = layer->getOutput(0)->getDimensions(); int feature_map_size = dims.d[2]; strides[i] = reference_size / feature_map_size; } } nvinfer1::IHostMemory* buildEngineYolo26Det(nvinfer1::IBuilder* builder, nvinfer1::IBuilderConfig* config, nvinfer1::DataType dt, 
const std::string& wts_path, float& gd, float& gw, int& max_channels, std::string& type) { std::map weightMap = loadWeights(wts_path); nvinfer1::INetworkDefinition* network = builder->createNetworkV2( 1U << static_cast(nvinfer1::NetworkDefinitionCreationFlag::kEXPLICIT_BATCH)); /******************************************************************************************************* ****************************************** YOLO26 INPUT ********************************************** *******************************************************************************************************/ nvinfer1::ITensor* data = network->addInput(kInputTensorName, dt, nvinfer1::Dims4{kBatchSize, 3, kInputH, kInputW}); assert(data); /******************************************************************************************************* ***************************************** YOLO26 BACKBONE ******************************************** *******************************************************************************************************/ nvinfer1::IElementWiseLayer* block0 = convBnSiLU(network, weightMap, *data, get_width(64, gw, max_channels), {3, 3}, 2, "model.0"); nvinfer1::IElementWiseLayer* block1 = convBnSiLU(network, weightMap, *block0->getOutput(0), get_width(128, gw, max_channels), {3, 3}, 2, "model.1"); bool c3k = false; if (type == "m" || type == "l" || type == "x") { c3k = true; } nvinfer1::IElementWiseLayer* conv2 = C3K2(network, weightMap, *block1->getOutput(0), get_width(128, gw, max_channels), get_width(256, gw, max_channels), get_depth(2, gd), c3k, true, false, 0.25, "model.2"); nvinfer1::IElementWiseLayer* block3 = convBnSiLU(network, weightMap, *conv2->getOutput(0), get_width(256, gw, max_channels), {3, 3}, 2, "model.3"); nvinfer1::IElementWiseLayer* block4 = C3K2(network, weightMap, *block3->getOutput(0), get_width(256, gw, max_channels), get_width(512, gw, max_channels), get_depth(2, gd), c3k, true, false, 0.25, "model.4"); nvinfer1::IElementWiseLayer* block5 = 
convBnSiLU(network, weightMap, *block4->getOutput(0), get_width(512, gw, max_channels), {3, 3}, 2, "model.5"); nvinfer1::IElementWiseLayer* block6 = C3K2(network, weightMap, *block5->getOutput(0), get_width(512, gw, max_channels), get_width(512, gw, max_channels), get_depth(2, gd), true, true, false, 0.5, "model.6"); nvinfer1::IElementWiseLayer* block7 = convBnSiLU(network, weightMap, *block6->getOutput(0), get_width(1024, gw, max_channels), {3, 3}, 2, "model.7"); nvinfer1::IElementWiseLayer* block8 = C3K2(network, weightMap, *block7->getOutput(0), get_width(1024, gw, max_channels), get_width(1024, gw, max_channels), get_depth(2, gd), true, true, false, 0.5, "model.8"); nvinfer1::IElementWiseLayer* block9 = SPPF(network, weightMap, *block8->getOutput(0), get_width(1024, gw, max_channels), get_width(1024, gw, max_channels), 5, true, "model.9"); nvinfer1::IElementWiseLayer* block10 = C2PSA(network, weightMap, *block9->getOutput(0), get_width(1024, gw, max_channels), get_width(1024, gw, max_channels), get_depth(2, gd), 0.5, "model.10"); /******************************************************************************************************* ********************************************* YOLO26 HEAD ******************************************** *******************************************************************************************************/ float scale[] = {1.0, 1.0, 2.0, 2.0}; nvinfer1::IResizeLayer* upsample11 = network->addResize(*block10->getOutput(0)); assert(upsample11); upsample11->setResizeMode(nvinfer1::ResizeMode::kNEAREST); upsample11->setScales(scale, 4); nvinfer1::ITensor* inputTensors12[] = {upsample11->getOutput(0), block6->getOutput(0)}; nvinfer1::IConcatenationLayer* cat12 = network->addConcatenation(inputTensors12, 2); nvinfer1::IElementWiseLayer* block13 = C3K2(network, weightMap, *cat12->getOutput(0), get_width(1024, gw, max_channels), get_width(512, gw, max_channels), get_depth(2, gd), true, true, false, 0.5, "model.13"); 
nvinfer1::IResizeLayer* upsample14 = network->addResize(*block13->getOutput(0)); assert(upsample14); upsample14->setResizeMode(nvinfer1::ResizeMode::kNEAREST); upsample14->setScales(scale, 4); nvinfer1::ITensor* inputTensors15[] = {upsample14->getOutput(0), block4->getOutput(0)}; nvinfer1::IConcatenationLayer* cat15 = network->addConcatenation(inputTensors15, 2); nvinfer1::IElementWiseLayer* block16 = C3K2(network, weightMap, *cat15->getOutput(0), get_width(512, gw, max_channels), get_width(256, gw, max_channels), get_depth(2, gd), true, true, false, 0.5, "model.16"); nvinfer1::IElementWiseLayer* block17 = convBnSiLU(network, weightMap, *block16->getOutput(0), get_width(256, gw, max_channels), {3, 3}, 2, "model.17"); nvinfer1::ITensor* inputTensors18[] = {block17->getOutput(0), block13->getOutput(0)}; nvinfer1::IConcatenationLayer* cat18 = network->addConcatenation(inputTensors18, 2); nvinfer1::IElementWiseLayer* block19 = C3K2(network, weightMap, *cat18->getOutput(0), get_width(512, gw, max_channels), get_width(512, gw, max_channels), get_depth(2, gd), true, true, false, 0.5, "model.19"); nvinfer1::IElementWiseLayer* block20 = convBnSiLU(network, weightMap, *block19->getOutput(0), get_width(512, gw, max_channels), {3, 3}, 2, "model.20"); nvinfer1::ITensor* inputTensors21[] = {block20->getOutput(0), block10->getOutput(0)}; nvinfer1::IConcatenationLayer* cat21 = network->addConcatenation(inputTensors21, 2); nvinfer1::IElementWiseLayer* block22 = C3K2(network, weightMap, *cat21->getOutput(0), get_width(1024, gw, max_channels), get_width(1024, gw, max_channels), 1, true, true, true, 0.5, "model.22"); // WARN: get_depth(2, gd) changed to 1. 
/******************************************************************************************************* ********************************************* YOLO26 OUTPUT ******************************************** *******************************************************************************************************/ int c2 = std::max(std::max(16, get_width(256, gw, max_channels)), 16 * 4); int c3 = std::max(get_width(256, gw, max_channels), std::min(kNumClass, 100)); ///////////////////////////////////////////////////// nvinfer1::IElementWiseLayer* conv23_one2one_cv3_0_0_0 = convBnSiLU(network, weightMap, *block16->getOutput(0), c2, {3, 3}, 1, "model.23.one2one_cv3.0.0.0", c2); nvinfer1::IElementWiseLayer* conv23_one2one_cv3_0_0_1 = convBnSiLU(network, weightMap, *conv23_one2one_cv3_0_0_0->getOutput(0), c3, {1, 1}, 1, "model.23.one2one_cv3.0.0.1", 1); nvinfer1::IElementWiseLayer* conv23_one2one_cv3_0_1_0 = convBnSiLU(network, weightMap, *conv23_one2one_cv3_0_0_1->getOutput(0), c3, {3, 3}, 1, "model.23.one2one_cv3.0.1.0", c3); nvinfer1::IElementWiseLayer* conv23_one2one_cv3_0_1_1 = convBnSiLU(network, weightMap, *conv23_one2one_cv3_0_1_0->getOutput(0), c3, {1, 1}, 1, "model.23.one2one_cv3.0.1.1", 1); nvinfer1::IConvolutionLayer* conv23_one2one_cv3_0_2 = network->addConvolutionNd( *conv23_one2one_cv3_0_1_1->getOutput(0), kNumClass, nvinfer1::DimsHW{1, 1}, weightMap["model.23.one2one_cv3.0.2.weight"], weightMap["model.23.one2one_cv3.0.2.bias"]); conv23_one2one_cv3_0_2->setStrideNd(nvinfer1::DimsHW{1, 1}); conv23_one2one_cv3_0_2->setPaddingNd(nvinfer1::DimsHW{0, 0}); conv23_one2one_cv3_0_2->setNbGroups(1); nvinfer1::IShuffleLayer* reshape23_3 = network->addShuffle(*conv23_one2one_cv3_0_2->getOutput(0)); reshape23_3->setReshapeDimensions(nvinfer1::Dims3{1, kNumClass, -1}); ///////////////////////////////////////////////////// nvinfer1::IElementWiseLayer* conv23_one2one_cv3_1_0_0 = convBnSiLU( network, weightMap, *block19->getOutput(0), c2 * 2, {3, 3}, 1, 
"model.23.one2one_cv3.1.0.0", c2 * 2); nvinfer1::IElementWiseLayer* conv23_one2one_cv3_1_0_1 = convBnSiLU(network, weightMap, *conv23_one2one_cv3_1_0_0->getOutput(0), c3, {1, 1}, 1, "model.23.one2one_cv3.1.0.1", 1); nvinfer1::IElementWiseLayer* conv23_one2one_cv3_1_1_0 = convBnSiLU(network, weightMap, *conv23_one2one_cv3_1_0_1->getOutput(0), c3, {3, 3}, 1, "model.23.one2one_cv3.1.1.0", c3); nvinfer1::IElementWiseLayer* conv23_one2one_cv3_1_1_1 = convBnSiLU(network, weightMap, *conv23_one2one_cv3_1_1_0->getOutput(0), c3, {1, 1}, 1, "model.23.one2one_cv3.1.1.1", 1); nvinfer1::IConvolutionLayer* conv23_one2one_cv3_1_2 = network->addConvolutionNd( *conv23_one2one_cv3_1_1_1->getOutput(0), kNumClass, nvinfer1::DimsHW{1, 1}, weightMap["model.23.one2one_cv3.1.2.weight"], weightMap["model.23.one2one_cv3.1.2.bias"]); conv23_one2one_cv3_1_2->setStrideNd(nvinfer1::DimsHW{1, 1}); conv23_one2one_cv3_1_2->setPaddingNd(nvinfer1::DimsHW{0, 0}); conv23_one2one_cv3_1_2->setNbGroups(1); nvinfer1::IShuffleLayer* reshape23_4 = network->addShuffle(*conv23_one2one_cv3_1_2->getOutput(0)); reshape23_4->setReshapeDimensions(nvinfer1::Dims3{1, kNumClass, -1}); ///////////////////////////////////////////////////// nvinfer1::IElementWiseLayer* conv23_one2one_cv3_2_0_0; if (type == "m" || type == "l" || type == "x") { conv23_one2one_cv3_2_0_0 = convBnSiLU(network, weightMap, *block22->getOutput(0), c2 * 2, {3, 3}, 1, "model.23.one2one_cv3.2.0.0", c2 * 2); } else { conv23_one2one_cv3_2_0_0 = convBnSiLU(network, weightMap, *block22->getOutput(0), c2 * 4, {3, 3}, 1, "model.23.one2one_cv3.2.0.0", c2 * 4); } nvinfer1::IElementWiseLayer* conv23_one2one_cv3_2_0_1 = convBnSiLU(network, weightMap, *conv23_one2one_cv3_2_0_0->getOutput(0), c3, {1, 1}, 1, "model.23.one2one_cv3.2.0.1", 1); nvinfer1::IElementWiseLayer* conv23_one2one_cv3_2_1_0 = convBnSiLU(network, weightMap, *conv23_one2one_cv3_2_0_1->getOutput(0), c3, {3, 3}, 1, "model.23.one2one_cv3.2.1.0", c3); nvinfer1::IElementWiseLayer* 
conv23_one2one_cv3_2_1_1 = convBnSiLU(network, weightMap, *conv23_one2one_cv3_2_1_0->getOutput(0), c3, {1, 1}, 1, "model.23.one2one_cv3.2.1.1", 1); nvinfer1::IConvolutionLayer* conv23_one2one_cv3_2_2 = network->addConvolutionNd( *conv23_one2one_cv3_2_1_1->getOutput(0), kNumClass, nvinfer1::DimsHW{1, 1}, weightMap["model.23.one2one_cv3.2.2.weight"], weightMap["model.23.one2one_cv3.2.2.bias"]); conv23_one2one_cv3_2_2->setStrideNd(nvinfer1::DimsHW{1, 1}); conv23_one2one_cv3_2_2->setPaddingNd(nvinfer1::DimsHW{0, 0}); conv23_one2one_cv3_2_2->setNbGroups(1); nvinfer1::IShuffleLayer* reshape23_5 = network->addShuffle(*conv23_one2one_cv3_2_2->getOutput(0)); reshape23_5->setReshapeDimensions(nvinfer1::Dims3{1, kNumClass, -1}); ///////////////////////////////////////////////////// nvinfer1::ITensor* inputTensors23_1[] = {reshape23_3->getOutput(0), reshape23_4->getOutput(0), reshape23_5->getOutput(0)}; nvinfer1::IConcatenationLayer* cat23_1 = network->addConcatenation(inputTensors23_1, 3); cat23_1->setAxis(2); nvinfer1::IActivationLayer* sigmoid23 = network->addActivation( *cat23_1->getOutput(0), nvinfer1::ActivationType::kSIGMOID); // TODO: THIS IS UNNESSARY, REMOVE AFTER PLUGIN IS READY ///////////////////////////////////////////////////// nvinfer1::IElementWiseLayer* conv23_one2one_cv2_0_0 = convBnSiLU(network, weightMap, *block16->getOutput(0), c2 / 4, {3, 3}, 1, "model.23.one2one_cv2.0.0", 1); nvinfer1::IElementWiseLayer* conv23_one2one_cv2_0_1 = convBnSiLU(network, weightMap, *conv23_one2one_cv2_0_0->getOutput(0), c2 / 4, {3, 3}, 1, "model.23.one2one_cv2.0.1", 1); nvinfer1::IConvolutionLayer* conv23_one2one_cv2_0_2 = network->addConvolutionNd( *conv23_one2one_cv2_0_1->getOutput(0), 4, nvinfer1::DimsHW{1, 1}, weightMap["model.23.one2one_cv2.0.2.weight"], weightMap["model.23.one2one_cv2.0.2.bias"]); conv23_one2one_cv2_0_2->setStrideNd(nvinfer1::DimsHW{1, 1}); conv23_one2one_cv2_0_2->setPaddingNd(nvinfer1::DimsHW{0, 0}); conv23_one2one_cv2_0_2->setNbGroups(1); 
nvinfer1::IShuffleLayer* reshape23 = network->addShuffle(*conv23_one2one_cv2_0_2->getOutput(0)); reshape23->setReshapeDimensions(nvinfer1::Dims3{1, 4, -1}); ///////////////////////////////////////////////////// nvinfer1::IElementWiseLayer* conv23_one2one_cv2_1_0 = convBnSiLU(network, weightMap, *block19->getOutput(0), c2 / 4, {3, 3}, 1, "model.23.one2one_cv2.1.0", 1); nvinfer1::IElementWiseLayer* conv23_one2one_cv2_1_1 = convBnSiLU(network, weightMap, *conv23_one2one_cv2_1_0->getOutput(0), c2 / 4, {3, 3}, 1, "model.23.one2one_cv2.1.1", 1); nvinfer1::IConvolutionLayer* conv23_one2one_cv2_1_2 = network->addConvolutionNd( *conv23_one2one_cv2_1_1->getOutput(0), 4, nvinfer1::DimsHW{1, 1}, weightMap["model.23.one2one_cv2.1.2.weight"], weightMap["model.23.one2one_cv2.1.2.bias"]); conv23_one2one_cv2_1_2->setStrideNd(nvinfer1::DimsHW{1, 1}); conv23_one2one_cv2_1_2->setPaddingNd(nvinfer1::DimsHW{0, 0}); conv23_one2one_cv2_1_2->setNbGroups(1); nvinfer1::IShuffleLayer* reshape23_1 = network->addShuffle(*conv23_one2one_cv2_1_2->getOutput(0)); reshape23_1->setReshapeDimensions(nvinfer1::Dims3{1, 4, -1}); ///////////////////////////////////////////////////// nvinfer1::IElementWiseLayer* conv23_one2one_cv2_2_0 = convBnSiLU(network, weightMap, *block22->getOutput(0), c2 / 4, {3, 3}, 1, "model.23.one2one_cv2.2.0", 1); nvinfer1::IElementWiseLayer* conv23_one2one_cv2_2_1 = convBnSiLU(network, weightMap, *conv23_one2one_cv2_2_0->getOutput(0), c2 / 4, {3, 3}, 1, "model.23.one2one_cv2.2.1", 1); nvinfer1::IConvolutionLayer* conv23_one2one_cv2_2_2 = network->addConvolutionNd( *conv23_one2one_cv2_2_1->getOutput(0), 4, nvinfer1::DimsHW{1, 1}, weightMap["model.23.one2one_cv2.2.2.weight"], weightMap["model.23.one2one_cv2.2.2.bias"]); conv23_one2one_cv2_2_2->setStrideNd(nvinfer1::DimsHW{1, 1}); conv23_one2one_cv2_2_2->setPaddingNd(nvinfer1::DimsHW{0, 0}); conv23_one2one_cv2_2_2->setNbGroups(1); nvinfer1::IShuffleLayer* reshape23_2 = network->addShuffle(*conv23_one2one_cv2_2_2->getOutput(0)); 
reshape23_2->setReshapeDimensions(nvinfer1::Dims3{1, 4, -1}); ///////////////////////////////////////////////////// nvinfer1::ITensor* inputTensors23[] = {reshape23->getOutput(0), reshape23_1->getOutput(0), reshape23_2->getOutput(0)}; nvinfer1::IConcatenationLayer* cat23 = network->addConcatenation(inputTensors23, 3); cat23->setAxis(2); ///////////////////////////////////////////////////// nvinfer1::ISliceLayer* slice23_1 = network->addSlice( *cat23->getOutput(0), nvinfer1::Dims3{0, 0, 0}, nvinfer1::Dims3{cat23->getOutput(0)->getDimensions().d[0], cat23->getOutput(0)->getDimensions().d[1] / 2, cat23->getOutput(0)->getDimensions().d[2]}, nvinfer1::Dims3{1, 1, 1}); nvinfer1::ISliceLayer* slice23 = network->addSlice( *cat23->getOutput(0), nvinfer1::Dims3{0, cat23->getOutput(0)->getDimensions().d[1] / 2, 0}, nvinfer1::Dims3{cat23->getOutput(0)->getDimensions().d[0], cat23->getOutput(0)->getDimensions().d[1] / 2, cat23->getOutput(0)->getDimensions().d[2]}, nvinfer1::Dims3{1, 1, 1}); // TODO: MAKE HARDCODED TO AUTOMATIC const int anchor_num = cat23->getOutput(0)->getDimensions().d[2]; std::vector fm_sizes; int fm_h_0 = block16->getOutput(0)->getDimensions().d[2]; // P3 int fm_h_1 = block19->getOutput(0)->getDimensions().d[2]; // P4 int fm_h_2 = block22->getOutput(0)->getDimensions().d[2]; // P5 fm_sizes.push_back(fm_h_0); fm_sizes.push_back(fm_h_1); fm_sizes.push_back(fm_h_2); std::vector strides = {kInputH / fm_h_0, kInputH / fm_h_1, kInputH / fm_h_2}; std::vector grid(anchor_num * 2); std::vector stride_vec(anchor_num); std::fill(stride_vec.begin(), stride_vec.begin() + fm_sizes[0] * fm_sizes[0], strides[0]); std::fill(stride_vec.begin() + fm_sizes[0] * fm_sizes[0], stride_vec.begin() + fm_sizes[0] * fm_sizes[0] + fm_sizes[1] * fm_sizes[1], strides[1]); std::fill(stride_vec.begin() + fm_sizes[0] * fm_sizes[0] + fm_sizes[1] * fm_sizes[1], stride_vec.end(), strides[2]); int idx = 0; for (int s = 0; s < fm_sizes.size(); ++s) { int h = fm_sizes[s]; int w = fm_sizes[s]; for 
(int y = 0; y < h; ++y) { for (int x = 0; x < w; ++x) { grid[idx] = x + 0.5f; grid[idx + anchor_num] = y + 0.5f; idx++; } } } nvinfer1::Dims gridDims; gridDims.nbDims = 3; gridDims.d[0] = 1; gridDims.d[1] = 2; gridDims.d[2] = anchor_num; nvinfer1::IConstantLayer* constant_grid = network->addConstant( gridDims, nvinfer1::Weights{nvinfer1::DataType::kFLOAT, grid.data(), (int64_t)grid.size()}); nvinfer1::IElementWiseLayer* conv23_add_1 = network->addElementWise( *constant_grid->getOutput(0), *slice23->getOutput(0), nvinfer1::ElementWiseOperation::kSUM); nvinfer1::IElementWiseLayer* conv23_sub_1 = network->addElementWise( *constant_grid->getOutput(0), *slice23_1->getOutput(0), nvinfer1::ElementWiseOperation::kSUB); nvinfer1::ITensor* tensor23[] = {conv23_sub_1->getOutput(0), conv23_add_1->getOutput(0)}; nvinfer1::IConcatenationLayer* cat23_2 = network->addConcatenation(tensor23, 2); cat23_2->setAxis(1); nvinfer1::IConstantLayer* constant_stride = network->addConstant( nvinfer1::Dims3{1, 1, anchor_num}, nvinfer1::Weights{nvinfer1::DataType::kFLOAT, stride_vec.data(), (int64_t)stride_vec.size()}); nvinfer1::IElementWiseLayer* mul23_2 = network->addElementWise( *cat23_2->getOutput(0), *constant_stride->getOutput(0), nvinfer1::ElementWiseOperation::kPROD); /////////////////////////////////////////////////////////// nvinfer1::IConcatenationLayer* cat23_3 = network->addConcatenation( std::array{mul23_2->getOutput(0), sigmoid23->getOutput(0)}.data(), 2); cat23_3->setAxis(1); nvinfer1::IShuffleLayer* transpose = network->addShuffle(*cat23_3->getOutput(0)); transpose->setFirstTranspose(nvinfer1::Permutation{0, 2, 1}); // transpose->setReshapeDimensions(nvinfer1::Dims3{1, anchor_num, kNumClass + 4}); /////////////////////////////////////////////////////////// int stridesLength = strides.size(); nvinfer1::IPluginV2Layer* yolo = addYoloLayer(network, *transpose->getOutput(0), strides, fm_sizes, stridesLength, true, false, false, false, anchor_num); assert(yolo); 
/////////////////////////////////////////////////////////// yolo->getOutput(0)->setName(kOutputTensorName); network->markOutput(*yolo->getOutput(0)); // Use setMemoryPoolLimit instead of deprecated setMaxWorkspaceSize config->setMemoryPoolLimit(nvinfer1::MemoryPoolType::kWORKSPACE, 16 * (1 << 20)); #if defined(USE_FP16) config->setFlag(nvinfer1::BuilderFlag::kFP16); #elif defined(USE_INT8) std::cerr << "INT8 not supported for YOLO26 model yet." << std::endl; #endif std::cout << "Building engine, please wait for a while..." << std::endl; nvinfer1::IHostMemory* serialized_model = builder->buildSerializedNetwork(*network, *config); std::cout << "Build engine successfully!" << std::endl; delete network; for (auto& mem : weightMap) { free((void*)(mem.second.values)); } return serialized_model; } nvinfer1::IHostMemory* buildEngineYolo26Obb(nvinfer1::IBuilder* builder, nvinfer1::IBuilderConfig* config, nvinfer1::DataType dt, const std::string& wts_path, float& gd, float& gw, int& max_channels, std::string& type) { std::map weightMap = loadWeights(wts_path); nvinfer1::INetworkDefinition* network = builder->createNetworkV2( 1U << static_cast(nvinfer1::NetworkDefinitionCreationFlag::kEXPLICIT_BATCH)); /******************************************************************************************************* ****************************************** YOLO26-Obb INPUT ********************************************** *******************************************************************************************************/ nvinfer1::ITensor* data = network->addInput(kInputTensorName, dt, nvinfer1::Dims4{kBatchSize, 3, kObbInputH, kObbInputW}); assert(data); nvinfer1::IElementWiseLayer* block0 = convBnSiLU(network, weightMap, *data, get_width(64, gw, max_channels), {3, 3}, 2, "model.0"); nvinfer1::IElementWiseLayer* block1 = convBnSiLU(network, weightMap, *block0->getOutput(0), get_width(128, gw, max_channels), {3, 3}, 2, "model.1"); bool c3k = false; if (type == "m" || type == "l" || 
type == "x") { c3k = true; } nvinfer1::IElementWiseLayer* conv2 = C3K2(network, weightMap, *block1->getOutput(0), get_width(128, gw, max_channels), get_width(256, gw, max_channels), get_depth(2, gd), c3k, true, false, 0.25, "model.2"); nvinfer1::IElementWiseLayer* block3 = convBnSiLU(network, weightMap, *conv2->getOutput(0), get_width(256, gw, max_channels), {3, 3}, 2, "model.3"); nvinfer1::IElementWiseLayer* block4 = C3K2(network, weightMap, *block3->getOutput(0), get_width(256, gw, max_channels), get_width(512, gw, max_channels), get_depth(2, gd), c3k, true, false, 0.25, "model.4"); nvinfer1::IElementWiseLayer* block5 = convBnSiLU(network, weightMap, *block4->getOutput(0), get_width(512, gw, max_channels), {3, 3}, 2, "model.5"); nvinfer1::IElementWiseLayer* block6 = C3K2(network, weightMap, *block5->getOutput(0), get_width(512, gw, max_channels), get_width(512, gw, max_channels), get_depth(2, gd), true, true, false, 0.5, "model.6"); nvinfer1::IElementWiseLayer* block7 = convBnSiLU(network, weightMap, *block6->getOutput(0), get_width(1024, gw, max_channels), {3, 3}, 2, "model.7"); nvinfer1::IElementWiseLayer* block8 = C3K2(network, weightMap, *block7->getOutput(0), get_width(1024, gw, max_channels), get_width(1024, gw, max_channels), get_depth(2, gd), true, true, false, 0.5, "model.8"); nvinfer1::IElementWiseLayer* block9 = SPPF(network, weightMap, *block8->getOutput(0), get_width(1024, gw, max_channels), get_width(1024, gw, max_channels), 5, true, "model.9"); // TODO: VERIFY THIS BLOCK FOR OTHER YOLO26 MODELS nvinfer1::IElementWiseLayer* block10 = C2PSA(network, weightMap, *block9->getOutput(0), get_width(1024, gw, max_channels), get_width(1024, gw, max_channels), get_depth(2, gd), 0.5, "model.10"); /******************************************************************************************************* ********************************************* YOLO26-Obb HEAD ******************************************** 
*******************************************************************************************************/ float scale[] = {1.0, 1.0, 2.0, 2.0}; nvinfer1::IResizeLayer* upsample11 = network->addResize(*block10->getOutput(0)); assert(upsample11); upsample11->setResizeMode(nvinfer1::ResizeMode::kNEAREST); upsample11->setScales(scale, 4); nvinfer1::ITensor* inputTensors12[] = {upsample11->getOutput(0), block6->getOutput(0)}; nvinfer1::IConcatenationLayer* cat12 = network->addConcatenation(inputTensors12, 2); nvinfer1::IElementWiseLayer* block13 = C3K2(network, weightMap, *cat12->getOutput(0), get_width(1024, gw, max_channels), get_width(512, gw, max_channels), get_depth(2, gd), true, true, false, 0.5, "model.13"); nvinfer1::IResizeLayer* upsample14 = network->addResize(*block13->getOutput(0)); assert(upsample14); upsample14->setResizeMode(nvinfer1::ResizeMode::kNEAREST); upsample14->setScales(scale, 4); nvinfer1::ITensor* inputTensors15[] = {upsample14->getOutput(0), block4->getOutput(0)}; nvinfer1::IConcatenationLayer* cat15 = network->addConcatenation(inputTensors15, 2); nvinfer1::IElementWiseLayer* block16 = C3K2(network, weightMap, *cat15->getOutput(0), get_width(512, gw, max_channels), get_width(256, gw, max_channels), get_depth(2, gd), true, true, false, 0.5, "model.16"); nvinfer1::IElementWiseLayer* block17 = convBnSiLU(network, weightMap, *block16->getOutput(0), get_width(256, gw, max_channels), {3, 3}, 2, "model.17"); nvinfer1::ITensor* inputTensors18[] = {block17->getOutput(0), block13->getOutput(0)}; nvinfer1::IConcatenationLayer* cat18 = network->addConcatenation(inputTensors18, 2); nvinfer1::IElementWiseLayer* block19 = C3K2(network, weightMap, *cat18->getOutput(0), get_width(512, gw, max_channels), get_width(512, gw, max_channels), get_depth(2, gd), true, true, false, 0.5, "model.19"); nvinfer1::IElementWiseLayer* block20 = convBnSiLU(network, weightMap, *block19->getOutput(0), get_width(512, gw, max_channels), {3, 3}, 2, "model.20"); nvinfer1::ITensor* 
inputTensors21[] = {block20->getOutput(0), block10->getOutput(0)}; nvinfer1::IConcatenationLayer* cat21 = network->addConcatenation(inputTensors21, 2); nvinfer1::IElementWiseLayer* block22 = C3K2(network, weightMap, *cat21->getOutput(0), get_width(1024, gw, max_channels), get_width(1024, gw, max_channels), 1, true, true, true, 0.5, "model.22"); // WARN: get_depth(2, gd) changed to 1. /******************************************************************************************************* ********************************************* YOLO26-Obb OUTPUT ******************************************** *******************************************************************************************************/ int c2 = std::max(std::max(16, get_width(256, gw, max_channels)), 16 * 4); int c3 = std::max(get_width(256, gw, max_channels), std::min(kObbNumClass, 100)); //cv.2.*.* ///////////////////////////////////////////////////// nvinfer1::IElementWiseLayer* conv23_one2one_cv2_0_0 = convBnSiLU(network, weightMap, *block16->getOutput(0), c2 / 4, {3, 3}, 1, "model.23.one2one_cv2.0.0", 1); nvinfer1::IElementWiseLayer* conv23_one2one_cv2_0_1 = convBnSiLU(network, weightMap, *conv23_one2one_cv2_0_0->getOutput(0), c2 / 4, {3, 3}, 1, "model.23.one2one_cv2.0.1", 1); nvinfer1::IConvolutionLayer* conv23_one2one_cv2_0_2 = network->addConvolutionNd( *conv23_one2one_cv2_0_1->getOutput(0), 4, nvinfer1::DimsHW{1, 1}, weightMap["model.23.one2one_cv2.0.2.weight"], weightMap["model.23.one2one_cv2.0.2.bias"]); conv23_one2one_cv2_0_2->setStrideNd(nvinfer1::DimsHW{1, 1}); conv23_one2one_cv2_0_2->setPaddingNd(nvinfer1::DimsHW{0, 0}); conv23_one2one_cv2_0_2->setNbGroups(1); nvinfer1::IShuffleLayer* reshape23 = network->addShuffle(*conv23_one2one_cv2_0_2->getOutput(0)); reshape23->setReshapeDimensions(nvinfer1::Dims3{1, 4, -1}); nvinfer1::IElementWiseLayer* conv23_one2one_cv2_1_0 = convBnSiLU(network, weightMap, *block19->getOutput(0), c2 / 4, {3, 3}, 1, "model.23.one2one_cv2.1.0", 1); 
nvinfer1::IElementWiseLayer* conv23_one2one_cv2_1_1 = convBnSiLU(network, weightMap, *conv23_one2one_cv2_1_0->getOutput(0), c2 / 4, {3, 3}, 1, "model.23.one2one_cv2.1.1", 1); nvinfer1::IConvolutionLayer* conv23_one2one_cv2_1_2 = network->addConvolutionNd( *conv23_one2one_cv2_1_1->getOutput(0), 4, nvinfer1::DimsHW{1, 1}, weightMap["model.23.one2one_cv2.1.2.weight"], weightMap["model.23.one2one_cv2.1.2.bias"]); conv23_one2one_cv2_1_2->setStrideNd(nvinfer1::DimsHW{1, 1}); conv23_one2one_cv2_1_2->setPaddingNd(nvinfer1::DimsHW{0, 0}); conv23_one2one_cv2_1_2->setNbGroups(1); nvinfer1::IShuffleLayer* reshape23_1 = network->addShuffle(*conv23_one2one_cv2_1_2->getOutput(0)); reshape23_1->setReshapeDimensions(nvinfer1::Dims3{1, 4, -1}); nvinfer1::IElementWiseLayer* conv23_one2one_cv2_2_0 = convBnSiLU(network, weightMap, *block22->getOutput(0), c2 / 4, {3, 3}, 1, "model.23.one2one_cv2.2.0", 1); nvinfer1::IElementWiseLayer* conv23_one2one_cv2_2_1 = convBnSiLU(network, weightMap, *conv23_one2one_cv2_2_0->getOutput(0), c2 / 4, {3, 3}, 1, "model.23.one2one_cv2.2.1", 1); nvinfer1::IConvolutionLayer* conv23_one2one_cv2_2_2 = network->addConvolutionNd( *conv23_one2one_cv2_2_1->getOutput(0), 4, nvinfer1::DimsHW{1, 1}, weightMap["model.23.one2one_cv2.2.2.weight"], weightMap["model.23.one2one_cv2.2.2.bias"]); conv23_one2one_cv2_2_2->setStrideNd(nvinfer1::DimsHW{1, 1}); conv23_one2one_cv2_2_2->setPaddingNd(nvinfer1::DimsHW{0, 0}); conv23_one2one_cv2_2_2->setNbGroups(1); nvinfer1::IShuffleLayer* reshape23_2 = network->addShuffle(*conv23_one2one_cv2_2_2->getOutput(0)); reshape23_2->setReshapeDimensions(nvinfer1::Dims3{1, 4, -1}); nvinfer1::ITensor* inputTensors23[] = {reshape23->getOutput(0), reshape23_1->getOutput(0), reshape23_2->getOutput(0)}; nvinfer1::IConcatenationLayer* cat23 = network->addConcatenation(inputTensors23, 3); cat23->setAxis(2); //cv.4.*.* ///////////////////////////////////////////////////// nvinfer1::IElementWiseLayer* conv23_one2one_cv4_0_0 = convBnSiLU(network, 
weightMap, *block16->getOutput(0), c2 / 4, {3, 3}, 1, "model.23.one2one_cv4.0.0", 1); nvinfer1::IElementWiseLayer* conv23_one2one_cv4_0_1 = convBnSiLU(network, weightMap, *conv23_one2one_cv4_0_0->getOutput(0), c2 / 4, {3, 3}, 1, "model.23.one2one_cv4.0.1", 1); nvinfer1::IConvolutionLayer* conv23_one2one_cv4_0_2 = network->addConvolutionNd( *conv23_one2one_cv4_0_1->getOutput(0), 1, nvinfer1::DimsHW{1, 1}, weightMap["model.23.one2one_cv4.0.2.weight"], weightMap["model.23.one2one_cv4.0.2.bias"]); conv23_one2one_cv4_0_2->setStrideNd(nvinfer1::DimsHW{1, 1}); conv23_one2one_cv4_0_2->setPaddingNd(nvinfer1::DimsHW{0, 0}); conv23_one2one_cv4_0_2->setNbGroups(1); nvinfer1::IShuffleLayer* reshape23_6 = network->addShuffle(*conv23_one2one_cv4_0_2->getOutput(0)); reshape23_6->setReshapeDimensions(nvinfer1::Dims3{1, 1, -1}); nvinfer1::IElementWiseLayer* conv23_one2one_cv4_1_0 = convBnSiLU(network, weightMap, *block19->getOutput(0), c2 / 4, {3, 3}, 1, "model.23.one2one_cv4.1.0", 1); nvinfer1::IElementWiseLayer* conv23_one2one_cv4_1_1 = convBnSiLU(network, weightMap, *conv23_one2one_cv4_1_0->getOutput(0), c2 / 4, {3, 3}, 1, "model.23.one2one_cv4.1.1", 1); nvinfer1::IConvolutionLayer* conv23_one2one_cv4_1_2 = network->addConvolutionNd( *conv23_one2one_cv4_1_1->getOutput(0), 1, nvinfer1::DimsHW{1, 1}, weightMap["model.23.one2one_cv4.1.2.weight"], weightMap["model.23.one2one_cv4.1.2.bias"]); conv23_one2one_cv4_1_2->setStrideNd(nvinfer1::DimsHW{1, 1}); conv23_one2one_cv4_1_2->setPaddingNd(nvinfer1::DimsHW{0, 0}); conv23_one2one_cv4_1_2->setNbGroups(1); nvinfer1::IShuffleLayer* reshape23_7 = network->addShuffle(*conv23_one2one_cv4_1_2->getOutput(0)); reshape23_7->setReshapeDimensions(nvinfer1::Dims3{1, 1, -1}); nvinfer1::IElementWiseLayer* conv23_one2one_cv4_2_0 = convBnSiLU(network, weightMap, *block22->getOutput(0), c2 / 4, {3, 3}, 1, "model.23.one2one_cv4.2.0", 1); nvinfer1::IElementWiseLayer* conv23_one2one_cv4_2_1 = convBnSiLU(network, weightMap, 
*conv23_one2one_cv4_2_0->getOutput(0), c2 / 4, {3, 3}, 1, "model.23.one2one_cv4.2.1", 1); nvinfer1::IConvolutionLayer* conv23_one2one_cv4_2_2 = network->addConvolutionNd( *conv23_one2one_cv4_2_1->getOutput(0), 1, nvinfer1::DimsHW{1, 1}, weightMap["model.23.one2one_cv4.2.2.weight"], weightMap["model.23.one2one_cv4.2.2.bias"]); conv23_one2one_cv4_2_2->setStrideNd(nvinfer1::DimsHW{1, 1}); conv23_one2one_cv4_2_2->setPaddingNd(nvinfer1::DimsHW{0, 0}); conv23_one2one_cv4_2_2->setNbGroups(1); nvinfer1::IShuffleLayer* reshape23_8 = network->addShuffle(*conv23_one2one_cv4_2_2->getOutput(0)); reshape23_8->setReshapeDimensions(nvinfer1::Dims3{1, 1, -1}); nvinfer1::ITensor* inputTensors23_2[] = {reshape23_6->getOutput(0), reshape23_7->getOutput(0), reshape23_8->getOutput(0)}; nvinfer1::IConcatenationLayer* cat23_2 = network->addConcatenation(inputTensors23_2, 3); cat23_2->setAxis(2); ///////////////////////////////////////////////////// nvinfer1::ISliceLayer* split23__0 = network->addSlice( *cat23->getOutput(0), nvinfer1::Dims3{0, 0, 0}, nvinfer1::Dims3{cat23->getOutput(0)->getDimensions().d[0], cat23->getOutput(0)->getDimensions().d[1] / 2, cat23->getOutput(0)->getDimensions().d[2]}, nvinfer1::Dims3{1, 1, 1}); nvinfer1::ISliceLayer* split23__1 = network->addSlice( *cat23->getOutput(0), nvinfer1::Dims3{0, cat23->getOutput(0)->getDimensions().d[1] / 2, 0}, nvinfer1::Dims3{cat23->getOutput(0)->getDimensions().d[0], cat23->getOutput(0)->getDimensions().d[1] / 2, cat23->getOutput(0)->getDimensions().d[2]}, nvinfer1::Dims3{1, 1, 1}); nvinfer1::IElementWiseLayer* sub23 = network->addElementWise(*split23__1->getOutput(0), *split23__0->getOutput(0), nvinfer1::ElementWiseOperation::kSUB); // Divide by 2 static float two = 2.0f; nvinfer1::Weights two_weights{nvinfer1::DataType::kFLOAT, &two, 1}; nvinfer1::IConstantLayer* const_two = network->addConstant(nvinfer1::Dims3{1, 1, 1}, two_weights); nvinfer1::IElementWiseLayer* div23 = network->addElementWise(*sub23->getOutput(0), 
*const_two->getOutput(0), nvinfer1::ElementWiseOperation::kDIV); nvinfer1::ISliceLayer* split23_1__0 = network->addSlice( *div23->getOutput(0), nvinfer1::Dims3{0, 0, 0}, nvinfer1::Dims3{div23->getOutput(0)->getDimensions().d[0], div23->getOutput(0)->getDimensions().d[1] / 2, div23->getOutput(0)->getDimensions().d[2]}, nvinfer1::Dims3{1, 1, 1}); nvinfer1::ISliceLayer* split23_1__1 = network->addSlice( *div23->getOutput(0), nvinfer1::Dims3{0, div23->getOutput(0)->getDimensions().d[1] / 2, 0}, nvinfer1::Dims3{div23->getOutput(0)->getDimensions().d[0], div23->getOutput(0)->getDimensions().d[1] / 2, div23->getOutput(0)->getDimensions().d[2]}, nvinfer1::Dims3{1, 1, 1}); nvinfer1::IUnaryLayer* cos23 = network->addUnary(*cat23_2->getOutput(0), nvinfer1::UnaryOperation::kCOS); nvinfer1::IUnaryLayer* sin23 = network->addUnary(*cat23_2->getOutput(0), nvinfer1::UnaryOperation::kSIN); nvinfer1::IElementWiseLayer* mul23 = network->addElementWise(*split23_1__0->getOutput(0), *cos23->getOutput(0), nvinfer1::ElementWiseOperation::kPROD); nvinfer1::IElementWiseLayer* mul23_1 = network->addElementWise(*split23_1__1->getOutput(0), *sin23->getOutput(0), nvinfer1::ElementWiseOperation::kPROD); nvinfer1::IElementWiseLayer* sub23_1 = network->addElementWise(*mul23->getOutput(0), *mul23_1->getOutput(0), nvinfer1::ElementWiseOperation::kSUB); nvinfer1::IElementWiseLayer* mul23_2 = network->addElementWise(*split23_1__0->getOutput(0), *sin23->getOutput(0), nvinfer1::ElementWiseOperation::kPROD); nvinfer1::IElementWiseLayer* mul23_3 = network->addElementWise(*split23_1__1->getOutput(0), *cos23->getOutput(0), nvinfer1::ElementWiseOperation::kPROD); nvinfer1::IElementWiseLayer* add23 = network->addElementWise(*mul23_2->getOutput(0), *mul23_3->getOutput(0), nvinfer1::ElementWiseOperation::kSUM); nvinfer1::ITensor* tensor23[] = {sub23_1->getOutput(0), add23->getOutput(0)}; nvinfer1::IConcatenationLayer* cat23_3 = network->addConcatenation(tensor23, 2); cat23_3->setAxis(1); std::vector fm_sizes; 
int fm_h_0 = block16->getOutput(0)->getDimensions().d[2]; // P3 int fm_h_1 = block19->getOutput(0)->getDimensions().d[2]; // P4 int fm_h_2 = block22->getOutput(0)->getDimensions().d[2]; // P5 fm_sizes.push_back(fm_h_0); fm_sizes.push_back(fm_h_1); fm_sizes.push_back(fm_h_2); int grid_num = fm_h_0 * fm_h_0 + fm_h_1 * fm_h_1 + fm_h_2 * fm_h_2; assert((kObbInputH % fm_h_0) == 0 && (kObbInputH % fm_h_1) == 0 && (kObbInputH % fm_h_2) == 0); assert((fm_h_0 == block16->getOutput(0)->getDimensions().d[3]) && (fm_h_1 == block19->getOutput(0)->getDimensions().d[3]) && (fm_h_2 == block22->getOutput(0)->getDimensions().d[3])); // verify fm_w == fm_h assert(cat23_3->getOutput(0)->getDimensions().d[2] == grid_num); int idx = 0; std::vector grid(grid_num * 2); auto fill_grid = [&](int fm_h) { for (int y = 0; y < fm_h; ++y) { for (int x = 0; x < fm_h; ++x) { grid[idx] = x + 0.5f; grid[idx + grid_num] = y + 0.5f; idx++; } } }; fill_grid(fm_h_0); fill_grid(fm_h_1); fill_grid(fm_h_2); std::vector stride_vec(grid_num); idx = 0; auto fill_stride = [&](int fm_h, int fm_w, int stride) { for (int y = 0; y < fm_h; ++y) { for (int x = 0; x < fm_w; ++x) { stride_vec[idx] = static_cast(stride); idx++; } } }; std::vector strides = {kObbInputH / fm_h_0, kObbInputH / fm_h_1, kObbInputH / fm_h_2}; fill_stride(fm_h_0, fm_h_0, strides[0]); fill_stride(fm_h_1, fm_h_1, strides[1]); fill_stride(fm_h_2, fm_h_2, strides[2]); nvinfer1::Dims gridDims{3, {1, 2, grid_num}}; nvinfer1::IConstantLayer* constant_grid = network->addConstant( gridDims, nvinfer1::Weights{nvinfer1::DataType::kFLOAT, grid.data(), (int64_t)grid.size()}); nvinfer1::Dims strideDims{3, {1, 1, grid_num}}; nvinfer1::IConstantLayer* constant_stride = network->addConstant( strideDims, nvinfer1::Weights{nvinfer1::DataType::kFLOAT, stride_vec.data(), (int64_t)stride_vec.size()}); nvinfer1::IElementWiseLayer* add23_1 = network->addElementWise(*cat23_3->getOutput(0), *constant_grid->getOutput(0), nvinfer1::ElementWiseOperation::kSUM); 
nvinfer1::IElementWiseLayer* add23_2 = network->addElementWise(*split23__0->getOutput(0), *split23__1->getOutput(0), nvinfer1::ElementWiseOperation::kSUM); nvinfer1::ITensor* tensor23_4[] = {add23_1->getOutput(0), add23_2->getOutput(0)}; nvinfer1::IConcatenationLayer* cat23_4 = network->addConcatenation(tensor23_4, 2); cat23_4->setAxis(1); nvinfer1::IElementWiseLayer* mul23_4 = network->addElementWise( *cat23_4->getOutput(0), *constant_stride->getOutput(0), nvinfer1::ElementWiseOperation::kPROD); ///////////////////////////////////////////////////// nvinfer1::IElementWiseLayer* conv23_one2one_cv3_0_0_0 = convBnSiLU(network, weightMap, *block16->getOutput(0), c2, {3, 3}, 1, "model.23.one2one_cv3.0.0.0", c2); nvinfer1::IElementWiseLayer* conv23_one2one_cv3_0_0_1 = convBnSiLU(network, weightMap, *conv23_one2one_cv3_0_0_0->getOutput(0), c3, {1, 1}, 1, "model.23.one2one_cv3.0.0.1", 1); nvinfer1::IElementWiseLayer* conv23_one2one_cv3_0_1_0 = convBnSiLU(network, weightMap, *conv23_one2one_cv3_0_0_1->getOutput(0), c3, {3, 3}, 1, "model.23.one2one_cv3.0.1.0", c3); nvinfer1::IElementWiseLayer* conv23_one2one_cv3_0_1_1 = convBnSiLU(network, weightMap, *conv23_one2one_cv3_0_1_0->getOutput(0), c3, {1, 1}, 1, "model.23.one2one_cv3.0.1.1", 1); nvinfer1::IConvolutionLayer* conv23_one2one_cv3_0_2 = network->addConvolutionNd( *conv23_one2one_cv3_0_1_1->getOutput(0), kObbNumClass, nvinfer1::DimsHW{1, 1}, weightMap["model.23.one2one_cv3.0.2.weight"], weightMap["model.23.one2one_cv3.0.2.bias"]); conv23_one2one_cv3_0_2->setStrideNd(nvinfer1::DimsHW{1, 1}); conv23_one2one_cv3_0_2->setPaddingNd(nvinfer1::DimsHW{0, 0}); conv23_one2one_cv3_0_2->setNbGroups(1); nvinfer1::IShuffleLayer* reshape23_3 = network->addShuffle(*conv23_one2one_cv3_0_2->getOutput(0)); reshape23_3->setReshapeDimensions(nvinfer1::Dims3{1, kObbNumClass, -1}); nvinfer1::IElementWiseLayer* conv23_one2one_cv3_1_0_0 = convBnSiLU( network, weightMap, *block19->getOutput(0), c2 * 2, {3, 3}, 1, "model.23.one2one_cv3.1.0.0", c2 
* 2); nvinfer1::IElementWiseLayer* conv23_one2one_cv3_1_0_1 = convBnSiLU(network, weightMap, *conv23_one2one_cv3_1_0_0->getOutput(0), c3, {1, 1}, 1, "model.23.one2one_cv3.1.0.1", 1); nvinfer1::IElementWiseLayer* conv23_one2one_cv3_1_1_0 = convBnSiLU(network, weightMap, *conv23_one2one_cv3_1_0_1->getOutput(0), c3, {3, 3}, 1, "model.23.one2one_cv3.1.1.0", c3); nvinfer1::IElementWiseLayer* conv23_one2one_cv3_1_1_1 = convBnSiLU(network, weightMap, *conv23_one2one_cv3_1_1_0->getOutput(0), c3, {1, 1}, 1, "model.23.one2one_cv3.1.1.1", 1); nvinfer1::IConvolutionLayer* conv23_one2one_cv3_1_2 = network->addConvolutionNd( *conv23_one2one_cv3_1_1_1->getOutput(0), kObbNumClass, nvinfer1::DimsHW{1, 1}, weightMap["model.23.one2one_cv3.1.2.weight"], weightMap["model.23.one2one_cv3.1.2.bias"]); conv23_one2one_cv3_1_2->setStrideNd(nvinfer1::DimsHW{1, 1}); conv23_one2one_cv3_1_2->setPaddingNd(nvinfer1::DimsHW{0, 0}); conv23_one2one_cv3_1_2->setNbGroups(1); nvinfer1::IShuffleLayer* reshape23_4 = network->addShuffle(*conv23_one2one_cv3_1_2->getOutput(0)); reshape23_4->setReshapeDimensions(nvinfer1::Dims3{1, kObbNumClass, -1}); nvinfer1::IElementWiseLayer* conv23_one2one_cv3_2_0_0; if (type == "m" || type == "l" || type == "x") { conv23_one2one_cv3_2_0_0 = convBnSiLU(network, weightMap, *block22->getOutput(0), c2 * 2, {3, 3}, 1, "model.23.one2one_cv3.2.0.0", c2 * 2); } else { conv23_one2one_cv3_2_0_0 = convBnSiLU(network, weightMap, *block22->getOutput(0), c2 * 4, {3, 3}, 1, "model.23.one2one_cv3.2.0.0", c2 * 4); } nvinfer1::IElementWiseLayer* conv23_one2one_cv3_2_0_1 = convBnSiLU(network, weightMap, *conv23_one2one_cv3_2_0_0->getOutput(0), c3, {1, 1}, 1, "model.23.one2one_cv3.2.0.1", 1); nvinfer1::IElementWiseLayer* conv23_one2one_cv3_2_1_0 = convBnSiLU(network, weightMap, *conv23_one2one_cv3_2_0_1->getOutput(0), c3, {3, 3}, 1, "model.23.one2one_cv3.2.1.0", c3); nvinfer1::IElementWiseLayer* conv23_one2one_cv3_2_1_1 = convBnSiLU(network, weightMap, 
*conv23_one2one_cv3_2_1_0->getOutput(0), c3, {1, 1}, 1, "model.23.one2one_cv3.2.1.1", 1); nvinfer1::IConvolutionLayer* conv23_one2one_cv3_2_2 = network->addConvolutionNd( *conv23_one2one_cv3_2_1_1->getOutput(0), kObbNumClass, nvinfer1::DimsHW{1, 1}, weightMap["model.23.one2one_cv3.2.2.weight"], weightMap["model.23.one2one_cv3.2.2.bias"]); conv23_one2one_cv3_2_2->setStrideNd(nvinfer1::DimsHW{1, 1}); conv23_one2one_cv3_2_2->setPaddingNd(nvinfer1::DimsHW{0, 0}); conv23_one2one_cv3_2_2->setNbGroups(1); nvinfer1::IShuffleLayer* reshape23_5 = network->addShuffle(*conv23_one2one_cv3_2_2->getOutput(0)); reshape23_5->setReshapeDimensions(nvinfer1::Dims3{1, kObbNumClass, -1}); nvinfer1::ITensor* tensor23_1[] = {reshape23_3->getOutput(0), reshape23_4->getOutput(0), reshape23_5->getOutput(0)}; nvinfer1::IConcatenationLayer* cat23_1 = network->addConcatenation(tensor23_1, 3); cat23_1->setAxis(2); nvinfer1::IActivationLayer* sigmoid23 = network->addActivation( *cat23_1->getOutput(0), nvinfer1::ActivationType::kSIGMOID); // TODO: THIS IS UNNESSARY, REMOVE AFTER PLUGIN IS READY ///////////////////////////////////////////////////// nvinfer1::ITensor* tensor23_5[] = {mul23_4->getOutput(0), sigmoid23->getOutput(0), cat23_2->getOutput(0)}; nvinfer1::IConcatenationLayer* cat23_5 = network->addConcatenation(tensor23_5, 3); cat23_5->setAxis(1); nvinfer1::IShuffleLayer* transpose = network->addShuffle(*cat23_5->getOutput(0)); transpose->setFirstTranspose(nvinfer1::Permutation{0, 2, 1}); nvinfer1::IPluginV2Layer* yolo = addYoloLayer(network, *transpose->getOutput(0), strides, fm_sizes, strides.size(), false, false, false, true, grid_num); ///////////////////////////////////////////////////// yolo->getOutput(0)->setName(kOutputTensorName); network->markOutput(*yolo->getOutput(0)); // Use setMemoryPoolLimit instead of deprecated setMaxWorkspaceSize config->setMemoryPoolLimit(nvinfer1::MemoryPoolType::kWORKSPACE, 16 * (1 << 20)); #if defined(USE_FP16) 
// Tail of buildEngineYolo26Obb: optional FP16 flag, engine serialization and cleanup.
    config->setFlag(nvinfer1::BuilderFlag::kFP16);
#elif defined(USE_INT8)
    // INT8 is only reported here; no INT8 builder flag is set on this path.
    std::cerr << "INT8 not supported for YOLO26 model yet." << std::endl;
#endif

    std::cout << "Building engine, please wait for a while..." << std::endl;
    nvinfer1::IHostMemory* serialized_model = builder->buildSerializedNetwork(*network, *config);
    // NOTE(review): the success message is printed without checking serialized_model.
    std::cout << "Build engine successfully!" << std::endl;

    delete network;

    // Release every weight buffer stored in weightMap once serialization has been attempted.
    for (auto& mem : weightMap) {
        free((void*)(mem.second.values));
    }
    return serialized_model;
}

// Builds the YOLO26 classification network with the TensorRT network API and returns the
// result of builder->buildSerializedNetwork().
//   builder / config  : TensorRT builder and its configuration (workspace limit and FP16 flag are set here).
//   dt                : data type of the network input tensor.
//   wts_path          : weight file handed to loadWeights().
//   gd, gw, max_channels : forwarded to get_depth() / get_width() when sizing layers.
//   type              : "m", "l" or "x" switches the c3k flag on for the model.2 and model.4 stages.
// NOTE(review): template arguments (std::map, static_cast) appear to be missing in this copy of the file.
nvinfer1::IHostMemory* buildEngineYolo26Cls(nvinfer1::IBuilder* builder, nvinfer1::IBuilderConfig* config,
                                            nvinfer1::DataType dt, const std::string& wts_path, float& gd, float& gw,
                                            int& max_channels, std::string& type) {
    std::map weightMap = loadWeights(wts_path);
    nvinfer1::INetworkDefinition* network = builder->createNetworkV2(
            1U << static_cast(nvinfer1::NetworkDefinitionCreationFlag::kEXPLICIT_BATCH));

    /*******************************************************************************************************
    ******************************************  YOLO26 INPUT  **********************************************
    *******************************************************************************************************/
    // Input tensor: kBatchSize x 3 x kClsInputH x kClsInputW.
    nvinfer1::ITensor* data =
            network->addInput(kInputTensorName, dt, nvinfer1::Dims4{kBatchSize, 3, kClsInputH, kClsInputW});
    assert(data);

    /*******************************************************************************************************
    *****************************************  YOLO26 BACKBONE  ********************************************
    *******************************************************************************************************/
    // Each stage consumes the output of the previous one; the last string argument is the
    // "model.N" weight-name prefix passed to the layer helper.
    nvinfer1::IElementWiseLayer* block0 =
            convBnSiLU(network, weightMap, *data, get_width(64, gw, max_channels), {3, 3}, 2, "model.0");
    nvinfer1::IElementWiseLayer* block1 = convBnSiLU(network, weightMap, *block0->getOutput(0),
                                                     get_width(128, gw, max_channels), {3, 3}, 2, "model.1");

    // c3k is enabled only for the "m", "l" and "x" model types.
    bool c3k = false;
    if (type == "m" || type == "l" || type == "x") {
// Continuation of buildEngineYolo26Cls: remaining backbone stages (model.2 .. model.9) and the
// start of the classification head (model.10).
        c3k = true;
    }

    // model.2 and model.4 take the type-dependent c3k flag; model.6 and model.8 always pass true.
    nvinfer1::IElementWiseLayer* conv2 =
            C3K2(network, weightMap, *block1->getOutput(0), get_width(128, gw, max_channels),
                 get_width(256, gw, max_channels), get_depth(2, gd), c3k, true, false, 0.25, "model.2");
    nvinfer1::IElementWiseLayer* block3 = convBnSiLU(network, weightMap, *conv2->getOutput(0),
                                                     get_width(256, gw, max_channels), {3, 3}, 2, "model.3");
    nvinfer1::IElementWiseLayer* block4 =
            C3K2(network, weightMap, *block3->getOutput(0), get_width(256, gw, max_channels),
                 get_width(512, gw, max_channels), get_depth(2, gd), c3k, true, false, 0.25, "model.4");
    nvinfer1::IElementWiseLayer* block5 = convBnSiLU(network, weightMap, *block4->getOutput(0),
                                                     get_width(512, gw, max_channels), {3, 3}, 2, "model.5");
    nvinfer1::IElementWiseLayer* block6 =
            C3K2(network, weightMap, *block5->getOutput(0), get_width(512, gw, max_channels),
                 get_width(512, gw, max_channels), get_depth(2, gd), true, true, false, 0.5, "model.6");
    nvinfer1::IElementWiseLayer* block7 = convBnSiLU(network, weightMap, *block6->getOutput(0),
                                                     get_width(1024, gw, max_channels), {3, 3}, 2, "model.7");
    nvinfer1::IElementWiseLayer* block8 =
            C3K2(network, weightMap, *block7->getOutput(0), get_width(1024, gw, max_channels),
                 get_width(1024, gw, max_channels), get_depth(2, gd), true, true, false, 0.5, "model.8");
    // Unlike the OBB builder, no SPPF stage is inserted here: C2PSA follows model.8 directly.
    nvinfer1::IElementWiseLayer* block9 =
            C2PSA(network, weightMap, *block8->getOutput(0), get_width(1024, gw, max_channels),
                  get_width(1024, gw, max_channels), get_depth(2, gd), 0.5, "model.9");

    /////////////////////////////////////////////////////
    // Classification head: a 1x1 convBnSiLU to a fixed 1280 channels (not scaled through get_width).
    nvinfer1::IElementWiseLayer* block10_convbn =
            convBnSiLU(network, weightMap, *block9->getOutput(0), 1280, {1, 1}, 1, "model.10.conv");

    nvinfer1::Dims dims =
            block10_convbn->getOutput(0)->getDimensions();  // Obtain the dimensions of the output of conv_class
    assert(dims.nbDims == 4);
    // Average pooling whose window is the full d[2] x d[3] extent of the conv output.
    nvinfer1::IPoolingLayer* block10_pool = network->addPoolingNd(
            *block10_convbn->getOutput(0), nvinfer1::PoolingType::kAVERAGE, nvinfer1::DimsHW{dims.d[2], dims.d[3]});
// Continuation of buildEngineYolo26Cls: linear layer of the classification head, output
// registration and engine build.
    // Reshape the pooled features to {kBatchSize, 1280} so they can feed a matrix multiply.
    nvinfer1::IShuffleLayer* block10_reshape = network->addShuffle(*block10_pool->getOutput(0));
    block10_reshape->setReshapeDimensions(nvinfer1::Dims2{kBatchSize, 1280});

    // Linear layer built from constants: weight is {kClsNumClass, 1280}, bias is {kClsNumClass, 1}.
    nvinfer1::IConstantLayer* block10_linear_weight =
            network->addConstant(nvinfer1::Dims2{kClsNumClass, 1280}, weightMap["model.10.linear.weight"]);
    nvinfer1::IConstantLayer* block10_linear_bias =
            network->addConstant(nvinfer1::Dims2{kClsNumClass, 1}, weightMap["model.10.linear.bias"]);
    // features * weight^T, i.e. {kBatchSize, 1280} x {1280, kClsNumClass}.
    nvinfer1::IMatrixMultiplyLayer* block10_linear_matrix_multiply =
            network->addMatrixMultiply(*block10_reshape->getOutput(0), nvinfer1::MatrixOperation::kNONE,
                                       *block10_linear_weight->getOutput(0), nvinfer1::MatrixOperation::kTRANSPOSE);
    // NOTE(review): the product is {kBatchSize, kClsNumClass} while the bias constant is
    // {kClsNumClass, 1}; confirm that element-wise broadcasting of these two shapes yields the
    // intended per-class bias add.
    nvinfer1::IElementWiseLayer* block10_linear_add =
            network->addElementWise(*block10_linear_matrix_multiply->getOutput(0),
                                    *block10_linear_bias->getOutput(0), nvinfer1::ElementWiseOperation::kSUM);

    // A sigmoid activation produces the tensor that is marked as the network output.
    nvinfer1::IActivationLayer* output =
            network->addActivation(*block10_linear_add->getOutput(0), nvinfer1::ActivationType::kSIGMOID);
    assert(output);

    output->getOutput(0)->setName(kOutputTensorName);
    network->markOutput(*output->getOutput(0));

    // Use setMemoryPoolLimit instead of deprecated setMaxWorkspaceSize
    // Workspace pool limit: 16 * (1 << 20) bytes.
    config->setMemoryPoolLimit(nvinfer1::MemoryPoolType::kWORKSPACE, 16 * (1 << 20));

#if defined(USE_FP16)
    config->setFlag(nvinfer1::BuilderFlag::kFP16);
#elif defined(USE_INT8)
    // INT8 is only reported here; no INT8 builder flag is set on this path.
    std::cerr << "INT8 not supported for YOLO26 model yet." << std::endl;
#endif

    std::cout << "Building engine, please wait for a while..." << std::endl;
    nvinfer1::IHostMemory* serialized_model = builder->buildSerializedNetwork(*network, *config);
    // NOTE(review): the success message is printed without checking serialized_model.
    std::cout << "Build engine successfully!"
// Tail of buildEngineYolo26Cls: finish the status message, then release the network and weights.
              << std::endl;

    delete network;

    // Release every weight buffer stored in weightMap.
    for (auto& mem : weightMap) {
        free((void*)(mem.second.values));
    }
    return serialized_model;
}

================================================
FILE: yolo26/src/postprocess.cpp
================================================
#include "postprocess.h"
#include "utils.h"

// Maps a box from network-input coordinates (kInputW x kInputH) back to the pixel
// coordinates of `img`.
//   bbox[0] / bbox[2] are used as the left / right edges, bbox[1] / bbox[3] as top / bottom.
// The smaller of the two resize ratios is used as the scale; along the other axis the centred
// padding is subtracted first. Left/top are clamped to >= 0 and width/height are clamped to
// [0, remaining image extent].
cv::Rect get_rect(cv::Mat& img, float bbox[4]) {
    float l, r, t, b;
    float r_w = kInputW / (img.cols * 1.0);
    float r_h = kInputH / (img.rows * 1.0);
    if (r_h > r_w) {
        // Width ratio is the scale: remove the vertical padding from top/bottom.
        l = bbox[0];
        r = bbox[2];
        t = bbox[1] - (kInputH - r_w * img.rows) / 2;
        b = bbox[3] - (kInputH - r_w * img.rows) / 2;
        l = l / r_w;
        r = r / r_w;
        t = t / r_w;
        b = b / r_w;
    } else {
        // Height ratio is the scale: remove the horizontal padding from left/right.
        l = bbox[0] - (kInputW - r_h * img.cols) / 2;
        r = bbox[2] - (kInputW - r_h * img.cols) / 2;
        t = bbox[1];
        b = bbox[3];
        l = l / r_h;
        r = r / r_h;
        t = t / r_h;
        b = b / r_h;
    }
    l = std::max(0.0f, l);
    t = std::max(0.0f, t);
    // Never report a negative size or a size reaching past the right/bottom image border.
    int width = std::max(0, std::min(int(round(r - l)), img.cols - int(round(l))));
    int height = std::max(0, std::min(int(round(b - t)), img.rows - int(round(t))));

    return cv::Rect(int(round(l)), int(round(t)), width, height);
}

// Same mapping as get_rect, but relative to the OBB network input size
// (kObbInputW x kObbInputH) instead of kInputW x kInputH.
cv::Rect get_rect_obb(cv::Mat& img, float bbox[4]) {
    float l, r, t, b;
    float r_w = kObbInputW / (img.cols * 1.0);
    float r_h = kObbInputH / (img.rows * 1.0);
    if (r_h > r_w) {
        // Width ratio is the scale: remove the vertical padding from top/bottom.
        l = bbox[0];
        r = bbox[2];
        t = bbox[1] - (kObbInputH - r_w * img.rows) / 2;
        b = bbox[3] - (kObbInputH - r_w * img.rows) / 2;
        l = l / r_w;
        r = r / r_w;
        t = t / r_w;
        b = b / r_w;
    } else {
        // Height ratio is the scale: remove the horizontal padding from left/right.
        l = bbox[0] - (kObbInputW - r_h * img.cols) / 2;
        r = bbox[2] - (kObbInputW - r_h * img.cols) / 2;
        t = bbox[1];
        b = bbox[3];
        l = l / r_h;
        r = r / r_h;
        t = t / r_h;
        b = b / r_h;
    }
    l = std::max(0.0f, l);
    t = std::max(0.0f, t);
    int width = std::max(0, std::min(int(round(r - l)), img.cols - int(round(l))));
    int height = std::max(0, std::min(int(round(b - t)), img.rows - int(round(t))));

    return cv::Rect(int(round(l)), int(round(t)), width, height);
}

// Maps a box AND its landmarks from network-input coordinates back to `img` coordinates.
// lmk holds kNumberOfPoints triples; elements i and i + 1 of each triple are rescaled IN PLACE
// as x and y, while element i + 2 is left untouched.
cv::Rect get_rect_adapt_landmark(cv::Mat& img, float bbox[4], float lmk[kNumberOfPoints * 3]) {
    float l, r,
t, b; float r_w = kInputW / (img.cols * 1.0); float r_h = kInputH / (img.rows * 1.0); if (r_h > r_w) { l = bbox[0] / r_w; r = bbox[2] / r_w; t = (bbox[1] - (kInputH - r_w * img.rows) / 2) / r_w; b = (bbox[3] - (kInputH - r_w * img.rows) / 2) / r_w; for (int i = 0; i < kNumberOfPoints * 3; i += 3) { lmk[i] /= r_w; lmk[i + 1] = (lmk[i + 1] - (kInputH - r_w * img.rows) / 2) / r_w; // lmk[i + 2] } } else { l = (bbox[0] - (kInputW - r_h * img.cols) / 2) / r_h; r = (bbox[2] - (kInputW - r_h * img.cols) / 2) / r_h; t = bbox[1] / r_h; b = bbox[3] / r_h; for (int i = 0; i < kNumberOfPoints * 3; i += 3) { lmk[i] = (lmk[i] - (kInputW - r_h * img.cols) / 2) / r_h; lmk[i + 1] /= r_h; // lmk[i + 2] } } l = std::max(0.0f, l); t = std::max(0.0f, t); int width = std::max(0, std::min(int(round(r - l)), img.cols - int(round(l)))); int height = std::max(0, std::min(int(round(b - t)), img.rows - int(round(t)))); return cv::Rect(int(round(l)), int(round(t)), width, height); } static float iou(float lbox[4], float rbox[4]) { float interBox[] = { (std::max)(lbox[0], rbox[0]), (std::min)(lbox[2], rbox[2]), (std::max)(lbox[1], rbox[1]), (std::min)(lbox[3], rbox[3]), }; if (interBox[2] > interBox[3] || interBox[0] > interBox[1]) return 0.0f; float interBoxS = (interBox[1] - interBox[0]) * (interBox[3] - interBox[2]); float unionBoxS = (lbox[2] - lbox[0]) * (lbox[3] - lbox[1]) + (rbox[2] - rbox[0]) * (rbox[3] - rbox[1]) - interBoxS; return interBoxS / unionBoxS; } static bool cmp(const Detection& a, const Detection& b) { if (a.conf == b.conf) { return a.bbox[0] < b.bbox[0]; } return a.conf > b.conf; } void decode(std::vector& res, float* output) { int det_size = sizeof(Detection) / sizeof(float); std::map> m; for (int i = 0; i < output[0]; i++) { Detection det; memcpy(&det, &output[1 + det_size * i], det_size * sizeof(float)); res.push_back(det); } } void batch_decode(std::vector>& res_batch, float* output, int batch_size, int output_size) { res_batch.resize(batch_size); for (int i = 0; i < 
// Continuation of batch_decode: remainder of the per-image loop.
                        batch_size; i++) {
        decode(res_batch[i], &output[i * output_size]);
    }
}

// Draws every detection of every image in the batch: a rectangle (mapped back to image
// coordinates with get_rect) plus the numeric class id just above its top-left corner.
// res_batch[i] is read for each index of img_batch. cv::Mat copies share pixel data, so the
// drawing lands in img_batch[i].
// NOTE(review): container element types appear to have been lost in this copy of the file.
void draw_bbox(std::vector& img_batch, std::vector>& res_batch) {
    for (size_t i = 0; i < img_batch.size(); i++) {
        auto& res = res_batch[i];
        cv::Mat img = img_batch[i];
        for (size_t j = 0; j < res.size(); j++) {
            cv::Rect r = get_rect(img, res[j].bbox);
            cv::rectangle(img, r, cv::Scalar(0x27, 0xC1, 0x36), 2);
            cv::putText(img, std::to_string((int)res[j].class_id), cv::Point(r.x, r.y - 1), cv::FONT_HERSHEY_PLAIN,
                        1.2, cv::Scalar(0xFF, 0xFF, 0xFF), 2);
        }
    }
}

// Draws boxes, keypoints and skeleton lines for every detection in the batch.
// get_rect_adapt_landmark rescales res[j].keypoints in place, so after this call the stored
// keypoints are in image coordinates. A keypoint is drawn only when the third value of its
// triple is > 0.5; a bone is drawn only when both of its end points pass that test.
void draw_bbox_keypoints_line(std::vector& img_batch, std::vector>& res_batch) {
    // Pairs of keypoint indices joined by a line. Indices run up to 16, so this presumably
    // expects kNumberOfPoints >= 17 — TODO confirm.
    const std::vector> skeleton_pairs = {
            {0, 1}, {0, 2},  {0, 5}, {0, 6},  {1, 2},   {1, 3},   {2, 4},   {5, 6},   {5, 7},  {5, 11},
            {6, 8}, {6, 12}, {7, 9}, {8, 10}, {11, 12}, {11, 13}, {12, 14}, {13, 15}, {14, 16}};

    for (size_t i = 0; i < img_batch.size(); i++) {
        auto& res = res_batch[i];
        cv::Mat img = img_batch[i];
        for (size_t j = 0; j < res.size(); j++) {
            cv::Rect r = get_rect_adapt_landmark(img, res[j].bbox, res[j].keypoints);
            cv::rectangle(img, r, cv::Scalar(0x27, 0xC1, 0x36), 2);
            cv::putText(img, std::to_string((int)res[j].class_id), cv::Point(r.x, r.y - 1), cv::FONT_HERSHEY_PLAIN,
                        1.2, cv::Scalar(0xFF, 0xFF, 0xFF), 2);

            // Keypoints: filled circles of radius 3.
            for (int k = 0; k < kNumberOfPoints * 3; k += 3) {
                if (res[j].keypoints[k + 2] > 0.5) {
                    cv::circle(img, cv::Point((int)res[j].keypoints[k], (int)res[j].keypoints[k + 1]), 3,
                               cv::Scalar(0, 0x27, 0xC1), -1);
                }
            }

            // Skeleton: one line per pair whose two end points both pass the 0.5 test.
            for (const auto& bone : skeleton_pairs) {
                int kp1_idx = bone.first * 3;
                int kp2_idx = bone.second * 3;
                if (res[j].keypoints[kp1_idx + 2] > 0.5 && res[j].keypoints[kp2_idx + 2] > 0.5) {
                    cv::Point p1((int)res[j].keypoints[kp1_idx], (int)res[j].keypoints[kp1_idx + 1]);
                    cv::Point p2((int)res[j].keypoints[kp2_idx], (int)res[j].keypoints[kp2_idx + 1]);
                    cv::line(img, p1, p2, cv::Scalar(0, 0x27, 0xC1), 2);
                }
            }
        }
    }
}

// Crops the region of `mask` that corresponds to the un-padded image area and resizes it to the
// size of `img`. The crop rectangle is derived from the same resize ratios used by get_rect.
cv::Mat scale_mask(cv::Mat mask, cv::Mat img) {
    int x, y, w, h;
    float r_w = kInputW / (img.cols * 1.0);
    float r_h = kInputH / (img.rows * 1.0);
    if
(r_h > r_w) { w = kInputW; h = r_w * img.rows; x = 0; y = (kInputH - h) / 2; } else { w = r_h * img.cols; h = kInputH; x = (kInputW - w) / 2; y = 0; } cv::Rect r(x, y, w, h); cv::Mat res; cv::resize(mask(r), res, img.size()); return res; } void draw_mask_bbox(cv::Mat& img, std::vector& dets, std::vector& masks, std::unordered_map& labels_map) { static std::vector colors = {0xFF3838, 0xFF9D97, 0xFF701F, 0xFFB21D, 0xCFD231, 0x48F90A, 0x92CC17, 0x3DDB86, 0x1A9334, 0x00D4BB, 0x2C99A8, 0x00C2FF, 0x344593, 0x6473FF, 0x0018EC, 0x8438FF, 0x520085, 0xCB38FF, 0xFF95C8, 0xFF37C7}; for (size_t i = 0; i < dets.size(); i++) { cv::Mat img_mask = scale_mask(masks[i], img); auto color = colors[(int)dets[i].class_id % colors.size()]; auto bgr = cv::Scalar(color & 0xFF, color >> 8 & 0xFF, color >> 16 & 0xFF); cv::Rect r = get_rect(img, dets[i].bbox); for (int x = r.x; x < r.x + r.width; x++) { for (int y = r.y; y < r.y + r.height; y++) { float val = img_mask.at(y, x); if (val <= 0.5) continue; img.at(y, x)[0] = img.at(y, x)[0] / 2 + bgr[0] / 2; img.at(y, x)[1] = img.at(y, x)[1] / 2 + bgr[1] / 2; img.at(y, x)[2] = img.at(y, x)[2] / 2 + bgr[2] / 2; } } cv::rectangle(img, r, bgr, 2); // Get the size of the text cv::Size textSize = cv::getTextSize(labels_map[(int)dets[i].class_id] + " " + to_string_with_precision(dets[i].conf), cv::FONT_HERSHEY_PLAIN, 1.2, 2, NULL); // Set the top left corner of the rectangle cv::Point topLeft(r.x, r.y - textSize.height); // Set the bottom right corner of the rectangle cv::Point bottomRight(r.x + textSize.width, r.y + textSize.height); // Set the thickness of the rectangle lines int lineThickness = 2; // Draw the rectangle on the image cv::rectangle(img, topLeft, bottomRight, bgr, -1); cv::putText(img, labels_map[(int)dets[i].class_id] + " " + to_string_with_precision(dets[i].conf), cv::Point(r.x, r.y + 4), cv::FONT_HERSHEY_PLAIN, 1.2, cv::Scalar::all(0xFF), 2); } } std::tuple convariance_matrix(Detection res) { float w = res.bbox[2]; float h = 
res.bbox[3]; float a = w * w / 12.0; float b = h * h / 12.0; float c = res.angle; float cos_r = std::cos(c); float sin_r = std::sin(c); float cos_r2 = cos_r * cos_r; float sin_r2 = sin_r * sin_r; float a_val = a * cos_r2 + b * sin_r2; float b_val = a * sin_r2 + b * cos_r2; float c_val = (a - b) * cos_r * sin_r; return std::make_tuple(a_val, b_val, c_val); } static float probiou(const Detection& res1, const Detection& res2, float eps = 1e-7) { // Calculate the prob iou between oriented bounding boxes, https://arxiv.org/pdf/2106.06072v1.pdf. float a1, b1, c1, a2, b2, c2; std::tuple matrix1 = {a1, b1, c1}; std::tuple matrix2 = {a2, b2, c2}; matrix1 = convariance_matrix(res1); matrix2 = convariance_matrix(res2); a1 = std::get<0>(matrix1); b1 = std::get<1>(matrix1); c1 = std::get<2>(matrix1); a2 = std::get<0>(matrix2); b2 = std::get<1>(matrix2); c2 = std::get<2>(matrix2); float x1 = res1.bbox[0], y1 = res1.bbox[1]; float x2 = res2.bbox[0], y2 = res2.bbox[1]; float t1 = ((a1 + a2) * std::pow(y1 - y2, 2) + (b1 + b2) * std::pow(x1 - x2, 2)) / ((a1 + a2) * (b1 + b2) - std::pow(c1 + c2, 2) + eps); float t2 = ((c1 + c2) * (x2 - x1) * (y1 - y2)) / ((a1 + a2) * (b1 + b2) - std::pow(c1 + c2, 2) + eps); float t3 = std::log( ((a1 + a2) * (b1 + b2) - std::pow(c1 + c2, 2)) / (4 * std::sqrt(std::max(a1 * b1 - c1 * c1, 0.0f)) * std::sqrt(std::max(a2 * b2 - c2 * c2, 0.0f)) + eps) + eps); float bd = 0.25f * t1 + 0.5f * t2 + 0.5f * t3; bd = std::max(std::min(bd, 100.0f), eps); float hd = std::sqrt(1.0 - std::exp(-bd) + eps); return 1 - hd; } void decode_obb(std::vector& res, float* output) { int det_size = sizeof(Detection) / sizeof(float); std::map> m; for (int i = 0; i < output[0]; i++) { Detection det; memcpy(&det, &output[1 + det_size * i], det_size * sizeof(float)); res.push_back(det); } } void batch_decode_obb(std::vector>& res_batch, float* output, int batch_size, int output_size) { res_batch.resize(batch_size); for (int i = 0; i < batch_size; i++) { decode_obb(res_batch[i], 
&output[i * output_size]); } } static std::vector get_corner(cv::Mat& img, const Detection& box) { float cos_value, sin_value; // Calculate center point and width/height float x1 = box.bbox[0]; float y1 = box.bbox[1]; float w = box.bbox[2]; float h = box.bbox[3]; float angle = box.angle * 180.0f / CV_PI; // Convert radians to degrees // Print original angle std::cout << "Original angle: " << angle << std::endl; // Swap width and height if height is greater than or equal to width if (h >= w) { std::swap(w, h); angle = fmod(angle + 90.0f, 180.0f); // Adjust angle to be within [0, 180) } // Ensure the angle is between 0 and 180 degrees if (angle < 0) { angle += 360.0f; // Convert to positive value } if (angle > 180.0f) { angle -= 180.0f; // Subtract 180 from angles greater than 180 } // Print adjusted angle std::cout << "Adjusted angle: " << angle << std::endl; // Convert to normal angle value float normal_angle = fmod(angle, 180.0f); if (normal_angle < 0) { normal_angle += 180.0f; // Ensure it's a positive value } // Print normal angle value std::cout << "Normal angle: " << normal_angle << std::endl; cos_value = std::cos(angle * CV_PI / 180.0f); // Convert to radians sin_value = std::sin(angle * CV_PI / 180.0f); // Calculate each corner point float l = x1 - w / 2; // Left boundary float r = x1 + w / 2; // Right boundary float t = y1 - h / 2; // Top boundary float b = y1 + h / 2; // Bottom boundary // Use get_rect function to scale the coordinates float bbox[4] = {l, t, r, b}; cv::Rect rect = get_rect_obb(img, bbox); float x_ = (rect.x + rect.x + rect.width) / 2; // Center x float y_ = (rect.y + rect.y + rect.height) / 2; // Center y float width = rect.width; // Width float height = rect.height; // Height // Calculate each corner point std::vector corner_points(4); float vec1x = width / 2 * cos_value; float vec1y = width / 2 * sin_value; float vec2x = -height / 2 * sin_value; float vec2y = height / 2 * cos_value; corner_points[0] = cv::Point(int(round(x_ + vec1x + 
vec2x)), int(round(y_ + vec1y + vec2y))); // Top-left corner corner_points[1] = cv::Point(int(round(x_ + vec1x - vec2x)), int(round(y_ + vec1y - vec2y))); // Top-right corner corner_points[2] = cv::Point(int(round(x_ - vec1x - vec2x)), int(round(y_ - vec1y - vec2y))); // Bottom-right corner corner_points[3] = cv::Point(int(round(x_ - vec1x + vec2x)), int(round(y_ - vec1y + vec2y))); // Bottom-left corner // Check and adjust corner points to ensure the rectangle is parallel to image boundaries for (auto& point : corner_points) { point.x = std::max(0, std::min(point.x, img.cols - 1)); point.y = std::max(0, std::min(point.y, img.rows - 1)); } return corner_points; } void draw_bbox_obb(std::vector& img_batch, std::vector>& res_batch) { static std::vector colors = {0xFF3838, 0xFF9D97, 0xFF701F, 0xFFB21D, 0xCFD231, 0x48F90A, 0x92CC17, 0x3DDB86, 0x1A9334, 0x00D4BB, 0x2C99A8, 0x00C2FF, 0x344593, 0x6473FF, 0x0018EC, 0x8438FF, 0x520085, 0xCB38FF, 0xFF95C8, 0xFF37C7}; for (size_t i = 0; i < img_batch.size(); i++) { auto& res = res_batch[i]; auto& img = img_batch[i]; for (auto& obj : res) { auto color = colors[(int)obj.class_id % colors.size()]; auto bgr = cv::Scalar(color & 0xFF, color >> 8 & 0xFF, color >> 16 & 0xFF); auto corner_points = get_corner(img, obj); cv::polylines(img, std::vector>{corner_points}, true, bgr, 1); auto text = (std::to_string((int)(obj.class_id)) + ":" + to_string_with_precision(obj.conf)); cv::Size textsize = cv::getTextSize(text, 0, 0.3, 1, nullptr); int width = textsize.width; int height = textsize.height; bool outside = (corner_points[0].y - height >= 3) ? true : false; cv::Point p1(corner_points[0].x, corner_points[0].y), p2; p2.x = corner_points[0].x + width; if (outside) { p2.y = corner_points[0].y - height - 3; } else { p2.y = corner_points[0].y + height + 3; } cv::rectangle(img, p1, p2, bgr, -1, cv::LINE_AA); cv::putText( img, text, cv::Point(corner_points[0].x, (outside ? 
corner_points[0].y - 2 : corner_points[0].y + height + 2)), 0, 0.3, cv::Scalar::all(255), 1, cv::LINE_AA); } } } ================================================ FILE: yolo26/src/preprocess.cu ================================================ #include "cuda_utils.h" #include "preprocess.h" static uint8_t* img_buffer_host = nullptr; static uint8_t* img_buffer_device = nullptr; __global__ void warpaffine_kernel(uint8_t* src, int src_line_size, int src_width, int src_height, float* dst, int dst_width, int dst_height, uint8_t const_value_st, AffineMatrix d2s, int edge) { int position = blockDim.x * blockIdx.x + threadIdx.x; if (position >= edge) return; float m_x1 = d2s.value[0]; float m_y1 = d2s.value[1]; float m_z1 = d2s.value[2]; float m_x2 = d2s.value[3]; float m_y2 = d2s.value[4]; float m_z2 = d2s.value[5]; int dx = position % dst_width; int dy = position / dst_width; float src_x = m_x1 * dx + m_y1 * dy + m_z1 + 0.5f; float src_y = m_x2 * dx + m_y2 * dy + m_z2 + 0.5f; float c0, c1, c2; if (src_x <= -1 || src_x >= src_width || src_y <= -1 || src_y >= src_height) { // out of range c0 = const_value_st; c1 = const_value_st; c2 = const_value_st; } else { int y_low = floorf(src_y); int x_low = floorf(src_x); int y_high = y_low + 1; int x_high = x_low + 1; uint8_t const_value[] = {const_value_st, const_value_st, const_value_st}; float ly = src_y - y_low; float lx = src_x - x_low; float hy = 1 - ly; float hx = 1 - lx; float w1 = hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx; uint8_t* v1 = const_value; uint8_t* v2 = const_value; uint8_t* v3 = const_value; uint8_t* v4 = const_value; if (y_low >= 0) { if (x_low >= 0) v1 = src + y_low * src_line_size + x_low * 3; if (x_high < src_width) v2 = src + y_low * src_line_size + x_high * 3; } if (y_high < src_height) { if (x_low >= 0) v3 = src + y_high * src_line_size + x_low * 3; if (x_high < src_width) v4 = src + y_high * src_line_size + x_high * 3; } c0 = w1 * v1[0] + w2 * v2[0] + w3 * v3[0] + w4 * v4[0]; c1 = w1 * v1[1] + w2 * 
v2[1] + w3 * v3[1] + w4 * v4[1]; c2 = w1 * v1[2] + w2 * v2[2] + w3 * v3[2] + w4 * v4[2]; } // bgr to rgb float t = c2; c2 = c0; c0 = t; // normalization c0 = c0 / 255.0f; c1 = c1 / 255.0f; c2 = c2 / 255.0f; // rgbrgbrgb to rrrgggbbb int area = dst_width * dst_height; float* pdst_c0 = dst + dy * dst_width + dx; float* pdst_c1 = pdst_c0 + area; float* pdst_c2 = pdst_c1 + area; *pdst_c0 = c0; *pdst_c1 = c1; *pdst_c2 = c2; } void cuda_preprocess(uint8_t* src, int src_width, int src_height, float* dst, int dst_width, int dst_height, cudaStream_t stream) { int img_size = src_width * src_height * 3; // copy data to pinned memory memcpy(img_buffer_host, src, img_size); // copy data to device memory CUDA_CHECK(cudaMemcpyAsync(img_buffer_device, img_buffer_host, img_size, cudaMemcpyHostToDevice, stream)); AffineMatrix s2d, d2s; float scale = std::min(dst_height / (float)src_height, dst_width / (float)src_width); s2d.value[0] = scale; s2d.value[1] = 0; s2d.value[2] = -scale * src_width * 0.5 + dst_width * 0.5; s2d.value[3] = 0; s2d.value[4] = scale; s2d.value[5] = -scale * src_height * 0.5 + dst_height * 0.5; cv::Mat m2x3_s2d(2, 3, CV_32F, s2d.value); cv::Mat m2x3_d2s(2, 3, CV_32F, d2s.value); cv::invertAffineTransform(m2x3_s2d, m2x3_d2s); memcpy(d2s.value, m2x3_d2s.ptr(0), sizeof(d2s.value)); int jobs = dst_height * dst_width; int threads = 256; int blocks = ceil(jobs / (float)threads); warpaffine_kernel<<>>(img_buffer_device, src_width * 3, src_width, src_height, dst, dst_width, dst_height, 128, d2s, jobs); } void cuda_batch_preprocess(std::vector& img_batch, float* dst, int dst_width, int dst_height, cudaStream_t stream) { int dst_size = dst_width * dst_height * 3; for (size_t i = 0; i < img_batch.size(); i++) { cuda_preprocess(img_batch[i].ptr(), img_batch[i].cols, img_batch[i].rows, &dst[dst_size * i], dst_width, dst_height, stream); CUDA_CHECK(cudaStreamSynchronize(stream)); } } void cuda_preprocess_init(int max_image_size) { // prepare input data in pinned memory 
CUDA_CHECK(cudaMallocHost((void**)&img_buffer_host, max_image_size * 3)); // prepare input data in device memory CUDA_CHECK(cudaMalloc((void**)&img_buffer_device, max_image_size * 3)); } void cuda_preprocess_destroy() { CUDA_CHECK(cudaFree(img_buffer_device)); CUDA_CHECK(cudaFreeHost(img_buffer_host)); } ================================================ FILE: yolo26/yolo26_cls.cpp ================================================ #include #include #include #include #include "cuda_utils.h" #include "logging.h" #include "model.h" #include "postprocess.h" #include "preprocess.h" #include "types.h" #include "utils.h" #include "yololayer.h" Logger gLogger; using namespace nvinfer1; const static int kOutputSize = kClsNumClass; void batch_preprocess(std::vector& imgs, float* output, int dst_width = 224, int dst_height = 224) { for (size_t b = 0; b < imgs.size(); b++) { int h = imgs[b].rows; int w = imgs[b].cols; int m = std::min(h, w); int top = (h - m) / 2; int left = (w - m) / 2; cv::Mat img = imgs[b](cv::Rect(left, top, m, m)); cv::resize(img, img, cv::Size(dst_width, dst_height), 0, 0, cv::INTER_LINEAR); cv::cvtColor(img, img, cv::COLOR_BGR2RGB); img.convertTo(img, CV_32F, 1 / 255.0); std::vector channels(3); cv::split(img, channels); // CHW format for (int c = 0; c < 3; ++c) { int i = 0; for (int row = 0; row < dst_height; ++row) { for (int col = 0; col < dst_width; ++col) { output[b * 3 * dst_height * dst_width + c * dst_height * dst_width + i] = channels[c].at(row, col); ++i; } } } } } void serialize_engine(const std::string& wts_name, std::string& engine_name, float& gd, float& gw, int& max_channels, std::string& type) { IBuilder* builder = createInferBuilder(gLogger); IBuilderConfig* config = builder->createBuilderConfig(); IHostMemory* serialized_engine = buildEngineYolo26Cls(builder, config, DataType::kFLOAT, wts_name, gd, gw, max_channels, type); assert(serialized_engine); std::ofstream p(engine_name, std::ios::binary); if (!p) { std::cout << "could not open 
plan output file" << std::endl; assert(false); } p.write(reinterpret_cast(serialized_engine->data()), serialized_engine->size()); delete serialized_engine; delete config; delete builder; } void deserialize_engine(std::string& engine_name, IRuntime** runtime, ICudaEngine** engine, IExecutionContext** context) { std::ifstream file(engine_name, std::ios::binary); if (!file.good()) { std::cerr << "read " << engine_name << " error!" << std::endl; assert(false); } size_t size = 0; file.seekg(0, file.end); size = file.tellg(); file.seekg(0, file.beg); char* serialized_engine = new char[size]; assert(serialized_engine); file.read(serialized_engine, size); file.close(); *runtime = createInferRuntime(gLogger); assert(*runtime); *engine = (*runtime)->deserializeCudaEngine(serialized_engine, size); assert(*engine); *context = (*engine)->createExecutionContext(); assert(*context); delete[] serialized_engine; } void prepare_buffer(ICudaEngine* engine, float** input_buffer_device, float** output_buffer_device, float** input_buffer_host, float** output_buffer_host) { assert(engine->getNbBindings() == 2); // In order to bind the buffers, we need to know the names of the input and output tensors. 
// Returns the indices of the k largest values of vec, ordered from largest to
// smallest value.
//
// If k exceeds vec.size(), all indices are returned; if k <= 0 or vec is
// empty, the result is empty. Equal values are ordered by ascending index so
// the result is deterministic.
std::vector<int> topk(const std::vector<float>& vec, int k) {
    std::vector<int> topk_index;
    if (k <= 0 || vec.empty()) {
        return topk_index;
    }

    const size_t k_num = std::min(vec.size(), static_cast<size_t>(k));

    std::vector<size_t> vec_index(vec.size());
    std::iota(vec_index.begin(), vec_index.end(), 0);

    // Only the first k_num positions need to be ordered.
    std::partial_sort(vec_index.begin(), vec_index.begin() + k_num, vec_index.end(),
                      [&vec](size_t index_1, size_t index_2) {
                          if (vec[index_1] != vec[index_2]) {
                              return vec[index_1] > vec[index_2];
                          }
                          return index_1 < index_2;
                      });

    topk_index.reserve(k_num);
    for (size_t i = 0; i < k_num; ++i) {
        topk_index.push_back(static_cast<int>(vec_index[i]));
    }
    return topk_index;
}
<< std::endl; std::cerr << "./yolo26_cls -s [.wts] [.engine] [n/s/m/l/x] // serialize model to " "plan file" << std::endl; std::cerr << "./yolo26_cls -d [.engine] ../images // deserialize plan file and run inference" << std::endl; return -1; } // Create a model using the API directly and serialize it to a file if (!wts_name.empty()) { serialize_engine(wts_name, engine_name, gd, gw, max_channels, type); return 0; } // Deserialize the engine from file IRuntime* runtime = nullptr; ICudaEngine* engine = nullptr; IExecutionContext* context = nullptr; deserialize_engine(engine_name, &runtime, &engine, &context); cudaStream_t stream; CUDA_CHECK(cudaStreamCreate(&stream)); // Prepare cpu and gpu buffers float* device_buffers[2]; float* input_buffer_host = nullptr; float* output_buffer_host = nullptr; prepare_buffer(engine, &device_buffers[0], &device_buffers[1], &input_buffer_host, &output_buffer_host); // Read images from directory std::vector file_names; if (read_files_in_dir(img_dir.c_str(), file_names) < 0) { std::cerr << "read_files_in_dir failed." 
<< std::endl; return -1; } // Read imagenet labels auto classes = read_classes("imagenet_classes.txt"); // batch predict for (size_t i = 0; i < file_names.size(); i += kBatchSize) { // Get a batch of images std::vector img_batch; std::vector img_name_batch; for (size_t j = i; j < i + kBatchSize && j < file_names.size(); j++) { cv::Mat img = cv::imread(img_dir + "/" + file_names[j]); img_batch.push_back(img); img_name_batch.push_back(file_names[j]); } // Preprocess batch_preprocess(img_batch, input_buffer_host, kClsInputW, kClsInputH); std::ofstream p("engine_input.txt"); if (!p) { std::cout << "could not open input file" << std::endl; assert(false); } for (int i = 0; i < kBatchSize * 3 * kClsInputH * kClsInputW; i++) { p << input_buffer_host[i] << "\n"; } p.close(); // Run inference auto start = std::chrono::system_clock::now(); infer(*context, stream, (void**)device_buffers, input_buffer_host, output_buffer_host, kBatchSize); auto end = std::chrono::system_clock::now(); std::cout << "inference time: " << std::chrono::duration_cast(end - start).count() << "ms" << std::endl; // Postprocess and get top-k result for (size_t b = 0; b < img_name_batch.size(); b++) { float* p = &output_buffer_host[b * kOutputSize]; std::vector prob(p, p + kOutputSize); auto topk_idx = topk(prob, 3); std::cout << img_name_batch[b] << std::endl; for (auto idx : topk_idx) { std::cout << " " << classes[idx] << " " << p[idx] << std::endl; } } } // Release stream and buffers cudaStreamDestroy(stream); CUDA_CHECK(cudaFree(device_buffers[0])); CUDA_CHECK(cudaFree(device_buffers[1])); delete[] input_buffer_host; delete[] output_buffer_host; delete context; delete engine; delete runtime; return 0; } ================================================ FILE: yolo26/yolo26_det.cpp ================================================ #include #include #include #include "cuda_utils.h" #include "logging.h" #include "model.h" #include "postprocess.h" #include "preprocess.h" #include "types.h" #include "utils.h" 
#include "yololayer.h" Logger gLogger; using namespace nvinfer1; const int kOutputSize = kMaxNumOutputBbox * sizeof(Detection) / sizeof(float) + 1; void serialize_engine(const std::string& wts_name, std::string& engine_name, float& gd, float& gw, int& max_channels, std::string& type) { IBuilder* builder = createInferBuilder(gLogger); IBuilderConfig* config = builder->createBuilderConfig(); IHostMemory* serialized_engine = buildEngineYolo26Det(builder, config, DataType::kFLOAT, wts_name, gd, gw, max_channels, type); assert(serialized_engine); std::ofstream p(engine_name, std::ios::binary); if (!p) { std::cout << "could not open plan output file" << std::endl; assert(false); } p.write(reinterpret_cast(serialized_engine->data()), serialized_engine->size()); delete serialized_engine; delete config; delete builder; } void deserialize_engine(std::string& engine_name, IRuntime** runtime, ICudaEngine** engine, IExecutionContext** context) { std::ifstream file(engine_name, std::ios::binary); if (!file.good()) { std::cerr << "read " << engine_name << " error!" << std::endl; assert(false); } size_t size = 0; file.seekg(0, file.end); size = file.tellg(); file.seekg(0, file.beg); char* serialized_engine = new char[size]; assert(serialized_engine); file.read(serialized_engine, size); file.close(); *runtime = createInferRuntime(gLogger); assert(*runtime); *engine = (*runtime)->deserializeCudaEngine(serialized_engine, size); assert(*engine); *context = (*engine)->createExecutionContext(); assert(*context); delete[] serialized_engine; } void prepare_buffer(ICudaEngine* engine, float** input_buffer_device, float** output_buffer_device, float** output_buffer_host) { assert(engine->getNbBindings() == 2); // In order to bind the buffers, we need to know the names of the input and output tensors. 
// Note that indices are guaranteed to be less than IEngine::getNbBindings() const int inputIndex = engine->getBindingIndex(kInputTensorName); const int outputIndex = engine->getBindingIndex(kOutputTensorName); assert(inputIndex == 0); assert(outputIndex == 1); // Create GPU buffers on device CUDA_CHECK(cudaMalloc((void**)input_buffer_device, kBatchSize * 3 * kInputH * kInputW * sizeof(float))); CUDA_CHECK(cudaMalloc((void**)output_buffer_device, kBatchSize * kOutputSize * sizeof(float))); *output_buffer_host = new float[kBatchSize * kOutputSize]; } void infer(IExecutionContext& context, cudaStream_t& stream, void** buffers, float* output, int batchsize, int model_bboxes) { // infer on the batch asynchronously, and DMA output back to host auto start = std::chrono::system_clock::now(); context.enqueueV2(buffers, stream, nullptr); CUDA_CHECK(cudaMemcpyAsync(output, buffers[1], batchsize * kOutputSize * sizeof(float), cudaMemcpyDeviceToHost, stream)); auto end = std::chrono::system_clock::now(); std::cout << "inference time: " << std::chrono::duration_cast(end - start).count() << "ms" << std::endl; CUDA_CHECK(cudaStreamSynchronize(stream)); } int main(int argc, char** argv) { cudaSetDevice(kGpuId); std::string wts_name; std::string engine_name; std::string img_dir; std::string type; int model_bboxes = 0; float gd = 0, gw = 0; int max_channels = 0; if (!parse_args(argc, argv, wts_name, engine_name, img_dir, type, gd, gw, max_channels)) { std::cerr << "Arguments not right!" 
<< std::endl; std::cerr << "./yolo26_det -s [.wts] [.engine] [n/s/m/l/x] // serialize model to " "plan file" << std::endl; std::cerr << "./yolo26_det -d [.engine] ../images // deserialize plan file and run inference" << std::endl; return -1; } // Create a model using the API directly and serialize it to a file if (!wts_name.empty()) { serialize_engine(wts_name, engine_name, gd, gw, max_channels, type); return 0; } // Deserialize the engine from file IRuntime* runtime = nullptr; ICudaEngine* engine = nullptr; IExecutionContext* context = nullptr; deserialize_engine(engine_name, &runtime, &engine, &context); cudaStream_t stream; CUDA_CHECK(cudaStreamCreate(&stream)); cuda_preprocess_init(kMaxInputImageSize); auto out_dims = engine->getBindingDimensions(1); model_bboxes = out_dims.d[0]; // Prepare cpu and gpu buffers float* device_buffers[2]; float* output_buffer_host = nullptr; // WARN: If you change kMaxNumOutputBbox, it must be smaller than the value kMaxNumOutputBbox in config.h, // otherwise there will be memory overflow! // Or you should modify the config.h and recompile. setPluginDeviceParams(kConfThresh); // Read images from directory std::vector file_names; if (read_files_in_dir(img_dir.c_str(), file_names) < 0) { std::cerr << "read_files_in_dir failed." 
<< std::endl; return -1; } prepare_buffer(engine, &device_buffers[0], &device_buffers[1], &output_buffer_host); for (size_t i = 0; i < file_names.size(); i += kBatchSize) { // Get a batch of images std::vector img_batch; std::vector img_name_batch; for (size_t j = i; j < i + kBatchSize && j < file_names.size(); j++) { cv::Mat img = cv::imread(img_dir + "/" + file_names[j]); img_batch.push_back(img); img_name_batch.push_back(file_names[j]); } // Preprocess cuda_batch_preprocess(img_batch, device_buffers[0], kInputW, kInputH, stream); // Run inference infer(*context, stream, (void**)device_buffers, output_buffer_host, kBatchSize, model_bboxes); std::vector> res_batch; batch_decode(res_batch, output_buffer_host, kBatchSize, kOutputSize); // Draw bounding boxes draw_bbox(img_batch, res_batch); // Save images for (size_t j = 0; j < img_batch.size(); j++) { cv::imwrite("_" + img_name_batch[j], img_batch[j]); } } // Release stream and buffers cudaStreamDestroy(stream); CUDA_CHECK(cudaFree(device_buffers[0])); CUDA_CHECK(cudaFree(device_buffers[1])); delete[] output_buffer_host; cuda_preprocess_destroy(); // Destroy the engine delete context; delete engine; delete runtime; return 0; } ================================================ FILE: yolo26/yolo26_obb.cpp ================================================ #include #include #include #include "cuda_utils.h" #include "logging.h" #include "model.h" #include "postprocess.h" #include "preprocess.h" #include "types.h" #include "utils.h" #include "yololayer.h" Logger gLogger; using namespace nvinfer1; const int kOutputSize = kMaxNumOutputBbox * sizeof(Detection) / sizeof(float) + 1; void serialize_engine(const std::string& wts_name, std::string& engine_name, float& gd, float& gw, int& max_channels, std::string& type) { IBuilder* builder = createInferBuilder(gLogger); IBuilderConfig* config = builder->createBuilderConfig(); IHostMemory* serialized_engine = buildEngineYolo26Obb(builder, config, DataType::kFLOAT, wts_name, gd, gw, 
max_channels, type); assert(serialized_engine); std::ofstream p(engine_name, std::ios::binary); if (!p) { std::cout << "could not open plan output file" << std::endl; assert(false); } p.write(reinterpret_cast(serialized_engine->data()), serialized_engine->size()); delete serialized_engine; delete config; delete builder; } void deserialize_engine(std::string& engine_name, IRuntime** runtime, ICudaEngine** engine, IExecutionContext** context) { std::ifstream file(engine_name, std::ios::binary); if (!file.good()) { std::cerr << "read " << engine_name << " error!" << std::endl; assert(false); } size_t size = 0; file.seekg(0, file.end); size = file.tellg(); file.seekg(0, file.beg); char* serialized_engine = new char[size]; assert(serialized_engine); file.read(serialized_engine, size); file.close(); *runtime = createInferRuntime(gLogger); assert(*runtime); *engine = (*runtime)->deserializeCudaEngine(serialized_engine, size); assert(*engine); *context = (*engine)->createExecutionContext(); assert(*context); delete[] serialized_engine; } void prepare_buffer(ICudaEngine* engine, float** input_buffer_device, float** output_buffer_device, float** output_buffer_host) { assert(engine->getNbBindings() == 2); // In order to bind the buffers, we need to know the names of the input and output tensors. 
// Note that indices are guaranteed to be less than IEngine::getNbBindings() const int inputIndex = engine->getBindingIndex(kInputTensorName); const int outputIndex = engine->getBindingIndex(kOutputTensorName); assert(inputIndex == 0); assert(outputIndex == 1); // Create GPU buffers on device CUDA_CHECK(cudaMalloc((void**)input_buffer_device, kBatchSize * 3 * kObbInputH * kObbInputW * sizeof(float))); CUDA_CHECK(cudaMalloc((void**)output_buffer_device, kBatchSize * kOutputSize * sizeof(float))); *output_buffer_host = new float[kBatchSize * kOutputSize]; } void infer(IExecutionContext& context, cudaStream_t& stream, void** buffers, float* output, int batchsize, int model_bboxes) { // infer on the batch asynchronously, and DMA output back to host auto start = std::chrono::system_clock::now(); context.enqueueV2(buffers, stream, nullptr); CUDA_CHECK(cudaMemcpyAsync(output, buffers[1], batchsize * kOutputSize * sizeof(float), cudaMemcpyDeviceToHost, stream)); auto end = std::chrono::system_clock::now(); std::cout << "inference time: " << std::chrono::duration_cast(end - start).count() << "ms" << std::endl; CUDA_CHECK(cudaStreamSynchronize(stream)); } int main(int argc, char** argv) { cudaSetDevice(kGpuId); std::string wts_name; std::string engine_name; std::string img_dir; std::string type; int model_bboxes; float gd = 0.0f, gw = 0.0f; int max_channels = 0; if (!parse_args(argc, argv, wts_name, engine_name, img_dir, type, gd, gw, max_channels)) { std::cerr << "Arguments not right!" 
<< std::endl; std::cerr << "./yolo26_obb -s [.wts] [.engine] [n/s/m/l/x] // serialize model to plan file" << std::endl; std::cerr << "./yolo26_obb -d [.engine] ../images // deserialize plan file and run inference" << std::endl; return -1; } // Create a model using the API directly and serialize it to a file if (!wts_name.empty()) { serialize_engine(wts_name, engine_name, gd, gw, max_channels, type); return 0; } // Deserialize the engine from file IRuntime* runtime = nullptr; ICudaEngine* engine = nullptr; IExecutionContext* context = nullptr; deserialize_engine(engine_name, &runtime, &engine, &context); cudaStream_t stream; CUDA_CHECK(cudaStreamCreate(&stream)); cuda_preprocess_init(kMaxInputImageSize); auto out_dims = engine->getBindingDimensions(1); model_bboxes = out_dims.d[0]; // Prepare cpu and gpu buffers float* device_buffers[2]; float* output_buffer_host = nullptr; setPluginDeviceParams(kConfThresh); // Read images from directory std::vector file_names; if (read_files_in_dir(img_dir.c_str(), file_names) < 0) { std::cerr << "read_files_in_dir failed." 
<< std::endl; return -1; } prepare_buffer(engine, &device_buffers[0], &device_buffers[1], &output_buffer_host); // batch predict for (size_t i = 0; i < file_names.size(); i += kBatchSize) { // Get a batch of images std::vector img_batch; std::vector img_name_batch; for (size_t j = i; j < i + kBatchSize && j < file_names.size(); j++) { cv::Mat img = cv::imread(img_dir + "/" + file_names[j]); img_batch.push_back(img); img_name_batch.push_back(file_names[j]); } // Preprocess cuda_batch_preprocess(img_batch, device_buffers[0], kObbInputW, kObbInputH, stream); infer(*context, stream, (void**)device_buffers, output_buffer_host, kBatchSize, model_bboxes); std::vector> res_batch; batch_decode_obb(res_batch, output_buffer_host, img_batch.size(), kOutputSize); draw_bbox_obb(img_batch, res_batch); // Save images for (size_t j = 0; j < img_batch.size(); j++) { cv::imwrite("_" + img_name_batch[j], img_batch[j]); } } // Release stream and buffers cudaStreamDestroy(stream); CUDA_CHECK(cudaFree(device_buffers[0])); CUDA_CHECK(cudaFree(device_buffers[1])); delete[] output_buffer_host; cuda_preprocess_destroy(); // Destroy the engine delete context; delete engine; delete runtime; return 0; } ================================================ FILE: yolop/CMakeLists.txt ================================================ cmake_minimum_required(VERSION 2.6) project(yolop) add_definitions(-std=c++11) option(CUDA_USE_STATIC_CUDA_RUNTIME OFF) set(CMAKE_CXX_STANDARD 11) set(CMAKE_BUILD_TYPE Release) find_package(CUDA REQUIRED) include_directories(${PROJECT_SOURCE_DIR}/include) find_package(OpenCV REQUIRED) include_directories(${OpenCV_INCLUDE_DIRS}) # cuda include_directories(/usr/local/cuda-10.2/include) link_directories(/usr/local/cuda-10.2/lib64) # tensorrt include_directories(/usr/include/aarch64-linux-gnu/) link_directories(/usr/lib/aarch64-linux-gnu/) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11 -Wall -Ofast -Wfatal-errors -D_MWAITXINTRIN_H_INCLUDED") # to generate plugins 
cuda_add_library(myplugins SHARED ${PROJECT_SOURCE_DIR}/yololayer.cu) target_link_libraries(myplugins nvinfer cudart) # to generate trt and test image dir add_executable(yolop ${PROJECT_SOURCE_DIR}/yolop.cpp) target_link_libraries(yolop nvinfer cudart myplugins ${OpenCV_LIBS}) add_definitions(-O3 -pthread) ================================================ FILE: yolop/README.md ================================================ YoloP ===== The original pytorch model is from [hustvl/YOLOP](https://github.com/hustvl/YOLOP) ## Authors ## 1. Prepare building environments Make sure you have installed `c++` (with C++11 support), `cmake`, `opencv` (4.x), `cuda` (10.x) and `nvinfer` (7.x). ## 2. Build yolop Go to `yolop`. ``` mkdir build cd build cmake .. make ``` Now you should have `yolop` and `libmyplugins.so`. ## 3. Test in C++ Go to `yolop/build`. ### 3.1 generate yolop.wts Download/Clone [YOLOP](https://github.com/hustvl/YOLOP) Edit `gen_wts.py` and change `YOLOP_BASE_DIR` to the real path of `YOLOP`. ``` # [WARN] Please download/clone YOLOP, then set YOLOP_BASE_DIR to the root of YOLOP python3 ../gen_wts.py ``` ### 3.2 generate yolop.trt ``` ./yolop -s yolop.wts yolop.trt ``` Now you have the following files: `libmyplugins.so yolop yolop.wts yolop.trt` ### 3.3 test yolop.trt ``` mkdir ../results YOLOP_BASE_DIR=/home/user/jetson/tmp/YOLOP ./yolop -d yolop.trt $YOLOP_BASE_DIR/inference/images/ ``` If successful, it prints output like the following ( test on `Jetson Xavier NX - Jetpack 4.4`) ``` 1601ms # the first time is slow 26ms # then it is faster 29ms 27ms 29ms 29ms ``` ![](https://user-images.githubusercontent.com/4545060/197756635-38348dc5-d8e7-4ae3-be56-6b231dd2f5db.jpg) ## 4. Test in python3 Go to `yolop`. Make sure you have installed `pycuda` and `tensorrt`, and set `image_dir` to your image directory. ``` # usage: xxx python3 yolop_trt.py build/yolop.trt build/libmyplugins.so /home/user/jetson/tmp/YOLOP/inference/images ``` If successful, it prints output like the following 
( test on `Jetson Xavier NX - Jetpack 4.4`) ``` usage: xxx [WARN] preaprea you image_dir, such as: samples, or /home/user/jetson/tmp/YOLOP/inference/images bingding: data (3, 384, 640) bingding: det (6001, 1, 1) bingding: seg (1, 360, 640) bingding: lane (1, 360, 640) batch size is 1 warm_up->(384, 640, 3), time->1070.87ms input->['/home/user/jetson/tmp/YOLOP/inference/images/3c0e7240-96e390d2.jpg'], time->25.94ms, saving into output/ input->['/home/user/jetson/tmp/YOLOP/inference/images/adb4871d-4d063244.jpg'], time->25.34ms, saving into output/ input->['/home/user/jetson/tmp/YOLOP/inference/images/8e1c1ab0-a8b92173.jpg'], time->25.03ms, saving into output/ input->['/home/user/jetson/tmp/YOLOP/inference/images/7dd9ef45-f197db95.jpg'], time->25.45ms, saving into output/ input->['/home/user/jetson/tmp/YOLOP/inference/images/9aa94005-ff1d4c9a.jpg'], time->24.93ms, saving into output/ input->['/home/user/jetson/tmp/YOLOP/inference/images/0ace96c3-48481887.jpg'], time->25.33ms, saving into output/ done! ``` ![](https://user-images.githubusercontent.com/4545060/198003852-204f3bae-18ad-44fb-9ecd-4a2a07a726a3.jpg) **Notice** : The results of c++ and python are not aligned for now! 
---------------------------------------- ```BibTeX @misc{2108.11250, Author = {Dong Wu and Manwen Liao and Weitian Zhang and Xinggang Wang}, Title = {YOLOP: You Only Look Once for Panoptic Driving Perception}, Year = {2021}, Eprint = {arXiv:2108.11250}, } ``` ================================================ FILE: yolop/common.hpp ================================================ #pragma once #include #include #include #include #include #include "NvInfer.h" #include "yololayer.h" using namespace nvinfer1; cv::Rect get_rect(cv::Mat& img, float bbox[4]) { int l, r, t, b; float r_w = Yolo::INPUT_W / (img.cols * 1.0); float r_h = Yolo::INPUT_H / (img.rows * 1.0); if (r_h > r_w) { l = bbox[0] - bbox[2] / 2.f; r = bbox[0] + bbox[2] / 2.f; t = bbox[1] - bbox[3] / 2.f - (Yolo::INPUT_H - r_w * img.rows) / 2; b = bbox[1] + bbox[3] / 2.f - (Yolo::INPUT_H - r_w * img.rows) / 2; l = l / r_w; r = r / r_w; t = t / r_w; b = b / r_w; } else { l = bbox[0] - bbox[2] / 2.f - (Yolo::INPUT_W - r_h * img.cols) / 2; r = bbox[0] + bbox[2] / 2.f - (Yolo::INPUT_W - r_h * img.cols) / 2; t = bbox[1] - bbox[3] / 2.f; b = bbox[1] + bbox[3] / 2.f; l = l / r_h; r = r / r_h; t = t / r_h; b = b / r_h; } return cv::Rect(l, t, r - l, b - t); } float iou(float lbox[4], float rbox[4]) { float interBox[] = { (std::max)(lbox[0] - lbox[2] / 2.f , rbox[0] - rbox[2] / 2.f), //left (std::min)(lbox[0] + lbox[2] / 2.f , rbox[0] + rbox[2] / 2.f), //right (std::max)(lbox[1] - lbox[3] / 2.f , rbox[1] - rbox[3] / 2.f), //top (std::min)(lbox[1] + lbox[3] / 2.f , rbox[1] + rbox[3] / 2.f), //bottom }; if (interBox[2] > interBox[3] || interBox[0] > interBox[1]) return 0.0f; float interBoxS = (interBox[1] - interBox[0])*(interBox[3] - interBox[2]); return interBoxS / (lbox[2] * lbox[3] + rbox[2] * rbox[3] - interBoxS); } bool cmp(const Yolo::Detection& a, const Yolo::Detection& b) { return a.conf > b.conf; } void nms(std::vector& res, float *output, float conf_thresh, float nms_thresh = 0.5) { int det_size = 
sizeof(Yolo::Detection) / sizeof(float); std::map> m; for (int i = 0; i < output[0] && i < Yolo::MAX_OUTPUT_BBOX_COUNT; i++) { if (output[1 + det_size * i + 4] <= conf_thresh) continue; Yolo::Detection det; memcpy(&det, &output[1 + det_size * i], det_size * sizeof(float)); if (m.count(det.class_id) == 0) m.emplace(det.class_id, std::vector()); m[det.class_id].push_back(det); } for (auto it = m.begin(); it != m.end(); it++) { //std::cout << it->second[0].class_id << " --- " << std::endl; auto& dets = it->second; std::sort(dets.begin(), dets.end(), cmp); for (size_t m = 0; m < dets.size(); ++m) { auto& item = dets[m]; res.push_back(item); for (size_t n = m + 1; n < dets.size(); ++n) { if (iou(item.bbox, dets[n].bbox) > nms_thresh) { dets.erase(dets.begin() + n); --n; } } } } } // TensorRT weight files have a simple space delimited format: // [type] [size] std::map loadWeights(const std::string file) { std::cout << "Loading weights: " << file << std::endl; std::map weightMap; // Open weights file std::ifstream input(file); assert(input.is_open() && "Unable to load weight file. 
please check if the .wts file path is right!!!!!!"); // Read number of weight blobs int32_t count; input >> count; assert(count > 0 && "Invalid weight map file."); while (count--) { Weights wt{ DataType::kFLOAT, nullptr, 0 }; uint32_t size; // Read name and type of blob std::string name; input >> name >> std::dec >> size; wt.type = DataType::kFLOAT; // Load blob uint32_t* val = reinterpret_cast(malloc(sizeof(val) * size)); for (uint32_t x = 0, y = size; x < y; ++x) { input >> std::hex >> val[x]; } wt.values = val; wt.count = size; weightMap[name] = wt; } return weightMap; } IScaleLayer* addBatchNorm2d(INetworkDefinition *network, std::map& weightMap, ITensor& input, std::string lname, float eps) { float *gamma = (float*)weightMap[lname + ".weight"].values; float *beta = (float*)weightMap[lname + ".bias"].values; float *mean = (float*)weightMap[lname + ".running_mean"].values; float *var = (float*)weightMap[lname + ".running_var"].values; int len = weightMap[lname + ".running_var"].count; float *scval = reinterpret_cast(malloc(sizeof(float) * len)); for (int i = 0; i < len; i++) { scval[i] = gamma[i] / sqrt(var[i] + eps); } Weights scale{ DataType::kFLOAT, scval, len }; float *shval = reinterpret_cast(malloc(sizeof(float) * len)); for (int i = 0; i < len; i++) { shval[i] = beta[i] - mean[i] * gamma[i] / sqrt(var[i] + eps); } Weights shift{ DataType::kFLOAT, shval, len }; float *pval = reinterpret_cast(malloc(sizeof(float) * len)); for (int i = 0; i < len; i++) { pval[i] = 1.0; } Weights power{ DataType::kFLOAT, pval, len }; weightMap[lname + ".scale"] = scale; weightMap[lname + ".shift"] = shift; weightMap[lname + ".power"] = power; IScaleLayer* scale_1 = network->addScale(input, ScaleMode::kCHANNEL, shift, scale, power); assert(scale_1); return scale_1; } ILayer* convBlock(INetworkDefinition *network, std::map& weightMap, ITensor& input, int outch, int ksize, int s, int g, std::string lname) { Weights emptywts{ DataType::kFLOAT, nullptr, 0 }; int p = ksize / 2; 
IConvolutionLayer* conv1 = network->addConvolutionNd(input, outch, DimsHW{ ksize, ksize }, weightMap[lname + ".conv.weight"], emptywts); assert(conv1); conv1->setStrideNd(DimsHW{ s, s }); conv1->setPaddingNd(DimsHW{ p, p }); conv1->setNbGroups(g); IScaleLayer* bn1 = addBatchNorm2d(network, weightMap, *conv1->getOutput(0), lname + ".bn", 1e-3); // silu = x * sigmoid // auto sig = network->addActivation(*bn1->getOutput(0), ActivationType::kSIGMOID); // assert(sig); // auto ew = network->addElementWise(*bn1->getOutput(0), *sig->getOutput(0), ElementWiseOperation::kPROD); // assert(ew); // hard_swish = x * hard_sigmoid auto hsig = network->addActivation(*bn1->getOutput(0), ActivationType::kHARD_SIGMOID); assert(hsig); hsig->setAlpha(1.0 / 6.0); hsig->setBeta(0.5); auto ew = network->addElementWise(*bn1->getOutput(0), *hsig->getOutput(0), ElementWiseOperation::kPROD); assert(ew); return ew; } ILayer* focus(INetworkDefinition *network, std::map& weightMap, ITensor& input, int inch, int outch, int ksize, std::string lname) { ISliceLayer *s1 = network->addSlice(input, Dims3{ 0, 0, 0 }, Dims3{ inch, Yolo::INPUT_H / 2, Yolo::INPUT_W / 2 }, Dims3{ 1, 2, 2 }); ISliceLayer *s2 = network->addSlice(input, Dims3{ 0, 1, 0 }, Dims3{ inch, Yolo::INPUT_H / 2, Yolo::INPUT_W / 2 }, Dims3{ 1, 2, 2 }); ISliceLayer *s3 = network->addSlice(input, Dims3{ 0, 0, 1 }, Dims3{ inch, Yolo::INPUT_H / 2, Yolo::INPUT_W / 2 }, Dims3{ 1, 2, 2 }); ISliceLayer *s4 = network->addSlice(input, Dims3{ 0, 1, 1 }, Dims3{ inch, Yolo::INPUT_H / 2, Yolo::INPUT_W / 2 }, Dims3{ 1, 2, 2 }); ITensor* inputTensors[] = { s1->getOutput(0), s2->getOutput(0), s3->getOutput(0), s4->getOutput(0) }; auto cat = network->addConcatenation(inputTensors, 4); auto conv = convBlock(network, weightMap, *cat->getOutput(0), outch, ksize, 1, 1, lname + ".conv"); return conv; } ILayer* bottleneck(INetworkDefinition *network, std::map& weightMap, ITensor& input, int c1, int c2, bool shortcut, int g, float e, std::string lname) { auto cv1 
= convBlock(network, weightMap, input, (int)((float)c2 * e), 1, 1, 1, lname + ".cv1"); auto cv2 = convBlock(network, weightMap, *cv1->getOutput(0), c2, 3, 1, g, lname + ".cv2"); if (shortcut && c1 == c2) { auto ew = network->addElementWise(input, *cv2->getOutput(0), ElementWiseOperation::kSUM); return ew; } return cv2; } ILayer* bottleneckCSP(INetworkDefinition *network, std::map& weightMap, ITensor& input, int c1, int c2, int n, bool shortcut, int g, float e, std::string lname) { Weights emptywts{ DataType::kFLOAT, nullptr, 0 }; int c_ = (int)((float)c2 * e); auto cv1 = convBlock(network, weightMap, input, c_, 1, 1, 1, lname + ".cv1"); auto cv2 = network->addConvolutionNd(input, c_, DimsHW{ 1, 1 }, weightMap[lname + ".cv2.weight"], emptywts); ITensor *y1 = cv1->getOutput(0); for (int i = 0; i < n; i++) { auto b = bottleneck(network, weightMap, *y1, c_, c_, shortcut, g, 1.0, lname + ".m." + std::to_string(i)); y1 = b->getOutput(0); } auto cv3 = network->addConvolutionNd(*y1, c_, DimsHW{ 1, 1 }, weightMap[lname + ".cv3.weight"], emptywts); ITensor* inputTensors[] = { cv3->getOutput(0), cv2->getOutput(0) }; auto cat = network->addConcatenation(inputTensors, 2); IScaleLayer* bn = addBatchNorm2d(network, weightMap, *cat->getOutput(0), lname + ".bn", 1e-4); auto lr = network->addActivation(*bn->getOutput(0), ActivationType::kLEAKY_RELU); lr->setAlpha(0.1); auto cv4 = convBlock(network, weightMap, *lr->getOutput(0), c2, 1, 1, 1, lname + ".cv4"); return cv4; } ILayer* C3(INetworkDefinition *network, std::map& weightMap, ITensor& input, int c1, int c2, int n, bool shortcut, int g, float e, std::string lname) { int c_ = (int)((float)c2 * e); auto cv1 = convBlock(network, weightMap, input, c_, 1, 1, 1, lname + ".cv1"); auto cv2 = convBlock(network, weightMap, input, c_, 1, 1, 1, lname + ".cv2"); ITensor *y1 = cv1->getOutput(0); for (int i = 0; i < n; i++) { auto b = bottleneck(network, weightMap, *y1, c_, c_, shortcut, g, 1.0, lname + ".m." 
+ std::to_string(i)); y1 = b->getOutput(0); } ITensor* inputTensors[] = { y1, cv2->getOutput(0) }; auto cat = network->addConcatenation(inputTensors, 2); auto cv3 = convBlock(network, weightMap, *cat->getOutput(0), c2, 1, 1, 1, lname + ".cv3"); return cv3; } ILayer* SPP(INetworkDefinition *network, std::map& weightMap, ITensor& input, int c1, int c2, int k1, int k2, int k3, std::string lname) { int c_ = c1 / 2; auto cv1 = convBlock(network, weightMap, input, c_, 1, 1, 1, lname + ".cv1"); auto pool1 = network->addPoolingNd(*cv1->getOutput(0), PoolingType::kMAX, DimsHW{ k1, k1 }); pool1->setPaddingNd(DimsHW{ k1 / 2, k1 / 2 }); pool1->setStrideNd(DimsHW{ 1, 1 }); auto pool2 = network->addPoolingNd(*cv1->getOutput(0), PoolingType::kMAX, DimsHW{ k2, k2 }); pool2->setPaddingNd(DimsHW{ k2 / 2, k2 / 2 }); pool2->setStrideNd(DimsHW{ 1, 1 }); auto pool3 = network->addPoolingNd(*cv1->getOutput(0), PoolingType::kMAX, DimsHW{ k3, k3 }); pool3->setPaddingNd(DimsHW{ k3 / 2, k3 / 2 }); pool3->setStrideNd(DimsHW{ 1, 1 }); ITensor* inputTensors[] = { cv1->getOutput(0), pool1->getOutput(0), pool2->getOutput(0), pool3->getOutput(0) }; auto cat = network->addConcatenation(inputTensors, 4); auto cv2 = convBlock(network, weightMap, *cat->getOutput(0), c2, 1, 1, 1, lname + ".cv2"); return cv2; } ILayer* preprocess_layer(INetworkDefinition *network, std::map& weightMap, ITensor& input) { // rescale auto rescale = network->addResize(input); rescale->setOutputDimensions(Dims3{ 3, Yolo::IMG_H, Yolo::IMG_W }); rescale->setResizeMode(ResizeMode::kLINEAR); // normalize // long len = 3 * Yolo::IMG_H * Yolo::IMG_W; // float *normval = reinterpret_cast(malloc(sizeof(float) * len)); // for (size_t i = 0; i < len; ++i) { // normval[i] = 255.0; // } // Weights norm{ DataType::kFLOAT, normval, len }; // weightMap["prep.norm"] = norm; // auto constant = network->addConstant(Dims3{ 3, Yolo::IMG_H, Yolo::IMG_W }, norm); // auto normalize = network->addElementWise(*rescale->getOutput(0), 
*constant->getOutput(0), ElementWiseOperation::kDIV); //paddng auto padding = network->addPaddingNd(*rescale->getOutput(0), DimsHW{ (Yolo::INPUT_H - Yolo::IMG_H) / 2, (Yolo::INPUT_W - Yolo::IMG_W) / 2 }, DimsHW{ (Yolo::INPUT_H - Yolo::IMG_H) / 2, (Yolo::INPUT_W - Yolo::IMG_W) / 2 }); assert(padding); return padding; } std::vector getAnchors(std::map& weightMap) { std::vector anchors_yolo; Weights Yolo_Anchors = weightMap["model.24.anchor_grid"]; assert(Yolo_Anchors.count == 18); int each_yololayer_anchorsnum = Yolo_Anchors.count / 3; const float* tempAnchors = (const float*)(Yolo_Anchors.values); for (int i = 0; i < Yolo_Anchors.count; i++) { if (i < each_yololayer_anchorsnum) { anchors_yolo.push_back(const_cast(tempAnchors)[i]); } if ((i >= each_yololayer_anchorsnum) && (i < (2 * each_yololayer_anchorsnum))) { anchors_yolo.push_back(const_cast(tempAnchors)[i]); } if (i >= (2 * each_yololayer_anchorsnum)) { anchors_yolo.push_back(const_cast(tempAnchors)[i]); } } return anchors_yolo; } IPluginV2Layer* addYoLoLayer(INetworkDefinition *network, std::map& weightMap, IConvolutionLayer* det0, IConvolutionLayer* det1, IConvolutionLayer* det2) { auto creator = getPluginRegistry()->getPluginCreator("YoloLayer_TRT", "1"); std::vector anchors_yolo = getAnchors(weightMap); PluginField pluginMultidata[4]; int NetData[4]; NetData[0] = Yolo::CLASS_NUM; NetData[1] = Yolo::INPUT_W; NetData[2] = Yolo::INPUT_H; NetData[3] = Yolo::MAX_OUTPUT_BBOX_COUNT; pluginMultidata[0].data = NetData; pluginMultidata[0].length = 3; pluginMultidata[0].name = "netdata"; pluginMultidata[0].type = PluginFieldType::kFLOAT32; int scale[3] = { 8, 16, 32 }; int plugindata[3][8]; std::string names[3]; for (int k = 1; k < 4; k++) { plugindata[k - 1][0] = Yolo::INPUT_W / scale[k - 1]; plugindata[k - 1][1] = Yolo::INPUT_H / scale[k - 1]; for (int i = 2; i < 8; i++) { plugindata[k - 1][i] = int(anchors_yolo[(k - 1) * 6 + i - 2]); } pluginMultidata[k].data = plugindata[k - 1]; pluginMultidata[k].length = 8; 
names[k - 1] = "yolodata" + std::to_string(k); pluginMultidata[k].name = names[k - 1].c_str(); pluginMultidata[k].type = PluginFieldType::kFLOAT32; } PluginFieldCollection pluginData; pluginData.nbFields = 4; pluginData.fields = pluginMultidata; IPluginV2 *pluginObj = creator->createPlugin("yololayer", &pluginData); ITensor* inputTensors_yolo[] = { det2->getOutput(0), det1->getOutput(0), det0->getOutput(0) }; auto yolo = network->addPluginV2(inputTensors_yolo, 3, *pluginObj); return yolo; } ================================================ FILE: yolop/cuda_utils.h ================================================ #pragma once #include #ifndef CUDA_CHECK #define CUDA_CHECK(callstr)\ {\ cudaError_t error_code = callstr;\ if (error_code != cudaSuccess) {\ std::cerr << "CUDA error " << error_code << " at " << __FILE__ << ":" << __LINE__;\ assert(0);\ }\ } #endif // CUDA_CHECK ================================================ FILE: yolop/gen_wts.py ================================================ import os, sys import torch import struct # TODO: YOLOP_BASE_DIR is the root of YOLOP print("[WARN] Please download/clone YOLOP, then set YOLOP_BASE_DIR to the root of YOLOP") #YOLOP_BASE_DIR = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) YOLOP_BASE_DIR = "/home/user/jetson/tmp/YOLOP" sys.path.append(YOLOP_BASE_DIR) from lib.models import get_net from lib.config import cfg # Initialize device = torch.device('cpu') # Load model model = get_net(cfg) checkpoint = torch.load(YOLOP_BASE_DIR + '/weights/End-to-end.pth', map_location=device) model.load_state_dict(checkpoint['state_dict']) # load to FP32 model.float() model.to(device).eval() f = open('yolop.wts', 'w') f.write('{}\n'.format(len(model.state_dict().keys()))) for k, v in model.state_dict().items(): vr = v.reshape(-1).cpu().numpy() f.write('{} {} '.format(k, len(vr))) for vv in vr: f.write(' ') f.write(struct.pack('>f',float(vv)).hex()) f.write('\n') f.close() print("save as yolop.wts") 
================================================ FILE: yolop/logging.h ================================================ // create by ausk(jinlj) 2022/10/25 #pragma once #include "NvInferRuntimeCommon.h" #include #include #include #include #include #include #include #include "macros.h" #if NV_TENSORRT_MAJOR >= 8 #define TRT_NOEXCEPT noexcept #else #define TRT_NOEXCEPT #endif using Severity = nvinfer1::ILogger::Severity; class Logger : public nvinfer1::ILogger { public: void log(Severity severity, const char* msg) TRT_NOEXCEPT override { if (severity < Severity::kINFO) { std::cout << msg << std::endl; } } }; ================================================ FILE: yolop/macros.h ================================================ #ifndef __MACROS_H #define __MACROS_H #if NV_TENSORRT_MAJOR >= 8 #define TRT_NOEXCEPT noexcept #define TRT_CONST_ENQUEUE const #else #define TRT_NOEXCEPT #define TRT_CONST_ENQUEUE #endif #endif // __MACROS_H ================================================ FILE: yolop/utils.h ================================================ #pragma once #include #include #include #include "common.hpp" #define SHOW_IMG static inline cv::Mat preprocess_img(cv::Mat& img, int input_w, int input_h) { int w, h, x, y; float r_w = input_w / (img.cols*1.0); float r_h = input_h / (img.rows*1.0); if (r_h > r_w) { w = input_w; h = r_w * img.rows; x = 0; y = (input_h - h) / 2; } else { w = r_h * img.cols; h = input_h; x = (input_w - w) / 2; y = 0; } cv::Mat re(h, w, CV_8UC3); cv::resize(img, re, re.size(), 0, 0, cv::INTER_LINEAR); cv::Mat out(input_h, input_w, CV_8UC3, cv::Scalar(114, 114, 114)); re.copyTo(out(cv::Rect(x, y, re.cols, re.rows))); cv::Mat tensor; out.convertTo(tensor, CV_32FC3, 1.f / 255.f); cv::subtract(tensor, cv::Scalar(0.485, 0.456, 0.406), tensor, cv::noArray(), -1); cv::divide(tensor, cv::Scalar(0.229, 0.224, 0.225), tensor, 1, -1); return tensor; } static inline int read_files_in_dir(const char *p_dir_name, std::vector &file_names) { DIR *p_dir = 
opendir(p_dir_name); if (p_dir == nullptr) { return -1; } struct dirent* p_file = nullptr; while ((p_file = readdir(p_dir)) != nullptr) { if (strcmp(p_file->d_name, ".") != 0 && strcmp(p_file->d_name, "..") != 0) { std::string cur_file_name(p_file->d_name); file_names.push_back(cur_file_name); } } closedir(p_dir); return 0; } ================================================ FILE: yolop/yololayer.cu ================================================ #include #include #include #include "yololayer.h" #include "cuda_utils.h" namespace Tn { template void write(char*& buffer, const T& val) { *reinterpret_cast(buffer) = val; buffer += sizeof(T); } template void read(const char*& buffer, T& val) { val = *reinterpret_cast(buffer); buffer += sizeof(T); } } using namespace Yolo; namespace nvinfer1 { YoloLayerPlugin::YoloLayerPlugin(int classCount, int netWidth, int netHeight, int maxOut, const std::vector& vYoloKernel) { mClassCount = classCount; mNetWidth = netWidth; mNetHeight = netHeight; mMaxOutObject = maxOut; mYoloKernel = vYoloKernel; mKernelCount = vYoloKernel.size(); CUDA_CHECK(cudaMallocHost(&mAnchor, mKernelCount * sizeof(void*))); size_t AnchorLen = sizeof(float)* CHECK_COUNT * 2; for (int ii = 0; ii < mKernelCount; ii++) { CUDA_CHECK(cudaMalloc(&mAnchor[ii], AnchorLen)); const auto& yolo = mYoloKernel[ii]; CUDA_CHECK(cudaMemcpy(mAnchor[ii], yolo.anchors, AnchorLen, cudaMemcpyHostToDevice)); } } YoloLayerPlugin::~YoloLayerPlugin() { for (int ii = 0; ii < mKernelCount; ii++) { CUDA_CHECK(cudaFree(mAnchor[ii])); } CUDA_CHECK(cudaFreeHost(mAnchor)); } // create the plugin at runtime from a byte stream YoloLayerPlugin::YoloLayerPlugin(const void* data, size_t length) { using namespace Tn; const char *d = reinterpret_cast(data), *a = d; read(d, mClassCount); read(d, mThreadCount); read(d, mKernelCount); read(d, mNetWidth); read(d, mNetHeight); read(d, mMaxOutObject); mYoloKernel.resize(mKernelCount); auto kernelSize = mKernelCount * sizeof(YoloKernel); 
memcpy(mYoloKernel.data(), d, kernelSize); d += kernelSize; CUDA_CHECK(cudaMallocHost(&mAnchor, mKernelCount * sizeof(void*))); size_t AnchorLen = sizeof(float)* CHECK_COUNT * 2; for (int ii = 0; ii < mKernelCount; ii++) { CUDA_CHECK(cudaMalloc(&mAnchor[ii], AnchorLen)); const auto& yolo = mYoloKernel[ii]; CUDA_CHECK(cudaMemcpy(mAnchor[ii], yolo.anchors, AnchorLen, cudaMemcpyHostToDevice)); } assert(d == a + length); } void YoloLayerPlugin::serialize(void* buffer) const TRT_NOEXCEPT { using namespace Tn; char* d = static_cast(buffer), *a = d; write(d, mClassCount); write(d, mThreadCount); write(d, mKernelCount); write(d, mNetWidth); write(d, mNetHeight); write(d, mMaxOutObject); auto kernelSize = mKernelCount * sizeof(YoloKernel); memcpy(d, mYoloKernel.data(), kernelSize); d += kernelSize; assert(d == a + getSerializationSize()); } size_t YoloLayerPlugin::getSerializationSize() const TRT_NOEXCEPT { return sizeof(mClassCount) + sizeof(mThreadCount) + sizeof(mKernelCount) + sizeof(Yolo::YoloKernel) * mYoloKernel.size() + sizeof(mNetWidth) + sizeof(mNetHeight) + sizeof(mMaxOutObject); } int YoloLayerPlugin::initialize() TRT_NOEXCEPT { return 0; } Dims YoloLayerPlugin::getOutputDimensions(int index, const Dims* inputs, int nbInputDims) TRT_NOEXCEPT { //output the result to channel int totalsize = mMaxOutObject * sizeof(Detection) / sizeof(float); return Dims3(totalsize + 1, 1, 1); } // Set plugin namespace void YoloLayerPlugin::setPluginNamespace(const char* pluginNamespace) TRT_NOEXCEPT { mPluginNamespace = pluginNamespace; } const char* YoloLayerPlugin::getPluginNamespace() const TRT_NOEXCEPT { return mPluginNamespace; } // Return the DataType of the plugin output at the requested index DataType YoloLayerPlugin::getOutputDataType(int index, const nvinfer1::DataType* inputTypes, int nbInputs) const TRT_NOEXCEPT { return DataType::kFLOAT; } // Return true if output tensor is broadcast across a batch. 
bool YoloLayerPlugin::isOutputBroadcastAcrossBatch(int outputIndex, const bool* inputIsBroadcasted, int nbInputs) const TRT_NOEXCEPT { return false; } // Return true if plugin can use input that is broadcast across batch without replication. bool YoloLayerPlugin::canBroadcastInputAcrossBatch(int inputIndex) const TRT_NOEXCEPT { return false; } void YoloLayerPlugin::configurePlugin(const PluginTensorDesc* in, int nbInput, const PluginTensorDesc* out, int nbOutput) TRT_NOEXCEPT { } // Attach the plugin object to an execution context and grant the plugin the access to some context resource. void YoloLayerPlugin::attachToContext(cudnnContext* cudnnContext, cublasContext* cublasContext, IGpuAllocator* gpuAllocator) TRT_NOEXCEPT { } // Detach the plugin object from its execution context. void YoloLayerPlugin::detachFromContext() TRT_NOEXCEPT {} const char* YoloLayerPlugin::getPluginType() const TRT_NOEXCEPT { return "YoloLayer_TRT"; } const char* YoloLayerPlugin::getPluginVersion() const TRT_NOEXCEPT { return "1"; } void YoloLayerPlugin::destroy() TRT_NOEXCEPT { delete this; } // Clone the plugin IPluginV2IOExt* YoloLayerPlugin::clone() const TRT_NOEXCEPT { YoloLayerPlugin* p = new YoloLayerPlugin(mClassCount, mNetWidth, mNetHeight, mMaxOutObject, mYoloKernel); p->setPluginNamespace(mPluginNamespace); return p; } __device__ float Logist(float data) { return 1.0f / (1.0f + expf(-data)); }; __global__ void CalDetection(const float *input, float *output, int noElements, const int netwidth, const int netheight, int maxoutobject, int yoloWidth, int yoloHeight, const float anchors[CHECK_COUNT * 2], int classes, int outputElem) { int idx = threadIdx.x + blockDim.x * blockIdx.x; if (idx >= noElements) return; int total_grid = yoloWidth * yoloHeight; int bnIdx = idx / total_grid; idx = idx - total_grid * bnIdx; int info_len_i = 5 + classes; const float* curInput = input + bnIdx * (info_len_i * total_grid * CHECK_COUNT); for (int k = 0; k < 3; ++k) { float box_prob = 
Logist(curInput[idx + k * info_len_i * total_grid + 4 * total_grid]); if (box_prob < IGNORE_THRESH) continue; int class_id = 0; float max_cls_prob = 0.0; for (int i = 5; i < info_len_i; ++i) { float p = Logist(curInput[idx + k * info_len_i * total_grid + i * total_grid]); if (p > max_cls_prob) { max_cls_prob = p; class_id = i - 5; } } float *res_count = output + bnIdx * outputElem; int count = (int)atomicAdd(res_count, 1); if (count >= maxoutobject) return; char* data = (char *)res_count + sizeof(float) + count * sizeof(Detection); Detection* det = (Detection*)(data); int row = idx / yoloWidth; int col = idx % yoloWidth; //Location // pytorch: // y = x[i].sigmoid() // y[..., 0:2] = (y[..., 0:2] * 2. - 0.5 + self.grid[i].to(x[i].device)) * self.stride[i] # xy // y[..., 2:4] = (y[..., 2:4] * 2) ** 2 * self.anchor_grid[i] # wh // X: (sigmoid(tx) + cx)/FeaturemapW * netwidth det->bbox[0] = (col - 0.5f + 2.0f * Logist(curInput[idx + k * info_len_i * total_grid + 0 * total_grid])) * netwidth / yoloWidth; det->bbox[1] = (row - 0.5f + 2.0f * Logist(curInput[idx + k * info_len_i * total_grid + 1 * total_grid])) * netheight / yoloHeight; // W: (Pw * e^tw) / FeaturemapW * netwidth // v5: https://github.com/ultralytics/yolov5/issues/471 det->bbox[2] = 2.0f * Logist(curInput[idx + k * info_len_i * total_grid + 2 * total_grid]); det->bbox[2] = det->bbox[2] * det->bbox[2] * anchors[2 * k]; det->bbox[3] = 2.0f * Logist(curInput[idx + k * info_len_i * total_grid + 3 * total_grid]); det->bbox[3] = det->bbox[3] * det->bbox[3] * anchors[2 * k + 1]; det->conf = box_prob * max_cls_prob; det->class_id = class_id; } } void YoloLayerPlugin::forwardGpu(const float *const * inputs, float* output, cudaStream_t stream, int batchSize) { int outputElem = 1 + mMaxOutObject * sizeof(Detection) / sizeof(float); for (int idx = 0; idx < batchSize; ++idx) { CUDA_CHECK(cudaMemset(output + idx * outputElem, 0, sizeof(float))); } int numElem = 0; for (unsigned int i = 0; i < mYoloKernel.size(); ++i) { 
const auto& yolo = mYoloKernel[i]; numElem = yolo.width*yolo.height*batchSize; if (numElem < mThreadCount) mThreadCount = numElem; //printf("Net: %d %d \n", mNetWidth, mNetHeight); CalDetection << < (yolo.width*yolo.height*batchSize + mThreadCount - 1) / mThreadCount, mThreadCount >> > (inputs[i], output, numElem, mNetWidth, mNetHeight, mMaxOutObject, yolo.width, yolo.height, (float *)mAnchor[i], mClassCount, outputElem); } } int YoloLayerPlugin::enqueue(int batchSize, const void*const * inputs, void*TRT_CONST_ENQUEUE* outputs, void* workspace, cudaStream_t stream) TRT_NOEXCEPT { forwardGpu((const float *const *)inputs, (float*)outputs[0], stream, batchSize); return 0; } PluginFieldCollection YoloPluginCreator::mFC{}; std::vector YoloPluginCreator::mPluginAttributes; YoloPluginCreator::YoloPluginCreator() { mPluginAttributes.clear(); mFC.nbFields = mPluginAttributes.size(); mFC.fields = mPluginAttributes.data(); } const char* YoloPluginCreator::getPluginName() const TRT_NOEXCEPT { return "YoloLayer_TRT"; } const char* YoloPluginCreator::getPluginVersion() const TRT_NOEXCEPT { return "1"; } const PluginFieldCollection* YoloPluginCreator::getFieldNames() TRT_NOEXCEPT { return &mFC; } IPluginV2IOExt* YoloPluginCreator::createPlugin(const char* name, const PluginFieldCollection* fc) TRT_NOEXCEPT { int class_count = -1; int input_w = -1; int input_h = -1; int max_output_object_count = -1; std::vector yolo_kernels(3); const PluginField* fields = fc->fields; for (int i = 0; i < fc->nbFields; i++) { if (strcmp(fields[i].name, "netdata") == 0) { assert(fields[i].type == PluginFieldType::kFLOAT32); int *tmp = (int*)(fields[i].data); class_count = tmp[0]; input_w = tmp[1]; input_h = tmp[2]; max_output_object_count = tmp[3]; } else if (strstr(fields[i].name, "yolodata") != NULL) { assert(fields[i].type == PluginFieldType::kFLOAT32); int *tmp = (int*)(fields[i].data); YoloKernel kernel; kernel.width = tmp[0]; kernel.height = tmp[1]; for (int j = 0; j < fields[i].length - 2; 
j++) { kernel.anchors[j] = tmp[j + 2]; } yolo_kernels[2 - (fields[i].name[8] - '1')] = kernel; } } assert(class_count && input_w && input_h && max_output_object_count); YoloLayerPlugin* obj = new YoloLayerPlugin(class_count, input_w, input_h, max_output_object_count, yolo_kernels); obj->setPluginNamespace(mNamespace.c_str()); return obj; } IPluginV2IOExt* YoloPluginCreator::deserializePlugin(const char* name, const void* serialData, size_t serialLength) TRT_NOEXCEPT { // This object will be deleted when the network is destroyed, which will // call YoloLayerPlugin::destroy() YoloLayerPlugin* obj = new YoloLayerPlugin(serialData, serialLength); obj->setPluginNamespace(mNamespace.c_str()); return obj; } } ================================================ FILE: yolop/yololayer.h ================================================ #ifndef _YOLO_LAYER_H #define _YOLO_LAYER_H #include #include #include "NvInfer.h" #include "macros.h" namespace Yolo { static constexpr int CHECK_COUNT = 3; static constexpr float IGNORE_THRESH = 0.1f; struct YoloKernel { int width; int height; float anchors[CHECK_COUNT * 2]; }; static constexpr int MAX_OUTPUT_BBOX_COUNT = 1000; static constexpr int CLASS_NUM = 1; static constexpr int INPUT_H = 384; static constexpr int INPUT_W = 640; static constexpr int IMG_H = 360; static constexpr int IMG_W = 640; static constexpr int LOCATIONS = 4; struct alignas(float) Detection { //center_x center_y w h float bbox[LOCATIONS]; float conf; // bbox_conf * cls_conf float class_id; }; } namespace nvinfer1 { class YoloLayerPlugin : public IPluginV2IOExt { public: YoloLayerPlugin(int classCount, int netWidth, int netHeight, int maxOut, const std::vector& vYoloKernel); YoloLayerPlugin(const void* data, size_t length); ~YoloLayerPlugin(); int getNbOutputs() const TRT_NOEXCEPT override { return 1; } Dims getOutputDimensions(int index, const Dims* inputs, int nbInputDims) TRT_NOEXCEPT override; int initialize() TRT_NOEXCEPT override; virtual void terminate() 
TRT_NOEXCEPT override {}; virtual size_t getWorkspaceSize(int maxBatchSize) const TRT_NOEXCEPT override { return 0; } virtual int enqueue(int batchSize, const void*const * inputs, void*TRT_CONST_ENQUEUE* outputs, void* workspace, cudaStream_t stream) TRT_NOEXCEPT override; virtual size_t getSerializationSize() const TRT_NOEXCEPT override; virtual void serialize(void* buffer) const TRT_NOEXCEPT override; bool supportsFormatCombination(int pos, const PluginTensorDesc* inOut, int nbInputs, int nbOutputs) const TRT_NOEXCEPT override { return inOut[pos].format == TensorFormat::kLINEAR && inOut[pos].type == DataType::kFLOAT; } const char* getPluginType() const TRT_NOEXCEPT override; const char* getPluginVersion() const TRT_NOEXCEPT override; void destroy() TRT_NOEXCEPT override; IPluginV2IOExt* clone() const TRT_NOEXCEPT override; void setPluginNamespace(const char* pluginNamespace) TRT_NOEXCEPT override; const char* getPluginNamespace() const TRT_NOEXCEPT override; DataType getOutputDataType(int index, const nvinfer1::DataType* inputTypes, int nbInputs) const TRT_NOEXCEPT override; bool isOutputBroadcastAcrossBatch(int outputIndex, const bool* inputIsBroadcasted, int nbInputs) const TRT_NOEXCEPT override; bool canBroadcastInputAcrossBatch(int inputIndex) const TRT_NOEXCEPT override; void attachToContext( cudnnContext* cudnnContext, cublasContext* cublasContext, IGpuAllocator* gpuAllocator) TRT_NOEXCEPT override; void configurePlugin(const PluginTensorDesc* in, int nbInput, const PluginTensorDesc* out, int nbOutput) TRT_NOEXCEPT override; using IPluginV2Ext::configurePlugin; void detachFromContext() TRT_NOEXCEPT override; private: void forwardGpu(const float *const * inputs, float * output, cudaStream_t stream, int batchSize = 1); int mThreadCount = 256; const char* mPluginNamespace; int mKernelCount; int mClassCount; int mNetWidth; int mNetHeight; int mMaxOutObject; std::vector mYoloKernel; void** mAnchor; }; class YoloPluginCreator : public IPluginCreator { public: 
YoloPluginCreator(); ~YoloPluginCreator() override = default; const char* getPluginName() const TRT_NOEXCEPT override; const char* getPluginVersion() const TRT_NOEXCEPT override; const PluginFieldCollection* getFieldNames() TRT_NOEXCEPT override; IPluginV2IOExt* createPlugin(const char* name, const PluginFieldCollection* fc) TRT_NOEXCEPT override; IPluginV2IOExt* deserializePlugin(const char* name, const void* serialData, size_t serialLength) TRT_NOEXCEPT override; void setPluginNamespace(const char* libNamespace) TRT_NOEXCEPT override { mNamespace = libNamespace; } const char* getPluginNamespace() const TRT_NOEXCEPT override { return mNamespace.c_str(); } private: std::string mNamespace; static PluginFieldCollection mFC; static std::vector mPluginAttributes; }; REGISTER_TENSORRT_PLUGIN(YoloPluginCreator); }; #endif ================================================ FILE: yolop/yolop.cpp ================================================ #include "yolop.hpp" int main(int argc, char** argv) { cudaSetDevice(DEVICE); std::string wts_name = ""; std::string engine_name = ""; std::string img_dir; if (!parse_args(argc, argv, wts_name, engine_name, img_dir)) { std::cerr << "arguments not right!" 
<< std::endl; std::cerr << "./yolop -s [.wts] [.engine] // serialize model to plan file" << std::endl; std::cerr << "./yolop -d [.engine] ../samples // deserialize plan file and run inference" << std::endl; return -1; } // create a model using the API directly and serialize it to a stream if (!wts_name.empty()) { IHostMemory* modelStream{ nullptr }; APIToModel(BATCH_SIZE, &modelStream, wts_name); assert(modelStream != nullptr); std::ofstream p(engine_name, std::ios::binary); if (!p) { std::cerr << "could not open plan output file" << std::endl; return -1; } p.write(reinterpret_cast(modelStream->data()), modelStream->size()); modelStream->destroy(); return 0; } // deserialize the .engine and run inference std::ifstream file(engine_name, std::ios::binary); if (!file.good()) { std::cerr << "read " << engine_name << " error!" << std::endl; return -1; } char *trtModelStream = nullptr; size_t size = 0; file.seekg(0, file.end); size = file.tellg(); file.seekg(0, file.beg); trtModelStream = new char[size]; assert(trtModelStream); file.read(trtModelStream, size); file.close(); std::vector file_names; if (read_files_in_dir(img_dir.c_str(), file_names) < 0) { std::cerr << "read_files_in_dir failed." << std::endl; return -1; } // prepare input data --------------------------- static float data[BATCH_SIZE * 3 * INPUT_H * INPUT_W]; //for (int i = 0; i < 3 * INPUT_H * INPUT_W; i++) // data[i] = 1.0; static float prob[BATCH_SIZE * OUTPUT_SIZE]; static int seg_out[BATCH_SIZE * IMG_H * IMG_W]; static int lane_out[BATCH_SIZE * IMG_H * IMG_W]; IRuntime* runtime = createInferRuntime(gLogger); assert(runtime != nullptr); ICudaEngine* engine = runtime->deserializeCudaEngine(trtModelStream, size); assert(engine != nullptr); IExecutionContext* context = engine->createExecutionContext(); assert(context != nullptr); delete[] trtModelStream; assert(engine->getNbBindings() == 4); void* buffers[4]; // In order to bind the buffers, we need to know the names of the input and output tensors. 
// Note that indices are guaranteed to be less than IEngine::getNbBindings() const int inputIndex = engine->getBindingIndex(INPUT_BLOB_NAME); const int output_det_index = engine->getBindingIndex(OUTPUT_DET_NAME); const int output_seg_index = engine->getBindingIndex(OUTPUT_SEG_NAME); const int output_lane_index = engine->getBindingIndex(OUTPUT_LANE_NAME); assert(inputIndex == 0); assert(output_det_index == 1); assert(output_seg_index == 2); assert(output_lane_index == 3); // Create GPU buffers on device CUDA_CHECK(cudaMalloc(&buffers[inputIndex], BATCH_SIZE * 3 * INPUT_H * INPUT_W * sizeof(float))); CUDA_CHECK(cudaMalloc(&buffers[output_det_index], BATCH_SIZE * OUTPUT_SIZE * sizeof(float))); CUDA_CHECK(cudaMalloc(&buffers[output_seg_index], BATCH_SIZE * IMG_H * IMG_W * sizeof(int))); CUDA_CHECK(cudaMalloc(&buffers[output_lane_index], BATCH_SIZE * IMG_H * IMG_W * sizeof(int))); // Create stream cudaStream_t stream; CUDA_CHECK(cudaStreamCreate(&stream)); // store seg results cv::Mat tmp_seg(IMG_H, IMG_W, CV_32S, seg_out); // store lane results cv::Mat tmp_lane(IMG_H, IMG_W, CV_32S, lane_out); // PrintMat(tmp_seg); std::vector segColor; segColor.push_back(cv::Vec3b(0, 0, 0)); segColor.push_back(cv::Vec3b(0, 255, 0)); segColor.push_back(cv::Vec3b(255, 0, 0)); std::vector laneColor; laneColor.push_back(cv::Vec3b(0, 0, 0)); laneColor.push_back(cv::Vec3b(0, 0, 255)); laneColor.push_back(cv::Vec3b(0, 0, 0)); int fcount = 0; // set for batch-inference for (int f = 0; f < (int)file_names.size(); f++) { fcount++; if (fcount < BATCH_SIZE && f + 1 != (int)file_names.size()) continue; // preprocess ~3ms for (int b = 0; b < fcount; b++) { cv::Mat img = cv::imread(img_dir + "/" + file_names[f - fcount + 1 + b]); // load image takes ~17ms if (img.empty()) continue; //cv::cvtColor(img, img, cv::COLOR_BGR2RGB); cv::Mat pr_img = preprocess_img(img, INPUT_W, INPUT_H); // letterbox int i = 0; // BGR to RGB and normalize for (int row = 0; row < INPUT_H; ++row) { float* uc_pixel = 
pr_img.ptr(row); for (int col = 0; col < INPUT_W; ++col) { data[b * 3 * INPUT_H * INPUT_W + i] = uc_pixel[0]; data[b * 3 * INPUT_H * INPUT_W + i + INPUT_H * INPUT_W] = uc_pixel[1]; data[b * 3 * INPUT_H * INPUT_W + i + 2 * INPUT_H * INPUT_W] = uc_pixel[2]; uc_pixel += 3; ++i; } } } // Run inference auto start = std::chrono::system_clock::now(); doInferenceCpu(*context, stream, buffers, data, prob, seg_out, lane_out, BATCH_SIZE); auto end = std::chrono::system_clock::now(); std::cout << std::chrono::duration_cast(end - start).count() << "ms" << std::endl; // postprocess ~0ms std::vector> batch_res(fcount); for (int b = 0; b < fcount; b++) { auto& res = batch_res[b]; nms(res, &prob[b * OUTPUT_SIZE], CONF_THRESH, NMS_THRESH); } // show results for (int b = 0; b < fcount; ++b) { auto& res = batch_res[b]; //std::cout << res.size() << std::endl; cv::Mat img = cv::imread(img_dir + "/" + file_names[f - fcount + 1 + b]); // handling seg and lane results cv::Mat seg_res(img.rows, img.cols, CV_32S); cv::resize(tmp_seg, seg_res, seg_res.size(), 0, 0, cv::INTER_NEAREST); cv::Mat lane_res(img.rows, img.cols, CV_32S); cv::resize(tmp_lane, lane_res, lane_res.size(), 0, 0, cv::INTER_NEAREST); for (int row = 0; row < img.rows; ++row) { uchar* pdata = img.data + row * img.step; for (int col = 0; col < img.cols; ++col) { int seg_idx = seg_res.at(row, col); int lane_idx = lane_res.at(row, col); //std::cout << "enter" << ix << std::endl; for (int i = 0; i < 3; ++i) { if (lane_idx) { if (i != 2) pdata[i] = pdata[i] / 2 + laneColor[lane_idx][i] / 2; } else if (seg_idx) pdata[i] = pdata[i] / 2 + segColor[seg_idx][i] / 2; } pdata += 3; } } // handling det results for (size_t j = 0; j < res.size(); ++j) { cv::Rect r = get_rect(img, res[j].bbox); cv::rectangle(img, r, cv::Scalar(0x27, 0xC1, 0x36), 2); cv::putText(img, std::to_string((int)res[j].class_id), cv::Point(r.x, r.y - 1), cv::FONT_HERSHEY_PLAIN, 1.2, cv::Scalar(0xFF, 0xFF, 0xFF), 2); } cv::imwrite("../results/_" + file_names[f - fcount 
+ 1 + b], img); } fcount = 0; } // Release stream and buffers cudaStreamDestroy(stream); CUDA_CHECK(cudaFree(buffers[inputIndex])); CUDA_CHECK(cudaFree(buffers[output_det_index])); CUDA_CHECK(cudaFree(buffers[output_seg_index])); CUDA_CHECK(cudaFree(buffers[output_lane_index])); // Destroy the engine context->destroy(); engine->destroy(); runtime->destroy(); return 0; } ================================================ FILE: yolop/yolop.hpp ================================================ #pragma once #include #include "cuda_utils.h" #include "logging.h" #include "utils.h" #define USE_FP16 // set USE_INT8 or USE_FP16 or USE_FP32 #define DEVICE 0 // GPU id #define NMS_THRESH 0.45 #define CONF_THRESH 0.25 #define BATCH_SIZE 1 // stuff we know about the network and the input/output blobs static const int INPUT_H = Yolo::INPUT_H; static const int INPUT_W = Yolo::INPUT_W; static const int IMG_H = Yolo::IMG_H; static const int IMG_W = Yolo::IMG_W; static const int CLASS_NUM = Yolo::CLASS_NUM; static const int OUTPUT_SIZE = Yolo::MAX_OUTPUT_BBOX_COUNT * sizeof(Yolo::Detection) / sizeof(float) + 1; // we assume the yololayer outputs no more than MAX_OUTPUT_BBOX_COUNT boxes that conf >= 0.1 const char* INPUT_BLOB_NAME = "data"; const char* OUTPUT_DET_NAME = "det"; const char* OUTPUT_SEG_NAME = "seg"; const char* OUTPUT_LANE_NAME = "lane"; static Logger gLogger; ICudaEngine* build_engine(unsigned int maxBatchSize, IBuilder* builder, IBuilderConfig* config, DataType dt, std::string& wts_name) { INetworkDefinition* network = builder->createNetworkV2(0U); // Create input tensor of shape {3, INPUT_H, INPUT_W} with name INPUT_BLOB_NAME ITensor* data = network->addInput(INPUT_BLOB_NAME, dt, Dims3{ 3, INPUT_H, INPUT_W }); assert(data); std::map weightMap = loadWeights(wts_name); Weights emptywts{ DataType::kFLOAT, nullptr, 0 }; // yolop backbone // auto focus0 = focus(network, weightMap, *shuffle->getOutput(0), 3, 32, 3, "model.0"); auto focus0 = focus(network, weightMap, *data, 3, 
32, 3, "model.0"); auto conv1 = convBlock(network, weightMap, *focus0->getOutput(0), 64, 3, 2, 1, "model.1"); auto bottleneck_CSP2 = bottleneckCSP(network, weightMap, *conv1->getOutput(0), 64, 64, 1, true, 1, 0.5, "model.2"); auto conv3 = convBlock(network, weightMap, *bottleneck_CSP2->getOutput(0), 128, 3, 2, 1, "model.3"); auto bottleneck_csp4 = bottleneckCSP(network, weightMap, *conv3->getOutput(0), 128, 128, 3, true, 1, 0.5, "model.4"); auto conv5 = convBlock(network, weightMap, *bottleneck_csp4->getOutput(0), 256, 3, 2, 1, "model.5"); auto bottleneck_csp6 = bottleneckCSP(network, weightMap, *conv5->getOutput(0), 256, 256, 3, true, 1, 0.5, "model.6"); auto conv7 = convBlock(network, weightMap, *bottleneck_csp6->getOutput(0), 512, 3, 2, 1, "model.7"); auto spp8 = SPP(network, weightMap, *conv7->getOutput(0), 512, 512, 5, 9, 13, "model.8"); // yolop head auto bottleneck_csp9 = bottleneckCSP(network, weightMap, *spp8->getOutput(0), 512, 512, 1, false, 1, 0.5, "model.9"); auto conv10 = convBlock(network, weightMap, *bottleneck_csp9->getOutput(0), 256, 1, 1, 1, "model.10"); float *deval = reinterpret_cast(malloc(sizeof(float) * 256 * 2 * 2)); for (int i = 0; i < 256 * 2 * 2; i++) { deval[i] = 1.0; } Weights deconvwts11{ DataType::kFLOAT, deval, 256 * 2 * 2 }; IDeconvolutionLayer* deconv11 = network->addDeconvolutionNd(*conv10->getOutput(0), 256, DimsHW{ 2, 2 }, deconvwts11, emptywts); deconv11->setStrideNd(DimsHW{ 2, 2 }); deconv11->setNbGroups(256); weightMap["deconv11"] = deconvwts11; ITensor* inputTensors12[] = { deconv11->getOutput(0), bottleneck_csp6->getOutput(0) }; auto cat12 = network->addConcatenation(inputTensors12, 2); auto bottleneck_csp13 = bottleneckCSP(network, weightMap, *cat12->getOutput(0), 512, 256, 1, false, 1, 0.5, "model.13"); auto conv14 = convBlock(network, weightMap, *bottleneck_csp13->getOutput(0), 128, 1, 1, 1, "model.14"); Weights deconvwts15{ DataType::kFLOAT, deval, 128 * 2 * 2 }; IDeconvolutionLayer* deconv15 = 
network->addDeconvolutionNd(*conv14->getOutput(0), 128, DimsHW{ 2, 2 }, deconvwts15, emptywts); deconv15->setStrideNd(DimsHW{ 2, 2 }); deconv15->setNbGroups(128); ITensor* inputTensors16[] = { deconv15->getOutput(0), bottleneck_csp4->getOutput(0) }; auto cat16 = network->addConcatenation(inputTensors16, 2); auto bottleneck_csp17 = bottleneckCSP(network, weightMap, *cat16->getOutput(0), 256, 128, 1, false, 1, 0.5, "model.17"); IConvolutionLayer* det0 = network->addConvolutionNd(*bottleneck_csp17->getOutput(0), 3 * (Yolo::CLASS_NUM + 5), DimsHW{ 1, 1 }, weightMap["model.24.m.0.weight"], weightMap["model.24.m.0.bias"]); auto conv18 = convBlock(network, weightMap, *bottleneck_csp17->getOutput(0), 128, 3, 2, 1, "model.18"); ITensor* inputTensors19[] = { conv18->getOutput(0), conv14->getOutput(0) }; auto cat19 = network->addConcatenation(inputTensors19, 2); auto bottleneck_csp20 = bottleneckCSP(network, weightMap, *cat19->getOutput(0), 256, 256, 1, false, 1, 0.5, "model.20"); IConvolutionLayer* det1 = network->addConvolutionNd(*bottleneck_csp20->getOutput(0), 3 * (Yolo::CLASS_NUM + 5), DimsHW{ 1, 1 }, weightMap["model.24.m.1.weight"], weightMap["model.24.m.1.bias"]); auto conv21 = convBlock(network, weightMap, *bottleneck_csp20->getOutput(0), 256, 3, 2, 1, "model.21"); ITensor* inputTensors22[] = { conv21->getOutput(0), conv10->getOutput(0) }; auto cat22 = network->addConcatenation(inputTensors22, 2); auto bottleneck_csp23 = bottleneckCSP(network, weightMap, *cat22->getOutput(0), 512, 512, 1, false, 1, 0.5, "model.23"); IConvolutionLayer* det2 = network->addConvolutionNd(*bottleneck_csp23->getOutput(0), 3 * (Yolo::CLASS_NUM + 5), DimsHW{ 1, 1 }, weightMap["model.24.m.2.weight"], weightMap["model.24.m.2.bias"]); auto detect24 = addYoLoLayer(network, weightMap, det0, det1, det2); detect24->getOutput(0)->setName(OUTPUT_DET_NAME); auto conv25 = convBlock(network, weightMap, *cat16->getOutput(0), 128, 3, 1, 1, "model.25"); // upsample 26 Weights deconvwts26{ DataType::kFLOAT, 
deval, 128 * 2 * 2 }; IDeconvolutionLayer* deconv26 = network->addDeconvolutionNd(*conv25->getOutput(0), 128, DimsHW{ 2, 2 }, deconvwts26, emptywts); deconv26->setStrideNd(DimsHW{ 2, 2 }); deconv26->setNbGroups(128); auto bottleneck_csp27 = bottleneckCSP(network, weightMap, *deconv26->getOutput(0), 128, 64, 1, false, 1, 0.5, "model.27"); auto conv28 = convBlock(network, weightMap, *bottleneck_csp27->getOutput(0), 32, 3, 1, 1, "model.28"); // upsample 29 Weights deconvwts29{ DataType::kFLOAT, deval, 32 * 2 * 2 }; IDeconvolutionLayer* deconv29 = network->addDeconvolutionNd(*conv28->getOutput(0), 32, DimsHW{ 2, 2 }, deconvwts29, emptywts); deconv29->setStrideNd(DimsHW{ 2, 2 }); deconv29->setNbGroups(32); auto conv30 = convBlock(network, weightMap, *deconv29->getOutput(0), 16, 3, 1, 1, "model.30"); auto bottleneck_csp31 = bottleneckCSP(network, weightMap, *conv30->getOutput(0), 16, 8, 1, false, 1, 0.5, "model.31"); // upsample32 Weights deconvwts32{ DataType::kFLOAT, deval, 8 * 2 * 2 }; IDeconvolutionLayer* deconv32 = network->addDeconvolutionNd(*bottleneck_csp31->getOutput(0), 8, DimsHW{ 2, 2 }, deconvwts32, emptywts); deconv32->setStrideNd(DimsHW{ 2, 2 }); deconv32->setNbGroups(8); auto conv33 = convBlock(network, weightMap, *deconv32->getOutput(0), 2, 3, 1, 1, "model.33"); // segmentation output ISliceLayer *slicelayer = network->addSlice(*conv33->getOutput(0), Dims3{ 0, (Yolo::INPUT_H - Yolo::IMG_H) / 2, 0 }, Dims3{ 2, Yolo::IMG_H, Yolo::IMG_W }, Dims3{ 1, 1, 1 }); auto segout = network->addTopK(*slicelayer->getOutput(0), TopKOperation::kMAX, 1, 1); segout->getOutput(1)->setName(OUTPUT_SEG_NAME); auto conv34 = convBlock(network, weightMap, *cat16->getOutput(0), 128, 3, 1, 1, "model.34"); // upsample35 Weights deconvwts35{ DataType::kFLOAT, deval, 128 * 2 * 2 }; IDeconvolutionLayer* deconv35 = network->addDeconvolutionNd(*conv34->getOutput(0), 128, DimsHW{ 2, 2 }, deconvwts35, emptywts); deconv35->setStrideNd(DimsHW{ 2, 2 }); deconv35->setNbGroups(128); auto 
bottleneck_csp36 = bottleneckCSP(network, weightMap, *deconv35->getOutput(0), 128, 64, 1, false, 1, 0.5, "model.36"); auto conv37 = convBlock(network, weightMap, *bottleneck_csp36->getOutput(0), 32, 3, 1, 1, "model.37"); // upsample38 Weights deconvwts38{ DataType::kFLOAT, deval, 32 * 2 * 2 }; IDeconvolutionLayer* deconv38 = network->addDeconvolutionNd(*conv37->getOutput(0), 32, DimsHW{ 2, 2 }, deconvwts38, emptywts); deconv38->setStrideNd(DimsHW{ 2, 2 }); deconv38->setNbGroups(32); auto conv39 = convBlock(network, weightMap, *deconv38->getOutput(0), 16, 3, 1, 1, "model.39"); auto bottleneck_csp40 = bottleneckCSP(network, weightMap, *conv39->getOutput(0), 16, 8, 1, false, 1, 0.5, "model.40"); // upsample41 Weights deconvwts41{ DataType::kFLOAT, deval, 8 * 2 * 2 }; IDeconvolutionLayer* deconv41 = network->addDeconvolutionNd(*bottleneck_csp40->getOutput(0), 8, DimsHW{ 2, 2 }, deconvwts41, emptywts); deconv41->setStrideNd(DimsHW{ 2, 2 }); deconv41->setNbGroups(8); auto conv42 = convBlock(network, weightMap, *deconv41->getOutput(0), 2, 3, 1, 1, "model.42"); // lane-det output ISliceLayer *laneSlice = network->addSlice(*conv42->getOutput(0), Dims3{ 0, (Yolo::INPUT_H - Yolo::IMG_H) / 2, 0 }, Dims3{ 2, Yolo::IMG_H, Yolo::IMG_W }, Dims3{ 1, 1, 1 }); auto laneout = network->addTopK(*laneSlice->getOutput(0), TopKOperation::kMAX, 1, 1); laneout->getOutput(1)->setName(OUTPUT_LANE_NAME); // detection output network->markOutput(*detect24->getOutput(0)); // segmentation output network->markOutput(*segout->getOutput(1)); // lane output network->markOutput(*laneout->getOutput(1)); assert(false); // Build engine builder->setMaxBatchSize(maxBatchSize); config->setMaxWorkspaceSize(2L * (1L << 30)); // 2GB #if defined(USE_FP16) config->setFlag(BuilderFlag::kFP16); #endif std::cout << "Building engine, please wait for a while..." << std::endl; ICudaEngine* engine = builder->buildEngineWithConfig(*network, *config); std::cout << "Build engine successfully!" 
<< std::endl; // Don't need the network any more network->destroy(); // Release host memory for (auto& mem : weightMap) { free((void*)(mem.second.values)); } return engine; } void APIToModel(unsigned int maxBatchSize, IHostMemory** modelStream, std::string& wts_name) { // Create builder IBuilder* builder = createInferBuilder(gLogger); IBuilderConfig* config = builder->createBuilderConfig(); // Create model to populate the network, then set the outputs and create an engine ICudaEngine* engine = build_engine(maxBatchSize, builder, config, DataType::kFLOAT, wts_name); assert(engine != nullptr); // Serialize the engine (*modelStream) = engine->serialize(); // Close everything down engine->destroy(); builder->destroy(); config->destroy(); } void doInference(IExecutionContext& context, cudaStream_t& stream, void **buffers, float* det_output, int* seg_output, int* lane_output, int batchSize) { // DMA input batch data to device, infer on the batch asynchronously, and DMA output back to host // CUDA_CHECK(cudaMemcpyAsync(buffers[0], input, batchSize * 3 * INPUT_H * INPUT_W * sizeof(float), cudaMemcpyHostToDevice, stream)); context.enqueue(batchSize, buffers, stream, nullptr); CUDA_CHECK(cudaMemcpyAsync(det_output, buffers[1], batchSize * OUTPUT_SIZE * sizeof(float), cudaMemcpyDeviceToHost, stream)); CUDA_CHECK(cudaMemcpyAsync(seg_output, buffers[2], batchSize * IMG_H * IMG_W * sizeof(int), cudaMemcpyDeviceToHost, stream)); CUDA_CHECK(cudaMemcpyAsync(lane_output, buffers[3], batchSize * IMG_H * IMG_W * sizeof(int), cudaMemcpyDeviceToHost, stream)); cudaStreamSynchronize(stream); } void doInferenceCpu(IExecutionContext& context, cudaStream_t& stream, void **buffers, float* input, float* det_output, int* seg_output, int* lane_output, int batchSize) { // DMA input batch data to device, infer on the batch asynchronously, and DMA output back to host CUDA_CHECK(cudaMemcpyAsync(buffers[0], input, batchSize * 3 * INPUT_H * INPUT_W * sizeof(float), cudaMemcpyHostToDevice, stream)); 
context.enqueue(batchSize, buffers, stream, nullptr); CUDA_CHECK(cudaMemcpyAsync(det_output, buffers[1], batchSize * OUTPUT_SIZE * sizeof(float), cudaMemcpyDeviceToHost, stream)); CUDA_CHECK(cudaMemcpyAsync(seg_output, buffers[2], batchSize * IMG_H * IMG_W * sizeof(int), cudaMemcpyDeviceToHost, stream)); CUDA_CHECK(cudaMemcpyAsync(lane_output, buffers[3], batchSize * IMG_H * IMG_W * sizeof(int), cudaMemcpyDeviceToHost, stream)); cudaStreamSynchronize(stream); } bool parse_args(int argc, char** argv, std::string& wts, std::string& engine, std::string& img_dir) { if (argc < 4) return false; if (std::string(argv[1]) == "-s" && argc == 4) { wts = std::string(argv[2]); engine = std::string(argv[3]); } else if (std::string(argv[1]) == "-d" && argc == 4) { engine = std::string(argv[2]); img_dir = std::string(argv[3]); } else { return false; } return true; } ================================================ FILE: yolop/yolop_trt.py ================================================ # 2022/10/26 by ausk """ An example that uses TensorRT's Python api to make yolop inferences. """ import ctypes import os import shutil import random import sys import time import cv2 import numpy as np import pycuda.autoinit import pycuda.driver as cuda import tensorrt as trt CONF_THRESH = 0.5 IOU_THRESHOLD = 0.4 def get_img_path_batches(batch_size, img_dir): ret = [] batch = [] for root, dirs, files in os.walk(img_dir): for name in files: if len(batch) == batch_size: ret.append(batch) batch = [] batch.append(os.path.join(root, name)) if len(batch) > 0: ret.append(batch) return ret def plot_one_box(x, img, color=None, label=None, line_thickness=None): """ description: Plots one bounding box on image img, this function comes from YoLov5 project. 
""" tl = ( line_thickness or round(0.002 * (img.shape[0] + img.shape[1]) / 2) + 1) # line/font thickness color = color or [random.randint(0, 255) for _ in range(3)] c1, c2 = (int(x[0]), int(x[1])), (int(x[2]), int(x[3])) cv2.rectangle(img, c1, c2, color, thickness=tl, lineType=cv2.LINE_AA) if label: tf = max(tl - 1, 1) # font thickness t_size = cv2.getTextSize(label, 0, fontScale=tl / 3, thickness=tf)[0] c2 = c1[0] + t_size[0], c1[1] - t_size[1] - 3 cv2.rectangle(img, c1, c2, color, -1, cv2.LINE_AA) # filled cv2.putText( img, label, (c1[0], c1[1] - 2), 0, tl / 3, [225, 255, 255], thickness=tf, lineType=cv2.LINE_AA) class YolopTRT(object): """ description: Warps TensorRT ops, preprocess and postprocess ops. """ def __init__(self, engine_file_path): # Create a Context on this device, self.ctx = cuda.Device(0).make_context() stream = cuda.Stream() TRT_LOGGER = trt.Logger(trt.Logger.INFO) runtime = trt.Runtime(TRT_LOGGER) # Deserialize the engine from file with open(engine_file_path, "rb") as f: engine = runtime.deserialize_cuda_engine(f.read()) context = engine.create_execution_context() host_inputs = [] cuda_inputs = [] host_outputs = [] cuda_outputs = [] bindings = [] for binding in engine: print('bingding: ', binding, engine.get_binding_shape(binding)) size = trt.volume(engine.get_binding_shape(binding)) * engine.max_batch_size dtype = trt.nptype(engine.get_binding_dtype(binding)) # Allocate host and device buffers host_mem = cuda.pagelocked_empty(size, dtype) cuda_mem = cuda.mem_alloc(host_mem.nbytes) # Append the device buffer to device bindings. bindings.append(int(cuda_mem)) # Append to the appropriate list. 
if engine.binding_is_input(binding): self.input_w = engine.get_binding_shape(binding)[-1] self.input_h = engine.get_binding_shape(binding)[-2] host_inputs.append(host_mem) cuda_inputs.append(cuda_mem) else: host_outputs.append(host_mem) cuda_outputs.append(cuda_mem) self.input_h = 384 self.input_w = 640 self.img_h = 360 self.img_w = 640 # Store self.stream = stream self.context = context self.engine = engine self.host_inputs = host_inputs self.cuda_inputs = cuda_inputs self.host_outputs = host_outputs self.cuda_outputs = cuda_outputs self.bindings = bindings self.batch_size = engine.max_batch_size def infer(self, raw_image_generator): # Make self the active context, pushing it on top of the context stack. self.ctx.push() # Restore stream = self.stream context = self.context engine = self.engine host_inputs = self.host_inputs cuda_inputs = self.cuda_inputs host_outputs = self.host_outputs cuda_outputs = self.cuda_outputs bindings = self.bindings # Do image preprocess batch_image_raw = [] batch_origin_h = [] batch_origin_w = [] batch_input_image = np.empty(shape=[self.batch_size, 3, self.input_h, self.input_w]) for i, image_raw in enumerate(raw_image_generator): input_image, image_raw, origin_h, origin_w = self.preprocess_image(image_raw) batch_image_raw.append(image_raw) batch_origin_h.append(origin_h) batch_origin_w.append(origin_w) np.copyto(batch_input_image[i], input_image) batch_input_image = np.ascontiguousarray(batch_input_image) # Copy input image to host buffer np.copyto(host_inputs[0], batch_input_image.ravel()) start = time.time() # Transfer input data to the GPU. cuda.memcpy_htod_async(cuda_inputs[0], host_inputs[0], stream) # Run inference. context.execute_async(batch_size=self.batch_size, bindings=bindings, stream_handle=stream.handle) # Transfer predictions back from the GPU. 
for i in range(len(host_outputs)): cuda.memcpy_dtoh_async(host_outputs[i], cuda_outputs[i], stream) # Synchronize the stream stream.synchronize() end = time.time() # Remove any context from the top of the context stack, deactivating it. self.ctx.pop() # Here we use the first row of output in that batch_size = 1 detout = host_outputs[0] segout = host_outputs[1].reshape( (self.batch_size, self.img_h,self.img_w)) laneout = host_outputs[2].reshape( (self.batch_size, self.img_h,self.img_w)) # Do postprocess for i in range(self.batch_size): result_boxes, result_scores, result_classid = self.post_process( detout[i * 6001: (i + 1) * 6001], batch_origin_h[i], batch_origin_w[i] ) # Draw rectangles and labels on the original image img = batch_image_raw[i] nh = img.shape[0] nw = img.shape[1] for j in range(len(result_boxes)): box = result_boxes[j] label="{}:{:.2f}".format( categories[int(result_classid[j])], result_scores[j]) plot_one_box( box, img, label=label) seg = cv2.resize(segout[i], (nw, nh), interpolation=cv2.INTER_NEAREST) lane = cv2.resize(laneout[i], (nw, nh), interpolation=cv2.INTER_NEAREST) color_area = np.zeros_like(img) color_area[seg==1] = (0,255,0) color_area[lane==1] = (0,0,255) color_mask = np.mean(color_area, 2) img[color_mask != 0] = img[color_mask != 0] * 0.5 + color_area[color_mask != 0] * 0.5 img = img.astype(np.uint8) return batch_image_raw, end - start def destroy(self): # Remove any context from the top of the context stack, deactivating it. 
def xywh2xyxy(self, origin_h, origin_w, x):
    """
    description:    Turn centre-format boxes measured on the letterboxed
                    network input into corner-format boxes measured on the
                    original image (padding removed, scale undone).
    param:
        origin_h:   height of original image
        origin_w:   width of original image
        x:          A boxes numpy, each row is a box [center_x, center_y, w, h]
    return:
        corners:    A boxes numpy with the shape and dtype of x, each row is
                    a box [x1, y1, x2, y2]
    """
    scale_w = self.input_w / origin_w
    scale_h = self.input_h / origin_h
    half_w = x[:, 2] / 2
    half_h = x[:, 3] / 2

    corners = np.zeros_like(x)
    if scale_h > scale_w:
        # Width was the limiting side, so the image was padded top/bottom.
        scale = scale_w
        pad_y = (self.input_h - scale_w * origin_h) / 2
        corners[:, 0] = x[:, 0] - half_w
        corners[:, 2] = x[:, 0] + half_w
        corners[:, 1] = x[:, 1] - half_h - pad_y
        corners[:, 3] = x[:, 1] + half_h - pad_y
    else:
        # Height was the limiting side, so the image was padded left/right.
        scale = scale_h
        pad_x = (self.input_w - scale_h * origin_w) / 2
        corners[:, 0] = x[:, 0] - half_w - pad_x
        corners[:, 2] = x[:, 0] + half_w - pad_x
        corners[:, 1] = x[:, 1] - half_h
        corners[:, 3] = x[:, 1] + half_h

    # Undo the resize factor for all four coordinates at once.
    corners /= scale
    return corners
def non_max_suppression(self, prediction, origin_h, origin_w, conf_thres=0.5, nms_thres=0.4):
    """
    description: Drop detections whose confidence is below 'conf_thres', map
                 the survivors to original-image corner coordinates, and run
                 per-class greedy Non-Maximum Suppression on them.
    param:
        prediction: detections, one row per box laid out as
                    (center_x, center_y, w, h, conf, cls_id); the array is
                    not modified
        origin_h: original image height
        origin_w: original image width
        conf_thres: a confidence threshold to filter detections
        nms_thres: a iou threshold to filter detections
    return:
        boxes: output after nms with the shape (x1, y1, x2, y2, conf, cls_id),
               or an empty 1-D array when nothing survives
    """
    # Boolean-mask indexing returns a copy, so the in-place edits below never
    # touch the caller's array.
    boxes = prediction[prediction[:, 4] >= conf_thres]
    # Transform bbox from [center_x, center_y, w, h] to [x1, y1, x2, y2]
    boxes[:, :4] = self.xywh2xyxy(origin_h, origin_w, boxes[:, :4])
    # Clip the coordinates to the original image
    boxes[:, 0] = np.clip(boxes[:, 0], 0, origin_w - 1)
    boxes[:, 2] = np.clip(boxes[:, 2], 0, origin_w - 1)
    boxes[:, 1] = np.clip(boxes[:, 1], 0, origin_h - 1)
    boxes[:, 3] = np.clip(boxes[:, 3], 0, origin_h - 1)
    # Highest confidence first
    boxes = boxes[np.argsort(-boxes[:, 4])]

    keep_boxes = []
    while boxes.shape[0]:
        large_overlap = self.bbox_iou(np.expand_dims(boxes[0, :4], 0), boxes[:, :4]) > nms_thres
        label_match = boxes[0, -1] == boxes[:, -1]
        # Lower-confidence boxes of the same class that overlap the current best
        invalid = large_overlap & label_match
        # The current best box must always leave the candidate list. Relying on
        # its self-IoU exceeding nms_thres is not enough: with nms_thres >= 1
        # or NaN coordinates that comparison is False and the loop would never
        # shrink.
        invalid[0] = True
        keep_boxes.append(boxes[0])
        boxes = boxes[~invalid]
    return np.stack(keep_boxes, 0) if keep_boxes else np.array([])
cmake_minimum_required(VERSION 3.10)

project(yolov10)

add_definitions(-std=c++11)
# API_EXPORTS selects the dllexport / default-visibility branch of the API
# macro in include/macros.h.
add_definitions(-DAPI_EXPORTS)
set(CMAKE_CXX_STANDARD 11)

# Debug stays the default, but an explicit -DCMAKE_BUILD_TYPE=... is honoured.
if(NOT CMAKE_BUILD_TYPE)
  set(CMAKE_BUILD_TYPE Debug)
endif()

# Default nvcc location; override with -DCMAKE_CUDA_COMPILER=... if needed.
if(NOT CMAKE_CUDA_COMPILER)
  set(CMAKE_CUDA_COMPILER /usr/local/cuda/bin/nvcc)
endif()
enable_language(CUDA)

include_directories(${PROJECT_SOURCE_DIR}/include)
include_directories(${PROJECT_SOURCE_DIR}/plugin)

# TensorRT install prefix used on non-aarch64 hosts; override with
# -DTENSORRT_ROOT=/path/to/TensorRT if yours is different.
set(TENSORRT_ROOT "/workspace/shared/TensorRT-8.4.3.1" CACHE PATH "TensorRT installation root")

# include and link dirs of cuda and tensorrt, you need adapt them if yours are different
if(CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64")
  message("embed_platform on")
  include_directories(/usr/local/cuda/targets/aarch64-linux/include)
  link_directories(/usr/local/cuda/targets/aarch64-linux/lib)
else()
  message("embed_platform off")
  # cuda
  include_directories(/usr/local/cuda/include)
  link_directories(/usr/local/cuda/lib64)
  # tensorrt
  include_directories(${TENSORRT_ROOT}/include)
  link_directories(${TENSORRT_ROOT}/lib)
endif()

# Plugin library, also loaded by the Python runner through ctypes.
add_library(myplugins SHARED ${PROJECT_SOURCE_DIR}/plugin/yololayer.cu)
target_link_libraries(myplugins nvinfer cudart)

# The detector uses OpenCV unconditionally, so fail at configure time instead
# of at compile/link time when it is missing.
find_package(OpenCV REQUIRED)
include_directories(${OpenCV_INCLUDE_DIRS})

file(GLOB_RECURSE SRCS ${PROJECT_SOURCE_DIR}/src/*.cpp ${PROJECT_SOURCE_DIR}/src/*.cu)
add_executable(yolov10_det ${PROJECT_SOURCE_DIR}/yolov10_det.cpp ${SRCS})

target_link_libraries(yolov10_det nvinfer)
target_link_libraries(yolov10_det cudart)
target_link_libraries(yolov10_det myplugins)
target_link_libraries(yolov10_det ${OpenCV_LIBS})
generate the .wts file from the PyTorch .pt weights, or download a .wts file from the model zoo ```shell git clone https://github.com/THU-MIG/yolov10.git cd yolov10/ wget https://github.com/THU-MIG/yolov10/releases/download/v1.1/yolov10n.pt git clone https://github.com/wang-xinyu/tensorrtx.git cp [PATH-TO-TENSORRTX]/yolov10/gen_wts.py . python gen_wts.py -w yolov10n.pt -o yolov10n.wts # A file 'yolov10n.wts' will be generated. ``` 2. build tensorrtx/yolov10 and run #### Detection ```shell cd [PATH-TO-TENSORRTX]/yolov10 # add test images mkdir images cp [PATH-TO-TENSORRTX]/yolov3-spp/samples/*.jpg ./images # Update kNumClass in include/config.h if your model is trained on a custom dataset mkdir build cd build cp [PATH-TO-yolov10]/yolov10n.wts . cmake .. make # Build and serialize TensorRT engine ./yolov10_det -s yolov10n.wts yolov10n.engine [n/s/m/b/l/x] # Run inference ./yolov10_det -d yolov10n.engine ../images # The results are displayed in the console ``` 3. Optional, load and run the TensorRT engine in Python ```shell // Install python-tensorrt, pycuda, etc. // Make sure yolov10n.engine and libmyplugins.so have been built python yolov10_det_trt.py ./build/yolov10n.engine ./build/libmyplugins.so ``` ## INT8 Quantization 1. Prepare calibration images; you can randomly select about 1000 images from your training set. For COCO, you can also download my calibration images `coco_calib` from [GoogleDrive](https://drive.google.com/drive/folders/1s7jE9DtOngZMzJC1uL307J2MiaGwdRSI?usp=sharing) or [BaiduPan](https://pan.baidu.com/s/1GOm_-JobpyLMAqZWCDUhKg) (pwd: a9wh) 2. unzip it in yolov10/build 3. set the macro `USE_INT8` in include/config.h and run make again 4.
def parse_args():
    """Read the command line and resolve the weight and output file paths.

    The output path falls back to the weights path with its extension
    swapped for ``.wts``; when ``--output`` names an existing directory the
    ``.wts`` file is placed inside it under the weights file's base name.

    Returns:
        A ``(weights_path, output_path)`` tuple.

    Raises:
        SystemExit: if the weights path is not an existing file.
    """
    cli = argparse.ArgumentParser(description='Convert .pt file to .wts')
    cli.add_argument('-w', '--weights', default='./weights/yolov10n.pt',
                     help='Input weights (.pt) file path (required)')
    cli.add_argument('-o', '--output', help='Output (.wts) file path (optional)')
    opts = cli.parse_args()

    weights_path = opts.weights
    if not os.path.isfile(weights_path):
        raise SystemExit('Invalid input file')

    output_path = opts.output
    if not output_path:
        # No explicit target: write next to the weights file.
        output_path = os.path.splitext(weights_path)[0] + '.wts'
    elif os.path.isdir(output_path):
        # Target is a directory: reuse the weights file's base name inside it.
        stem = os.path.splitext(os.path.basename(weights_path))[0]
        output_path = os.path.join(output_path, stem + '.wts')
    return weights_path, output_path
nvinfer1::ITensor& input, std::string lname, float eps); nvinfer1::IElementWiseLayer* convBnSiLU(nvinfer1::INetworkDefinition* network, std::map weightMap, nvinfer1::ITensor& input, int ch, int k, int s, std::string lname, int g = 1); nvinfer1::IElementWiseLayer* C2F(nvinfer1::INetworkDefinition* network, std::map weightMap, nvinfer1::ITensor& input, int c1, int c2, int n, bool shortcut, float e, std::string lname); nvinfer1::IElementWiseLayer* C2(nvinfer1::INetworkDefinition* network, std::map& weightMap, nvinfer1::ITensor& input, int c1, int c2, int n, bool shortcut, float e, std::string lname); nvinfer1::IElementWiseLayer* SPPF(nvinfer1::INetworkDefinition* network, std::map weightMap, nvinfer1::ITensor& input, int c1, int c2, int k, std::string lname); nvinfer1::IShuffleLayer* DFL(nvinfer1::INetworkDefinition* network, std::map weightMap, nvinfer1::ITensor& input, int ch, int grid, int k, int s, int p, std::string lname); nvinfer1::IPluginV2Layer* addYoLoLayer(nvinfer1::INetworkDefinition* network, std::vector dets, const int* px_arry, int px_arry_num); nvinfer1::ILayer* SCDown(nvinfer1::INetworkDefinition* network, std::map weightMap, nvinfer1::ITensor& input, int ch, int k, int s, std::string lname); nvinfer1::ILayer* PSA(nvinfer1::INetworkDefinition* network, std::map weightMap, nvinfer1::ITensor& input, int ch, std::string lname); nvinfer1::ILayer* C2fCIB(nvinfer1::INetworkDefinition* network, std::map weightMap, nvinfer1::ITensor& input, int c1, int c2, int n, bool shortcut, bool lk, float e, std::string lname); ================================================ FILE: yolov10/include/calibrator.h ================================================ #ifndef ENTROPY_CALIBRATOR_H #define ENTROPY_CALIBRATOR_H #include #include #include #include "macros.h" //! \class Int8EntropyCalibrator2 //! //! \brief Implements Entropy calibrator 2. //! CalibrationAlgoType is kENTROPY_CALIBRATION_2. //! 
#ifndef CUDA_CHECK
// Evaluates a CUDA runtime call exactly once. On failure it prints the numeric
// error code, the runtime's description of it and the call site to stderr,
// then trips assert(0).
// NOTE(review): assert is compiled out when NDEBUG is defined, so in such
// builds a failing call is only reported and execution continues.
#define CUDA_CHECK(callstr)                                                                     \
    {                                                                                           \
        cudaError_t error_code = callstr;                                                       \
        if (error_code != cudaSuccess) {                                                        \
            std::cerr << "CUDA error " << error_code << " (" << cudaGetErrorString(error_code)  \
                      << ") at " << __FILE__ << ":" << __LINE__ << std::endl;                   \
            assert(0);                                                                          \
        }                                                                                       \
    }
#endif  // CUDA_CHECK
//!
//! \class LogStreamConsumerBuffer
//! \brief String buffer that forwards its contents, with a severity prefix, to an output stream
//!        each time it is synchronized (flushed) and once more on destruction if text is pending.
//!
class LogStreamConsumerBuffer : public std::stringbuf {
   public:
    //! \param stream    Stream that receives the prefixed message text
    //! \param prefix    Text inserted in front of every emitted message
    //! \param shouldLog When false, putOutput() emits nothing
    LogStreamConsumerBuffer(std::ostream& stream, const std::string& prefix, bool shouldLog)
        : mOutput(stream), mPrefix(prefix), mShouldLog(shouldLog) {}

    //! Carries over the target stream, prefix and logging flag. Text still buffered in \p other
    //! is not transferred. All members are initialized so that putOutput() never reads an
    //! indeterminate mShouldLog.
    LogStreamConsumerBuffer(LogStreamConsumerBuffer&& other)
        : mOutput(other.mOutput), mPrefix(other.mPrefix), mShouldLog(other.mShouldLog) {}

    ~LogStreamConsumerBuffer() {
        // std::streambuf::pbase() gives a pointer to the beginning of the buffered part of the output sequence
        // std::streambuf::pptr() gives a pointer to the current position of the output sequence
        // if the pointer to the beginning is not equal to the pointer to the current position,
        // call putOutput() to log the output to the stream
        if (pbase() != pptr()) {
            putOutput();
        }
    }

    // synchronizes the stream buffer and returns 0 on success
    // synchronizing the stream buffer consists of inserting the buffer contents into the stream,
    // resetting the buffer and flushing the stream
    virtual int sync() {
        putOutput();
        return 0;
    }

    //! Emits a timestamp on std::cout, then the prefix and the buffered text on the target
    //! stream, and finally clears the buffer. Does nothing when logging is disabled.
    void putOutput() {
        if (mShouldLog) {
            // prepend timestamp
            std::time_t timestamp = std::time(nullptr);
            tm* tm_local = std::localtime(&timestamp);
            std::cout << "[";
            std::cout << std::setw(2) << std::setfill('0') << 1 + tm_local->tm_mon << "/";
            std::cout << std::setw(2) << std::setfill('0') << tm_local->tm_mday << "/";
            std::cout << std::setw(4) << std::setfill('0') << 1900 + tm_local->tm_year << "-";
            std::cout << std::setw(2) << std::setfill('0') << tm_local->tm_hour << ":";
            std::cout << std::setw(2) << std::setfill('0') << tm_local->tm_min << ":";
            std::cout << std::setw(2) << std::setfill('0') << tm_local->tm_sec << "] ";
            // std::stringbuf::str() gets the string contents of the buffer
            // insert the buffer contents pre-appended by the appropriate prefix into the stream
            mOutput << mPrefix << str();
            // set the buffer to empty
            str("");
            // flush the stream
            mOutput.flush();
        }
    }

    void setShouldLog(bool shouldLog) { mShouldLog = shouldLog; }

   private:
    std::ostream& mOutput;  // destination of the prefixed message text
    std::string mPrefix;    // severity tag such as "[I] "
    bool mShouldLog;        // gate checked by putOutput()
};
\brief Creates a LogStreamConsumer which logs messages with level severity. //! Reportable severity determines if the messages are severe enough to be logged. LogStreamConsumer(Severity reportableSeverity, Severity severity) : LogStreamConsumerBase(severityOstream(severity), severityPrefix(severity), severity <= reportableSeverity), std::ostream(&mBuffer) // links the stream buffer with the stream , mShouldLog(severity <= reportableSeverity), mSeverity(severity) {} LogStreamConsumer(LogStreamConsumer&& other) : LogStreamConsumerBase(severityOstream(other.mSeverity), severityPrefix(other.mSeverity), other.mShouldLog), std::ostream(&mBuffer) // links the stream buffer with the stream , mShouldLog(other.mShouldLog), mSeverity(other.mSeverity) {} void setReportableSeverity(Severity reportableSeverity) { mShouldLog = mSeverity <= reportableSeverity; mBuffer.setShouldLog(mShouldLog); } private: static std::ostream& severityOstream(Severity severity) { return severity >= Severity::kINFO ? std::cout : std::cerr; } static std::string severityPrefix(Severity severity) { switch (severity) { case Severity::kINTERNAL_ERROR: return "[F] "; case Severity::kERROR: return "[E] "; case Severity::kWARNING: return "[W] "; case Severity::kINFO: return "[I] "; case Severity::kVERBOSE: return "[V] "; default: assert(0); return ""; } } bool mShouldLog; Severity mSeverity; }; //! \class Logger //! //! \brief Class which manages logging of TensorRT tools and samples //! //! \details This class provides a common interface for TensorRT tools and samples to log information to the console, //! and supports logging two types of messages: //! //! - Debugging messages with an associated severity (info, warning, error, or internal error/fatal) //! - Test pass/fail messages //! //! The advantage of having all samples use this class for logging as opposed to emitting directly to stdout/stderr is //! 
that the logic for controlling the verbosity and formatting of sample output is centralized in one location. //! //! In the future, this class could be extended to support dumping test results to a file in some standard format //! (for example, JUnit XML), and providing additional metadata (e.g. timing the duration of a test run). //! //! TODO: For backwards compatibility with existing samples, this class inherits directly from the nvinfer1::ILogger //! interface, which is problematic since there isn't a clean separation between messages coming from the TensorRT //! library and messages coming from the sample. //! //! In the future (once all samples are updated to use Logger::getTRTLogger() to access the ILogger) we can refactor the //! class to eliminate the inheritance and instead make the nvinfer1::ILogger implementation a member of the Logger //! object. class Logger : public nvinfer1::ILogger { public: Logger(Severity severity = Severity::kWARNING) : mReportableSeverity(severity) {} //! //! \enum TestResult //! \brief Represents the state of a given test //! enum class TestResult { kRUNNING, //!< The test is running kPASSED, //!< The test passed kFAILED, //!< The test failed kWAIVED //!< The test was waived }; //! //! \brief Forward-compatible method for retrieving the nvinfer::ILogger associated with this Logger //! \return The nvinfer1::ILogger associated with this Logger //! //! TODO Once all samples are updated to use this method to register the logger with TensorRT, //! we can eliminate the inheritance of Logger from ILogger //! nvinfer1::ILogger& getTRTLogger() { return *this; } //! //! \brief Implementation of the nvinfer1::ILogger::log() virtual method //! //! Note samples should not be calling this function directly; it will eventually go away once we eliminate the //! inheritance from nvinfer1::ILogger //! 
void log(Severity severity, const char* msg) TRT_NOEXCEPT override { LogStreamConsumer(mReportableSeverity, severity) << "[TRT] " << std::string(msg) << std::endl; } //! //! \brief Method for controlling the verbosity of logging output //! //! \param severity The logger will only emit messages that have severity of this level or higher. //! void setReportableSeverity(Severity severity) { mReportableSeverity = severity; } //! //! \brief Opaque handle that holds logging information for a particular test //! //! This object is an opaque handle to information used by the Logger to print test results. //! The sample must call Logger::defineTest() in order to obtain a TestAtom that can be used //! with Logger::reportTest{Start,End}(). //! class TestAtom { public: TestAtom(TestAtom&&) = default; private: friend class Logger; TestAtom(bool started, const std::string& name, const std::string& cmdline) : mStarted(started), mName(name), mCmdline(cmdline) {} bool mStarted; std::string mName; std::string mCmdline; }; //! //! \brief Define a test for logging //! //! \param[in] name The name of the test. This should be a string starting with //! "TensorRT" and containing dot-separated strings containing //! the characters [A-Za-z0-9_]. //! For example, "TensorRT.sample_googlenet" //! \param[in] cmdline The command line used to reproduce the test // //! \return a TestAtom that can be used in Logger::reportTest{Start,End}(). //! static TestAtom defineTest(const std::string& name, const std::string& cmdline) { return TestAtom(false, name, cmdline); } //! //! \brief A convenience overloaded version of defineTest() that accepts an array of command-line arguments //! as input //! //! \param[in] name The name of the test //! \param[in] argc The number of command-line arguments //! \param[in] argv The array of command-line arguments (given as C strings) //! //! \return a TestAtom that can be used in Logger::reportTest{Start,End}(). 
static TestAtom defineTest(const std::string& name, int argc, char const* const* argv) { auto cmdline = genCmdlineString(argc, argv); return defineTest(name, cmdline); } //! //! \brief Report that a test has started. //! //! \pre reportTestStart() has not been called yet for the given testAtom //! //! \param[in] testAtom The handle to the test that has started //! static void reportTestStart(TestAtom& testAtom) { reportTestResult(testAtom, TestResult::kRUNNING); assert(!testAtom.mStarted); testAtom.mStarted = true; } //! //! \brief Report that a test has ended. //! //! \pre reportTestStart() has been called for the given testAtom //! //! \param[in] testAtom The handle to the test that has ended //! \param[in] result The result of the test. Should be one of TestResult::kPASSED, //! TestResult::kFAILED, TestResult::kWAIVED //! static void reportTestEnd(const TestAtom& testAtom, TestResult result) { assert(result != TestResult::kRUNNING); assert(testAtom.mStarted); reportTestResult(testAtom, result); } static int reportPass(const TestAtom& testAtom) { reportTestEnd(testAtom, TestResult::kPASSED); return EXIT_SUCCESS; } static int reportFail(const TestAtom& testAtom) { reportTestEnd(testAtom, TestResult::kFAILED); return EXIT_FAILURE; } static int reportWaive(const TestAtom& testAtom) { reportTestEnd(testAtom, TestResult::kWAIVED); return EXIT_SUCCESS; } static int reportTest(const TestAtom& testAtom, bool pass) { return pass ? reportPass(testAtom) : reportFail(testAtom); } Severity getReportableSeverity() const { return mReportableSeverity; } private: //! //! \brief returns an appropriate string for prefixing a log message with the given severity //! static const char* severityPrefix(Severity severity) { switch (severity) { case Severity::kINTERNAL_ERROR: return "[F] "; case Severity::kERROR: return "[E] "; case Severity::kWARNING: return "[W] "; case Severity::kINFO: return "[I] "; case Severity::kVERBOSE: return "[V] "; default: assert(0); return ""; } } //! //! 
\brief returns an appropriate string for prefixing a test result message with the given result //! static const char* testResultString(TestResult result) { switch (result) { case TestResult::kRUNNING: return "RUNNING"; case TestResult::kPASSED: return "PASSED"; case TestResult::kFAILED: return "FAILED"; case TestResult::kWAIVED: return "WAIVED"; default: assert(0); return ""; } } //! //! \brief returns an appropriate output stream (cout or cerr) to use with the given severity //! static std::ostream& severityOstream(Severity severity) { return severity >= Severity::kINFO ? std::cout : std::cerr; } //! //! \brief method that implements logging test results //! static void reportTestResult(const TestAtom& testAtom, TestResult result) { severityOstream(Severity::kINFO) << "&&&& " << testResultString(result) << " " << testAtom.mName << " # " << testAtom.mCmdline << std::endl; } //! //! \brief generate a command line string from the given (argc, argv) values //! static std::string genCmdlineString(int argc, char const* const* argv) { std::stringstream ss; for (int i = 0; i < argc; i++) { if (i > 0) ss << " "; ss << argv[i]; } return ss.str(); } Severity mReportableSeverity; }; namespace { //! //! \brief produces a LogStreamConsumer object that can be used to log messages of severity kVERBOSE //! //! Example usage: //! //! LOG_VERBOSE(logger) << "hello world" << std::endl; //! inline LogStreamConsumer LOG_VERBOSE(const Logger& logger) { return LogStreamConsumer(logger.getReportableSeverity(), Severity::kVERBOSE); } //! //! \brief produces a LogStreamConsumer object that can be used to log messages of severity kINFO //! //! Example usage: //! //! LOG_INFO(logger) << "hello world" << std::endl; //! inline LogStreamConsumer LOG_INFO(const Logger& logger) { return LogStreamConsumer(logger.getReportableSeverity(), Severity::kINFO); } //! //! \brief produces a LogStreamConsumer object that can be used to log messages of severity kWARNING //! //! Example usage: //! //! 
LOG_WARN(logger) << "hello world" << std::endl; //! inline LogStreamConsumer LOG_WARN(const Logger& logger) { return LogStreamConsumer(logger.getReportableSeverity(), Severity::kWARNING); } //! //! \brief produces a LogStreamConsumer object that can be used to log messages of severity kERROR //! //! Example usage: //! //! LOG_ERROR(logger) << "hello world" << std::endl; //! inline LogStreamConsumer LOG_ERROR(const Logger& logger) { return LogStreamConsumer(logger.getReportableSeverity(), Severity::kERROR); } //! //! \brief produces a LogStreamConsumer object that can be used to log messages of severity kINTERNAL_ERROR // ("fatal" severity) //! //! Example usage: //! //! LOG_FATAL(logger) << "hello world" << std::endl; //! inline LogStreamConsumer LOG_FATAL(const Logger& logger) { return LogStreamConsumer(logger.getReportableSeverity(), Severity::kINTERNAL_ERROR); } } // anonymous namespace #endif // TENSORRT_LOGGING_H ================================================ FILE: yolov10/include/macros.h ================================================ #ifndef __MACROS_H #define __MACROS_H #include "NvInfer.h" #ifdef API_EXPORTS #if defined(_MSC_VER) #define API __declspec(dllexport) #else #define API __attribute__((visibility("default"))) #endif #else #if defined(_MSC_VER) #define API __declspec(dllimport) #else #define API #endif #endif // API_EXPORTS #if NV_TENSORRT_MAJOR >= 8 #define TRT_NOEXCEPT noexcept #define TRT_CONST_ENQUEUE const #else #define TRT_NOEXCEPT #define TRT_CONST_ENQUEUE #endif #endif // __MACROS_H ================================================ FILE: yolov10/include/model.h ================================================ #pragma once #include #include #include "NvInfer.h" nvinfer1::IHostMemory* buildEngineYolov10DetN(nvinfer1::IBuilder* builder, nvinfer1::IBuilderConfig* config, nvinfer1::DataType dt, const std::string& wts_path, float& gd, float& gw, int& max_channels); nvinfer1::IHostMemory* buildEngineYolov10DetS(nvinfer1::IBuilder* builder, 
nvinfer1::IBuilderConfig* config, nvinfer1::DataType dt, const std::string& wts_path, float& gd, float& gw, int& max_channels); nvinfer1::IHostMemory* buildEngineYolov10DetM(nvinfer1::IBuilder* builder, nvinfer1::IBuilderConfig* config, nvinfer1::DataType dt, const std::string& wts_path, float& gd, float& gw, int& max_channels); nvinfer1::IHostMemory* buildEngineYolov10DetBL(nvinfer1::IBuilder* builder, nvinfer1::IBuilderConfig* config, nvinfer1::DataType dt, const std::string& wts_path, float& gd, float& gw, int& max_channels); nvinfer1::IHostMemory* buildEngineYolov10DetX(nvinfer1::IBuilder* builder, nvinfer1::IBuilderConfig* config, nvinfer1::DataType dt, const std::string& wts_path, float& gd, float& gw, int& max_channels); ================================================ FILE: yolov10/include/postprocess.h ================================================ #pragma once #include #include "NvInfer.h" #include "types.h" cv::Rect get_rect(cv::Mat& img, float bbox[4]); void draw_bbox(std::vector& img_batch, std::vector>& res_batch); void batch_topk(std::vector>& res_batch, float* output, int batch_size, int output_size, float conf_thresh, int topk = 300); ================================================ FILE: yolov10/include/preprocess.h ================================================ #pragma once #include #include #include "NvInfer.h" #include "types.h" void cuda_preprocess_init(int max_image_size); void cuda_preprocess_destroy(); void cuda_preprocess(uint8_t* src, int src_width, int src_height, float* dst, int dst_width, int dst_height, cudaStream_t stream); void cuda_batch_preprocess(std::vector& img_batch, float* dst, int dst_width, int dst_height, cudaStream_t stream); ================================================ FILE: yolov10/include/types.h ================================================ #pragma once #include "config.h" struct alignas(float) Detection { //center_x center_y w h float bbox[4]; float conf; // bbox_conf * cls_conf float class_id; }; struct 
AffineMatrix { float value[6]; }; const int bbox_element = sizeof(Detection) / sizeof(float) + 1; // left, top, right, bottom, confidence, class, keepflag ================================================ FILE: yolov10/include/utils.h ================================================ #pragma once #include #include #include static inline cv::Mat preprocess_img(cv::Mat& img, int input_w, int input_h) { int w, h, x, y; float r_w = input_w / (img.cols * 1.0); float r_h = input_h / (img.rows * 1.0); if (r_h > r_w) { w = input_w; h = r_w * img.rows; x = 0; y = (input_h - h) / 2; } else { w = r_h * img.cols; h = input_h; x = (input_w - w) / 2; y = 0; } cv::Mat re(h, w, CV_8UC3); cv::resize(img, re, re.size(), 0, 0, cv::INTER_LINEAR); cv::Mat out(input_h, input_w, CV_8UC3, cv::Scalar(128, 128, 128)); re.copyTo(out(cv::Rect(x, y, re.cols, re.rows))); return out; } static inline int read_files_in_dir(const char* p_dir_name, std::vector& file_names) { DIR* p_dir = opendir(p_dir_name); if (p_dir == nullptr) { return -1; } struct dirent* p_file = nullptr; while ((p_file = readdir(p_dir)) != nullptr) { if (strcmp(p_file->d_name, ".") != 0 && strcmp(p_file->d_name, "..") != 0) { //std::string cur_file_name(p_dir_name); //cur_file_name += "/"; //cur_file_name += p_file->d_name; std::string cur_file_name(p_file->d_name); // std::cout << "Found file: " << cur_file_name << std::endl; file_names.push_back(cur_file_name); } } closedir(p_dir); return 0; } // Function to trim leading and trailing whitespace from a string static inline std::string trim_leading_whitespace(const std::string& str) { size_t first = str.find_first_not_of(' '); if (std::string::npos == first) { return str; } size_t last = str.find_last_not_of(' '); return str.substr(first, (last - first + 1)); } // Src: https://stackoverflow.com/questions/16605967 static inline std::string to_string_with_precision(const float a_value, const int n = 2) { std::ostringstream out; out.precision(n); out << std::fixed << a_value; 
return out.str(); } static inline int read_labels(const std::string labels_filename, std::unordered_map& labels_map) { std::ifstream file(labels_filename); // Read each line of the file std::string line; int index = 0; while (std::getline(file, line)) { // Strip the line of any leading or trailing whitespace line = trim_leading_whitespace(line); // Add the stripped line to the labels_map, using the loop index as the key labels_map[index] = line; index++; } // Close the file file.close(); return 0; } ================================================ FILE: yolov10/plugin/yololayer.cu ================================================ #include #include #include #include #include "cuda_utils.h" #include "types.h" #include "yololayer.h" namespace Tn { template void write(char*& buffer, const T& val) { *reinterpret_cast(buffer) = val; buffer += sizeof(T); } template void read(const char*& buffer, T& val) { val = *reinterpret_cast(buffer); buffer += sizeof(T); } } // namespace Tn __device__ float sigmoid(float x) { return 1.0f / (1.0f + exp(-x)); } namespace nvinfer1 { YoloLayerPlugin::YoloLayerPlugin(int classCount, int netWidth, int netHeight, int maxOut, const int* strides, int stridesLength) { mClassCount = classCount; mYoloV10NetWidth = netWidth; mYoloV10netHeight = netHeight; mMaxOutObject = maxOut; mStridesLength = stridesLength; mStrides = new int[stridesLength]; memcpy(mStrides, strides, stridesLength * sizeof(int)); } YoloLayerPlugin::~YoloLayerPlugin() { if (mStrides != nullptr) { delete[] mStrides; mStrides = nullptr; } } YoloLayerPlugin::YoloLayerPlugin(const void* data, size_t length) { using namespace Tn; const char *d = reinterpret_cast(data), *a = d; read(d, mClassCount); read(d, mThreadCount); read(d, mYoloV10NetWidth); read(d, mYoloV10netHeight); read(d, mMaxOutObject); read(d, mStridesLength); mStrides = new int[mStridesLength]; for (int i = 0; i < mStridesLength; ++i) { read(d, mStrides[i]); } assert(d == a + length); } void 
YoloLayerPlugin::serialize(void* buffer) const TRT_NOEXCEPT { using namespace Tn; char *d = static_cast(buffer), *a = d; write(d, mClassCount); write(d, mThreadCount); write(d, mYoloV10NetWidth); write(d, mYoloV10netHeight); write(d, mMaxOutObject); write(d, mStridesLength); for (int i = 0; i < mStridesLength; ++i) { write(d, mStrides[i]); } assert(d == a + getSerializationSize()); } size_t YoloLayerPlugin::getSerializationSize() const TRT_NOEXCEPT { return sizeof(mClassCount) + sizeof(mThreadCount) + sizeof(mYoloV10netHeight) + sizeof(mYoloV10NetWidth) + sizeof(mMaxOutObject) + sizeof(mStridesLength) + sizeof(int) * mStridesLength; } int YoloLayerPlugin::initialize() TRT_NOEXCEPT { return 0; } nvinfer1::Dims YoloLayerPlugin::getOutputDimensions(int index, const nvinfer1::Dims* inputs, int nbInputDims) TRT_NOEXCEPT { int total_size = mMaxOutObject * sizeof(Detection) / sizeof(float); return nvinfer1::Dims3(total_size + 1, 1, 1); } void YoloLayerPlugin::setPluginNamespace(const char* pluginNamespace) TRT_NOEXCEPT { mPluginNamespace = pluginNamespace; } const char* YoloLayerPlugin::getPluginNamespace() const TRT_NOEXCEPT { return mPluginNamespace; } nvinfer1::DataType YoloLayerPlugin::getOutputDataType(int index, const nvinfer1::DataType* inputTypes, int nbInputs) const TRT_NOEXCEPT { return nvinfer1::DataType::kFLOAT; } bool YoloLayerPlugin::isOutputBroadcastAcrossBatch(int outputIndex, const bool* inputIsBroadcasted, int nbInputs) const TRT_NOEXCEPT { return false; } bool YoloLayerPlugin::canBroadcastInputAcrossBatch(int inputIndex) const TRT_NOEXCEPT { return false; } void YoloLayerPlugin::configurePlugin(nvinfer1::PluginTensorDesc const* in, int nbInput, nvinfer1::PluginTensorDesc const* out, int nbOutput) TRT_NOEXCEPT{}; void YoloLayerPlugin::attachToContext(cudnnContext* cudnnContext, cublasContext* cublasContext, IGpuAllocator* gpuAllocator) TRT_NOEXCEPT{}; void YoloLayerPlugin::detachFromContext() TRT_NOEXCEPT {} const char* YoloLayerPlugin::getPluginType() 
const TRT_NOEXCEPT { return "YoloLayer_TRT"; } const char* YoloLayerPlugin::getPluginVersion() const TRT_NOEXCEPT { return "1"; } void YoloLayerPlugin::destroy() TRT_NOEXCEPT { delete this; } nvinfer1::IPluginV2IOExt* YoloLayerPlugin::clone() const TRT_NOEXCEPT { YoloLayerPlugin* p = new YoloLayerPlugin(mClassCount, mYoloV10NetWidth, mYoloV10netHeight, mMaxOutObject, mStrides, mStridesLength); p->setPluginNamespace(mPluginNamespace); return p; } int YoloLayerPlugin::enqueue(int batchSize, const void* TRT_CONST_ENQUEUE* inputs, void* const* outputs, void* workspace, cudaStream_t stream) TRT_NOEXCEPT { forwardGpu((const float* const*)inputs, (float*)outputs[0], stream, mYoloV10netHeight, mYoloV10NetWidth, batchSize); return 0; } __device__ float Logist(float data) { return 1.0f / (1.0f + expf(-data)); }; __global__ void CalDetection(const float* input, float* output, int numElements, int maxoutobject, const int grid_h, int grid_w, const int stride, int classes, int outputElem) { int idx = threadIdx.x + blockDim.x * blockIdx.x; if (idx >= numElements) return; int total_grid = grid_h * grid_w; int info_len = 4 + classes; int batchIdx = idx / total_grid; int elemIdx = idx % total_grid; const float* curInput = input + batchIdx * total_grid * info_len; int outputIdx = batchIdx * outputElem; int class_id = 0; float max_cls_prob = 0.0; for (int i = 4; i < 4 + classes; i++) { float p = Logist(curInput[elemIdx + i * total_grid]); if (p > max_cls_prob) { max_cls_prob = p; class_id = i - 4; } } if (max_cls_prob < 0.1) return; int count = (int)atomicAdd(output + outputIdx, 1); if (count >= maxoutobject) return; char* data = (char*)(output + outputIdx) + sizeof(float) + count * sizeof(Detection); Detection* det = (Detection*)(data); int row = elemIdx / grid_w; int col = elemIdx % grid_w; det->conf = max_cls_prob; det->class_id = class_id; det->bbox[0] = (col + 0.5f - curInput[elemIdx + 0 * total_grid]) * stride; det->bbox[1] = (row + 0.5f - curInput[elemIdx + 1 * total_grid]) * 
stride; det->bbox[2] = (col + 0.5f + curInput[elemIdx + 2 * total_grid]) * stride; det->bbox[3] = (row + 0.5f + curInput[elemIdx + 3 * total_grid]) * stride; } void YoloLayerPlugin::forwardGpu(const float* const* inputs, float* output, cudaStream_t stream, int mYoloV10netHeight, int mYoloV10NetWidth, int batchSize) { int outputElem = 1 + mMaxOutObject * sizeof(Detection) / sizeof(float); cudaMemsetAsync(output, 0, sizeof(float), stream); for (int idx = 0; idx < batchSize; ++idx) { CUDA_CHECK(cudaMemsetAsync(output + idx * outputElem, 0, sizeof(float), stream)); } int numElem = 0; // const int maxGrids = mStridesLength; // int grids[maxGrids][2]; // for (int i = 0; i < maxGrids; ++i) { // grids[i][0] = mYoloV10netHeight / mStrides[i]; // grids[i][1] = mYoloV10NetWidth / mStrides[i]; // } int maxGrids = mStridesLength; int flatGridsLen = 2 * maxGrids; int* flatGrids = new int[flatGridsLen]; for (int i = 0; i < maxGrids; ++i) { flatGrids[2 * i] = mYoloV10netHeight / mStrides[i]; flatGrids[2 * i + 1] = mYoloV10NetWidth / mStrides[i]; } for (unsigned int i = 0; i < maxGrids; i++) { // Access the elements of the original 2D array from the flattened 1D array int grid_h = flatGrids[2 * i]; // Corresponds to the access of grids[i][0] int grid_w = flatGrids[2 * i + 1]; // Corresponds to the access of grids[i][1] int stride = mStrides[i]; numElem = grid_h * grid_w * batchSize; // Calculate the total number of elements if (numElem < mThreadCount) // Adjust the thread count if needed mThreadCount = numElem; // The CUDA kernel call remains unchanged CalDetection<<<(numElem + mThreadCount - 1) / mThreadCount, mThreadCount, 0, stream>>>( inputs[i], output, numElem, mMaxOutObject, grid_h, grid_w, stride, mClassCount, outputElem); } delete[] flatGrids; } PluginFieldCollection YoloPluginCreator::mFC{}; std::vector YoloPluginCreator::mPluginAttributes; YoloPluginCreator::YoloPluginCreator() { mPluginAttributes.clear(); mFC.nbFields = mPluginAttributes.size(); mFC.fields = 
mPluginAttributes.data(); } const char* YoloPluginCreator::getPluginName() const TRT_NOEXCEPT { return "YoloLayer_TRT"; } const char* YoloPluginCreator::getPluginVersion() const TRT_NOEXCEPT { return "1"; } const PluginFieldCollection* YoloPluginCreator::getFieldNames() TRT_NOEXCEPT { return &mFC; } IPluginV2IOExt* YoloPluginCreator::createPlugin(const char* name, const PluginFieldCollection* fc) TRT_NOEXCEPT { assert(fc->nbFields == 1); assert(strcmp(fc->fields[0].name, "combinedInfo") == 0); const int* combinedInfo = static_cast(fc->fields[0].data); int netinfo_count = 4; int class_count = combinedInfo[0]; int input_w = combinedInfo[1]; int input_h = combinedInfo[2]; int max_output_object_count = combinedInfo[3]; const int* px_arry = combinedInfo + netinfo_count; int px_arry_length = fc->fields[0].length - netinfo_count; YoloLayerPlugin* obj = new YoloLayerPlugin(class_count, input_w, input_h, max_output_object_count, px_arry, px_arry_length); obj->setPluginNamespace(mNamespace.c_str()); return obj; } IPluginV2IOExt* YoloPluginCreator::deserializePlugin(const char* name, const void* serialData, size_t serialLength) TRT_NOEXCEPT { // This object will be deleted when the network is destroyed, which will // call YoloLayerPlugin::destroy() YoloLayerPlugin* obj = new YoloLayerPlugin(serialData, serialLength); obj->setPluginNamespace(mNamespace.c_str()); return obj; } } // namespace nvinfer1 ================================================ FILE: yolov10/plugin/yololayer.h ================================================ #pragma once #include #include #include "NvInfer.h" #include "macros.h" namespace nvinfer1 { class API YoloLayerPlugin : public IPluginV2IOExt { public: YoloLayerPlugin(int classCount, int netWidth, int netHeight, int maxOut, const int* strides, int stridesLength); YoloLayerPlugin(const void* data, size_t length); ~YoloLayerPlugin(); int getNbOutputs() const TRT_NOEXCEPT override { return 1; } nvinfer1::Dims getOutputDimensions(int index, const 
nvinfer1::Dims* inputs, int nbInputDims) TRT_NOEXCEPT override; int initialize() TRT_NOEXCEPT override; virtual void terminate() TRT_NOEXCEPT override {} virtual size_t getWorkspaceSize(int maxBatchSize) const TRT_NOEXCEPT override { return 0; } virtual int enqueue(int batchSize, const void* const* inputs, void* TRT_CONST_ENQUEUE* outputs, void* workspace, cudaStream_t stream) TRT_NOEXCEPT override; virtual size_t getSerializationSize() const TRT_NOEXCEPT override; virtual void serialize(void* buffer) const TRT_NOEXCEPT override; bool supportsFormatCombination(int pos, const PluginTensorDesc* inOut, int nbInputs, int nbOutputs) const TRT_NOEXCEPT override { return inOut[pos].format == TensorFormat::kLINEAR && inOut[pos].type == DataType::kFLOAT; } const char* getPluginType() const TRT_NOEXCEPT override; const char* getPluginVersion() const TRT_NOEXCEPT override; void destroy() TRT_NOEXCEPT override; IPluginV2IOExt* clone() const TRT_NOEXCEPT override; void setPluginNamespace(const char* pluginNamespace) TRT_NOEXCEPT override; const char* getPluginNamespace() const TRT_NOEXCEPT override; nvinfer1::DataType getOutputDataType(int32_t index, nvinfer1::DataType const* inputTypes, int32_t nbInputs) const TRT_NOEXCEPT; bool isOutputBroadcastAcrossBatch(int outputIndex, const bool* inputIsBroadcasted, int nbInputs) const TRT_NOEXCEPT override; bool canBroadcastInputAcrossBatch(int inputIndex) const TRT_NOEXCEPT override; void attachToContext(cudnnContext* cudnnContext, cublasContext* cublasContext, IGpuAllocator* gpuAllocator) TRT_NOEXCEPT override; void configurePlugin(PluginTensorDesc const* in, int32_t nbInput, PluginTensorDesc const* out, int32_t nbOutput) TRT_NOEXCEPT override; void detachFromContext() TRT_NOEXCEPT override; private: void forwardGpu(const float* const* inputs, float* output, cudaStream_t stream, int mYoloV10netHeight, int mYoloV10NetWidth, int batchSize); int mThreadCount = 256; const char* mPluginNamespace; int mClassCount; int mYoloV10NetWidth; int 
mYoloV10netHeight; int mMaxOutObject; int* mStrides; int mStridesLength; }; class API YoloPluginCreator : public IPluginCreator { public: YoloPluginCreator(); ~YoloPluginCreator() override = default; const char* getPluginName() const TRT_NOEXCEPT override; const char* getPluginVersion() const TRT_NOEXCEPT override; const nvinfer1::PluginFieldCollection* getFieldNames() TRT_NOEXCEPT override; nvinfer1::IPluginV2IOExt* createPlugin(const char* name, const nvinfer1::PluginFieldCollection* fc) TRT_NOEXCEPT override; nvinfer1::IPluginV2IOExt* deserializePlugin(const char* name, const void* serialData, size_t serialLength) TRT_NOEXCEPT override; void setPluginNamespace(const char* libNamespace) TRT_NOEXCEPT override { mNamespace = libNamespace; } const char* getPluginNamespace() const TRT_NOEXCEPT override { return mNamespace.c_str(); } private: std::string mNamespace; static PluginFieldCollection mFC; static std::vector mPluginAttributes; }; REGISTER_TENSORRT_PLUGIN(YoloPluginCreator); } // namespace nvinfer1 ================================================ FILE: yolov10/src/block.cpp ================================================ #include "block.h" #include #include #include #include #include "config.h" #include "yololayer.h" std::map loadWeights(const std::string file) { std::cout << "Loading weights: " << file << std::endl; std::map WeightMap; std::ifstream input(file); assert(input.is_open() && "Unable to load weight file. 
please check if the .wts file path is right!!!!!!"); int32_t count; input >> count; assert(count > 0 && "Invalid weight map file."); while (count--) { nvinfer1::Weights wt{nvinfer1::DataType::kFLOAT, nullptr, 0}; uint32_t size; std::string name; input >> name >> std::dec >> size; wt.type = nvinfer1::DataType::kFLOAT; uint32_t* val = reinterpret_cast(malloc(sizeof(val) * size)); for (uint32_t x = 0, y = size; x < y; x++) { input >> std::hex >> val[x]; } wt.values = val; wt.count = size; WeightMap[name] = wt; } return WeightMap; } nvinfer1::IScaleLayer* addBatchNorm2d(nvinfer1::INetworkDefinition* network, std::map weightMap, nvinfer1::ITensor& input, std::string lname, float eps) { float* gamma = (float*)weightMap[lname + ".weight"].values; float* beta = (float*)weightMap[lname + ".bias"].values; float* mean = (float*)weightMap[lname + ".running_mean"].values; float* var = (float*)weightMap[lname + ".running_var"].values; int len = weightMap[lname + ".running_var"].count; float* scval = reinterpret_cast(malloc(sizeof(float) * len)); for (int i = 0; i < len; i++) { scval[i] = gamma[i] / sqrt(var[i] + eps); } nvinfer1::Weights scale{nvinfer1::DataType::kFLOAT, scval, len}; float* shval = reinterpret_cast(malloc(sizeof(float) * len)); for (int i = 0; i < len; i++) { shval[i] = beta[i] - mean[i] * gamma[i] / sqrt(var[i] + eps); } nvinfer1::Weights shift{nvinfer1::DataType::kFLOAT, shval, len}; float* pval = reinterpret_cast(malloc(sizeof(float) * len)); for (int i = 0; i < len; i++) { pval[i] = 1.0; } nvinfer1::Weights power{nvinfer1::DataType::kFLOAT, pval, len}; weightMap[lname + ".scale"] = scale; weightMap[lname + ".shift"] = shift; weightMap[lname + ".power"] = power; nvinfer1::IScaleLayer* output = network->addScale(input, nvinfer1::ScaleMode::kCHANNEL, shift, scale, power); assert(output); return output; } nvinfer1::IElementWiseLayer* convBnSiLU(nvinfer1::INetworkDefinition* network, std::map weightMap, nvinfer1::ITensor& input, int ch, int k, int s, std::string 
lname, int g) { nvinfer1::Weights bias_empty{nvinfer1::DataType::kFLOAT, nullptr, 0}; nvinfer1::IConvolutionLayer* conv = network->addConvolutionNd(input, ch, nvinfer1::DimsHW{k, k}, weightMap[lname + ".conv.weight"], bias_empty); assert(conv); conv->setStrideNd(nvinfer1::DimsHW{s, s}); int p = k / 2; conv->setPaddingNd(nvinfer1::DimsHW{p, p}); conv->setNbGroups(g); nvinfer1::IScaleLayer* bn = addBatchNorm2d(network, weightMap, *conv->getOutput(0), lname + ".bn", 1e-3); nvinfer1::IActivationLayer* sigmoid = network->addActivation(*bn->getOutput(0), nvinfer1::ActivationType::kSIGMOID); nvinfer1::IElementWiseLayer* ew = network->addElementWise(*bn->getOutput(0), *sigmoid->getOutput(0), nvinfer1::ElementWiseOperation::kPROD); assert(ew); return ew; } nvinfer1::ILayer* convBn(nvinfer1::INetworkDefinition* network, std::map weightMap, nvinfer1::ITensor& input, int ch, int k, int s, std::string lname, int g = 1) { nvinfer1::Weights bias_empty{nvinfer1::DataType::kFLOAT, nullptr, 0}; nvinfer1::IConvolutionLayer* conv = network->addConvolutionNd(input, ch, nvinfer1::DimsHW{k, k}, weightMap[lname + ".conv.weight"], bias_empty); assert(conv); conv->setStrideNd(nvinfer1::DimsHW{s, s}); int p = k / 2; conv->setPaddingNd(nvinfer1::DimsHW{p, p}); conv->setNbGroups(g); nvinfer1::IScaleLayer* bn = addBatchNorm2d(network, weightMap, *conv->getOutput(0), lname + ".bn", 1e-3); return bn; } nvinfer1::ILayer* bottleneck(nvinfer1::INetworkDefinition* network, std::map weightMap, nvinfer1::ITensor& input, int c1, int c2, bool shortcut, float e, std::string lname) { nvinfer1::IElementWiseLayer* conv1 = convBnSiLU(network, weightMap, input, c2, 3, 1, lname + ".cv1"); nvinfer1::IElementWiseLayer* conv2 = convBnSiLU(network, weightMap, *conv1->getOutput(0), c2, 3, 1, lname + ".cv2"); if (shortcut && c1 == c2) { nvinfer1::IElementWiseLayer* ew = network->addElementWise(input, *conv2->getOutput(0), nvinfer1::ElementWiseOperation::kSUM); return ew; } return conv2; } nvinfer1::IElementWiseLayer* 
C2F(nvinfer1::INetworkDefinition* network, std::map weightMap, nvinfer1::ITensor& input, int c1, int c2, int n, bool shortcut, float e, std::string lname) { int c_ = (float)c2 * e; nvinfer1::IElementWiseLayer* conv1 = convBnSiLU(network, weightMap, input, 2 * c_, 1, 1, lname + ".cv1"); nvinfer1::Dims d = conv1->getOutput(0)->getDimensions(); nvinfer1::ISliceLayer* split1 = network->addSlice(*conv1->getOutput(0), nvinfer1::Dims4{0, 0, 0, 0}, nvinfer1::Dims4{d.d[0], d.d[1] / 2, d.d[2], d.d[3]}, nvinfer1::Dims4{1, 1, 1, 1}); nvinfer1::ISliceLayer* split2 = network->addSlice(*conv1->getOutput(0), nvinfer1::Dims4{0, d.d[1] / 2, 0, 0}, nvinfer1::Dims4{d.d[0], d.d[1] / 2, d.d[2], d.d[3]}, nvinfer1::Dims4{1, 1, 1, 1}); nvinfer1::ITensor* inputTensor0[] = {split1->getOutput(0), split2->getOutput(0)}; nvinfer1::IConcatenationLayer* cat = network->addConcatenation(inputTensor0, 2); nvinfer1::ITensor* y1 = split2->getOutput(0); for (int i = 0; i < n; i++) { auto* b = bottleneck(network, weightMap, *y1, c_, c_, shortcut, 1.0, lname + ".m." 
+ std::to_string(i)); y1 = b->getOutput(0); nvinfer1::ITensor* inputTensors[] = {cat->getOutput(0), b->getOutput(0)}; cat = network->addConcatenation(inputTensors, 2); } nvinfer1::IElementWiseLayer* conv2 = convBnSiLU(network, weightMap, *cat->getOutput(0), c2, 1, 1, lname + ".cv2"); return conv2; } nvinfer1::IElementWiseLayer* C2(nvinfer1::INetworkDefinition* network, std::map& weightMap, nvinfer1::ITensor& input, int c1, int c2, int n, bool shortcut, float e, std::string lname) { assert(network != nullptr); int hidden_channels = static_cast(c2 * e); // cv1 branch nvinfer1::IElementWiseLayer* conv1 = convBnSiLU(network, weightMap, input, 2 * hidden_channels, 1, 1, lname + ".cv1"); nvinfer1::ITensor* cv1_out = conv1->getOutput(0); // Split the output of cv1 into two tensors nvinfer1::Dims dims = cv1_out->getDimensions(); nvinfer1::ISliceLayer* split1 = network->addSlice(*cv1_out, nvinfer1::Dims4{0, 0, 0, 0}, nvinfer1::Dims4{dims.d[0], dims.d[1] / 2, dims.d[2], dims.d[3]}, nvinfer1::Dims4{1, 1, 1, 1}); nvinfer1::ISliceLayer* split2 = network->addSlice(*cv1_out, nvinfer1::Dims4{0, dims.d[1] / 2, 0, 0}, nvinfer1::Dims4{dims.d[0], dims.d[1] / 2, dims.d[2], dims.d[3]}, nvinfer1::Dims4{1, 1, 1, 1}); // Create y1 bottleneck sequence nvinfer1::ITensor* y1 = split1->getOutput(0); for (int i = 0; i < n; ++i) { auto* bottleneck_layer = bottleneck(network, weightMap, *y1, hidden_channels, hidden_channels, shortcut, 1.0, lname + ".m." 
+ std::to_string(i)); y1 = bottleneck_layer->getOutput(0); // update 'y1' to be the output of the current bottleneck } // Concatenate y1 with the second split of cv1 nvinfer1::ITensor* concatInputs[2] = {y1, split2->getOutput(0)}; nvinfer1::IConcatenationLayer* cat = network->addConcatenation(concatInputs, 2); // cv2 to produce the final output nvinfer1::IElementWiseLayer* conv2 = convBnSiLU(network, weightMap, *cat->getOutput(0), c2, 1, 1, lname + ".cv2"); return conv2; } nvinfer1::IElementWiseLayer* SPPF(nvinfer1::INetworkDefinition* network, std::map weightMap, nvinfer1::ITensor& input, int c1, int c2, int k, std::string lname) { int c_ = c1 / 2; nvinfer1::IElementWiseLayer* conv1 = convBnSiLU(network, weightMap, input, c_, 1, 1, lname + ".cv1"); nvinfer1::IPoolingLayer* pool1 = network->addPoolingNd(*conv1->getOutput(0), nvinfer1::PoolingType::kMAX, nvinfer1::DimsHW{k, k}); pool1->setStrideNd(nvinfer1::DimsHW{1, 1}); pool1->setPaddingNd(nvinfer1::DimsHW{k / 2, k / 2}); nvinfer1::IPoolingLayer* pool2 = network->addPoolingNd(*pool1->getOutput(0), nvinfer1::PoolingType::kMAX, nvinfer1::DimsHW{k, k}); pool2->setStrideNd(nvinfer1::DimsHW{1, 1}); pool2->setPaddingNd(nvinfer1::DimsHW{k / 2, k / 2}); nvinfer1::IPoolingLayer* pool3 = network->addPoolingNd(*pool2->getOutput(0), nvinfer1::PoolingType::kMAX, nvinfer1::DimsHW{k, k}); pool3->setStrideNd(nvinfer1::DimsHW{1, 1}); pool3->setPaddingNd(nvinfer1::DimsHW{k / 2, k / 2}); nvinfer1::ITensor* inputTensors[] = {conv1->getOutput(0), pool1->getOutput(0), pool2->getOutput(0), pool3->getOutput(0)}; nvinfer1::IConcatenationLayer* cat = network->addConcatenation(inputTensors, 4); nvinfer1::IElementWiseLayer* conv2 = convBnSiLU(network, weightMap, *cat->getOutput(0), c2, 1, 1, lname + ".cv2"); return conv2; } nvinfer1::IShuffleLayer* DFL(nvinfer1::INetworkDefinition* network, std::map weightMap, nvinfer1::ITensor& input, int ch, int grid, int k, int s, int p, std::string lname) { nvinfer1::IShuffleLayer* shuffle1 = 
network->addShuffle(input); shuffle1->setReshapeDimensions(nvinfer1::Dims4{kBatchSize, 4, 16, grid}); shuffle1->setSecondTranspose(nvinfer1::Permutation{0, 2, 1, 3}); nvinfer1::ISoftMaxLayer* softmax = network->addSoftMax(*shuffle1->getOutput(0)); softmax->setAxes(1 << 1); nvinfer1::Weights bias_empty{nvinfer1::DataType::kFLOAT, nullptr, 0}; nvinfer1::IConvolutionLayer* conv = network->addConvolutionNd(*softmax->getOutput(0), 1, nvinfer1::DimsHW{1, 1}, weightMap[lname], bias_empty); conv->setStrideNd(nvinfer1::DimsHW{s, s}); conv->setPaddingNd(nvinfer1::DimsHW{p, p}); nvinfer1::IShuffleLayer* shuffle2 = network->addShuffle(*conv->getOutput(0)); shuffle2->setReshapeDimensions(nvinfer1::Dims3{kBatchSize, 4, grid}); return shuffle2; } nvinfer1::IPluginV2Layer* addYoLoLayer(nvinfer1::INetworkDefinition* network, std::vector dets, const int* px_arry, int px_arry_num) { auto creator = getPluginRegistry()->getPluginCreator("YoloLayer_TRT", "1"); const int netinfo_count = 4; // Assuming the first 5 elements are for netinfo as per existing code. const int total_count = netinfo_count + px_arry_num; // Total number of elements for netinfo and px_arry combined. std::vector combinedInfo(total_count); // Fill in the first 5 elements as per existing netinfo. combinedInfo[0] = kNumClass; combinedInfo[1] = kInputW; combinedInfo[2] = kInputH; combinedInfo[3] = kMaxNumOutputBbox; // Copy the contents of px_arry into the combinedInfo vector after the initial 5 elements. std::copy(px_arry, px_arry + px_arry_num, combinedInfo.begin() + netinfo_count); // Now let's create the PluginField object to hold this combined information. nvinfer1::PluginField pluginField; pluginField.name = "combinedInfo"; // This can be any name that the plugin will recognize pluginField.data = combinedInfo.data(); pluginField.type = nvinfer1::PluginFieldType::kINT32; pluginField.length = combinedInfo.size(); // Create the PluginFieldCollection to hold the PluginField object. 
nvinfer1::PluginFieldCollection pluginFieldCollection{}; pluginFieldCollection.nbFields = 1; // We have just one field, but it's a combined array pluginFieldCollection.fields = &pluginField; // Create the plugin object using the PluginFieldCollection. nvinfer1::IPluginV2* pluginObject = creator->createPlugin("yololayer", &pluginFieldCollection); // We assume that the plugin is to be added onto the network. // Prepare input tensors for the YOLO Layer. std::vector inputTensors; for (auto det : dets) { inputTensors.push_back(det->getOutput(0)); // Assuming each IConcatenationLayer has one output tensor. } // Add the plugin to the network using the prepared input tensors. nvinfer1::IPluginV2Layer* yoloLayer = network->addPluginV2(inputTensors.data(), inputTensors.size(), *pluginObject); return yoloLayer; // Return the added YOLO layer. } nvinfer1::ILayer* SCDown(nvinfer1::INetworkDefinition* network, std::map weightMap, nvinfer1::ITensor& input, int ch, int k, int s, std::string lname) { auto* conv1 = convBnSiLU(network, weightMap, input, ch, 1, 1, lname + ".cv1"); nvinfer1::Weights bias_empty{nvinfer1::DataType::kFLOAT, nullptr, 0}; nvinfer1::IConvolutionLayer* conv2 = network->addConvolutionNd(*conv1->getOutput(0), ch, nvinfer1::DimsHW{k, k}, weightMap[lname + ".cv2.conv.weight"], bias_empty); assert(conv2); conv2->setStrideNd(nvinfer1::DimsHW{s, s}); int p = k / 2; conv2->setPaddingNd(nvinfer1::DimsHW{p, p}); conv2->setNbGroups(ch); nvinfer1::IScaleLayer* bn = addBatchNorm2d(network, weightMap, *conv2->getOutput(0), lname + ".cv2.bn", 1e-3); assert(bn); return bn; } nvinfer1::ILayer* Attention(nvinfer1::INetworkDefinition* network, std::map weightMap, nvinfer1::ITensor& input, int dim, int num_heads, float attn_ratio, std::string lname) { int head_dim = dim / num_heads; int key_dim = head_dim * attn_ratio; float scale = pow(key_dim, -0.5); int nh_kd = key_dim * num_heads; int h = dim + nh_kd * 2; auto d = input.getDimensions(); int B = d.d[0]; int H = d.d[2]; int W 
= d.d[3]; int N = H * W; auto* qkv = convBn(network, weightMap, input, h, 1, 1, lname + ".qkv"); // qkv.view(B, self.num_heads, -1, N) auto shuffle = network->addShuffle(*qkv->getOutput(0)); shuffle->setReshapeDimensions(nvinfer1::Dims4{B, num_heads, -1, N}); // q, k, v = .split([self.key_dim, self.key_dim, self.head_dim], dim=2) auto d1 = shuffle->getOutput(0)->getDimensions(); auto q = network->addSlice(*shuffle->getOutput(0), nvinfer1::Dims4{0, 0, 0, 0}, nvinfer1::Dims4{d1.d[0], d1.d[1], key_dim, d1.d[3]}, nvinfer1::Dims4{1, 1, 1, 1}); auto k = network->addSlice(*shuffle->getOutput(0), nvinfer1::Dims4{0, 0, key_dim, 0}, nvinfer1::Dims4{d1.d[0], d1.d[1], key_dim, d1.d[3]}, nvinfer1::Dims4{1, 1, 1, 1}); auto v = network->addSlice(*shuffle->getOutput(0), nvinfer1::Dims4{0, 0, key_dim * 2, 0}, nvinfer1::Dims4{d1.d[0], d1.d[1], head_dim, d1.d[3]}, nvinfer1::Dims4{1, 1, 1, 1}); // attn = ((q.transpose(-2, -1) @ k) * self.scale) auto qT = network->addShuffle(*q->getOutput(0)); qT->setFirstTranspose(nvinfer1::Permutation{0, 1, 3, 2}); auto matmul = network->addMatrixMultiply(*qT->getOutput(0), nvinfer1::MatrixOperation::kNONE, *k->getOutput(0), nvinfer1::MatrixOperation::kNONE); // There are not many memory leaks, and I will change it when I have time float* scale_val = reinterpret_cast(malloc(sizeof(float) * 1)); scale_val[0] = scale; nvinfer1::Weights s_w{nvinfer1::DataType::kFLOAT, scale_val, 1}; float* shift_val = reinterpret_cast(malloc(sizeof(float) * 1)); shift_val[0] = 0; nvinfer1::Weights sh_w{nvinfer1::DataType::kFLOAT, shift_val, 1}; float* power_val = reinterpret_cast(malloc(sizeof(float) * 1)); power_val[0] = 1; nvinfer1::Weights p_w{nvinfer1::DataType::kFLOAT, power_val, 1}; nvinfer1::IScaleLayer* scaleLayer = network->addScale(*matmul->getOutput(0), nvinfer1::ScaleMode::kUNIFORM, sh_w, s_w, p_w); // attn = attn.softmax(dim=-1) nvinfer1::ISoftMaxLayer* softmax = network->addSoftMax(*scaleLayer->getOutput(0)); softmax->setAxes(1 << 3); // x = (v @ 
attn.transpose(-2, -1)).view(B, -1, H, W) + self.pe(v.reshape(B, -1, H, W)) auto attnT = network->addShuffle(*softmax->getOutput(0)); attnT->setFirstTranspose(nvinfer1::Permutation{0, 1, 3, 2}); auto matmul2 = network->addMatrixMultiply(*v->getOutput(0), nvinfer1::MatrixOperation::kNONE, *attnT->getOutput(0), nvinfer1::MatrixOperation::kNONE); auto reshape = network->addShuffle(*matmul2->getOutput(0)); reshape->setReshapeDimensions(nvinfer1::Dims4{B, -1, H, W}); auto v_reshape = network->addShuffle(*v->getOutput(0)); v_reshape->setReshapeDimensions(nvinfer1::Dims4{B, -1, H, W}); // self.pe = Conv(dim, dim, 3, 1, g=dim, act=False) auto pe = convBn(network, weightMap, *v_reshape->getOutput(0), dim, 3, 1, lname + ".pe", dim); auto sum = network->addElementWise(*reshape->getOutput(0), *pe->getOutput(0), nvinfer1::ElementWiseOperation::kSUM); // x = self.proj(x) // self.proj = Conv(dim, dim, 1, act=False) auto proj = convBn(network, weightMap, *sum->getOutput(0), dim, 1, 1, lname + ".proj"); return proj; } nvinfer1::ILayer* PSA(nvinfer1::INetworkDefinition* network, std::map weightMap, nvinfer1::ITensor& input, int ch, std::string lname) { int c = int(ch * 0.5); auto conv1 = convBnSiLU(network, weightMap, input, c * 2, 1, 1, lname + ".cv1"); // a, b = split((self.c, self.c), dim=1) auto d1 = conv1->getOutput(0)->getDimensions(); auto a = network->addSlice(*conv1->getOutput(0), nvinfer1::Dims4{0, 0, 0, 0}, nvinfer1::Dims4{d1.d[0], c, d1.d[2], d1.d[3]}, nvinfer1::Dims4{1, 1, 1, 1}); auto b = network->addSlice(*conv1->getOutput(0), nvinfer1::Dims4{0, c, 0, 0}, nvinfer1::Dims4{d1.d[0], c, d1.d[2], d1.d[3]}, nvinfer1::Dims4{1, 1, 1, 1}); // b = b + self.attn(b) auto attn = Attention(network, weightMap, *b->getOutput(0), c, c / 64, 0.5f, lname + ".attn"); auto sum = network->addElementWise(*b->getOutput(0), *attn->getOutput(0), nvinfer1::ElementWiseOperation::kSUM); // b = b + self.ffn(b) // self.ffn = nn.Sequential( // Conv(self.c, self.c * 2, 1), // Conv(self.c * 2, self.c, 
1, act=False) // ) auto ffn1 = convBnSiLU(network, weightMap, *sum->getOutput(0), c * 2, 1, 1, lname + ".ffn.0"); auto ffn2 = convBn(network, weightMap, *ffn1->getOutput(0), c, 1, 1, lname + ".ffn.1"); auto sum2 = network->addElementWise(*sum->getOutput(0), *ffn2->getOutput(0), nvinfer1::ElementWiseOperation::kSUM); // self.cv2(torch.cat((a, b), 1)) nvinfer1::ITensor* inputTensors[] = {a->getOutput(0), sum2->getOutput(0)}; nvinfer1::IConcatenationLayer* cat = network->addConcatenation(inputTensors, 2); auto conv2 = convBnSiLU(network, weightMap, *cat->getOutput(0), ch, 1, 1, lname + ".cv2"); return conv2; } nvinfer1::ILayer* RepVGGDW(nvinfer1::INetworkDefinition* network, std::map weightMap, nvinfer1::ITensor& input, int ch, std::string lname) { // self.conv = Conv(ed, ed, 7, 1, 3, g=ed, act=False) // self.conv1 = Conv(ed, ed, 3, 1, 1, g=ed, act=False) // self.dim = ed // self.act = nn.SiLU() // return self.act(self.conv(x) + self.conv1(x)) auto conv = convBn(network, weightMap, input, ch, 7, 1, lname + ".conv", ch); auto conv1 = convBn(network, weightMap, input, ch, 3, 1, lname + ".conv1", ch); auto ew = network->addElementWise(*conv->getOutput(0), *conv1->getOutput(0), nvinfer1::ElementWiseOperation::kSUM); auto sigmoid = network->addActivation(*ew->getOutput(0), nvinfer1::ActivationType::kSIGMOID); auto ew_silu = network->addElementWise(*ew->getOutput(0), *sigmoid->getOutput(0), nvinfer1::ElementWiseOperation::kPROD); assert(ew_silu); return ew_silu; } nvinfer1::ILayer* CIB(nvinfer1::INetworkDefinition* network, std::map weightMap, nvinfer1::ITensor& input, int c1, int c2, bool shortcut, float e, bool lk, std::string lname) { // self.cv1 = nn.Sequential( // Conv(c1, c1, 3, g=c1), // Conv(c1, 2 * c_, 1), // Conv(2 * c_, 2 * c_, 3, g=2 * c_) if not lk else RepVGGDW(2 * c_), // Conv(2 * c_, c2, 1), // Conv(c2, c2, 3, g=c2), // ) int c_ = (float)c2 * e; auto* conv1 = convBnSiLU(network, weightMap, input, c1, 3, 1, lname + ".cv1.0", c1); auto* conv2 = 
convBnSiLU(network, weightMap, *conv1->getOutput(0), 2 * c_, 1, 1, lname + ".cv1.1"); nvinfer1::ILayer* conv3; if (!lk) { conv3 = convBnSiLU(network, weightMap, *conv2->getOutput(0), 2 * c_, 3, 1, lname + ".cv1.2", 2 * c_); } else { conv3 = RepVGGDW(network, weightMap, *conv2->getOutput(0), 2 * c_, lname + ".cv1.2"); } auto* conv4 = convBnSiLU(network, weightMap, *conv3->getOutput(0), c2, 1, 1, lname + ".cv1.3"); auto* conv5 = convBnSiLU(network, weightMap, *conv4->getOutput(0), c2, 3, 1, lname + ".cv1.4", c2); if (shortcut && c1 == c2) { auto* ew = network->addElementWise(input, *conv5->getOutput(0), nvinfer1::ElementWiseOperation::kSUM); return ew; } else { return conv5; } } nvinfer1::ILayer* C2fCIB(nvinfer1::INetworkDefinition* network, std::map weightMap, nvinfer1::ITensor& input, int c1, int c2, int n, bool shortcut, bool lk, float e, std::string lname) { int c_ = (float)c2 * e; nvinfer1::IElementWiseLayer* conv1 = convBnSiLU(network, weightMap, input, 2 * c_, 1, 1, lname + ".cv1"); nvinfer1::Dims d = conv1->getOutput(0)->getDimensions(); nvinfer1::ISliceLayer* split1 = network->addSlice(*conv1->getOutput(0), nvinfer1::Dims4{0, 0, 0, 0}, nvinfer1::Dims4{d.d[0], d.d[1] / 2, d.d[2], d.d[3]}, nvinfer1::Dims4{1, 1, 1, 1}); nvinfer1::ISliceLayer* split2 = network->addSlice(*conv1->getOutput(0), nvinfer1::Dims4{0, d.d[1] / 2, 0, 0}, nvinfer1::Dims4{d.d[0], d.d[1] / 2, d.d[2], d.d[3]}, nvinfer1::Dims4{1, 1, 1, 1}); nvinfer1::ITensor* inputTensor0[] = {split1->getOutput(0), split2->getOutput(0)}; nvinfer1::IConcatenationLayer* cat = network->addConcatenation(inputTensor0, 2); nvinfer1::ITensor* y1 = split2->getOutput(0); for (int i = 0; i < n; i++) { auto* b = CIB(network, weightMap, *y1, c_, c_, shortcut, 1.0, lk, lname + ".m." 
+ std::to_string(i));
        // Each CIB consumes the previous CIB's output and is appended to the running concatenation.
        y1 = b->getOutput(0);
        nvinfer1::ITensor* inputTensors[] = {cat->getOutput(0), b->getOutput(0)};
        cat = network->addConcatenation(inputTensors, 2);
    }
    // Final 1x1 conv (".cv2") maps the concatenated features to c2 channels.
    nvinfer1::IElementWiseLayer* conv2 = convBnSiLU(network, weightMap, *cat->getOutput(0), c2, 1, 1, lname + ".cv2");
    return conv2;
}

================================================
FILE: yolov10/src/calibrator.cpp
================================================
#include "calibrator.h"
// NOTE(review): the next four include directives have no header name; the
// names look like they were lost in extraction - restore from the original file.
#include
#include
#include
#include
#include "cuda_utils.h"
#include "utils.h"

// Stores the calibration settings, allocates a device buffer large enough for
// one batch of 3-channel float images (3 * input_w * input_h * batchsize
// floats) and collects the file names found in img_dir into img_files_.
//
// batchsize        - number of images per calibration batch
// input_w, input_h - network input width / height used for preprocessing
// img_dir          - directory the calibration images are read from
// calib_table_name - path of the calibration cache file
// input_blob_name  - expected name of the network input binding
// read_cache       - when false, readCalibrationCache() ignores an existing cache file
Int8EntropyCalibrator2::Int8EntropyCalibrator2(int batchsize, int input_w, int input_h, const char* img_dir,
                                               const char* calib_table_name, const char* input_blob_name,
                                               bool read_cache)
    : batchsize_(batchsize),
      input_w_(input_w),
      input_h_(input_h),
      img_idx_(0),
      img_dir_(img_dir),
      calib_table_name_(calib_table_name),
      input_blob_name_(input_blob_name),
      read_cache_(read_cache) {
    input_count_ = 3 * input_w * input_h * batchsize;
    CUDA_CHECK(cudaMalloc(&device_input_, input_count_ * sizeof(float)));
    read_files_in_dir(img_dir, img_files_);
}

// Releases the device buffer allocated in the constructor.
Int8EntropyCalibrator2::~Int8EntropyCalibrator2() {
    CUDA_CHECK(cudaFree(device_input_));
}

// Returns the batch size given at construction.
int Int8EntropyCalibrator2::getBatchSize() const TRT_NOEXCEPT {
    return batchsize_;
}

// Loads the next batchsize_ images, preprocesses them, uploads them to
// device_input_ and publishes that buffer through bindings[0].
// Returns false when fewer than batchsize_ images remain or an image cannot be read.
bool Int8EntropyCalibrator2::getBatch(void* bindings[], const char* names[], int nbBindings) TRT_NOEXCEPT {
    // Not enough images left for a full batch: no more calibration data.
    if (img_idx_ + batchsize_ > (int)img_files_.size()) {
        return false;
    }
    std::vector input_imgs_;
    for (int i = img_idx_; i < img_idx_ + batchsize_; i++) {
        std::cout << img_files_[i] << " " << i << std::endl;
        cv::Mat temp = cv::imread(img_dir_ + "/" + img_files_[i]);
        if (temp.empty()) {
            std::cerr << "Fatal error: image cannot open!"
<< std::endl; return false; } cv::Mat pr_img = preprocess_img(temp, input_w_, input_h_); input_imgs_.push_back(pr_img); } img_idx_ += batchsize_; cv::Mat blob = cv::dnn::blobFromImages(input_imgs_, 1.0 / 255.0, cv::Size(input_w_, input_h_), cv::Scalar(0, 0, 0), true, false); CUDA_CHECK(cudaMemcpy(device_input_, blob.ptr(0), input_count_ * sizeof(float), cudaMemcpyHostToDevice)); assert(!strcmp(names[0], input_blob_name_)); bindings[0] = device_input_; return true; } const void* Int8EntropyCalibrator2::readCalibrationCache(size_t& length) TRT_NOEXCEPT { std::cout << "reading calib cache: " << calib_table_name_ << std::endl; calib_cache_.clear(); std::ifstream input(calib_table_name_, std::ios::binary); input >> std::noskipws; if (read_cache_ && input.good()) { std::copy(std::istream_iterator(input), std::istream_iterator(), std::back_inserter(calib_cache_)); } length = calib_cache_.size(); return length ? calib_cache_.data() : nullptr; } void Int8EntropyCalibrator2::writeCalibrationCache(const void* cache, size_t length) TRT_NOEXCEPT { std::cout << "writing calib cache: " << calib_table_name_ << " size: " << length << std::endl; std::ofstream output(calib_table_name_, std::ios::binary); output.write(reinterpret_cast(cache), length); } ================================================ FILE: yolov10/src/model.cpp ================================================ #include #include #include "block.h" #include "calibrator.h" #include "config.h" #include "model.h" static int get_width(int x, float gw, int max_channels, int divisor = 8) { int c = std::min(x, max_channels); auto channel = int(ceil((c * gw) / divisor)) * divisor; return channel; } static int get_depth(int x, float gd) { if (x == 1) return 1; int r = round(x * gd); if (x * gd - int(x * gd) == 0.5 && (int(x * gd) % 2) == 0) --r; return std::max(r, 1); } void calculateStrides(nvinfer1::ILayer* conv_layers[], int size, int reference_size, int strides[]) { for (int i = 0; i < size; ++i) { nvinfer1::ILayer* layer = 
conv_layers[i];
        nvinfer1::Dims dims = layer->getOutput(0)->getDimensions();
        // Stride = reference size divided by dimension index 2 of the layer output.
        int feature_map_size = dims.d[2];
        strides[i] = reference_size / feature_map_size;
    }
}

// Builds the YOLOv10 detection network for this variant and serializes it.
//
// builder      - TensorRT builder used to create the network and build the engine
// config       - builder config; workspace size and FP16/INT8 flags are set here
// dt           - data type of the network input tensor
// wts_path     - path handed to loadWeights()
// gd           - depth multiple forwarded to get_depth()
// gw           - width multiple forwarded to get_width()
// max_channels - channel cap forwarded to get_width()
//
// Returns the pointer produced by builder->buildSerializedNetwork(). Before
// returning, the network is deleted and every buffer in weightMap is free()d.
// NOTE(review): the result of buildSerializedNetwork() is not checked before
// "Build engine successfully!" is printed.
nvinfer1::IHostMemory* buildEngineYolov10DetN(nvinfer1::IBuilder* builder, nvinfer1::IBuilderConfig* config,
                                              nvinfer1::DataType dt, const std::string& wts_path, float& gd,
                                              float& gw, int& max_channels) {
    // NOTE(review): template arguments of std::map / static_cast / std::vector in this
    // function appear to have been lost in extraction; tokens are left as found.
    std::map weightMap = loadWeights(wts_path);
    // Implicit-batch alternative kept for reference:
    // nvinfer1::INetworkDefinition *network = builder->createNetworkV2(0U);
    nvinfer1::INetworkDefinition* network = builder->createNetworkV2(
            1U << static_cast(nvinfer1::NetworkDefinitionCreationFlag::kEXPLICIT_BATCH));

    /* ========================= YOLOV10 INPUT ========================= */
    nvinfer1::ITensor* data =
            network->addInput(kInputTensorName, dt, nvinfer1::Dims4{kBatchSize, 3, kInputH, kInputW});
    assert(data);

    /* ======================== YOLOV10 BACKBONE ======================== */
    auto* conv0 = convBnSiLU(network, weightMap, *data, get_width(64, gw, max_channels), 3, 2, "model.0");
    auto* conv1 = convBnSiLU(network, weightMap, *conv0->getOutput(0), get_width(128, gw, max_channels), 3, 2, "model.1");
    // 11233
    auto* conv2 = C2F(network, weightMap, *conv1->getOutput(0), get_width(128, gw, max_channels), get_width(128, gw, max_channels), get_depth(3, gd), true, 0.5, "model.2");
    auto* conv3 = convBnSiLU(network, weightMap, *conv2->getOutput(0), get_width(256, gw, max_channels), 3, 2, "model.3");
    // 22466
    auto* conv4 = C2F(network, weightMap, *conv3->getOutput(0), get_width(256, gw, max_channels), get_width(256, gw, max_channels), get_depth(6, gd), true, 0.5, "model.4");
    auto* conv5 = SCDown(network, weightMap, *conv4->getOutput(0), get_width(512, gw, max_channels), 3, 2, "model.5");
    // 22466
    auto* conv6 = C2F(network, weightMap, *conv5->getOutput(0), get_width(512, gw, max_channels), get_width(512, gw, max_channels), get_depth(6, gd), true, 0.5, "model.6");
    auto* conv7 = SCDown(network, weightMap, *conv6->getOutput(0), get_width(1024, gw, max_channels), 3, 2, "model.7");
    // 11233
    auto* conv8 = C2F(network, weightMap, *conv7->getOutput(0), get_width(1024, gw, max_channels), get_width(1024, gw, max_channels), get_depth(3, gd), true, 0.5, "model.8");
    auto* conv9 = SPPF(network, weightMap, *conv8->getOutput(0), get_width(1024, gw, max_channels), get_width(1024, gw, max_channels), 5, "model.9");
    auto* conv10 = PSA(network, weightMap, *conv9->getOutput(0), get_width(1024, gw, max_channels), "model.10");

    /* ========================== YOLOV10 HEAD ========================== */
    // Nearest-neighbour upsampling by 2 on the last two axes only.
    float scale[] = {1.0, 1.0, 2.0, 2.0};
    nvinfer1::IResizeLayer* upsample11 = network->addResize(*conv10->getOutput(0));
    assert(upsample11);
    upsample11->setResizeMode(nvinfer1::ResizeMode::kNEAREST);
    upsample11->setScales(scale, 4);
    nvinfer1::ITensor* inputTensor12[] = {upsample11->getOutput(0), conv6->getOutput(0)};
    nvinfer1::IConcatenationLayer* cat12 = network->addConcatenation(inputTensor12, 2);
    auto* conv13 = C2F(network, weightMap, *cat12->getOutput(0), get_width(512, gw, max_channels), get_width(512, gw, max_channels), get_depth(3, gd), false, 0.5, "model.13");
    nvinfer1::IResizeLayer* upsample14 = network->addResize(*conv13->getOutput(0));
    assert(upsample14);
    upsample14->setResizeMode(nvinfer1::ResizeMode::kNEAREST);
    upsample14->setScales(scale, 4);
    nvinfer1::ITensor* inputTensor15[] = {upsample14->getOutput(0), conv4->getOutput(0)};
    nvinfer1::IConcatenationLayer* cat15 = network->addConcatenation(inputTensor15, 2);
    auto* conv16 = C2F(network, weightMap, *cat15->getOutput(0), get_width(256, gw, max_channels), get_width(256, gw, max_channels), get_depth(3, gd), false, 0.5, "model.16");
    auto* conv17 = convBnSiLU(network, weightMap, *conv16->getOutput(0), get_width(256, gw, max_channels), 3, 2, "model.17");
    nvinfer1::ITensor* inputTensor18[] = {conv17->getOutput(0), conv13->getOutput(0)};
    nvinfer1::IConcatenationLayer* cat18 = network->addConcatenation(inputTensor18, 2);
    auto* conv19 = C2F(network, weightMap, *cat18->getOutput(0), get_width(512, gw, max_channels), get_width(512, gw, max_channels), get_depth(3, gd), false, 0.5, "model.19");
    auto* conv20 = SCDown(network, weightMap, *conv19->getOutput(0), get_width(512, gw, max_channels), 3, 2, "model.20");
    nvinfer1::ITensor* inputTensor21[] = {conv20->getOutput(0), conv10->getOutput(0)};
    nvinfer1::IConcatenationLayer* cat21 = network->addConcatenation(inputTensor21, 2);
    auto* conv22 = C2fCIB(network, weightMap, *cat21->getOutput(0), get_width(1024, gw, max_channels), get_width(1024, gw, max_channels), get_depth(3, gd), true, true, 0.5, "model.22");

    /* ========================= YOLOV10 OUTPUT ========================= */
    // Channel widths of the one2one heads are derived from the channel count of conv16.
    auto d = conv16->getOutput(0)->getDimensions();
    assert(d.nbDims == 4);
    int ch_0 = d.d[1];
    int base_in_channel = std::max(16, std::max(ch_0 / 4, 16 * 4));
    int base_out_channel = std::max(ch_0, std::min(kNumClass, 100));

    // output0: cv2 branch (64 channels) and cv3 branch (kNumClass channels) on conv16
    auto* conv23_cv2_0_0 = convBnSiLU(network, weightMap, *conv16->getOutput(0), base_in_channel, 3, 1, "model.23.one2one_cv2.0.0");
    auto* conv23_cv2_0_1 = convBnSiLU(network, weightMap, *conv23_cv2_0_0->getOutput(0), base_in_channel, 3, 1, "model.23.one2one_cv2.0.1");
    nvinfer1::IConvolutionLayer* conv23_cv2_0_2 = network->addConvolutionNd(
            *conv23_cv2_0_1->getOutput(0), 64, nvinfer1::DimsHW{1, 1}, weightMap["model.23.one2one_cv2.0.2.weight"],
            weightMap["model.23.one2one_cv2.0.2.bias"]);
    conv23_cv2_0_2->setStrideNd(nvinfer1::DimsHW{1, 1});
    conv23_cv2_0_2->setPaddingNd(nvinfer1::DimsHW{0, 0});
    auto* conv23_cv3_0_0_0 = convBnSiLU(network, weightMap, *conv16->getOutput(0), get_width(256, gw, max_channels), 3, 1, "model.23.one2one_cv3.0.0.0", get_width(256, gw, max_channels));
    auto* conv23_cv3_0_0_1 = convBnSiLU(network, weightMap, *conv23_cv3_0_0_0->getOutput(0), base_out_channel, 1, 1, "model.23.one2one_cv3.0.0.1");
    auto* conv23_cv3_0_1_0 = convBnSiLU(network, weightMap, *conv23_cv3_0_0_1->getOutput(0), base_out_channel, 3, 1, "model.23.one2one_cv3.0.1.0", base_out_channel);
    auto* conv23_cv3_0_1_1 = convBnSiLU(network, weightMap, *conv23_cv3_0_1_0->getOutput(0), base_out_channel, 1, 1, "model.23.one2one_cv3.0.1.1");
    auto* conv23_cv3_0_2 = network->addConvolutionNd(*conv23_cv3_0_1_1->getOutput(0), kNumClass, nvinfer1::DimsHW{1, 1},
                                                     weightMap["model.23.one2one_cv3.0.2.weight"],
                                                     weightMap["model.23.one2one_cv3.0.2.bias"]);
    conv23_cv3_0_2->setStrideNd(nvinfer1::DimsHW{1, 1});
    conv23_cv3_0_2->setPaddingNd(nvinfer1::DimsHW{0, 0});
    nvinfer1::ITensor* inputTensor23_0[] = {conv23_cv2_0_2->getOutput(0), conv23_cv3_0_2->getOutput(0)};
    nvinfer1::IConcatenationLayer* cat23_0 = network->addConcatenation(inputTensor23_0, 2);

    // output1: same head structure on conv19
    auto* conv23_cv2_1_0 = convBnSiLU(network, weightMap, *conv19->getOutput(0), base_in_channel, 3, 1, "model.23.one2one_cv2.1.0");
    auto* conv23_cv2_1_1 = convBnSiLU(network, weightMap, *conv23_cv2_1_0->getOutput(0), base_in_channel, 3, 1, "model.23.one2one_cv2.1.1");
    nvinfer1::IConvolutionLayer* conv23_cv2_1_2 = network->addConvolutionNd(
            *conv23_cv2_1_1->getOutput(0), 64, nvinfer1::DimsHW{1, 1}, weightMap["model.23.one2one_cv2.1.2.weight"],
            weightMap["model.23.one2one_cv2.1.2.bias"]);
    conv23_cv2_1_2->setStrideNd(nvinfer1::DimsHW{1, 1});
    conv23_cv2_1_2->setPaddingNd(nvinfer1::DimsHW{0, 0});
    auto* conv23_cv3_1_0_0 = convBnSiLU(network, weightMap, *conv19->getOutput(0), get_width(512, gw, max_channels), 3, 1, "model.23.one2one_cv3.1.0.0", get_width(512, gw, max_channels));
    auto* conv23_cv3_1_0_1 = convBnSiLU(network, weightMap, *conv23_cv3_1_0_0->getOutput(0), base_out_channel, 1, 1, "model.23.one2one_cv3.1.0.1");
    auto* conv23_cv3_1_1_0 = convBnSiLU(network, weightMap, *conv23_cv3_1_0_1->getOutput(0), base_out_channel, 3, 1, "model.23.one2one_cv3.1.1.0", base_out_channel);
    auto* conv23_cv3_1_1_1 = convBnSiLU(network, weightMap, *conv23_cv3_1_1_0->getOutput(0), base_out_channel, 1, 1, "model.23.one2one_cv3.1.1.1");
    auto* conv23_cv3_1_2 = network->addConvolutionNd(*conv23_cv3_1_1_1->getOutput(0), kNumClass, nvinfer1::DimsHW{1, 1},
                                                     weightMap["model.23.one2one_cv3.1.2.weight"],
                                                     weightMap["model.23.one2one_cv3.1.2.bias"]);
    conv23_cv3_1_2->setStrideNd(nvinfer1::DimsHW{1, 1});
    conv23_cv3_1_2->setPaddingNd(nvinfer1::DimsHW{0, 0});
    nvinfer1::ITensor* inputTensor23_1[] = {conv23_cv2_1_2->getOutput(0), conv23_cv3_1_2->getOutput(0)};
    nvinfer1::IConcatenationLayer* cat23_1 = network->addConcatenation(inputTensor23_1, 2);

    // output2: same head structure on conv22
    // NOTE(review): unlike output0/output1, the final convolutions here do not call
    // setStrideNd/setPaddingNd - presumably relying on the layer defaults; confirm.
    auto* conv23_cv2_2_0 = convBnSiLU(network, weightMap, *conv22->getOutput(0), base_in_channel, 3, 1, "model.23.one2one_cv2.2.0");
    auto* conv23_cv2_2_1 = convBnSiLU(network, weightMap, *conv23_cv2_2_0->getOutput(0), base_in_channel, 3, 1, "model.23.one2one_cv2.2.1");
    nvinfer1::IConvolutionLayer* conv23_cv2_2_2 = network->addConvolutionNd(
            *conv23_cv2_2_1->getOutput(0), 64, nvinfer1::DimsHW{1, 1}, weightMap["model.23.one2one_cv2.2.2.weight"],
            weightMap["model.23.one2one_cv2.2.2.bias"]);
    auto* conv23_cv3_2_0_0 = convBnSiLU(network, weightMap, *conv22->getOutput(0), get_width(1024, gw, max_channels), 3, 1, "model.23.one2one_cv3.2.0.0", get_width(1024, gw, max_channels));
    auto* conv23_cv3_2_0_1 = convBnSiLU(network, weightMap, *conv23_cv3_2_0_0->getOutput(0), base_out_channel, 1, 1, "model.23.one2one_cv3.2.0.1");
    auto* conv23_cv3_2_1_0 = convBnSiLU(network, weightMap, *conv23_cv3_2_0_1->getOutput(0), base_out_channel, 3, 1, "model.23.one2one_cv3.2.1.0", base_out_channel);
    auto* conv23_cv3_2_1_1 = convBnSiLU(network, weightMap, *conv23_cv3_2_1_0->getOutput(0), base_out_channel, 1, 1, "model.23.one2one_cv3.2.1.1");
    auto* conv23_cv3_2_2 = network->addConvolutionNd(*conv23_cv3_2_1_1->getOutput(0), kNumClass, nvinfer1::DimsHW{1, 1},
                                                     weightMap["model.23.one2one_cv3.2.2.weight"],
                                                     weightMap["model.23.one2one_cv3.2.2.bias"]);
    nvinfer1::ITensor* inputTensor23_2[] = {conv23_cv2_2_2->getOutput(0), conv23_cv3_2_2->getOutput(0)};
    nvinfer1::IConcatenationLayer* cat23_2 = network->addConcatenation(inputTensor23_2, 2);

    /* ========================= YOLOV10 DETECT ========================= */
    // Strides are derived from the outputs of conv3, conv5 and conv7 relative to kInputH.
    nvinfer1::ILayer* conv_layers[] = {conv3, conv5, conv7};
    int strides[sizeof(conv_layers) / sizeof(conv_layers[0])];
    calculateStrides(conv_layers, sizeof(conv_layers) / sizeof(conv_layers[0]), kInputH, strides);
    int stridesLength = sizeof(strides) / sizeof(int);

    // Scale 0: flatten the spatial axes, split into the 64 cv2 channels and the
    // kNumClass cv3 channels, run DFL on the former and re-join along axis 1.
    nvinfer1::IShuffleLayer* shuffle23_0 = network->addShuffle(*cat23_0->getOutput(0));
    shuffle23_0->setReshapeDimensions(
            nvinfer1::Dims3{kBatchSize, 64 + kNumClass, (kInputH / strides[0]) * (kInputW / strides[0])});
    nvinfer1::ISliceLayer* split23_0_0 = network->addSlice(
            *shuffle23_0->getOutput(0), nvinfer1::Dims3{0, 0, 0},
            nvinfer1::Dims3{kBatchSize, 64, (kInputH / strides[0]) * (kInputW / strides[0])}, nvinfer1::Dims3{1, 1, 1});
    nvinfer1::ISliceLayer* split23_0_1 =
            network->addSlice(*shuffle23_0->getOutput(0), nvinfer1::Dims3{0, 64, 0},
                              nvinfer1::Dims3{kBatchSize, kNumClass, (kInputH / strides[0]) * (kInputW / strides[0])},
                              nvinfer1::Dims3{1, 1, 1});
    nvinfer1::IShuffleLayer* dfl23_0 =
            DFL(network, weightMap, *split23_0_0->getOutput(0), 4, (kInputH / strides[0]) * (kInputW / strides[0]), 1,
                1, 0, "model.23.dfl.conv.weight");
    nvinfer1::ITensor* inputTensor23_dfl_0[] = {dfl23_0->getOutput(0), split23_0_1->getOutput(0)};
    nvinfer1::IConcatenationLayer* cat23_dfl_0 = network->addConcatenation(inputTensor23_dfl_0, 2);
    cat23_dfl_0->setAxis(1);

    // Scale 1: same processing with strides[1].
    nvinfer1::IShuffleLayer* shuffle23_1 = network->addShuffle(*cat23_1->getOutput(0));
    shuffle23_1->setReshapeDimensions(
            nvinfer1::Dims3{kBatchSize, 64 + kNumClass, (kInputH / strides[1]) * (kInputW / strides[1])});
    nvinfer1::ISliceLayer* split23_1_0 = network->addSlice(
            *shuffle23_1->getOutput(0), nvinfer1::Dims3{0, 0, 0},
            nvinfer1::Dims3{kBatchSize, 64, (kInputH / strides[1]) * (kInputW / strides[1])}, nvinfer1::Dims3{1, 1, 1});
    nvinfer1::ISliceLayer* split23_1_1 =
            network->addSlice(*shuffle23_1->getOutput(0), nvinfer1::Dims3{0, 64, 0},
                              nvinfer1::Dims3{kBatchSize, kNumClass, (kInputH / strides[1]) * (kInputW / strides[1])},
                              nvinfer1::Dims3{1, 1, 1});
    nvinfer1::IShuffleLayer* dfl23_1 =
            DFL(network, weightMap, *split23_1_0->getOutput(0), 4, (kInputH / strides[1]) * (kInputW / strides[1]), 1,
                1, 0, "model.23.dfl.conv.weight");
    nvinfer1::ITensor* inputTensor23_dfl_1[] = {dfl23_1->getOutput(0), split23_1_1->getOutput(0)};
    nvinfer1::IConcatenationLayer* cat23_dfl_1 = network->addConcatenation(inputTensor23_dfl_1, 2);
    cat23_dfl_1->setAxis(1);

    // Scale 2: same processing with strides[2].
    nvinfer1::IShuffleLayer* shuffle23_2 = network->addShuffle(*cat23_2->getOutput(0));
    shuffle23_2->setReshapeDimensions(
            nvinfer1::Dims3{kBatchSize, 64 + kNumClass, (kInputH / strides[2]) * (kInputW / strides[2])});
    nvinfer1::ISliceLayer* split23_2_0 = network->addSlice(
            *shuffle23_2->getOutput(0), nvinfer1::Dims3{0, 0, 0},
            nvinfer1::Dims3{kBatchSize, 64, (kInputH / strides[2]) * (kInputW / strides[2])}, nvinfer1::Dims3{1, 1, 1});
    nvinfer1::ISliceLayer* split23_2_1 =
            network->addSlice(*shuffle23_2->getOutput(0), nvinfer1::Dims3{0, 64, 0},
                              nvinfer1::Dims3{kBatchSize, kNumClass, (kInputH / strides[2]) * (kInputW / strides[2])},
                              nvinfer1::Dims3{1, 1, 1});
    nvinfer1::IShuffleLayer* dfl23_2 =
            DFL(network, weightMap, *split23_2_0->getOutput(0), 4, (kInputH / strides[2]) * (kInputW / strides[2]), 1,
                1, 0, "model.23.dfl.conv.weight");
    nvinfer1::ITensor* inputTensor23_dfl_2[] = {dfl23_2->getOutput(0), split23_2_1->getOutput(0)};
    nvinfer1::IConcatenationLayer* cat23_dfl_2 = network->addConcatenation(inputTensor23_dfl_2, 2);
    cat23_dfl_2->setAxis(1);

    // The YOLO plugin layer consumes the three per-scale tensors and is the network output.
    nvinfer1::IPluginV2Layer* yolo = addYoLoLayer(
            network, std::vector{cat23_dfl_0, cat23_dfl_1, cat23_dfl_2}, strides, stridesLength);
    yolo->getOutput(0)->setName(kOutputTensorName);
    network->markOutput(*yolo->getOutput(0));

    // 16 MiB builder workspace.
    config->setMaxWorkspaceSize(16 * (1 << 20));

#if defined(USE_FP16)
    config->setFlag(nvinfer1::BuilderFlag::kFP16);
#elif defined(USE_INT8)
    std::cout << "Your platform support int8: " << (builder->platformHasFastInt8() ? "true" : "false") << std::endl;
    assert(builder->platformHasFastInt8());
    config->setFlag(nvinfer1::BuilderFlag::kINT8);
    // NOTE(review): the calibrator is allocated with new and never deleted in this function.
    auto* calibrator = new Int8EntropyCalibrator2(1, kInputW, kInputH, kInputQuantizationFolder, "int8calib.table",
                                                  kInputTensorName);
    config->setInt8Calibrator(calibrator);
#endif

    std::cout << "Building engine, please wait for a while..." << std::endl;
    nvinfer1::IHostMemory* serialized_model = builder->buildSerializedNetwork(*network, *config);
    std::cout << "Build engine successfully!" << std::endl;

    // The network definition and the host-side weight buffers are released here.
    delete network;
    for (auto& mem : weightMap) {
        free((void*)(mem.second.values));
    }
    return serialized_model;
}

nvinfer1::IHostMemory* buildEngineYolov10DetS(nvinfer1::IBuilder* builder, nvinfer1::IBuilderConfig* config,
                                              nvinfer1::DataType dt, const std::string& wts_path, float& gd,
                                              float& gw, int& max_channels) {
    std::map weightMap = loadWeights(wts_path);
    // nvinfer1::INetworkDefinition *network = builder->createNetworkV2(0U);
    nvinfer1::INetworkDefinition* network = builder->createNetworkV2(
            1U << static_cast(nvinfer1::NetworkDefinitionCreationFlag::kEXPLICIT_BATCH));

    /* ========================= YOLOV10 INPUT ========================= */
    nvinfer1::ITensor* data =
            network->addInput(kInputTensorName, dt, nvinfer1::Dims4{kBatchSize, 3, kInputH, kInputW});
    assert(data);

    /* ======================== YOLOV10 BACKBONE ======================== */
    auto* conv0 = convBnSiLU(network, weightMap, *data, get_width(64, gw, max_channels), 3, 2, "model.0");
    auto* conv1 = convBnSiLU(network, weightMap, *conv0->getOutput(0), get_width(128, gw, max_channels), 3, 2, "model.1");
    // 11233
    auto* conv2 = C2F(network, weightMap, *conv1->getOutput(0), get_width(128, gw, max_channels), get_width(128, gw, max_channels), get_depth(3, gd), true, 0.5, "model.2");
    auto* conv3 = convBnSiLU(network, weightMap, *conv2->getOutput(0), get_width(256, gw, max_channels), 3, 2, "model.3");
    // 22466
    auto* conv4 = C2F(network, weightMap, *conv3->getOutput(0), get_width(256, gw, max_channels), get_width(256, gw,
max_channels), get_depth(6, gd), true, 0.5, "model.4"); auto* conv5 = SCDown(network, weightMap, *conv4->getOutput(0), get_width(512, gw, max_channels), 3, 2, "model.5"); // 22466 auto* conv6 = C2F(network, weightMap, *conv5->getOutput(0), get_width(512, gw, max_channels), get_width(512, gw, max_channels), get_depth(6, gd), true, 0.5, "model.6"); auto* conv7 = SCDown(network, weightMap, *conv6->getOutput(0), get_width(1024, gw, max_channels), 3, 2, "model.7"); // 11233 auto* conv8 = C2fCIB(network, weightMap, *conv7->getOutput(0), get_width(1024, gw, max_channels), get_width(1024, gw, max_channels), get_depth(3, gd), true, true, 0.5, "model.8"); auto* conv9 = SPPF(network, weightMap, *conv8->getOutput(0), get_width(1024, gw, max_channels), get_width(1024, gw, max_channels), 5, "model.9"); auto* conv10 = PSA(network, weightMap, *conv9->getOutput(0), get_width(1024, gw, max_channels), "model.10"); /******************************************************************************************************* ********************************************* YOLOV10 HEAD ******************************************** *******************************************************************************************************/ float scale[] = {1.0, 1.0, 2.0, 2.0}; nvinfer1::IResizeLayer* upsample11 = network->addResize(*conv10->getOutput(0)); assert(upsample11); upsample11->setResizeMode(nvinfer1::ResizeMode::kNEAREST); upsample11->setScales(scale, 4); nvinfer1::ITensor* inputTensor12[] = {upsample11->getOutput(0), conv6->getOutput(0)}; nvinfer1::IConcatenationLayer* cat12 = network->addConcatenation(inputTensor12, 2); auto* conv13 = C2F(network, weightMap, *cat12->getOutput(0), get_width(512, gw, max_channels), get_width(512, gw, max_channels), get_depth(3, gd), false, 0.5, "model.13"); nvinfer1::IResizeLayer* upsample14 = network->addResize(*conv13->getOutput(0)); assert(upsample14); upsample14->setResizeMode(nvinfer1::ResizeMode::kNEAREST); upsample14->setScales(scale, 4); 
nvinfer1::ITensor* inputTensor15[] = {upsample14->getOutput(0), conv4->getOutput(0)}; nvinfer1::IConcatenationLayer* cat15 = network->addConcatenation(inputTensor15, 2); auto* conv16 = C2F(network, weightMap, *cat15->getOutput(0), get_width(256, gw, max_channels), get_width(256, gw, max_channels), get_depth(3, gd), false, 0.5, "model.16"); auto* conv17 = convBnSiLU(network, weightMap, *conv16->getOutput(0), get_width(256, gw, max_channels), 3, 2, "model.17"); nvinfer1::ITensor* inputTensor18[] = {conv17->getOutput(0), conv13->getOutput(0)}; nvinfer1::IConcatenationLayer* cat18 = network->addConcatenation(inputTensor18, 2); auto* conv19 = C2F(network, weightMap, *cat18->getOutput(0), get_width(512, gw, max_channels), get_width(512, gw, max_channels), get_depth(3, gd), false, 0.5, "model.19"); auto* conv20 = SCDown(network, weightMap, *conv19->getOutput(0), get_width(512, gw, max_channels), 3, 2, "model.20"); nvinfer1::ITensor* inputTensor21[] = {conv20->getOutput(0), conv10->getOutput(0)}; nvinfer1::IConcatenationLayer* cat21 = network->addConcatenation(inputTensor21, 2); auto* conv22 = C2fCIB(network, weightMap, *cat21->getOutput(0), get_width(1024, gw, max_channels), get_width(1024, gw, max_channels), get_depth(3, gd), true, true, 0.5, "model.22"); /******************************************************************************************************* ********************************************* YOLOV10 OUTPUT ****************************************** *******************************************************************************************************/ auto d = conv16->getOutput(0)->getDimensions(); assert(d.nbDims == 4); int ch_0 = d.d[1]; int base_in_channel = std::max(16, std::max(ch_0 / 4, 16 * 4)); int base_out_channel = std::max(ch_0, std::min(kNumClass, 100)); // output0 auto* conv23_cv2_0_0 = convBnSiLU(network, weightMap, *conv16->getOutput(0), base_in_channel, 3, 1, "model.23.one2one_cv2.0.0"); auto* conv23_cv2_0_1 = convBnSiLU(network, weightMap, 
*conv23_cv2_0_0->getOutput(0), base_in_channel, 3, 1, "model.23.one2one_cv2.0.1"); nvinfer1::IConvolutionLayer* conv23_cv2_0_2 = network->addConvolutionNd( *conv23_cv2_0_1->getOutput(0), 64, nvinfer1::DimsHW{1, 1}, weightMap["model.23.one2one_cv2.0.2.weight"], weightMap["model.23.one2one_cv2.0.2.bias"]); conv23_cv2_0_2->setStrideNd(nvinfer1::DimsHW{1, 1}); conv23_cv2_0_2->setPaddingNd(nvinfer1::DimsHW{0, 0}); auto* conv23_cv3_0_0_0 = convBnSiLU(network, weightMap, *conv16->getOutput(0), get_width(256, gw, max_channels), 3, 1, "model.23.one2one_cv3.0.0.0", get_width(256, gw, max_channels)); auto* conv23_cv3_0_0_1 = convBnSiLU(network, weightMap, *conv23_cv3_0_0_0->getOutput(0), base_out_channel, 1, 1, "model.23.one2one_cv3.0.0.1"); auto* conv23_cv3_0_1_0 = convBnSiLU(network, weightMap, *conv23_cv3_0_0_1->getOutput(0), base_out_channel, 3, 1, "model.23.one2one_cv3.0.1.0", base_out_channel); auto* conv23_cv3_0_1_1 = convBnSiLU(network, weightMap, *conv23_cv3_0_1_0->getOutput(0), base_out_channel, 1, 1, "model.23.one2one_cv3.0.1.1"); auto* conv23_cv3_0_2 = network->addConvolutionNd(*conv23_cv3_0_1_1->getOutput(0), kNumClass, nvinfer1::DimsHW{1, 1}, weightMap["model.23.one2one_cv3.0.2.weight"], weightMap["model.23.one2one_cv3.0.2.bias"]); conv23_cv3_0_2->setStrideNd(nvinfer1::DimsHW{1, 1}); conv23_cv3_0_2->setPaddingNd(nvinfer1::DimsHW{0, 0}); nvinfer1::ITensor* inputTensor23_0[] = {conv23_cv2_0_2->getOutput(0), conv23_cv3_0_2->getOutput(0)}; nvinfer1::IConcatenationLayer* cat23_0 = network->addConcatenation(inputTensor23_0, 2); // output1 auto* conv23_cv2_1_0 = convBnSiLU(network, weightMap, *conv19->getOutput(0), base_in_channel, 3, 1, "model.23.one2one_cv2.1.0"); auto* conv23_cv2_1_1 = convBnSiLU(network, weightMap, *conv23_cv2_1_0->getOutput(0), base_in_channel, 3, 1, "model.23.one2one_cv2.1.1"); nvinfer1::IConvolutionLayer* conv23_cv2_1_2 = network->addConvolutionNd( *conv23_cv2_1_1->getOutput(0), 64, nvinfer1::DimsHW{1, 1}, 
weightMap["model.23.one2one_cv2.1.2.weight"], weightMap["model.23.one2one_cv2.1.2.bias"]); conv23_cv2_1_2->setStrideNd(nvinfer1::DimsHW{1, 1}); conv23_cv2_1_2->setPaddingNd(nvinfer1::DimsHW{0, 0}); auto* conv23_cv3_1_0_0 = convBnSiLU(network, weightMap, *conv19->getOutput(0), get_width(512, gw, max_channels), 3, 1, "model.23.one2one_cv3.1.0.0", get_width(512, gw, max_channels)); auto* conv23_cv3_1_0_1 = convBnSiLU(network, weightMap, *conv23_cv3_1_0_0->getOutput(0), base_out_channel, 1, 1, "model.23.one2one_cv3.1.0.1"); auto* conv23_cv3_1_1_0 = convBnSiLU(network, weightMap, *conv23_cv3_1_0_1->getOutput(0), base_out_channel, 3, 1, "model.23.one2one_cv3.1.1.0", base_out_channel); auto* conv23_cv3_1_1_1 = convBnSiLU(network, weightMap, *conv23_cv3_1_1_0->getOutput(0), base_out_channel, 1, 1, "model.23.one2one_cv3.1.1.1"); auto* conv23_cv3_1_2 = network->addConvolutionNd(*conv23_cv3_1_1_1->getOutput(0), kNumClass, nvinfer1::DimsHW{1, 1}, weightMap["model.23.one2one_cv3.1.2.weight"], weightMap["model.23.one2one_cv3.1.2.bias"]); conv23_cv3_1_2->setStrideNd(nvinfer1::DimsHW{1, 1}); conv23_cv3_1_2->setPaddingNd(nvinfer1::DimsHW{0, 0}); nvinfer1::ITensor* inputTensor23_1[] = {conv23_cv2_1_2->getOutput(0), conv23_cv3_1_2->getOutput(0)}; nvinfer1::IConcatenationLayer* cat23_1 = network->addConcatenation(inputTensor23_1, 2); // output2 auto* conv23_cv2_2_0 = convBnSiLU(network, weightMap, *conv22->getOutput(0), base_in_channel, 3, 1, "model.23.one2one_cv2.2.0"); auto* conv23_cv2_2_1 = convBnSiLU(network, weightMap, *conv23_cv2_2_0->getOutput(0), base_in_channel, 3, 1, "model.23.one2one_cv2.2.1"); nvinfer1::IConvolutionLayer* conv23_cv2_2_2 = network->addConvolutionNd( *conv23_cv2_2_1->getOutput(0), 64, nvinfer1::DimsHW{1, 1}, weightMap["model.23.one2one_cv2.2.2.weight"], weightMap["model.23.one2one_cv2.2.2.bias"]); auto* conv23_cv3_2_0_0 = convBnSiLU(network, weightMap, *conv22->getOutput(0), get_width(1024, gw, max_channels), 3, 1, "model.23.one2one_cv3.2.0.0", 
get_width(1024, gw, max_channels)); auto* conv23_cv3_2_0_1 = convBnSiLU(network, weightMap, *conv23_cv3_2_0_0->getOutput(0), base_out_channel, 1, 1, "model.23.one2one_cv3.2.0.1"); auto* conv23_cv3_2_1_0 = convBnSiLU(network, weightMap, *conv23_cv3_2_0_1->getOutput(0), base_out_channel, 3, 1, "model.23.one2one_cv3.2.1.0", base_out_channel); auto* conv23_cv3_2_1_1 = convBnSiLU(network, weightMap, *conv23_cv3_2_1_0->getOutput(0), base_out_channel, 1, 1, "model.23.one2one_cv3.2.1.1"); auto* conv23_cv3_2_2 = network->addConvolutionNd(*conv23_cv3_2_1_1->getOutput(0), kNumClass, nvinfer1::DimsHW{1, 1}, weightMap["model.23.one2one_cv3.2.2.weight"], weightMap["model.23.one2one_cv3.2.2.bias"]); nvinfer1::ITensor* inputTensor23_2[] = {conv23_cv2_2_2->getOutput(0), conv23_cv3_2_2->getOutput(0)}; nvinfer1::IConcatenationLayer* cat23_2 = network->addConcatenation(inputTensor23_2, 2); /******************************************************************************************************* ********************************************* YOLOV10 DETECT ****************************************** *******************************************************************************************************/ nvinfer1::ILayer* conv_layers[] = {conv3, conv5, conv7}; int strides[sizeof(conv_layers) / sizeof(conv_layers[0])]; calculateStrides(conv_layers, sizeof(conv_layers) / sizeof(conv_layers[0]), kInputH, strides); int stridesLength = sizeof(strides) / sizeof(int); nvinfer1::IShuffleLayer* shuffle23_0 = network->addShuffle(*cat23_0->getOutput(0)); shuffle23_0->setReshapeDimensions( nvinfer1::Dims3{kBatchSize, 64 + kNumClass, (kInputH / strides[0]) * (kInputW / strides[0])}); nvinfer1::ISliceLayer* split23_0_0 = network->addSlice( *shuffle23_0->getOutput(0), nvinfer1::Dims3{0, 0, 0}, nvinfer1::Dims3{kBatchSize, 64, (kInputH / strides[0]) * (kInputW / strides[0])}, nvinfer1::Dims3{1, 1, 1}); nvinfer1::ISliceLayer* split23_0_1 = network->addSlice(*shuffle23_0->getOutput(0), nvinfer1::Dims3{0, 64, 
0}, nvinfer1::Dims3{kBatchSize, kNumClass, (kInputH / strides[0]) * (kInputW / strides[0])}, nvinfer1::Dims3{1, 1, 1}); nvinfer1::IShuffleLayer* dfl23_0 = DFL(network, weightMap, *split23_0_0->getOutput(0), 4, (kInputH / strides[0]) * (kInputW / strides[0]), 1, 1, 0, "model.23.dfl.conv.weight"); nvinfer1::ITensor* inputTensor23_dfl_0[] = {dfl23_0->getOutput(0), split23_0_1->getOutput(0)}; nvinfer1::IConcatenationLayer* cat23_dfl_0 = network->addConcatenation(inputTensor23_dfl_0, 2); cat23_dfl_0->setAxis(1); nvinfer1::IShuffleLayer* shuffle23_1 = network->addShuffle(*cat23_1->getOutput(0)); shuffle23_1->setReshapeDimensions( nvinfer1::Dims3{kBatchSize, 64 + kNumClass, (kInputH / strides[1]) * (kInputW / strides[1])}); nvinfer1::ISliceLayer* split23_1_0 = network->addSlice( *shuffle23_1->getOutput(0), nvinfer1::Dims3{0, 0, 0}, nvinfer1::Dims3{kBatchSize, 64, (kInputH / strides[1]) * (kInputW / strides[1])}, nvinfer1::Dims3{1, 1, 1}); nvinfer1::ISliceLayer* split23_1_1 = network->addSlice(*shuffle23_1->getOutput(0), nvinfer1::Dims3{0, 64, 0}, nvinfer1::Dims3{kBatchSize, kNumClass, (kInputH / strides[1]) * (kInputW / strides[1])}, nvinfer1::Dims3{1, 1, 1}); nvinfer1::IShuffleLayer* dfl23_1 = DFL(network, weightMap, *split23_1_0->getOutput(0), 4, (kInputH / strides[1]) * (kInputW / strides[1]), 1, 1, 0, "model.23.dfl.conv.weight"); nvinfer1::ITensor* inputTensor23_dfl_1[] = {dfl23_1->getOutput(0), split23_1_1->getOutput(0)}; nvinfer1::IConcatenationLayer* cat23_dfl_1 = network->addConcatenation(inputTensor23_dfl_1, 2); cat23_dfl_1->setAxis(1); nvinfer1::IShuffleLayer* shuffle23_2 = network->addShuffle(*cat23_2->getOutput(0)); shuffle23_2->setReshapeDimensions( nvinfer1::Dims3{kBatchSize, 64 + kNumClass, (kInputH / strides[2]) * (kInputW / strides[2])}); nvinfer1::ISliceLayer* split23_2_0 = network->addSlice( *shuffle23_2->getOutput(0), nvinfer1::Dims3{0, 0, 0}, nvinfer1::Dims3{kBatchSize, 64, (kInputH / strides[2]) * (kInputW / strides[2])}, nvinfer1::Dims3{1, 1, 1}); 
nvinfer1::ISliceLayer* split23_2_1 = network->addSlice(*shuffle23_2->getOutput(0), nvinfer1::Dims3{0, 64, 0}, nvinfer1::Dims3{kBatchSize, kNumClass, (kInputH / strides[2]) * (kInputW / strides[2])}, nvinfer1::Dims3{1, 1, 1}); nvinfer1::IShuffleLayer* dfl23_2 = DFL(network, weightMap, *split23_2_0->getOutput(0), 4, (kInputH / strides[2]) * (kInputW / strides[2]), 1, 1, 0, "model.23.dfl.conv.weight"); nvinfer1::ITensor* inputTensor23_dfl_2[] = {dfl23_2->getOutput(0), split23_2_1->getOutput(0)}; nvinfer1::IConcatenationLayer* cat23_dfl_2 = network->addConcatenation(inputTensor23_dfl_2, 2); cat23_dfl_2->setAxis(1); nvinfer1::IPluginV2Layer* yolo = addYoLoLayer( network, std::vector{cat23_dfl_0, cat23_dfl_1, cat23_dfl_2}, strides, stridesLength); yolo->getOutput(0)->setName(kOutputTensorName); network->markOutput(*yolo->getOutput(0)); config->setMaxWorkspaceSize(16 * (1 << 20)); #if defined(USE_FP16) config->setFlag(nvinfer1::BuilderFlag::kFP16); #elif defined(USE_INT8) std::cout << "Your platform support int8: " << (builder->platformHasFastInt8() ? "true" : "false") << std::endl; assert(builder->platformHasFastInt8()); config->setFlag(nvinfer1::BuilderFlag::kINT8); auto* calibrator = new Int8EntropyCalibrator2(1, kInputW, kInputH, kInputQuantizationFolder, "int8calib.table", kInputTensorName); config->setInt8Calibrator(calibrator); #endif std::cout << "Building engine, please wait for a while..." << std::endl; nvinfer1::IHostMemory* serialized_model = builder->buildSerializedNetwork(*network, *config); std::cout << "Build engine successfully!" 
<< std::endl; delete network; for (auto& mem : weightMap) { free((void*)(mem.second.values)); } return serialized_model; } nvinfer1::IHostMemory* buildEngineYolov10DetM(nvinfer1::IBuilder* builder, nvinfer1::IBuilderConfig* config, nvinfer1::DataType dt, const std::string& wts_path, float& gd, float& gw, int& max_channels) { std::map weightMap = loadWeights(wts_path); // nvinfer1::INetworkDefinition *network = builder->createNetworkV2(0U); nvinfer1::INetworkDefinition* network = builder->createNetworkV2( 1U << static_cast(nvinfer1::NetworkDefinitionCreationFlag::kEXPLICIT_BATCH)); /******************************************************************************************************* ****************************************** YOLOV10 INPUT ********************************************** *******************************************************************************************************/ nvinfer1::ITensor* data = network->addInput(kInputTensorName, dt, nvinfer1::Dims4{kBatchSize, 3, kInputH, kInputW}); assert(data); /******************************************************************************************************* ***************************************** YOLOV10 BACKBONE ******************************************** *******************************************************************************************************/ auto* conv0 = convBnSiLU(network, weightMap, *data, get_width(64, gw, max_channels), 3, 2, "model.0"); auto* conv1 = convBnSiLU(network, weightMap, *conv0->getOutput(0), get_width(128, gw, max_channels), 3, 2, "model.1"); // 11233 auto* conv2 = C2F(network, weightMap, *conv1->getOutput(0), get_width(128, gw, max_channels), get_width(128, gw, max_channels), get_depth(3, gd), true, 0.5, "model.2"); auto* conv3 = convBnSiLU(network, weightMap, *conv2->getOutput(0), get_width(256, gw, max_channels), 3, 2, "model.3"); // 22466 auto* conv4 = C2F(network, weightMap, *conv3->getOutput(0), get_width(256, gw, max_channels), get_width(256, gw, 
max_channels), get_depth(6, gd), true, 0.5, "model.4"); auto* conv5 = SCDown(network, weightMap, *conv4->getOutput(0), get_width(512, gw, max_channels), 3, 2, "model.5"); // 22466 auto* conv6 = C2F(network, weightMap, *conv5->getOutput(0), get_width(512, gw, max_channels), get_width(512, gw, max_channels), get_depth(6, gd), true, 0.5, "model.6"); auto* conv7 = SCDown(network, weightMap, *conv6->getOutput(0), get_width(1024, gw, max_channels), 3, 2, "model.7"); // 11233 auto* conv8 = C2fCIB(network, weightMap, *conv7->getOutput(0), get_width(1024, gw, max_channels), get_width(1024, gw, max_channels), get_depth(3, gd), true, false, 0.5, "model.8"); auto* conv9 = SPPF(network, weightMap, *conv8->getOutput(0), get_width(1024, gw, max_channels), get_width(1024, gw, max_channels), 5, "model.9"); auto* conv10 = PSA(network, weightMap, *conv9->getOutput(0), get_width(1024, gw, max_channels), "model.10"); /******************************************************************************************************* ********************************************* YOLOV10 HEAD ******************************************** *******************************************************************************************************/ float scale[] = {1.0, 1.0, 2.0, 2.0}; nvinfer1::IResizeLayer* upsample11 = network->addResize(*conv10->getOutput(0)); assert(upsample11); upsample11->setResizeMode(nvinfer1::ResizeMode::kNEAREST); upsample11->setScales(scale, 4); nvinfer1::ITensor* inputTensor12[] = {upsample11->getOutput(0), conv6->getOutput(0)}; nvinfer1::IConcatenationLayer* cat12 = network->addConcatenation(inputTensor12, 2); auto* conv13 = C2F(network, weightMap, *cat12->getOutput(0), get_width(512, gw, max_channels), get_width(512, gw, max_channels), get_depth(3, gd), false, 0.5, "model.13"); nvinfer1::IResizeLayer* upsample14 = network->addResize(*conv13->getOutput(0)); assert(upsample14); upsample14->setResizeMode(nvinfer1::ResizeMode::kNEAREST); upsample14->setScales(scale, 4); 
nvinfer1::ITensor* inputTensor15[] = {upsample14->getOutput(0), conv4->getOutput(0)}; nvinfer1::IConcatenationLayer* cat15 = network->addConcatenation(inputTensor15, 2); auto* conv16 = C2F(network, weightMap, *cat15->getOutput(0), get_width(256, gw, max_channels), get_width(256, gw, max_channels), get_depth(3, gd), false, 0.5, "model.16"); auto* conv17 = convBnSiLU(network, weightMap, *conv16->getOutput(0), get_width(256, gw, max_channels), 3, 2, "model.17"); nvinfer1::ITensor* inputTensor18[] = {conv17->getOutput(0), conv13->getOutput(0)}; nvinfer1::IConcatenationLayer* cat18 = network->addConcatenation(inputTensor18, 2); auto* conv19 = C2fCIB(network, weightMap, *cat18->getOutput(0), get_width(512, gw, max_channels), get_width(512, gw, max_channels), get_depth(3, gd), true, false, 0.5, "model.19"); auto* conv20 = SCDown(network, weightMap, *conv19->getOutput(0), get_width(512, gw, max_channels), 3, 2, "model.20"); nvinfer1::ITensor* inputTensor21[] = {conv20->getOutput(0), conv10->getOutput(0)}; nvinfer1::IConcatenationLayer* cat21 = network->addConcatenation(inputTensor21, 2); auto* conv22 = C2fCIB(network, weightMap, *cat21->getOutput(0), get_width(1024, gw, max_channels), get_width(1024, gw, max_channels), get_depth(3, gd), true, false, 0.5, "model.22"); /******************************************************************************************************* ********************************************* YOLOV10 OUTPUT ****************************************** *******************************************************************************************************/ auto d = conv16->getOutput(0)->getDimensions(); assert(d.nbDims == 4); int ch_0 = d.d[1]; int base_in_channel = std::max(16, std::max(ch_0 / 4, 16 * 4)); int base_out_channel = std::max(ch_0, std::min(kNumClass, 100)); // output0 auto* conv23_cv2_0_0 = convBnSiLU(network, weightMap, *conv16->getOutput(0), base_in_channel, 3, 1, "model.23.one2one_cv2.0.0"); auto* conv23_cv2_0_1 = convBnSiLU(network, 
weightMap, *conv23_cv2_0_0->getOutput(0), base_in_channel, 3, 1, "model.23.one2one_cv2.0.1"); nvinfer1::IConvolutionLayer* conv23_cv2_0_2 = network->addConvolutionNd( *conv23_cv2_0_1->getOutput(0), 64, nvinfer1::DimsHW{1, 1}, weightMap["model.23.one2one_cv2.0.2.weight"], weightMap["model.23.one2one_cv2.0.2.bias"]); conv23_cv2_0_2->setStrideNd(nvinfer1::DimsHW{1, 1}); conv23_cv2_0_2->setPaddingNd(nvinfer1::DimsHW{0, 0}); auto* conv23_cv3_0_0_0 = convBnSiLU(network, weightMap, *conv16->getOutput(0), get_width(256, gw, max_channels), 3, 1, "model.23.one2one_cv3.0.0.0", get_width(256, gw, max_channels)); auto* conv23_cv3_0_0_1 = convBnSiLU(network, weightMap, *conv23_cv3_0_0_0->getOutput(0), base_out_channel, 1, 1, "model.23.one2one_cv3.0.0.1"); auto* conv23_cv3_0_1_0 = convBnSiLU(network, weightMap, *conv23_cv3_0_0_1->getOutput(0), base_out_channel, 3, 1, "model.23.one2one_cv3.0.1.0", base_out_channel); auto* conv23_cv3_0_1_1 = convBnSiLU(network, weightMap, *conv23_cv3_0_1_0->getOutput(0), base_out_channel, 1, 1, "model.23.one2one_cv3.0.1.1"); auto* conv23_cv3_0_2 = network->addConvolutionNd(*conv23_cv3_0_1_1->getOutput(0), kNumClass, nvinfer1::DimsHW{1, 1}, weightMap["model.23.one2one_cv3.0.2.weight"], weightMap["model.23.one2one_cv3.0.2.bias"]); conv23_cv3_0_2->setStrideNd(nvinfer1::DimsHW{1, 1}); conv23_cv3_0_2->setPaddingNd(nvinfer1::DimsHW{0, 0}); nvinfer1::ITensor* inputTensor23_0[] = {conv23_cv2_0_2->getOutput(0), conv23_cv3_0_2->getOutput(0)}; nvinfer1::IConcatenationLayer* cat23_0 = network->addConcatenation(inputTensor23_0, 2); // output1 auto* conv23_cv2_1_0 = convBnSiLU(network, weightMap, *conv19->getOutput(0), base_in_channel, 3, 1, "model.23.one2one_cv2.1.0"); auto* conv23_cv2_1_1 = convBnSiLU(network, weightMap, *conv23_cv2_1_0->getOutput(0), base_in_channel, 3, 1, "model.23.one2one_cv2.1.1"); nvinfer1::IConvolutionLayer* conv23_cv2_1_2 = network->addConvolutionNd( *conv23_cv2_1_1->getOutput(0), 64, nvinfer1::DimsHW{1, 1}, 
weightMap["model.23.one2one_cv2.1.2.weight"], weightMap["model.23.one2one_cv2.1.2.bias"]); conv23_cv2_1_2->setStrideNd(nvinfer1::DimsHW{1, 1}); conv23_cv2_1_2->setPaddingNd(nvinfer1::DimsHW{0, 0}); auto* conv23_cv3_1_0_0 = convBnSiLU(network, weightMap, *conv19->getOutput(0), get_width(512, gw, max_channels), 3, 1, "model.23.one2one_cv3.1.0.0", get_width(512, gw, max_channels)); auto* conv23_cv3_1_0_1 = convBnSiLU(network, weightMap, *conv23_cv3_1_0_0->getOutput(0), base_out_channel, 1, 1, "model.23.one2one_cv3.1.0.1"); auto* conv23_cv3_1_1_0 = convBnSiLU(network, weightMap, *conv23_cv3_1_0_1->getOutput(0), base_out_channel, 3, 1, "model.23.one2one_cv3.1.1.0", base_out_channel); auto* conv23_cv3_1_1_1 = convBnSiLU(network, weightMap, *conv23_cv3_1_1_0->getOutput(0), base_out_channel, 1, 1, "model.23.one2one_cv3.1.1.1"); auto* conv23_cv3_1_2 = network->addConvolutionNd(*conv23_cv3_1_1_1->getOutput(0), kNumClass, nvinfer1::DimsHW{1, 1}, weightMap["model.23.one2one_cv3.1.2.weight"], weightMap["model.23.one2one_cv3.1.2.bias"]); conv23_cv3_1_2->setStrideNd(nvinfer1::DimsHW{1, 1}); conv23_cv3_1_2->setPaddingNd(nvinfer1::DimsHW{0, 0}); nvinfer1::ITensor* inputTensor23_1[] = {conv23_cv2_1_2->getOutput(0), conv23_cv3_1_2->getOutput(0)}; nvinfer1::IConcatenationLayer* cat23_1 = network->addConcatenation(inputTensor23_1, 2); // output2 auto* conv23_cv2_2_0 = convBnSiLU(network, weightMap, *conv22->getOutput(0), base_in_channel, 3, 1, "model.23.one2one_cv2.2.0"); auto* conv23_cv2_2_1 = convBnSiLU(network, weightMap, *conv23_cv2_2_0->getOutput(0), base_in_channel, 3, 1, "model.23.one2one_cv2.2.1"); nvinfer1::IConvolutionLayer* conv23_cv2_2_2 = network->addConvolutionNd( *conv23_cv2_2_1->getOutput(0), 64, nvinfer1::DimsHW{1, 1}, weightMap["model.23.one2one_cv2.2.2.weight"], weightMap["model.23.one2one_cv2.2.2.bias"]); auto* conv23_cv3_2_0_0 = convBnSiLU(network, weightMap, *conv22->getOutput(0), get_width(1024, gw, max_channels), 3, 1, "model.23.one2one_cv3.2.0.0", 
get_width(1024, gw, max_channels)); auto* conv23_cv3_2_0_1 = convBnSiLU(network, weightMap, *conv23_cv3_2_0_0->getOutput(0), base_out_channel, 1, 1, "model.23.one2one_cv3.2.0.1"); auto* conv23_cv3_2_1_0 = convBnSiLU(network, weightMap, *conv23_cv3_2_0_1->getOutput(0), base_out_channel, 3, 1, "model.23.one2one_cv3.2.1.0", base_out_channel); auto* conv23_cv3_2_1_1 = convBnSiLU(network, weightMap, *conv23_cv3_2_1_0->getOutput(0), base_out_channel, 1, 1, "model.23.one2one_cv3.2.1.1"); auto* conv23_cv3_2_2 = network->addConvolutionNd(*conv23_cv3_2_1_1->getOutput(0), kNumClass, nvinfer1::DimsHW{1, 1}, weightMap["model.23.one2one_cv3.2.2.weight"], weightMap["model.23.one2one_cv3.2.2.bias"]); nvinfer1::ITensor* inputTensor23_2[] = {conv23_cv2_2_2->getOutput(0), conv23_cv3_2_2->getOutput(0)}; nvinfer1::IConcatenationLayer* cat23_2 = network->addConcatenation(inputTensor23_2, 2); /******************************************************************************************************* ********************************************* YOLOV10 DETECT ****************************************** *******************************************************************************************************/ nvinfer1::ILayer* conv_layers[] = {conv3, conv5, conv7}; int strides[sizeof(conv_layers) / sizeof(conv_layers[0])]; calculateStrides(conv_layers, sizeof(conv_layers) / sizeof(conv_layers[0]), kInputH, strides); int stridesLength = sizeof(strides) / sizeof(int); nvinfer1::IShuffleLayer* shuffle23_0 = network->addShuffle(*cat23_0->getOutput(0)); shuffle23_0->setReshapeDimensions( nvinfer1::Dims3{kBatchSize, 64 + kNumClass, (kInputH / strides[0]) * (kInputW / strides[0])}); nvinfer1::ISliceLayer* split23_0_0 = network->addSlice( *shuffle23_0->getOutput(0), nvinfer1::Dims3{0, 0, 0}, nvinfer1::Dims3{kBatchSize, 64, (kInputH / strides[0]) * (kInputW / strides[0])}, nvinfer1::Dims3{1, 1, 1}); nvinfer1::ISliceLayer* split23_0_1 = network->addSlice(*shuffle23_0->getOutput(0), nvinfer1::Dims3{0, 64, 
0}, nvinfer1::Dims3{kBatchSize, kNumClass, (kInputH / strides[0]) * (kInputW / strides[0])}, nvinfer1::Dims3{1, 1, 1}); nvinfer1::IShuffleLayer* dfl23_0 = DFL(network, weightMap, *split23_0_0->getOutput(0), 4, (kInputH / strides[0]) * (kInputW / strides[0]), 1, 1, 0, "model.23.dfl.conv.weight"); nvinfer1::ITensor* inputTensor23_dfl_0[] = {dfl23_0->getOutput(0), split23_0_1->getOutput(0)}; nvinfer1::IConcatenationLayer* cat23_dfl_0 = network->addConcatenation(inputTensor23_dfl_0, 2); cat23_dfl_0->setAxis(1); nvinfer1::IShuffleLayer* shuffle23_1 = network->addShuffle(*cat23_1->getOutput(0)); shuffle23_1->setReshapeDimensions( nvinfer1::Dims3{kBatchSize, 64 + kNumClass, (kInputH / strides[1]) * (kInputW / strides[1])}); nvinfer1::ISliceLayer* split23_1_0 = network->addSlice( *shuffle23_1->getOutput(0), nvinfer1::Dims3{0, 0, 0}, nvinfer1::Dims3{kBatchSize, 64, (kInputH / strides[1]) * (kInputW / strides[1])}, nvinfer1::Dims3{1, 1, 1}); nvinfer1::ISliceLayer* split23_1_1 = network->addSlice(*shuffle23_1->getOutput(0), nvinfer1::Dims3{0, 64, 0}, nvinfer1::Dims3{kBatchSize, kNumClass, (kInputH / strides[1]) * (kInputW / strides[1])}, nvinfer1::Dims3{1, 1, 1}); nvinfer1::IShuffleLayer* dfl23_1 = DFL(network, weightMap, *split23_1_0->getOutput(0), 4, (kInputH / strides[1]) * (kInputW / strides[1]), 1, 1, 0, "model.23.dfl.conv.weight"); nvinfer1::ITensor* inputTensor23_dfl_1[] = {dfl23_1->getOutput(0), split23_1_1->getOutput(0)}; nvinfer1::IConcatenationLayer* cat23_dfl_1 = network->addConcatenation(inputTensor23_dfl_1, 2); cat23_dfl_1->setAxis(1); nvinfer1::IShuffleLayer* shuffle23_2 = network->addShuffle(*cat23_2->getOutput(0)); shuffle23_2->setReshapeDimensions( nvinfer1::Dims3{kBatchSize, 64 + kNumClass, (kInputH / strides[2]) * (kInputW / strides[2])}); nvinfer1::ISliceLayer* split23_2_0 = network->addSlice( *shuffle23_2->getOutput(0), nvinfer1::Dims3{0, 0, 0}, nvinfer1::Dims3{kBatchSize, 64, (kInputH / strides[2]) * (kInputW / strides[2])}, nvinfer1::Dims3{1, 1, 1}); 
nvinfer1::ISliceLayer* split23_2_1 = network->addSlice(*shuffle23_2->getOutput(0), nvinfer1::Dims3{0, 64, 0}, nvinfer1::Dims3{kBatchSize, kNumClass, (kInputH / strides[2]) * (kInputW / strides[2])}, nvinfer1::Dims3{1, 1, 1}); nvinfer1::IShuffleLayer* dfl23_2 = DFL(network, weightMap, *split23_2_0->getOutput(0), 4, (kInputH / strides[2]) * (kInputW / strides[2]), 1, 1, 0, "model.23.dfl.conv.weight"); nvinfer1::ITensor* inputTensor23_dfl_2[] = {dfl23_2->getOutput(0), split23_2_1->getOutput(0)}; nvinfer1::IConcatenationLayer* cat23_dfl_2 = network->addConcatenation(inputTensor23_dfl_2, 2); cat23_dfl_2->setAxis(1); nvinfer1::IPluginV2Layer* yolo = addYoLoLayer( network, std::vector{cat23_dfl_0, cat23_dfl_1, cat23_dfl_2}, strides, stridesLength); yolo->getOutput(0)->setName(kOutputTensorName); network->markOutput(*yolo->getOutput(0)); config->setMaxWorkspaceSize(16 * (1 << 20)); #if defined(USE_FP16) config->setFlag(nvinfer1::BuilderFlag::kFP16); #elif defined(USE_INT8) std::cout << "Your platform support int8: " << (builder->platformHasFastInt8() ? "true" : "false") << std::endl; assert(builder->platformHasFastInt8()); config->setFlag(nvinfer1::BuilderFlag::kINT8); auto* calibrator = new Int8EntropyCalibrator2(1, kInputW, kInputH, kInputQuantizationFolder, "int8calib.table", kInputTensorName); config->setInt8Calibrator(calibrator); #endif std::cout << "Building engine, please wait for a while..." << std::endl; nvinfer1::IHostMemory* serialized_model = builder->buildSerializedNetwork(*network, *config); std::cout << "Build engine successfully!" 
<< std::endl; delete network; for (auto& mem : weightMap) { free((void*)(mem.second.values)); } return serialized_model; } nvinfer1::IHostMemory* buildEngineYolov10DetBL(nvinfer1::IBuilder* builder, nvinfer1::IBuilderConfig* config, nvinfer1::DataType dt, const std::string& wts_path, float& gd, float& gw, int& max_channels) { std::map weightMap = loadWeights(wts_path); // nvinfer1::INetworkDefinition *network = builder->createNetworkV2(0U); nvinfer1::INetworkDefinition* network = builder->createNetworkV2( 1U << static_cast(nvinfer1::NetworkDefinitionCreationFlag::kEXPLICIT_BATCH)); /******************************************************************************************************* ****************************************** YOLOV10 INPUT ********************************************** *******************************************************************************************************/ nvinfer1::ITensor* data = network->addInput(kInputTensorName, dt, nvinfer1::Dims4{kBatchSize, 3, kInputH, kInputW}); assert(data); /******************************************************************************************************* ***************************************** YOLOV10 BACKBONE ******************************************** *******************************************************************************************************/ auto* conv0 = convBnSiLU(network, weightMap, *data, get_width(64, gw, max_channels), 3, 2, "model.0"); auto* conv1 = convBnSiLU(network, weightMap, *conv0->getOutput(0), get_width(128, gw, max_channels), 3, 2, "model.1"); // 11233 auto* conv2 = C2F(network, weightMap, *conv1->getOutput(0), get_width(128, gw, max_channels), get_width(128, gw, max_channels), get_depth(3, gd), true, 0.5, "model.2"); auto* conv3 = convBnSiLU(network, weightMap, *conv2->getOutput(0), get_width(256, gw, max_channels), 3, 2, "model.3"); // 22466 auto* conv4 = C2F(network, weightMap, *conv3->getOutput(0), get_width(256, gw, max_channels), get_width(256, gw, 
max_channels), get_depth(6, gd), true, 0.5, "model.4"); auto* conv5 = SCDown(network, weightMap, *conv4->getOutput(0), get_width(512, gw, max_channels), 3, 2, "model.5"); // 22466 auto* conv6 = C2F(network, weightMap, *conv5->getOutput(0), get_width(512, gw, max_channels), get_width(512, gw, max_channels), get_depth(6, gd), true, 0.5, "model.6"); auto* conv7 = SCDown(network, weightMap, *conv6->getOutput(0), get_width(1024, gw, max_channels), 3, 2, "model.7"); // 11233 auto* conv8 = C2fCIB(network, weightMap, *conv7->getOutput(0), get_width(1024, gw, max_channels), get_width(1024, gw, max_channels), get_depth(3, gd), true, false, 0.5, "model.8"); auto* conv9 = SPPF(network, weightMap, *conv8->getOutput(0), get_width(1024, gw, max_channels), get_width(1024, gw, max_channels), 5, "model.9"); auto* conv10 = PSA(network, weightMap, *conv9->getOutput(0), get_width(1024, gw, max_channels), "model.10"); /******************************************************************************************************* ********************************************* YOLOV10 HEAD ******************************************** *******************************************************************************************************/ float scale[] = {1.0, 1.0, 2.0, 2.0}; nvinfer1::IResizeLayer* upsample11 = network->addResize(*conv10->getOutput(0)); assert(upsample11); upsample11->setResizeMode(nvinfer1::ResizeMode::kNEAREST); upsample11->setScales(scale, 4); nvinfer1::ITensor* inputTensor12[] = {upsample11->getOutput(0), conv6->getOutput(0)}; nvinfer1::IConcatenationLayer* cat12 = network->addConcatenation(inputTensor12, 2); auto* conv13 = C2fCIB(network, weightMap, *cat12->getOutput(0), get_width(512, gw, max_channels), get_width(512, gw, max_channels), get_depth(3, gd), true, false, 0.5, "model.13"); nvinfer1::IResizeLayer* upsample14 = network->addResize(*conv13->getOutput(0)); assert(upsample14); upsample14->setResizeMode(nvinfer1::ResizeMode::kNEAREST); upsample14->setScales(scale, 4); 
nvinfer1::ITensor* inputTensor15[] = {upsample14->getOutput(0), conv4->getOutput(0)}; nvinfer1::IConcatenationLayer* cat15 = network->addConcatenation(inputTensor15, 2); auto* conv16 = C2F(network, weightMap, *cat15->getOutput(0), get_width(256, gw, max_channels), get_width(256, gw, max_channels), get_depth(3, gd), false, 0.5, "model.16"); auto* conv17 = convBnSiLU(network, weightMap, *conv16->getOutput(0), get_width(256, gw, max_channels), 3, 2, "model.17"); nvinfer1::ITensor* inputTensor18[] = {conv17->getOutput(0), conv13->getOutput(0)}; nvinfer1::IConcatenationLayer* cat18 = network->addConcatenation(inputTensor18, 2); auto* conv19 = C2fCIB(network, weightMap, *cat18->getOutput(0), get_width(512, gw, max_channels), get_width(512, gw, max_channels), get_depth(3, gd), true, false, 0.5, "model.19"); auto* conv20 = SCDown(network, weightMap, *conv19->getOutput(0), get_width(512, gw, max_channels), 3, 2, "model.20"); nvinfer1::ITensor* inputTensor21[] = {conv20->getOutput(0), conv10->getOutput(0)}; nvinfer1::IConcatenationLayer* cat21 = network->addConcatenation(inputTensor21, 2); auto* conv22 = C2fCIB(network, weightMap, *cat21->getOutput(0), get_width(1024, gw, max_channels), get_width(1024, gw, max_channels), get_depth(3, gd), true, false, 0.5, "model.22"); /******************************************************************************************************* ********************************************* YOLOV10 OUTPUT ****************************************** *******************************************************************************************************/ auto d = conv16->getOutput(0)->getDimensions(); assert(d.nbDims == 4); int ch_0 = d.d[1]; int base_in_channel = std::max(16, std::max(ch_0 / 4, 16 * 4)); int base_out_channel = std::max(ch_0, std::min(kNumClass, 100)); // output0 auto* conv23_cv2_0_0 = convBnSiLU(network, weightMap, *conv16->getOutput(0), base_in_channel, 3, 1, "model.23.one2one_cv2.0.0"); auto* conv23_cv2_0_1 = convBnSiLU(network, 
weightMap, *conv23_cv2_0_0->getOutput(0), base_in_channel, 3, 1, "model.23.one2one_cv2.0.1"); nvinfer1::IConvolutionLayer* conv23_cv2_0_2 = network->addConvolutionNd( *conv23_cv2_0_1->getOutput(0), 64, nvinfer1::DimsHW{1, 1}, weightMap["model.23.one2one_cv2.0.2.weight"], weightMap["model.23.one2one_cv2.0.2.bias"]); conv23_cv2_0_2->setStrideNd(nvinfer1::DimsHW{1, 1}); conv23_cv2_0_2->setPaddingNd(nvinfer1::DimsHW{0, 0}); auto* conv23_cv3_0_0_0 = convBnSiLU(network, weightMap, *conv16->getOutput(0), get_width(256, gw, max_channels), 3, 1, "model.23.one2one_cv3.0.0.0", get_width(256, gw, max_channels)); auto* conv23_cv3_0_0_1 = convBnSiLU(network, weightMap, *conv23_cv3_0_0_0->getOutput(0), base_out_channel, 1, 1, "model.23.one2one_cv3.0.0.1"); auto* conv23_cv3_0_1_0 = convBnSiLU(network, weightMap, *conv23_cv3_0_0_1->getOutput(0), base_out_channel, 3, 1, "model.23.one2one_cv3.0.1.0", base_out_channel); auto* conv23_cv3_0_1_1 = convBnSiLU(network, weightMap, *conv23_cv3_0_1_0->getOutput(0), base_out_channel, 1, 1, "model.23.one2one_cv3.0.1.1"); auto* conv23_cv3_0_2 = network->addConvolutionNd(*conv23_cv3_0_1_1->getOutput(0), kNumClass, nvinfer1::DimsHW{1, 1}, weightMap["model.23.one2one_cv3.0.2.weight"], weightMap["model.23.one2one_cv3.0.2.bias"]); conv23_cv3_0_2->setStrideNd(nvinfer1::DimsHW{1, 1}); conv23_cv3_0_2->setPaddingNd(nvinfer1::DimsHW{0, 0}); nvinfer1::ITensor* inputTensor23_0[] = {conv23_cv2_0_2->getOutput(0), conv23_cv3_0_2->getOutput(0)}; nvinfer1::IConcatenationLayer* cat23_0 = network->addConcatenation(inputTensor23_0, 2); // output1 auto* conv23_cv2_1_0 = convBnSiLU(network, weightMap, *conv19->getOutput(0), base_in_channel, 3, 1, "model.23.one2one_cv2.1.0"); auto* conv23_cv2_1_1 = convBnSiLU(network, weightMap, *conv23_cv2_1_0->getOutput(0), base_in_channel, 3, 1, "model.23.one2one_cv2.1.1"); nvinfer1::IConvolutionLayer* conv23_cv2_1_2 = network->addConvolutionNd( *conv23_cv2_1_1->getOutput(0), 64, nvinfer1::DimsHW{1, 1}, 
weightMap["model.23.one2one_cv2.1.2.weight"], weightMap["model.23.one2one_cv2.1.2.bias"]); conv23_cv2_1_2->setStrideNd(nvinfer1::DimsHW{1, 1}); conv23_cv2_1_2->setPaddingNd(nvinfer1::DimsHW{0, 0}); auto* conv23_cv3_1_0_0 = convBnSiLU(network, weightMap, *conv19->getOutput(0), get_width(512, gw, max_channels), 3, 1, "model.23.one2one_cv3.1.0.0", get_width(512, gw, max_channels)); auto* conv23_cv3_1_0_1 = convBnSiLU(network, weightMap, *conv23_cv3_1_0_0->getOutput(0), base_out_channel, 1, 1, "model.23.one2one_cv3.1.0.1"); auto* conv23_cv3_1_1_0 = convBnSiLU(network, weightMap, *conv23_cv3_1_0_1->getOutput(0), base_out_channel, 3, 1, "model.23.one2one_cv3.1.1.0", base_out_channel); auto* conv23_cv3_1_1_1 = convBnSiLU(network, weightMap, *conv23_cv3_1_1_0->getOutput(0), base_out_channel, 1, 1, "model.23.one2one_cv3.1.1.1"); auto* conv23_cv3_1_2 = network->addConvolutionNd(*conv23_cv3_1_1_1->getOutput(0), kNumClass, nvinfer1::DimsHW{1, 1}, weightMap["model.23.one2one_cv3.1.2.weight"], weightMap["model.23.one2one_cv3.1.2.bias"]); conv23_cv3_1_2->setStrideNd(nvinfer1::DimsHW{1, 1}); conv23_cv3_1_2->setPaddingNd(nvinfer1::DimsHW{0, 0}); nvinfer1::ITensor* inputTensor23_1[] = {conv23_cv2_1_2->getOutput(0), conv23_cv3_1_2->getOutput(0)}; nvinfer1::IConcatenationLayer* cat23_1 = network->addConcatenation(inputTensor23_1, 2); // output2 auto* conv23_cv2_2_0 = convBnSiLU(network, weightMap, *conv22->getOutput(0), base_in_channel, 3, 1, "model.23.one2one_cv2.2.0"); auto* conv23_cv2_2_1 = convBnSiLU(network, weightMap, *conv23_cv2_2_0->getOutput(0), base_in_channel, 3, 1, "model.23.one2one_cv2.2.1"); nvinfer1::IConvolutionLayer* conv23_cv2_2_2 = network->addConvolutionNd( *conv23_cv2_2_1->getOutput(0), 64, nvinfer1::DimsHW{1, 1}, weightMap["model.23.one2one_cv2.2.2.weight"], weightMap["model.23.one2one_cv2.2.2.bias"]); auto* conv23_cv3_2_0_0 = convBnSiLU(network, weightMap, *conv22->getOutput(0), get_width(1024, gw, max_channels), 3, 1, "model.23.one2one_cv3.2.0.0", 
get_width(1024, gw, max_channels)); auto* conv23_cv3_2_0_1 = convBnSiLU(network, weightMap, *conv23_cv3_2_0_0->getOutput(0), base_out_channel, 1, 1, "model.23.one2one_cv3.2.0.1"); auto* conv23_cv3_2_1_0 = convBnSiLU(network, weightMap, *conv23_cv3_2_0_1->getOutput(0), base_out_channel, 3, 1, "model.23.one2one_cv3.2.1.0", base_out_channel); auto* conv23_cv3_2_1_1 = convBnSiLU(network, weightMap, *conv23_cv3_2_1_0->getOutput(0), base_out_channel, 1, 1, "model.23.one2one_cv3.2.1.1"); auto* conv23_cv3_2_2 = network->addConvolutionNd(*conv23_cv3_2_1_1->getOutput(0), kNumClass, nvinfer1::DimsHW{1, 1}, weightMap["model.23.one2one_cv3.2.2.weight"], weightMap["model.23.one2one_cv3.2.2.bias"]); nvinfer1::ITensor* inputTensor23_2[] = {conv23_cv2_2_2->getOutput(0), conv23_cv3_2_2->getOutput(0)}; nvinfer1::IConcatenationLayer* cat23_2 = network->addConcatenation(inputTensor23_2, 2); /******************************************************************************************************* ********************************************* YOLOV10 DETECT ****************************************** *******************************************************************************************************/ nvinfer1::ILayer* conv_layers[] = {conv3, conv5, conv7}; int strides[sizeof(conv_layers) / sizeof(conv_layers[0])]; calculateStrides(conv_layers, sizeof(conv_layers) / sizeof(conv_layers[0]), kInputH, strides); int stridesLength = sizeof(strides) / sizeof(int); nvinfer1::IShuffleLayer* shuffle23_0 = network->addShuffle(*cat23_0->getOutput(0)); shuffle23_0->setReshapeDimensions( nvinfer1::Dims3{kBatchSize, 64 + kNumClass, (kInputH / strides[0]) * (kInputW / strides[0])}); nvinfer1::ISliceLayer* split23_0_0 = network->addSlice( *shuffle23_0->getOutput(0), nvinfer1::Dims3{0, 0, 0}, nvinfer1::Dims3{kBatchSize, 64, (kInputH / strides[0]) * (kInputW / strides[0])}, nvinfer1::Dims3{1, 1, 1}); nvinfer1::ISliceLayer* split23_0_1 = network->addSlice(*shuffle23_0->getOutput(0), nvinfer1::Dims3{0, 64, 
0}, nvinfer1::Dims3{kBatchSize, kNumClass, (kInputH / strides[0]) * (kInputW / strides[0])}, nvinfer1::Dims3{1, 1, 1}); nvinfer1::IShuffleLayer* dfl23_0 = DFL(network, weightMap, *split23_0_0->getOutput(0), 4, (kInputH / strides[0]) * (kInputW / strides[0]), 1, 1, 0, "model.23.dfl.conv.weight"); nvinfer1::ITensor* inputTensor23_dfl_0[] = {dfl23_0->getOutput(0), split23_0_1->getOutput(0)}; nvinfer1::IConcatenationLayer* cat23_dfl_0 = network->addConcatenation(inputTensor23_dfl_0, 2); cat23_dfl_0->setAxis(1); nvinfer1::IShuffleLayer* shuffle23_1 = network->addShuffle(*cat23_1->getOutput(0)); shuffle23_1->setReshapeDimensions( nvinfer1::Dims3{kBatchSize, 64 + kNumClass, (kInputH / strides[1]) * (kInputW / strides[1])}); nvinfer1::ISliceLayer* split23_1_0 = network->addSlice( *shuffle23_1->getOutput(0), nvinfer1::Dims3{0, 0, 0}, nvinfer1::Dims3{kBatchSize, 64, (kInputH / strides[1]) * (kInputW / strides[1])}, nvinfer1::Dims3{1, 1, 1}); nvinfer1::ISliceLayer* split23_1_1 = network->addSlice(*shuffle23_1->getOutput(0), nvinfer1::Dims3{0, 64, 0}, nvinfer1::Dims3{kBatchSize, kNumClass, (kInputH / strides[1]) * (kInputW / strides[1])}, nvinfer1::Dims3{1, 1, 1}); nvinfer1::IShuffleLayer* dfl23_1 = DFL(network, weightMap, *split23_1_0->getOutput(0), 4, (kInputH / strides[1]) * (kInputW / strides[1]), 1, 1, 0, "model.23.dfl.conv.weight"); nvinfer1::ITensor* inputTensor23_dfl_1[] = {dfl23_1->getOutput(0), split23_1_1->getOutput(0)}; nvinfer1::IConcatenationLayer* cat23_dfl_1 = network->addConcatenation(inputTensor23_dfl_1, 2); cat23_dfl_1->setAxis(1); nvinfer1::IShuffleLayer* shuffle23_2 = network->addShuffle(*cat23_2->getOutput(0)); shuffle23_2->setReshapeDimensions( nvinfer1::Dims3{kBatchSize, 64 + kNumClass, (kInputH / strides[2]) * (kInputW / strides[2])}); nvinfer1::ISliceLayer* split23_2_0 = network->addSlice( *shuffle23_2->getOutput(0), nvinfer1::Dims3{0, 0, 0}, nvinfer1::Dims3{kBatchSize, 64, (kInputH / strides[2]) * (kInputW / strides[2])}, nvinfer1::Dims3{1, 1, 1}); 
nvinfer1::ISliceLayer* split23_2_1 = network->addSlice(*shuffle23_2->getOutput(0), nvinfer1::Dims3{0, 64, 0}, nvinfer1::Dims3{kBatchSize, kNumClass, (kInputH / strides[2]) * (kInputW / strides[2])}, nvinfer1::Dims3{1, 1, 1}); nvinfer1::IShuffleLayer* dfl23_2 = DFL(network, weightMap, *split23_2_0->getOutput(0), 4, (kInputH / strides[2]) * (kInputW / strides[2]), 1, 1, 0, "model.23.dfl.conv.weight"); nvinfer1::ITensor* inputTensor23_dfl_2[] = {dfl23_2->getOutput(0), split23_2_1->getOutput(0)}; nvinfer1::IConcatenationLayer* cat23_dfl_2 = network->addConcatenation(inputTensor23_dfl_2, 2); cat23_dfl_2->setAxis(1); nvinfer1::IPluginV2Layer* yolo = addYoLoLayer( network, std::vector{cat23_dfl_0, cat23_dfl_1, cat23_dfl_2}, strides, stridesLength); yolo->getOutput(0)->setName(kOutputTensorName); network->markOutput(*yolo->getOutput(0)); config->setMaxWorkspaceSize(16 * (1 << 20)); #if defined(USE_FP16) config->setFlag(nvinfer1::BuilderFlag::kFP16); #elif defined(USE_INT8) std::cout << "Your platform support int8: " << (builder->platformHasFastInt8() ? "true" : "false") << std::endl; assert(builder->platformHasFastInt8()); config->setFlag(nvinfer1::BuilderFlag::kINT8); auto* calibrator = new Int8EntropyCalibrator2(1, kInputW, kInputH, kInputQuantizationFolder, "int8calib.table", kInputTensorName); config->setInt8Calibrator(calibrator); #endif std::cout << "Building engine, please wait for a while..." << std::endl; nvinfer1::IHostMemory* serialized_model = builder->buildSerializedNetwork(*network, *config); std::cout << "Build engine successfully!" 
<< std::endl; delete network; for (auto& mem : weightMap) { free((void*)(mem.second.values)); } return serialized_model; } nvinfer1::IHostMemory* buildEngineYolov10DetX(nvinfer1::IBuilder* builder, nvinfer1::IBuilderConfig* config, nvinfer1::DataType dt, const std::string& wts_path, float& gd, float& gw, int& max_channels) { std::map weightMap = loadWeights(wts_path); // nvinfer1::INetworkDefinition *network = builder->createNetworkV2(0U); nvinfer1::INetworkDefinition* network = builder->createNetworkV2( 1U << static_cast(nvinfer1::NetworkDefinitionCreationFlag::kEXPLICIT_BATCH)); /******************************************************************************************************* ****************************************** YOLOV10 INPUT ********************************************** *******************************************************************************************************/ nvinfer1::ITensor* data = network->addInput(kInputTensorName, dt, nvinfer1::Dims4{kBatchSize, 3, kInputH, kInputW}); assert(data); /******************************************************************************************************* ***************************************** YOLOV10 BACKBONE ******************************************** *******************************************************************************************************/ auto* conv0 = convBnSiLU(network, weightMap, *data, get_width(64, gw, max_channels), 3, 2, "model.0"); auto* conv1 = convBnSiLU(network, weightMap, *conv0->getOutput(0), get_width(128, gw, max_channels), 3, 2, "model.1"); // 11233 auto* conv2 = C2F(network, weightMap, *conv1->getOutput(0), get_width(128, gw, max_channels), get_width(128, gw, max_channels), get_depth(3, gd), true, 0.5, "model.2"); auto* conv3 = convBnSiLU(network, weightMap, *conv2->getOutput(0), get_width(256, gw, max_channels), 3, 2, "model.3"); // 22466 auto* conv4 = C2F(network, weightMap, *conv3->getOutput(0), get_width(256, gw, max_channels), get_width(256, gw, 
max_channels), get_depth(6, gd), true, 0.5, "model.4"); auto* conv5 = SCDown(network, weightMap, *conv4->getOutput(0), get_width(512, gw, max_channels), 3, 2, "model.5"); // 22466 auto* conv6 = C2fCIB(network, weightMap, *conv5->getOutput(0), get_width(512, gw, max_channels), get_width(512, gw, max_channels), get_depth(6, gd), true, false, 0.5, "model.6"); auto* conv7 = SCDown(network, weightMap, *conv6->getOutput(0), get_width(1024, gw, max_channels), 3, 2, "model.7"); // 11233 auto* conv8 = C2fCIB(network, weightMap, *conv7->getOutput(0), get_width(1024, gw, max_channels), get_width(1024, gw, max_channels), get_depth(3, gd), true, false, 0.5, "model.8"); auto* conv9 = SPPF(network, weightMap, *conv8->getOutput(0), get_width(1024, gw, max_channels), get_width(1024, gw, max_channels), 5, "model.9"); auto* conv10 = PSA(network, weightMap, *conv9->getOutput(0), get_width(1024, gw, max_channels), "model.10"); /******************************************************************************************************* ********************************************* YOLOV10 HEAD ******************************************** *******************************************************************************************************/ float scale[] = {1.0, 1.0, 2.0, 2.0}; nvinfer1::IResizeLayer* upsample11 = network->addResize(*conv10->getOutput(0)); assert(upsample11); upsample11->setResizeMode(nvinfer1::ResizeMode::kNEAREST); upsample11->setScales(scale, 4); nvinfer1::ITensor* inputTensor12[] = {upsample11->getOutput(0), conv6->getOutput(0)}; nvinfer1::IConcatenationLayer* cat12 = network->addConcatenation(inputTensor12, 2); auto* conv13 = C2fCIB(network, weightMap, *cat12->getOutput(0), get_width(512, gw, max_channels), get_width(512, gw, max_channels), get_depth(3, gd), true, false, 0.5, "model.13"); nvinfer1::IResizeLayer* upsample14 = network->addResize(*conv13->getOutput(0)); assert(upsample14); upsample14->setResizeMode(nvinfer1::ResizeMode::kNEAREST); 
upsample14->setScales(scale, 4); nvinfer1::ITensor* inputTensor15[] = {upsample14->getOutput(0), conv4->getOutput(0)}; nvinfer1::IConcatenationLayer* cat15 = network->addConcatenation(inputTensor15, 2); auto* conv16 = C2F(network, weightMap, *cat15->getOutput(0), get_width(256, gw, max_channels), get_width(256, gw, max_channels), get_depth(3, gd), false, 0.5, "model.16"); auto* conv17 = convBnSiLU(network, weightMap, *conv16->getOutput(0), get_width(256, gw, max_channels), 3, 2, "model.17"); nvinfer1::ITensor* inputTensor18[] = {conv17->getOutput(0), conv13->getOutput(0)}; nvinfer1::IConcatenationLayer* cat18 = network->addConcatenation(inputTensor18, 2); auto* conv19 = C2fCIB(network, weightMap, *cat18->getOutput(0), get_width(512, gw, max_channels), get_width(512, gw, max_channels), get_depth(3, gd), true, false, 0.5, "model.19"); auto* conv20 = SCDown(network, weightMap, *conv19->getOutput(0), get_width(512, gw, max_channels), 3, 2, "model.20"); nvinfer1::ITensor* inputTensor21[] = {conv20->getOutput(0), conv10->getOutput(0)}; nvinfer1::IConcatenationLayer* cat21 = network->addConcatenation(inputTensor21, 2); auto* conv22 = C2fCIB(network, weightMap, *cat21->getOutput(0), get_width(1024, gw, max_channels), get_width(1024, gw, max_channels), get_depth(3, gd), true, false, 0.5, "model.22"); /******************************************************************************************************* ********************************************* YOLOV10 OUTPUT ****************************************** *******************************************************************************************************/ auto d = conv16->getOutput(0)->getDimensions(); assert(d.nbDims == 4); int ch_0 = d.d[1]; int base_in_channel = std::max(16, std::max(ch_0 / 4, 16 * 4)); int base_out_channel = std::max(ch_0, std::min(kNumClass, 100)); // output0 auto* conv23_cv2_0_0 = convBnSiLU(network, weightMap, *conv16->getOutput(0), base_in_channel, 3, 1, "model.23.one2one_cv2.0.0"); auto* 
conv23_cv2_0_1 = convBnSiLU(network, weightMap, *conv23_cv2_0_0->getOutput(0), base_in_channel, 3, 1, "model.23.one2one_cv2.0.1"); nvinfer1::IConvolutionLayer* conv23_cv2_0_2 = network->addConvolutionNd( *conv23_cv2_0_1->getOutput(0), 64, nvinfer1::DimsHW{1, 1}, weightMap["model.23.one2one_cv2.0.2.weight"], weightMap["model.23.one2one_cv2.0.2.bias"]); conv23_cv2_0_2->setStrideNd(nvinfer1::DimsHW{1, 1}); conv23_cv2_0_2->setPaddingNd(nvinfer1::DimsHW{0, 0}); auto* conv23_cv3_0_0_0 = convBnSiLU(network, weightMap, *conv16->getOutput(0), get_width(256, gw, max_channels), 3, 1, "model.23.one2one_cv3.0.0.0", get_width(256, gw, max_channels)); auto* conv23_cv3_0_0_1 = convBnSiLU(network, weightMap, *conv23_cv3_0_0_0->getOutput(0), base_out_channel, 1, 1, "model.23.one2one_cv3.0.0.1"); auto* conv23_cv3_0_1_0 = convBnSiLU(network, weightMap, *conv23_cv3_0_0_1->getOutput(0), base_out_channel, 3, 1, "model.23.one2one_cv3.0.1.0", base_out_channel); auto* conv23_cv3_0_1_1 = convBnSiLU(network, weightMap, *conv23_cv3_0_1_0->getOutput(0), base_out_channel, 1, 1, "model.23.one2one_cv3.0.1.1"); auto* conv23_cv3_0_2 = network->addConvolutionNd(*conv23_cv3_0_1_1->getOutput(0), kNumClass, nvinfer1::DimsHW{1, 1}, weightMap["model.23.one2one_cv3.0.2.weight"], weightMap["model.23.one2one_cv3.0.2.bias"]); conv23_cv3_0_2->setStrideNd(nvinfer1::DimsHW{1, 1}); conv23_cv3_0_2->setPaddingNd(nvinfer1::DimsHW{0, 0}); nvinfer1::ITensor* inputTensor23_0[] = {conv23_cv2_0_2->getOutput(0), conv23_cv3_0_2->getOutput(0)}; nvinfer1::IConcatenationLayer* cat23_0 = network->addConcatenation(inputTensor23_0, 2); // output1 auto* conv23_cv2_1_0 = convBnSiLU(network, weightMap, *conv19->getOutput(0), base_in_channel, 3, 1, "model.23.one2one_cv2.1.0"); auto* conv23_cv2_1_1 = convBnSiLU(network, weightMap, *conv23_cv2_1_0->getOutput(0), base_in_channel, 3, 1, "model.23.one2one_cv2.1.1"); nvinfer1::IConvolutionLayer* conv23_cv2_1_2 = network->addConvolutionNd( *conv23_cv2_1_1->getOutput(0), 64, 
nvinfer1::DimsHW{1, 1}, weightMap["model.23.one2one_cv2.1.2.weight"], weightMap["model.23.one2one_cv2.1.2.bias"]); conv23_cv2_1_2->setStrideNd(nvinfer1::DimsHW{1, 1}); conv23_cv2_1_2->setPaddingNd(nvinfer1::DimsHW{0, 0}); auto* conv23_cv3_1_0_0 = convBnSiLU(network, weightMap, *conv19->getOutput(0), get_width(512, gw, max_channels), 3, 1, "model.23.one2one_cv3.1.0.0", get_width(512, gw, max_channels)); auto* conv23_cv3_1_0_1 = convBnSiLU(network, weightMap, *conv23_cv3_1_0_0->getOutput(0), base_out_channel, 1, 1, "model.23.one2one_cv3.1.0.1"); auto* conv23_cv3_1_1_0 = convBnSiLU(network, weightMap, *conv23_cv3_1_0_1->getOutput(0), base_out_channel, 3, 1, "model.23.one2one_cv3.1.1.0", base_out_channel); auto* conv23_cv3_1_1_1 = convBnSiLU(network, weightMap, *conv23_cv3_1_1_0->getOutput(0), base_out_channel, 1, 1, "model.23.one2one_cv3.1.1.1"); auto* conv23_cv3_1_2 = network->addConvolutionNd(*conv23_cv3_1_1_1->getOutput(0), kNumClass, nvinfer1::DimsHW{1, 1}, weightMap["model.23.one2one_cv3.1.2.weight"], weightMap["model.23.one2one_cv3.1.2.bias"]); conv23_cv3_1_2->setStrideNd(nvinfer1::DimsHW{1, 1}); conv23_cv3_1_2->setPaddingNd(nvinfer1::DimsHW{0, 0}); nvinfer1::ITensor* inputTensor23_1[] = {conv23_cv2_1_2->getOutput(0), conv23_cv3_1_2->getOutput(0)}; nvinfer1::IConcatenationLayer* cat23_1 = network->addConcatenation(inputTensor23_1, 2); // output2 auto* conv23_cv2_2_0 = convBnSiLU(network, weightMap, *conv22->getOutput(0), base_in_channel, 3, 1, "model.23.one2one_cv2.2.0"); auto* conv23_cv2_2_1 = convBnSiLU(network, weightMap, *conv23_cv2_2_0->getOutput(0), base_in_channel, 3, 1, "model.23.one2one_cv2.2.1"); nvinfer1::IConvolutionLayer* conv23_cv2_2_2 = network->addConvolutionNd( *conv23_cv2_2_1->getOutput(0), 64, nvinfer1::DimsHW{1, 1}, weightMap["model.23.one2one_cv2.2.2.weight"], weightMap["model.23.one2one_cv2.2.2.bias"]); auto* conv23_cv3_2_0_0 = convBnSiLU(network, weightMap, *conv22->getOutput(0), get_width(1024, gw, max_channels), 3, 1, 
"model.23.one2one_cv3.2.0.0", get_width(1024, gw, max_channels)); auto* conv23_cv3_2_0_1 = convBnSiLU(network, weightMap, *conv23_cv3_2_0_0->getOutput(0), base_out_channel, 1, 1, "model.23.one2one_cv3.2.0.1"); auto* conv23_cv3_2_1_0 = convBnSiLU(network, weightMap, *conv23_cv3_2_0_1->getOutput(0), base_out_channel, 3, 1, "model.23.one2one_cv3.2.1.0", base_out_channel); auto* conv23_cv3_2_1_1 = convBnSiLU(network, weightMap, *conv23_cv3_2_1_0->getOutput(0), base_out_channel, 1, 1, "model.23.one2one_cv3.2.1.1"); auto* conv23_cv3_2_2 = network->addConvolutionNd(*conv23_cv3_2_1_1->getOutput(0), kNumClass, nvinfer1::DimsHW{1, 1}, weightMap["model.23.one2one_cv3.2.2.weight"], weightMap["model.23.one2one_cv3.2.2.bias"]); nvinfer1::ITensor* inputTensor23_2[] = {conv23_cv2_2_2->getOutput(0), conv23_cv3_2_2->getOutput(0)}; nvinfer1::IConcatenationLayer* cat23_2 = network->addConcatenation(inputTensor23_2, 2); /******************************************************************************************************* ********************************************* YOLOV10 DETECT ****************************************** *******************************************************************************************************/ nvinfer1::ILayer* conv_layers[] = {conv3, conv5, conv7}; int strides[sizeof(conv_layers) / sizeof(conv_layers[0])]; calculateStrides(conv_layers, sizeof(conv_layers) / sizeof(conv_layers[0]), kInputH, strides); int stridesLength = sizeof(strides) / sizeof(int); nvinfer1::IShuffleLayer* shuffle23_0 = network->addShuffle(*cat23_0->getOutput(0)); shuffle23_0->setReshapeDimensions( nvinfer1::Dims3{kBatchSize, 64 + kNumClass, (kInputH / strides[0]) * (kInputW / strides[0])}); nvinfer1::ISliceLayer* split23_0_0 = network->addSlice( *shuffle23_0->getOutput(0), nvinfer1::Dims3{0, 0, 0}, nvinfer1::Dims3{kBatchSize, 64, (kInputH / strides[0]) * (kInputW / strides[0])}, nvinfer1::Dims3{1, 1, 1}); nvinfer1::ISliceLayer* split23_0_1 = 
network->addSlice(*shuffle23_0->getOutput(0), nvinfer1::Dims3{0, 64, 0}, nvinfer1::Dims3{kBatchSize, kNumClass, (kInputH / strides[0]) * (kInputW / strides[0])}, nvinfer1::Dims3{1, 1, 1}); nvinfer1::IShuffleLayer* dfl23_0 = DFL(network, weightMap, *split23_0_0->getOutput(0), 4, (kInputH / strides[0]) * (kInputW / strides[0]), 1, 1, 0, "model.23.dfl.conv.weight"); nvinfer1::ITensor* inputTensor23_dfl_0[] = {dfl23_0->getOutput(0), split23_0_1->getOutput(0)}; nvinfer1::IConcatenationLayer* cat23_dfl_0 = network->addConcatenation(inputTensor23_dfl_0, 2); cat23_dfl_0->setAxis(1); nvinfer1::IShuffleLayer* shuffle23_1 = network->addShuffle(*cat23_1->getOutput(0)); shuffle23_1->setReshapeDimensions( nvinfer1::Dims3{kBatchSize, 64 + kNumClass, (kInputH / strides[1]) * (kInputW / strides[1])}); nvinfer1::ISliceLayer* split23_1_0 = network->addSlice( *shuffle23_1->getOutput(0), nvinfer1::Dims3{0, 0, 0}, nvinfer1::Dims3{kBatchSize, 64, (kInputH / strides[1]) * (kInputW / strides[1])}, nvinfer1::Dims3{1, 1, 1}); nvinfer1::ISliceLayer* split23_1_1 = network->addSlice(*shuffle23_1->getOutput(0), nvinfer1::Dims3{0, 64, 0}, nvinfer1::Dims3{kBatchSize, kNumClass, (kInputH / strides[1]) * (kInputW / strides[1])}, nvinfer1::Dims3{1, 1, 1}); nvinfer1::IShuffleLayer* dfl23_1 = DFL(network, weightMap, *split23_1_0->getOutput(0), 4, (kInputH / strides[1]) * (kInputW / strides[1]), 1, 1, 0, "model.23.dfl.conv.weight"); nvinfer1::ITensor* inputTensor23_dfl_1[] = {dfl23_1->getOutput(0), split23_1_1->getOutput(0)}; nvinfer1::IConcatenationLayer* cat23_dfl_1 = network->addConcatenation(inputTensor23_dfl_1, 2); cat23_dfl_1->setAxis(1); nvinfer1::IShuffleLayer* shuffle23_2 = network->addShuffle(*cat23_2->getOutput(0)); shuffle23_2->setReshapeDimensions( nvinfer1::Dims3{kBatchSize, 64 + kNumClass, (kInputH / strides[2]) * (kInputW / strides[2])}); nvinfer1::ISliceLayer* split23_2_0 = network->addSlice( *shuffle23_2->getOutput(0), nvinfer1::Dims3{0, 0, 0}, nvinfer1::Dims3{kBatchSize, 64, (kInputH 
/ strides[2]) * (kInputW / strides[2])}, nvinfer1::Dims3{1, 1, 1}); nvinfer1::ISliceLayer* split23_2_1 = network->addSlice(*shuffle23_2->getOutput(0), nvinfer1::Dims3{0, 64, 0}, nvinfer1::Dims3{kBatchSize, kNumClass, (kInputH / strides[2]) * (kInputW / strides[2])}, nvinfer1::Dims3{1, 1, 1}); nvinfer1::IShuffleLayer* dfl23_2 = DFL(network, weightMap, *split23_2_0->getOutput(0), 4, (kInputH / strides[2]) * (kInputW / strides[2]), 1, 1, 0, "model.23.dfl.conv.weight"); nvinfer1::ITensor* inputTensor23_dfl_2[] = {dfl23_2->getOutput(0), split23_2_1->getOutput(0)}; nvinfer1::IConcatenationLayer* cat23_dfl_2 = network->addConcatenation(inputTensor23_dfl_2, 2); cat23_dfl_2->setAxis(1); nvinfer1::IPluginV2Layer* yolo = addYoLoLayer( network, std::vector{cat23_dfl_0, cat23_dfl_1, cat23_dfl_2}, strides, stridesLength); yolo->getOutput(0)->setName(kOutputTensorName); network->markOutput(*yolo->getOutput(0)); config->setMaxWorkspaceSize(16 * (1 << 20)); #if defined(USE_FP16) config->setFlag(nvinfer1::BuilderFlag::kFP16); #elif defined(USE_INT8) std::cout << "Your platform support int8: " << (builder->platformHasFastInt8() ? "true" : "false") << std::endl; assert(builder->platformHasFastInt8()); config->setFlag(nvinfer1::BuilderFlag::kINT8); auto* calibrator = new Int8EntropyCalibrator2(1, kInputW, kInputH, kInputQuantizationFolder, "int8calib.table", kInputTensorName); config->setInt8Calibrator(calibrator); #endif std::cout << "Building engine, please wait for a while..." << std::endl; nvinfer1::IHostMemory* serialized_model = builder->buildSerializedNetwork(*network, *config); std::cout << "Build engine successfully!" 
<< std::endl; delete network; for (auto& mem : weightMap) { free((void*)(mem.second.values)); } return serialized_model; } ================================================ FILE: yolov10/src/postprocess.cpp ================================================ #include "postprocess.h" cv::Rect get_rect(cv::Mat& img, float bbox[4]) { float l, r, t, b; float r_w = kInputW / (img.cols * 1.0); float r_h = kInputH / (img.rows * 1.0); if (r_h > r_w) { l = bbox[0]; r = bbox[2]; t = bbox[1] - (kInputH - r_w * img.rows) / 2; b = bbox[3] - (kInputH - r_w * img.rows) / 2; l = l / r_w; r = r / r_w; t = t / r_w; b = b / r_w; } else { l = bbox[0] - (kInputW - r_h * img.cols) / 2; r = bbox[2] - (kInputW - r_h * img.cols) / 2; t = bbox[1]; b = bbox[3]; l = l / r_h; r = r / r_h; t = t / r_h; b = b / r_h; } l = std::max(0.0f, l); t = std::max(0.0f, t); int width = std::max(0, std::min(int(round(r - l)), img.cols - int(round(l)))); int height = std::max(0, std::min(int(round(b - t)), img.rows - int(round(t)))); return cv::Rect(int(round(l)), int(round(t)), width, height); } void get_topk(std::vector& res, float* output, float conf_thresh, int tokp) { int det_size = sizeof(Detection) / sizeof(float); for (int i = 0; i < output[0]; i++) { if (output[1 + det_size * i + 4] <= conf_thresh) continue; Detection det{}; memcpy(&det, &output[1 + det_size * i], det_size * sizeof(float)); res.push_back(det); } } void batch_topk(std::vector>& res_batch, float* output, int batch_size, int output_size, float conf_thresh, int topk) { res_batch.resize(batch_size); for (int i = 0; i < batch_size; i++) { get_topk(res_batch[i], &output[i * output_size], conf_thresh, topk); } } void draw_bbox(std::vector& img_batch, std::vector>& res_batch) { for (size_t i = 0; i < img_batch.size(); i++) { auto& res = res_batch[i]; cv::Mat img = img_batch[i]; for (size_t j = 0; j < res.size(); j++) { cv::Rect r = get_rect(img, res[j].bbox); cv::rectangle(img, r, cv::Scalar(0x27, 0xC1, 0x36), 2); cv::putText(img, 
// Remaining arguments of the cv::putText(img, ...) call opened on the previous line (draw_bbox).
                        std::to_string((int)res[j].class_id), cv::Point(r.x, r.y - 1), cv::FONT_HERSHEY_PLAIN, 1.2,
                        cv::Scalar(0xFF, 0xFF, 0xFF), 2);
        }
    }
}

================================================
FILE: yolov10/src/preprocess.cu
================================================
#include "cuda_utils.h"
#include "preprocess.h"

// Staging buffers shared by every cuda_preprocess call. They are allocated by
// cuda_preprocess_init and released by cuda_preprocess_destroy.
static uint8_t* img_buffer_host = nullptr;
static uint8_t* img_buffer_device = nullptr;

// Resamples an interleaved 3-channel 8-bit image into a planar float image.
// One thread handles one destination pixel:
//   src / src_line_size / src_width / src_height - source image and its row pitch in bytes.
//   dst / dst_width / dst_height                 - destination, three consecutive planes.
//   const_value_st - fill value used where the sample falls outside the source.
//   d2s            - 2x3 affine matrix mapping destination pixels to source coordinates.
//   edge           - number of destination pixels; threads with a larger index exit.
__global__ void warpaffine_kernel(uint8_t* src, int src_line_size, int src_width, int src_height, float* dst,
                                  int dst_width, int dst_height, uint8_t const_value_st, AffineMatrix d2s,
                                  int edge) {
    int position = blockDim.x * blockIdx.x + threadIdx.x;
    if (position >= edge)
        return;

    float m_x1 = d2s.value[0];
    float m_y1 = d2s.value[1];
    float m_z1 = d2s.value[2];
    float m_x2 = d2s.value[3];
    float m_y2 = d2s.value[4];
    float m_z2 = d2s.value[5];

    // Destination pixel handled by this thread and its source-space sample position.
    int dx = position % dst_width;
    int dy = position / dst_width;
    float src_x = m_x1 * dx + m_y1 * dy + m_z1 + 0.5f;
    float src_y = m_x2 * dx + m_y2 * dy + m_z2 + 0.5f;
    float c0, c1, c2;

    if (src_x <= -1 || src_x >= src_width || src_y <= -1 || src_y >= src_height) {
        // out of range
        c0 = const_value_st;
        c1 = const_value_st;
        c2 = const_value_st;
    } else {
        // Bilinear interpolation over the four neighbours; neighbours that fall outside
        // the source contribute the fill value instead.
        int y_low = floorf(src_y);
        int x_low = floorf(src_x);
        int y_high = y_low + 1;
        int x_high = x_low + 1;

        uint8_t const_value[] = {const_value_st, const_value_st, const_value_st};
        float ly = src_y - y_low;
        float lx = src_x - x_low;
        float hy = 1 - ly;
        float hx = 1 - lx;
        float w1 = hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx;
        uint8_t* v1 = const_value;
        uint8_t* v2 = const_value;
        uint8_t* v3 = const_value;
        uint8_t* v4 = const_value;

        if (y_low >= 0) {
            if (x_low >= 0)
                v1 = src + y_low * src_line_size + x_low * 3;

            if (x_high < src_width)
                v2 = src + y_low * src_line_size + x_high * 3;
        }

        if (y_high < src_height) {
            if (x_low >= 0)
                v3 = src + y_high * src_line_size + x_low * 3;

            if (x_high < src_width)
                v4 = src + y_high * src_line_size + x_high * 3;
        }

        c0 = w1 * v1[0] + w2 * v2[0] + w3 * v3[0] + w4 * v4[0];
        c1 = w1 * v1[1] + w2 * v2[1] + w3 * v3[1] + w4 * v4[1];
        c2 = w1 * v1[2] + w2 * v2[2] + w3 * v3[2] + w4 * v4[2];
    }

    // bgr to rgb
    float t = c2;
    c2 = c0;
    c0 = t;

    // normalization
    c0 = c0 / 255.0f;
    c1 = c1 / 255.0f;
    c2 = c2 / 255.0f;

    // rgbrgbrgb to rrrgggbbb
    int area = dst_width * dst_height;
    float* pdst_c0 = dst + dy * dst_width + dx;
    float* pdst_c1 = pdst_c0 + area;
    float* pdst_c2 = pdst_c1 + area;
    *pdst_c0 = c0;
    *pdst_c1 = c1;
    *pdst_c2 = c2;
}

// Uploads one image and letterboxes it into `dst` on the GPU.
//   src, src_width, src_height - host image, 3 bytes per pixel, rows assumed tightly
//                                packed (the kernel is given a pitch of src_width * 3).
//   dst, dst_width, dst_height - device buffer receiving 3 float planes.
//   stream                     - stream used for the upload and the kernel.
// Preconditions: cuda_preprocess_init has been called, and src_width * src_height does not
// exceed the max_image_size it was given (the staging buffers hold max_image_size * 3 bytes).
// The work is only enqueued; the shared staging buffer must not be overwritten until the
// stream has finished, which is why callers synchronize between images.
void cuda_preprocess(uint8_t* src, int src_width, int src_height, float* dst, int dst_width, int dst_height,
                     cudaStream_t stream) {
    int img_size = src_width * src_height * 3;
    // copy data to pinned memory
    memcpy(img_buffer_host, src, img_size);
    // copy data to device memory
    CUDA_CHECK(cudaMemcpyAsync(img_buffer_device, img_buffer_host, img_size, cudaMemcpyHostToDevice, stream));

    // Source-to-destination transform: uniform scale that fits the image inside the
    // destination, with the scaled image centred.
    AffineMatrix s2d, d2s;
    float scale = std::min(dst_height / (float)src_height, dst_width / (float)src_width);

    s2d.value[0] = scale;
    s2d.value[1] = 0;
    s2d.value[2] = -scale * src_width * 0.5 + dst_width * 0.5;
    s2d.value[3] = 0;
    s2d.value[4] = scale;
    s2d.value[5] = -scale * src_height * 0.5 + dst_height * 0.5;

    // The kernel walks destination pixels, so it needs the inverse mapping.
    cv::Mat m2x3_s2d(2, 3, CV_32F, s2d.value);
    cv::Mat m2x3_d2s(2, 3, CV_32F, d2s.value);
    cv::invertAffineTransform(m2x3_s2d, m2x3_d2s);

    memcpy(d2s.value, m2x3_d2s.ptr(0), sizeof(d2s.value));

    int jobs = dst_height * dst_width;
    int threads = 256;
    // Integer ceil-division: enough blocks to give every destination pixel a thread.
    int blocks = (jobs + threads - 1) / threads;

    warpaffine_kernel<<<blocks, threads, 0, stream>>>(img_buffer_device, src_width * 3, src_width, src_height, dst,
                                                      dst_width, dst_height, 128, d2s, jobs);
    // A bad launch configuration is only reported through the runtime's error state.
    CUDA_CHECK(cudaGetLastError());
}

// Preprocesses every image of `img_batch` into consecutive 3 * dst_width * dst_height
// float slots of `dst`.
void cuda_batch_preprocess(std::vector<cv::Mat>& img_batch, float* dst, int dst_width, int dst_height,
                           cudaStream_t stream) {
    int dst_size = dst_width * dst_height * 3;
    for (size_t i = 0; i < img_batch.size(); i++) {
        cuda_preprocess(img_batch[i].ptr(), img_batch[i].cols, img_batch[i].rows, &dst[dst_size * i], dst_width,
                        dst_height, stream);
        // The staging buffers are shared, so wait before the next image overwrites them.
        CUDA_CHECK(cudaStreamSynchronize(stream));
    }
}

// Return type of cuda_preprocess_init; its declarator continues on the next line.
void
cuda_preprocess_init(int max_image_size) { // prepare input data in pinned memory CUDA_CHECK(cudaMallocHost((void**)&img_buffer_host, max_image_size * 3)); // prepare input data in device memory CUDA_CHECK(cudaMalloc((void**)&img_buffer_device, max_image_size * 3)); } void cuda_preprocess_destroy() { CUDA_CHECK(cudaFree(img_buffer_device)); CUDA_CHECK(cudaFreeHost(img_buffer_host)); } ================================================ FILE: yolov10/yolov10_det.cpp ================================================ #include #include #include #include "cuda_utils.h" #include "logging.h" #include "model.h" #include "postprocess.h" #include "preprocess.h" #include "utils.h" Logger gLogger; using namespace nvinfer1; const int kOutputSize = kMaxNumOutputBbox * sizeof(Detection) / sizeof(float) + 1; void serialize_engine(std::string& wts_name, std::string& engine_name, std::string& type, float& gd, float& gw, int& max_channels) { IBuilder* builder = createInferBuilder(gLogger); IBuilderConfig* config = builder->createBuilderConfig(); IHostMemory* serialized_engine = nullptr; if (type == "n") { serialized_engine = buildEngineYolov10DetN(builder, config, DataType::kFLOAT, wts_name, gd, gw, max_channels); } else if (type == "s") { serialized_engine = buildEngineYolov10DetS(builder, config, DataType::kFLOAT, wts_name, gd, gw, max_channels); } else if (type == "m") { serialized_engine = buildEngineYolov10DetM(builder, config, DataType::kFLOAT, wts_name, gd, gw, max_channels); } else if (type == "b" || type == "l") { serialized_engine = buildEngineYolov10DetBL(builder, config, DataType::kFLOAT, wts_name, gd, gw, max_channels); } else if (type == "x") { serialized_engine = buildEngineYolov10DetX(builder, config, DataType::kFLOAT, wts_name, gd, gw, max_channels); } else { std::cerr << "Unsupported type!" 
<< std::endl; exit(0); } assert(serialized_engine); std::ofstream p(engine_name, std::ios::binary); if (!p) { std::cout << "could not open plan output file" << std::endl; assert(false); } p.write(reinterpret_cast(serialized_engine->data()), serialized_engine->size()); delete serialized_engine; delete config; delete builder; } void deserialize_engine(std::string& engine_name, IRuntime** runtime, ICudaEngine** engine, IExecutionContext** context) { std::ifstream file(engine_name, std::ios::binary); if (!file.good()) { std::cerr << "read " << engine_name << " error!" << std::endl; assert(false); } size_t size = 0; file.seekg(0, file.end); size = file.tellg(); file.seekg(0, file.beg); char* serialized_engine = new char[size]; assert(serialized_engine); file.read(serialized_engine, size); file.close(); *runtime = createInferRuntime(gLogger); assert(*runtime); *engine = (*runtime)->deserializeCudaEngine(serialized_engine, size); assert(*engine); *context = (*engine)->createExecutionContext(); assert(*context); delete[] serialized_engine; } void prepare_buffer(ICudaEngine* engine, float** input_buffer_device, float** output_buffer_device, float** output_buffer_host) { assert(engine->getNbBindings() == 2); // In order to bind the buffers, we need to know the names of the input and output tensors. 
// Note that indices are guaranteed to be less than IEngine::getNbBindings() const int inputIndex = engine->getBindingIndex(kInputTensorName); const int outputIndex = engine->getBindingIndex(kOutputTensorName); assert(inputIndex == 0); assert(outputIndex == 1); // Create GPU buffers on device CUDA_CHECK(cudaMalloc((void**)input_buffer_device, kBatchSize * 3 * kInputH * kInputW * sizeof(float))); CUDA_CHECK(cudaMalloc((void**)output_buffer_device, kBatchSize * kOutputSize * sizeof(float))); *output_buffer_host = new float[kBatchSize * kOutputSize]; } void infer(IExecutionContext& context, cudaStream_t& stream, void** buffers, float* output, int batchsize) { // infer on the batch asynchronously, and DMA output back to host auto start = std::chrono::system_clock::now(); context.enqueueV2(buffers, stream, nullptr); CUDA_CHECK(cudaMemcpyAsync(output, buffers[1], batchsize * kOutputSize * sizeof(float), cudaMemcpyDeviceToHost, stream)); auto end = std::chrono::system_clock::now(); std::cout << "inference time: " << std::chrono::duration_cast(end - start).count() << "ms" << std::endl; CUDA_CHECK(cudaStreamSynchronize(stream)); } bool parse_args(int argc, char** argv, std::string& wts, std::string& engine, std::string& img_dir, std::string& type, float& gd, float& gw, int& max_channels) { if (argc < 4) return false; if (std::string(argv[1]) == "-s" && (argc == 5 || argc == 7)) { wts = std::string(argv[2]); engine = std::string(argv[3]); auto sub_type = std::string(argv[4]); if (sub_type[0] == 'n') { gd = 0.33; gw = 0.25; max_channels = 1024; type = "n"; } else if (sub_type[0] == 's') { gd = 0.33; gw = 0.50; max_channels = 1024; type = "s"; } else if (sub_type[0] == 'm') { gd = 0.67; gw = 0.75; max_channels = 768; type = "m"; } else if (sub_type[0] == 'b') { gd = 0.67; gw = 1.0; max_channels = 512; type = "b"; } else if (sub_type[0] == 'l') { gd = 1.0; gw = 1.0; max_channels = 512; type = "l"; } else if (sub_type[0] == 'x') { gd = 1.0; gw = 1.25; max_channels = 512; type = 
"x"; } else { return false; } } else if (std::string(argv[1]) == "-d" && argc == 4) { engine = std::string(argv[2]); img_dir = std::string(argv[3]); } else { return false; } return true; } int main(int argc, char** argv) { // -s ../models/yolov10n.wts ../models/yolov10n.fp32.trt n // -d ../models/yolov10n.fp32.trt ../images cudaSetDevice(kGpuId); std::string wts_name = ""; std::string engine_name = ""; std::string img_dir; std::string type = ""; float gd = 0.0f, gw = 0.0f; int max_channels = 0; if (!parse_args(argc, argv, wts_name, engine_name, img_dir, type, gd, gw, max_channels)) { std::cerr << "Arguments not right!" << std::endl; std::cerr << "./yolov10_det -s [.wts] [.engine] [n/s/m/b/l/x] // serialize model to " "plan file" << std::endl; std::cerr << "./yolov10_det -d [.engine] ../samples // deserialize plan file and run inference" << std::endl; return -1; } // Create a model using the API directly and serialize it to a file if (!wts_name.empty()) { serialize_engine(wts_name, engine_name, type, gd, gw, max_channels); return 0; } // Deserialize the engine from file IRuntime* runtime = nullptr; ICudaEngine* engine = nullptr; IExecutionContext* context = nullptr; deserialize_engine(engine_name, &runtime, &engine, &context); cudaStream_t stream; CUDA_CHECK(cudaStreamCreate(&stream)); cuda_preprocess_init(kMaxInputImageSize); // Prepare cpu and gpu buffers float* device_buffers[2]; float* output_buffer_host = nullptr; // Read images from directory std::vector file_names; if (read_files_in_dir(img_dir.c_str(), file_names) < 0) { std::cerr << "read_files_in_dir failed." 
<< std::endl; return -1; } prepare_buffer(engine, &device_buffers[0], &device_buffers[1], &output_buffer_host); // batch predict for (size_t i = 0; i < file_names.size(); i += kBatchSize) { // Get a batch of images std::vector img_batch; std::vector img_name_batch; for (size_t j = i; j < i + kBatchSize && j < file_names.size(); j++) { cv::Mat img = cv::imread(img_dir + "/" + file_names[j]); if (img.empty()) { std::cerr << "Fatal error: image cannot open!" << std::endl; return -1; } img_batch.push_back(img); img_name_batch.push_back(file_names[j]); } // Preprocess cuda_batch_preprocess(img_batch, device_buffers[0], kInputW, kInputH, stream); // Run inference infer(*context, stream, (void**)device_buffers, output_buffer_host, kBatchSize); // output_buffer_host保存前100个值到文件 // std::ofstream out_file("../output.txt"); // for (int i = 0; i < 100; i++) { // out_file << output_buffer_host[i] << std::endl; // } // out_file.close(); std::vector> res_batch; batch_topk(res_batch, output_buffer_host, img_batch.size(), kOutputSize, kConfThresh); // print results for (size_t j = 0; j < res_batch.size(); j++) { for (size_t k = 0; k < res_batch[j].size(); k++) { std::cout << "image: " << img_name_batch[j] << ", bbox: " << res_batch[j][k].bbox[0] << ", " << res_batch[j][k].bbox[1] << ", " << res_batch[j][k].bbox[2] << ", " << res_batch[j][k].bbox[3] << ", conf: " << res_batch[j][k].conf << ", class_id: " << res_batch[j][k].class_id << std::endl; } } // Draw bounding boxes draw_bbox(img_batch, res_batch); // Save images for (size_t j = 0; j < img_batch.size(); j++) { cv::imwrite("_" + img_name_batch[j], img_batch[j]); } } // Release stream and buffers cudaStreamDestroy(stream); CUDA_CHECK(cudaFree(device_buffers[0])); CUDA_CHECK(cudaFree(device_buffers[1])); cuda_preprocess_destroy(); // Destroy the engine delete context; delete engine; delete runtime; return 0; } ================================================ FILE: yolov10/yolov10_det_trt.py 
class Yolov10TRT(object):
    """
    description: A Yolov10 class that wraps TensorRT ops, preprocess and postprocess ops.
    """

    def __init__(self, engine_file_path):
        """
        description: Deserialize a TensorRT engine and allocate one page-locked host
                     buffer and one device buffer for every binding of the engine.
        param:
            engine_file_path: str, path of the serialized engine file
        raises:
            RuntimeError: the engine file could not be deserialized
            ValueError:   a binding reports a batch dimension other than 1
        """
        # Create a Context on this device,
        self.ctx = cuda.Device(0).make_context()
        try:
            stream = cuda.Stream()
            TRT_LOGGER = trt.Logger(trt.Logger.INFO)
            runtime = trt.Runtime(TRT_LOGGER)

            # Deserialize the engine from file
            with open(engine_file_path, "rb") as f:
                engine = runtime.deserialize_cuda_engine(f.read())
            if engine is None:
                raise RuntimeError(
                    "Failed to deserialize TensorRT engine: {}".format(engine_file_path)
                )
            context = engine.create_execution_context()

            host_inputs = []
            cuda_inputs = []
            host_outputs = []
            cuda_outputs = []
            bindings = []

            for binding in engine:
                shape = engine.get_binding_shape(binding)
                print('bingding:', binding, shape)
                self.batch_size = shape[0]
                if self.batch_size != 1:
                    raise ValueError("Only support batch_size=1")
                size = trt.volume(shape)
                dtype = trt.nptype(engine.get_binding_dtype(binding))
                # Allocate host and device buffers
                host_mem = cuda.pagelocked_empty(size, dtype)
                cuda_mem = cuda.mem_alloc(host_mem.nbytes)
                # Append the device buffer to device bindings.
                bindings.append(int(cuda_mem))
                # Append to the appropriate list.
                if engine.binding_is_input(binding):
                    self.input_w = shape[-1]
                    self.input_h = shape[-2]
                    host_inputs.append(host_mem)
                    cuda_inputs.append(cuda_mem)
                else:
                    host_outputs.append(host_mem)
                    cuda_outputs.append(cuda_mem)

            # Store
            self.stream = stream
            self.context = context
            self.engine = engine
            self.host_inputs = host_inputs
            self.cuda_inputs = cuda_inputs
            self.host_outputs = host_outputs
            self.cuda_outputs = cuda_outputs
            self.bindings = bindings
            print('batch_size:', self.batch_size)
            self.det_output_length = host_outputs[0].shape[0]
        except BaseException:
            # The caller never receives the instance when construction fails, so
            # destroy() cannot be called: pop the context pushed by make_context()
            # here to keep the context stack balanced.
            self.ctx.pop()
            raise

    def infer(self, raw_image_generator):
        """
        description: Preprocess the images, run the engine once, and draw the
                     detections on the original images.
        param:
            raw_image_generator: iterable of BGR images (numpy arrays)
        return:
            batch_image_raw: list of the original images with boxes drawn on them
            elapsed:         seconds spent on host->device copy, inference and
                             device->host copy
        """
        # Restore
        stream = self.stream
        context = self.context
        host_inputs = self.host_inputs
        cuda_inputs = self.cuda_inputs
        host_outputs = self.host_outputs
        cuda_outputs = self.cuda_outputs
        bindings = self.bindings

        batch_image_raw = []
        batch_origin_h = []
        batch_origin_w = []

        # Make self the active context, pushing it on top of the context stack.
        self.ctx.push()
        try:
            # Do image preprocess
            batch_input_image = np.empty(
                shape=[self.batch_size, 3, self.input_h, self.input_w], dtype=np.float32
            )
            for i, image_raw in enumerate(raw_image_generator):
                input_image, image_raw, origin_h, origin_w = self.preprocess_image(image_raw)
                batch_image_raw.append(image_raw)
                batch_origin_h.append(origin_h)
                batch_origin_w.append(origin_w)
                np.copyto(batch_input_image[i], input_image)
            batch_input_image = np.ascontiguousarray(batch_input_image)

            # Copy input image to host buffer
            np.copyto(host_inputs[0], batch_input_image.ravel())
            start = time.time()
            # Transfer input data to the GPU.
            cuda.memcpy_htod_async(cuda_inputs[0], host_inputs[0], stream)
            # Run inference.
            context.execute_async_v2(bindings=bindings, stream_handle=stream.handle)
            # Transfer predictions back from the GPU.
            cuda.memcpy_dtoh_async(host_outputs[0], cuda_outputs[0], stream)
            # Synchronize the stream
            stream.synchronize()
            end = time.time()
        finally:
            # Remove the context from the top of the context stack even when
            # preprocessing or inference raised, so destroy() stays balanced.
            self.ctx.pop()

        # Here we use the first row of output in that batch_size = 1
        output = host_outputs[0]
        # Class names live in a module-level `categories` list when the file is run
        # as a script; fall back to the numeric class id when it is not defined.
        class_names = globals().get("categories")
        # Do postprocess (only for the images that were actually supplied)
        for i in range(len(batch_image_raw)):
            result_boxes, result_scores, result_classid = self.post_process(
                output[i * self.det_output_length: (i + 1) * self.det_output_length],
                batch_origin_h[i], batch_origin_w[i]
            )
            # Draw rectangles and labels on the original image
            for j in range(len(result_boxes)):
                class_id = int(result_classid[j])
                if class_names is not None and 0 <= class_id < len(class_names):
                    class_name = class_names[class_id]
                else:
                    class_name = str(class_id)
                plot_one_box(
                    result_boxes[j],
                    batch_image_raw[i],
                    label="{}:{:.2f}".format(class_name, result_scores[j]),
                )
        return batch_image_raw, end - start

    def destroy(self):
        """
        description: Pop the CUDA context that __init__ left on the context stack.
        """
        # Remove any context from the top of the context stack, deactivating it.
        self.ctx.pop()

    def get_raw_image(self, image_path_batch):
        """
        description: Read an image from image path
        param:
            image_path_batch: iterable of image paths
        return:
            generator yielding the result of cv2.imread for every path
        """
        for img_path in image_path_batch:
            yield cv2.imread(img_path)

    def get_raw_image_zeros(self, image_path_batch=None):
        """
        description: Ready data for warmup; yields batch_size black images of the
                     network input size. image_path_batch is ignored.
        """
        for _ in range(self.batch_size):
            yield np.zeros([self.input_h, self.input_w, 3], dtype=np.uint8)

    def preprocess_image(self, raw_bgr_image):
        """
        description: Convert BGR image to RGB,
                     resize and pad it to target size, normalize to [0,1],
                     transform to NCHW format.
        param:
            raw_bgr_image: numpy array, BGR image of shape (h, w, 3)
        return:
            image:  the processed image
            image_raw: the original image
            h: original height
            w: original width
        raises:
            ValueError: raw_bgr_image is None (e.g. cv2.imread could not read the file)
        """
        if raw_bgr_image is None:
            raise ValueError("preprocess_image received None: the image could not be read")
        image_raw = raw_bgr_image
        h, w, _ = image_raw.shape
        image = cv2.cvtColor(image_raw, cv2.COLOR_BGR2RGB)
        # Calculate width and height and paddings
        r_w = self.input_w / w
        r_h = self.input_h / h
        if r_h > r_w:
            # Width is the limiting side: pad top and bottom
            tw = self.input_w
            th = int(r_w * h)
            tx1 = tx2 = 0
            ty1 = int((self.input_h - th) / 2)
            ty2 = self.input_h - th - ty1
        else:
            # Height is the limiting side: pad left and right
            tw = int(r_h * w)
            th = self.input_h
            tx1 = int((self.input_w - tw) / 2)
            tx2 = self.input_w - tw - tx1
            ty1 = ty2 = 0
        # Resize the image with long side while maintaining ratio
        image = cv2.resize(image, (tw, th))
        # Pad the short side with (128,128,128)
        image = cv2.copyMakeBorder(
            image, ty1, ty2, tx1, tx2, cv2.BORDER_CONSTANT, None, (128, 128, 128)
        )
        image = image.astype(np.float32)
        # Normalize to [0,1]
        image /= 255.0
        # HWC to CHW format:
        image = np.transpose(image, [2, 0, 1])
        # CHW to NCHW format
        image = np.expand_dims(image, axis=0)
        # Convert the image to row-major order, also known as "C order":
        image = np.ascontiguousarray(image)
        return image, image_raw, h, w

    def xywh2xyxy(self, origin_h, origin_w, x):
        """
        description:    Map nx4 corner boxes [x1, y1, x2, y2] from the letterboxed
                        network input back to the original image: the padding offset
                        is subtracted and the result is divided by the resize ratio.
                        (The method name is historical; no centre/width conversion
                        is performed.)
        param:
            origin_h:   height of original image
            origin_w:   width of original image
            x:          A boxes numpy, each row is a box [x1, y1, x2, y2] in input coordinates
        return:
            y:          A boxes numpy, each row is a box [x1, y1, x2, y2] in original-image coordinates
        """
        y = np.zeros_like(x)
        r_w = self.input_w / origin_w
        r_h = self.input_h / origin_h
        if r_h > r_w:
            # Image was padded at top/bottom: shift y coordinates only
            pad_y = (self.input_h - r_w * origin_h) / 2
            y[:, 0] = x[:, 0]
            y[:, 2] = x[:, 2]
            y[:, 1] = x[:, 1] - pad_y
            y[:, 3] = x[:, 3] - pad_y
            y /= r_w
        else:
            # Image was padded at left/right: shift x coordinates only
            pad_x = (self.input_w - r_h * origin_w) / 2
            y[:, 0] = x[:, 0] - pad_x
            y[:, 2] = x[:, 2] - pad_x
            y[:, 1] = x[:, 1]
            y[:, 3] = x[:, 3]
            y /= r_h

        return y

    def post_process(self, output, origin_h, origin_w):
        """
        description: postprocess the prediction
        param:
            output:     A flat numpy array: [num_boxes, then DET_NUM values per box].
                        Each box is handled as (x1, y1, x2, y2, conf, cls_id).
            origin_h:   height of original image
            origin_w:   width of original image
        return:
            result_boxes: final boxes, a boxes numpy, each row is a box [x1, y1, x2, y2]
            result_scores: final scores, a numpy, each element is the score corresponding to box
            result_classid: final classid, a numpy, each element is the classid corresponding to box
        """
        num_values_per_detection = DET_NUM
        # Get the num of boxes detected
        num = int(output[0])
        # Reshape to a two dimensional ndarray and keep only the valid rows
        pred = np.reshape(output[1:], (-1, num_values_per_detection))[:num, :]
        # Do nms
        boxes = self.non_max_suppression(pred, origin_h, origin_w, conf_thres=CONF_THRESH, nms_thres=IOU_THRESHOLD)
        result_boxes = boxes[:, :4] if len(boxes) else np.array([])
        result_scores = boxes[:, 4] if len(boxes) else np.array([])
        result_classid = boxes[:, 5] if len(boxes) else np.array([])
        return result_boxes, result_scores, result_classid

    def bbox_iou(self, box1, box2, x1y1x2y2=True):
        """
        description: compute the IoU of two bounding boxes
        param:
            box1: A box coordinate (can be (x1, y1, x2, y2) or (x, y, w, h))
            box2: A box coordinate (can be (x1, y1, x2, y2) or (x, y, w, h))
            x1y1x2y2: select the coordinate format
        return:
            iou: computed iou
        """
        if not x1y1x2y2:
            # Transform from center and width to exact coordinates
            b1_x1, b1_x2 = box1[:, 0] - box1[:, 2] / 2, box1[:, 0] + box1[:, 2] / 2
            b1_y1, b1_y2 = box1[:, 1] - box1[:, 3] / 2, box1[:, 1] + box1[:, 3] / 2
            b2_x1, b2_x2 = box2[:, 0] - box2[:, 2] / 2, box2[:, 0] + box2[:, 2] / 2
            b2_y1, b2_y2 = box2[:, 1] - box2[:, 3] / 2, box2[:, 1] + box2[:, 3] / 2
        else:
            # Get the coordinates of bounding boxes
            b1_x1, b1_y1, b1_x2, b1_y2 = box1[:, 0], box1[:, 1], box1[:, 2], box1[:, 3]
            b2_x1, b2_y1, b2_x2, b2_y2 = box2[:, 0], box2[:, 1], box2[:, 2], box2[:, 3]

        # Get the coordinates of the intersection rectangle
        inter_rect_x1 = np.maximum(b1_x1, b2_x1)
        inter_rect_y1 = np.maximum(b1_y1, b2_y1)
        inter_rect_x2 = np.minimum(b1_x2, b2_x2)
        inter_rect_y2 = np.minimum(b1_y2, b2_y2)
        # Intersection area (pixel-inclusive, hence the +1)
        inter_area = (np.clip(inter_rect_x2 - inter_rect_x1 + 1, 0, None)
                      * np.clip(inter_rect_y2 - inter_rect_y1 + 1, 0, None))
        # Union Area
        b1_area = (b1_x2 - b1_x1 + 1) * (b1_y2 - b1_y1 + 1)
        b2_area = (b2_x2 - b2_x1 + 1) * (b2_y2 - b2_y1 + 1)

        # The small epsilon avoids a division by zero
        iou = inter_area / (b1_area + b2_area - inter_area + 1e-16)

        return iou

    def non_max_suppression(self, prediction, origin_h, origin_w, conf_thres=0.5, nms_thres=0.4):
        """
        description: Removes detections with lower object confidence score than 'conf_thres' and performs
        Non-Maximum Suppression to further filter detections.
        param:
            prediction: detections, (x1, y1, x2, y2, conf, cls_id)
            origin_h: original image height
            origin_w: original image width
            conf_thres: a confidence threshold to filter detections
            nms_thres: a iou threshold to filter detections
        return:
            boxes: output after nms with the shape (x1, y1, x2, y2, conf, cls_id)
        """
        # Get the boxes that score >= conf_thres (boolean indexing returns a copy,
        # so the caller's array is not modified below)
        boxes = prediction[prediction[:, 4] >= conf_thres]
        # Map the corner coordinates from the letterboxed input back to the original image
        boxes[:, :4] = self.xywh2xyxy(origin_h, origin_w, boxes[:, :4])
        # clip the coordinates
        boxes[:, 0] = np.clip(boxes[:, 0], 0, origin_w - 1)
        boxes[:, 2] = np.clip(boxes[:, 2], 0, origin_w - 1)
        boxes[:, 1] = np.clip(boxes[:, 1], 0, origin_h - 1)
        boxes[:, 3] = np.clip(boxes[:, 3], 0, origin_h - 1)
        # Object confidence
        confs = boxes[:, 4]
        # Sort by the confs
        boxes = boxes[np.argsort(-confs)]
        # Perform non-maximum suppression
        keep_boxes = []
        while boxes.shape[0]:
            large_overlap = self.bbox_iou(np.expand_dims(boxes[0, :4], 0), boxes[:, :4]) > nms_thres
            label_match = boxes[0, -1] == boxes[:, -1]
            # Indices of boxes with lower confidence scores, large IOUs and matching labels
            invalid = large_overlap & label_match
            keep_boxes += [boxes[0]]
            boxes = boxes[~invalid]
        boxes = np.stack(keep_boxes, 0) if len(keep_boxes) else np.array([])
        return boxes
if __name__ == "__main__":
    # Command line: [engine_file_path [plugin_library]]; both fall back to defaults.
    cli_args = sys.argv[1:]
    engine_file_path = cli_args[0] if len(cli_args) > 0 else "yolov8s.engine"
    PLUGIN_LIBRARY = cli_args[1] if len(cli_args) > 1 else "build/libmyplugins.so"

    # The custom TensorRT plugins must be loaded before the engine is deserialized.
    ctypes.CDLL(PLUGIN_LIBRARY)

    # COCO class names, indexed by class id (read by Yolov10TRT.infer for labels)
    categories = [
        "person", "bicycle", "car", "motorcycle", "airplane", "bus", "train", "truck", "boat",
        "traffic light", "fire hydrant", "stop sign", "parking meter", "bench", "bird", "cat",
        "dog", "horse", "sheep", "cow", "elephant", "bear", "zebra", "giraffe", "backpack",
        "umbrella", "handbag", "tie", "suitcase", "frisbee", "skis", "snowboard", "sports ball",
        "kite", "baseball bat", "baseball glove", "skateboard", "surfboard", "tennis racket",
        "bottle", "wine glass", "cup", "fork", "knife", "spoon", "bowl", "banana", "apple",
        "sandwich", "orange", "broccoli", "carrot", "hot dog", "pizza", "donut", "cake", "chair",
        "couch", "potted plant", "bed", "dining table", "toilet", "tv", "laptop", "mouse",
        "remote", "keyboard", "cell phone", "microwave", "oven", "toaster", "sink",
        "refrigerator", "book", "clock", "vase", "scissors", "teddy bear", "hair drier",
        "toothbrush",
    ]

    # Start from an empty output directory
    if os.path.exists('output/'):
        shutil.rmtree('output/')
    os.makedirs('output/')

    # a Yolov10TRT instance
    detector = Yolov10TRT(engine_file_path)
    try:
        print('batch size is', detector.batch_size)

        image_dir = "images/"
        image_path_batches = get_img_path_batches(detector.batch_size, image_dir)

        # Ten warm-up runs, each on its own thread and joined before the next starts
        for _ in range(10):
            warm_up_worker = warmUpThread(detector)
            warm_up_worker.start()
            warm_up_worker.join()

        # One inference thread per batch, run strictly one after another
        for image_paths in image_path_batches:
            infer_worker = inferThread(detector, image_paths)
            infer_worker.start()
            infer_worker.join()
    finally:
        # destroy the instance
        detector.destroy()
pt_file, wts_file, m_type = parse_args()

print(f'Generating .wts for {m_type} model')

# Load model
print(f'Loading {pt_file}')

# The conversion runs entirely on the CPU.
device = 'cpu'

# SECURITY NOTE: weights_only=False unpickles arbitrary Python objects from the
# checkpoint. Only convert .pt files that come from a trusted source.
model = torch.load(pt_file, map_location=device, weights_only=False)['model'].float()  # load to FP32

if m_type in ['detect', 'seg', 'pose', 'obb']:
    # Remove the `anchors` attribute from the last layer before exporting.
    delattr(model.model[-1], 'anchors')

model.to(device).eval()

# .wts layout written below:
#   first line : number of tensors
#   other lines: "<name> <element count> " followed by " <hex>" per element, where
#                <hex> is the big-endian IEEE-754 float32 encoding of the value.
state_dict = model.state_dict()  # build the state dict once and reuse it
with open(wts_file, 'w') as f:
    f.write('{}\n'.format(len(state_dict)))
    for k, v in state_dict.items():
        vr = v.reshape(-1).cpu().numpy()
        f.write('{} {} '.format(k, len(vr)))
        # One write per tensor instead of two writes per value; same bytes on disk.
        f.write(''.join(' ' + struct.pack('>f', float(vv)).hex() for vv in vr))
        f.write('\n')
C2F(nvinfer1::INetworkDefinition* network, std::map weightMap, nvinfer1::ITensor& input, int c1, int c2, int n, bool shortcut, float e, std::string lname); nvinfer1::IElementWiseLayer* C2(nvinfer1::INetworkDefinition* network, std::map& weightMap, nvinfer1::ITensor& input, int c1, int c2, int n, bool shortcut, float e, std::string lname); nvinfer1::IElementWiseLayer* SPPF(nvinfer1::INetworkDefinition* network, std::map weightMap, nvinfer1::ITensor& input, int c1, int c2, int k, std::string lname); nvinfer1::IShuffleLayer* DFL(nvinfer1::INetworkDefinition* network, std::map weightMap, nvinfer1::ITensor& input, int ch, int grid, int k, int s, int p, std::string lname); nvinfer1::IPluginV2Layer* addYoLoLayer(nvinfer1::INetworkDefinition* network, std::vector dets, const int* px_arry, int px_arry_num, bool is_segmentation, bool is_pose, bool is_obb); nvinfer1::IElementWiseLayer* C3K2(nvinfer1::INetworkDefinition* network, std::map& weightMap, nvinfer1::ITensor& input, int c1, int c2, int n, bool c3k, bool shortcut, float e, std::string lname); nvinfer1::ILayer* C2PSA(nvinfer1::INetworkDefinition* network, std::map& weightMap, nvinfer1::ITensor& input, int c1, int c2, int n, float e, std::string lname); nvinfer1::ILayer* DWConv(nvinfer1::INetworkDefinition* network, std::map weightMap, nvinfer1::ITensor& input, int ch, std::vector k, int s, std::string lname); nvinfer1::ILayer* A2C2f(nvinfer1::INetworkDefinition* network, std::map weightMap, nvinfer1::ITensor& input, int c1, int c2, int n, bool a2, int area, bool residual, float mlp_ratio, float e, int g, bool shortcut, std::string lname); nvinfer1::ILayer* ABlock(nvinfer1::INetworkDefinition* network, std::map weightMap, nvinfer1::ITensor& input, int dim, int num_heads, float mlp_ratio, int area, std::string lname); nvinfer1::ILayer* AAttn(nvinfer1::INetworkDefinition* network, std::map weightMap, nvinfer1::ITensor& input, int dim, int num_heads, float mlp_ratio, int area, std::string lname); 
================================================ FILE: yolov12/include/config.h ================================================ #define USE_FP16 // #define USE_FP32 // #define USE_INT8 const static char* kInputTensorName = "images"; const static char* kOutputTensorName = "output"; const static char* kProtoTensorName = "proto"; const static int kNumClass = 80; const static int kPoseNumClass = 1; const static int kNumberOfPoints = 17; // number of keypoints total // obb model's number of classes constexpr static int kObbNumClass = 15; const static int kObbNe = 1; // number of extra parameters const static int kBatchSize = 1; const static int kGpuId = 0; const static int kInputH = 640; const static int kInputW = 640; const static int kObbInputH = 1024; const static int kObbInputW = 1024; const static float kNmsThresh = 0.45f; const static float kConfThresh = 0.5f; const static float kConfThreshKeypoints = 0.5f; // keypoints confidence const static int kMaxInputImageSize = 3000 * 3000; const static int kMaxNumOutputBbox = 1000; //Quantization input image folder path const static char* kInputQuantizationFolder = "./coco_calib"; // Classfication model's number of classes constexpr static int kClsNumClass = 1000; // Classfication model's input shape constexpr static int kClsInputH = 224; constexpr static int kClsInputW = 224; ================================================ FILE: yolov12/include/cuda_utils.h ================================================ #ifndef TRTX_CUDA_UTILS_H_ #define TRTX_CUDA_UTILS_H_ #include #ifndef CUDA_CHECK #define CUDA_CHECK(callstr) \ { \ cudaError_t error_code = callstr; \ if (error_code != cudaSuccess) { \ std::cerr << "CUDA error " << error_code << " at " << __FILE__ << ":" << __LINE__; \ assert(0); \ } \ } #endif // CUDA_CHECK #endif // TRTX_CUDA_UTILS_H_ ================================================ FILE: yolov12/include/logging.h ================================================ /* * Copyright (c) 2019, NVIDIA CORPORATION. 
//!
//! \class LogStreamConsumerBuffer
//! \brief String buffer that, when flushed, writes its contents to an output stream preceded by a prefix.
//!
//! Nothing is written while logging is disabled (see setShouldLog()).
//!
class LogStreamConsumerBuffer : public std::stringbuf {
   public:
    //!
    //! \param[in] stream    The stream the buffered text is forwarded to
    //! \param[in] prefix    Text inserted in front of every forwarded message
    //! \param[in] shouldLog Whether messages are forwarded at all
    //!
    LogStreamConsumerBuffer(std::ostream& stream, const std::string& prefix, bool shouldLog)
        : mOutput(stream), mPrefix(prefix), mShouldLog(shouldLog) {}

    //!
    //! \brief Move constructor. Carries over the target stream, the prefix and the logging flag;
    //!        text still buffered in \p other is not transferred.
    //!
    LogStreamConsumerBuffer(LogStreamConsumerBuffer&& other)
        : mOutput(other.mOutput), mPrefix(other.mPrefix), mShouldLog(other.mShouldLog) {}

    ~LogStreamConsumerBuffer() {
        // std::streambuf::pbase() gives a pointer to the beginning of the buffered part of the output sequence
        // std::streambuf::pptr() gives a pointer to the current position of the output sequence
        // if the pointer to the beginning is not equal to the pointer to the current position,
        // call putOutput() to log the output to the stream
        if (pbase() != pptr()) {
            putOutput();
        }
    }

    // synchronizes the stream buffer and returns 0 on success
    // synchronizing the stream buffer consists of inserting the buffer contents into the stream,
    // resetting the buffer and flushing the stream
    int sync() override {
        putOutput();
        return 0;
    }

    //!
    //! \brief Forward the buffered text to the output stream and empty the buffer.
    //!        Does nothing when logging is disabled.
    //!
    void putOutput() {
        if (mShouldLog) {
            // prepend timestamp
            // NOTE: the timestamp is always written to std::cout, even when mOutput is another stream.
            std::time_t timestamp = std::time(nullptr);
            std::tm* tm_local = std::localtime(&timestamp);
            std::cout << "[";
            std::cout << std::setw(2) << std::setfill('0') << 1 + tm_local->tm_mon << "/";
            std::cout << std::setw(2) << std::setfill('0') << tm_local->tm_mday << "/";
            std::cout << std::setw(4) << std::setfill('0') << 1900 + tm_local->tm_year << "-";
            std::cout << std::setw(2) << std::setfill('0') << tm_local->tm_hour << ":";
            std::cout << std::setw(2) << std::setfill('0') << tm_local->tm_min << ":";
            std::cout << std::setw(2) << std::setfill('0') << tm_local->tm_sec << "] ";
            // std::stringbuf::str() gets the string contents of the buffer
            // insert the buffer contents pre-appended by the appropriate prefix into the stream
            mOutput << mPrefix << str();
            // set the buffer to empty
            str("");
            // flush the stream
            mOutput.flush();
        }
    }

    //! \brief Enable or disable forwarding of buffered messages.
    void setShouldLog(bool shouldLog) { mShouldLog = shouldLog; }

   private:
    std::ostream& mOutput;  // stream that receives prefix + message
    std::string mPrefix;    // text inserted before every message
    bool mShouldLog;        // when false, putOutput() is a no-op
};
LogStreamConsumer(Severity reportableSeverity, Severity severity) : LogStreamConsumerBase(severityOstream(severity), severityPrefix(severity), severity <= reportableSeverity), std::ostream(&mBuffer) // links the stream buffer with the stream , mShouldLog(severity <= reportableSeverity), mSeverity(severity) {} LogStreamConsumer(LogStreamConsumer&& other) : LogStreamConsumerBase(severityOstream(other.mSeverity), severityPrefix(other.mSeverity), other.mShouldLog), std::ostream(&mBuffer) // links the stream buffer with the stream , mShouldLog(other.mShouldLog), mSeverity(other.mSeverity) {} void setReportableSeverity(Severity reportableSeverity) { mShouldLog = mSeverity <= reportableSeverity; mBuffer.setShouldLog(mShouldLog); } private: static std::ostream& severityOstream(Severity severity) { return severity >= Severity::kINFO ? std::cout : std::cerr; } static std::string severityPrefix(Severity severity) { switch (severity) { case Severity::kINTERNAL_ERROR: return "[F] "; case Severity::kERROR: return "[E] "; case Severity::kWARNING: return "[W] "; case Severity::kINFO: return "[I] "; case Severity::kVERBOSE: return "[V] "; default: assert(0); return ""; } } bool mShouldLog; Severity mSeverity; }; //! \class Logger //! //! \brief Class which manages logging of TensorRT tools and samples //! //! \details This class provides a common interface for TensorRT tools and samples to log information to the console, //! and supports logging two types of messages: //! //! - Debugging messages with an associated severity (info, warning, error, or internal error/fatal) //! - Test pass/fail messages //! //! The advantage of having all samples use this class for logging as opposed to emitting directly to stdout/stderr is //! that the logic for controlling the verbosity and formatting of sample output is centralized in one location. //! //! In the future, this class could be extended to support dumping test results to a file in some standard format //! 
(for example, JUnit XML), and providing additional metadata (e.g. timing the duration of a test run). //! //! TODO: For backwards compatibility with existing samples, this class inherits directly from the nvinfer1::ILogger //! interface, which is problematic since there isn't a clean separation between messages coming from the TensorRT //! library and messages coming from the sample. //! //! In the future (once all samples are updated to use Logger::getTRTLogger() to access the ILogger) we can refactor the //! class to eliminate the inheritance and instead make the nvinfer1::ILogger implementation a member of the Logger //! object. class Logger : public nvinfer1::ILogger { public: Logger(Severity severity = Severity::kWARNING) : mReportableSeverity(severity) {} //! //! \enum TestResult //! \brief Represents the state of a given test //! enum class TestResult { kRUNNING, //!< The test is running kPASSED, //!< The test passed kFAILED, //!< The test failed kWAIVED //!< The test was waived }; //! //! \brief Forward-compatible method for retrieving the nvinfer::ILogger associated with this Logger //! \return The nvinfer1::ILogger associated with this Logger //! //! TODO Once all samples are updated to use this method to register the logger with TensorRT, //! we can eliminate the inheritance of Logger from ILogger //! nvinfer1::ILogger& getTRTLogger() { return *this; } //! //! \brief Implementation of the nvinfer1::ILogger::log() virtual method //! //! Note samples should not be calling this function directly; it will eventually go away once we eliminate the //! inheritance from nvinfer1::ILogger //! void log(Severity severity, const char* msg) TRT_NOEXCEPT override { LogStreamConsumer(mReportableSeverity, severity) << "[TRT] " << std::string(msg) << std::endl; } //! //! \brief Method for controlling the verbosity of logging output //! //! \param severity The logger will only emit messages that have severity of this level or higher. //! 
void setReportableSeverity(Severity severity) { mReportableSeverity = severity; } //! //! \brief Opaque handle that holds logging information for a particular test //! //! This object is an opaque handle to information used by the Logger to print test results. //! The sample must call Logger::defineTest() in order to obtain a TestAtom that can be used //! with Logger::reportTest{Start,End}(). //! class TestAtom { public: TestAtom(TestAtom&&) = default; private: friend class Logger; TestAtom(bool started, const std::string& name, const std::string& cmdline) : mStarted(started), mName(name), mCmdline(cmdline) {} bool mStarted; std::string mName; std::string mCmdline; }; //! //! \brief Define a test for logging //! //! \param[in] name The name of the test. This should be a string starting with //! "TensorRT" and containing dot-separated strings containing //! the characters [A-Za-z0-9_]. //! For example, "TensorRT.sample_googlenet" //! \param[in] cmdline The command line used to reproduce the test // //! \return a TestAtom that can be used in Logger::reportTest{Start,End}(). //! static TestAtom defineTest(const std::string& name, const std::string& cmdline) { return TestAtom(false, name, cmdline); } //! //! \brief A convenience overloaded version of defineTest() that accepts an array of command-line arguments //! as input //! //! \param[in] name The name of the test //! \param[in] argc The number of command-line arguments //! \param[in] argv The array of command-line arguments (given as C strings) //! //! \return a TestAtom that can be used in Logger::reportTest{Start,End}(). static TestAtom defineTest(const std::string& name, int argc, char const* const* argv) { auto cmdline = genCmdlineString(argc, argv); return defineTest(name, cmdline); } //! //! \brief Report that a test has started. //! //! \pre reportTestStart() has not been called yet for the given testAtom //! //! \param[in] testAtom The handle to the test that has started //! 
static void reportTestStart(TestAtom& testAtom) { reportTestResult(testAtom, TestResult::kRUNNING); assert(!testAtom.mStarted); testAtom.mStarted = true; } //! //! \brief Report that a test has ended. //! //! \pre reportTestStart() has been called for the given testAtom //! //! \param[in] testAtom The handle to the test that has ended //! \param[in] result The result of the test. Should be one of TestResult::kPASSED, //! TestResult::kFAILED, TestResult::kWAIVED //! static void reportTestEnd(const TestAtom& testAtom, TestResult result) { assert(result != TestResult::kRUNNING); assert(testAtom.mStarted); reportTestResult(testAtom, result); } static int reportPass(const TestAtom& testAtom) { reportTestEnd(testAtom, TestResult::kPASSED); return EXIT_SUCCESS; } static int reportFail(const TestAtom& testAtom) { reportTestEnd(testAtom, TestResult::kFAILED); return EXIT_FAILURE; } static int reportWaive(const TestAtom& testAtom) { reportTestEnd(testAtom, TestResult::kWAIVED); return EXIT_SUCCESS; } static int reportTest(const TestAtom& testAtom, bool pass) { return pass ? reportPass(testAtom) : reportFail(testAtom); } Severity getReportableSeverity() const { return mReportableSeverity; } private: //! //! \brief returns an appropriate string for prefixing a log message with the given severity //! static const char* severityPrefix(Severity severity) { switch (severity) { case Severity::kINTERNAL_ERROR: return "[F] "; case Severity::kERROR: return "[E] "; case Severity::kWARNING: return "[W] "; case Severity::kINFO: return "[I] "; case Severity::kVERBOSE: return "[V] "; default: assert(0); return ""; } } //! //! \brief returns an appropriate string for prefixing a test result message with the given result //! 
static const char* testResultString(TestResult result) { switch (result) { case TestResult::kRUNNING: return "RUNNING"; case TestResult::kPASSED: return "PASSED"; case TestResult::kFAILED: return "FAILED"; case TestResult::kWAIVED: return "WAIVED"; default: assert(0); return ""; } } //! //! \brief returns an appropriate output stream (cout or cerr) to use with the given severity //! static std::ostream& severityOstream(Severity severity) { return severity >= Severity::kINFO ? std::cout : std::cerr; } //! //! \brief method that implements logging test results //! static void reportTestResult(const TestAtom& testAtom, TestResult result) { severityOstream(Severity::kINFO) << "&&&& " << testResultString(result) << " " << testAtom.mName << " # " << testAtom.mCmdline << std::endl; } //! //! \brief generate a command line string from the given (argc, argv) values //! static std::string genCmdlineString(int argc, char const* const* argv) { std::stringstream ss; for (int i = 0; i < argc; i++) { if (i > 0) ss << " "; ss << argv[i]; } return ss.str(); } Severity mReportableSeverity; }; namespace { //! //! \brief produces a LogStreamConsumer object that can be used to log messages of severity kVERBOSE //! //! Example usage: //! //! LOG_VERBOSE(logger) << "hello world" << std::endl; //! inline LogStreamConsumer LOG_VERBOSE(const Logger& logger) { return LogStreamConsumer(logger.getReportableSeverity(), Severity::kVERBOSE); } //! //! \brief produces a LogStreamConsumer object that can be used to log messages of severity kINFO //! //! Example usage: //! //! LOG_INFO(logger) << "hello world" << std::endl; //! inline LogStreamConsumer LOG_INFO(const Logger& logger) { return LogStreamConsumer(logger.getReportableSeverity(), Severity::kINFO); } //! //! \brief produces a LogStreamConsumer object that can be used to log messages of severity kWARNING //! //! Example usage: //! //! LOG_WARN(logger) << "hello world" << std::endl; //! 
inline LogStreamConsumer LOG_WARN(const Logger& logger) { return LogStreamConsumer(logger.getReportableSeverity(), Severity::kWARNING); } //! //! \brief produces a LogStreamConsumer object that can be used to log messages of severity kERROR //! //! Example usage: //! //! LOG_ERROR(logger) << "hello world" << std::endl; //! inline LogStreamConsumer LOG_ERROR(const Logger& logger) { return LogStreamConsumer(logger.getReportableSeverity(), Severity::kERROR); } //! //! \brief produces a LogStreamConsumer object that can be used to log messages of severity kINTERNAL_ERROR // ("fatal" severity) //! //! Example usage: //! //! LOG_FATAL(logger) << "hello world" << std::endl; //! inline LogStreamConsumer LOG_FATAL(const Logger& logger) { return LogStreamConsumer(logger.getReportableSeverity(), Severity::kINTERNAL_ERROR); } } // anonymous namespace #endif // TENSORRT_LOGGING_H ================================================ FILE: yolov12/include/macros.h ================================================ #ifndef __MACROS_H #define __MACROS_H #include "NvInfer.h" #ifdef API_EXPORTS #if defined(_MSC_VER) #define API __declspec(dllexport) #else #define API __attribute__((visibility("default"))) #endif #else #if defined(_MSC_VER) #define API __declspec(dllimport) #else #define API #endif #endif // API_EXPORTS #if NV_TENSORRT_MAJOR >= 8 #define TRT_NOEXCEPT noexcept #define TRT_CONST_ENQUEUE const #else #define TRT_NOEXCEPT #define TRT_CONST_ENQUEUE #endif #endif // __MACROS_H ================================================ FILE: yolov12/include/model.h ================================================ #pragma once #include #include #include "NvInfer.h" nvinfer1::IHostMemory* buildEngineYolo12Det(nvinfer1::IBuilder* builder, nvinfer1::IBuilderConfig* config, nvinfer1::DataType dt, const std::string& wts_path, float& gd, float& gw, int& max_channels, std::string& type); ================================================ FILE: yolov12/include/postprocess.h 
================================================ #pragma once #include #include "NvInfer.h" #include "types.h" // Preprocessing functions cv::Rect get_rect(cv::Mat& img, float bbox[4]); // Processing functions void batch_process(std::vector>& res_batch, const float* decode_ptr_host, int batch_size, int bbox_element, const std::vector& img_batch); void batch_process_obb(std::vector>& res_batch, const float* decode_ptr_host, int batch_size, int bbox_element, const std::vector& img_batch); void process_decode_ptr_host(std::vector& res, const float* decode_ptr_host, int bbox_element, cv::Mat& img, int count); void process_decode_ptr_host_obb(std::vector& res, const float* decode_ptr_host, int bbox_element, cv::Mat& img, int count); // NMS functions void nms(std::vector& res, float* output, float conf_thresh, float nms_thresh = 0.5); void batch_nms(std::vector>& batch_res, float* output, int batch_size, int output_size, float conf_thresh, float nms_thresh = 0.5); void nms_obb(std::vector& res, float* output, float conf_thresh, float nms_thresh = 0.5); void batch_nms_obb(std::vector>& batch_res, float* output, int batch_size, int output_size, float conf_thresh, float nms_thresh = 0.5); // CUDA-related functions void cuda_decode(float* predict, int num_bboxes, float confidence_threshold, float* parray, int max_objects, cudaStream_t stream); void cuda_nms(float* parray, float nms_threshold, int max_objects, cudaStream_t stream); void cuda_decode_obb(float* predict, int num_bboxes, float confidence_threshold, float* parray, int max_objects, cudaStream_t stream); void cuda_nms_obb(float* parray, float nms_threshold, int max_objects, cudaStream_t stream); // Drawing functions void draw_bbox(std::vector& img_batch, std::vector>& res_batch); void draw_bbox_obb(std::vector& img_batch, std::vector>& res_batch); void draw_bbox_keypoints_line(std::vector& img_batch, std::vector>& res_batch); void draw_mask_bbox(cv::Mat& img, std::vector& dets, std::vector& masks, 
std::unordered_map& labels_map); ================================================ FILE: yolov12/include/preprocess.h ================================================ #pragma once #include #include #include "NvInfer.h" #include "types.h" void cuda_preprocess_init(int max_image_size); void cuda_preprocess_destroy(); void cuda_preprocess(uint8_t* src, int src_width, int src_height, float* dst, int dst_width, int dst_height, cudaStream_t stream); void cuda_batch_preprocess(std::vector& img_batch, float* dst, int dst_width, int dst_height, cudaStream_t stream); ================================================ FILE: yolov12/include/types.h ================================================ #pragma once #include "config.h" struct alignas(float) Detection { //center_x center_y w h float bbox[4]; float conf; // bbox_conf * cls_conf float class_id; float mask[32]; float keypoints[kNumberOfPoints * 3]; // 17*3 keypoints float angle; // obb angle }; struct AffineMatrix { float value[6]; }; const int bbox_element = sizeof(AffineMatrix) / sizeof(float) + 1; // left, top, right, bottom, confidence, class, keepflag ================================================ FILE: yolov12/include/utils.h ================================================ #pragma once #include #include #include static inline cv::Mat preprocess_img(cv::Mat& img, int input_w, int input_h) { int w, h, x, y; float r_w = input_w / (img.cols * 1.0); float r_h = input_h / (img.rows * 1.0); if (r_h > r_w) { w = input_w; h = r_w * img.rows; x = 0; y = (input_h - h) / 2; } else { w = r_h * img.cols; h = input_h; x = (input_w - w) / 2; y = 0; } cv::Mat re(h, w, CV_8UC3); cv::resize(img, re, re.size(), 0, 0, cv::INTER_LINEAR); cv::Mat out(input_h, input_w, CV_8UC3, cv::Scalar(128, 128, 128)); re.copyTo(out(cv::Rect(x, y, re.cols, re.rows))); return out; } static inline int read_files_in_dir(const char* p_dir_name, std::vector& file_names) { DIR* p_dir = opendir(p_dir_name); if (p_dir == nullptr) { return -1; } struct dirent* 
p_file = nullptr; while ((p_file = readdir(p_dir)) != nullptr) { if (strcmp(p_file->d_name, ".") != 0 && strcmp(p_file->d_name, "..") != 0) { //std::string cur_file_name(p_dir_name); //cur_file_name += "/"; //cur_file_name += p_file->d_name; std::string cur_file_name(p_file->d_name); // std::cout << "Found file: " << cur_file_name << std::endl; file_names.push_back(cur_file_name); } } closedir(p_dir); return 0; } // Function to trim leading and trailing whitespace from a string static inline std::string trim_leading_whitespace(const std::string& str) { size_t first = str.find_first_not_of(' '); if (std::string::npos == first) { return str; } size_t last = str.find_last_not_of(' '); return str.substr(first, (last - first + 1)); } // Src: https://stackoverflow.com/questions/16605967 static inline std::string to_string_with_precision(const float a_value, const int n = 2) { std::ostringstream out; out.precision(n); out << std::fixed << a_value; return out.str(); } static inline int read_labels(const std::string labels_filename, std::unordered_map& labels_map) { std::ifstream file(labels_filename); // Read each line of the file std::string line; int index = 0; while (std::getline(file, line)) { // Strip the line of any leading or trailing whitespace line = trim_leading_whitespace(line); // Add the stripped line to the labels_map, using the loop index as the key labels_map[index] = line; index++; } // Close the file file.close(); return 0; } ================================================ FILE: yolov12/plugin/yololayer.cu ================================================ #include #include #include #include #include "cuda_utils.h" #include "types.h" #include "yololayer.h" namespace Tn { template void write(char*& buffer, const T& val) { *reinterpret_cast(buffer) = val; buffer += sizeof(T); } template void read(const char*& buffer, T& val) { val = *reinterpret_cast(buffer); buffer += sizeof(T); } } // namespace Tn __device__ float sigmoid(float x) { return 1.0f / (1.0f + 
exp(-x)); } namespace nvinfer1 { YoloLayerPlugin::YoloLayerPlugin(int classCount, int numberofpoints, float confthreshkeypoints, int netWidth, int netHeight, int maxOut, bool is_segmentation, bool is_pose, bool is_obb, const int* strides, int stridesLength) { mClassCount = classCount; mNumberofpoints = numberofpoints; mConfthreshkeypoints = confthreshkeypoints; mYoloV8NetWidth = netWidth; mYoloV8netHeight = netHeight; mMaxOutObject = maxOut; mStridesLength = stridesLength; mStrides = new int[stridesLength]; memcpy(mStrides, strides, stridesLength * sizeof(int)); is_segmentation_ = is_segmentation; is_pose_ = is_pose; is_obb_ = is_obb; } YoloLayerPlugin::~YoloLayerPlugin() { if (mStrides != nullptr) { delete[] mStrides; mStrides = nullptr; } } YoloLayerPlugin::YoloLayerPlugin(const void* data, size_t length) { using namespace Tn; const char *d = reinterpret_cast(data), *a = d; read(d, mClassCount); read(d, mNumberofpoints); read(d, mConfthreshkeypoints); read(d, mThreadCount); read(d, mYoloV8NetWidth); read(d, mYoloV8netHeight); read(d, mMaxOutObject); read(d, mStridesLength); mStrides = new int[mStridesLength]; for (int i = 0; i < mStridesLength; ++i) { read(d, mStrides[i]); } read(d, is_segmentation_); read(d, is_pose_); read(d, is_obb_); assert(d == a + length); } void YoloLayerPlugin::serialize(void* buffer) const TRT_NOEXCEPT { using namespace Tn; char *d = static_cast(buffer), *a = d; write(d, mClassCount); write(d, mNumberofpoints); write(d, mConfthreshkeypoints); write(d, mThreadCount); write(d, mYoloV8NetWidth); write(d, mYoloV8netHeight); write(d, mMaxOutObject); write(d, mStridesLength); for (int i = 0; i < mStridesLength; ++i) { write(d, mStrides[i]); } write(d, is_segmentation_); write(d, is_pose_); write(d, is_obb_); assert(d == a + getSerializationSize()); } size_t YoloLayerPlugin::getSerializationSize() const TRT_NOEXCEPT { return sizeof(mClassCount) + sizeof(mNumberofpoints) + sizeof(mConfthreshkeypoints) + sizeof(mThreadCount) + 
sizeof(mYoloV8netHeight) + sizeof(mYoloV8NetWidth) + sizeof(mMaxOutObject) + sizeof(mStridesLength) + sizeof(int) * mStridesLength + sizeof(is_segmentation_) + sizeof(is_pose_) + sizeof(is_obb_); } int YoloLayerPlugin::initialize() TRT_NOEXCEPT { return 0; } nvinfer1::Dims YoloLayerPlugin::getOutputDimensions(int index, const nvinfer1::Dims* inputs, int nbInputDims) TRT_NOEXCEPT { int total_size = mMaxOutObject * sizeof(Detection) / sizeof(float); return nvinfer1::Dims3(total_size + 1, 1, 1); } void YoloLayerPlugin::setPluginNamespace(const char* pluginNamespace) TRT_NOEXCEPT { mPluginNamespace = pluginNamespace; } const char* YoloLayerPlugin::getPluginNamespace() const TRT_NOEXCEPT { return mPluginNamespace; } nvinfer1::DataType YoloLayerPlugin::getOutputDataType(int index, const nvinfer1::DataType* inputTypes, int nbInputs) const TRT_NOEXCEPT { return nvinfer1::DataType::kFLOAT; } bool YoloLayerPlugin::isOutputBroadcastAcrossBatch(int outputIndex, const bool* inputIsBroadcasted, int nbInputs) const TRT_NOEXCEPT { return false; } bool YoloLayerPlugin::canBroadcastInputAcrossBatch(int inputIndex) const TRT_NOEXCEPT { return false; } void YoloLayerPlugin::configurePlugin(nvinfer1::PluginTensorDesc const* in, int nbInput, nvinfer1::PluginTensorDesc const* out, int nbOutput) TRT_NOEXCEPT{}; void YoloLayerPlugin::attachToContext(cudnnContext* cudnnContext, cublasContext* cublasContext, IGpuAllocator* gpuAllocator) TRT_NOEXCEPT{}; void YoloLayerPlugin::detachFromContext() TRT_NOEXCEPT {} const char* YoloLayerPlugin::getPluginType() const TRT_NOEXCEPT { return "YoloLayer_TRT"; } const char* YoloLayerPlugin::getPluginVersion() const TRT_NOEXCEPT { return "1"; } void YoloLayerPlugin::destroy() TRT_NOEXCEPT { delete this; } nvinfer1::IPluginV2IOExt* YoloLayerPlugin::clone() const TRT_NOEXCEPT { YoloLayerPlugin* p = new YoloLayerPlugin(mClassCount, mNumberofpoints, mConfthreshkeypoints, mYoloV8NetWidth, mYoloV8netHeight, mMaxOutObject, is_segmentation_, is_pose_, is_obb_, 
mStrides, mStridesLength); p->setPluginNamespace(mPluginNamespace); return p; } int YoloLayerPlugin::enqueue(int batchSize, const void* TRT_CONST_ENQUEUE* inputs, void* const* outputs, void* workspace, cudaStream_t stream) TRT_NOEXCEPT { forwardGpu((const float* const*)inputs, (float*)outputs[0], stream, mYoloV8netHeight, mYoloV8NetWidth, batchSize); return 0; } __device__ float Logist(float data) { return 1.0f / (1.0f + expf(-data)); }; __global__ void CalDetection(const float* input, float* output, int numElements, int maxoutobject, const int grid_h, int grid_w, const int stride, int classes, int nk, float confkeypoints, int outputElem, bool is_segmentation, bool is_pose, bool is_obb) { int idx = threadIdx.x + blockDim.x * blockIdx.x; if (idx >= numElements) return; const int N_kpts = nk; int total_grid = grid_h * grid_w; int info_len = 4 + classes + (is_segmentation ? 32 : 0) + (is_pose ? N_kpts * 3 : 0) + (is_obb ? 1 : 0); int batchIdx = idx / total_grid; int elemIdx = idx % total_grid; const float* curInput = input + batchIdx * total_grid * info_len; int outputIdx = batchIdx * outputElem; int class_id = 0; float max_cls_prob = 0.0; for (int i = 4; i < 4 + classes; i++) { float p = Logist(curInput[elemIdx + i * total_grid]); if (p > max_cls_prob) { max_cls_prob = p; class_id = i - 4; } } if (max_cls_prob < 0.1) return; int count = (int)atomicAdd(output + outputIdx, 1); if (count >= maxoutobject) return; char* data = (char*)(output + outputIdx) + sizeof(float) + count * sizeof(Detection); Detection* det = (Detection*)(data); int row = elemIdx / grid_w; int col = elemIdx % grid_w; det->conf = max_cls_prob; det->class_id = class_id; det->bbox[0] = (col + 0.5f - curInput[elemIdx + 0 * total_grid]) * stride; det->bbox[1] = (row + 0.5f - curInput[elemIdx + 1 * total_grid]) * stride; det->bbox[2] = (col + 0.5f + curInput[elemIdx + 2 * total_grid]) * stride; det->bbox[3] = (row + 0.5f + curInput[elemIdx + 3 * total_grid]) * stride; if (is_segmentation) { for (int k = 
0; k < 32; ++k) { det->mask[k] = curInput[elemIdx + (4 + classes + (is_pose ? N_kpts * 3 : 0) + (is_obb ? 1 : 0) + k) * total_grid]; } } if (is_pose) { for (int kpt = 0; kpt < N_kpts; kpt++) { int kpt_x_idx = (4 + classes + (is_segmentation ? 32 : 0) + (is_obb ? 1 : 0) + kpt * 3) * total_grid; int kpt_y_idx = (4 + classes + (is_segmentation ? 32 : 0) + (is_obb ? 1 : 0) + kpt * 3 + 1) * total_grid; int kpt_conf_idx = (4 + classes + (is_segmentation ? 32 : 0) + (is_obb ? 1 : 0) + kpt * 3 + 2) * total_grid; float kpt_confidence = sigmoid(curInput[elemIdx + kpt_conf_idx]); float kpt_x = (curInput[elemIdx + kpt_x_idx] * 2.0 + col) * stride; float kpt_y = (curInput[elemIdx + kpt_y_idx] * 2.0 + row) * stride; bool is_within_bbox = kpt_x >= det->bbox[0] && kpt_x <= det->bbox[2] && kpt_y >= det->bbox[1] && kpt_y <= det->bbox[3]; if (kpt_confidence < confkeypoints || !is_within_bbox) { det->keypoints[kpt * 3] = -1; det->keypoints[kpt * 3 + 1] = -1; det->keypoints[kpt * 3 + 2] = -1; } else { det->keypoints[kpt * 3] = kpt_x; det->keypoints[kpt * 3 + 1] = kpt_y; det->keypoints[kpt * 3 + 2] = kpt_confidence; } } } if (is_obb) { double pi = CV_PI; auto angle_inx = curInput[elemIdx + (4 + classes + (is_segmentation ? 32 : 0) + (is_pose ? 
N_kpts * 3 : 0) + 0) * total_grid]; auto angle = (sigmoid(angle_inx) - 0.25f) * pi; auto cos1 = cos(angle); auto sin1 = sin(angle); auto xf = (curInput[elemIdx + 2 * total_grid] - curInput[elemIdx + 0 * total_grid]) / 2; auto yf = (curInput[elemIdx + 3 * total_grid] - curInput[elemIdx + 1 * total_grid]) / 2; auto x = xf * cos1 - yf * sin1; auto y = xf * sin1 + yf * cos1; float cx = (col + 0.5f + x) * stride; float cy = (row + 0.5f + y) * stride; float w1 = (curInput[elemIdx + 0 * total_grid] + curInput[elemIdx + 2 * total_grid]) * stride; float h1 = (curInput[elemIdx + 1 * total_grid] + curInput[elemIdx + 3 * total_grid]) * stride; det->bbox[0] = cx; det->bbox[1] = cy; det->bbox[2] = w1; det->bbox[3] = h1; det->angle = angle; } } void YoloLayerPlugin::forwardGpu(const float* const* inputs, float* output, cudaStream_t stream, int mYoloV8netHeight, int mYoloV8NetWidth, int batchSize) { int outputElem = 1 + mMaxOutObject * sizeof(Detection) / sizeof(float); cudaMemsetAsync(output, 0, sizeof(float), stream); for (int idx = 0; idx < batchSize; ++idx) { CUDA_CHECK(cudaMemsetAsync(output + idx * outputElem, 0, sizeof(float), stream)); } int numElem = 0; // const int maxGrids = mStridesLength; // int grids[maxGrids][2]; // for (int i = 0; i < maxGrids; ++i) { // grids[i][0] = mYoloV8netHeight / mStrides[i]; // grids[i][1] = mYoloV8NetWidth / mStrides[i]; // } int maxGrids = mStridesLength; int flatGridsLen = 2 * maxGrids; int* flatGrids = new int[flatGridsLen]; for (int i = 0; i < maxGrids; ++i) { flatGrids[2 * i] = mYoloV8netHeight / mStrides[i]; flatGrids[2 * i + 1] = mYoloV8NetWidth / mStrides[i]; } for (unsigned int i = 0; i < maxGrids; i++) { // Access the elements of the original 2D array from the flattened 1D array int grid_h = flatGrids[2 * i]; // Corresponds to the access of grids[i][0] int grid_w = flatGrids[2 * i + 1]; // Corresponds to the access of grids[i][1] int stride = mStrides[i]; numElem = grid_h * grid_w * batchSize; // Calculate the total number of 
elements if (numElem < mThreadCount) // Adjust the thread count if needed mThreadCount = numElem; // The CUDA kernel call remains unchanged CalDetection<<<(numElem + mThreadCount - 1) / mThreadCount, mThreadCount, 0, stream>>>( inputs[i], output, numElem, mMaxOutObject, grid_h, grid_w, stride, mClassCount, mNumberofpoints, mConfthreshkeypoints, outputElem, is_segmentation_, is_pose_, is_obb_); } delete[] flatGrids; } PluginFieldCollection YoloPluginCreator::mFC{}; std::vector YoloPluginCreator::mPluginAttributes; YoloPluginCreator::YoloPluginCreator() { mPluginAttributes.clear(); mFC.nbFields = mPluginAttributes.size(); mFC.fields = mPluginAttributes.data(); } const char* YoloPluginCreator::getPluginName() const TRT_NOEXCEPT { return "YoloLayer_TRT"; } const char* YoloPluginCreator::getPluginVersion() const TRT_NOEXCEPT { return "1"; } const PluginFieldCollection* YoloPluginCreator::getFieldNames() TRT_NOEXCEPT { return &mFC; } IPluginV2IOExt* YoloPluginCreator::createPlugin(const char* name, const PluginFieldCollection* fc) TRT_NOEXCEPT { assert(fc->nbFields == 1); assert(strcmp(fc->fields[0].name, "combinedInfo") == 0); const int* combinedInfo = static_cast(fc->fields[0].data); int netinfo_count = 9; int class_count = combinedInfo[0]; int numberofpoints = combinedInfo[1]; float confthreshkeypoints = combinedInfo[2]; int input_w = combinedInfo[3]; int input_h = combinedInfo[4]; int max_output_object_count = combinedInfo[5]; bool is_segmentation = combinedInfo[6]; bool is_pose = combinedInfo[7]; bool is_obb = combinedInfo[8]; const int* px_arry = combinedInfo + netinfo_count; int px_arry_length = fc->fields[0].length - netinfo_count; YoloLayerPlugin* obj = new YoloLayerPlugin(class_count, numberofpoints, confthreshkeypoints, input_w, input_h, max_output_object_count, is_segmentation, is_pose, is_obb, px_arry, px_arry_length); obj->setPluginNamespace(mNamespace.c_str()); return obj; } IPluginV2IOExt* YoloPluginCreator::deserializePlugin(const char* name, const void* 
serialData, size_t serialLength) TRT_NOEXCEPT { // This object will be deleted when the network is destroyed, which will // call YoloLayerPlugin::destroy() YoloLayerPlugin* obj = new YoloLayerPlugin(serialData, serialLength); obj->setPluginNamespace(mNamespace.c_str()); return obj; } } // namespace nvinfer1 ================================================ FILE: yolov12/plugin/yololayer.h ================================================ #pragma once #include #include #include #include "NvInfer.h" #include "macros.h" namespace nvinfer1 { class API YoloLayerPlugin : public IPluginV2IOExt { public: YoloLayerPlugin(int classCount, int numberofpoints, float confthreshkeypoints, int netWidth, int netHeight, int maxOut, bool is_segmentation, bool is_pose, bool is_obb, const int* strides, int stridesLength); YoloLayerPlugin(const void* data, size_t length); ~YoloLayerPlugin(); int getNbOutputs() const TRT_NOEXCEPT override { return 1; } nvinfer1::Dims getOutputDimensions(int index, const nvinfer1::Dims* inputs, int nbInputDims) TRT_NOEXCEPT override; int initialize() TRT_NOEXCEPT override; virtual void terminate() TRT_NOEXCEPT override {} virtual size_t getWorkspaceSize(int maxBatchSize) const TRT_NOEXCEPT override { return 0; } virtual int enqueue(int batchSize, const void* const* inputs, void* TRT_CONST_ENQUEUE* outputs, void* workspace, cudaStream_t stream) TRT_NOEXCEPT override; virtual size_t getSerializationSize() const TRT_NOEXCEPT override; virtual void serialize(void* buffer) const TRT_NOEXCEPT override; bool supportsFormatCombination(int pos, const PluginTensorDesc* inOut, int nbInputs, int nbOutputs) const TRT_NOEXCEPT override { return inOut[pos].format == TensorFormat::kLINEAR && inOut[pos].type == DataType::kFLOAT; } const char* getPluginType() const TRT_NOEXCEPT override; const char* getPluginVersion() const TRT_NOEXCEPT override; void destroy() TRT_NOEXCEPT override; IPluginV2IOExt* clone() const TRT_NOEXCEPT override; void setPluginNamespace(const char* 
pluginNamespace) TRT_NOEXCEPT override; const char* getPluginNamespace() const TRT_NOEXCEPT override; nvinfer1::DataType getOutputDataType(int32_t index, nvinfer1::DataType const* inputTypes, int32_t nbInputs) const TRT_NOEXCEPT; bool isOutputBroadcastAcrossBatch(int outputIndex, const bool* inputIsBroadcasted, int nbInputs) const TRT_NOEXCEPT override; bool canBroadcastInputAcrossBatch(int inputIndex) const TRT_NOEXCEPT override; void attachToContext(cudnnContext* cudnnContext, cublasContext* cublasContext, IGpuAllocator* gpuAllocator) TRT_NOEXCEPT override; void configurePlugin(PluginTensorDesc const* in, int32_t nbInput, PluginTensorDesc const* out, int32_t nbOutput) TRT_NOEXCEPT override; void detachFromContext() TRT_NOEXCEPT override; private: void forwardGpu(const float* const* inputs, float* output, cudaStream_t stream, int mYoloV8netHeight, int mYoloV8NetWidth, int batchSize); int mThreadCount = 256; const char* mPluginNamespace; int mClassCount; int mNumberofpoints; float mConfthreshkeypoints; int mYoloV8NetWidth; int mYoloV8netHeight; int mMaxOutObject; bool is_segmentation_; bool is_pose_; bool is_obb_; int* mStrides; int mStridesLength; }; class API YoloPluginCreator : public IPluginCreator { public: YoloPluginCreator(); ~YoloPluginCreator() override = default; const char* getPluginName() const TRT_NOEXCEPT override; const char* getPluginVersion() const TRT_NOEXCEPT override; const nvinfer1::PluginFieldCollection* getFieldNames() TRT_NOEXCEPT override; nvinfer1::IPluginV2IOExt* createPlugin(const char* name, const nvinfer1::PluginFieldCollection* fc) TRT_NOEXCEPT override; nvinfer1::IPluginV2IOExt* deserializePlugin(const char* name, const void* serialData, size_t serialLength) TRT_NOEXCEPT override; void setPluginNamespace(const char* libNamespace) TRT_NOEXCEPT override { mNamespace = libNamespace; } const char* getPluginNamespace() const TRT_NOEXCEPT override { return mNamespace.c_str(); } private: std::string mNamespace; static PluginFieldCollection 
mFC; static std::vector mPluginAttributes; }; REGISTER_TENSORRT_PLUGIN(YoloPluginCreator); } // namespace nvinfer1 ================================================ FILE: yolov12/readme.md ================================================ ## Introduction Yolo12 model supports TensorRT-8. Training code [link](https://github.com/ultralytics/ultralytics/archive/refs/tags/v8.3.38.zip) ## Environment * cuda 11.8 * cudnn 8.9.1.23 * tensorrt 8.6.1.6 * opencv 4.8.0 * ultralytics 8.3.0 ## Support * [x] YOLO12-det support FP32/FP16 and C++ API ## Config * Choose the YOLO12 sub-model n/s/m/l/x from command line arguments. * Other configs please check [src/config.h](src/config.h) ## Build and Run 1. generate .wts from pytorch with .pt, or download .wts from model zoo ```shell # Download ultralytics wget https://github.com/ultralytics/ultralytics/archive/refs/tags/v8.3.119.zip -O ultralytics-8.3.119.zip # Unzip ultralytics unzip ultralytics-8.3.119.zip cd ultralytics-8.3.119 # Download models wget https://github.com/ultralytics/assets/releases/download/v8.3.0/yolo12n.pt -O yolo12n.pt # to download other models, replace 'yolo12n.pt' with 'yolo12s.pt', 'yolo12m.pt', 'yolo12l.pt' or 'yolo12x.pt' # Generate .wts cp [PATH-TO-TENSORRTX]/yolov12/gen_wts.py . python gen_wts.py -w yolo12n.pt -o yolo12n.wts -t detect # A file 'yolo12n.wts' will be generated. ``` 2. build tensorrtx/yolov12 and run ```shell cd [PATH-TO-TENSORRTX]/yolov12 mkdir build cd build cmake .. make ``` ### Detection ```shell cp [PATH-TO-ultralytics]/yolo12n.wts . 
# Build and serialize TensorRT engine ./yolo12_det -s yolo12n.wts yolo12n.engine [n/s/m/l/x] # Run inference ./yolo12_det -d yolo12n.engine ../images [c/g] # results saved in build directory ``` ## More Information See the readme in [home page.](https://github.com/wang-xinyu/tensorrtx) ================================================ FILE: yolov12/src/block.cpp ================================================ #include "block.h" #include #include #include #include #include "config.h" #include "model.h" #include "yololayer.h" std::map loadWeights(const std::string file) { std::cout << "Loading weights: " << file << std::endl; std::map WeightMap; std::ifstream input(file); assert(input.is_open() && "Unable to load weight file. please check if the .wts file path is right!!!!!!"); int32_t count; input >> count; assert(count > 0 && "Invalid weight map file."); while (count--) { nvinfer1::Weights wt{nvinfer1::DataType::kFLOAT, nullptr, 0}; uint32_t size; std::string name; input >> name >> std::dec >> size; wt.type = nvinfer1::DataType::kFLOAT; uint32_t* val = reinterpret_cast(malloc(sizeof(val) * size)); for (uint32_t x = 0, y = size; x < y; x++) { input >> std::hex >> val[x]; } wt.values = val; wt.count = size; WeightMap[name] = wt; } return WeightMap; } nvinfer1::IScaleLayer* addBatchNorm2d(nvinfer1::INetworkDefinition* network, std::map weightMap, nvinfer1::ITensor& input, std::string lname, float eps) { float* gamma = (float*)weightMap[lname + ".weight"].values; float* beta = (float*)weightMap[lname + ".bias"].values; float* mean = (float*)weightMap[lname + ".running_mean"].values; float* var = (float*)weightMap[lname + ".running_var"].values; int len = weightMap[lname + ".running_var"].count; float* scval = reinterpret_cast(malloc(sizeof(float) * len)); for (int i = 0; i < len; i++) { scval[i] = gamma[i] / sqrt(var[i] + eps); } nvinfer1::Weights scale{nvinfer1::DataType::kFLOAT, scval, len}; float* shval = reinterpret_cast(malloc(sizeof(float) * len)); for (int i = 
0; i < len; i++) { shval[i] = beta[i] - mean[i] * gamma[i] / sqrt(var[i] + eps); } nvinfer1::Weights shift{nvinfer1::DataType::kFLOAT, shval, len}; float* pval = reinterpret_cast(malloc(sizeof(float) * len)); for (int i = 0; i < len; i++) { pval[i] = 1.0; } nvinfer1::Weights power{nvinfer1::DataType::kFLOAT, pval, len}; weightMap[lname + ".scale"] = scale; weightMap[lname + ".shift"] = shift; weightMap[lname + ".power"] = power; nvinfer1::IScaleLayer* output = network->addScale(input, nvinfer1::ScaleMode::kCHANNEL, shift, scale, power); assert(output); return output; } nvinfer1::IElementWiseLayer* convBnSiLU(nvinfer1::INetworkDefinition* network, std::map weightMap, nvinfer1::ITensor& input, int ch, std::vector k, int s, std::string lname) { nvinfer1::Weights bias_empty{nvinfer1::DataType::kFLOAT, nullptr, 0}; nvinfer1::IConvolutionLayer* conv = network->addConvolutionNd(input, ch, nvinfer1::DimsHW{k[0], k[1]}, weightMap[lname + ".conv.weight"], bias_empty); assert(conv); conv->setStrideNd(nvinfer1::DimsHW{s, s}); // auto pad int p0 = k[0] / 2; int p1 = k[1] / 2; conv->setPaddingNd(nvinfer1::DimsHW{p0, p1}); nvinfer1::IScaleLayer* bn = addBatchNorm2d(network, weightMap, *conv->getOutput(0), lname + ".bn", 1e-3); nvinfer1::IActivationLayer* sigmoid = network->addActivation(*bn->getOutput(0), nvinfer1::ActivationType::kSIGMOID); nvinfer1::IElementWiseLayer* ew = network->addElementWise(*bn->getOutput(0), *sigmoid->getOutput(0), nvinfer1::ElementWiseOperation::kPROD); assert(ew); return ew; } static nvinfer1::ILayer* bottleneck(nvinfer1::INetworkDefinition* network, std::map weightMap, nvinfer1::ITensor& input, int c1, int c2, bool shortcut, std::vector k1, std::vector k2, float e, std::string lname) { int c_ = (int)((float)c2 * e); nvinfer1::IElementWiseLayer* conv1 = convBnSiLU(network, weightMap, input, c_, k1, 1, lname + ".cv1"); nvinfer1::IElementWiseLayer* conv2 = convBnSiLU(network, weightMap, *conv1->getOutput(0), c2, k2, 1, lname + ".cv2"); if (shortcut && c1 
== c2) { nvinfer1::IElementWiseLayer* ew = network->addElementWise(input, *conv2->getOutput(0), nvinfer1::ElementWiseOperation::kSUM); return ew; } return conv2; } nvinfer1::IElementWiseLayer* SPPF(nvinfer1::INetworkDefinition* network, std::map weightMap, nvinfer1::ITensor& input, int c1, int c2, int k, std::string lname) { int c_ = c1 / 2; nvinfer1::IElementWiseLayer* conv1 = convBnSiLU(network, weightMap, input, c_, {1, 1}, 1, lname + ".cv1"); nvinfer1::IPoolingLayer* pool1 = network->addPoolingNd(*conv1->getOutput(0), nvinfer1::PoolingType::kMAX, nvinfer1::DimsHW{k, k}); pool1->setStrideNd(nvinfer1::DimsHW{1, 1}); pool1->setPaddingNd(nvinfer1::DimsHW{k / 2, k / 2}); nvinfer1::IPoolingLayer* pool2 = network->addPoolingNd(*pool1->getOutput(0), nvinfer1::PoolingType::kMAX, nvinfer1::DimsHW{k, k}); pool2->setStrideNd(nvinfer1::DimsHW{1, 1}); pool2->setPaddingNd(nvinfer1::DimsHW{k / 2, k / 2}); nvinfer1::IPoolingLayer* pool3 = network->addPoolingNd(*pool2->getOutput(0), nvinfer1::PoolingType::kMAX, nvinfer1::DimsHW{k, k}); pool3->setStrideNd(nvinfer1::DimsHW{1, 1}); pool3->setPaddingNd(nvinfer1::DimsHW{k / 2, k / 2}); nvinfer1::ITensor* inputTensors[] = {conv1->getOutput(0), pool1->getOutput(0), pool2->getOutput(0), pool3->getOutput(0)}; nvinfer1::IConcatenationLayer* cat = network->addConcatenation(inputTensors, 4); nvinfer1::IElementWiseLayer* conv2 = convBnSiLU(network, weightMap, *cat->getOutput(0), c2, {1, 1}, 1, lname + ".cv2"); return conv2; } nvinfer1::IShuffleLayer* DFL(nvinfer1::INetworkDefinition* network, std::map weightMap, nvinfer1::ITensor& input, int ch, int grid, int k, int s, int p, std::string lname) { nvinfer1::IShuffleLayer* shuffle1 = network->addShuffle(input); shuffle1->setReshapeDimensions(nvinfer1::Dims4{kBatchSize, 4, 16, grid}); shuffle1->setSecondTranspose(nvinfer1::Permutation{0, 2, 1, 3}); nvinfer1::ISoftMaxLayer* softmax = network->addSoftMax(*shuffle1->getOutput(0)); softmax->setAxes(1 << 1); nvinfer1::Weights 
bias_empty{nvinfer1::DataType::kFLOAT, nullptr, 0}; nvinfer1::IConvolutionLayer* conv = network->addConvolutionNd(*softmax->getOutput(0), 1, nvinfer1::DimsHW{1, 1}, weightMap[lname], bias_empty); conv->setStrideNd(nvinfer1::DimsHW{s, s}); conv->setPaddingNd(nvinfer1::DimsHW{p, p}); nvinfer1::IShuffleLayer* shuffle2 = network->addShuffle(*conv->getOutput(0)); shuffle2->setReshapeDimensions(nvinfer1::Dims3{kBatchSize, 4, grid}); return shuffle2; } nvinfer1::IPluginV2Layer* addYoLoLayer(nvinfer1::INetworkDefinition* network, std::vector dets, const int* px_arry, int px_arry_num, bool is_segmentation, bool is_pose, bool is_obb) { auto creator = getPluginRegistry()->getPluginCreator("YoloLayer_TRT", "1"); const int netinfo_count = 9; // Assuming the first 5 elements are for netinfo as per existing code. const int total_count = netinfo_count + px_arry_num; // Total number of elements for netinfo and px_arry combined. std::vector combinedInfo(total_count); int class_num = kNumClass; if (is_pose) class_num = kPoseNumClass; else if (is_obb) class_num = kObbNumClass; int input_w = kInputW; if (is_obb) input_w = kObbInputW; int input_h = kInputH; if (is_obb) input_h = kObbInputH; // Fill in the first 5 elements as per existing netinfo. combinedInfo[0] = class_num; combinedInfo[1] = kNumberOfPoints; combinedInfo[2] = kConfThreshKeypoints; combinedInfo[3] = input_w; combinedInfo[4] = input_h; combinedInfo[5] = kMaxNumOutputBbox; combinedInfo[6] = is_segmentation; combinedInfo[7] = is_pose; combinedInfo[8] = is_obb; // Copy the contents of px_arry into the combinedInfo vector after the initial 5 elements. std::copy(px_arry, px_arry + px_arry_num, combinedInfo.begin() + netinfo_count); // Now let's create the PluginField object to hold this combined information. 
nvinfer1::PluginField pluginField; pluginField.name = "combinedInfo"; // This can be any name that the plugin will recognize pluginField.data = combinedInfo.data(); pluginField.type = nvinfer1::PluginFieldType::kINT32; pluginField.length = combinedInfo.size(); // Create the PluginFieldCollection to hold the PluginField object. nvinfer1::PluginFieldCollection pluginFieldCollection; pluginFieldCollection.nbFields = 1; // We have just one field, but it's a combined array pluginFieldCollection.fields = &pluginField; // Create the plugin object using the PluginFieldCollection. nvinfer1::IPluginV2* pluginObject = creator->createPlugin("yololayer", &pluginFieldCollection); // We assume that the plugin is to be added onto the network. // Prepare input tensors for the YOLO Layer. std::vector inputTensors; for (auto det : dets) { inputTensors.push_back(det->getOutput(0)); // Assuming each IConcatenationLayer has one output tensor. } // Add the plugin to the network using the prepared input tensors. nvinfer1::IPluginV2Layer* yoloLayer = network->addPluginV2(inputTensors.data(), inputTensors.size(), *pluginObject); return yoloLayer; // Return the added YOLO layer. } static nvinfer1::ILayer* C3k(nvinfer1::INetworkDefinition* network, std::map weightMap, nvinfer1::ITensor& input, int c1, int c2, int n, bool shortcut, std::vector k1, std::vector k2, float e, std::string lname) { int c_ = (int)((float)c2 * e); auto cv1 = convBnSiLU(network, weightMap, input, c_, {1, 1}, 1, lname + ".cv1"); auto cv2 = convBnSiLU(network, weightMap, input, c_, {1, 1}, 1, lname + ".cv2"); nvinfer1::ITensor* y1 = cv1->getOutput(0); for (int i = 0; i < n; i++) { auto b = bottleneck(network, weightMap, *y1, c_, c_, shortcut, k1, k2, 1.0, lname + ".m." 
+ std::to_string(i)); y1 = b->getOutput(0); } nvinfer1::ITensor* inputTensors[] = {y1, cv2->getOutput(0)}; auto cat = network->addConcatenation(inputTensors, 2); auto cv3 = convBnSiLU(network, weightMap, *cat->getOutput(0), c2, {1, 1}, 1, lname + ".cv3"); return cv3; } nvinfer1::IElementWiseLayer* C3K2(nvinfer1::INetworkDefinition* network, std::map& weightMap, nvinfer1::ITensor& input, int c1, int c2, int n, bool c3k, bool shortcut, float e, std::string lname) { int c_ = (float)c2 * e; nvinfer1::IElementWiseLayer* conv1 = convBnSiLU(network, weightMap, input, 2 * c_, {1, 1}, 1, lname + ".cv1"); nvinfer1::Dims d = conv1->getOutput(0)->getDimensions(); nvinfer1::ISliceLayer* split1 = network->addSlice(*conv1->getOutput(0), nvinfer1::Dims4{0, 0, 0, 0}, nvinfer1::Dims4{d.d[0], d.d[1] / 2, d.d[2], d.d[3]}, nvinfer1::Dims4{1, 1, 1, 1}); nvinfer1::ISliceLayer* split2 = network->addSlice(*conv1->getOutput(0), nvinfer1::Dims4{0, d.d[1] / 2, 0, 0}, nvinfer1::Dims4{d.d[0], d.d[1] / 2, d.d[2], d.d[3]}, nvinfer1::Dims4{1, 1, 1, 1}); nvinfer1::ITensor* inputTensor0[] = {split1->getOutput(0), split2->getOutput(0)}; nvinfer1::IConcatenationLayer* cat = network->addConcatenation(inputTensor0, 2); nvinfer1::ITensor* y1 = split2->getOutput(0); for (int i = 0; i < n; i++) { nvinfer1::ILayer* b; if (c3k) { b = C3k(network, weightMap, *y1, c_, c_, 2, shortcut, {3, 3}, {3, 3}, 0.5, lname + ".m." + std::to_string(i)); } else { b = bottleneck(network, weightMap, *y1, c_, c_, shortcut, {3, 3}, {3, 3}, 0.5, lname + ".m." 
+ std::to_string(i)); } y1 = b->getOutput(0); nvinfer1::ITensor* inputTensors[] = {cat->getOutput(0), b->getOutput(0)}; cat = network->addConcatenation(inputTensors, 2); } nvinfer1::IElementWiseLayer* conv2 = convBnSiLU(network, weightMap, *cat->getOutput(0), c2, {1, 1}, 1, lname + ".cv2"); return conv2; } static nvinfer1::ILayer* convBn(nvinfer1::INetworkDefinition* network, std::map weightMap, nvinfer1::ITensor& input, int ch, int k, int s, std::string lname, int g = 1) { nvinfer1::Weights bias_empty{nvinfer1::DataType::kFLOAT, nullptr, 0}; nvinfer1::IConvolutionLayer* conv; if (lname.find(".pe") != std::string::npos) { nvinfer1::Weights conv_bias = weightMap[lname + ".conv.bias"]; conv = network->addConvolutionNd(input, ch, nvinfer1::DimsHW{k, k}, weightMap[lname + ".conv.weight"], conv_bias); assert(conv); conv->setStrideNd(nvinfer1::DimsHW{s, s}); int p = k / 2; conv->setPaddingNd(nvinfer1::DimsHW{p, p}); conv->setNbGroups(g); conv->setName((lname + ".conv").c_str()); nvinfer1::IScaleLayer* bn = addBatchNorm2d(network, weightMap, *conv->getOutput(0), lname + ".bn", 1e-3); bn->setName((lname + ".bn").c_str()); return bn; } else { conv = network->addConvolutionNd(input, ch, nvinfer1::DimsHW{k, k}, weightMap[lname + ".conv.weight"], bias_empty); assert(conv); conv->setStrideNd(nvinfer1::DimsHW{s, s}); int p = k / 2; conv->setPaddingNd(nvinfer1::DimsHW{p, p}); conv->setNbGroups(g); conv->setName((lname + ".conv").c_str()); nvinfer1::IScaleLayer* bn = addBatchNorm2d(network, weightMap, *conv->getOutput(0), lname + ".bn", 1e-3); bn->setName((lname + ".bn").c_str()); return bn; } } static nvinfer1::ILayer* Attention(nvinfer1::INetworkDefinition* network, std::map weightMap, nvinfer1::ITensor& input, int dim, int num_heads, float attn_ratio, std::string lname) { int head_dim = dim / num_heads; int key_dim = head_dim * attn_ratio; float scale = pow(key_dim, -0.5); int nh_kd = key_dim * num_heads; int h = dim + nh_kd * 2; auto d = input.getDimensions(); int B = d.d[0]; 
int H = d.d[2]; int W = d.d[3]; int N = H * W; auto* qkv = convBn(network, weightMap, input, h, 1, 1, lname + ".qkv"); // qkv.view(B, self.num_heads, -1, N) auto shuffle = network->addShuffle(*qkv->getOutput(0)); shuffle->setReshapeDimensions(nvinfer1::Dims4{B, num_heads, -1, N}); // q, k, v = .split([self.key_dim, self.key_dim, self.head_dim], dim=2) auto d1 = shuffle->getOutput(0)->getDimensions(); auto q = network->addSlice(*shuffle->getOutput(0), nvinfer1::Dims4{0, 0, 0, 0}, nvinfer1::Dims4{d1.d[0], d1.d[1], key_dim, d1.d[3]}, nvinfer1::Dims4{1, 1, 1, 1}); auto k = network->addSlice(*shuffle->getOutput(0), nvinfer1::Dims4{0, 0, key_dim, 0}, nvinfer1::Dims4{d1.d[0], d1.d[1], key_dim, d1.d[3]}, nvinfer1::Dims4{1, 1, 1, 1}); auto v = network->addSlice(*shuffle->getOutput(0), nvinfer1::Dims4{0, 0, key_dim * 2, 0}, nvinfer1::Dims4{d1.d[0], d1.d[1], head_dim, d1.d[3]}, nvinfer1::Dims4{1, 1, 1, 1}); // attn = ((q.transpose(-2, -1) @ k) * self.scale) auto qT = network->addShuffle(*q->getOutput(0)); qT->setFirstTranspose(nvinfer1::Permutation{0, 1, 3, 2}); auto matmul = network->addMatrixMultiply(*qT->getOutput(0), nvinfer1::MatrixOperation::kNONE, *k->getOutput(0), nvinfer1::MatrixOperation::kNONE); // There are not many memory leaks, and I will change it when I have time float* scale_val = reinterpret_cast(malloc(sizeof(float) * 1)); scale_val[0] = scale; nvinfer1::Weights s_w{nvinfer1::DataType::kFLOAT, scale_val, 1}; float* shift_val = reinterpret_cast(malloc(sizeof(float) * 1)); shift_val[0] = 0; nvinfer1::Weights sh_w{nvinfer1::DataType::kFLOAT, shift_val, 1}; float* power_val = reinterpret_cast(malloc(sizeof(float) * 1)); power_val[0] = 1; nvinfer1::Weights p_w{nvinfer1::DataType::kFLOAT, power_val, 1}; nvinfer1::IScaleLayer* scaleLayer = network->addScale(*matmul->getOutput(0), nvinfer1::ScaleMode::kUNIFORM, sh_w, s_w, p_w); // attn = attn.softmax(dim=-1) nvinfer1::ISoftMaxLayer* softmax = network->addSoftMax(*scaleLayer->getOutput(0)); softmax->setAxes(1 << 3); 
// x = (v @ attn.transpose(-2, -1)).view(B, -1, H, W) + self.pe(v.reshape(B, -1, H, W)) auto attnT = network->addShuffle(*softmax->getOutput(0)); attnT->setFirstTranspose(nvinfer1::Permutation{0, 1, 3, 2}); auto matmul2 = network->addMatrixMultiply(*v->getOutput(0), nvinfer1::MatrixOperation::kNONE, *attnT->getOutput(0), nvinfer1::MatrixOperation::kNONE); auto reshape = network->addShuffle(*matmul2->getOutput(0)); reshape->setReshapeDimensions(nvinfer1::Dims4{B, -1, H, W}); auto v_reshape = network->addShuffle(*v->getOutput(0)); v_reshape->setReshapeDimensions(nvinfer1::Dims4{B, -1, H, W}); // self.pe = Conv(dim, dim, 3, 1, g=dim, act=False) auto pe = convBn(network, weightMap, *v_reshape->getOutput(0), dim, 3, 1, lname + ".pe", dim); auto sum = network->addElementWise(*reshape->getOutput(0), *pe->getOutput(0), nvinfer1::ElementWiseOperation::kSUM); // x = self.proj(x) // self.proj = Conv(dim, dim, 1, act=False) auto proj = convBn(network, weightMap, *sum->getOutput(0), dim, 1, 1, lname + ".proj"); return proj; } static nvinfer1::ILayer* PSABlock(nvinfer1::INetworkDefinition* network, std::map weightMap, nvinfer1::ITensor& input, int dim, float attn_ratio, int num_heads, bool shortcut, std::string lname) { // x = x + self.attn(x) if self.add else self.attn(x) auto attn = Attention(network, weightMap, input, dim, num_heads, attn_ratio, lname + ".attn"); nvinfer1::ILayer* shortcut_layer = nullptr; if (shortcut) { shortcut_layer = network->addElementWise(input, *attn->getOutput(0), nvinfer1::ElementWiseOperation::kSUM); } else { shortcut_layer = attn; } // self.ffn = nn.Sequential(Conv(c, c * 2, 1), Conv(c * 2, c, 1, act=False)) // x = x + self.ffn(x) if self.add else self.ffn(x) auto ffn0 = convBnSiLU(network, weightMap, *shortcut_layer->getOutput(0), dim * 2, {1, 1}, 1, lname + ".ffn.0"); auto ffn1 = convBn(network, weightMap, *ffn0->getOutput(0), dim, 1, 1, lname + ".ffn.1"); if (shortcut) { return network->addElementWise(*shortcut_layer->getOutput(0), 
*ffn1->getOutput(0), nvinfer1::ElementWiseOperation::kSUM);
    } else {
        return ffn1;
    }
}

/**
 * Adds a C2PSA module: a 1x1 convBnSiLU ("cv1") with 2 * c channels, split in half along the
 * channel axis; n chained PSABlocks ("m.<i>") on the second half; a concatenation of the first
 * half with the PSABlock output; and a 1x1 convBnSiLU ("cv2") with c2 channels.
 *
 * @param c1 input channel count; the hidden width is c = (int)(c1 * e)
 * @param c2 output channel count of cv2
 * @param n  number of chained PSABlocks
 * @param e  hidden-channel ratio
 * @return the cv2 layer
 */
nvinfer1::ILayer* C2PSA(nvinfer1::INetworkDefinition* network, std::map<std::string, nvinfer1::Weights>& weightMap,
                        nvinfer1::ITensor& input, int c1, int c2, int n, float e, std::string lname) {
    assert(network != nullptr);
    int c = c1 * e;

    // Every PSABlock gets c / 64 heads; Attention divides by the head count, so it must not be 0.
    const int num_heads = c / 64;
    assert((n <= 0 || num_heads > 0) && "C2PSA needs at least 64 hidden channels");

    // cv1 branch
    nvinfer1::IElementWiseLayer* conv1 = convBnSiLU(network, weightMap, input, 2 * c, {1, 1}, 1, lname + ".cv1");
    nvinfer1::ITensor* cv1_out = conv1->getOutput(0);

    // Split the output of cv1 into two halves along the channel axis
    nvinfer1::Dims dims = cv1_out->getDimensions();
    const nvinfer1::Dims4 halfSize{dims.d[0], dims.d[1] / 2, dims.d[2], dims.d[3]};
    const nvinfer1::Dims4 unitStride{1, 1, 1, 1};
    nvinfer1::ISliceLayer* split1 = network->addSlice(*cv1_out, nvinfer1::Dims4{0, 0, 0, 0}, halfSize, unitStride);
    nvinfer1::ISliceLayer* split2 =
            network->addSlice(*cv1_out, nvinfer1::Dims4{0, dims.d[1] / 2, 0, 0}, halfSize, unitStride);

    // Chain the PSABlocks on the second half
    nvinfer1::ITensor* y = split2->getOutput(0);
    for (int i = 0; i < n; ++i) {
        auto* psa_block = PSABlock(network, weightMap, *y, c, 0.5, num_heads, true, lname + ".m." + std::to_string(i));
        y = psa_block->getOutput(0);  // feed the next block with this block's output
    }

    // Concatenate the untouched first half with the PSABlock output
    nvinfer1::ITensor* concatInputs[2] = {split1->getOutput(0), y};
    nvinfer1::IConcatenationLayer* cat = network->addConcatenation(concatInputs, 2);

    // cv2 produces the final output
    nvinfer1::IElementWiseLayer* conv2 =
            convBnSiLU(network, weightMap, *cat->getOutput(0), c2, {1, 1}, 1, lname + ".cv2");
    return conv2;
}

/**
 * Adds a depth-wise convolution (groups == ch, no bias), a batch-norm (eps = 1e-3) and a SiLU
 * activation built as bn * sigmoid(bn).
 *
 * @param ch number of output channels and number of groups
 * @param k  kernel size as {k[0], k[1]}; padding is k[i] / 2 on each axis
 * @param s  stride, applied to both spatial axes
 * @return the element-wise product layer (the activated output)
 */
nvinfer1::ILayer* DWConv(nvinfer1::INetworkDefinition* network, std::map<std::string, nvinfer1::Weights> weightMap,
                         nvinfer1::ITensor& input, int ch, std::vector<int> k, int s, std::string lname) {
    nvinfer1::Weights no_bias{nvinfer1::DataType::kFLOAT, nullptr, 0};
    nvinfer1::IConvolutionLayer* conv = network->addConvolutionNd(input, ch, nvinfer1::DimsHW{k[0], k[1]},
                                                                  weightMap[lname + ".conv.weight"], no_bias);
    assert(conv);
    conv->setStrideNd(nvinfer1::DimsHW{s, s});
    conv->setNbGroups(ch);
    // auto pad: half the kernel size on each axis
    conv->setPaddingNd(nvinfer1::DimsHW{k[0] / 2, k[1] / 2});

    nvinfer1::IScaleLayer* bn = addBatchNorm2d(network, weightMap, *conv->getOutput(0), lname + ".bn", 1e-3);
    assert(bn);

    nvinfer1::IActivationLayer* sigmoid =
            network->addActivation(*bn->getOutput(0), nvinfer1::ActivationType::kSIGMOID);
    assert(sigmoid);

    nvinfer1::IElementWiseLayer* ew = network->addElementWise(*bn->getOutput(0), *sigmoid->getOutput(0),
                                                              nvinfer1::ElementWiseOperation::kPROD);
    assert(ew);
    return ew;
}

/**
 * Adds an A2C2f module.
 *
 * A 1x1 convBnSiLU ("cv1") with 2 * c channels, c = (int)(c2 * e), feeds either
 *   - a2 == true:  four chained ABlocks ("m.0.0", "m.0.1", "m.1.0", "m.1.1"); cv1 and the outputs
 *                  of the second and fourth ABlock are concatenated, or
 *   - a2 == false: one C3k ("m.0"); cv1 and the C3k output are concatenated.
 * The concatenation goes through a 1x1 convBnSiLU ("cv2") with c2 channels.
 *
 * The block layout is fixed as described; c1, n, residual and g are not read by this function.
 *
 * @param mlp_ratio, area forwarded to every ABlock
 * @param shortcut        forwarded to C3k
 * @return the cv2 layer
 */
nvinfer1::ILayer* A2C2f(nvinfer1::INetworkDefinition* network, std::map<std::string, nvinfer1::Weights> weightMap,
                        nvinfer1::ITensor& input, int c1, int c2, int n, bool a2, int area, bool residual,
                        float mlp_ratio, float e, int g, bool shortcut, std::string lname) {
    int c = (int)(((float)c2) * e);
    int num_heads = c / 32 * 2;
    //assert(c % 32 == 0 && "c2 should be divisible by 32");

    nvinfer1::IElementWiseLayer* conv1 = convBnSiLU(network, weightMap, input, c * 2, {1, 1}, 1, lname + ".cv1");

    // Tensors that end up in the concatenation, cv1 first.
    std::vector<nvinfer1::ITensor*> catInputs{conv1->getOutput(0)};
    if (a2) {
        nvinfer1::ILayer* ablock1 =
                ABlock(network, weightMap, *conv1->getOutput(0), c, num_heads, mlp_ratio, area, lname + ".m.0.0");
        nvinfer1::ILayer* ablock2 =
                ABlock(network, weightMap, *ablock1->getOutput(0), c, num_heads, mlp_ratio, area, lname + ".m.0.1");
        nvinfer1::ILayer* ablock3 =
                ABlock(network, weightMap, *ablock2->getOutput(0), c, num_heads, mlp_ratio, area, lname + ".m.1.0");
        nvinfer1::ILayer* ablock4 =
                ABlock(network, weightMap, *ablock3->getOutput(0), c, num_heads, mlp_ratio, area, lname + ".m.1.1");
        catInputs.push_back(ablock2->getOutput(0));
        catInputs.push_back(ablock4->getOutput(0));
    } else {
        nvinfer1::ILayer* c3k_ = C3k(network, weightMap, *conv1->getOutput(0), c * 2, c * 2, 2, shortcut, {3, 3},
                                     {3, 3}, 0.5, lname + ".m.0");
        catInputs.push_back(c3k_->getOutput(0));
    }

    nvinfer1::IConcatenationLayer* cat =
            network->addConcatenation(catInputs.data(), static_cast<int>(catInputs.size()));
    nvinfer1::IElementWiseLayer* conv2 =
            convBnSiLU(network, weightMap, *cat->getOutput(0), c2, {1, 1}, 1, lname + ".cv2");
    return conv2;
}

nvinfer1::ILayer* ABlock(nvinfer1::INetworkDefinition* network, std::map weightMap, nvinfer1::ITensor& input,
                         int dim, int num_heads, float mlp_ratio, int area, std::string lname) {
    int mlp_hidden_dim = (int)(dim * mlp_ratio);
    nvinfer1::ILayer* attn = AAttn(network, weightMap, input, dim, num_heads, mlp_ratio, area, lname + ".attn");
    nvinfer1::IElementWiseLayer* sum =
            network->addElementWise(input, *attn->getOutput(0), nvinfer1::ElementWiseOperation::kSUM);
    //mlp
    nvinfer1::IElementWiseLayer* mlp1 =
            convBnSiLU(network, weightMap, *sum->getOutput(0), mlp_hidden_dim * 2, {1, 1}, 1, lname + ".mlp.0");
    nvinfer1::ILayer* mlp2 = convBn(network, weightMap, *mlp1->getOutput(0), dim * 2, 1, 1, lname + ".mlp.1");
    nvinfer1::IElementWiseLayer* sum2 = network->addElementWise(*sum->getOutput(0), *mlp2->getOutput(0),
nvinfer1::ElementWiseOperation::kSUM); return sum2; } nvinfer1::ILayer* AAttn(nvinfer1::INetworkDefinition* network, std::map weightMap, nvinfer1::ITensor& input, int dim, int num_heads, float mlp_ratio, int area, std::string lname) { int head_dim = (int)(dim / num_heads); int all_head_dim = head_dim * num_heads; //TODO: SCALE IS STATIC, CONVERT TO DYNAMIC! float scale = 0.176777; auto dims = input.getDimensions(); int B = dims.d[0]; int C = dims.d[1]; int H = dims.d[2]; int W = dims.d[3]; int N = H * W; auto* qkv = convBn(network, weightMap, input, all_head_dim * 3 * 2, 1, 1, lname + ".qkv"); auto* reshape = network->addShuffle(*qkv->getOutput(0)); reshape->setReshapeDimensions(nvinfer1::Dims3{B, -1, N}); reshape->setSecondTranspose(nvinfer1::Permutation{0, 2, 1}); if (area > 1) { B = B * area; N = (H * W) / area; } auto* reshape1 = network->addShuffle(*reshape->getOutput(0)); reshape1->setReshapeDimensions(nvinfer1::Dims4{B, N, num_heads, head_dim * 3 * 2}); reshape1->setSecondTranspose(nvinfer1::Permutation{0, 2, 3, 1}); nvinfer1::ISliceLayer* q = network->addSlice( *reshape1->getOutput(0), nvinfer1::Dims4{0, 0, 0, 0}, nvinfer1::Dims4{reshape1->getOutput(0)->getDimensions().d[0], reshape1->getOutput(0)->getDimensions().d[1], reshape1->getOutput(0)->getDimensions().d[2] / 3, reshape1->getOutput(0)->getDimensions().d[3]}, nvinfer1::Dims4{1, 1, 1, 1}); nvinfer1::ISliceLayer* k = network->addSlice( *reshape1->getOutput(0), nvinfer1::Dims4{0, 0, reshape1->getOutput(0)->getDimensions().d[2] / 3, 0}, nvinfer1::Dims4{reshape1->getOutput(0)->getDimensions().d[0], reshape1->getOutput(0)->getDimensions().d[1], reshape1->getOutput(0)->getDimensions().d[2] / 3, reshape1->getOutput(0)->getDimensions().d[3]}, nvinfer1::Dims4{1, 1, 1, 1}); nvinfer1::ISliceLayer* v = network->addSlice( *reshape1->getOutput(0), nvinfer1::Dims4{0, 0, 2 * reshape1->getOutput(0)->getDimensions().d[2] / 3, 0}, nvinfer1::Dims4{reshape1->getOutput(0)->getDimensions().d[0], 
reshape1->getOutput(0)->getDimensions().d[1], reshape1->getOutput(0)->getDimensions().d[2] / 3, reshape1->getOutput(0)->getDimensions().d[3]}, nvinfer1::Dims4{1, 1, 1, 1}); auto* qT = network->addShuffle(*q->getOutput(0)); qT->setFirstTranspose(nvinfer1::Permutation{0, 1, 3, 2}); auto matmul = network->addMatrixMultiply(*qT->getOutput(0), nvinfer1::MatrixOperation::kNONE, *k->getOutput(0), nvinfer1::MatrixOperation::kNONE); float* scale_val = reinterpret_cast(malloc(sizeof(float) * 1)); scale_val[0] = scale; nvinfer1::Weights s_w{nvinfer1::DataType::kFLOAT, scale_val, 1}; float* shift_val = reinterpret_cast(malloc(sizeof(float) * 1)); shift_val[0] = 0; nvinfer1::Weights sh_w{nvinfer1::DataType::kFLOAT, shift_val, 1}; float* power_val = reinterpret_cast(malloc(sizeof(float) * 1)); power_val[0] = 1; nvinfer1::Weights p_w{nvinfer1::DataType::kFLOAT, power_val, 1}; nvinfer1::IScaleLayer* mul = network->addScale(*matmul->getOutput(0), nvinfer1::ScaleMode::kUNIFORM, sh_w, s_w, p_w); auto* softmax = network->addSoftMax(*mul->getOutput(0)); softmax->setAxes(1 << 3); auto transpose3 = network->addShuffle(*softmax->getOutput(0)); transpose3->setFirstTranspose(nvinfer1::Permutation{0, 1, 3, 2}); auto matmul1 = network->addMatrixMultiply(*v->getOutput(0), nvinfer1::MatrixOperation::kNONE, *transpose3->getOutput(0), nvinfer1::MatrixOperation::kNONE); auto transpose4 = network->addShuffle(*matmul1->getOutput(0)); transpose4->setFirstTranspose(nvinfer1::Permutation{0, 3, 1, 2}); if (area > 1) { B = B / area; N = N * area; } auto* reshape3 = network->addShuffle(*transpose4->getOutput(0)); reshape3->setReshapeDimensions(nvinfer1::Dims4{B, H, W, -1}); auto* transpose6 = network->addShuffle(*reshape3->getOutput(0)); transpose6->setFirstTranspose(nvinfer1::Permutation{0, 3, 1, 2}); auto transpose5 = network->addShuffle(*v->getOutput(0)); transpose5->setFirstTranspose(nvinfer1::Permutation{0, 3, 1, 2}); auto* reshape4 = network->addShuffle(*transpose5->getOutput(0)); 
reshape4->setReshapeDimensions(nvinfer1::Dims4{B, H, W, C}); //reshape4->setSecondTranspose(nvinfer1::Permutation{0, 3, 1, 2}); auto* transpose7 = network->addShuffle(*reshape4->getOutput(0)); transpose7->setFirstTranspose(nvinfer1::Permutation{0, 3, 1, 2}); auto* pe = convBn(network, weightMap, *transpose7->getOutput(0), all_head_dim * 2, 7, 1, lname + ".pe", all_head_dim * 2); auto* sum = network->addElementWise(*pe->getOutput(0), *transpose6->getOutput(0), nvinfer1::ElementWiseOperation::kSUM); auto* proj = convBn(network, weightMap, *sum->getOutput(0), all_head_dim * 2, 1, 1, lname + ".proj"); return proj; } ================================================ FILE: yolov12/src/model.cpp ================================================ #include #include #include "block.h" //#include "calibrator.h" #include "config.h" #include "model.h" static int get_width(int x, float gw, int max_channels, int divisor = 8) { auto channel = std::min(x, max_channels); channel = int(ceil((channel * gw) / divisor)) * divisor; return channel; } static int get_depth(int x, float gd) { if (x == 1) return 1; int r = round(x * gd); if (x * gd - int(x * gd) == 0.5 && (int(x * gd) % 2) == 0) --r; return std::max(r, 1); } void calculateStrides(nvinfer1::IElementWiseLayer* conv_layers[], int size, int reference_size, int strides[]) { for (int i = 0; i < size; ++i) { nvinfer1::ILayer* layer = conv_layers[i]; nvinfer1::Dims dims = layer->getOutput(0)->getDimensions(); int feature_map_size = dims.d[2]; strides[i] = reference_size / feature_map_size; } } nvinfer1::IHostMemory* buildEngineYolo12Det(nvinfer1::IBuilder* builder, nvinfer1::IBuilderConfig* config, nvinfer1::DataType dt, const std::string& wts_path, float& gd, float& gw, int& max_channels, std::string& type) { std::map weightMap = loadWeights(wts_path); // nvinfer1::INetworkDefinition *network = builder->createNetworkV2(0U); nvinfer1::INetworkDefinition* network = builder->createNetworkV2( 1U << 
static_cast(nvinfer1::NetworkDefinitionCreationFlag::kEXPLICIT_BATCH)); /******************************************************************************************************* ****************************************** YOLO12 INPUT ********************************************** *******************************************************************************************************/ nvinfer1::ITensor* data = network->addInput(kInputTensorName, dt, nvinfer1::Dims4{kBatchSize, 3, kInputH, kInputW}); assert(data); /******************************************************************************************************* ***************************************** YOLO12 BACKBONE ******************************************** *******************************************************************************************************/ nvinfer1::IElementWiseLayer* conv0 = convBnSiLU(network, weightMap, *data, get_width(64, gw, max_channels), {3, 3}, 2, "model.0"); nvinfer1::IElementWiseLayer* conv1 = convBnSiLU(network, weightMap, *conv0->getOutput(0), get_width(128, gw, max_channels), {3, 3}, 2, "model.1"); bool c3k = false; if (type == "m" || type == "l" || type == "x") { c3k = true; } nvinfer1::IElementWiseLayer* conv2 = C3K2(network, weightMap, *conv1->getOutput(0), get_width(128, gw, max_channels), get_width(256, gw, max_channels), get_depth(2, gd), c3k, true, 0.25, "model.2"); nvinfer1::IElementWiseLayer* conv3 = convBnSiLU(network, weightMap, *conv2->getOutput(0), get_width(256, gw, max_channels), {3, 3}, 2, "model.3"); nvinfer1::IElementWiseLayer* conv4 = C3K2(network, weightMap, *conv3->getOutput(0), get_width(256, gw, max_channels), get_width(512, gw, max_channels), get_depth(2, gd), c3k, true, 0.25, "model.4"); nvinfer1::IElementWiseLayer* conv5 = convBnSiLU(network, weightMap, *conv4->getOutput(0), get_width(512, gw, max_channels), {3, 3}, 2, "model.5"); nvinfer1::ILayer* conv6 = A2C2f(network, weightMap, *conv5->getOutput(0), get_width(512, gw, max_channels), 
get_width(512, gw, max_channels), 4, true, 4, true, 2.0, 0.25, 1, true, "model.6"); nvinfer1::IElementWiseLayer* conv7 = convBnSiLU(network, weightMap, *conv6->getOutput(0), get_width(1024, gw, max_channels), {3, 3}, 2, "model.7"); nvinfer1::ILayer* conv8 = A2C2f(network, weightMap, *conv7->getOutput(0), get_width(1024, gw, max_channels), get_width(1024, gw, max_channels), 4, true, 1, true, 2.0, 0.25, 1, true, "model.8"); /******************************************************************************************************* ********************************************* YOLO12 HEAD ******************************************** *******************************************************************************************************/ float scale[] = {1.0, 1.0, 2.0, 2.0}; nvinfer1::IResizeLayer* upsample9 = network->addResize(*conv8->getOutput(0)); assert(upsample9); upsample9->setResizeMode(nvinfer1::ResizeMode::kNEAREST); upsample9->setScales(scale, 4); nvinfer1::ITensor* inputTensors10[] = {upsample9->getOutput(0), conv6->getOutput(0)}; nvinfer1::IConcatenationLayer* cat10 = network->addConcatenation(inputTensors10, 2); nvinfer1::ILayer* conv11 = A2C2f(network, weightMap, *cat10->getOutput(0), get_width(1024, gw, max_channels), get_width(512, gw, max_channels), 4, false, 1, true, 2.0, 0.25, 1, true, "model.11"); nvinfer1::IResizeLayer* upsample12 = network->addResize(*conv11->getOutput(0)); assert(upsample12); upsample12->setResizeMode(nvinfer1::ResizeMode::kNEAREST); upsample12->setScales(scale, 4); nvinfer1::ITensor* inputTensors13[] = {upsample12->getOutput(0), conv4->getOutput(0)}; nvinfer1::IConcatenationLayer* cat13 = network->addConcatenation(inputTensors13, 2); nvinfer1::ILayer* conv14 = A2C2f(network, weightMap, *cat13->getOutput(0), get_width(256, gw, max_channels), get_width(256, gw, max_channels), 4, false, 1, true, 2.0, 0.25, 1, true, "model.14"); nvinfer1::IElementWiseLayer* conv15 = convBnSiLU(network, weightMap, *conv14->getOutput(0), get_width(256, 
gw, max_channels), {3, 3}, 2, "model.15"); nvinfer1::ITensor* inputTensors16[] = {conv15->getOutput(0), conv11->getOutput(0)}; nvinfer1::IConcatenationLayer* cat16 = network->addConcatenation(inputTensors16, 2); nvinfer1::ILayer* conv17 = A2C2f(network, weightMap, *cat16->getOutput(0), get_width(512, gw, max_channels), get_width(512, gw, max_channels), 4, false, 1, true, 2.0, 0.25, 1, true, "model.17"); nvinfer1::IElementWiseLayer* conv18 = convBnSiLU(network, weightMap, *conv17->getOutput(0), get_width(512, gw, max_channels), {3, 3}, 2, "model.18"); nvinfer1::ITensor* inputTensors19[] = {conv18->getOutput(0), conv8->getOutput(0)}; nvinfer1::IConcatenationLayer* cat19 = network->addConcatenation(inputTensors19, 2); nvinfer1::IElementWiseLayer* conv20 = C3K2(network, weightMap, *cat19->getOutput(0), get_width(1024, gw, max_channels), get_width(1024, gw, max_channels), get_depth(2, gd), true, true, 0.5, "model.20"); /******************************************************************************************************* ********************************************* YOLO12 OUTPUT ****************************************** *******************************************************************************************************/ int c2 = std::max(std::max(16, get_width(256, gw, max_channels) / 4), 16 * 4); int c3 = std::max(get_width(256, gw, max_channels), std::min(kNumClass, 100)); // output 0 nvinfer1::IElementWiseLayer* conv21_cv2_0_0 = convBnSiLU(network, weightMap, *conv14->getOutput(0), c2, {3, 3}, 1, "model.21.cv2.0.0"); nvinfer1::IElementWiseLayer* conv21_cv2_0_1 = convBnSiLU(network, weightMap, *conv21_cv2_0_0->getOutput(0), c2, {3, 3}, 1, "model.21.cv2.0.1"); nvinfer1::IConvolutionLayer* conv21_cv2_0_2 = network->addConvolutionNd(*conv21_cv2_0_1->getOutput(0), 64, nvinfer1::DimsHW{1, 1}, weightMap["model.21.cv2.0.2.weight"], weightMap["model.21.cv2.0.2.bias"]); conv21_cv2_0_2->setStrideNd(nvinfer1::DimsHW{1, 1}); conv21_cv2_0_2->setPaddingNd(nvinfer1::DimsHW{0, 
0}); auto* conv21_cv3_0_0_0 = DWConv(network, weightMap, *conv14->getOutput(0), get_width(256, gw, max_channels), {3, 3}, 1, "model.21.cv3.0.0.0"); auto* conv21_cv3_0_0_1 = convBnSiLU(network, weightMap, *conv21_cv3_0_0_0->getOutput(0), c3, {1, 1}, 1, "model.21.cv3.0.0.1"); auto* conv21_cv3_0_1_0 = DWConv(network, weightMap, *conv21_cv3_0_0_1->getOutput(0), c3, {3, 3}, 1, "model.21.cv3.0.1.0"); auto* conv21_cv3_0_1_1 = convBnSiLU(network, weightMap, *conv21_cv3_0_1_0->getOutput(0), c3, {1, 1}, 1, "model.21.cv3.0.1.1"); nvinfer1::IConvolutionLayer* conv21_cv3_0_2 = network->addConvolutionNd(*conv21_cv3_0_1_1->getOutput(0), kNumClass, nvinfer1::DimsHW{1, 1}, weightMap["model.21.cv3.0.2.weight"], weightMap["model.21.cv3.0.2.bias"]); conv21_cv3_0_2->setStrideNd(nvinfer1::DimsHW{1, 1}); conv21_cv3_0_2->setPaddingNd(nvinfer1::DimsHW{0, 0}); nvinfer1::ITensor* inputTensor21_0[] = {conv21_cv2_0_2->getOutput(0), conv21_cv3_0_2->getOutput(0)}; nvinfer1::IConcatenationLayer* cat21_0 = network->addConcatenation(inputTensor21_0, 2); //output 1 nvinfer1::IElementWiseLayer* conv21_cv2_1_0 = convBnSiLU(network, weightMap, *conv17->getOutput(0), c2, {3, 3}, 1, "model.21.cv2.1.0"); nvinfer1::IElementWiseLayer* conv21_cv2_1_1 = convBnSiLU(network, weightMap, *conv21_cv2_1_0->getOutput(0), c2, {3, 3}, 1, "model.21.cv2.1.1"); nvinfer1::IConvolutionLayer* conv21_cv2_1_2 = network->addConvolutionNd(*conv21_cv2_1_1->getOutput(0), 64, nvinfer1::DimsHW{1, 1}, weightMap["model.21.cv2.1.2.weight"], weightMap["model.21.cv2.1.2.bias"]); conv21_cv2_1_2->setStrideNd(nvinfer1::DimsHW{1, 1}); conv21_cv2_1_2->setPaddingNd(nvinfer1::DimsHW{0, 0}); auto* conv21_cv3_1_0_0 = DWConv(network, weightMap, *conv17->getOutput(0), get_width(512, gw, max_channels), {3, 3}, 1, "model.21.cv3.1.0.0"); auto* conv21_cv3_1_0_1 = convBnSiLU(network, weightMap, *conv21_cv3_1_0_0->getOutput(0), c3, {1, 1}, 1, "model.21.cv3.1.0.1"); auto* conv21_cv3_1_1_0 = DWConv(network, weightMap, *conv21_cv3_1_0_1->getOutput(0), c3, 
{3, 3}, 1, "model.21.cv3.1.1.0"); auto* conv21_cv3_1_1_1 = convBnSiLU(network, weightMap, *conv21_cv3_1_1_0->getOutput(0), c3, {1, 1}, 1, "model.21.cv3.1.1.1"); nvinfer1::IConvolutionLayer* conv21_cv3_1_2 = network->addConvolutionNd(*conv21_cv3_1_1_1->getOutput(0), kNumClass, nvinfer1::DimsHW{1, 1}, weightMap["model.21.cv3.1.2.weight"], weightMap["model.21.cv3.1.2.bias"]); conv21_cv3_1_2->setStrideNd(nvinfer1::DimsHW{1, 1}); conv21_cv3_1_2->setPaddingNd(nvinfer1::DimsHW{0, 0}); nvinfer1::ITensor* inputTensor21_1[] = {conv21_cv2_1_2->getOutput(0), conv21_cv3_1_2->getOutput(0)}; nvinfer1::IConcatenationLayer* cat21_1 = network->addConcatenation(inputTensor21_1, 2); //output 2 nvinfer1::IElementWiseLayer* conv21_cv2_2_0 = convBnSiLU(network, weightMap, *conv20->getOutput(0), c2, {3, 3}, 1, "model.21.cv2.2.0"); nvinfer1::IElementWiseLayer* conv21_cv2_2_1 = convBnSiLU(network, weightMap, *conv21_cv2_2_0->getOutput(0), c2, {3, 3}, 1, "model.21.cv2.2.1"); nvinfer1::IConvolutionLayer* conv21_cv2_2_2 = network->addConvolutionNd(*conv21_cv2_2_1->getOutput(0), 64, nvinfer1::DimsHW{1, 1}, weightMap["model.21.cv2.2.2.weight"], weightMap["model.21.cv2.2.2.bias"]); conv21_cv2_2_2->setStrideNd(nvinfer1::DimsHW{1, 1}); conv21_cv2_2_2->setPaddingNd(nvinfer1::DimsHW{0, 0}); auto* conv21_cv3_2_0_0 = DWConv(network, weightMap, *conv20->getOutput(0), get_width(1024, gw, max_channels), {3, 3}, 1, "model.21.cv3.2.0.0"); auto* conv21_cv3_2_0_1 = convBnSiLU(network, weightMap, *conv21_cv3_2_0_0->getOutput(0), c3, {1, 1}, 1, "model.21.cv3.2.0.1"); auto* conv21_cv3_2_1_0 = DWConv(network, weightMap, *conv21_cv3_2_0_1->getOutput(0), c3, {3, 3}, 1, "model.21.cv3.2.1.0"); auto* conv21_cv3_2_1_1 = convBnSiLU(network, weightMap, *conv21_cv3_2_1_0->getOutput(0), c3, {1, 1}, 1, "model.21.cv3.2.1.1"); nvinfer1::IConvolutionLayer* conv21_cv3_2_2 = network->addConvolutionNd(*conv21_cv3_2_1_1->getOutput(0), kNumClass, nvinfer1::DimsHW{1, 1}, weightMap["model.21.cv3.2.2.weight"], 
weightMap["model.21.cv3.2.2.bias"]); conv21_cv3_2_2->setStrideNd(nvinfer1::DimsHW{1, 1}); conv21_cv3_2_2->setPaddingNd(nvinfer1::DimsHW{0, 0}); nvinfer1::ITensor* inputTensor21_2[] = {conv21_cv2_2_2->getOutput(0), conv21_cv3_2_2->getOutput(0)}; nvinfer1::IConcatenationLayer* cat21_2 = network->addConcatenation(inputTensor21_2, 2); /******************************************************************************************************* ********************************************* YOLO12 DETECT ****************************************** *******************************************************************************************************/ nvinfer1::IElementWiseLayer* conv_layers[] = {conv3, conv5, conv7}; int strides[sizeof(conv_layers) / sizeof(conv_layers[0])]; calculateStrides(conv_layers, sizeof(conv_layers) / sizeof(conv_layers[0]), kInputH, strides); int stridesLength = sizeof(strides) / sizeof(int); nvinfer1::IShuffleLayer* shuffle21_0 = network->addShuffle(*cat21_0->getOutput(0)); shuffle21_0->setReshapeDimensions( nvinfer1::Dims3{kBatchSize, 64 + kNumClass, (kInputH / strides[0]) * (kInputW / strides[0])}); nvinfer1::ISliceLayer* split21_0_0 = network->addSlice( *shuffle21_0->getOutput(0), nvinfer1::Dims3{0, 0, 0}, nvinfer1::Dims3{kBatchSize, 64, (kInputH / strides[0]) * (kInputW / strides[0])}, nvinfer1::Dims3{1, 1, 1}); nvinfer1::ISliceLayer* split21_0_1 = network->addSlice(*shuffle21_0->getOutput(0), nvinfer1::Dims3{0, 64, 0}, nvinfer1::Dims3{kBatchSize, kNumClass, (kInputH / strides[0]) * (kInputW / strides[0])}, nvinfer1::Dims3{1, 1, 1}); nvinfer1::IShuffleLayer* dfl21_0 = DFL(network, weightMap, *split21_0_0->getOutput(0), 4, (kInputH / strides[0]) * (kInputW / strides[0]), 1, 1, 0, "model.21.dfl.conv.weight"); nvinfer1::ITensor* inputTensor22_dfl_0[] = {dfl21_0->getOutput(0), split21_0_1->getOutput(0)}; nvinfer1::IConcatenationLayer* cat22_dfl_0 = network->addConcatenation(inputTensor22_dfl_0, 2); cat22_dfl_0->setAxis(1); nvinfer1::IShuffleLayer* 
shuffle21_1 = network->addShuffle(*cat21_1->getOutput(0)); shuffle21_1->setReshapeDimensions( nvinfer1::Dims3{kBatchSize, 64 + kNumClass, (kInputH / strides[1]) * (kInputW / strides[1])}); nvinfer1::ISliceLayer* split21_1_0 = network->addSlice( *shuffle21_1->getOutput(0), nvinfer1::Dims3{0, 0, 0}, nvinfer1::Dims3{kBatchSize, 64, (kInputH / strides[1]) * (kInputW / strides[1])}, nvinfer1::Dims3{1, 1, 1}); nvinfer1::ISliceLayer* split21_1_1 = network->addSlice(*shuffle21_1->getOutput(0), nvinfer1::Dims3{0, 64, 0}, nvinfer1::Dims3{kBatchSize, kNumClass, (kInputH / strides[1]) * (kInputW / strides[1])}, nvinfer1::Dims3{1, 1, 1}); nvinfer1::IShuffleLayer* dfl21_1 = DFL(network, weightMap, *split21_1_0->getOutput(0), 4, (kInputH / strides[1]) * (kInputW / strides[1]), 1, 1, 0, "model.21.dfl.conv.weight"); nvinfer1::ITensor* inputTensor22_dfl_1[] = {dfl21_1->getOutput(0), split21_1_1->getOutput(0)}; nvinfer1::IConcatenationLayer* cat22_dfl_1 = network->addConcatenation(inputTensor22_dfl_1, 2); cat22_dfl_1->setAxis(1); nvinfer1::IShuffleLayer* shuffle21_2 = network->addShuffle(*cat21_2->getOutput(0)); shuffle21_2->setReshapeDimensions( nvinfer1::Dims3{kBatchSize, 64 + kNumClass, (kInputH / strides[2]) * (kInputW / strides[2])}); nvinfer1::ISliceLayer* split21_2_0 = network->addSlice( *shuffle21_2->getOutput(0), nvinfer1::Dims3{0, 0, 0}, nvinfer1::Dims3{kBatchSize, 64, (kInputH / strides[2]) * (kInputW / strides[2])}, nvinfer1::Dims3{1, 1, 1}); nvinfer1::ISliceLayer* split21_2_1 = network->addSlice(*shuffle21_2->getOutput(0), nvinfer1::Dims3{0, 64, 0}, nvinfer1::Dims3{kBatchSize, kNumClass, (kInputH / strides[2]) * (kInputW / strides[2])}, nvinfer1::Dims3{1, 1, 1}); nvinfer1::IShuffleLayer* dfl21_2 = DFL(network, weightMap, *split21_2_0->getOutput(0), 4, (kInputH / strides[2]) * (kInputW / strides[2]), 1, 1, 0, "model.21.dfl.conv.weight"); nvinfer1::ITensor* inputTensor22_dfl_2[] = {dfl21_2->getOutput(0), split21_2_1->getOutput(0)}; nvinfer1::IConcatenationLayer* 
cat22_dfl_2 = network->addConcatenation(inputTensor22_dfl_2, 2);
    cat22_dfl_2->setAxis(1);

    nvinfer1::IPluginV2Layer* yolo =
            addYoLoLayer(network, std::vector{cat22_dfl_0, cat22_dfl_1, cat22_dfl_2}, strides, stridesLength, false,
                         false, false);

    yolo->getOutput(0)->setName(kOutputTensorName);
    network->markOutput(*yolo->getOutput(0));

    // 16 * 2^20 bytes = 16 MiB of builder workspace.
    config->setMemoryPoolLimit(nvinfer1::MemoryPoolType::kWORKSPACE, 16 * (1 << 20));
    config->setFlag(nvinfer1::BuilderFlag::kFP16);

    std::cout << "Building engine, please wait for a while..." << std::endl;
    nvinfer1::IHostMemory* serialized_model = builder->buildSerializedNetwork(*network, *config);
    std::cout << "Build engine successfully!" << std::endl;

    delete network;

    // The weight blobs are released with free() once the engine has been serialized.
    for (auto& mem : weightMap) {
        free((void*)(mem.second.values));
    }
    return serialized_model;
}

================================================ FILE: yolov12/src/postprocess.cpp ================================================
#include "postprocess.h"
#include "utils.h"

// Maps a box from network-input coordinates back to pixel coordinates of `img`.
//
// bbox is read as {left, top, right, bottom}. The code undoes a uniform scale by
// min(kInputW / img.cols, kInputH / img.rows) and removes the symmetric padding along
// the other axis. The result is clamped so the rectangle stays inside the image and
// never has a negative width or height.
cv::Rect get_rect(cv::Mat& img, float bbox[4]) {
    float l, r, t, b;
    float r_w = kInputW / (img.cols * 1.0);
    float r_h = kInputH / (img.rows * 1.0);
    if (r_h > r_w) {
        // Width is the limiting side: padding was added above/below.
        l = bbox[0];
        r = bbox[2];
        t = bbox[1] - (kInputH - r_w * img.rows) / 2;
        b = bbox[3] - (kInputH - r_w * img.rows) / 2;
        l = l / r_w;
        r = r / r_w;
        t = t / r_w;
        b = b / r_w;
    } else {
        // Height is the limiting side: padding was added left/right.
        l = bbox[0] - (kInputW - r_h * img.cols) / 2;
        r = bbox[2] - (kInputW - r_h * img.cols) / 2;
        t = bbox[1];
        b = bbox[3];
        l = l / r_h;
        r = r / r_h;
        t = t / r_h;
        b = b / r_h;
    }
    l = std::max(0.0f, l);
    t = std::max(0.0f, t);
    int width = std::max(0, std::min(int(round(r - l)), img.cols - int(round(l))));
    int height = std::max(0, std::min(int(round(b - t)), img.rows - int(round(t))));

    return cv::Rect(int(round(l)), int(round(t)), width, height);
}

// Same mapping as get_rect, but relative to the OBB input size (kObbInputW x kObbInputH).
cv::Rect get_rect_obb(cv::Mat& img, float bbox[4]) {
    float l, r, t, b;
    float r_w = kObbInputW / (img.cols * 1.0);
    float r_h = kObbInputH / (img.rows * 1.0);
    if (r_h > r_w) {
        l = bbox[0];
        r = bbox[2];
        t = bbox[1] - (kObbInputH - r_w * img.rows)
/ 2;
        b = bbox[3] - (kObbInputH - r_w * img.rows) / 2;
        l = l / r_w;
        r = r / r_w;
        t = t / r_w;
        b = b / r_w;
    } else {
        // Height is the limiting side: padding was added left/right.
        l = bbox[0] - (kObbInputW - r_h * img.cols) / 2;
        r = bbox[2] - (kObbInputW - r_h * img.cols) / 2;
        t = bbox[1];
        b = bbox[3];
        l = l / r_h;
        r = r / r_h;
        t = t / r_h;
        b = b / r_h;
    }
    l = std::max(0.0f, l);
    t = std::max(0.0f, t);
    int width = std::max(0, std::min(int(round(r - l)), img.cols - int(round(l))));
    int height = std::max(0, std::min(int(round(b - t)), img.rows - int(round(t))));

    return cv::Rect(int(round(l)), int(round(t)), width, height);
}

// Maps a box AND its landmarks from network-input coordinates back to `img` pixels.
//
// bbox is read as {left, top, right, bottom}. lmk holds kNumberOfPoints triples and is
// rewritten IN PLACE: element 0 of each triple is treated as x, element 1 as y, and
// element 2 is left untouched. Returns the clamped box, like get_rect.
cv::Rect get_rect_adapt_landmark(cv::Mat& img, float bbox[4], float lmk[kNumberOfPoints * 3]) {
    float l, r, t, b;
    float r_w = kInputW / (img.cols * 1.0);
    float r_h = kInputH / (img.rows * 1.0);
    if (r_h > r_w) {
        // Width is the limiting side: remove the vertical padding, then undo the scale.
        l = bbox[0] / r_w;
        r = bbox[2] / r_w;
        t = (bbox[1] - (kInputH - r_w * img.rows) / 2) / r_w;
        b = (bbox[3] - (kInputH - r_w * img.rows) / 2) / r_w;
        for (int i = 0; i < kNumberOfPoints * 3; i += 3) {
            lmk[i] /= r_w;
            lmk[i + 1] = (lmk[i + 1] - (kInputH - r_w * img.rows) / 2) / r_w;
            // lmk[i + 2]
        }
    } else {
        // Height is the limiting side: remove the horizontal padding, then undo the scale.
        l = (bbox[0] - (kInputW - r_h * img.cols) / 2) / r_h;
        r = (bbox[2] - (kInputW - r_h * img.cols) / 2) / r_h;
        t = bbox[1] / r_h;
        b = bbox[3] / r_h;
        for (int i = 0; i < kNumberOfPoints * 3; i += 3) {
            lmk[i] = (lmk[i] - (kInputW - r_h * img.cols) / 2) / r_h;
            lmk[i + 1] /= r_h;
            // lmk[i + 2]
        }
    }
    l = std::max(0.0f, l);
    t = std::max(0.0f, t);
    int width = std::max(0, std::min(int(round(r - l)), img.cols - int(round(l))));
    int height = std::max(0, std::min(int(round(b - t)), img.rows - int(round(t))));

    return cv::Rect(int(round(l)), int(round(t)), width, height);
}

// Intersection-over-union of two axis-aligned boxes.
// Both boxes are indexed as [0]=x1, [1]=y1, [2]=x2, [3]=y2.
static float iou(float lbox[4], float rbox[4]) {
    // Intersection stored as {left, right, top, bottom}.
    float interBox[] = {
            (std::max)(lbox[0], rbox[0]),
            (std::min)(lbox[2], rbox[2]),
            (std::max)(lbox[1], rbox[1]),
            (std::min)(lbox[3], rbox[3]),
    };

    // Empty intersection: top below bottom or left beyond right.
    if (interBox[2] > interBox[3] || interBox[0] > interBox[1])
        return 0.0f;

    float interBoxS = (interBox[1] - interBox[0]) * (interBox[3] - interBox[2]);
    float unionBoxS
= (lbox[2] - lbox[0]) * (lbox[3] - lbox[1]) + (rbox[2] - rbox[0]) * (rbox[3] - rbox[1]) - interBoxS; return interBoxS / unionBoxS; } static bool cmp(const Detection& a, const Detection& b) { if (a.conf == b.conf) { return a.bbox[0] < b.bbox[0]; } return a.conf > b.conf; } void nms(std::vector& res, float* output, float conf_thresh, float nms_thresh) { int det_size = sizeof(Detection) / sizeof(float); std::map> m; for (int i = 0; i < output[0]; i++) { if (output[1 + det_size * i + 4] <= conf_thresh || isnan(output[1 + det_size * i + 4])) continue; Detection det; memcpy(&det, &output[1 + det_size * i], det_size * sizeof(float)); if (m.count(det.class_id) == 0) m.emplace(det.class_id, std::vector()); m[det.class_id].push_back(det); } for (auto it = m.begin(); it != m.end(); it++) { auto& dets = it->second; std::sort(dets.begin(), dets.end(), cmp); for (size_t m = 0; m < dets.size(); ++m) { auto& item = dets[m]; res.push_back(item); for (size_t n = m + 1; n < dets.size(); ++n) { if (iou(item.bbox, dets[n].bbox) > nms_thresh) { dets.erase(dets.begin() + n); --n; } } } } } void batch_nms(std::vector>& res_batch, float* output, int batch_size, int output_size, float conf_thresh, float nms_thresh) { res_batch.resize(batch_size); for (int i = 0; i < batch_size; i++) { nms(res_batch[i], &output[i * output_size], conf_thresh, nms_thresh); } } void process_decode_ptr_host(std::vector& res, const float* decode_ptr_host, int bbox_element, cv::Mat& img, int count) { Detection det; for (int i = 0; i < count; i++) { int basic_pos = 1 + i * bbox_element; int keep_flag = decode_ptr_host[basic_pos + 6]; if (keep_flag == 1) { det.bbox[0] = decode_ptr_host[basic_pos + 0]; det.bbox[1] = decode_ptr_host[basic_pos + 1]; det.bbox[2] = decode_ptr_host[basic_pos + 2]; det.bbox[3] = decode_ptr_host[basic_pos + 3]; det.conf = decode_ptr_host[basic_pos + 4]; det.class_id = decode_ptr_host[basic_pos + 5]; res.push_back(det); } } } void batch_process(std::vector>& res_batch, const float* 
decode_ptr_host, int batch_size, int bbox_element, const std::vector& img_batch) { res_batch.resize(batch_size); int count = static_cast(*decode_ptr_host); count = std::min(count, kMaxNumOutputBbox); for (int i = 0; i < batch_size; i++) { auto& img = const_cast(img_batch[i]); process_decode_ptr_host(res_batch[i], &decode_ptr_host[i * count], bbox_element, img, count); } } void draw_bbox(std::vector& img_batch, std::vector>& res_batch) { for (size_t i = 0; i < img_batch.size(); i++) { auto& res = res_batch[i]; cv::Mat img = img_batch[i]; for (size_t j = 0; j < res.size(); j++) { cv::Rect r = get_rect(img, res[j].bbox); cv::rectangle(img, r, cv::Scalar(0x27, 0xC1, 0x36), 2); cv::putText(img, std::to_string((int)res[j].class_id), cv::Point(r.x, r.y - 1), cv::FONT_HERSHEY_PLAIN, 1.2, cv::Scalar(0xFF, 0xFF, 0xFF), 2); } } } void draw_bbox_keypoints_line(std::vector& img_batch, std::vector>& res_batch) { const std::vector> skeleton_pairs = { {0, 1}, {0, 2}, {0, 5}, {0, 6}, {1, 2}, {1, 3}, {2, 4}, {5, 6}, {5, 7}, {5, 11}, {6, 8}, {6, 12}, {7, 9}, {8, 10}, {11, 12}, {11, 13}, {12, 14}, {13, 15}, {14, 16}}; for (size_t i = 0; i < img_batch.size(); i++) { auto& res = res_batch[i]; cv::Mat img = img_batch[i]; for (size_t j = 0; j < res.size(); j++) { cv::Rect r = get_rect_adapt_landmark(img, res[j].bbox, res[j].keypoints); cv::rectangle(img, r, cv::Scalar(0x27, 0xC1, 0x36), 2); cv::putText(img, std::to_string((int)res[j].class_id), cv::Point(r.x, r.y - 1), cv::FONT_HERSHEY_PLAIN, 1.2, cv::Scalar(0xFF, 0xFF, 0xFF), 2); for (int k = 0; k < kNumberOfPoints * 3; k += 3) { if (res[j].keypoints[k + 2] > 0.5) { cv::circle(img, cv::Point((int)res[j].keypoints[k], (int)res[j].keypoints[k + 1]), 3, cv::Scalar(0, 0x27, 0xC1), -1); } } for (const auto& bone : skeleton_pairs) { int kp1_idx = bone.first * 3; int kp2_idx = bone.second * 3; if (res[j].keypoints[kp1_idx + 2] > 0.5 && res[j].keypoints[kp2_idx + 2] > 0.5) { cv::Point p1((int)res[j].keypoints[kp1_idx], 
(int)res[j].keypoints[kp1_idx + 1]); cv::Point p2((int)res[j].keypoints[kp2_idx], (int)res[j].keypoints[kp2_idx + 1]); cv::line(img, p1, p2, cv::Scalar(0, 0x27, 0xC1), 2); } } } } } cv::Mat scale_mask(cv::Mat mask, cv::Mat img) { int x, y, w, h; float r_w = kInputW / (img.cols * 1.0); float r_h = kInputH / (img.rows * 1.0); if (r_h > r_w) { w = kInputW; h = r_w * img.rows; x = 0; y = (kInputH - h) / 2; } else { w = r_h * img.cols; h = kInputH; x = (kInputW - w) / 2; y = 0; } cv::Rect r(x, y, w, h); cv::Mat res; cv::resize(mask(r), res, img.size()); return res; } void draw_mask_bbox(cv::Mat& img, std::vector& dets, std::vector& masks, std::unordered_map& labels_map) { static std::vector colors = {0xFF3838, 0xFF9D97, 0xFF701F, 0xFFB21D, 0xCFD231, 0x48F90A, 0x92CC17, 0x3DDB86, 0x1A9334, 0x00D4BB, 0x2C99A8, 0x00C2FF, 0x344593, 0x6473FF, 0x0018EC, 0x8438FF, 0x520085, 0xCB38FF, 0xFF95C8, 0xFF37C7}; for (size_t i = 0; i < dets.size(); i++) { cv::Mat img_mask = scale_mask(masks[i], img); auto color = colors[(int)dets[i].class_id % colors.size()]; auto bgr = cv::Scalar(color & 0xFF, color >> 8 & 0xFF, color >> 16 & 0xFF); cv::Rect r = get_rect(img, dets[i].bbox); for (int x = r.x; x < r.x + r.width; x++) { for (int y = r.y; y < r.y + r.height; y++) { float val = img_mask.at(y, x); if (val <= 0.5) continue; img.at(y, x)[0] = img.at(y, x)[0] / 2 + bgr[0] / 2; img.at(y, x)[1] = img.at(y, x)[1] / 2 + bgr[1] / 2; img.at(y, x)[2] = img.at(y, x)[2] / 2 + bgr[2] / 2; } } cv::rectangle(img, r, bgr, 2); // Get the size of the text cv::Size textSize = cv::getTextSize(labels_map[(int)dets[i].class_id] + " " + to_string_with_precision(dets[i].conf), cv::FONT_HERSHEY_PLAIN, 1.2, 2, NULL); // Set the top left corner of the rectangle cv::Point topLeft(r.x, r.y - textSize.height); // Set the bottom right corner of the rectangle cv::Point bottomRight(r.x + textSize.width, r.y + textSize.height); // Set the thickness of the rectangle lines int lineThickness = 2; // Draw the rectangle on the 
image cv::rectangle(img, topLeft, bottomRight, bgr, -1); cv::putText(img, labels_map[(int)dets[i].class_id] + " " + to_string_with_precision(dets[i].conf), cv::Point(r.x, r.y + 4), cv::FONT_HERSHEY_PLAIN, 1.2, cv::Scalar::all(0xFF), 2); } } void process_decode_ptr_host_obb(std::vector& res, const float* decode_ptr_host, int bbox_element, cv::Mat& img, int count) { Detection det; for (int i = 0; i < count; i++) { int basic_pos = 1 + i * bbox_element; int keep_flag = decode_ptr_host[basic_pos + 6]; if (keep_flag == 1) { det.bbox[0] = decode_ptr_host[basic_pos + 0]; det.bbox[1] = decode_ptr_host[basic_pos + 1]; det.bbox[2] = decode_ptr_host[basic_pos + 2]; det.bbox[3] = decode_ptr_host[basic_pos + 3]; det.conf = decode_ptr_host[basic_pos + 4]; det.class_id = decode_ptr_host[basic_pos + 5]; det.angle = decode_ptr_host[basic_pos + 7]; res.push_back(det); } } } void batch_process_obb(std::vector>& res_batch, const float* decode_ptr_host, int batch_size, int bbox_element, const std::vector& img_batch) { res_batch.resize(batch_size); int count = static_cast(*decode_ptr_host); count = std::min(count, kMaxNumOutputBbox); for (int i = 0; i < batch_size; i++) { auto& img = const_cast(img_batch[i]); process_decode_ptr_host_obb(res_batch[i], &decode_ptr_host[i * count], bbox_element, img, count); } } std::tuple convariance_matrix(Detection res) { float w = res.bbox[2]; float h = res.bbox[3]; float a = w * w / 12.0; float b = h * h / 12.0; float c = res.angle; float cos_r = std::cos(c); float sin_r = std::sin(c); float cos_r2 = cos_r * cos_r; float sin_r2 = sin_r * sin_r; float a_val = a * cos_r2 + b * sin_r2; float b_val = a * sin_r2 + b * cos_r2; float c_val = (a - b) * cos_r * sin_r; return std::make_tuple(a_val, b_val, c_val); } static float probiou(const Detection& res1, const Detection& res2, float eps = 1e-7) { // Calculate the prob iou between oriented bounding boxes, https://arxiv.org/pdf/2106.06072v1.pdf. 
float a1, b1, c1, a2, b2, c2; std::tuple matrix1 = {a1, b1, c1}; std::tuple matrix2 = {a2, b2, c2}; matrix1 = convariance_matrix(res1); matrix2 = convariance_matrix(res2); a1 = std::get<0>(matrix1); b1 = std::get<1>(matrix1); c1 = std::get<2>(matrix1); a2 = std::get<0>(matrix2); b2 = std::get<1>(matrix2); c2 = std::get<2>(matrix2); float x1 = res1.bbox[0], y1 = res1.bbox[1]; float x2 = res2.bbox[0], y2 = res2.bbox[1]; float t1 = ((a1 + a2) * std::pow(y1 - y2, 2) + (b1 + b2) * std::pow(x1 - x2, 2)) / ((a1 + a2) * (b1 + b2) - std::pow(c1 + c2, 2) + eps); float t2 = ((c1 + c2) * (x2 - x1) * (y1 - y2)) / ((a1 + a2) * (b1 + b2) - std::pow(c1 + c2, 2) + eps); float t3 = std::log( ((a1 + a2) * (b1 + b2) - std::pow(c1 + c2, 2)) / (4 * std::sqrt(std::max(a1 * b1 - c1 * c1, 0.0f)) * std::sqrt(std::max(a2 * b2 - c2 * c2, 0.0f)) + eps) + eps); float bd = 0.25f * t1 + 0.5f * t2 + 0.5f * t3; bd = std::max(std::min(bd, 100.0f), eps); float hd = std::sqrt(1.0 - std::exp(-bd) + eps); return 1 - hd; } void nms_obb(std::vector& res, float* output, float conf_thresh, float nms_thresh) { int det_size = sizeof(Detection) / sizeof(float); std::map> m; for (int i = 0; i < output[0]; i++) { if (output[1 + det_size * i + 4] <= conf_thresh) continue; Detection det; memcpy(&det, &output[1 + det_size * i], det_size * sizeof(float)); if (m.count(det.class_id) == 0) m.emplace(det.class_id, std::vector()); m[det.class_id].push_back(det); } for (auto it = m.begin(); it != m.end(); it++) { auto& dets = it->second; std::sort(dets.begin(), dets.end(), cmp); for (size_t m = 0; m < dets.size(); ++m) { auto& item = dets[m]; res.push_back(item); for (size_t n = m + 1; n < dets.size(); ++n) { if (probiou(item, dets[n]) >= nms_thresh) { dets.erase(dets.begin() + n); --n; } } } } } void batch_nms_obb(std::vector>& res_batch, float* output, int batch_size, int output_size, float conf_thresh, float nms_thresh) { res_batch.resize(batch_size); for (int i = 0; i < batch_size; i++) { nms_obb(res_batch[i], 
&output[i * output_size], conf_thresh, nms_thresh); } } static std::vector get_corner(cv::Mat& img, const Detection& box) { float cos_value, sin_value; // Calculate center point and width/height float x1 = box.bbox[0]; float y1 = box.bbox[1]; float w = box.bbox[2]; float h = box.bbox[3]; float angle = box.angle * 180.0f / CV_PI; // Convert radians to degrees // Print original angle std::cout << "Original angle: " << angle << std::endl; // Swap width and height if height is greater than or equal to width if (h >= w) { std::swap(w, h); angle = fmod(angle + 90.0f, 180.0f); // Adjust angle to be within [0, 180) } // Ensure the angle is between 0 and 180 degrees if (angle < 0) { angle += 360.0f; // Convert to positive value } if (angle > 180.0f) { angle -= 180.0f; // Subtract 180 from angles greater than 180 } // Print adjusted angle std::cout << "Adjusted angle: " << angle << std::endl; // Convert to normal angle value float normal_angle = fmod(angle, 180.0f); if (normal_angle < 0) { normal_angle += 180.0f; // Ensure it's a positive value } // Print normal angle value std::cout << "Normal angle: " << normal_angle << std::endl; cos_value = std::cos(angle * CV_PI / 180.0f); // Convert to radians sin_value = std::sin(angle * CV_PI / 180.0f); // Calculate each corner point float l = x1 - w / 2; // Left boundary float r = x1 + w / 2; // Right boundary float t = y1 - h / 2; // Top boundary float b = y1 + h / 2; // Bottom boundary // Use get_rect function to scale the coordinates float bbox[4] = {l, t, r, b}; cv::Rect rect = get_rect_obb(img, bbox); float x_ = (rect.x + rect.x + rect.width) / 2; // Center x float y_ = (rect.y + rect.y + rect.height) / 2; // Center y float width = rect.width; // Width float height = rect.height; // Height // Calculate each corner point std::vector corner_points(4); float vec1x = width / 2 * cos_value; float vec1y = width / 2 * sin_value; float vec2x = -height / 2 * sin_value; float vec2y = height / 2 * cos_value; corner_points[0] = 
cv::Point(int(round(x_ + vec1x + vec2x)), int(round(y_ + vec1y + vec2y))); // Top-left corner corner_points[1] = cv::Point(int(round(x_ + vec1x - vec2x)), int(round(y_ + vec1y - vec2y))); // Top-right corner corner_points[2] = cv::Point(int(round(x_ - vec1x - vec2x)), int(round(y_ - vec1y - vec2y))); // Bottom-right corner corner_points[3] = cv::Point(int(round(x_ - vec1x + vec2x)), int(round(y_ - vec1y + vec2y))); // Bottom-left corner // Check and adjust corner points to ensure the rectangle is parallel to image boundaries for (auto& point : corner_points) { point.x = std::max(0, std::min(point.x, img.cols - 1)); point.y = std::max(0, std::min(point.y, img.rows - 1)); } return corner_points; } void draw_bbox_obb(std::vector& img_batch, std::vector>& res_batch) { static std::vector colors = {0xFF3838, 0xFF9D97, 0xFF701F, 0xFFB21D, 0xCFD231, 0x48F90A, 0x92CC17, 0x3DDB86, 0x1A9334, 0x00D4BB, 0x2C99A8, 0x00C2FF, 0x344593, 0x6473FF, 0x0018EC, 0x8438FF, 0x520085, 0xCB38FF, 0xFF95C8, 0xFF37C7}; for (size_t i = 0; i < img_batch.size(); i++) { auto& res = res_batch[i]; auto& img = img_batch[i]; for (auto& obj : res) { auto color = colors[(int)obj.class_id % colors.size()]; auto bgr = cv::Scalar(color & 0xFF, color >> 8 & 0xFF, color >> 16 & 0xFF); auto corner_points = get_corner(img, obj); cv::polylines(img, std::vector>{corner_points}, true, bgr, 1); auto text = (std::to_string((int)(obj.class_id)) + ":" + to_string_with_precision(obj.conf)); cv::Size textsize = cv::getTextSize(text, 0, 0.3, 1, nullptr); int width = textsize.width; int height = textsize.height; bool outside = (corner_points[0].y - height >= 3) ? true : false; cv::Point p1(corner_points[0].x, corner_points[0].y), p2; p2.x = corner_points[0].x + width; if (outside) { p2.y = corner_points[0].y - height - 3; } else { p2.y = corner_points[0].y + height + 3; } cv::rectangle(img, p1, p2, bgr, -1, cv::LINE_AA); cv::putText( img, text, cv::Point(corner_points[0].x, (outside ? 
corner_points[0].y - 2 : corner_points[0].y + height + 2)), 0, 0.3, cv::Scalar::all(255), 1, cv::LINE_AA); } } } ================================================ FILE: yolov12/src/postprocess.cu ================================================ // // Created by lindsay on 23-7-17. // #include "postprocess.h" #include "types.h" static __global__ void decode_kernel_obb(float* predict, int num_bboxes, float confidence_threshold, float* parray, int max_objects) { float count = predict[0]; int position = (blockDim.x * blockIdx.x + threadIdx.x); if (position >= count) return; float* pitem = predict + 1 + position * (sizeof(Detection) / sizeof(float)); int index = atomicAdd(parray, 1); if (index >= max_objects) return; float confidence = pitem[4]; if (confidence < confidence_threshold) return; //[center_x center_y w h conf class_id mask[32] keypoints[51] angle] float cx = pitem[0]; float cy = pitem[1]; float width = pitem[2]; float height = pitem[3]; float label = pitem[5]; float angle = pitem[89]; float* pout_item = parray + 1 + index * bbox_element; *pout_item++ = cx; *pout_item++ = cy; *pout_item++ = width; *pout_item++ = height; *pout_item++ = confidence; *pout_item++ = label; *pout_item++ = 1; // 1 = keep, 0 = ignore *pout_item++ = angle; } static __global__ void decode_kernel(float* predict, int num_bboxes, float confidence_threshold, float* parray, int max_objects) { float count = predict[0]; int position = (blockDim.x * blockIdx.x + threadIdx.x); if (position >= count) return; float* pitem = predict + 1 + position * (sizeof(Detection) / sizeof(float)); int index = atomicAdd(parray, 1); if (index >= max_objects) return; float confidence = pitem[4]; if (confidence < confidence_threshold) return; float left = pitem[0]; float top = pitem[1]; float right = pitem[2]; float bottom = pitem[3]; float label = pitem[5]; float* pout_item = parray + 1 + index * bbox_element; *pout_item++ = left; *pout_item++ = top; *pout_item++ = right; *pout_item++ = bottom; *pout_item++ = 
confidence; *pout_item++ = label; *pout_item++ = 1; // 1 = keep, 0 = ignore } static __device__ float box_iou(float aleft, float atop, float aright, float abottom, float bleft, float btop, float bright, float bbottom) { float cleft = max(aleft, bleft); float ctop = max(atop, btop); float cright = min(aright, bright); float cbottom = min(abottom, bbottom); float c_area = max(cright - cleft, 0.0f) * max(cbottom - ctop, 0.0f); if (c_area == 0.0f) return 0.0f; float a_area = max(0.0f, aright - aleft) * max(0.0f, abottom - atop); float b_area = max(0.0f, bright - bleft) * max(0.0f, bbottom - btop); return c_area / (a_area + b_area - c_area); } static __global__ void nms_kernel(float* bboxes, int max_objects, float threshold) { int position = (blockDim.x * blockIdx.x + threadIdx.x); int count = min(static_cast(bboxes[0]), max_objects); if (position >= count) return; float* pcurrent = bboxes + 1 + position * bbox_element; for (int i = 0; i < count; ++i) { float* pitem = bboxes + 1 + i * bbox_element; if (i == position || pcurrent[5] != pitem[5]) continue; if (pitem[4] >= pcurrent[4]) { if (pitem[4] == pcurrent[4] && i < position) continue; float iou = box_iou(pcurrent[0], pcurrent[1], pcurrent[2], pcurrent[3], pitem[0], pitem[1], pitem[2], pitem[3]); if (iou > threshold) { pcurrent[6] = 0; return; } } } } static __device__ void convariance_matrix(float w, float h, float r, float& a, float& b, float& c) { float a_val = w * w / 12.0f; float b_val = h * h / 12.0f; float cos_r = cosf(r); float sin_r = sinf(r); a = a_val * cos_r * cos_r + b_val * sin_r * sin_r; b = a_val * sin_r * sin_r + b_val * cos_r * cos_r; c = (a_val - b_val) * sin_r * cos_r; } static __device__ float box_probiou(float cx1, float cy1, float w1, float h1, float r1, float cx2, float cy2, float w2, float h2, float r2, float eps = 1e-7) { // Calculate the prob iou between oriented bounding boxes, https://arxiv.org/pdf/2106.06072v1.pdf. 
float a1, b1, c1, a2, b2, c2; convariance_matrix(w1, h1, r1, a1, b1, c1); convariance_matrix(w2, h2, r2, a2, b2, c2); float t1 = ((a1 + a2) * powf(cy1 - cy2, 2) + (b1 + b2) * powf(cx1 - cx2, 2)) / ((a1 + a2) * (b1 + b2) - powf(c1 + c2, 2) + eps); float t2 = ((c1 + c2) * (cx2 - cx1) * (cy1 - cy2)) / ((a1 + a2) * (b1 + b2) - powf(c1 + c2, 2) + eps); float t3 = logf(((a1 + a2) * (b1 + b2) - powf(c1 + c2, 2)) / (4 * sqrtf(fmaxf(a1 * b1 - c1 * c1, 0.0f)) * sqrtf(fmaxf(a2 * b2 - c2 * c2, 0.0f)) + eps) + eps); float bd = 0.25f * t1 + 0.5f * t2 + 0.5f * t3; bd = fmaxf(fminf(bd, 100.0f), eps); float hd = sqrtf(1.0f - expf(-bd) + eps); return 1 - hd; } static __global__ void nms_kernel_obb(float* bboxes, int max_objects, float threshold) { int position = (blockDim.x * blockIdx.x + threadIdx.x); int count = min(static_cast(bboxes[0]), max_objects); if (position >= count) return; float* pcurrent = bboxes + 1 + position * bbox_element; for (int i = 0; i < count; ++i) { float* pitem = bboxes + 1 + i * bbox_element; if (i == position || pcurrent[5] != pitem[5]) continue; if (pitem[4] >= pcurrent[4]) { if (pitem[4] == pcurrent[4] && i < position) continue; float iou = box_probiou(pcurrent[0], pcurrent[1], pcurrent[2], pcurrent[3], pcurrent[7], pitem[0], pitem[1], pitem[2], pitem[3], pitem[7]); if (iou > threshold) { pcurrent[6] = 0; return; } } } } void cuda_decode(float* predict, int num_bboxes, float confidence_threshold, float* parray, int max_objects, cudaStream_t stream) { int block = 256; int grid = ceil(num_bboxes / (float)block); decode_kernel<<>>((float*)predict, num_bboxes, confidence_threshold, parray, max_objects); } void cuda_nms(float* parray, float nms_threshold, int max_objects, cudaStream_t stream) { int block = max_objects < 256 ? 
max_objects : 256; int grid = ceil(max_objects / (float)block); nms_kernel<<>>(parray, max_objects, nms_threshold); } void cuda_decode_obb(float* predict, int num_bboxes, float confidence_threshold, float* parray, int max_objects, cudaStream_t stream) { int block = 256; int grid = ceil(num_bboxes / (float)block); decode_kernel_obb<<>>((float*)predict, num_bboxes, confidence_threshold, parray, max_objects); } void cuda_nms_obb(float* parray, float nms_threshold, int max_objects, cudaStream_t stream) { int block = max_objects < 256 ? max_objects : 256; int grid = ceil(max_objects / (float)block); nms_kernel_obb<<>>(parray, max_objects, nms_threshold); } ================================================ FILE: yolov12/src/preprocess.cu ================================================ #include "cuda_utils.h" #include "preprocess.h" static uint8_t* img_buffer_host = nullptr; static uint8_t* img_buffer_device = nullptr; __global__ void warpaffine_kernel(uint8_t* src, int src_line_size, int src_width, int src_height, float* dst, int dst_width, int dst_height, uint8_t const_value_st, AffineMatrix d2s, int edge) { int position = blockDim.x * blockIdx.x + threadIdx.x; if (position >= edge) return; float m_x1 = d2s.value[0]; float m_y1 = d2s.value[1]; float m_z1 = d2s.value[2]; float m_x2 = d2s.value[3]; float m_y2 = d2s.value[4]; float m_z2 = d2s.value[5]; int dx = position % dst_width; int dy = position / dst_width; float src_x = m_x1 * dx + m_y1 * dy + m_z1 + 0.5f; float src_y = m_x2 * dx + m_y2 * dy + m_z2 + 0.5f; float c0, c1, c2; if (src_x <= -1 || src_x >= src_width || src_y <= -1 || src_y >= src_height) { // out of range c0 = const_value_st; c1 = const_value_st; c2 = const_value_st; } else { int y_low = floorf(src_y); int x_low = floorf(src_x); int y_high = y_low + 1; int x_high = x_low + 1; uint8_t const_value[] = {const_value_st, const_value_st, const_value_st}; float ly = src_y - y_low; float lx = src_x - x_low; float hy = 1 - ly; float hx = 1 - lx; float w1 = hy * 
hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx; uint8_t* v1 = const_value; uint8_t* v2 = const_value; uint8_t* v3 = const_value; uint8_t* v4 = const_value; if (y_low >= 0) { if (x_low >= 0) v1 = src + y_low * src_line_size + x_low * 3; if (x_high < src_width) v2 = src + y_low * src_line_size + x_high * 3; } if (y_high < src_height) { if (x_low >= 0) v3 = src + y_high * src_line_size + x_low * 3; if (x_high < src_width) v4 = src + y_high * src_line_size + x_high * 3; } c0 = w1 * v1[0] + w2 * v2[0] + w3 * v3[0] + w4 * v4[0]; c1 = w1 * v1[1] + w2 * v2[1] + w3 * v3[1] + w4 * v4[1]; c2 = w1 * v1[2] + w2 * v2[2] + w3 * v3[2] + w4 * v4[2]; } // bgr to rgb float t = c2; c2 = c0; c0 = t; // normalization c0 = c0 / 255.0f; c1 = c1 / 255.0f; c2 = c2 / 255.0f; // rgbrgbrgb to rrrgggbbb int area = dst_width * dst_height; float* pdst_c0 = dst + dy * dst_width + dx; float* pdst_c1 = pdst_c0 + area; float* pdst_c2 = pdst_c1 + area; *pdst_c0 = c0; *pdst_c1 = c1; *pdst_c2 = c2; } void cuda_preprocess(uint8_t* src, int src_width, int src_height, float* dst, int dst_width, int dst_height, cudaStream_t stream) { int img_size = src_width * src_height * 3; // copy data to pinned memory memcpy(img_buffer_host, src, img_size); // copy data to device memory CUDA_CHECK(cudaMemcpyAsync(img_buffer_device, img_buffer_host, img_size, cudaMemcpyHostToDevice, stream)); AffineMatrix s2d, d2s; float scale = std::min(dst_height / (float)src_height, dst_width / (float)src_width); s2d.value[0] = scale; s2d.value[1] = 0; s2d.value[2] = -scale * src_width * 0.5 + dst_width * 0.5; s2d.value[3] = 0; s2d.value[4] = scale; s2d.value[5] = -scale * src_height * 0.5 + dst_height * 0.5; cv::Mat m2x3_s2d(2, 3, CV_32F, s2d.value); cv::Mat m2x3_d2s(2, 3, CV_32F, d2s.value); cv::invertAffineTransform(m2x3_s2d, m2x3_d2s); memcpy(d2s.value, m2x3_d2s.ptr(0), sizeof(d2s.value)); int jobs = dst_height * dst_width; int threads = 256; int blocks = ceil(jobs / (float)threads); warpaffine_kernel<<>>(img_buffer_device, 
src_width * 3, src_width, src_height, dst, dst_width, dst_height, 128, d2s, jobs); } void cuda_batch_preprocess(std::vector& img_batch, float* dst, int dst_width, int dst_height, cudaStream_t stream) { int dst_size = dst_width * dst_height * 3; for (size_t i = 0; i < img_batch.size(); i++) { cuda_preprocess(img_batch[i].ptr(), img_batch[i].cols, img_batch[i].rows, &dst[dst_size * i], dst_width, dst_height, stream); CUDA_CHECK(cudaStreamSynchronize(stream)); } } void cuda_preprocess_init(int max_image_size) { // prepare input data in pinned memory CUDA_CHECK(cudaMallocHost((void**)&img_buffer_host, max_image_size * 3)); // prepare input data in device memory CUDA_CHECK(cudaMalloc((void**)&img_buffer_device, max_image_size * 3)); } void cuda_preprocess_destroy() { CUDA_CHECK(cudaFree(img_buffer_device)); CUDA_CHECK(cudaFreeHost(img_buffer_host)); } ================================================ FILE: yolov12/yolo12_det.cpp ================================================ #include #include #include #include "cuda_utils.h" #include "logging.h" #include "model.h" #include "postprocess.h" #include "preprocess.h" #include "utils.h" Logger gLogger; using namespace nvinfer1; const int kOutputSize = kMaxNumOutputBbox * sizeof(Detection) / sizeof(float) + 1; void serialize_engine(std::string& wts_name, std::string& engine_name, float& gd, float& gw, int& max_channels, std::string& type) { IBuilder* builder = createInferBuilder(gLogger); IBuilderConfig* config = builder->createBuilderConfig(); IHostMemory* serialized_engine = nullptr; serialized_engine = buildEngineYolo12Det(builder, config, DataType::kFLOAT, wts_name, gd, gw, max_channels, type); assert(serialized_engine); std::ofstream p(engine_name, std::ios::binary); if (!p) { std::cout << "could not open plan output file" << std::endl; assert(false); } p.write(reinterpret_cast(serialized_engine->data()), serialized_engine->size()); delete serialized_engine; delete config; delete builder; } void 
deserialize_engine(std::string& engine_name, IRuntime** runtime, ICudaEngine** engine, IExecutionContext** context) {
    // Loads a serialized engine file and creates the runtime, engine and execution
    // context, returned through the three out-pointers. Every failure is reported with
    // assert(), so the checks disappear when NDEBUG is defined.
    std::ifstream file(engine_name, std::ios::binary);
    if (!file.good()) {
        std::cerr << "read " << engine_name << " error!" << std::endl;
        assert(false);
    }
    // Determine the file size by seeking to the end, then read the whole file.
    size_t size = 0;
    file.seekg(0, file.end);
    size = file.tellg();
    file.seekg(0, file.beg);
    char* serialized_engine = new char[size];
    assert(serialized_engine);
    file.read(serialized_engine, size);
    file.close();

    *runtime = createInferRuntime(gLogger);
    assert(*runtime);
    *engine = (*runtime)->deserializeCudaEngine(serialized_engine, size);
    assert(*engine);
    *context = (*engine)->createExecutionContext();
    assert(*context);
    // The host copy of the plan is released once the engine has been created.
    delete[] serialized_engine;
}

// Allocates the device input/output buffers and the host-side buffers for the selected
// post-processing mode ("c" or "g", compared against cuda_post_process below).
void prepare_buffer(ICudaEngine* engine, float** input_buffer_device, float** output_buffer_device,
                    float** output_buffer_host, float** decode_ptr_host, float** decode_ptr_device,
                    std::string cuda_post_process) {
    assert(engine->getNbBindings() == 2);
    // In order to bind the buffers, we need to know the names of the input and output tensors.
// Note that indices are guaranteed to be less than IEngine::getNbBindings() const int inputIndex = engine->getBindingIndex(kInputTensorName); const int outputIndex = engine->getBindingIndex(kOutputTensorName); assert(inputIndex == 0); assert(outputIndex == 1); // Create GPU buffers on device CUDA_CHECK(cudaMalloc((void**)input_buffer_device, kBatchSize * 3 * kInputH * kInputW * sizeof(float))); CUDA_CHECK(cudaMalloc((void**)output_buffer_device, kBatchSize * kOutputSize * sizeof(float))); if (cuda_post_process == "c") { *output_buffer_host = new float[kBatchSize * kOutputSize]; } else if (cuda_post_process == "g") { if (kBatchSize > 1) { std::cerr << "Do not yet support GPU post processing for multiple batches" << std::endl; exit(0); } // Allocate memory for decode_ptr_host and copy to device *decode_ptr_host = new float[1 + kMaxNumOutputBbox * bbox_element]; CUDA_CHECK(cudaMalloc((void**)decode_ptr_device, sizeof(float) * (1 + kMaxNumOutputBbox * bbox_element))); } } void infer(IExecutionContext& context, cudaStream_t& stream, void** buffers, float* output, int batchsize, float* decode_ptr_host, float* decode_ptr_device, int model_bboxes, std::string cuda_post_process) { // infer on the batch asynchronously, and DMA output back to host auto start = std::chrono::system_clock::now(); context.enqueueV2(buffers, stream, nullptr); if (cuda_post_process == "c") { CUDA_CHECK(cudaMemcpyAsync(output, buffers[1], batchsize * kOutputSize * sizeof(float), cudaMemcpyDeviceToHost, stream)); auto end = std::chrono::system_clock::now(); std::cout << "inference time: " << std::chrono::duration_cast(end - start).count() << "ms" << std::endl; } else if (cuda_post_process == "g") { CUDA_CHECK( cudaMemsetAsync(decode_ptr_device, 0, sizeof(float) * (1 + kMaxNumOutputBbox * bbox_element), stream)); cuda_decode((float*)buffers[1], model_bboxes, kConfThresh, decode_ptr_device, kMaxNumOutputBbox, stream); cuda_nms(decode_ptr_device, kNmsThresh, kMaxNumOutputBbox, stream); //cuda nms 
CUDA_CHECK(cudaMemcpyAsync(decode_ptr_host, decode_ptr_device, sizeof(float) * (1 + kMaxNumOutputBbox * bbox_element), cudaMemcpyDeviceToHost, stream)); auto end = std::chrono::system_clock::now(); std::cout << "inference and gpu postprocess time: " << std::chrono::duration_cast(end - start).count() << "ms" << std::endl; } CUDA_CHECK(cudaStreamSynchronize(stream)); } bool parse_args(int argc, char** argv, std::string& wts, std::string& engine, std::string& img_dir, std::string& type, std::string& cuda_post_process, float& gd, float& gw, int& max_channels) { if (argc < 4) return false; if (std::string(argv[1]) == "-s" && (argc == 5)) { wts = std::string(argv[2]); engine = std::string(argv[3]); auto sub_type = std::string(argv[4]); if (sub_type[0] == 'n') { gd = 0.50; gw = 0.25; max_channels = 1024; type = "n"; } else if (sub_type[0] == 's') { gd = 0.50; gw = 0.50; max_channels = 1024; type = "s"; } else if (sub_type[0] == 'm') { gd = 0.50; gw = 1.00; max_channels = 512; type = "m"; } else if (sub_type[0] == 'l') { gd = 1.0; gw = 1.0; max_channels = 512; type = "l"; } else if (sub_type[0] == 'x') { gd = 1.0; gw = 1.50; max_channels = 512; type = "x"; } else { return false; } } else if (std::string(argv[1]) == "-d" && argc == 5) { engine = std::string(argv[2]); img_dir = std::string(argv[3]); cuda_post_process = std::string(argv[4]); } else { return false; } return true; } int main(int argc, char** argv) { // yolo12_det -s ../models/yolo12n.wts ../models/yolo12n.fp32.trt n // yolo12_det -d ../models/yolo12n.fp32.trt ../images c cudaSetDevice(kGpuId); std::string wts_name; std::string engine_name; std::string img_dir; std::string cuda_post_process; std::string type; int model_bboxes; float gd = 0, gw = 0; int max_channels = 0; if (!parse_args(argc, argv, wts_name, engine_name, img_dir, type, cuda_post_process, gd, gw, max_channels)) { std::cerr << "Arguments not right!" 
<< std::endl; std::cerr << "./yolo12_det -s [.wts] [.engine] [n/s/m/l/x] // serialize model to " "plan file" << std::endl; std::cerr << "./yolo12_det -d [.engine] ../images [c/g]// deserialize plan file and run inference" << std::endl; return -1; } // Create a model using the API directly and serialize it to a file if (!wts_name.empty()) { serialize_engine(wts_name, engine_name, gd, gw, max_channels, type); return 0; } // Deserialize the engine from file IRuntime* runtime = nullptr; ICudaEngine* engine = nullptr; IExecutionContext* context = nullptr; deserialize_engine(engine_name, &runtime, &engine, &context); cudaStream_t stream; CUDA_CHECK(cudaStreamCreate(&stream)); cuda_preprocess_init(kMaxInputImageSize); auto out_dims = engine->getBindingDimensions(1); model_bboxes = out_dims.d[0]; // Prepare cpu and gpu buffers float* device_buffers[2]; float* output_buffer_host = nullptr; float* decode_ptr_host = nullptr; float* decode_ptr_device = nullptr; // Read images from directory std::vector file_names; if (read_files_in_dir(img_dir.c_str(), file_names) < 0) { std::cerr << "read_files_in_dir failed." 
<< std::endl; return -1; } prepare_buffer(engine, &device_buffers[0], &device_buffers[1], &output_buffer_host, &decode_ptr_host, &decode_ptr_device, cuda_post_process); // batch predict for (size_t i = 0; i < file_names.size(); i += kBatchSize) { // Get a batch of images std::vector img_batch; std::vector img_name_batch; for (size_t j = i; j < i + kBatchSize && j < file_names.size(); j++) { cv::Mat img = cv::imread(img_dir + "/" + file_names[j]); img_batch.push_back(img); img_name_batch.push_back(file_names[j]); } // Preprocess cuda_batch_preprocess(img_batch, device_buffers[0], kInputW, kInputH, stream); // Run inference infer(*context, stream, (void**)device_buffers, output_buffer_host, kBatchSize, decode_ptr_host, decode_ptr_device, model_bboxes, cuda_post_process); // 保存output_buffer_host的前100个值,一行一个 // std::ofstream out("../models/output.txt"); // for (int j = 0; j < 100; j++) { // out << output_buffer_host[j] << std::endl; // } // out.close(); std::vector> res_batch; if (cuda_post_process == "c") { // NMS batch_nms(res_batch, output_buffer_host, img_batch.size(), kOutputSize, kConfThresh, kNmsThresh); } else if (cuda_post_process == "g") { //Process gpu decode and nms results batch_process(res_batch, decode_ptr_host, img_batch.size(), bbox_element, img_batch); } // Draw bounding boxes draw_bbox(img_batch, res_batch); // Save images for (size_t j = 0; j < img_batch.size(); j++) { cv::imwrite("_" + img_name_batch[j], img_batch[j]); } } // Release stream and buffers cudaStreamDestroy(stream); CUDA_CHECK(cudaFree(device_buffers[0])); CUDA_CHECK(cudaFree(device_buffers[1])); CUDA_CHECK(cudaFree(decode_ptr_device)); delete[] decode_ptr_host; delete[] output_buffer_host; cuda_preprocess_destroy(); // Destroy the engine delete context; delete engine; delete runtime; // Print histogram of the output distribution //std::cout << "\nOutput:\n\n"; //for (unsigned int i = 0; i < kOutputSize; i++) //{ // std::cout << prob[i] << ", "; // if (i % 10 == 0) std::cout << 
std::endl; //} //std::cout << std::endl; return 0; } ================================================ FILE: yolov12-tubro/CMakeLists.txt ================================================ cmake_minimum_required(VERSION 3.10) project(yolov12) add_definitions(-std=c++11) add_definitions(-DAPI_EXPORTS) set(CMAKE_CXX_STANDARD 11) set(CMAKE_BUILD_TYPE Debug) # Set CUDA compiler - use find_package or environment variable if(NOT DEFINED CMAKE_CUDA_COMPILER) find_program( CMAKE_CUDA_COMPILER nvcc HINTS ENV CUDA_HOME PATH_SUFFIXES bin) endif() enable_language(CUDA) include_directories(${PROJECT_SOURCE_DIR}/include) include_directories(${PROJECT_SOURCE_DIR}/plugin) # include and link dirs of cuda and tensorrt # Use CUDA_TOOLKIT_ROOT_DIR or CUDA_HOME environment variable if(NOT DEFINED CUDA_TOOLKIT_ROOT_DIR) if(DEFINED ENV{CUDA_HOME}) set(CUDA_TOOLKIT_ROOT_DIR $ENV{CUDA_HOME}) else() set(CUDA_TOOLKIT_ROOT_DIR "/usr/local/cuda") endif() endif() # Use TENSORRT_DIR environment variable or default path if(NOT DEFINED TENSORRT_DIR) if(DEFINED ENV{TENSORRT_DIR}) set(TENSORRT_DIR $ENV{TENSORRT_DIR}) else() set(TENSORRT_DIR "/opt/TensorRT-8.6.1.6") endif() endif() if(CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64") message("embed_platform on") include_directories( ${CUDA_TOOLKIT_ROOT_DIR}/targets/aarch64-linux/include) link_directories( ${CUDA_TOOLKIT_ROOT_DIR}/targets/aarch64-linux/lib) else() message("embed_platform off") # cuda include_directories(${CUDA_TOOLKIT_ROOT_DIR}/include) link_directories(${CUDA_TOOLKIT_ROOT_DIR}/lib64) # tensorrt include_directories(${TENSORRT_DIR}/include) link_directories(${TENSORRT_DIR}/lib) endif() add_library( myplugins SHARED ${PROJECT_SOURCE_DIR}/plugin/yololayer.cu) target_link_libraries(myplugins nvinfer cudart) find_package(OpenCV) include_directories(${OpenCV_INCLUDE_DIRS}) file( GLOB_RECURSE SRCS ${PROJECT_SOURCE_DIR}/src/*.cpp ${PROJECT_SOURCE_DIR}/src/*.cu) add_executable( yolov12-det ${PROJECT_SOURCE_DIR}/yolov12_det.cpp ${SRCS}) 
target_link_libraries(
  yolov12-det nvinfer cudart myplugins ${OpenCV_LIBS})

add_executable(
  yolov12-seg ${PROJECT_SOURCE_DIR}/yolov12_seg.cpp ${SRCS})
target_link_libraries(
  yolov12-seg nvinfer cudart myplugins ${OpenCV_LIBS})

add_executable(
  yolov12-cls ${PROJECT_SOURCE_DIR}/yolov12_cls.cpp ${SRCS})
target_link_libraries(
  yolov12-cls nvinfer cudart myplugins ${OpenCV_LIBS})

================================================
FILE: yolov12-tubro/gen_wts.py
================================================
import sys  # noqa: F401
import argparse
import os
import struct
import torch


def parse_args():
    """Parse the command line and return (weights_path, output_path, model_type).

    Exits with 'Invalid input file' when the weights path is not a file.
    When no output is given, the output path is the weights path with a
    '.wts' extension; when the output is a directory, the file is placed
    inside it under the weights file's base name.
    """
    parser = argparse.ArgumentParser(description='Convert .pt file to .wts')
    parser.add_argument('-w', '--weights', required=True,
                        help='Input weights (.pt) file path (required)')
    parser.add_argument(
        '-o', '--output', help='Output (.wts) file path (optional)')
    parser.add_argument(
        '-t', '--type', type=str, default='detect', choices=['detect', 'cls', 'seg', 'pose', 'obb'],
        help='determines the model is detection/classification')
    args = parser.parse_args()
    if not os.path.isfile(args.weights):
        raise SystemExit('Invalid input file')
    if not args.output:
        args.output = os.path.splitext(args.weights)[0] + '.wts'
    elif os.path.isdir(args.output):
        args.output = os.path.join(
            args.output,
            os.path.splitext(os.path.basename(args.weights))[0] + '.wts')
    return args.weights, args.output, args.type


pt_file, wts_file, m_type = parse_args()

print(f'Generating .wts for {m_type} model')

# Load model
print(f'Loading {pt_file}')

# Initialize
device = 'cpu'

# Load model
# weights_only=False unpickles the whole checkpoint object, so only load checkpoints you trust.
model = torch.load(pt_file, map_location=device, weights_only=False)['model'].float()  # load to FP32

if m_type in ['detect', 'seg', 'pose', 'obb']:
    # NOTE(review): anchor_grid is computed but never read again in this script.
    anchor_grid = model.model[-1].anchors * model.model[-1].stride[..., None, None]

    delattr(model.model[-1], 'anchors')

model.to(device).eval()

# Output layout: first line is the number of state_dict entries, then one line per entry
# holding its name, its element count, and each value as big-endian float32 hex.
with open(wts_file, 'w') as f:
    f.write('{}\n'.format(len(model.state_dict().keys())))
    for k, v in model.state_dict().items():
        vr = v.reshape(-1).cpu().numpy()
        f.write('{} {} '.format(k, len(vr)))
        for vv in vr:
            f.write(' ')
            f.write(struct.pack('>f', float(vv)).hex())
        f.write('\n')

================================================
FILE: yolov12-tubro/include/block.h
================================================
#pragma once

// NOTE(review): the angle-bracket include targets below, and every template argument list in
// this header (std::map, std::vector), were lost in extraction. They are left exactly as found;
// restore them from the repository before compiling.
#include
#include
#include
#include "NvInfer.h"

using namespace std;

// Declarations of the network-building helpers. Each builder takes the network under
// construction, the loaded weight map, an input tensor and a layer-name string (lname).

std::map loadWeights(const std::string file);

nvinfer1::IScaleLayer* addBatchNorm2d(nvinfer1::INetworkDefinition* network, std::map weightMap,
                                      nvinfer1::ITensor& input, std::string lname, float eps);

nvinfer1::IElementWiseLayer* convBnSiLU(nvinfer1::INetworkDefinition* network, std::map weightMap,
                                        nvinfer1::ITensor& input, int ch, std::vector k, int s, std::string lname,
                                        int p = 0, int g = 1, int d = 1);

nvinfer1::ILayer* Conv(nvinfer1::INetworkDefinition* network, std::map weightMap, nvinfer1::ITensor& input,
                       int c_out, std::string lname, int k = 1, int s = 1, int padding = 0, int g = 1,
                       bool act = true);

nvinfer1::IShuffleLayer* DFL(nvinfer1::INetworkDefinition* network, std::map weightMap, nvinfer1::ITensor& input,
                             int ch, int grid, int k, int s, int p, std::string lname);

nvinfer1::IPluginV2Layer* addYoLoLayer(nvinfer1::INetworkDefinition* network, std::vector dets, const int* px_arry,
                                       int px_arry_num, bool is_segmentation, bool is_pose, bool is_obb);

nvinfer1::IElementWiseLayer* C3k(nvinfer1::INetworkDefinition* network, std::map weightMap,
                                 nvinfer1::ITensor& input, int c2, std::string lname, int n = 1, bool shortcut = true,
                                 int g = 1, float e = 0.5, int k = 3);

// Unlike its siblings, this one takes the weight map by reference.
nvinfer1::IElementWiseLayer* C3K2(nvinfer1::INetworkDefinition* network, std::map& weightMap,
                                  nvinfer1::ITensor& input, int c2, int n, std::string lname, bool c3k = false,
                                  float e = 0.5, int g = 1, bool shortcut = true);

nvinfer1::ILayer* AAttn(nvinfer1::INetworkDefinition* network, std::map weightMap, nvinfer1::ITensor& input, int dim,
                        int num_heads, std::string lname, int area = 1);

nvinfer1::ILayer* DWConv(nvinfer1::INetworkDefinition* network, std::map weightMap, nvinfer1::ITensor& input, int ch,
                         std::vector k, int s, std::string lname);

nvinfer1::IElementWiseLayer* ABlock(nvinfer1::INetworkDefinition* network, std::map weightMap,
                                    nvinfer1::ITensor& input, int dim, int num_heads, std::string lname,
                                    float mlp_ratio = 1.2, int area = 1);

// NOTE(review): the second parameter has no name here, unlike the other declarations - confirm
// whether that is in the original or another extraction artefact.
nvinfer1::ILayer* A2C2f(nvinfer1::INetworkDefinition* network, std::map, nvinfer1::ITensor& input, int c2, int n,
                        std::string lname, bool a2 = true, int area = 1, bool residual = false,
                        float mlp_ratio = 2.0, float e = 0.5, int g = 1, bool shortcut = true);

void cout_dim(nvinfer1::ITensor& input);

================================================
FILE: yolov12-tubro/include/calibrator.h
================================================
#ifndef ENTROPY_CALIBRATOR_H
#define ENTROPY_CALIBRATOR_H

// NOTE(review): the three angle-bracket include targets and the std::vector element types
// below were lost in extraction. They are left exactly as found.
#include
#include
#include
#include "macros.h"

//! \class Int8EntropyCalibrator2
//!
//! \brief Implements Entropy calibrator 2.
//!  CalibrationAlgoType is kENTROPY_CALIBRATION_2.
//!
class Int8EntropyCalibrator2 : public nvinfer1::IInt8EntropyCalibrator2 {
   public:
    // read_cache defaults to true. How the flag is used is decided by the implementation file,
    // which is not part of this header.
    Int8EntropyCalibrator2(int batchsize, int input_w, int input_h, const char* img_dir, const char* calib_table_name,
                           const char* input_blob_name, bool read_cache = true);
    virtual ~Int8EntropyCalibrator2();
    int getBatchSize() const TRT_NOEXCEPT override;
    bool getBatch(void* bindings[], const char* names[], int nbBindings) TRT_NOEXCEPT override;
    const void* readCalibrationCache(size_t& length) TRT_NOEXCEPT override;
    void writeCalibrationCache(const void* cache, size_t length) TRT_NOEXCEPT override;

   private:
    int batchsize_;
    int input_w_;
    int input_h_;
    int img_idx_;
    std::string img_dir_;
    std::vector img_files_;
    size_t input_count_;
    std::string calib_table_name_;
    // Raw pointer stored as given; presumably the caller must keep the string alive - confirm in the .cpp.
    const char* input_blob_name_;
    bool read_cache_;
    void* device_input_;
    std::vector calib_cache_;
};

#endif  // ENTROPY_CALIBRATOR_H

================================================
FILE: yolov12-tubro/include/config.h
================================================
#define USE_FP16 // #define USE_FP32 // #define USE_INT8 const static char* kInputTensorName = "images"; const static char* kOutputTensorName = "output"; const static char* kProtoTensorName = "proto"; const static int kNumClass = 4; const static int kPoseNumClass = 1; const static int kNumberOfPoints = 17; // number of keypoints total // obb model's number of classes constexpr static int kObbNumClass = 15; const static int kObbNe = 1; // number of extra parameters const static int kBatchSize = 1; const static int kGpuId = 0; const static int kInputH = 640; const static int kInputW = 640; const static int kObbInputH = 1024; const static int kObbInputW = 1024; const static float kNmsThresh = 0.45f; const static float kConfThresh = 0.5f; const static float kConfThreshKeypoints = 0.5f; // keypoints confidence const static int kMaxInputImageSize = 3000 * 3000; const static int kMaxNumOutputBbox = 1000; //Quantization input image folder path const static char* kInputQuantizationFolder = "./coco_calib"; // Classfication model's number of classes constexpr static int kClsNumClass = 5; // Classfication model's input shape constexpr static int kClsInputH = 224; constexpr static int kClsInputW = 224; ================================================ FILE: yolov12-tubro/include/cuda_utils.h ================================================ #ifndef TRTX_CUDA_UTILS_H_ #define TRTX_CUDA_UTILS_H_ #include #ifndef CUDA_CHECK #define CUDA_CHECK(callstr) \ { \ cudaError_t error_code = callstr; \ if (error_code != cudaSuccess) { \ std::cerr << "CUDA error " << error_code << " at " << __FILE__ << ":" << __LINE__; \ assert(0); \ } \ } #endif // CUDA_CHECK #endif // TRTX_CUDA_UTILS_H_ ================================================ FILE: yolov12-tubro/include/logging.h ================================================ /* * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. 
* * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ #ifndef TENSORRT_LOGGING_H #define TENSORRT_LOGGING_H #include #include #include #include #include #include #include #include "NvInferRuntimeCommon.h" #include "macros.h" using Severity = nvinfer1::ILogger::Severity; class LogStreamConsumerBuffer : public std::stringbuf { public: LogStreamConsumerBuffer(std::ostream& stream, const std::string& prefix, bool shouldLog) : mOutput(stream), mPrefix(prefix), mShouldLog(shouldLog) {} LogStreamConsumerBuffer(LogStreamConsumerBuffer&& other) : mOutput(other.mOutput) {} ~LogStreamConsumerBuffer() { // std::streambuf::pbase() gives a pointer to the beginning of the buffered part of the output sequence // std::streambuf::pptr() gives a pointer to the current position of the output sequence // if the pointer to the beginning is not equal to the pointer to the current position, // call putOutput() to log the output to the stream if (pbase() != pptr()) { putOutput(); } } // synchronizes the stream buffer and returns 0 on success // synchronizing the stream buffer consists of inserting the buffer contents into the stream, // resetting the buffer and flushing the stream virtual int sync() { putOutput(); return 0; } void putOutput() { if (mShouldLog) { // prepend timestamp std::time_t timestamp = std::time(nullptr); tm* tm_local = std::localtime(×tamp); std::cout << "["; std::cout << std::setw(2) << std::setfill('0') << 1 + tm_local->tm_mon << "/"; std::cout << std::setw(2) << std::setfill('0') << 
tm_local->tm_mday << "/"; std::cout << std::setw(4) << std::setfill('0') << 1900 + tm_local->tm_year << "-"; std::cout << std::setw(2) << std::setfill('0') << tm_local->tm_hour << ":"; std::cout << std::setw(2) << std::setfill('0') << tm_local->tm_min << ":"; std::cout << std::setw(2) << std::setfill('0') << tm_local->tm_sec << "] "; // std::stringbuf::str() gets the string contents of the buffer // insert the buffer contents pre-appended by the appropriate prefix into the stream mOutput << mPrefix << str(); // set the buffer to empty str(""); // flush the stream mOutput.flush(); } } void setShouldLog(bool shouldLog) { mShouldLog = shouldLog; } private: std::ostream& mOutput; std::string mPrefix; bool mShouldLog; }; //! //! \class LogStreamConsumerBase //! \brief Convenience object used to initialize LogStreamConsumerBuffer before std::ostream in LogStreamConsumer //! class LogStreamConsumerBase { public: LogStreamConsumerBase(std::ostream& stream, const std::string& prefix, bool shouldLog) : mBuffer(stream, prefix, shouldLog) {} protected: LogStreamConsumerBuffer mBuffer; }; //! //! \class LogStreamConsumer //! \brief Convenience object used to facilitate use of C++ stream syntax when logging messages. //! Order of base classes is LogStreamConsumerBase and then std::ostream. //! This is because the LogStreamConsumerBase class is used to initialize the LogStreamConsumerBuffer member field //! in LogStreamConsumer and then the address of the buffer is passed to std::ostream. //! This is necessary to prevent the address of an uninitialized buffer from being passed to std::ostream. //! Please do not change the order of the parent classes. //! class LogStreamConsumer : protected LogStreamConsumerBase, public std::ostream { public: //! \brief Creates a LogStreamConsumer which logs messages with level severity. //! Reportable severity determines if the messages are severe enough to be logged. 
LogStreamConsumer(Severity reportableSeverity, Severity severity) : LogStreamConsumerBase(severityOstream(severity), severityPrefix(severity), severity <= reportableSeverity), std::ostream(&mBuffer) // links the stream buffer with the stream , mShouldLog(severity <= reportableSeverity), mSeverity(severity) {} LogStreamConsumer(LogStreamConsumer&& other) : LogStreamConsumerBase(severityOstream(other.mSeverity), severityPrefix(other.mSeverity), other.mShouldLog), std::ostream(&mBuffer) // links the stream buffer with the stream , mShouldLog(other.mShouldLog), mSeverity(other.mSeverity) {} void setReportableSeverity(Severity reportableSeverity) { mShouldLog = mSeverity <= reportableSeverity; mBuffer.setShouldLog(mShouldLog); } private: static std::ostream& severityOstream(Severity severity) { return severity >= Severity::kINFO ? std::cout : std::cerr; } static std::string severityPrefix(Severity severity) { switch (severity) { case Severity::kINTERNAL_ERROR: return "[F] "; case Severity::kERROR: return "[E] "; case Severity::kWARNING: return "[W] "; case Severity::kINFO: return "[I] "; case Severity::kVERBOSE: return "[V] "; default: assert(0); return ""; } } bool mShouldLog; Severity mSeverity; }; //! \class Logger //! //! \brief Class which manages logging of TensorRT tools and samples //! //! \details This class provides a common interface for TensorRT tools and samples to log information to the console, //! and supports logging two types of messages: //! //! - Debugging messages with an associated severity (info, warning, error, or internal error/fatal) //! - Test pass/fail messages //! //! The advantage of having all samples use this class for logging as opposed to emitting directly to stdout/stderr is //! that the logic for controlling the verbosity and formatting of sample output is centralized in one location. //! //! In the future, this class could be extended to support dumping test results to a file in some standard format //! 
(for example, JUnit XML), and providing additional metadata (e.g. timing the duration of a test run). //! //! TODO: For backwards compatibility with existing samples, this class inherits directly from the nvinfer1::ILogger //! interface, which is problematic since there isn't a clean separation between messages coming from the TensorRT //! library and messages coming from the sample. //! //! In the future (once all samples are updated to use Logger::getTRTLogger() to access the ILogger) we can refactor the //! class to eliminate the inheritance and instead make the nvinfer1::ILogger implementation a member of the Logger //! object. class Logger : public nvinfer1::ILogger { public: Logger(Severity severity = Severity::kWARNING) : mReportableSeverity(severity) {} //! //! \enum TestResult //! \brief Represents the state of a given test //! enum class TestResult { kRUNNING, //!< The test is running kPASSED, //!< The test passed kFAILED, //!< The test failed kWAIVED //!< The test was waived }; //! //! \brief Forward-compatible method for retrieving the nvinfer::ILogger associated with this Logger //! \return The nvinfer1::ILogger associated with this Logger //! //! TODO Once all samples are updated to use this method to register the logger with TensorRT, //! we can eliminate the inheritance of Logger from ILogger //! nvinfer1::ILogger& getTRTLogger() { return *this; } //! //! \brief Implementation of the nvinfer1::ILogger::log() virtual method //! //! Note samples should not be calling this function directly; it will eventually go away once we eliminate the //! inheritance from nvinfer1::ILogger //! void log(Severity severity, const char* msg) TRT_NOEXCEPT override { LogStreamConsumer(mReportableSeverity, severity) << "[TRT] " << std::string(msg) << std::endl; } //! //! \brief Method for controlling the verbosity of logging output //! //! \param severity The logger will only emit messages that have severity of this level or higher. //! 
void setReportableSeverity(Severity severity) { mReportableSeverity = severity; } //! //! \brief Opaque handle that holds logging information for a particular test //! //! This object is an opaque handle to information used by the Logger to print test results. //! The sample must call Logger::defineTest() in order to obtain a TestAtom that can be used //! with Logger::reportTest{Start,End}(). //! class TestAtom { public: TestAtom(TestAtom&&) = default; private: friend class Logger; TestAtom(bool started, const std::string& name, const std::string& cmdline) : mStarted(started), mName(name), mCmdline(cmdline) {} bool mStarted; std::string mName; std::string mCmdline; }; //! //! \brief Define a test for logging //! //! \param[in] name The name of the test. This should be a string starting with //! "TensorRT" and containing dot-separated strings containing //! the characters [A-Za-z0-9_]. //! For example, "TensorRT.sample_googlenet" //! \param[in] cmdline The command line used to reproduce the test // //! \return a TestAtom that can be used in Logger::reportTest{Start,End}(). //! static TestAtom defineTest(const std::string& name, const std::string& cmdline) { return TestAtom(false, name, cmdline); } //! //! \brief A convenience overloaded version of defineTest() that accepts an array of command-line arguments //! as input //! //! \param[in] name The name of the test //! \param[in] argc The number of command-line arguments //! \param[in] argv The array of command-line arguments (given as C strings) //! //! \return a TestAtom that can be used in Logger::reportTest{Start,End}(). static TestAtom defineTest(const std::string& name, int argc, char const* const* argv) { auto cmdline = genCmdlineString(argc, argv); return defineTest(name, cmdline); } //! //! \brief Report that a test has started. //! //! \pre reportTestStart() has not been called yet for the given testAtom //! //! \param[in] testAtom The handle to the test that has started //! 
static void reportTestStart(TestAtom& testAtom) { reportTestResult(testAtom, TestResult::kRUNNING); assert(!testAtom.mStarted); testAtom.mStarted = true; } //! //! \brief Report that a test has ended. //! //! \pre reportTestStart() has been called for the given testAtom //! //! \param[in] testAtom The handle to the test that has ended //! \param[in] result The result of the test. Should be one of TestResult::kPASSED, //! TestResult::kFAILED, TestResult::kWAIVED //! static void reportTestEnd(const TestAtom& testAtom, TestResult result) { assert(result != TestResult::kRUNNING); assert(testAtom.mStarted); reportTestResult(testAtom, result); } static int reportPass(const TestAtom& testAtom) { reportTestEnd(testAtom, TestResult::kPASSED); return EXIT_SUCCESS; } static int reportFail(const TestAtom& testAtom) { reportTestEnd(testAtom, TestResult::kFAILED); return EXIT_FAILURE; } static int reportWaive(const TestAtom& testAtom) { reportTestEnd(testAtom, TestResult::kWAIVED); return EXIT_SUCCESS; } static int reportTest(const TestAtom& testAtom, bool pass) { return pass ? reportPass(testAtom) : reportFail(testAtom); } Severity getReportableSeverity() const { return mReportableSeverity; } private: //! //! \brief returns an appropriate string for prefixing a log message with the given severity //! static const char* severityPrefix(Severity severity) { switch (severity) { case Severity::kINTERNAL_ERROR: return "[F] "; case Severity::kERROR: return "[E] "; case Severity::kWARNING: return "[W] "; case Severity::kINFO: return "[I] "; case Severity::kVERBOSE: return "[V] "; default: assert(0); return ""; } } //! //! \brief returns an appropriate string for prefixing a test result message with the given result //! 
static const char* testResultString(TestResult result) { switch (result) { case TestResult::kRUNNING: return "RUNNING"; case TestResult::kPASSED: return "PASSED"; case TestResult::kFAILED: return "FAILED"; case TestResult::kWAIVED: return "WAIVED"; default: assert(0); return ""; } } //! //! \brief returns an appropriate output stream (cout or cerr) to use with the given severity //! static std::ostream& severityOstream(Severity severity) { return severity >= Severity::kINFO ? std::cout : std::cerr; } //! //! \brief method that implements logging test results //! static void reportTestResult(const TestAtom& testAtom, TestResult result) { severityOstream(Severity::kINFO) << "&&&& " << testResultString(result) << " " << testAtom.mName << " # " << testAtom.mCmdline << std::endl; } //! //! \brief generate a command line string from the given (argc, argv) values //! static std::string genCmdlineString(int argc, char const* const* argv) { std::stringstream ss; for (int i = 0; i < argc; i++) { if (i > 0) ss << " "; ss << argv[i]; } return ss.str(); } Severity mReportableSeverity; }; namespace { //! //! \brief produces a LogStreamConsumer object that can be used to log messages of severity kVERBOSE //! //! Example usage: //! //! LOG_VERBOSE(logger) << "hello world" << std::endl; //! inline LogStreamConsumer LOG_VERBOSE(const Logger& logger) { return LogStreamConsumer(logger.getReportableSeverity(), Severity::kVERBOSE); } //! //! \brief produces a LogStreamConsumer object that can be used to log messages of severity kINFO //! //! Example usage: //! //! LOG_INFO(logger) << "hello world" << std::endl; //! inline LogStreamConsumer LOG_INFO(const Logger& logger) { return LogStreamConsumer(logger.getReportableSeverity(), Severity::kINFO); } //! //! \brief produces a LogStreamConsumer object that can be used to log messages of severity kWARNING //! //! Example usage: //! //! LOG_WARN(logger) << "hello world" << std::endl; //! 
inline LogStreamConsumer LOG_WARN(const Logger& logger) { return LogStreamConsumer(logger.getReportableSeverity(), Severity::kWARNING); }

//!
//! \brief produces a LogStreamConsumer object that can be used to log messages of severity kERROR
//!
//! Example usage:
//!
//!     LOG_ERROR(logger) << "hello world" << std::endl;
//!
inline LogStreamConsumer LOG_ERROR(const Logger& logger) { return LogStreamConsumer(logger.getReportableSeverity(), Severity::kERROR); }

//!
//! \brief produces a LogStreamConsumer object that can be used to log messages of severity kINTERNAL_ERROR
//!        ("fatal" severity)
//!
//! Example usage:
//!
//!     LOG_FATAL(logger) << "hello world" << std::endl;
//!
inline LogStreamConsumer LOG_FATAL(const Logger& logger) { return LogStreamConsumer(logger.getReportableSeverity(), Severity::kINTERNAL_ERROR); }

}  // anonymous namespace

#endif  // TENSORRT_LOGGING_H

================================================
FILE: yolov12-tubro/include/macros.h
================================================
#ifndef __MACROS_H
#define __MACROS_H

#include "NvInfer.h"

// API: symbol visibility marker. Exports when API_EXPORTS is defined; otherwise imports on MSVC
// and expands to nothing elsewhere.
#ifdef API_EXPORTS
#if defined(_MSC_VER)
#define API __declspec(dllexport)
#else
#define API __attribute__((visibility("default")))
#endif
#else
#if defined(_MSC_VER)
#define API __declspec(dllimport)
#else
#define API
#endif
#endif  // API_EXPORTS

// TRT_NOEXCEPT / TRT_CONST_ENQUEUE expand to `noexcept` / `const` for TensorRT major version >= 8
// and to nothing for older versions, so one plugin signature can serve both.
#if NV_TENSORRT_MAJOR >= 8
#define TRT_NOEXCEPT noexcept
#define TRT_CONST_ENQUEUE const
#else
#define TRT_NOEXCEPT
#define TRT_CONST_ENQUEUE
#endif

#endif  // __MACROS_H

================================================
FILE: yolov12-tubro/include/model.h
================================================
#pragma once
#include
#include
#include "NvInfer.h"

// Engine builders for the three model variants. Each takes the builder, its config, a data type,
// the path of the .wts weight file and model-scaling arguments (gd, gw, max_channels, type), and
// returns an nvinfer1::IHostMemory*.
// NOTE(review): the Cls variant orders its trailing parameters differently (type before
// max_channels, and max_channels by value) - keep call sites in sync.
nvinfer1::IHostMemory* buildEngineYolov12Det(nvinfer1::IBuilder* builder, nvinfer1::IBuilderConfig* config, nvinfer1::DataType dt, const std::string& wts_path, float& gd, float& gw, int& max_channels, std::string& type);

nvinfer1::IHostMemory* buildEngineYolov12Seg(nvinfer1::IBuilder* builder, nvinfer1::IBuilderConfig* config, nvinfer1::DataType dt, const std::string& wts_path, float& gd, float& gw, int& max_channels, std::string& type);

nvinfer1::IHostMemory* buildEngineYolov12Cls(nvinfer1::IBuilder* builder, nvinfer1::IBuilderConfig* config, nvinfer1::DataType dt, const std::string& wts_path, float& gd, float& gw, std::string& type, int max_channels);

================================================
FILE: yolov12-tubro/include/postprocess.h
================================================
#pragma once

#include
#include "NvInfer.h"
#include "types.h"

// Preprocessing functions
cv::Rect get_rect(cv::Mat& img, float bbox[4]);

// Processing functions
void batch_process(std::vector>& res_batch, const float* decode_ptr_host, int batch_size, int bbox_element, const std::vector& img_batch);
void batch_process_obb(std::vector>& res_batch, const float* decode_ptr_host, int batch_size, int bbox_element, const std::vector& img_batch);
void process_decode_ptr_host(std::vector& res, const float* decode_ptr_host, int bbox_element, cv::Mat& img, int count);
void process_decode_ptr_host_obb(std::vector& res, const float* decode_ptr_host, int bbox_element, cv::Mat& img, int count);

// NMS functions (nms_thresh defaults to 0.5 in every overload)
void nms(std::vector& res, float* output, float conf_thresh, float nms_thresh = 0.5);
void batch_nms(std::vector>& batch_res, float* output, int batch_size, int output_size, float conf_thresh, float nms_thresh = 0.5);
void nms_obb(std::vector& res, float* output, float conf_thresh, float nms_thresh = 0.5);
void batch_nms_obb(std::vector>& batch_res, float* output, int batch_size, int output_size, float conf_thresh, float nms_thresh = 0.5);

// CUDA-related functions
void cuda_decode(float* predict, int num_bboxes, float confidence_threshold, float* parray, int max_objects, cudaStream_t stream);
void cuda_nms(float* parray, float nms_threshold, int max_objects, cudaStream_t stream);
void cuda_decode_obb(float* predict, int num_bboxes, float confidence_threshold, float* parray, int max_objects, cudaStream_t stream);
void cuda_nms_obb(float* parray, float nms_threshold, int max_objects, cudaStream_t stream);

// Drawing functions
void draw_bbox(std::vector& img_batch, std::vector>& res_batch);
void draw_bbox_obb(std::vector& img_batch, std::vector>& res_batch);
void draw_bbox_keypoints_line(std::vector& img_batch, std::vector>& res_batch);
void draw_mask_bbox(cv::Mat& img, std::vector& dets, std::vector& masks, std::unordered_map& labels_map);

================================================
FILE: yolov12-tubro/include/preprocess.h
================================================
#pragma once

#include
#include
#include "NvInfer.h"
#include "types.h"

void cuda_preprocess_init(int max_image_size);

void cuda_preprocess_destroy();

void cuda_preprocess(uint8_t* src, int src_width, int src_height, float* dst, int dst_width, int dst_height, cudaStream_t stream);

void cuda_batch_preprocess(std::vector& img_batch, float* dst, int dst_width, int dst_height, cudaStream_t stream);

================================================
FILE: yolov12-tubro/include/types.h
================================================
#pragma once
#include "config.h"

// One decoded detection. Every member is a float, so a buffer of detections can be sized as
// sizeof(Detection) / sizeof(float) floats per entry (this is how the YoloLayer plugin sizes its output).
struct alignas(float) Detection {
    // As written by CalDetection in plugin/yololayer.cu: left, top, right, bottom for regular
    // detections; center_x, center_y, w, h when the plugin runs in OBB mode.
    float bbox[4];
    float conf;  // CalDetection stores the maximum class probability here
    float class_id;
    float mask[32];
    float keypoints[kNumberOfPoints * 3];  // (x, y, confidence) per keypoint
    float angle;                           // obb angle
};

struct AffineMatrix {
    float value[6];
};

// 6 + 1 = 7 floats per box.
const int bbox_element = sizeof(AffineMatrix) / sizeof(float) + 1;  // left, top, right, bottom, confidence, class, keepflag

================================================
FILE: yolov12-tubro/include/utils.h
================================================
#pragma once
#include
#include
#include

// Letterbox `img` to input_w x input_h: scale by the smaller of the two ratios so the aspect ratio
// is kept, and center the result on a canvas filled with cv::Scalar(128, 128, 128).
static inline cv::Mat preprocess_img(cv::Mat& img, int input_w, int input_h) {
    int w, h, x, y;
    float r_w = input_w / (img.cols * 1.0);
    float r_h = input_h / (img.rows * 1.0);
    if (r_h > r_w) {
        // Width is the limiting side: fill the width and pad above and below.
        w = input_w;
        h = r_w * img.rows;
        x = 0;
        y = (input_h - h) / 2;
    } else {
        // Height is the limiting side: fill the height and pad left and right.
        w = r_h *
img.cols; h = input_h; x = (input_w - w) / 2; y = 0; } cv::Mat re(h, w, CV_8UC3); cv::resize(img, re, re.size(), 0, 0, cv::INTER_LINEAR); cv::Mat out(input_h, input_w, CV_8UC3, cv::Scalar(128, 128, 128)); re.copyTo(out(cv::Rect(x, y, re.cols, re.rows))); return out; } static inline int read_files_in_dir(const char* p_dir_name, std::vector& file_names) { DIR* p_dir = opendir(p_dir_name); if (p_dir == nullptr) { return -1; } struct dirent* p_file = nullptr; while ((p_file = readdir(p_dir)) != nullptr) { if (strcmp(p_file->d_name, ".") != 0 && strcmp(p_file->d_name, "..") != 0) { //std::string cur_file_name(p_dir_name); //cur_file_name += "/"; //cur_file_name += p_file->d_name; std::string cur_file_name(p_file->d_name); // std::cout << "Found file: " << cur_file_name << std::endl; file_names.push_back(cur_file_name); } } closedir(p_dir); return 0; } // Function to trim leading and trailing whitespace from a string static inline std::string trim_leading_whitespace(const std::string& str) { size_t first = str.find_first_not_of(' '); if (std::string::npos == first) { return str; } size_t last = str.find_last_not_of(' '); return str.substr(first, (last - first + 1)); } // Src: https://stackoverflow.com/questions/16605967 static inline std::string to_string_with_precision(const float a_value, const int n = 2) { std::ostringstream out; out.precision(n); out << std::fixed << a_value; return out.str(); } static inline int read_labels(const std::string labels_filename, std::unordered_map& labels_map) { std::ifstream file(labels_filename); // Read each line of the file std::string line; int index = 0; while (std::getline(file, line)) { // Strip the line of any leading or trailing whitespace line = trim_leading_whitespace(line); // Add the stripped line to the labels_map, using the loop index as the key labels_map[index] = line; index++; } // Close the file file.close(); return 0; } ================================================ FILE: yolov12-tubro/plugin/yololayer.cu 
================================================
#include
#include
#include
#include
#include "cuda_utils.h"
#include "types.h"
#include "yololayer.h"

namespace Tn {
// Store `val` at the cursor and advance the cursor by sizeof(T).
template void write(char*& buffer, const T& val) {
    *reinterpret_cast(buffer) = val;
    buffer += sizeof(T);
}

// Load a T from the cursor into `val` and advance the cursor by sizeof(T).
template void read(const char*& buffer, T& val) {
    val = *reinterpret_cast(buffer);
    buffer += sizeof(T);
}
}  // namespace Tn

// Logistic function 1 / (1 + e^-x); used by CalDetection for keypoint confidence and the OBB angle.
__device__ float sigmoid(float x) {
    return 1.0f / (1.0f + exp(-x));
}

namespace nvinfer1 {
// Build a plugin from explicit parameters. `strides` is copied into a buffer owned by this object.
YoloLayerPlugin::YoloLayerPlugin(int classCount, int numberofpoints, float confthreshkeypoints, int netWidth, int netHeight, int maxOut, bool is_segmentation, bool is_pose, bool is_obb, const int* strides, int stridesLength) {
    mClassCount = classCount;
    mNumberofpoints = numberofpoints;
    mConfthreshkeypoints = confthreshkeypoints;
    mYoloV8NetWidth = netWidth;
    mYoloV8netHeight = netHeight;
    mMaxOutObject = maxOut;
    mStridesLength = stridesLength;
    mStrides = new int[stridesLength];
    memcpy(mStrides, strides, stridesLength * sizeof(int));
    is_segmentation_ = is_segmentation;
    is_pose_ = is_pose;
    is_obb_ = is_obb;
}

YoloLayerPlugin::~YoloLayerPlugin() {
    if (mStrides != nullptr) {
        delete[] mStrides;
        mStrides = nullptr;
    }
}

// Rebuild a plugin from a buffer produced by serialize(). The field order here must stay
// identical to the order in serialize().
YoloLayerPlugin::YoloLayerPlugin(const void* data, size_t length) {
    using namespace Tn;
    const char *d = reinterpret_cast(data), *a = d;
    read(d, mClassCount);
    read(d, mNumberofpoints);
    read(d, mConfthreshkeypoints);
    read(d, mThreadCount);
    read(d, mYoloV8NetWidth);
    read(d, mYoloV8netHeight);
    read(d, mMaxOutObject);
    read(d, mStridesLength);
    // The stride count precedes the stride values so the array can be sized before it is read.
    mStrides = new int[mStridesLength];
    for (int i = 0; i < mStridesLength; ++i) {
        read(d, mStrides[i]);
    }
    read(d, is_segmentation_);
    read(d, is_pose_);
    read(d, is_obb_);

    // Every byte of the buffer must have been consumed.
    assert(d == a + length);
}

// Write the plugin state into `buffer`, which must hold getSerializationSize() bytes.
void YoloLayerPlugin::serialize(void* buffer) const TRT_NOEXCEPT {
    using namespace Tn;
    char *d = static_cast(buffer), *a = d;
    write(d, mClassCount);
    write(d, mNumberofpoints);
    write(d, mConfthreshkeypoints);
    write(d, mThreadCount);
    write(d, mYoloV8NetWidth);
    write(d, mYoloV8netHeight);
    write(d, mMaxOutObject);
    write(d, mStridesLength);
    for (int i = 0; i < mStridesLength; ++i) {
        write(d, mStrides[i]);
    }
    write(d, is_segmentation_);
    write(d, is_pose_);
    write(d, is_obb_);

    assert(d == a + getSerializationSize());
}

// Byte count of everything serialize() writes: each scalar member plus mStridesLength ints.
size_t YoloLayerPlugin::getSerializationSize() const TRT_NOEXCEPT {
    return sizeof(mClassCount) + sizeof(mNumberofpoints) + sizeof(mConfthreshkeypoints) + sizeof(mThreadCount) + sizeof(mYoloV8netHeight) + sizeof(mYoloV8NetWidth) + sizeof(mMaxOutObject) + sizeof(mStridesLength) + sizeof(int) * mStridesLength + sizeof(is_segmentation_) + sizeof(is_pose_) + sizeof(is_obb_);
}

int YoloLayerPlugin::initialize() TRT_NOEXCEPT {
    return 0;
}

// Output is (1 + mMaxOutObject * floats-per-Detection, 1, 1): one counter float followed by the
// detection slots. The input dimensions are not consulted.
nvinfer1::Dims YoloLayerPlugin::getOutputDimensions(int index, const nvinfer1::Dims* inputs, int nbInputDims) TRT_NOEXCEPT {
    int total_size = mMaxOutObject * sizeof(Detection) / sizeof(float);
    return nvinfer1::Dims3(total_size + 1, 1, 1);
}

// Stores the pointer as given; the string is not copied.
void YoloLayerPlugin::setPluginNamespace(const char* pluginNamespace) TRT_NOEXCEPT {
    mPluginNamespace = pluginNamespace;
}

const char* YoloLayerPlugin::getPluginNamespace() const TRT_NOEXCEPT {
    return mPluginNamespace;
}

// The output is always float, whatever the input types are.
nvinfer1::DataType YoloLayerPlugin::getOutputDataType(int index, const nvinfer1::DataType* inputTypes, int nbInputs) const TRT_NOEXCEPT {
    return nvinfer1::DataType::kFLOAT;
}

bool YoloLayerPlugin::isOutputBroadcastAcrossBatch(int outputIndex, const bool* inputIsBroadcasted, int nbInputs) const TRT_NOEXCEPT {
    return false;
}

bool YoloLayerPlugin::canBroadcastInputAcrossBatch(int inputIndex) const TRT_NOEXCEPT {
    return false;
}

// Nothing to configure or attach: the three hooks below are intentionally empty.
void YoloLayerPlugin::configurePlugin(nvinfer1::PluginTensorDesc const* in, int nbInput, nvinfer1::PluginTensorDesc const* out, int nbOutput) TRT_NOEXCEPT{};

void YoloLayerPlugin::attachToContext(cudnnContext* cudnnContext, cublasContext* cublasContext, IGpuAllocator* gpuAllocator) TRT_NOEXCEPT{};

void YoloLayerPlugin::detachFromContext() TRT_NOEXCEPT {}

const char*
YoloLayerPlugin::getPluginType() const TRT_NOEXCEPT {
    return "YoloLayer_TRT";
}

const char* YoloLayerPlugin::getPluginVersion() const TRT_NOEXCEPT {
    return "1";
}

void YoloLayerPlugin::destroy() TRT_NOEXCEPT {
    delete this;
}

// Create an independent plugin with the same parameters; the constructor copies the stride array.
nvinfer1::IPluginV2IOExt* YoloLayerPlugin::clone() const TRT_NOEXCEPT {
    YoloLayerPlugin* p = new YoloLayerPlugin(mClassCount, mNumberofpoints, mConfthreshkeypoints, mYoloV8NetWidth, mYoloV8netHeight, mMaxOutObject, is_segmentation_, is_pose_, is_obb_, mStrides, mStridesLength);
    p->setPluginNamespace(mPluginNamespace);
    return p;
}

// Run the decode kernels for one batch. Only outputs[0] is written.
// The parameter qualifiers match the declaration in yololayer.h: TRT_CONST_ENQUEUE belongs on
// `outputs`. It was previously on `inputs`, which only matched the declaration when the macro
// expands to `const`.
int YoloLayerPlugin::enqueue(int batchSize, const void* const* inputs, void* TRT_CONST_ENQUEUE* outputs, void* workspace, cudaStream_t stream) TRT_NOEXCEPT {
    forwardGpu((const float* const*)inputs, (float*)outputs[0], stream, mYoloV8netHeight, mYoloV8NetWidth, batchSize);
    return 0;
}

// Single-precision logistic function, applied to the class scores.
__device__ float Logist(float data) {
    return 1.0f / (1.0f + expf(-data));
}

// Decode one feature-map level: one thread per (batch item, grid cell).
//
// `input` is indexed channel-major per batch item: channel c of cell e is at
// curInput[e + c * total_grid]. Channels 0-3 are box distances, channels 4 .. 4+classes-1 are
// class logits, followed by the optional mask / keypoint / angle channels.
// `output` holds, per batch item, one float counter followed by Detection slots (`outputElem`
// floats in total).
__global__ void CalDetection(const float* input, float* output, int numElements, int maxoutobject, const int grid_h, int grid_w, const int stride, int classes, int nk, float confkeypoints, int outputElem, bool is_segmentation, bool is_pose, bool is_obb) {
    int idx = threadIdx.x + blockDim.x * blockIdx.x;
    if (idx >= numElements)
        return;

    const int N_kpts = nk;
    int total_grid = grid_h * grid_w;
    int info_len = 4 + classes + (is_segmentation ? 32 : 0) + (is_pose ? N_kpts * 3 : 0) + (is_obb ? 1 : 0);
    int batchIdx = idx / total_grid;
    int elemIdx = idx % total_grid;
    const float* curInput = input + batchIdx * total_grid * info_len;
    int outputIdx = batchIdx * outputElem;

    // Best class for this cell.
    int class_id = 0;
    float max_cls_prob = 0.0;
    for (int i = 4; i < 4 + classes; i++) {
        float p = Logist(curInput[elemIdx + i * total_grid]);
        if (p > max_cls_prob) {
            max_cls_prob = p;
            class_id = i - 4;
        }
    }

    // Fixed pre-filter: cells whose best class probability is below 0.1 are dropped here.
    if (max_cls_prob < 0.1)
        return;

    // Claim a slot. The counter keeps increasing after the buffer is full, so it can end up larger
    // than maxoutobject; only the first maxoutobject claims are written.
    int count = (int)atomicAdd(output + outputIdx, 1);
    if (count >= maxoutobject)
        return;
    char* data = (char*)(output + outputIdx) + sizeof(float) + count * sizeof(Detection);
    Detection* det = (Detection*)(data);

    int row = elemIdx / grid_w;
    int col = elemIdx % grid_w;

    det->conf = max_cls_prob;
    det->class_id = class_id;
    // Channels 0-3 are distances from the cell center, so this yields left, top, right, bottom
    // scaled by the stride.
    det->bbox[0] = (col + 0.5f - curInput[elemIdx + 0 * total_grid]) * stride;
    det->bbox[1] = (row + 0.5f - curInput[elemIdx + 1 * total_grid]) * stride;
    det->bbox[2] = (col + 0.5f + curInput[elemIdx + 2 * total_grid]) * stride;
    det->bbox[3] = (row + 0.5f + curInput[elemIdx + 3 * total_grid]) * stride;

    // NOTE(review): each optional section computes its channel offset as if the other optional
    // sections preceded it; the offsets only agree when a single mode flag is set - confirm that
    // the modes are mutually exclusive.
    if (is_segmentation) {
        for (int k = 0; k < 32; ++k) {
            det->mask[k] = curInput[elemIdx + (4 + classes + (is_pose ? N_kpts * 3 : 0) + (is_obb ? 1 : 0) + k) * total_grid];
        }
    }

    if (is_pose) {
        for (int kpt = 0; kpt < N_kpts; kpt++) {
            int kpt_x_idx = (4 + classes + (is_segmentation ? 32 : 0) + (is_obb ? 1 : 0) + kpt * 3) * total_grid;
            int kpt_y_idx = (4 + classes + (is_segmentation ? 32 : 0) + (is_obb ? 1 : 0) + kpt * 3 + 1) * total_grid;
            int kpt_conf_idx = (4 + classes + (is_segmentation ? 32 : 0) + (is_obb ?
1 : 0) + kpt * 3 + 2) * total_grid;

            float kpt_confidence = sigmoid(curInput[elemIdx + kpt_conf_idx]);

            float kpt_x = (curInput[elemIdx + kpt_x_idx] * 2.0 + col) * stride;
            float kpt_y = (curInput[elemIdx + kpt_y_idx] * 2.0 + row) * stride;

            bool is_within_bbox = kpt_x >= det->bbox[0] && kpt_x <= det->bbox[2] && kpt_y >= det->bbox[1] && kpt_y <= det->bbox[3];

            // Keypoints that are low-confidence or fall outside the box are marked with -1.
            if (kpt_confidence < confkeypoints || !is_within_bbox) {
                det->keypoints[kpt * 3] = -1;
                det->keypoints[kpt * 3 + 1] = -1;
                det->keypoints[kpt * 3 + 2] = -1;
            } else {
                det->keypoints[kpt * 3] = kpt_x;
                det->keypoints[kpt * 3 + 1] = kpt_y;
                det->keypoints[kpt * 3 + 2] = kpt_confidence;
            }
        }
    }

    if (is_obb) {
        // OBB mode overwrites bbox with center_x, center_y, w, h and stores the rotation angle.
        double pi = CV_PI;
        auto angle_inx = curInput[elemIdx + (4 + classes + (is_segmentation ? 32 : 0) + (is_pose ? N_kpts * 3 : 0) + 0) * total_grid];
        auto angle = (sigmoid(angle_inx) - 0.25f) * pi;

        auto cos1 = cos(angle);
        auto sin1 = sin(angle);
        // Half the difference of opposite distances gives the center offset, which is then rotated.
        auto xf = (curInput[elemIdx + 2 * total_grid] - curInput[elemIdx + 0 * total_grid]) / 2;
        auto yf = (curInput[elemIdx + 3 * total_grid] - curInput[elemIdx + 1 * total_grid]) / 2;

        auto x = xf * cos1 - yf * sin1;
        auto y = xf * sin1 + yf * cos1;

        float cx = (col + 0.5f + x) * stride;
        float cy = (row + 0.5f + y) * stride;

        float w1 = (curInput[elemIdx + 0 * total_grid] + curInput[elemIdx + 2 * total_grid]) * stride;
        float h1 = (curInput[elemIdx + 1 * total_grid] + curInput[elemIdx + 3 * total_grid]) * stride;
        det->bbox[0] = cx;
        det->bbox[1] = cy;
        det->bbox[2] = w1;
        det->bbox[3] = h1;
        det->angle = angle;
    }
}

// Launch CalDetection once per stride level on `stream`.
//
// inputs[i] is the feature map for mStrides[i]. `output` holds, per batch item, one float counter
// followed by mMaxOutObject Detection slots.
//
// The block size is computed per launch and mThreadCount is left untouched. It used to be
// overwritten with the smallest level's element count, which permanently shrank later launches
// (the member is also serialized) and produced a zero block size and a division by zero when a
// level had no elements.
void YoloLayerPlugin::forwardGpu(const float* const* inputs, float* output, cudaStream_t stream, int mYoloV8netHeight, int mYoloV8NetWidth, int batchSize) {
    const int outputElem = 1 + mMaxOutObject * sizeof(Detection) / sizeof(float);

    // Reset the detection counter of every batch item before the kernels start claiming slots.
    for (int idx = 0; idx < batchSize; ++idx) {
        CUDA_CHECK(cudaMemsetAsync(output + idx * outputElem, 0, sizeof(float), stream));
    }

    for (int i = 0; i < mStridesLength; ++i) {
        const int stride = mStrides[i];
        const int grid_h = mYoloV8netHeight / stride;
        const int grid_w = mYoloV8NetWidth / stride;
        const int numElem = grid_h * grid_w * batchSize;  // one thread per (batch item, cell)
        if (numElem <= 0) {
            continue;  // nothing to decode; a launch with zero threads would be invalid
        }

        const int threads = numElem < mThreadCount ? numElem : mThreadCount;
        const int blocks = (numElem + threads - 1) / threads;
        CalDetection<<<blocks, threads, 0, stream>>>(inputs[i], output, numElem, mMaxOutObject, grid_h, grid_w, stride, mClassCount, mNumberofpoints, mConfthreshkeypoints, outputElem, is_segmentation_, is_pose_, is_obb_);
    }
}

PluginFieldCollection YoloPluginCreator::mFC{};
std::vector YoloPluginCreator::mPluginAttributes;

// The creator advertises no fields of its own: mFC points at an empty attribute list.
YoloPluginCreator::YoloPluginCreator() {
    mPluginAttributes.clear();
    mFC.nbFields = mPluginAttributes.size();
    mFC.fields = mPluginAttributes.data();
}

const char* YoloPluginCreator::getPluginName() const TRT_NOEXCEPT {
    return "YoloLayer_TRT";
}

const char* YoloPluginCreator::getPluginVersion() const TRT_NOEXCEPT {
    return "1";
}

const PluginFieldCollection* YoloPluginCreator::getFieldNames() TRT_NOEXCEPT {
    return &mFC;
}

// Build a plugin from a single "combinedInfo" field: 9 header ints followed by the stride list.
IPluginV2IOExt* YoloPluginCreator::createPlugin(const char* name, const PluginFieldCollection* fc) TRT_NOEXCEPT {
    assert(fc->nbFields == 1);
    assert(strcmp(fc->fields[0].name, "combinedInfo") == 0);
    const int* combinedInfo = static_cast(fc->fields[0].data);
    int netinfo_count = 9;
    int
class_count = combinedInfo[0]; int numberofpoints = combinedInfo[1]; float confthreshkeypoints = combinedInfo[2]; int input_w = combinedInfo[3]; int input_h = combinedInfo[4]; int max_output_object_count = combinedInfo[5]; bool is_segmentation = combinedInfo[6]; bool is_pose = combinedInfo[7]; bool is_obb = combinedInfo[8]; const int* px_arry = combinedInfo + netinfo_count; int px_arry_length = fc->fields[0].length - netinfo_count; YoloLayerPlugin* obj = new YoloLayerPlugin(class_count, numberofpoints, confthreshkeypoints, input_w, input_h, max_output_object_count, is_segmentation, is_pose, is_obb, px_arry, px_arry_length); obj->setPluginNamespace(mNamespace.c_str()); return obj; } IPluginV2IOExt* YoloPluginCreator::deserializePlugin(const char* name, const void* serialData, size_t serialLength) TRT_NOEXCEPT { // This object will be deleted when the network is destroyed, which will // call YoloLayerPlugin::destroy() YoloLayerPlugin* obj = new YoloLayerPlugin(serialData, serialLength); obj->setPluginNamespace(mNamespace.c_str()); return obj; } } // namespace nvinfer1 ================================================ FILE: yolov12-tubro/plugin/yololayer.h ================================================ #pragma once #include #include #include #include "NvInfer.h" #include "macros.h" namespace nvinfer1 { class API YoloLayerPlugin : public IPluginV2IOExt { public: YoloLayerPlugin(int classCount, int numberofpoints, float confthreshkeypoints, int netWidth, int netHeight, int maxOut, bool is_segmentation, bool is_pose, bool is_obb, const int* strides, int stridesLength); YoloLayerPlugin(const void* data, size_t length); ~YoloLayerPlugin(); int getNbOutputs() const TRT_NOEXCEPT override { return 1; } nvinfer1::Dims getOutputDimensions(int index, const nvinfer1::Dims* inputs, int nbInputDims) TRT_NOEXCEPT override; int initialize() TRT_NOEXCEPT override; virtual void terminate() TRT_NOEXCEPT override {} virtual size_t getWorkspaceSize(int maxBatchSize) const TRT_NOEXCEPT 
override { return 0; } virtual int enqueue(int batchSize, const void* const* inputs, void* TRT_CONST_ENQUEUE* outputs, void* workspace, cudaStream_t stream) TRT_NOEXCEPT override; virtual size_t getSerializationSize() const TRT_NOEXCEPT override; virtual void serialize(void* buffer) const TRT_NOEXCEPT override; bool supportsFormatCombination(int pos, const PluginTensorDesc* inOut, int nbInputs, int nbOutputs) const TRT_NOEXCEPT override { return inOut[pos].format == TensorFormat::kLINEAR && inOut[pos].type == DataType::kFLOAT; } const char* getPluginType() const TRT_NOEXCEPT override; const char* getPluginVersion() const TRT_NOEXCEPT override; void destroy() TRT_NOEXCEPT override; IPluginV2IOExt* clone() const TRT_NOEXCEPT override; void setPluginNamespace(const char* pluginNamespace) TRT_NOEXCEPT override; const char* getPluginNamespace() const TRT_NOEXCEPT override; nvinfer1::DataType getOutputDataType(int32_t index, nvinfer1::DataType const* inputTypes, int32_t nbInputs) const TRT_NOEXCEPT; bool isOutputBroadcastAcrossBatch(int outputIndex, const bool* inputIsBroadcasted, int nbInputs) const TRT_NOEXCEPT override; bool canBroadcastInputAcrossBatch(int inputIndex) const TRT_NOEXCEPT override; void attachToContext(cudnnContext* cudnnContext, cublasContext* cublasContext, IGpuAllocator* gpuAllocator) TRT_NOEXCEPT override; void configurePlugin(PluginTensorDesc const* in, int32_t nbInput, PluginTensorDesc const* out, int32_t nbOutput) TRT_NOEXCEPT override; void detachFromContext() TRT_NOEXCEPT override; private: void forwardGpu(const float* const* inputs, float* output, cudaStream_t stream, int mYoloV8netHeight, int mYoloV8NetWidth, int batchSize); int mThreadCount = 256; const char* mPluginNamespace; int mClassCount; int mNumberofpoints; float mConfthreshkeypoints; int mYoloV8NetWidth; int mYoloV8netHeight; int mMaxOutObject; bool is_segmentation_; bool is_pose_; bool is_obb_; int* mStrides; int mStridesLength; }; class API YoloPluginCreator : public 
IPluginCreator { public: YoloPluginCreator(); ~YoloPluginCreator() override = default; const char* getPluginName() const TRT_NOEXCEPT override; const char* getPluginVersion() const TRT_NOEXCEPT override; const nvinfer1::PluginFieldCollection* getFieldNames() TRT_NOEXCEPT override; nvinfer1::IPluginV2IOExt* createPlugin(const char* name, const nvinfer1::PluginFieldCollection* fc) TRT_NOEXCEPT override; nvinfer1::IPluginV2IOExt* deserializePlugin(const char* name, const void* serialData, size_t serialLength) TRT_NOEXCEPT override; void setPluginNamespace(const char* libNamespace) TRT_NOEXCEPT override { mNamespace = libNamespace; } const char* getPluginNamespace() const TRT_NOEXCEPT override { return mNamespace.c_str(); } private: std::string mNamespace; static PluginFieldCollection mFC; static std::vector mPluginAttributes; }; REGISTER_TENSORRT_PLUGIN(YoloPluginCreator); } // namespace nvinfer1 ================================================ FILE: yolov12-tubro/readme.md ================================================ ## Introduction Yolov12 model supports TensorRT-8. Detection training code [link](https://github.com/sunsmarterjie/yolov12/releases/tag/turbo) Segment training code[link](https://github.com/sunsmarterjie/yolov12/releases/tag/seg) Classify training code[link](https://github.com/sunsmarterjie/yolov12/releases/tag/cls) ## Environment * cuda 11.6 * cudnn 8.9.1.23 * tensorrt 8.6.1.6 * opencv 4.8.0 * ultralytics 8.3.63 ## Support * [x] YOLO12-det support FP32/FP16 and C++ API * [x] YOLO12-seg support FP32/FP16 and C++ API * [x] YOLO12-cls support FP32/FP16 and C++ API ## Config * Choose the YOLO12 sub-model n/s/m/l/x from command line arguments. * Other configs please check [src/config.h](src/config.h) ## Build and Run (Detection) 1. 
Generate the .wts file from the PyTorch .pt checkpoint, or download a .wts file from the model zoo.

```shell
# You are expected to train your own model rather than use the pre-trained ones.
# To download other models, replace 'yolov12n.pt' with 'yolov12s.pt', 'yolov12m.pt', 'yolov12l.pt' or 'yolov12x.pt'.
# Generate .wts
cp [PATH-TO-TENSORRTX]/yolov12/gen_wts.py .
python gen_wts.py -w yolov12n.pt -o yolov12n.wts -t detect
# A file 'yolov12n.wts' will be generated.
```

2. Build tensorrtx/yolov12 and run

```shell
cd [PATH-TO-TENSORRTX]/yolov12
mkdir build
cd build
cmake ..
make
```

## Build and Run (Segment)

1. Generate the .wts file from the PyTorch .pt checkpoint, or download a .wts file from the model zoo.

```shell
# You are expected to train your own model rather than use the pre-trained ones.
# To download other models, replace 'yolov12n-seg.pt' with 'yolov12s-seg.pt', 'yolov12m-seg.pt', 'yolov12l-seg.pt' or 'yolov12x-seg.pt'.
# Generate .wts
cp [PATH-TO-TENSORRTX]/yolov12/gen_wts.py .
python gen_wts.py -w yolov12n-seg.pt -o yolov12n-seg.wts -t seg
# A file 'yolov12n-seg.wts' will be generated.
```

2. Build tensorrtx/yolov12 and run

```shell
cd [PATH-TO-TENSORRTX]/yolov12
mkdir build
cd build
cmake ..
make
```

## Build and Run (Classify)

1. Generate the .wts file from the PyTorch .pt checkpoint, or download a .wts file from the model zoo.

```shell
# Download ultralytics
# You are expected to train your own model rather than use the pre-trained ones.
# To download other models, replace 'yolov12n-cls.pt' with 'yolov12s-cls.pt', 'yolov12m-cls.pt', 'yolov12l-cls.pt' or 'yolov12x-cls.pt'.
# Generate .wts
cp [PATH-TO-TENSORRTX]/yolov12/gen_wts.py .
python gen_wts.py -w yolov12n-cls.pt -t cls -o yolov12n-cls.wts
# A file 'yolov12n-cls.wts' will be generated.
```

2. Build tensorrtx/yolov12 and run

```shell
cd [PATH-TO-TENSORRTX]/yolov12
mkdir build
cd build
cmake ..
make
```

### Detection

```shell
cp [PATH-TO-ultralytics]/yolov12n.wts .
# Build and serialize TensorRT engine
./yolov12_det -s yolov12n.wts yolov12n.engine [n/s/m/l/x]
# Run inference
./yolov12_det -d yolov12n.engine ../images [c/g]
# Results are saved in the build directory
```

### Segment

```shell
cp [PATH-TO-ultralytics]/yolov12n-seg.wts .
# Build and serialize TensorRT engine
./yolov12-seg -s yolov12n-seg.wts yolov12n-seg.engine [n/s/m/l/x]
# Run inference
./yolov12-seg -d yolov12n-seg.engine ../images
# Results are saved in the build directory
```

### Classify

```shell
cp [PATH-TO-ultralytics]/yolov12n-cls.wts .
# Build and serialize TensorRT engine
./yolov12-cls -s yolov12n-cls.wts yolov12n-cls.engine [n/s/m/l/x]
# Run inference
./yolov12-cls -d yolov12n-cls.engine ../images
# Results are saved in the build directory
```

## More Information

See the readme on the [home page](https://github.com/wang-xinyu/tensorrtx).

================================================
FILE: yolov12-tubro/src/block.cpp
================================================
#include "block.h"
#include
#include
#include
#include
#include "config.h"
#include "model.h"
#include "yololayer.h"

std::map loadWeights(const std::string file) {
    std::cout << "Loading weights: " << file << std::endl;
    std::map WeightMap;

    std::ifstream input(file);
    assert(input.is_open() && "Unable to load weight file.
please check if the .wts file path is right!!!!!!"); int32_t count; input >> count; assert(count > 0 && "Invalid weight map file."); while (count--) { nvinfer1::Weights wt{nvinfer1::DataType::kFLOAT, nullptr, 0}; uint32_t size; std::string name; input >> name >> std::dec >> size; wt.type = nvinfer1::DataType::kFLOAT; uint32_t* val = reinterpret_cast(malloc(sizeof(val) * size)); for (uint32_t x = 0, y = size; x < y; x++) { input >> std::hex >> val[x]; } wt.values = val; wt.count = size; WeightMap[name] = wt; // std::cout << "===========name: " << name << std::endl; } return WeightMap; } nvinfer1::IScaleLayer* addBatchNorm2d(nvinfer1::INetworkDefinition* network, std::map weightMap, nvinfer1::ITensor& input, std::string lname, float eps) { float* gamma = (float*)weightMap[lname + ".weight"].values; float* beta = (float*)weightMap[lname + ".bias"].values; float* mean = (float*)weightMap[lname + ".running_mean"].values; float* var = (float*)weightMap[lname + ".running_var"].values; int len = weightMap[lname + ".running_var"].count; float* scval = reinterpret_cast(malloc(sizeof(float) * len)); for (int i = 0; i < len; i++) { scval[i] = gamma[i] / sqrt(var[i] + eps); } nvinfer1::Weights scale{nvinfer1::DataType::kFLOAT, scval, len}; float* shval = reinterpret_cast(malloc(sizeof(float) * len)); for (int i = 0; i < len; i++) { shval[i] = beta[i] - mean[i] * gamma[i] / sqrt(var[i] + eps); } nvinfer1::Weights shift{nvinfer1::DataType::kFLOAT, shval, len}; float* pval = reinterpret_cast(malloc(sizeof(float) * len)); for (int i = 0; i < len; i++) { pval[i] = 1.0; } nvinfer1::Weights power{nvinfer1::DataType::kFLOAT, pval, len}; weightMap[lname + ".scale"] = scale; weightMap[lname + ".shift"] = shift; weightMap[lname + ".power"] = power; nvinfer1::IScaleLayer* output = network->addScale(input, nvinfer1::ScaleMode::kCHANNEL, shift, scale, power); assert(output); return output; } nvinfer1::IElementWiseLayer* convBnSiLU(nvinfer1::INetworkDefinition* network, std::map weightMap, 
nvinfer1::ITensor& input, int ch, std::vector k, int s, std::string lname, int p, int g, int d) { nvinfer1::Weights bias_empty{nvinfer1::DataType::kFLOAT, nullptr, 0}; nvinfer1::IConvolutionLayer* conv = network->addConvolutionNd(input, ch, nvinfer1::DimsHW{k[0], k[1]}, weightMap[lname + ".conv.weight"], bias_empty); conv->setNbGroups(g); assert(conv); conv->setStrideNd(nvinfer1::DimsHW{s, s}); // auto pad int p0 = k[0] / 2; int p1 = k[1] / 2; conv->setPaddingNd(nvinfer1::DimsHW{p0, p1}); nvinfer1::IScaleLayer* bn = addBatchNorm2d(network, weightMap, *conv->getOutput(0), lname + ".bn", 1e-3); nvinfer1::IActivationLayer* sigmoid = network->addActivation(*bn->getOutput(0), nvinfer1::ActivationType::kSIGMOID); nvinfer1::IElementWiseLayer* ew = network->addElementWise(*bn->getOutput(0), *sigmoid->getOutput(0), nvinfer1::ElementWiseOperation::kPROD); assert(ew); return ew; } static nvinfer1::ILayer* bottleneck(nvinfer1::INetworkDefinition* network, std::map weightMap, nvinfer1::ITensor& input, int c1, int c2, bool shortcut, std::vector k1, std::vector k2, float e, int g, std::string lname) { int c_ = (int)((float)c2 * e); nvinfer1::IElementWiseLayer* conv1 = convBnSiLU(network, weightMap, input, c_, k1, 1, lname + ".cv1"); nvinfer1::IElementWiseLayer* conv2 = convBnSiLU(network, weightMap, *conv1->getOutput(0), c2, k2, 1, lname + ".cv2", 0, g); if (shortcut && c1 == c2) { nvinfer1::IElementWiseLayer* ew = network->addElementWise(input, *conv2->getOutput(0), nvinfer1::ElementWiseOperation::kSUM); return ew; } return conv2; } nvinfer1::IShuffleLayer* DFL(nvinfer1::INetworkDefinition* network, std::map weightMap, nvinfer1::ITensor& input, int ch, int grid, int k, int s, int p, std::string lname) { nvinfer1::IShuffleLayer* shuffle1 = network->addShuffle(input); shuffle1->setReshapeDimensions(nvinfer1::Dims4{kBatchSize, 4, 16, grid}); shuffle1->setSecondTranspose(nvinfer1::Permutation{0, 2, 1, 3}); nvinfer1::ISoftMaxLayer* softmax = 
/**
 * Create the "YoloLayer_TRT" decode plugin and attach it to the detection-head outputs.
 *
 * The plugin receives one INT32 field named "combinedInfo" laid out as
 *   [0] class count            [1] kNumberOfPoints     [2] kConfThreshKeypoints
 *   [3] input width            [4] input height        [5] kMaxNumOutputBbox
 *   [6] is_segmentation        [7] is_pose             [8] is_obb
 *   [9 ...] the px_arry_num values of px_arry
 *
 * @param network          network definition the plugin layer is added to
 * @param dets             head layers; output 0 of each becomes a plugin input
 * @param px_arry          px_arry_num integers appended after the nine fixed entries
 * @param px_arry_num      number of entries in px_arry
 * @param is_segmentation  forwarded to the plugin
 * @param is_pose          selects kPoseNumClass as class count; forwarded to the plugin
 * @param is_obb           selects kObbNumClass / kObbInputW / kObbInputH; forwarded to the plugin
 * @return the plugin layer
 */
nvinfer1::IPluginV2Layer* addYoLoLayer(nvinfer1::INetworkDefinition* network,
                                       std::vector<nvinfer1::IConcatenationLayer*> dets, const int* px_arry,
                                       int px_arry_num, bool is_segmentation, bool is_pose, bool is_obb) {
    auto creator = getPluginRegistry()->getPluginCreator("YoloLayer_TRT", "1");
    // A null creator means the plugin was never registered; fail here rather than on the
    // createPlugin() call below.
    assert(creator && "YoloLayer_TRT plugin creator is not registered");
    assert(px_arry_num >= 0 && (px_arry != nullptr || px_arry_num == 0));

    const int netinfo_count = 9;                          // fixed entries [0..8] described above
    const int total_count = netinfo_count + px_arry_num;  // fixed entries followed by px_arry
    std::vector<int> combinedInfo(total_count);

    int class_num = kNumClass;
    if (is_pose)
        class_num = kPoseNumClass;
    else if (is_obb)
        class_num = kObbNumClass;

    const int input_w = is_obb ? kObbInputW : kInputW;
    const int input_h = is_obb ? kObbInputH : kInputH;

    combinedInfo[0] = class_num;
    combinedInfo[1] = kNumberOfPoints;
    // NOTE(review): stored as int; if kConfThreshKeypoints is a fractional float this
    // truncates it - confirm against config.h and the plugin's reader.
    combinedInfo[2] = kConfThreshKeypoints;
    combinedInfo[3] = input_w;
    combinedInfo[4] = input_h;
    combinedInfo[5] = kMaxNumOutputBbox;
    combinedInfo[6] = is_segmentation;
    combinedInfo[7] = is_pose;
    combinedInfo[8] = is_obb;

    // Append px_arry after the nine fixed entries.
    std::copy(px_arry, px_arry + px_arry_num, combinedInfo.begin() + netinfo_count);

    // Single field carrying the whole array.
    nvinfer1::PluginField pluginField;
    pluginField.name = "combinedInfo";
    pluginField.data = combinedInfo.data();
    pluginField.type = nvinfer1::PluginFieldType::kINT32;
    pluginField.length = combinedInfo.size();

    nvinfer1::PluginFieldCollection pluginFieldCollection;
    pluginFieldCollection.nbFields = 1;
    pluginFieldCollection.fields = &pluginField;

    nvinfer1::IPluginV2* pluginObject = creator->createPlugin("yololayer", &pluginFieldCollection);
    assert(pluginObject && "failed to create the YoloLayer_TRT plugin");

    // Output 0 of every head layer is a plugin input.
    std::vector<nvinfer1::ITensor*> inputTensors;
    inputTensors.reserve(dets.size());
    for (auto det : dets) {
        inputTensors.push_back(det->getOutput(0));
    }

    nvinfer1::IPluginV2Layer* yoloLayer =
            network->addPluginV2(inputTensors.data(), inputTensors.size(), *pluginObject);
    assert(yoloLayer);
    return yoloLayer;
}
/**
 * C3k2 block: a 1x1 Conv to 2*c channels, split into two halves along the channel axis,
 * then n inner blocks chained off the second half, with every inner output concatenated
 * onto the two halves before a final 1x1 Conv-BN-SiLU to c2 channels.
 *
 * @param network   network definition the layers are added to
 * @param weightMap name -> weights table
 * @param input     block input tensor (indexed as 4-D below)
 * @param c2        output channel count
 * @param n         number of inner blocks
 * @param lname     weight-name prefix (`.cv1`, `.m.<i>`, `.cv2`)
 * @param c3k       true: inner blocks are C3k; false: 3x3/3x3 bottlenecks with ratio 0.5
 * @param e         hidden ratio: c = int(c2 * e)
 * @param g         group count forwarded to the inner blocks
 * @param shortcut  shortcut flag forwarded to the inner blocks
 * @return the final Conv-BN-SiLU layer
 */
nvinfer1::IElementWiseLayer* C3K2(nvinfer1::INetworkDefinition* network,
                                  std::map<std::string, nvinfer1::Weights>& weightMap, nvinfer1::ITensor& input,
                                  int c2, int n, std::string lname, bool c3k, float e, int g, bool shortcut) {
    int c = int(c2 * float(e));

    nvinfer1::ILayer* cv1 = Conv(network, weightMap, input, 2 * c, lname + ".cv1", 1, 1);
    assert(cv1);
    nvinfer1::ITensor* cv1_out = cv1->getOutput(0);

    // Query the shape once; both halves share the same extent.
    const nvinfer1::Dims d = cv1_out->getDimensions();
    const auto half = d.d[1] / 2;
    const nvinfer1::Dims4 half_size{d.d[0], half, d.d[2], d.d[3]};
    const nvinfer1::Dims4 unit_stride{1, 1, 1, 1};

    nvinfer1::ISliceLayer* sl0 = network->addSlice(*cv1_out, nvinfer1::Dims4{0, 0, 0, 0}, half_size, unit_stride);
    nvinfer1::ISliceLayer* sl1 = network->addSlice(*cv1_out, nvinfer1::Dims4{0, half, 0, 0}, half_size, unit_stride);
    assert(sl0 && sl1);

    nvinfer1::ITensor* halves[] = {sl0->getOutput(0), sl1->getOutput(0)};
    nvinfer1::IConcatenationLayer* cat = network->addConcatenation(halves, 2);
    assert(cat);

    // Inner blocks are chained off the second half; each output is appended to the concat.
    nvinfer1::ITensor* current = sl1->getOutput(0);
    for (int i = 0; i < n; i++) {
        const std::string inner = lname + ".m." + std::to_string(i);
        nvinfer1::ILayer* b;
        if (c3k) {
            b = C3k(network, weightMap, *current, c, inner, 2, shortcut, g);
        } else {
            b = bottleneck(network, weightMap, *current, c, c, shortcut, {3, 3}, {3, 3}, 0.5, g, inner);
        }
        assert(b);
        current = b->getOutput(0);

        nvinfer1::ITensor* grown[] = {cat->getOutput(0), current};
        cat = network->addConcatenation(grown, 2);
        assert(cat);
    }

    nvinfer1::IElementWiseLayer* cv2 =
            convBnSiLU(network, weightMap, *cat->getOutput(0), c2, {1, 1}, 1, lname + ".cv2");
    return cv2;
}
qk_reshape->setReshapeDimensions(nvinfer1::Dims3{B, N, C * 2}); nvinfer1::IShuffleLayer* v_reshape = network->addShuffle(*v_flatten_t->getOutput(0)); v_reshape->setReshapeDimensions(nvinfer1::Dims3{B, N, C}); q_k = qk_reshape->getOutput(0); v_ = v_reshape->getOutput(0); } nvinfer1::Dims q_k_dim = q_k->getDimensions(); nvinfer1::ISliceLayer* q = network->addSlice(*q_k, nvinfer1::Dims3{0, 0, 0}, nvinfer1::Dims3{q_k_dim.d[0], q_k_dim.d[1], q_k_dim.d[2] / 2}, nvinfer1::Dims3{1, 1, 1}); nvinfer1::ISliceLayer* k = network->addSlice(*q_k, nvinfer1::Dims3{0, 0, q_k_dim.d[2] / 2}, nvinfer1::Dims3{q_k_dim.d[0], q_k_dim.d[1], q_k_dim.d[2] / 2}, nvinfer1::Dims3{1, 1, 1}); nvinfer1::IShuffleLayer* q_reshape = network->addShuffle(*q->getOutput(0)); q_reshape->setReshapeDimensions(nvinfer1::Dims4{B, N, num_heads, head_dim}); nvinfer1::IShuffleLayer* k_reshape = network->addShuffle(*k->getOutput(0)); k_reshape->setReshapeDimensions(nvinfer1::Dims4{B, N, num_heads, head_dim}); nvinfer1::IShuffleLayer* v_reshape = network->addShuffle(*v_); v_reshape->setReshapeDimensions(nvinfer1::Dims4{B, N, num_heads, head_dim}); // (B, N, num_head, head_dim)--->(B, num_head, head_dim, N) nvinfer1::IShuffleLayer* q_t_view = network->addShuffle(*q_reshape->getOutput(0)); q_t_view->setFirstTranspose(nvinfer1::Permutation{0, 2, 3, 1}); nvinfer1::IShuffleLayer* k_t_view = network->addShuffle(*k_reshape->getOutput(0)); k_t_view->setFirstTranspose(nvinfer1::Permutation{0, 2, 3, 1}); nvinfer1::IShuffleLayer* v_t_view = network->addShuffle(*v_reshape->getOutput(0)); v_t_view->setFirstTranspose(nvinfer1::Permutation{0, 2, 3, 1}); nvinfer1::IShuffleLayer* q_T = network->addShuffle(*q_t_view->getOutput(0)); q_T->setFirstTranspose(nvinfer1::Permutation{0, 1, 3, 2}); // (B, num_head, N, head_dim, N) nvinfer1::IMatrixMultiplyLayer* q_mul_k = network->addMatrixMultiply(*q_T->getOutput(0), nvinfer1::MatrixOperation::kNONE, *k_t_view->getOutput(0), nvinfer1::MatrixOperation::kNONE); float scale = 1.0 / 
sqrt(head_dim); float* scale_val = reinterpret_cast(malloc(sizeof(float) * 1)); scale_val[0] = scale; nvinfer1::Weights s_w{nvinfer1::DataType::kFLOAT, scale_val, 1}; // scale float* shift_val = reinterpret_cast(malloc(sizeof(float) * 1)); shift_val[0] = 0; nvinfer1::Weights sh_w{nvinfer1::DataType::kFLOAT, shift_val, 1}; // shift float* power_val = reinterpret_cast(malloc(sizeof(float) * 1)); power_val[0] = 1; nvinfer1::Weights p_w{nvinfer1::DataType::kFLOAT, power_val, 1}; // power nvinfer1::IScaleLayer* q_mul_k_scale = network->addScale(*q_mul_k->getOutput(0), nvinfer1::ScaleMode::kUNIFORM, sh_w, s_w, p_w); nvinfer1::IReduceLayer* attn_max = network->addReduce(*q_mul_k_scale->getOutput(0), nvinfer1::ReduceOperation::kMAX, 1 << 3, true); nvinfer1::IElementWiseLayer* attn_sub = network->addElementWise( *q_mul_k_scale->getOutput(0), *attn_max->getOutput(0), nvinfer1::ElementWiseOperation::kSUB); nvinfer1::IUnaryLayer* attn_exp = network->addUnary(*attn_sub->getOutput(0), nvinfer1::UnaryOperation::kEXP); nvinfer1::IReduceLayer* attn_sum = network->addReduce(*attn_exp->getOutput(0), nvinfer1::ReduceOperation::kSUM, 1 << 3, true); nvinfer1::IElementWiseLayer* attn_div = network->addElementWise(*attn_exp->getOutput(0), *attn_sum->getOutput(0), nvinfer1::ElementWiseOperation::kDIV); nvinfer1::IShuffleLayer* attn_t = network->addShuffle(*attn_div->getOutput(0)); attn_t->setFirstTranspose(nvinfer1::Permutation{0, 1, 3, 2}); nvinfer1::IMatrixMultiplyLayer* attn_v = network->addMatrixMultiply(*v_t_view->getOutput(0), nvinfer1::MatrixOperation::kNONE, *attn_t->getOutput(0), nvinfer1::MatrixOperation::kNONE); nvinfer1::IShuffleLayer* attn_v_t = network->addShuffle(*attn_v->getOutput(0)); attn_v_t->setFirstTranspose(nvinfer1::Permutation{0, 3, 1, 2}); nvinfer1::ITensor* attn_temp = attn_v_t->getOutput(0); if (area > 1) { B = B / area; N = N * area; nvinfer1::IShuffleLayer* attn_v_t_r = network->addShuffle(*attn_v_t->getOutput(0)); 
attn_v_t_r->setReshapeDimensions(nvinfer1::Dims3{B, N, C}); attn_temp = attn_v_t_r->getOutput(0); } nvinfer1::IShuffleLayer* attn_x = network->addShuffle(*attn_temp); attn_x->setReshapeDimensions(nvinfer1::Dims4{B, H, W, C}); attn_x->setSecondTranspose(nvinfer1::Permutation{0, 3, 1, 2}); nvinfer1::IElementWiseLayer* x_add_pp = network->addElementWise(*attn_x->getOutput(0), *pe->getOutput(0), nvinfer1::ElementWiseOperation::kSUM); nvinfer1::ILayer* proj = Conv(network, weightMap, *x_add_pp->getOutput(0), dim, lname + ".proj", 1, 1, 0, 1, false); return proj; } nvinfer1::IElementWiseLayer* ABlock(nvinfer1::INetworkDefinition* network, std::map weightMap, nvinfer1::ITensor& input, int dim, int num_heads, std::string lname, float mlp_ratio, int area) { nvinfer1::ILayer* attn = AAttn(network, weightMap, input, dim, num_heads, lname + ".attn", area); nvinfer1::IElementWiseLayer* add1 = // x = x + self.attn(x) network->addElementWise(input, *attn->getOutput(0), nvinfer1::ElementWiseOperation::kSUM); int mlp_hidden_dim = int(dim * mlp_ratio); nvinfer1::ILayer* mlp_0 = Conv(network, weightMap, *add1->getOutput(0), mlp_hidden_dim, lname + ".mlp.0", 1, 1, 0, 1, true); nvinfer1::ILayer* mlp_1 = Conv(network, weightMap, *mlp_0->getOutput(0), dim, lname + ".mlp.1", 1, 1, 0, 1, false); nvinfer1::IElementWiseLayer* result = network->addElementWise(*add1->getOutput(0), *mlp_1->getOutput(0), nvinfer1::ElementWiseOperation::kSUM); return result; } nvinfer1::ILayer* A2C2f(nvinfer1::INetworkDefinition* network, std::map weightMap, nvinfer1::ITensor& input, int c2, int n, std::string lname, bool a2, int area, bool residual, float mlp_ratio, float e, int g, bool shortcut) { int c_ = static_cast(c2 * e); assert(c_ % 32 == 0 && "Dimension of ABlock must be a multiple of 32"); int num_heads = c_ / 32; nvinfer1::ILayer* cv1 = Conv(network, weightMap, input, c_, lname + ".cv1", 1, 1); std::vector y{cv1->getOutput(0)}; nvinfer1::ITensor* current = cv1->getOutput(0); for (int i = 0; i < n; i++) 
{ if (a2) { nvinfer1::ILayer* m_0 = ABlock(network, weightMap, *current, c_, num_heads, lname + ".m." + std::to_string(i) + ".0", mlp_ratio, area); nvinfer1::ILayer* m_1 = ABlock(network, weightMap, *m_0->getOutput(0), c_, num_heads, lname + ".m." + std::to_string(i) + ".1", mlp_ratio, area); current = m_1->getOutput(0); } else { // C3k nvinfer1::ILayer* m = C3k(network, weightMap, *current, c_, lname + ".m." + std::to_string(i), 2, shortcut, g); current = m->getOutput(0); } y.push_back(current); } nvinfer1::IConcatenationLayer* cat = network->addConcatenation(y.data(), static_cast(y.size())); cat->setAxis(1); nvinfer1::ILayer* cv2 = Conv(network, weightMap, *cat->getOutput(0), c2, lname + ".cv2", 1, 1); if (a2 && residual) { // std::cout << lname << " applying residual connection with gamma" << std::endl; nvinfer1::Weights gamma = weightMap[lname + ".gamma"]; nvinfer1::IConstantLayer* gamma_layer = network->addConstant(nvinfer1::Dims4{1, c2, 1, 1}, gamma); nvinfer1::IElementWiseLayer* scaled_output = network->addElementWise( *gamma_layer->getOutput(0), *cv2->getOutput(0), nvinfer1::ElementWiseOperation::kPROD); nvinfer1::IElementWiseLayer* result = network->addElementWise(input, *scaled_output->getOutput(0), nvinfer1::ElementWiseOperation::kSUM); return result; } else { return cv2; } } ================================================ FILE: yolov12-tubro/src/calibrator.cpp ================================================ #include "calibrator.h" #include #include #include #include #include "cuda_utils.h" #include "utils.h" Int8EntropyCalibrator2::Int8EntropyCalibrator2(int batchsize, int input_w, int input_h, const char* img_dir, const char* calib_table_name, const char* input_blob_name, bool read_cache) : batchsize_(batchsize), input_w_(input_w), input_h_(input_h), img_idx_(0), img_dir_(img_dir), calib_table_name_(calib_table_name), input_blob_name_(input_blob_name), read_cache_(read_cache) { input_count_ = 3 * input_w * input_h * batchsize; 
CUDA_CHECK(cudaMalloc(&device_input_, input_count_ * sizeof(float))); read_files_in_dir(img_dir, img_files_); } Int8EntropyCalibrator2::~Int8EntropyCalibrator2() { CUDA_CHECK(cudaFree(device_input_)); } int Int8EntropyCalibrator2::getBatchSize() const TRT_NOEXCEPT { return batchsize_; } bool Int8EntropyCalibrator2::getBatch(void* bindings[], const char* names[], int nbBindings) TRT_NOEXCEPT { if (img_idx_ + batchsize_ > (int)img_files_.size()) { return false; } std::vector input_imgs_; for (int i = img_idx_; i < img_idx_ + batchsize_; i++) { std::cout << img_files_[i] << " " << i << std::endl; cv::Mat temp = cv::imread(img_dir_ + "/" + img_files_[i]); if (temp.empty()) { std::cerr << "Fatal error: image cannot open!" << std::endl; return false; } cv::Mat pr_img = preprocess_img(temp, input_w_, input_h_); input_imgs_.push_back(pr_img); } img_idx_ += batchsize_; cv::Mat blob = cv::dnn::blobFromImages(input_imgs_, 1.0 / 255.0, cv::Size(input_w_, input_h_), cv::Scalar(0, 0, 0), true, false); CUDA_CHECK(cudaMemcpy(device_input_, blob.ptr(0), input_count_ * sizeof(float), cudaMemcpyHostToDevice)); assert(!strcmp(names[0], input_blob_name_)); bindings[0] = device_input_; return true; } const void* Int8EntropyCalibrator2::readCalibrationCache(size_t& length) TRT_NOEXCEPT { std::cout << "reading calib cache: " << calib_table_name_ << std::endl; calib_cache_.clear(); std::ifstream input(calib_table_name_, std::ios::binary); input >> std::noskipws; if (read_cache_ && input.good()) { std::copy(std::istream_iterator(input), std::istream_iterator(), std::back_inserter(calib_cache_)); } length = calib_cache_.size(); return length ? 
// Scale a nominal channel count: cap x at max_channels, multiply by the width factor gw,
// then round up to the next multiple of divisor.
static int get_width(int x, float gw, int max_channels, int divisor = 8) {
    const int capped = std::min(x, max_channels);
    const int blocks = int(ceil((capped * gw) / divisor));
    return blocks * divisor;
}

// Scale a nominal repeat count by the depth factor gd. x == 1 is returned unchanged.
// Otherwise x * gd is rounded; an exact .5 tie whose integer part is even is rounded down
// instead of up, and the result is never less than 1.
static int get_depth(int x, float gd) {
    if (x == 1) {
        return 1;
    }

    const float scaled = x * gd;
    const int whole = int(scaled);
    const bool tie_on_even = (scaled - whole == 0.5) && (whole % 2 == 0);

    int repeats = round(scaled);
    if (tie_on_even) {
        repeats -= 1;
    }
    return repeats < 1 ? 1 : repeats;
}
#if defined(USE_INT8) nvinfer1::ITensor* inputTensors[] = {bn->getOutput(0)}; auto concat = network->addConcatenation(inputTensors, 1); nvinfer1::IActivationLayer* sigmoid = network->addActivation(*concat->getOutput(0), nvinfer1::ActivationType::kSIGMOID); assert(sigmoid); bn->setName((lname + ".sigmoid").c_str()); nvinfer1::IElementWiseLayer* ew = network->addElementWise(*concat->getOutput(0), *sigmoid->getOutput(0), nvinfer1::ElementWiseOperation::kPROD); assert(ew); ew->setName((lname + ".silu").c_str()); #else nvinfer1::IActivationLayer* sigmoid = network->addActivation(*bn->getOutput(0), nvinfer1::ActivationType::kSIGMOID); assert(sigmoid); bn->setName((lname + ".sigmoid").c_str()); nvinfer1::IElementWiseLayer* ew = network->addElementWise(*bn->getOutput(0), *sigmoid->getOutput(0), nvinfer1::ElementWiseOperation::kPROD); assert(ew); ew->setName((lname + ".silu").c_str()); #endif return ew; } static nvinfer1::IElementWiseLayer* Proto(nvinfer1::INetworkDefinition* network, std::map& weightMap, nvinfer1::ITensor& input, std::string lname, float gw, int max_channels) { int mid_channel = get_width(256, gw, max_channels); auto cv1 = convBnSiLU(network, weightMap, input, mid_channel, {3, 3}, 1, lname + ".cv1"); // float *convTranpsose_bais = (float *) weightMap["model.23.proto.upsample.bias"].values; // int convTranpsose_bais_len = weightMap["model.23.proto.upsample.bias"].count; // nvinfer1::Weights bias{nvinfer1::DataType::kFLOAT, convTranpsose_bais, convTranpsose_bais_len}; auto convTranpsose = network->addDeconvolutionNd(*cv1->getOutput(0), mid_channel, nvinfer1::DimsHW{2, 2}, weightMap[lname + ".upsample.weight"], weightMap[lname + ".upsample.bias"]); assert(convTranpsose); convTranpsose->setStrideNd(nvinfer1::DimsHW{2, 2}); convTranpsose->setPadding(nvinfer1::DimsHW{0, 0}); auto cv2 = convBnSiLU(network, weightMap, *convTranpsose->getOutput(0), mid_channel, {3, 3}, 1, lname + ".cv2"); auto cv3 = convBnSiLUProto(network, weightMap, *cv2->getOutput(0), 32, 1, 1, 
0, lname + ".cv3"); assert(cv3); return cv3; } static nvinfer1::IShuffleLayer* cv4_conv_combined(nvinfer1::INetworkDefinition* network, std::map& weightMap, nvinfer1::ITensor& input, std::string lname, int grid_shape, float gw, const std::string& algo_type, int max_channels) { int nm_nk = 0; int c4 = 0; if (algo_type == "seg") { nm_nk = 32; c4 = std::max(get_width(256, gw, max_channels) / 4, nm_nk); } else if (algo_type == "pose") { nm_nk = kNumberOfPoints * 3; c4 = std::max(get_width(256, gw, max_channels) / 4, kNumberOfPoints * 3); } auto cv0 = convBnSiLU(network, weightMap, input, c4, {3, 3}, 1, lname + ".0"); auto cv1 = convBnSiLU(network, weightMap, *cv0->getOutput(0), c4, {3, 3}, 1, lname + ".1"); float* cv2_bais_value = (float*)weightMap[lname + ".2" + ".bias"].values; int cv2_bais_len = weightMap[lname + ".2" + ".bias"].count; nvinfer1::Weights cv2_bais{nvinfer1::DataType::kFLOAT, cv2_bais_value, cv2_bais_len}; auto cv2 = network->addConvolutionNd(*cv1->getOutput(0), nm_nk, nvinfer1::DimsHW{1, 1}, weightMap[lname + ".2" + ".weight"], cv2_bais); cv2->setStrideNd(nvinfer1::DimsHW{1, 1}); nvinfer1::IShuffleLayer* cv2_shuffle = network->addShuffle(*cv2->getOutput(0)); cv2_shuffle->setReshapeDimensions(nvinfer1::Dims3{kBatchSize, nm_nk, grid_shape}); return cv2_shuffle; } void calculateStrides(nvinfer1::IElementWiseLayer* conv_layers[], int size, int reference_size, int strides[]) { for (int i = 0; i < size; ++i) { nvinfer1::ILayer* layer = conv_layers[i]; nvinfer1::Dims dims = layer->getOutput(0)->getDimensions(); int feature_map_size = dims.d[2]; strides[i] = reference_size / feature_map_size; } } void calculateStrides(nvinfer1::ILayer* conv_layers[], int size, int reference_size, int strides[]) { for (int i = 0; i < size; ++i) { nvinfer1::ILayer* layer = conv_layers[i]; nvinfer1::Dims dims = layer->getOutput(0)->getDimensions(); int feature_map_size = dims.d[2]; strides[i] = reference_size / feature_map_size; } } nvinfer1::IHostMemory* 
buildEngineYolov12Cls(nvinfer1::IBuilder* builder, nvinfer1::IBuilderConfig* config, nvinfer1::DataType dt, const std::string& wts_path, float& gd, float& gw, std::string& type, int max_channels) { std::map weightMap = loadWeights(wts_path); nvinfer1::INetworkDefinition* network = builder->createNetworkV2( 1U << static_cast(nvinfer1::NetworkDefinitionCreationFlag::kEXPLICIT_BATCH)); nvinfer1::ITensor* data = network->addInput(kInputTensorName, dt, nvinfer1::Dims4{kBatchSize, 3, kClsInputH, kClsInputW}); assert(data); nvinfer1::ILayer* conv0 = Conv(network, weightMap, *data, get_width(64, gw, max_channels), "model.0", 3, 2); nvinfer1::ILayer* conv1 = Conv(network, weightMap, *conv0->getOutput(0), get_width(128, gw, max_channels), "model.1", 3, 2, 1, 2); bool c3k2 = false; if (type == "m" || type == "l" || type == "x") { c3k2 = true; } float mlp_ratio = 2.0; bool residual = true; if (type == "l" || type == "x") { //mlp_ratio = 1.5; // if use the official's pretrained model,you are supposed to use 1.5 mlp_ratio = 1; // your ownself 's model // residual = true; } nvinfer1::ILayer* conv2 = C3K2(network, weightMap, *conv1->getOutput(0), get_width(256, gw, max_channels), get_depth(2, gd), "model.2", c3k2, 0.25); nvinfer1::ILayer* conv3 = Conv(network, weightMap, *conv2->getOutput(0), get_width(256, gw, max_channels), "model.3", 3, 2, 1, 4); nvinfer1::ILayer* conv4 = C3K2(network, weightMap, *conv3->getOutput(0), get_width(512, gw, max_channels), get_depth(2, gd), "model.4", c3k2, 0.25); nvinfer1::ILayer* conv5 = Conv(network, weightMap, *conv4->getOutput(0), get_width(512, gw, max_channels), "model.5", 3, 2); nvinfer1::ILayer* conv6 = A2C2f(network, weightMap, *conv5->getOutput(0), get_width(512, gw, max_channels), get_depth(4, gd), "model.6", true, 1, residual, mlp_ratio); nvinfer1::ILayer* conv7 = Conv(network, weightMap, *conv6->getOutput(0), get_width(1024, gw, max_channels), "model.7", 3, 2); nvinfer1::ILayer* conv8 = A2C2f(network, weightMap, *conv7->getOutput(0), 
get_width(1024, gw, max_channels), get_depth(4, gd), "model.8", true, 1, residual, mlp_ratio); nvinfer1::ILayer* conv_class = Conv(network, weightMap, *conv8->getOutput(0), 1280, "model.9.conv"); nvinfer1::Dims dim = conv_class->getOutput(0)->getDimensions(); assert(dim.nbDims == 4); nvinfer1::IPoolingLayer* pool2 = network->addPoolingNd(*conv_class->getOutput(0), nvinfer1::PoolingType::kAVERAGE, nvinfer1::DimsHW{dim.d[2], dim.d[3]}); nvinfer1::IShuffleLayer* shuffle_0 = network->addShuffle(*pool2->getOutput(0)); shuffle_0->setReshapeDimensions(nvinfer1::Dims2{kBatchSize, 1280}); auto linear_weight = weightMap["model.9.linear.weight"]; auto constant_weight = network->addConstant(nvinfer1::Dims2{kClsNumClass, 1280}, linear_weight); auto constant_bias = network->addConstant(nvinfer1::Dims2{kBatchSize, kClsNumClass}, weightMap["model.9.linear.bias"]); auto linear_matrix_multipy = network->addMatrixMultiply(*shuffle_0->getOutput(0), nvinfer1::MatrixOperation::kNONE, *constant_weight->getOutput(0), nvinfer1::MatrixOperation::kTRANSPOSE); auto yolo = network->addElementWise(*linear_matrix_multipy->getOutput(0), *constant_bias->getOutput(0), nvinfer1::ElementWiseOperation::kSUM); assert(yolo); yolo->getOutput(0)->setName(kOutputTensorName); network->markOutput(*yolo->getOutput(0)); // Set the maximum batch size and workspace size config->setMemoryPoolLimit(nvinfer1::MemoryPoolType::kWORKSPACE, 16 * (1 << 20)); // Configuration according to the precision mode being used #if defined(USE_FP16) config->setFlag(nvinfer1::BuilderFlag::kFP16); #elif defined(USE_INT8) std::cout << "Your platform supports int8: " << (builder->platformHasFastInt8() ? 
"true" : "false") << std::endl; assert(builder->platformHasFastInt8()); config->setFlag(nvinfer1::BuilderFlag::kINT8); auto* calibrator = new Int8EntropyCalibrator2(kBatchSize, kClsInputW, kClsInputH, kInputQuantizationFolder, "int8calib.table", kInputTensorName); config->setInt8Calibrator(calibrator); #endif // Begin building the engine; this may take a while std::cout << "Building engine, please wait for a while..." << std::endl; nvinfer1::IHostMemory* serialized_model = builder->buildSerializedNetwork(*network, *config); std::cout << "Build engine successfully!" << std::endl; // Cleanup the network definition and allocated weights delete network; for (auto& mem : weightMap) { free((void*)(mem.second.values)); } return serialized_model; } nvinfer1::IHostMemory* buildEngineYolov12Det(nvinfer1::IBuilder* builder, nvinfer1::IBuilderConfig* config, nvinfer1::DataType dt, const std::string& wts_path, float& gd, float& gw, int& max_channels, std::string& type) { std::map weightMap = loadWeights(wts_path); nvinfer1::INetworkDefinition* network = builder->createNetworkV2( 1U << static_cast(nvinfer1::NetworkDefinitionCreationFlag::kEXPLICIT_BATCH)); // ===================== input =================================================== nvinfer1::ITensor* data = network->addInput(kInputTensorName, dt, nvinfer1::Dims4{kBatchSize, 3, kInputH, kInputW}); assert(data); // ===================== backbone =================================================== nvinfer1::IElementWiseLayer* conv0 = convBnSiLU(network, weightMap, *data, get_width(64, gw, max_channels), {3, 3}, 2, "model.0"); nvinfer1::IElementWiseLayer* conv1 = convBnSiLU(network, weightMap, *conv0->getOutput(0), get_width(128, gw, max_channels), {3, 3}, 2, "model.1", 1, 2); bool c3k2 = false; if (type == "m" || type == "l" || type == "x") { c3k2 = true; } float mlp_ratio = 2.0; bool residual = false; if (type == "l" || type == "x") { mlp_ratio = 1.5; // see the yolov12-seg/ultralytics/nn/tasks.py/parse_model() residual = 
true; } /* nvinfer1::IElementWiseLayer* C3K2(nvinfer1::INetworkDefinition * network, std::map & weightMap, nvinfer1::ITensor & input, int c2, int n, std::string lname, bool c3k, float e, int g, bool shortcut)*/ nvinfer1::IElementWiseLayer* conv2 = C3K2(network, weightMap, *conv1->getOutput(0), get_width(256, gw, max_channels), get_depth(2, gd), "model.2", c3k2, 0.25); nvinfer1::IElementWiseLayer* conv3 = convBnSiLU(network, weightMap, *conv2->getOutput(0), get_width(256, gw, max_channels), {3, 3}, 2, "model.3", 1, 4); nvinfer1::IElementWiseLayer* conv4 = C3K2(network, weightMap, *conv3->getOutput(0), get_width(512, gw, max_channels), get_depth(2, gd), "model.4", c3k2, 0.25); nvinfer1::IElementWiseLayer* conv5 = convBnSiLU(network, weightMap, *conv4->getOutput(0), get_width(512, gw, max_channels), {3, 3}, 2, "model.5"); /*nvinfer1::ILayer* A2C2f(nvinfer1::INetworkDefinition * network, std::map weightMap, nvinfer1::ITensor & input, int c2, int n, std::string lname, bool a2, int area, bool residual, float mlp_ratio, float e, int g, bool shortcut)*/ nvinfer1::ILayer* conv6 = A2C2f(network, weightMap, *conv5->getOutput(0), get_width(512, gw, max_channels), get_depth(4, gd), "model.6", true, 4, residual, mlp_ratio); nvinfer1::IElementWiseLayer* conv7 = convBnSiLU(network, weightMap, *conv6->getOutput(0), get_width(1024, gw, max_channels), {3, 3}, 2, "model.7"); nvinfer1::ILayer* conv8 = A2C2f(network, weightMap, *conv7->getOutput(0), get_width(1024, gw, max_channels), get_depth(4, gd), "model.8", true, 1, residual, mlp_ratio); // ========================= neck ==================================================================== float scale[] = {1.0, 1.0, 2.0, 2.0}; nvinfer1::IResizeLayer* upsample9 = network->addResize(*conv8->getOutput(0)); upsample9->setResizeMode(nvinfer1::ResizeMode::kNEAREST); upsample9->setScales(scale, 4); nvinfer1::ITensor* inputTensors10[] = {upsample9->getOutput(0), conv6->getOutput(0)}; nvinfer1::IConcatenationLayer* cat10 = 
network->addConcatenation(inputTensors10, 2); /*nvinfer1::ILayer* A2C2f(nvinfer1::INetworkDefinition * network, std::map weightMap, nvinfer1::ITensor & input, int c2, std::string lname, int n, bool a2, int area, bool residual, float mlp_ratio, float e, int g, bool shortcut)*/ nvinfer1::ILayer* conv11 = A2C2f(network, weightMap, *cat10->getOutput(0), get_width(512, gw, max_channels), get_depth(2, gd), "model.11", false, -1, residual, mlp_ratio); nvinfer1::IResizeLayer* upsample12 = network->addResize(*conv11->getOutput(0)); upsample12->setResizeMode(nvinfer1::ResizeMode::kNEAREST); upsample12->setScales(scale, 4); nvinfer1::ITensor* inputTensors13[] = {upsample12->getOutput(0), conv4->getOutput(0)}; nvinfer1::IConcatenationLayer* cat13 = network->addConcatenation(inputTensors13, 2); nvinfer1::ILayer* conv14 = A2C2f(network, weightMap, *cat13->getOutput(0), get_width(256, gw, max_channels), get_depth(2, gd), "model.14", false, -1, residual, mlp_ratio); nvinfer1::IElementWiseLayer* conv15 = convBnSiLU(network, weightMap, *conv14->getOutput(0), get_width(256, gw, max_channels), {3, 3}, 2, "model.15"); nvinfer1::ITensor* inputTensors16[] = {conv15->getOutput(0), conv11->getOutput(0)}; nvinfer1::IConcatenationLayer* cat16 = network->addConcatenation(inputTensors16, 2); nvinfer1::ILayer* conv17 = A2C2f(network, weightMap, *cat16->getOutput(0), get_width(512, gw, max_channels), get_depth(2, gd), "model.17", false, -1, residual, mlp_ratio); nvinfer1::IElementWiseLayer* conv18 = convBnSiLU(network, weightMap, *conv17->getOutput(0), get_width(512, gw, max_channels), {3, 3}, 2, "model.18"); nvinfer1::ITensor* inputTensors19[] = {conv18->getOutput(0), conv8->getOutput(0)}; nvinfer1::IConcatenationLayer* cat19 = network->addConcatenation(inputTensors19, 2); nvinfer1::IElementWiseLayer* conv20 = C3K2(network, weightMap, *cat19->getOutput(0), get_width(1024, gw, max_channels), get_depth(2, gd), "model.20", true); // =============================== output 
=================================================================== int c2 = std::max(std::max(16, get_width(256, gw, max_channels) / 4), 16 * 4); int c3 = std::max(get_width(256, gw, max_channels), std::min(kNumClass, 100)); // output0 location nvinfer1::IElementWiseLayer* conv21_cv2_0_0 = convBnSiLU(network, weightMap, *conv14->getOutput(0), c2, {3, 3}, 1, "model.21.cv2.0.0"); nvinfer1::IElementWiseLayer* conv21_cv2_0_1 = convBnSiLU(network, weightMap, *conv21_cv2_0_0->getOutput(0), c2, {3, 3}, 1, "model.21.cv2.0.1"); nvinfer1::IConvolutionLayer* conv21_cv2_0_2 = network->addConvolutionNd(*conv21_cv2_0_1->getOutput(0), 64, nvinfer1::DimsHW{1, 1}, weightMap["model.21.cv2.0.2.weight"], weightMap["model.21.cv2.0.2.bias"]); conv21_cv2_0_2->setStrideNd(nvinfer1::DimsHW{1, 1}); conv21_cv2_0_2->setPaddingNd(nvinfer1::DimsHW{0, 0}); // output0 classes auto* conv21_cv3_0_0_0 = DWConv(network, weightMap, *conv14->getOutput(0), get_width(256, gw, max_channels), {3, 3}, 1, "model.21.cv3.0.0.0"); nvinfer1::IElementWiseLayer* conv21_cv3_0_0_1 = convBnSiLU(network, weightMap, *conv21_cv3_0_0_0->getOutput(0), c3, {1, 1}, 1, "model.21.cv3.0.0.1"); auto* conv21_cv3_0_1_0 = DWConv(network, weightMap, *conv21_cv3_0_0_1->getOutput(0), c3, {3, 3}, 1, "model.21.cv3.0.1.0"); nvinfer1::IElementWiseLayer* conv21_cv3_0_1_1 = convBnSiLU(network, weightMap, *conv21_cv3_0_1_0->getOutput(0), c3, {1, 1}, 1, "model.21.cv3.0.1.1"); nvinfer1::IConvolutionLayer* conv21_cv3_0_1_2 = network->addConvolutionNd(*conv21_cv3_0_1_1->getOutput(0), kNumClass, nvinfer1::DimsHW{1, 1}, weightMap["model.21.cv3.0.2.weight"], weightMap["model.21.cv3.0.2.bias"]); conv21_cv3_0_1_2->setPaddingNd(nvinfer1::DimsHW{0, 0}); conv21_cv3_0_1_2->setStrideNd(nvinfer1::DimsHW{1, 1}); nvinfer1::ITensor* inputTensors21_0[] = {conv21_cv2_0_2->getOutput(0), conv21_cv3_0_1_2->getOutput(0)}; nvinfer1::IConcatenationLayer* cat21_0 = network->addConcatenation(inputTensors21_0, 2); // out1 location nvinfer1::IElementWiseLayer* 
conv21_cv2_1_0 = convBnSiLU(network, weightMap, *conv17->getOutput(0), c2, {3, 3}, 1, "model.21.cv2.1.0"); nvinfer1::IElementWiseLayer* conv21_cv2_1_1 = convBnSiLU(network, weightMap, *conv21_cv2_1_0->getOutput(0), c2, {3, 3}, 1, "model.21.cv2.1.1"); nvinfer1::IConvolutionLayer* conv21_cv2_1_2 = network->addConvolutionNd(*conv21_cv2_1_1->getOutput(0), 64, nvinfer1::DimsHW{1, 1}, weightMap["model.21.cv2.1.2.weight"], weightMap["model.21.cv2.1.2.bias"]); conv21_cv2_1_2->setStrideNd(nvinfer1::DimsHW{1, 1}); conv21_cv2_1_2->setPaddingNd(nvinfer1::DimsHW{0, 0}); // out1 classes auto* conv21_cv3_1_0_0 = DWConv(network, weightMap, *conv17->getOutput(0), get_width(512, gw, max_channels), {3, 3}, 1, "model.21.cv3.1.0.0"); nvinfer1::IElementWiseLayer* conv21_cv3_1_0_1 = convBnSiLU(network, weightMap, *conv21_cv3_1_0_0->getOutput(0), c3, {1, 1}, 1, "model.21.cv3.1.0.1"); auto* conv21_cv3_1_1_0 = DWConv(network, weightMap, *conv21_cv3_1_0_1->getOutput(0), c3, {3, 3}, 1, "model.21.cv3.1.1.0"); nvinfer1::IElementWiseLayer* conv21_cv3_1_1_1 = convBnSiLU(network, weightMap, *conv21_cv3_1_1_0->getOutput(0), c3, {1, 1}, 1, "model.21.cv3.1.1.1"); nvinfer1::IConvolutionLayer* conv21_cv3_1_1_2 = network->addConvolutionNd(*conv21_cv3_1_1_1->getOutput(0), kNumClass, nvinfer1::DimsHW{1, 1}, weightMap["model.21.cv3.1.2.weight"], weightMap["model.21.cv3.1.2.bias"]); conv21_cv3_1_1_2->setPaddingNd(nvinfer1::DimsHW{0, 0}); conv21_cv3_1_1_2->setStrideNd(nvinfer1::DimsHW{1, 1}); nvinfer1::ITensor* inputTensors21_1[] = {conv21_cv2_1_2->getOutput(0), conv21_cv3_1_1_2->getOutput(0)}; nvinfer1::IConcatenationLayer* cat21_1 = network->addConcatenation(inputTensors21_1, 2); // out2 location nvinfer1::IElementWiseLayer* conv21_cv2_2_0 = convBnSiLU(network, weightMap, *conv20->getOutput(0), c2, {3, 3}, 1, "model.21.cv2.2.0"); nvinfer1::IElementWiseLayer* conv21_cv2_2_1 = convBnSiLU(network, weightMap, *conv21_cv2_2_0->getOutput(0), c2, {3, 3}, 1, "model.21.cv2.2.1"); nvinfer1::IConvolutionLayer* 
conv21_cv2_2_2 = network->addConvolutionNd(*conv21_cv2_2_1->getOutput(0), 64, nvinfer1::DimsHW{1, 1}, weightMap["model.21.cv2.2.2.weight"], weightMap["model.21.cv2.2.2.bias"]); conv21_cv2_2_2->setStrideNd(nvinfer1::DimsHW{1, 1}); conv21_cv2_2_2->setPaddingNd(nvinfer1::DimsHW{0, 0}); // out2 classes auto* conv21_cv3_2_0_0 = DWConv(network, weightMap, *conv20->getOutput(0), get_width(1024, gw, max_channels), {3, 3}, 1, "model.21.cv3.2.0.0"); nvinfer1::IElementWiseLayer* conv21_cv3_2_0_1 = convBnSiLU(network, weightMap, *conv20->getOutput(0), c3, {1, 1}, 1, "model.21.cv3.2.0.1"); auto* conv21_cv3_2_1_0 = DWConv(network, weightMap, *conv21_cv3_2_0_1->getOutput(0), c3, {3, 3}, 1, "model.21.cv3.2.1.0"); nvinfer1::IElementWiseLayer* conv21_cv3_2_1_1 = convBnSiLU(network, weightMap, *conv21_cv3_2_1_0->getOutput(0), c3, {1, 1}, 1, "model.21.cv3.2.1.1"); nvinfer1::IConvolutionLayer* conv21_cv3_2_1_2 = network->addConvolutionNd(*conv21_cv3_2_1_1->getOutput(0), kNumClass, nvinfer1::DimsHW{1, 1}, weightMap["model.21.cv3.2.2.weight"], weightMap["model.21.cv3.2.2.bias"]); conv21_cv3_2_1_2->setStrideNd(nvinfer1::DimsHW{1, 1}); conv21_cv3_2_1_2->setPaddingNd(nvinfer1::DimsHW{0, 0}); nvinfer1::ITensor* inputTensor21_2[] = {conv21_cv2_2_2->getOutput(0), conv21_cv3_2_1_2->getOutput(0)}; nvinfer1::IConcatenationLayer* cat21_2 = network->addConcatenation(inputTensor21_2, 2); // ============================================ yolov12 detect ========================================= nvinfer1::IElementWiseLayer* conv_layers[] = {conv3, conv5, conv7}; int strides[sizeof(conv_layers) / sizeof(conv_layers[0])]; calculateStrides(conv_layers, sizeof(conv_layers) / sizeof(conv_layers[0]), kInputH, strides); int stridesLength = sizeof(strides) / sizeof(int); nvinfer1::IShuffleLayer* shuffle21_0 = network->addShuffle(*cat21_0->getOutput(0)); shuffle21_0->setReshapeDimensions( nvinfer1::Dims3{kBatchSize, 64 + kNumClass, (kInputH / strides[0]) * (kInputW / strides[0])}); nvinfer1::ISliceLayer* 
split21_0_0 = network->addSlice( *shuffle21_0->getOutput(0), nvinfer1::Dims3{0, 0, 0}, nvinfer1::Dims3{kBatchSize, 64, (kInputH / strides[0]) * (kInputW / strides[0])}, nvinfer1::Dims3{1, 1, 1}); nvinfer1::ISliceLayer* split21_0_1 = network->addSlice(*shuffle21_0->getOutput(0), nvinfer1::Dims3{0, 64, 0}, nvinfer1::Dims3{kBatchSize, kNumClass, (kInputH / strides[0]) * (kInputW / strides[0])}, nvinfer1::Dims3{1, 1, 1}); nvinfer1::IShuffleLayer* dfl21_0 = DFL(network, weightMap, *split21_0_0->getOutput(0), 4, (kInputH / strides[0]) * (kInputW / strides[0]), 1, 1, 0, "model.21.dfl.conv.weight"); nvinfer1::ITensor* inputTensor22_dfl_0[] = {dfl21_0->getOutput(0), split21_0_1->getOutput(0)}; nvinfer1::IConcatenationLayer* cat22_dfl_0 = network->addConcatenation(inputTensor22_dfl_0, 2); cat22_dfl_0->setAxis(1); nvinfer1::IShuffleLayer* shuffle21_1 = network->addShuffle(*cat21_1->getOutput(0)); shuffle21_1->setReshapeDimensions( nvinfer1::Dims3{kBatchSize, 64 + kNumClass, (kInputH / strides[1]) * (kInputW / strides[1])}); nvinfer1::ISliceLayer* split21_1_0 = network->addSlice( *shuffle21_1->getOutput(0), nvinfer1::Dims3{0, 0, 0}, nvinfer1::Dims3{kBatchSize, 64, (kInputH / strides[1]) * (kInputW / strides[1])}, nvinfer1::Dims3{1, 1, 1}); nvinfer1::ISliceLayer* split21_1_1 = network->addSlice(*shuffle21_1->getOutput(0), nvinfer1::Dims3{0, 64, 0}, nvinfer1::Dims3{kBatchSize, kNumClass, (kInputH / strides[1]) * (kInputW / strides[1])}, nvinfer1::Dims3{1, 1, 1}); nvinfer1::IShuffleLayer* dfl21_1 = DFL(network, weightMap, *split21_1_0->getOutput(0), 4, (kInputH / strides[1]) * (kInputW / strides[1]), 1, 1, 0, "model.21.dfl.conv.weight"); nvinfer1::ITensor* inputTensor22_dfl_1[] = {dfl21_1->getOutput(0), split21_1_1->getOutput(0)}; nvinfer1::IConcatenationLayer* cat22_dfl_1 = network->addConcatenation(inputTensor22_dfl_1, 2); cat22_dfl_1->setAxis(1); nvinfer1::IShuffleLayer* shuffle21_2 = network->addShuffle(*cat21_2->getOutput(0)); shuffle21_2->setReshapeDimensions( 
nvinfer1::Dims3{kBatchSize, 64 + kNumClass, (kInputH / strides[2]) * (kInputW / strides[2])}); nvinfer1::ISliceLayer* split21_2_0 = network->addSlice( *shuffle21_2->getOutput(0), nvinfer1::Dims3{0, 0, 0}, nvinfer1::Dims3{kBatchSize, 64, (kInputH / strides[2]) * (kInputW / strides[2])}, nvinfer1::Dims3{1, 1, 1}); nvinfer1::ISliceLayer* split21_2_1 = network->addSlice(*shuffle21_2->getOutput(0), nvinfer1::Dims3{0, 64, 0}, nvinfer1::Dims3{kBatchSize, kNumClass, (kInputH / strides[2]) * (kInputW / strides[2])}, nvinfer1::Dims3{1, 1, 1}); nvinfer1::IShuffleLayer* dfl21_2 = DFL(network, weightMap, *split21_2_0->getOutput(0), 4, (kInputH / strides[2]) * (kInputW / strides[2]), 1, 1, 0, "model.21.dfl.conv.weight"); nvinfer1::ITensor* inputTensor22_dfl_2[] = {dfl21_2->getOutput(0), split21_2_1->getOutput(0)}; nvinfer1::IConcatenationLayer* cat22_dfl_2 = network->addConcatenation(inputTensor22_dfl_2, 2); cat22_dfl_2->setAxis(1); nvinfer1::IPluginV2Layer* yolo = addYoLoLayer(network, std::vector{cat22_dfl_0, cat22_dfl_1, cat22_dfl_2}, strides, stridesLength, true, false, false); yolo->getOutput(0)->setName(kOutputTensorName); network->markOutput(*yolo->getOutput(0)); config->setMemoryPoolLimit(nvinfer1::MemoryPoolType::kWORKSPACE, 64 * (1 << 20)); #if defined(USE_FP16) config->setFlag(nvinfer1::BuilderFlag::kFP16); #elif defined(USE_INT8) std::cout << "Your platform support int8: " << (builder->platformHasFastInt8() ? "true" : "false") << std::endl; assert(builder->platformHasFastInt8()); config->setFlag(nvinfer1::BuilderFlag::kINT8); auto* calibrator = new Int8EntropyCalibrator2(kBatchSize, kInputW, kInputH, kInputQuantizationFolder, "int8calib.table", kInputTensorName); config->setInt8Calibrator(calibrator); #endif std::cout << "Building engine, please wait for a while..." << std::endl; nvinfer1::IHostMemory* serialized_model = builder->buildSerializedNetwork(*network, *config); std::cout << "Build engine successfully!" 
<< std::endl; delete network; for (auto& mem : weightMap) { free((void*)(mem.second.values)); } return serialized_model; } nvinfer1::IHostMemory* buildEngineYolov12Seg(nvinfer1::IBuilder* builder, nvinfer1::IBuilderConfig* config, nvinfer1::DataType dt, const std::string& wts_path, float& gd, float& gw, int& max_channels, std::string& type) { std::map weightMap = loadWeights(wts_path); nvinfer1::INetworkDefinition* network = builder->createNetworkV2( 1U << static_cast(nvinfer1::NetworkDefinitionCreationFlag::kEXPLICIT_BATCH)); // ===================== input =================================================== nvinfer1::ITensor* data = network->addInput(kInputTensorName, dt, nvinfer1::Dims4{kBatchSize, 3, kInputH, kInputW}); assert(data); // ===================== backbone =================================================== nvinfer1::IElementWiseLayer* conv0 = convBnSiLU(network, weightMap, *data, get_width(64, gw, max_channels), {3, 3}, 2, "model.0"); nvinfer1::IElementWiseLayer* conv1 = convBnSiLU(network, weightMap, *conv0->getOutput(0), get_width(128, gw, max_channels), {3, 3}, 2, "model.1", 1, 2); bool c3k2 = false; if (type == "m" || type == "l" || type == "x") { c3k2 = true; } float mlp_ratio = 2.0; bool residual = true; if (type == "l" || type == "x") { mlp_ratio = 1; // see the yolov12-seg/ultralytics/nn/tasks.py/parse_model() // residual = true; } nvinfer1::IElementWiseLayer* conv2 = C3K2(network, weightMap, *conv1->getOutput(0), get_width(256, gw, max_channels), get_depth(2, gd), "model.2", c3k2, 0.25); nvinfer1::IElementWiseLayer* conv3 = convBnSiLU(network, weightMap, *conv2->getOutput(0), get_width(256, gw, max_channels), {3, 3}, 2, "model.3", 1, 4); nvinfer1::IElementWiseLayer* conv4 = C3K2(network, weightMap, *conv3->getOutput(0), get_width(512, gw, max_channels), get_depth(2, gd), "model.4", c3k2, 0.25); nvinfer1::IElementWiseLayer* conv5 = convBnSiLU(network, weightMap, *conv4->getOutput(0), get_width(512, gw, max_channels), {3, 3}, 2, "model.5"); 
nvinfer1::ILayer* conv6 = A2C2f(network, weightMap, *conv5->getOutput(0), get_width(512, gw, max_channels), get_depth(4, gd), "model.6", true, 4, residual, mlp_ratio); nvinfer1::IElementWiseLayer* conv7 = convBnSiLU(network, weightMap, *conv6->getOutput(0), get_width(1024, gw, max_channels), {3, 3}, 2, "model.7"); nvinfer1::ILayer* conv8 = A2C2f(network, weightMap, *conv7->getOutput(0), get_width(1024, gw, max_channels), get_depth(4, gd), "model.8", true, 1, residual, mlp_ratio); // ========================= neck ==================================================================== float scale[] = {1.0, 1.0, 2.0, 2.0}; nvinfer1::IResizeLayer* upsample9 = network->addResize(*conv8->getOutput(0)); upsample9->setResizeMode(nvinfer1::ResizeMode::kNEAREST); upsample9->setScales(scale, 4); nvinfer1::ITensor* inputTensors10[] = {upsample9->getOutput(0), conv6->getOutput(0)}; nvinfer1::IConcatenationLayer* cat10 = network->addConcatenation(inputTensors10, 2); nvinfer1::ILayer* conv11 = A2C2f(network, weightMap, *cat10->getOutput(0), get_width(512, gw, max_channels), get_depth(2, gd), "model.11", false, -1, residual, mlp_ratio); nvinfer1::IResizeLayer* upsample12 = network->addResize(*conv11->getOutput(0)); upsample12->setResizeMode(nvinfer1::ResizeMode::kNEAREST); upsample12->setScales(scale, 4); nvinfer1::ITensor* inputTensors13[] = {upsample12->getOutput(0), conv4->getOutput(0)}; nvinfer1::IConcatenationLayer* cat13 = network->addConcatenation(inputTensors13, 2); nvinfer1::ILayer* conv14 = A2C2f(network, weightMap, *cat13->getOutput(0), get_width(256, gw, max_channels), get_depth(2, gd), "model.14", false, -1, residual, mlp_ratio); nvinfer1::IElementWiseLayer* conv15 = convBnSiLU(network, weightMap, *conv14->getOutput(0), get_width(256, gw, max_channels), {3, 3}, 2, "model.15"); nvinfer1::ITensor* inputTensors16[] = {conv15->getOutput(0), conv11->getOutput(0)}; nvinfer1::IConcatenationLayer* cat16 = network->addConcatenation(inputTensors16, 2); nvinfer1::ILayer* conv17 = 
A2C2f(network, weightMap, *cat16->getOutput(0), get_width(512, gw, max_channels), get_depth(2, gd), "model.17", false, -1, residual, mlp_ratio); nvinfer1::IElementWiseLayer* conv18 = convBnSiLU(network, weightMap, *conv17->getOutput(0), get_width(512, gw, max_channels), {3, 3}, 2, "model.18"); nvinfer1::ITensor* inputTensors19[] = {conv18->getOutput(0), conv8->getOutput(0)}; nvinfer1::IConcatenationLayer* cat19 = network->addConcatenation(inputTensors19, 2); nvinfer1::IElementWiseLayer* conv20 = C3K2(network, weightMap, *cat19->getOutput(0), get_width(1024, gw, max_channels), get_depth(2, gd), "model.20", true); // =============================== output =================================================================== int c2 = std::max(std::max(16, get_width(256, gw, max_channels) / 4), 16 * 4); int c3 = std::max(get_width(256, gw, max_channels), std::min(kNumClass, 100)); // output0 location nvinfer1::IElementWiseLayer* conv21_cv2_0_0 = convBnSiLU(network, weightMap, *conv14->getOutput(0), c2, {3, 3}, 1, "model.21.cv2.0.0"); nvinfer1::IElementWiseLayer* conv21_cv2_0_1 = convBnSiLU(network, weightMap, *conv21_cv2_0_0->getOutput(0), c2, {3, 3}, 1, "model.21.cv2.0.1"); nvinfer1::IConvolutionLayer* conv21_cv2_0_2 = network->addConvolutionNd(*conv21_cv2_0_1->getOutput(0), 64, nvinfer1::DimsHW{1, 1}, weightMap["model.21.cv2.0.2.weight"], weightMap["model.21.cv2.0.2.bias"]); conv21_cv2_0_2->setStrideNd(nvinfer1::DimsHW{1, 1}); conv21_cv2_0_2->setPaddingNd(nvinfer1::DimsHW{0, 0}); // output0 classes auto* conv21_cv3_0_0_0 = DWConv(network, weightMap, *conv14->getOutput(0), get_width(256, gw, max_channels), {3, 3}, 1, "model.21.cv3.0.0.0"); nvinfer1::IElementWiseLayer* conv21_cv3_0_0_1 = convBnSiLU(network, weightMap, *conv21_cv3_0_0_0->getOutput(0), c3, {1, 1}, 1, "model.21.cv3.0.0.1"); auto* conv21_cv3_0_1_0 = DWConv(network, weightMap, *conv21_cv3_0_0_1->getOutput(0), c3, {3, 3}, 1, "model.21.cv3.0.1.0"); nvinfer1::IElementWiseLayer* conv21_cv3_0_1_1 = 
convBnSiLU(network, weightMap, *conv21_cv3_0_1_0->getOutput(0), c3, {1, 1}, 1, "model.21.cv3.0.1.1"); nvinfer1::IConvolutionLayer* conv21_cv3_0_1_2 = network->addConvolutionNd(*conv21_cv3_0_1_1->getOutput(0), kNumClass, nvinfer1::DimsHW{1, 1}, weightMap["model.21.cv3.0.2.weight"], weightMap["model.21.cv3.0.2.bias"]); conv21_cv3_0_1_2->setPaddingNd(nvinfer1::DimsHW{0, 0}); conv21_cv3_0_1_2->setStrideNd(nvinfer1::DimsHW{1, 1}); nvinfer1::ITensor* inputTensors21_0[] = {conv21_cv2_0_2->getOutput(0), conv21_cv3_0_1_2->getOutput(0)}; nvinfer1::IConcatenationLayer* cat21_0 = network->addConcatenation(inputTensors21_0, 2); // out1 location nvinfer1::IElementWiseLayer* conv21_cv2_1_0 = convBnSiLU(network, weightMap, *conv17->getOutput(0), c2, {3, 3}, 1, "model.21.cv2.1.0"); nvinfer1::IElementWiseLayer* conv21_cv2_1_1 = convBnSiLU(network, weightMap, *conv21_cv2_1_0->getOutput(0), c2, {3, 3}, 1, "model.21.cv2.1.1"); nvinfer1::IConvolutionLayer* conv21_cv2_1_2 = network->addConvolutionNd(*conv21_cv2_1_1->getOutput(0), 64, nvinfer1::DimsHW{1, 1}, weightMap["model.21.cv2.1.2.weight"], weightMap["model.21.cv2.1.2.bias"]); conv21_cv2_1_2->setStrideNd(nvinfer1::DimsHW{1, 1}); conv21_cv2_1_2->setPaddingNd(nvinfer1::DimsHW{0, 0}); // out1 classes auto* conv21_cv3_1_0_0 = DWConv(network, weightMap, *conv17->getOutput(0), get_width(512, gw, max_channels), {3, 3}, 1, "model.21.cv3.1.0.0"); nvinfer1::IElementWiseLayer* conv21_cv3_1_0_1 = convBnSiLU(network, weightMap, *conv21_cv3_1_0_0->getOutput(0), c3, {1, 1}, 1, "model.21.cv3.1.0.1"); auto* conv21_cv3_1_1_0 = DWConv(network, weightMap, *conv21_cv3_1_0_1->getOutput(0), c3, {3, 3}, 1, "model.21.cv3.1.1.0"); nvinfer1::IElementWiseLayer* conv21_cv3_1_1_1 = convBnSiLU(network, weightMap, *conv21_cv3_1_1_0->getOutput(0), c3, {1, 1}, 1, "model.21.cv3.1.1.1"); nvinfer1::IConvolutionLayer* conv21_cv3_1_1_2 = network->addConvolutionNd(*conv21_cv3_1_1_1->getOutput(0), kNumClass, nvinfer1::DimsHW{1, 1}, weightMap["model.21.cv3.1.2.weight"], 
weightMap["model.21.cv3.1.2.bias"]); conv21_cv3_1_1_2->setPaddingNd(nvinfer1::DimsHW{0, 0}); conv21_cv3_1_1_2->setStrideNd(nvinfer1::DimsHW{1, 1}); nvinfer1::ITensor* inputTensors21_1[] = {conv21_cv2_1_2->getOutput(0), conv21_cv3_1_1_2->getOutput(0)}; nvinfer1::IConcatenationLayer* cat21_1 = network->addConcatenation(inputTensors21_1, 2); // out2 location nvinfer1::IElementWiseLayer* conv21_cv2_2_0 = convBnSiLU(network, weightMap, *conv20->getOutput(0), c2, {3, 3}, 1, "model.21.cv2.2.0"); nvinfer1::IElementWiseLayer* conv21_cv2_2_1 = convBnSiLU(network, weightMap, *conv21_cv2_2_0->getOutput(0), c2, {3, 3}, 1, "model.21.cv2.2.1"); nvinfer1::IConvolutionLayer* conv21_cv2_2_2 = network->addConvolutionNd(*conv21_cv2_2_1->getOutput(0), 64, nvinfer1::DimsHW{1, 1}, weightMap["model.21.cv2.2.2.weight"], weightMap["model.21.cv2.2.2.bias"]); conv21_cv2_2_2->setStrideNd(nvinfer1::DimsHW{1, 1}); conv21_cv2_2_2->setPaddingNd(nvinfer1::DimsHW{0, 0}); // out2 classes auto* conv21_cv3_2_0_0 = DWConv(network, weightMap, *conv20->getOutput(0), get_width(1024, gw, max_channels), {3, 3}, 1, "model.21.cv3.2.0.0"); nvinfer1::IElementWiseLayer* conv21_cv3_2_0_1 = convBnSiLU(network, weightMap, *conv21_cv3_2_0_0->getOutput(0), c3, {1, 1}, 1, "model.21.cv3.2.0.1"); auto* conv21_cv3_2_1_0 = DWConv(network, weightMap, *conv21_cv3_2_0_1->getOutput(0), c3, {3, 3}, 1, "model.21.cv3.2.1.0"); nvinfer1::IElementWiseLayer* conv21_cv3_2_1_1 = convBnSiLU(network, weightMap, *conv21_cv3_2_1_0->getOutput(0), c3, {1, 1}, 1, "model.21.cv3.2.1.1"); nvinfer1::IConvolutionLayer* conv21_cv3_2_1_2 = network->addConvolutionNd(*conv21_cv3_2_1_1->getOutput(0), kNumClass, nvinfer1::DimsHW{1, 1}, weightMap["model.21.cv3.2.2.weight"], weightMap["model.21.cv3.2.2.bias"]); conv21_cv3_2_1_2->setStrideNd(nvinfer1::DimsHW{1, 1}); conv21_cv3_2_1_2->setPaddingNd(nvinfer1::DimsHW{0, 0}); nvinfer1::ITensor* inputTensor21_2[] = {conv21_cv2_2_2->getOutput(0), conv21_cv3_2_1_2->getOutput(0)}; nvinfer1::IConcatenationLayer* 
cat21_2 = network->addConcatenation(inputTensor21_2, 2); // ============================================ yolov12 detect ========================================= nvinfer1::IElementWiseLayer* conv_layers[] = {conv3, conv5, conv7}; int strides[sizeof(conv_layers) / sizeof(conv_layers[0])]; calculateStrides(conv_layers, sizeof(conv_layers) / sizeof(conv_layers[0]), kInputH, strides); int stridesLength = sizeof(strides) / sizeof(int); nvinfer1::IShuffleLayer* shuffle21_0 = network->addShuffle(*cat21_0->getOutput(0)); shuffle21_0->setReshapeDimensions( nvinfer1::Dims3{kBatchSize, 64 + kNumClass, (kInputH / strides[0]) * (kInputW / strides[0])}); nvinfer1::ISliceLayer* split21_0_0 = network->addSlice( *shuffle21_0->getOutput(0), nvinfer1::Dims3{0, 0, 0}, nvinfer1::Dims3{kBatchSize, 64, (kInputH / strides[0]) * (kInputW / strides[0])}, nvinfer1::Dims3{1, 1, 1}); nvinfer1::ISliceLayer* split21_0_1 = network->addSlice(*shuffle21_0->getOutput(0), nvinfer1::Dims3{0, 64, 0}, nvinfer1::Dims3{kBatchSize, kNumClass, (kInputH / strides[0]) * (kInputW / strides[0])}, nvinfer1::Dims3{1, 1, 1}); nvinfer1::IShuffleLayer* dfl21_0 = DFL(network, weightMap, *split21_0_0->getOutput(0), 4, (kInputH / strides[0]) * (kInputW / strides[0]), 1, 1, 0, "model.21.dfl.conv.weight"); auto proto_coef_0 = cv4_conv_combined(network, weightMap, *conv14->getOutput(0), "model.21.cv4.0", (kInputH / strides[0]) * (kInputW / strides[0]), gw, "seg", max_channels); nvinfer1::ITensor* inputTensor22_dfl_0[] = {dfl21_0->getOutput(0), split21_0_1->getOutput(0), proto_coef_0->getOutput(0)}; nvinfer1::IConcatenationLayer* cat22_dfl_0 = network->addConcatenation(inputTensor22_dfl_0, 3); cat22_dfl_0->setAxis(1); nvinfer1::IShuffleLayer* shuffle21_1 = network->addShuffle(*cat21_1->getOutput(0)); shuffle21_1->setReshapeDimensions( nvinfer1::Dims3{kBatchSize, 64 + kNumClass, (kInputH / strides[1]) * (kInputW / strides[1])}); nvinfer1::ISliceLayer* split21_1_0 = network->addSlice( *shuffle21_1->getOutput(0), 
nvinfer1::Dims3{0, 0, 0}, nvinfer1::Dims3{kBatchSize, 64, (kInputH / strides[1]) * (kInputW / strides[1])}, nvinfer1::Dims3{1, 1, 1}); nvinfer1::ISliceLayer* split21_1_1 = network->addSlice(*shuffle21_1->getOutput(0), nvinfer1::Dims3{0, 64, 0}, nvinfer1::Dims3{kBatchSize, kNumClass, (kInputH / strides[1]) * (kInputW / strides[1])}, nvinfer1::Dims3{1, 1, 1}); nvinfer1::IShuffleLayer* dfl21_1 = DFL(network, weightMap, *split21_1_0->getOutput(0), 4, (kInputH / strides[1]) * (kInputW / strides[1]), 1, 1, 0, "model.21.dfl.conv.weight"); auto proto_coef_1 = cv4_conv_combined(network, weightMap, *conv17->getOutput(0), "model.21.cv4.1", (kInputH / strides[1]) * (kInputW / strides[1]), gw, "seg", max_channels); nvinfer1::ITensor* inputTensor22_dfl_1[] = {dfl21_1->getOutput(0), split21_1_1->getOutput(0), proto_coef_1->getOutput(0)}; nvinfer1::IConcatenationLayer* cat22_dfl_1 = network->addConcatenation(inputTensor22_dfl_1, 3); cat22_dfl_1->setAxis(1); nvinfer1::IShuffleLayer* shuffle21_2 = network->addShuffle(*cat21_2->getOutput(0)); shuffle21_2->setReshapeDimensions( nvinfer1::Dims3{kBatchSize, 64 + kNumClass, (kInputH / strides[2]) * (kInputW / strides[2])}); nvinfer1::ISliceLayer* split21_2_0 = network->addSlice( *shuffle21_2->getOutput(0), nvinfer1::Dims3{0, 0, 0}, nvinfer1::Dims3{kBatchSize, 64, (kInputH / strides[2]) * (kInputW / strides[2])}, nvinfer1::Dims3{1, 1, 1}); nvinfer1::ISliceLayer* split21_2_1 = network->addSlice(*shuffle21_2->getOutput(0), nvinfer1::Dims3{0, 64, 0}, nvinfer1::Dims3{kBatchSize, kNumClass, (kInputH / strides[2]) * (kInputW / strides[2])}, nvinfer1::Dims3{1, 1, 1}); nvinfer1::IShuffleLayer* dfl21_2 = DFL(network, weightMap, *split21_2_0->getOutput(0), 4, (kInputH / strides[2]) * (kInputW / strides[2]), 1, 1, 0, "model.21.dfl.conv.weight"); auto proto_coef_2 = cv4_conv_combined(network, weightMap, *conv20->getOutput(0), "model.21.cv4.2", (kInputH / strides[2]) * (kInputW / strides[2]), gw, "seg", max_channels); nvinfer1::ITensor* 
inputTensor22_dfl_2[] = {dfl21_2->getOutput(0), split21_2_1->getOutput(0), proto_coef_2->getOutput(0)}; nvinfer1::IConcatenationLayer* cat22_dfl_2 = network->addConcatenation(inputTensor22_dfl_2, 3); cat22_dfl_2->setAxis(1); nvinfer1::IPluginV2Layer* yolo = addYoLoLayer(network, std::vector{cat22_dfl_0, cat22_dfl_1, cat22_dfl_2}, strides, stridesLength, true, false, false); yolo->getOutput(0)->setName(kOutputTensorName); network->markOutput(*yolo->getOutput(0)); auto proto = Proto(network, weightMap, *conv14->getOutput(0), "model.21.proto", gw, max_channels); proto->getOutput(0)->setName(kProtoTensorName); network->markOutput(*proto->getOutput(0)); config->setMemoryPoolLimit(nvinfer1::MemoryPoolType::kWORKSPACE, 16 * (1 << 20)); #if defined(USE_FP16) config->setFlag(nvinfer1::BuilderFlag::kFP16); #elif defined(USE_INT8) std::cout << "Your platform support int8: " << (builder->platformHasFastInt8() ? "true" : "false") << std::endl; assert(builder->platformHasFastInt8()); config->setFlag(nvinfer1::BuilderFlag::kINT8); auto* calibrator = new Int8EntropyCalibrator2(kBatchSize, kInputW, kInputH, kInputQuantizationFolder, "int8calib.table", kInputTensorName); config->setInt8Calibrator(calibrator); #endif std::cout << "Building engine, please wait for a while..." << std::endl; nvinfer1::IHostMemory* serialized_model = builder->buildSerializedNetwork(*network, *config); std::cout << "Build engine successfully!" 
<< std::endl; delete network; for (auto& mem : weightMap) { free((void*)(mem.second.values)); } return serialized_model; } ================================================ FILE: yolov12-tubro/src/postprocess.cpp ================================================ #include "postprocess.h" #include "utils.h" cv::Rect get_rect(cv::Mat& img, float bbox[4]) { float l, r, t, b; float r_w = kInputW / (img.cols * 1.0); float r_h = kInputH / (img.rows * 1.0); if (r_h > r_w) { l = bbox[0]; r = bbox[2]; t = bbox[1] - (kInputH - r_w * img.rows) / 2; b = bbox[3] - (kInputH - r_w * img.rows) / 2; l = l / r_w; r = r / r_w; t = t / r_w; b = b / r_w; } else { l = bbox[0] - (kInputW - r_h * img.cols) / 2; r = bbox[2] - (kInputW - r_h * img.cols) / 2; t = bbox[1]; b = bbox[3]; l = l / r_h; r = r / r_h; t = t / r_h; b = b / r_h; } l = std::max(0.0f, l); t = std::max(0.0f, t); int width = std::max(0, std::min(int(round(r - l)), img.cols - int(round(l)))); int height = std::max(0, std::min(int(round(b - t)), img.rows - int(round(t)))); return cv::Rect(int(round(l)), int(round(t)), width, height); } cv::Rect get_rect_obb(cv::Mat& img, float bbox[4]) { float l, r, t, b; float r_w = kObbInputW / (img.cols * 1.0); float r_h = kObbInputH / (img.rows * 1.0); if (r_h > r_w) { l = bbox[0]; r = bbox[2]; t = bbox[1] - (kObbInputH - r_w * img.rows) / 2; b = bbox[3] - (kObbInputH - r_w * img.rows) / 2; l = l / r_w; r = r / r_w; t = t / r_w; b = b / r_w; } else { l = bbox[0] - (kObbInputW - r_h * img.cols) / 2; r = bbox[2] - (kObbInputW - r_h * img.cols) / 2; t = bbox[1]; b = bbox[3]; l = l / r_h; r = r / r_h; t = t / r_h; b = b / r_h; } l = std::max(0.0f, l); t = std::max(0.0f, t); int width = std::max(0, std::min(int(round(r - l)), img.cols - int(round(l)))); int height = std::max(0, std::min(int(round(b - t)), img.rows - int(round(t)))); return cv::Rect(int(round(l)), int(round(t)), width, height); } cv::Rect get_rect_adapt_landmark(cv::Mat& img, float bbox[4], float lmk[kNumberOfPoints * 3]) { 
float l, r, t, b; float r_w = kInputW / (img.cols * 1.0); float r_h = kInputH / (img.rows * 1.0); if (r_h > r_w) { l = bbox[0] / r_w; r = bbox[2] / r_w; t = (bbox[1] - (kInputH - r_w * img.rows) / 2) / r_w; b = (bbox[3] - (kInputH - r_w * img.rows) / 2) / r_w; for (int i = 0; i < kNumberOfPoints * 3; i += 3) { lmk[i] /= r_w; lmk[i + 1] = (lmk[i + 1] - (kInputH - r_w * img.rows) / 2) / r_w; // lmk[i + 2] } } else { l = (bbox[0] - (kInputW - r_h * img.cols) / 2) / r_h; r = (bbox[2] - (kInputW - r_h * img.cols) / 2) / r_h; t = bbox[1] / r_h; b = bbox[3] / r_h; for (int i = 0; i < kNumberOfPoints * 3; i += 3) { lmk[i] = (lmk[i] - (kInputW - r_h * img.cols) / 2) / r_h; lmk[i + 1] /= r_h; // lmk[i + 2] } } l = std::max(0.0f, l); t = std::max(0.0f, t); int width = std::max(0, std::min(int(round(r - l)), img.cols - int(round(l)))); int height = std::max(0, std::min(int(round(b - t)), img.rows - int(round(t)))); return cv::Rect(int(round(l)), int(round(t)), width, height); } static float iou(float lbox[4], float rbox[4]) { float interBox[] = { (std::max)(lbox[0], rbox[0]), (std::min)(lbox[2], rbox[2]), (std::max)(lbox[1], rbox[1]), (std::min)(lbox[3], rbox[3]), }; if (interBox[2] > interBox[3] || interBox[0] > interBox[1]) return 0.0f; float interBoxS = (interBox[1] - interBox[0]) * (interBox[3] - interBox[2]); float unionBoxS = (lbox[2] - lbox[0]) * (lbox[3] - lbox[1]) + (rbox[2] - rbox[0]) * (rbox[3] - rbox[1]) - interBoxS; return interBoxS / unionBoxS; } static bool cmp(const Detection& a, const Detection& b) { if (a.conf == b.conf) { return a.bbox[0] < b.bbox[0]; } return a.conf > b.conf; } void nms(std::vector& res, float* output, float conf_thresh, float nms_thresh) { int det_size = sizeof(Detection) / sizeof(float); std::map> m; for (int i = 0; i < output[0]; i++) { if (output[1 + det_size * i + 4] <= conf_thresh || isnan(output[1 + det_size * i + 4])) continue; Detection det; memcpy(&det, &output[1 + det_size * i], det_size * sizeof(float)); if 
(m.count(det.class_id) == 0) m.emplace(det.class_id, std::vector()); m[det.class_id].push_back(det); } for (auto it = m.begin(); it != m.end(); it++) { auto& dets = it->second; std::sort(dets.begin(), dets.end(), cmp); for (size_t m = 0; m < dets.size(); ++m) { auto& item = dets[m]; res.push_back(item); for (size_t n = m + 1; n < dets.size(); ++n) { if (iou(item.bbox, dets[n].bbox) > nms_thresh) { dets.erase(dets.begin() + n); --n; } } } } } void batch_nms(std::vector>& res_batch, float* output, int batch_size, int output_size, float conf_thresh, float nms_thresh) { res_batch.resize(batch_size); for (int i = 0; i < batch_size; i++) { nms(res_batch[i], &output[i * output_size], conf_thresh, nms_thresh); } } void process_decode_ptr_host(std::vector& res, const float* decode_ptr_host, int bbox_element, cv::Mat& img, int count) { Detection det; for (int i = 0; i < count; i++) { int basic_pos = 1 + i * bbox_element; int keep_flag = decode_ptr_host[basic_pos + 6]; if (keep_flag == 1) { det.bbox[0] = decode_ptr_host[basic_pos + 0]; det.bbox[1] = decode_ptr_host[basic_pos + 1]; det.bbox[2] = decode_ptr_host[basic_pos + 2]; det.bbox[3] = decode_ptr_host[basic_pos + 3]; det.conf = decode_ptr_host[basic_pos + 4]; det.class_id = decode_ptr_host[basic_pos + 5]; res.push_back(det); } } } void batch_process(std::vector>& res_batch, const float* decode_ptr_host, int batch_size, int bbox_element, const std::vector& img_batch) { res_batch.resize(batch_size); int count = static_cast(*decode_ptr_host); count = std::min(count, kMaxNumOutputBbox); for (int i = 0; i < batch_size; i++) { auto& img = const_cast(img_batch[i]); process_decode_ptr_host(res_batch[i], &decode_ptr_host[i * count], bbox_element, img, count); } } void draw_bbox(std::vector& img_batch, std::vector>& res_batch) { for (size_t i = 0; i < img_batch.size(); i++) { auto& res = res_batch[i]; cv::Mat img = img_batch[i]; for (size_t j = 0; j < res.size(); j++) { cv::Rect r = get_rect(img, res[j].bbox); cv::rectangle(img, r, 
cv::Scalar(0x27, 0xC1, 0x36), 2); cv::putText(img, std::to_string((int)res[j].class_id), cv::Point(r.x, r.y - 1), cv::FONT_HERSHEY_PLAIN, 1.2, cv::Scalar(0xFF, 0xFF, 0xFF), 2); } } } void draw_bbox_keypoints_line(std::vector& img_batch, std::vector>& res_batch) { const std::vector> skeleton_pairs = { {0, 1}, {0, 2}, {0, 5}, {0, 6}, {1, 2}, {1, 3}, {2, 4}, {5, 6}, {5, 7}, {5, 11}, {6, 8}, {6, 12}, {7, 9}, {8, 10}, {11, 12}, {11, 13}, {12, 14}, {13, 15}, {14, 16}}; for (size_t i = 0; i < img_batch.size(); i++) { auto& res = res_batch[i]; cv::Mat img = img_batch[i]; for (size_t j = 0; j < res.size(); j++) { cv::Rect r = get_rect_adapt_landmark(img, res[j].bbox, res[j].keypoints); cv::rectangle(img, r, cv::Scalar(0x27, 0xC1, 0x36), 2); cv::putText(img, std::to_string((int)res[j].class_id), cv::Point(r.x, r.y - 1), cv::FONT_HERSHEY_PLAIN, 1.2, cv::Scalar(0xFF, 0xFF, 0xFF), 2); for (int k = 0; k < kNumberOfPoints * 3; k += 3) { if (res[j].keypoints[k + 2] > 0.5) { cv::circle(img, cv::Point((int)res[j].keypoints[k], (int)res[j].keypoints[k + 1]), 3, cv::Scalar(0, 0x27, 0xC1), -1); } } for (const auto& bone : skeleton_pairs) { int kp1_idx = bone.first * 3; int kp2_idx = bone.second * 3; if (res[j].keypoints[kp1_idx + 2] > 0.5 && res[j].keypoints[kp2_idx + 2] > 0.5) { cv::Point p1((int)res[j].keypoints[kp1_idx], (int)res[j].keypoints[kp1_idx + 1]); cv::Point p2((int)res[j].keypoints[kp2_idx], (int)res[j].keypoints[kp2_idx + 1]); cv::line(img, p1, p2, cv::Scalar(0, 0x27, 0xC1), 2); } } } } } cv::Mat scale_mask(cv::Mat mask, cv::Mat img) { int x, y, w, h; float r_w = kInputW / (img.cols * 1.0); float r_h = kInputH / (img.rows * 1.0); if (r_h > r_w) { w = kInputW; h = r_w * img.rows; x = 0; y = (kInputH - h) / 2; } else { w = r_h * img.cols; h = kInputH; x = (kInputW - w) / 2; y = 0; } cv::Rect r(x, y, w, h); cv::Mat res; cv::resize(mask(r), res, img.size()); return res; } void draw_mask_bbox(cv::Mat& img, std::vector& dets, std::vector& masks, std::unordered_map& labels_map) 
{ static std::vector colors = {0xFF3838, 0xFF9D97, 0xFF701F, 0xFFB21D, 0xCFD231, 0x48F90A, 0x92CC17, 0x3DDB86, 0x1A9334, 0x00D4BB, 0x2C99A8, 0x00C2FF, 0x344593, 0x6473FF, 0x0018EC, 0x8438FF, 0x520085, 0xCB38FF, 0xFF95C8, 0xFF37C7}; for (size_t i = 0; i < dets.size(); i++) { cv::Mat img_mask = scale_mask(masks[i], img); auto color = colors[(int)dets[i].class_id % colors.size()]; auto bgr = cv::Scalar(color & 0xFF, color >> 8 & 0xFF, color >> 16 & 0xFF); cv::Rect r = get_rect(img, dets[i].bbox); for (int x = r.x; x < r.x + r.width; x++) { for (int y = r.y; y < r.y + r.height; y++) { float val = img_mask.at(y, x); if (val <= 0.5) continue; img.at(y, x)[0] = img.at(y, x)[0] / 2 + bgr[0] / 2; img.at(y, x)[1] = img.at(y, x)[1] / 2 + bgr[1] / 2; img.at(y, x)[2] = img.at(y, x)[2] / 2 + bgr[2] / 2; } } cv::rectangle(img, r, bgr, 2); // Get the size of the text cv::Size textSize = cv::getTextSize(labels_map[(int)dets[i].class_id] + " " + to_string_with_precision(dets[i].conf), cv::FONT_HERSHEY_PLAIN, 1.2, 2, NULL); // Set the top left corner of the rectangle cv::Point topLeft(r.x, r.y - textSize.height); // Set the bottom right corner of the rectangle cv::Point bottomRight(r.x + textSize.width, r.y + textSize.height); // Set the thickness of the rectangle lines int lineThickness = 2; // Draw the rectangle on the image cv::rectangle(img, topLeft, bottomRight, bgr, -1); cv::putText(img, labels_map[(int)dets[i].class_id] + " " + to_string_with_precision(dets[i].conf), cv::Point(r.x, r.y + 4), cv::FONT_HERSHEY_PLAIN, 1.2, cv::Scalar::all(0xFF), 2); } } void process_decode_ptr_host_obb(std::vector& res, const float* decode_ptr_host, int bbox_element, cv::Mat& img, int count) { Detection det; for (int i = 0; i < count; i++) { int basic_pos = 1 + i * bbox_element; int keep_flag = decode_ptr_host[basic_pos + 6]; if (keep_flag == 1) { det.bbox[0] = decode_ptr_host[basic_pos + 0]; det.bbox[1] = decode_ptr_host[basic_pos + 1]; det.bbox[2] = decode_ptr_host[basic_pos + 2]; det.bbox[3] = 
decode_ptr_host[basic_pos + 3]; det.conf = decode_ptr_host[basic_pos + 4]; det.class_id = decode_ptr_host[basic_pos + 5]; det.angle = decode_ptr_host[basic_pos + 7]; res.push_back(det); } } } void batch_process_obb(std::vector>& res_batch, const float* decode_ptr_host, int batch_size, int bbox_element, const std::vector& img_batch) { res_batch.resize(batch_size); int count = static_cast(*decode_ptr_host); count = std::min(count, kMaxNumOutputBbox); for (int i = 0; i < batch_size; i++) { auto& img = const_cast(img_batch[i]); process_decode_ptr_host_obb(res_batch[i], &decode_ptr_host[i * count], bbox_element, img, count); } } std::tuple convariance_matrix(Detection res) { float w = res.bbox[2]; float h = res.bbox[3]; float a = w * w / 12.0; float b = h * h / 12.0; float c = res.angle; float cos_r = std::cos(c); float sin_r = std::sin(c); float cos_r2 = cos_r * cos_r; float sin_r2 = sin_r * sin_r; float a_val = a * cos_r2 + b * sin_r2; float b_val = a * sin_r2 + b * cos_r2; float c_val = (a - b) * cos_r * sin_r; return std::make_tuple(a_val, b_val, c_val); } static float probiou(const Detection& res1, const Detection& res2, float eps = 1e-7) { // Calculate the prob iou between oriented bounding boxes, https://arxiv.org/pdf/2106.06072v1.pdf. 
float a1, b1, c1, a2, b2, c2; std::tuple matrix1 = {a1, b1, c1}; std::tuple matrix2 = {a2, b2, c2}; matrix1 = convariance_matrix(res1); matrix2 = convariance_matrix(res2); a1 = std::get<0>(matrix1); b1 = std::get<1>(matrix1); c1 = std::get<2>(matrix1); a2 = std::get<0>(matrix2); b2 = std::get<1>(matrix2); c2 = std::get<2>(matrix2); float x1 = res1.bbox[0], y1 = res1.bbox[1]; float x2 = res2.bbox[0], y2 = res2.bbox[1]; float t1 = ((a1 + a2) * std::pow(y1 - y2, 2) + (b1 + b2) * std::pow(x1 - x2, 2)) / ((a1 + a2) * (b1 + b2) - std::pow(c1 + c2, 2) + eps); float t2 = ((c1 + c2) * (x2 - x1) * (y1 - y2)) / ((a1 + a2) * (b1 + b2) - std::pow(c1 + c2, 2) + eps); float t3 = std::log( ((a1 + a2) * (b1 + b2) - std::pow(c1 + c2, 2)) / (4 * std::sqrt(std::max(a1 * b1 - c1 * c1, 0.0f)) * std::sqrt(std::max(a2 * b2 - c2 * c2, 0.0f)) + eps) + eps); float bd = 0.25f * t1 + 0.5f * t2 + 0.5f * t3; bd = std::max(std::min(bd, 100.0f), eps); float hd = std::sqrt(1.0 - std::exp(-bd) + eps); return 1 - hd; } void nms_obb(std::vector& res, float* output, float conf_thresh, float nms_thresh) { int det_size = sizeof(Detection) / sizeof(float); std::map> m; for (int i = 0; i < output[0]; i++) { if (output[1 + det_size * i + 4] <= conf_thresh) continue; Detection det; memcpy(&det, &output[1 + det_size * i], det_size * sizeof(float)); if (m.count(det.class_id) == 0) m.emplace(det.class_id, std::vector()); m[det.class_id].push_back(det); } for (auto it = m.begin(); it != m.end(); it++) { auto& dets = it->second; std::sort(dets.begin(), dets.end(), cmp); for (size_t m = 0; m < dets.size(); ++m) { auto& item = dets[m]; res.push_back(item); for (size_t n = m + 1; n < dets.size(); ++n) { if (probiou(item, dets[n]) >= nms_thresh) { dets.erase(dets.begin() + n); --n; } } } } } void batch_nms_obb(std::vector>& res_batch, float* output, int batch_size, int output_size, float conf_thresh, float nms_thresh) { res_batch.resize(batch_size); for (int i = 0; i < batch_size; i++) { nms_obb(res_batch[i], 
&output[i * output_size], conf_thresh, nms_thresh); } } static std::vector get_corner(cv::Mat& img, const Detection& box) { float cos_value, sin_value; // Calculate center point and width/height float x1 = box.bbox[0]; float y1 = box.bbox[1]; float w = box.bbox[2]; float h = box.bbox[3]; float angle = box.angle * 180.0f / CV_PI; // Convert radians to degrees // Print original angle std::cout << "Original angle: " << angle << std::endl; // Swap width and height if height is greater than or equal to width if (h >= w) { std::swap(w, h); angle = fmod(angle + 90.0f, 180.0f); // Adjust angle to be within [0, 180) } // Ensure the angle is between 0 and 180 degrees if (angle < 0) { angle += 360.0f; // Convert to positive value } if (angle > 180.0f) { angle -= 180.0f; // Subtract 180 from angles greater than 180 } // Print adjusted angle std::cout << "Adjusted angle: " << angle << std::endl; // Convert to normal angle value float normal_angle = fmod(angle, 180.0f); if (normal_angle < 0) { normal_angle += 180.0f; // Ensure it's a positive value } // Print normal angle value std::cout << "Normal angle: " << normal_angle << std::endl; cos_value = std::cos(angle * CV_PI / 180.0f); // Convert to radians sin_value = std::sin(angle * CV_PI / 180.0f); // Calculate each corner point float l = x1 - w / 2; // Left boundary float r = x1 + w / 2; // Right boundary float t = y1 - h / 2; // Top boundary float b = y1 + h / 2; // Bottom boundary // Use get_rect function to scale the coordinates float bbox[4] = {l, t, r, b}; cv::Rect rect = get_rect_obb(img, bbox); float x_ = (rect.x + rect.x + rect.width) / 2; // Center x float y_ = (rect.y + rect.y + rect.height) / 2; // Center y float width = rect.width; // Width float height = rect.height; // Height // Calculate each corner point std::vector corner_points(4); float vec1x = width / 2 * cos_value; float vec1y = width / 2 * sin_value; float vec2x = -height / 2 * sin_value; float vec2y = height / 2 * cos_value; corner_points[0] = 
cv::Point(int(round(x_ + vec1x + vec2x)), int(round(y_ + vec1y + vec2y))); // Top-left corner corner_points[1] = cv::Point(int(round(x_ + vec1x - vec2x)), int(round(y_ + vec1y - vec2y))); // Top-right corner corner_points[2] = cv::Point(int(round(x_ - vec1x - vec2x)), int(round(y_ - vec1y - vec2y))); // Bottom-right corner corner_points[3] = cv::Point(int(round(x_ - vec1x + vec2x)), int(round(y_ - vec1y + vec2y))); // Bottom-left corner // Check and adjust corner points to ensure the rectangle is parallel to image boundaries for (auto& point : corner_points) { point.x = std::max(0, std::min(point.x, img.cols - 1)); point.y = std::max(0, std::min(point.y, img.rows - 1)); } return corner_points; } void draw_bbox_obb(std::vector& img_batch, std::vector>& res_batch) { static std::vector colors = {0xFF3838, 0xFF9D97, 0xFF701F, 0xFFB21D, 0xCFD231, 0x48F90A, 0x92CC17, 0x3DDB86, 0x1A9334, 0x00D4BB, 0x2C99A8, 0x00C2FF, 0x344593, 0x6473FF, 0x0018EC, 0x8438FF, 0x520085, 0xCB38FF, 0xFF95C8, 0xFF37C7}; for (size_t i = 0; i < img_batch.size(); i++) { auto& res = res_batch[i]; auto& img = img_batch[i]; for (auto& obj : res) { auto color = colors[(int)obj.class_id % colors.size()]; auto bgr = cv::Scalar(color & 0xFF, color >> 8 & 0xFF, color >> 16 & 0xFF); auto corner_points = get_corner(img, obj); cv::polylines(img, std::vector>{corner_points}, true, bgr, 1); auto text = (std::to_string((int)(obj.class_id)) + ":" + to_string_with_precision(obj.conf)); cv::Size textsize = cv::getTextSize(text, 0, 0.3, 1, nullptr); int width = textsize.width; int height = textsize.height; bool outside = (corner_points[0].y - height >= 3) ? true : false; cv::Point p1(corner_points[0].x, corner_points[0].y), p2; p2.x = corner_points[0].x + width; if (outside) { p2.y = corner_points[0].y - height - 3; } else { p2.y = corner_points[0].y + height + 3; } cv::rectangle(img, p1, p2, bgr, -1, cv::LINE_AA); cv::putText( img, text, cv::Point(corner_points[0].x, (outside ? 
corner_points[0].y - 2 : corner_points[0].y + height + 2)), 0, 0.3, cv::Scalar::all(255), 1, cv::LINE_AA); } } } ================================================ FILE: yolov12-tubro/src/postprocess.cu ================================================ // // Created by lindsay on 23-7-17. // #include "postprocess.h" #include "types.h" static __global__ void decode_kernel_obb(float* predict, int num_bboxes, float confidence_threshold, float* parray, int max_objects) { float count = predict[0]; int position = (blockDim.x * blockIdx.x + threadIdx.x); if (position >= count) return; float* pitem = predict + 1 + position * (sizeof(Detection) / sizeof(float)); int index = atomicAdd(parray, 1); if (index >= max_objects) return; float confidence = pitem[4]; if (confidence < confidence_threshold) return; //[center_x center_y w h conf class_id mask[32] keypoints[51] angle] float cx = pitem[0]; float cy = pitem[1]; float width = pitem[2]; float height = pitem[3]; float label = pitem[5]; float angle = pitem[89]; float* pout_item = parray + 1 + index * bbox_element; *pout_item++ = cx; *pout_item++ = cy; *pout_item++ = width; *pout_item++ = height; *pout_item++ = confidence; *pout_item++ = label; *pout_item++ = 1; // 1 = keep, 0 = ignore *pout_item++ = angle; } static __global__ void decode_kernel(float* predict, int num_bboxes, float confidence_threshold, float* parray, int max_objects) { float count = predict[0]; int position = (blockDim.x * blockIdx.x + threadIdx.x); if (position >= count) return; float* pitem = predict + 1 + position * (sizeof(Detection) / sizeof(float)); int index = atomicAdd(parray, 1); if (index >= max_objects) return; float confidence = pitem[4]; if (confidence < confidence_threshold) return; float left = pitem[0]; float top = pitem[1]; float right = pitem[2]; float bottom = pitem[3]; float label = pitem[5]; float* pout_item = parray + 1 + index * bbox_element; *pout_item++ = left; *pout_item++ = top; *pout_item++ = right; *pout_item++ = bottom; *pout_item++ 
= confidence; *pout_item++ = label; *pout_item++ = 1; // 1 = keep, 0 = ignore } static __device__ float box_iou(float aleft, float atop, float aright, float abottom, float bleft, float btop, float bright, float bbottom) { float cleft = max(aleft, bleft); float ctop = max(atop, btop); float cright = min(aright, bright); float cbottom = min(abottom, bbottom); float c_area = max(cright - cleft, 0.0f) * max(cbottom - ctop, 0.0f); if (c_area == 0.0f) return 0.0f; float a_area = max(0.0f, aright - aleft) * max(0.0f, abottom - atop); float b_area = max(0.0f, bright - bleft) * max(0.0f, bbottom - btop); return c_area / (a_area + b_area - c_area); } static __global__ void nms_kernel(float* bboxes, int max_objects, float threshold) { int position = (blockDim.x * blockIdx.x + threadIdx.x); int count = min(static_cast(bboxes[0]), max_objects); if (position >= count) return; float* pcurrent = bboxes + 1 + position * bbox_element; for (int i = 0; i < count; ++i) { float* pitem = bboxes + 1 + i * bbox_element; if (i == position || pcurrent[5] != pitem[5]) continue; if (pitem[4] >= pcurrent[4]) { if (pitem[4] == pcurrent[4] && i < position) continue; float iou = box_iou(pcurrent[0], pcurrent[1], pcurrent[2], pcurrent[3], pitem[0], pitem[1], pitem[2], pitem[3]); if (iou > threshold) { pcurrent[6] = 0; return; } } } } static __device__ void convariance_matrix(float w, float h, float r, float& a, float& b, float& c) { float a_val = w * w / 12.0f; float b_val = h * h / 12.0f; float cos_r = cosf(r); float sin_r = sinf(r); a = a_val * cos_r * cos_r + b_val * sin_r * sin_r; b = a_val * sin_r * sin_r + b_val * cos_r * cos_r; c = (a_val - b_val) * sin_r * cos_r; } static __device__ float box_probiou(float cx1, float cy1, float w1, float h1, float r1, float cx2, float cy2, float w2, float h2, float r2, float eps = 1e-7) { // Calculate the prob iou between oriented bounding boxes, https://arxiv.org/pdf/2106.06072v1.pdf. 
float a1, b1, c1, a2, b2, c2; convariance_matrix(w1, h1, r1, a1, b1, c1); convariance_matrix(w2, h2, r2, a2, b2, c2); float t1 = ((a1 + a2) * powf(cy1 - cy2, 2) + (b1 + b2) * powf(cx1 - cx2, 2)) / ((a1 + a2) * (b1 + b2) - powf(c1 + c2, 2) + eps); float t2 = ((c1 + c2) * (cx2 - cx1) * (cy1 - cy2)) / ((a1 + a2) * (b1 + b2) - powf(c1 + c2, 2) + eps); float t3 = logf(((a1 + a2) * (b1 + b2) - powf(c1 + c2, 2)) / (4 * sqrtf(fmaxf(a1 * b1 - c1 * c1, 0.0f)) * sqrtf(fmaxf(a2 * b2 - c2 * c2, 0.0f)) + eps) + eps); float bd = 0.25f * t1 + 0.5f * t2 + 0.5f * t3; bd = fmaxf(fminf(bd, 100.0f), eps); float hd = sqrtf(1.0f - expf(-bd) + eps); return 1 - hd; } static __global__ void nms_kernel_obb(float* bboxes, int max_objects, float threshold) { int position = (blockDim.x * blockIdx.x + threadIdx.x); int count = min(static_cast(bboxes[0]), max_objects); if (position >= count) return; float* pcurrent = bboxes + 1 + position * bbox_element; for (int i = 0; i < count; ++i) { float* pitem = bboxes + 1 + i * bbox_element; if (i == position || pcurrent[5] != pitem[5]) continue; if (pitem[4] >= pcurrent[4]) { if (pitem[4] == pcurrent[4] && i < position) continue; float iou = box_probiou(pcurrent[0], pcurrent[1], pcurrent[2], pcurrent[3], pcurrent[7], pitem[0], pitem[1], pitem[2], pitem[3], pitem[7]); if (iou > threshold) { pcurrent[6] = 0; return; } } } } void cuda_decode(float* predict, int num_bboxes, float confidence_threshold, float* parray, int max_objects, cudaStream_t stream) { int block = 256; int grid = ceil(num_bboxes / (float)block); decode_kernel<<>>((float*)predict, num_bboxes, confidence_threshold, parray, max_objects); } void cuda_nms(float* parray, float nms_threshold, int max_objects, cudaStream_t stream) { int block = max_objects < 256 ? 
max_objects : 256; int grid = ceil(max_objects / (float)block); nms_kernel<<>>(parray, max_objects, nms_threshold); } void cuda_decode_obb(float* predict, int num_bboxes, float confidence_threshold, float* parray, int max_objects, cudaStream_t stream) { int block = 256; int grid = ceil(num_bboxes / (float)block); decode_kernel_obb<<>>((float*)predict, num_bboxes, confidence_threshold, parray, max_objects); } void cuda_nms_obb(float* parray, float nms_threshold, int max_objects, cudaStream_t stream) { int block = max_objects < 256 ? max_objects : 256; int grid = ceil(max_objects / (float)block); nms_kernel_obb<<>>(parray, max_objects, nms_threshold); } ================================================ FILE: yolov12-tubro/src/preprocess.cu ================================================ #include "cuda_utils.h" #include "preprocess.h" static uint8_t* img_buffer_host = nullptr; static uint8_t* img_buffer_device = nullptr; __global__ void warpaffine_kernel(uint8_t* src, int src_line_size, int src_width, int src_height, float* dst, int dst_width, int dst_height, uint8_t const_value_st, AffineMatrix d2s, int edge) { int position = blockDim.x * blockIdx.x + threadIdx.x; if (position >= edge) return; float m_x1 = d2s.value[0]; float m_y1 = d2s.value[1]; float m_z1 = d2s.value[2]; float m_x2 = d2s.value[3]; float m_y2 = d2s.value[4]; float m_z2 = d2s.value[5]; int dx = position % dst_width; int dy = position / dst_width; float src_x = m_x1 * dx + m_y1 * dy + m_z1 + 0.5f; float src_y = m_x2 * dx + m_y2 * dy + m_z2 + 0.5f; float c0, c1, c2; if (src_x <= -1 || src_x >= src_width || src_y <= -1 || src_y >= src_height) { // out of range c0 = const_value_st; c1 = const_value_st; c2 = const_value_st; } else { int y_low = floorf(src_y); int x_low = floorf(src_x); int y_high = y_low + 1; int x_high = x_low + 1; uint8_t const_value[] = {const_value_st, const_value_st, const_value_st}; float ly = src_y - y_low; float lx = src_x - x_low; float hy = 1 - ly; float hx = 1 - lx; float w1 = 
hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx; uint8_t* v1 = const_value; uint8_t* v2 = const_value; uint8_t* v3 = const_value; uint8_t* v4 = const_value; if (y_low >= 0) { if (x_low >= 0) v1 = src + y_low * src_line_size + x_low * 3; if (x_high < src_width) v2 = src + y_low * src_line_size + x_high * 3; } if (y_high < src_height) { if (x_low >= 0) v3 = src + y_high * src_line_size + x_low * 3; if (x_high < src_width) v4 = src + y_high * src_line_size + x_high * 3; } c0 = w1 * v1[0] + w2 * v2[0] + w3 * v3[0] + w4 * v4[0]; c1 = w1 * v1[1] + w2 * v2[1] + w3 * v3[1] + w4 * v4[1]; c2 = w1 * v1[2] + w2 * v2[2] + w3 * v3[2] + w4 * v4[2]; } // bgr to rgb float t = c2; c2 = c0; c0 = t; // normalization c0 = c0 / 255.0f; c1 = c1 / 255.0f; c2 = c2 / 255.0f; // rgbrgbrgb to rrrgggbbb int area = dst_width * dst_height; float* pdst_c0 = dst + dy * dst_width + dx; float* pdst_c1 = pdst_c0 + area; float* pdst_c2 = pdst_c1 + area; *pdst_c0 = c0; *pdst_c1 = c1; *pdst_c2 = c2; } void cuda_preprocess(uint8_t* src, int src_width, int src_height, float* dst, int dst_width, int dst_height, cudaStream_t stream) { int img_size = src_width * src_height * 3; // copy data to pinned memory memcpy(img_buffer_host, src, img_size); // copy data to device memory CUDA_CHECK(cudaMemcpyAsync(img_buffer_device, img_buffer_host, img_size, cudaMemcpyHostToDevice, stream)); AffineMatrix s2d, d2s; float scale = std::min(dst_height / (float)src_height, dst_width / (float)src_width); s2d.value[0] = scale; s2d.value[1] = 0; s2d.value[2] = -scale * src_width * 0.5 + dst_width * 0.5; s2d.value[3] = 0; s2d.value[4] = scale; s2d.value[5] = -scale * src_height * 0.5 + dst_height * 0.5; cv::Mat m2x3_s2d(2, 3, CV_32F, s2d.value); cv::Mat m2x3_d2s(2, 3, CV_32F, d2s.value); cv::invertAffineTransform(m2x3_s2d, m2x3_d2s); memcpy(d2s.value, m2x3_d2s.ptr(0), sizeof(d2s.value)); int jobs = dst_height * dst_width; int threads = 256; int blocks = ceil(jobs / (float)threads); warpaffine_kernel<<>>(img_buffer_device, 
src_width * 3, src_width, src_height, dst, dst_width, dst_height, 128, d2s, jobs); } void cuda_batch_preprocess(std::vector& img_batch, float* dst, int dst_width, int dst_height, cudaStream_t stream) { int dst_size = dst_width * dst_height * 3; for (size_t i = 0; i < img_batch.size(); i++) { cuda_preprocess(img_batch[i].ptr(), img_batch[i].cols, img_batch[i].rows, &dst[dst_size * i], dst_width, dst_height, stream); CUDA_CHECK(cudaStreamSynchronize(stream)); } } void cuda_preprocess_init(int max_image_size) { // prepare input data in pinned memory CUDA_CHECK(cudaMallocHost((void**)&img_buffer_host, max_image_size * 3)); // prepare input data in device memory CUDA_CHECK(cudaMalloc((void**)&img_buffer_device, max_image_size * 3)); } void cuda_preprocess_destroy() { CUDA_CHECK(cudaFree(img_buffer_device)); CUDA_CHECK(cudaFreeHost(img_buffer_host)); } ================================================ FILE: yolov12-tubro/yolov12_cls.cpp ================================================ #include "calibrator.h" #include "config.h" #include "cuda_utils.h" #include "logging.h" #include "model.h" #include "utils.h" #include #include #include #include #include using namespace nvinfer1; static Logger gLogger; const static int kOutputSize = kClsNumClass; void batch_preprocess(std::vector& imgs, float* output, int dst_width = 224, int dst_height = 224) { for (size_t b = 0; b < imgs.size(); b++) { int h = imgs[b].rows; int w = imgs[b].cols; int m = std::min(h, w); int top = (h - m) / 2; int left = (w - m) / 2; cv::Mat img = imgs[b](cv::Rect(left, top, m, m)); cv::resize(img, img, cv::Size(dst_width, dst_height), 0, 0, cv::INTER_LINEAR); cv::cvtColor(img, img, cv::COLOR_BGR2RGB); img.convertTo(img, CV_32F, 1 / 255.0); std::vector channels(3); cv::split(img, channels); // CHW format for (int c = 0; c < 3; ++c) { int i = 0; for (int row = 0; row < dst_height; ++row) { for (int col = 0; col < dst_width; ++col) { output[b * 3 * dst_height * dst_width + c * dst_height * dst_width + i] = 
channels[c].at(row, col); ++i; } } } } } std::vector softmax(float* prob, int n) { std::vector res; float sum = 0.0f; float t; for (int i = 0; i < n; i++) { t = expf(prob[i]); res.push_back(t); sum += t; } for (int i = 0; i < n; i++) { res[i] /= sum; } return res; } std::vector topk(const std::vector& vec, int k) { std::vector topk_index; std::vector vec_index(vec.size()); std::iota(vec_index.begin(), vec_index.end(), 0); std::sort(vec_index.begin(), vec_index.end(), [&vec](size_t index_1, size_t index_2) { return vec[index_1] > vec[index_2]; }); int k_num = std::min(vec.size(), k); for (int i = 0; i < k_num; ++i) { topk_index.push_back(vec_index[i]); } return topk_index; } std::vector read_classes(std::string file_name) { std::vector classes; std::ifstream ifs(file_name, std::ios::in); if (!ifs.is_open()) { std::cerr << file_name << " is not found, pls refer to README and download it." << std::endl; assert(0); } std::string s; while (std::getline(ifs, s)) { classes.push_back(s); } ifs.close(); return classes; } bool parse_args(int argc, char** argv, std::string& wts, std::string& engine, float& gd, float& gw, std::string& img_dir, std::string& type, int& max_channels) { if (argc < 4) return false; if (std::string(argv[1]) == "-s" && (argc == 5)) { wts = std::string(argv[2]); engine = std::string(argv[3]); auto net = std::string(argv[4]); if (net[0] == 'n') { gd = 0.50; gw = 0.25; max_channels = 1024; type = "n"; } else if (net[0] == 's') { gd = 0.50; gw = 0.50; max_channels = 1024; type = "s"; } else if (net[0] == 'm') { gd = 0.50; gw = 1.00; max_channels = 512; type = "m"; } else if (net[0] == 'l') { gd = 1.0; gw = 1.0; max_channels = 512; type = "l"; } else if (net[0] == 'x') { gd = 1.0; gw = 1.50; max_channels = 512; type = "x"; } else { return false; } } else if (std::string(argv[1]) == "-d" && argc == 4) { engine = std::string(argv[2]); img_dir = std::string(argv[3]); } else { return false; } return true; } void prepare_buffers(ICudaEngine* engine, float** 
gpu_input_buffer, float** gpu_output_buffer, float** cpu_input_buffer, float** output_buffer_host) { assert(engine->getNbBindings() == 2); // In order to bind the buffers, we need to know the names of the input and output tensors. // Note that indices are guaranteed to be less than IEngine::getNbBindings() const int inputIndex = engine->getBindingIndex(kInputTensorName); const int outputIndex = engine->getBindingIndex(kOutputTensorName); assert(inputIndex == 0); assert(outputIndex == 1); // Create GPU buffers on device CUDA_CHECK(cudaMalloc((void**)gpu_input_buffer, kBatchSize * 3 * kClsInputH * kClsInputW * sizeof(float))); CUDA_CHECK(cudaMalloc((void**)gpu_output_buffer, kBatchSize * kOutputSize * sizeof(float))); *cpu_input_buffer = new float[kBatchSize * 3 * kClsInputH * kClsInputW]; *output_buffer_host = new float[kBatchSize * kOutputSize]; } void infer(IExecutionContext& context, cudaStream_t& stream, void** buffers, float* input, float* output, int batchSize) { CUDA_CHECK(cudaMemcpyAsync(buffers[0], input, batchSize * 3 * kClsInputH * kClsInputW * sizeof(float), cudaMemcpyHostToDevice, stream)); context.enqueueV2(buffers, stream, nullptr); CUDA_CHECK(cudaMemcpyAsync(output, buffers[1], batchSize * kOutputSize * sizeof(float), cudaMemcpyDeviceToHost, stream)); cudaStreamSynchronize(stream); } void serialize_engine(float& gd, float& gw, std::string& wts_name, std::string& engine_name, std::string& type, int max_channels) { // Create builder IBuilder* builder = createInferBuilder(gLogger); IBuilderConfig* config = builder->createBuilderConfig(); // Create model to populate the network, then set the outputs and create an engine IHostMemory* serialized_engine = nullptr; serialized_engine = buildEngineYolov12Cls(builder, config, DataType::kFLOAT, wts_name, gd, gw, type, max_channels); assert(serialized_engine); // Save engine to file std::ofstream p(engine_name, std::ios::binary); if (!p) { std::cerr << "Could not open plan output file" << std::endl; 
assert(false); } p.write(reinterpret_cast(serialized_engine->data()), serialized_engine->size()); // Close everything down delete serialized_engine; delete config; delete builder; } void deserialize_engine(std::string& engine_name, IRuntime** runtime, ICudaEngine** engine, IExecutionContext** context) { std::ifstream file(engine_name, std::ios::binary); if (!file.good()) { std::cerr << "read " << engine_name << " error!" << std::endl; assert(false); } size_t size = 0; file.seekg(0, file.end); size = file.tellg(); file.seekg(0, file.beg); char* serialized_engine = new char[size]; assert(serialized_engine); file.read(serialized_engine, size); file.close(); *runtime = createInferRuntime(gLogger); assert(*runtime); *engine = (*runtime)->deserializeCudaEngine(serialized_engine, size); assert(*engine); *context = (*engine)->createExecutionContext(); assert(*context); delete[] serialized_engine; } int main(int argc, char** argv) { // yolov12-cls -s ../models/yolov12n-cls.wts ../models/yolov12-cls.fp32.trt n // yolov12-cls -d ../models/yolov12n-cls.fp32.trt ../images cudaSetDevice(kGpuId); std::string wts_name; std::string engine_name; float gd = 0.0f, gw = 0.0f; std::string img_dir; std::string type; int max_channels = 0; if (!parse_args(argc, argv, wts_name, engine_name, gd, gw, img_dir, type, max_channels)) { std::cerr << "arguments not right!" 
<< std::endl; std::cerr << "./yolov12-cls -s [.wts] [.engine] [n/s/m/l/x] // serialize model to plan file" << std::endl; std::cerr << "./yolov12-cls -d [.engine] ../images // deserialize plan file and run inference" << std::endl; return -1; } // Create a model using the API directly and serialize it to a file if (!wts_name.empty()) { serialize_engine(gd, gw, wts_name, engine_name, type, max_channels); return 0; } // Deserialize the engine from file IRuntime* runtime = nullptr; ICudaEngine* engine = nullptr; IExecutionContext* context = nullptr; deserialize_engine(engine_name, &runtime, &engine, &context); cudaStream_t stream; CUDA_CHECK(cudaStreamCreate(&stream)); // Prepare cpu and gpu buffers float* device_buffers[2]; float* cpu_input_buffer = nullptr; float* output_buffer_host = nullptr; prepare_buffers(engine, &device_buffers[0], &device_buffers[1], &cpu_input_buffer, &output_buffer_host); // Read images from directory std::vector file_names; if (read_files_in_dir(img_dir.c_str(), file_names) < 0) { std::cerr << "read_files_in_dir failed." 
<< std::endl; return -1; } // Read imagenet labels auto classes = read_classes("imagenet_classes.txt"); // batch predict for (size_t i = 0; i < file_names.size(); i += kBatchSize) { // Get a batch of images std::vector img_batch; std::vector img_name_batch; for (size_t j = i; j < i + kBatchSize && j < file_names.size(); j++) { cv::Mat img = cv::imread(img_dir + "/" + file_names[j]); img_batch.push_back(img); img_name_batch.push_back(file_names[j]); } // Preprocess batch_preprocess(img_batch, cpu_input_buffer); // Run inference auto start = std::chrono::system_clock::now(); infer(*context, stream, (void**)device_buffers, cpu_input_buffer, output_buffer_host, kBatchSize); auto end = std::chrono::system_clock::now(); std::cout << "inference time: " << std::chrono::duration_cast(end - start).count() << "ms" << std::endl; // Postprocess and get top-k result for (size_t b = 0; b < img_name_batch.size(); b++) { float* p = &output_buffer_host[b * kOutputSize]; auto res = softmax(p, kOutputSize); auto topk_idx = topk(res, 3); std::cout << img_name_batch[b] << std::endl; for (auto idx : topk_idx) { std::cout << " " << classes[idx] << " " << res[idx] << std::endl; } } } // Release stream and buffers cudaStreamDestroy(stream); CUDA_CHECK(cudaFree(device_buffers[0])); CUDA_CHECK(cudaFree(device_buffers[1])); delete[] cpu_input_buffer; delete[] output_buffer_host; // Destroy the engine delete context; delete engine; delete runtime; return 0; } ================================================ FILE: yolov12-tubro/yolov12_cls_trt.py ================================================ """ An example that uses TensorRT's Python api to make inferences. 
""" import os import shutil import sys import threading import time import cv2 import numpy as np import torch import pycuda.autoinit # noqa: F401 import pycuda.driver as cuda import tensorrt as trt def get_img_path_batches(batch_size, img_dir): ret = [] batch = [] for root, dirs, files in os.walk(img_dir): for name in files: if len(batch) == batch_size: ret.append(batch) batch = [] batch.append(os.path.join(root, name)) if len(batch) > 0: ret.append(batch) return ret class YoLov12TRT(object): """ description: A YOLOv12 class that warps TensorRT ops, preprocess and postprocess ops. """ def __init__(self, engine_file_path): # Create a Context on this device, self.ctx = cuda.Device(0).make_context() stream = cuda.Stream() TRT_LOGGER = trt.Logger(trt.Logger.INFO) runtime = trt.Runtime(TRT_LOGGER) # Deserialize the engine from file with open(engine_file_path, "rb") as f: engine = runtime.deserialize_cuda_engine(f.read()) context = engine.create_execution_context() host_inputs = [] cuda_inputs = [] host_outputs = [] cuda_outputs = [] bindings = [] self.mean = (0.485, 0.456, 0.406) self.std = (0.229, 0.224, 0.225) for binding in engine: print('binding:', binding, engine.get_binding_shape(binding)) self.batch_size = engine.get_binding_shape(binding)[0] size = trt.volume(engine.get_binding_shape( binding)) * engine.max_batch_size dtype = trt.nptype(engine.get_binding_dtype(binding)) # Allocate host and device buffers host_mem = cuda.pagelocked_empty(size, dtype) cuda_mem = cuda.mem_alloc(host_mem.nbytes) # Append the device buffer to device bindings. bindings.append(int(cuda_mem)) # Append to the appropriate list. 
def infer(self, raw_image_generator):
    """
    description: Preprocess a batch of images, run the classification engine on it and
                 write the predicted class name onto every input image.
    param:
        raw_image_generator: iterable yielding BGR images, at most batch_size of them
    return:
        batch_image_raw: list of the input images with the predicted label drawn on them
        use_time: seconds spent on host-to-device copy, inference and device-to-host copy
    """
    # Make self the active context, pushing it on top of the context stack.
    self.ctx.push()
    try:
        # Restore
        stream = self.stream
        context = self.context
        host_inputs = self.host_inputs
        cuda_inputs = self.cuda_inputs
        host_outputs = self.host_outputs
        cuda_outputs = self.cuda_outputs
        bindings = self.bindings
        # Do image preprocess. The batch is zero-filled so that the unused slots of a
        # partial batch hold defined values instead of uninitialised memory.
        batch_image_raw = []
        batch_input_image = np.zeros(
            shape=[self.batch_size, 3, self.input_h, self.input_w])
        for i, image_raw in enumerate(raw_image_generator):
            batch_image_raw.append(image_raw)
            # Resize to the engine's own input size rather than the 224x224 default,
            # otherwise the copy below fails for engines built with another size.
            input_image = self.preprocess_cls_image(
                image_raw, self.input_w, self.input_h)
            np.copyto(batch_input_image[i], input_image)
        batch_input_image = np.ascontiguousarray(batch_input_image)

        # Copy input image to host buffer
        np.copyto(host_inputs[0], batch_input_image.ravel())
        start = time.time()
        # Transfer input data to the GPU.
        cuda.memcpy_htod_async(cuda_inputs[0], host_inputs[0], stream)
        # Run inference.
        context.execute_async_v2(bindings=bindings,
                                 stream_handle=stream.handle)
        # Transfer predictions back from the GPU.
        cuda.memcpy_dtoh_async(host_outputs[0], cuda_outputs[0], stream)
        # Synchronize the stream
        stream.synchronize()
        end = time.time()
    finally:
        # Remove the context from the top of the context stack even when something
        # above raised, so the stack stays balanced for the next call.
        self.ctx.pop()

    output = host_outputs[0]
    # Do postprocess once for the whole batch; the result lists hold one entry per slot.
    classes_ls, predicted_conf_ls, _ = self.postprocess_cls(output)
    # Only the images actually received are labelled: the last batch of a directory
    # can be shorter than batch_size.
    num_images = len(batch_image_raw)
    for i, image in enumerate(batch_image_raw):
        # A one-element list keeps the text format a single-image batch always had.
        cv2.putText(image, str([classes_ls[i]]), (10, 50),
                    cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 1, cv2.LINE_AA)
    print(classes_ls[:num_images], predicted_conf_ls[:num_images])
    return batch_image_raw, end - start
def postprocess_cls(self, output_data):
    """
    description: Turn the raw classifier output into the best class of every batch entry.
    param:
        output_data: numpy array holding the flat engine output for the whole batch
    return:
        classes_ls:        list of class names, looked up in the module-level `classes`
        predicted_conf_ls: list of softmax probabilities of those classes
        category_id_ls:    list of the corresponding class indices
    """
    # One row of scores per batch entry.
    logits = torch.Tensor(output_data.reshape(self.batch_size, -1))
    probabilities = torch.nn.functional.softmax(logits, dim=1)
    top_scores, top_indices = torch.topk(probabilities, 3)

    # Only the best of the three candidates is reported.
    category_id_ls = top_indices[:, 0].tolist()
    predicted_conf_ls = top_scores[:, 0].tolist()
    classes_ls = [classes[category_id] for category_id in category_id_ls]
    return classes_ls, predicted_conf_ls, category_id_ls
class warmUpThread(threading.Thread):
    """
    description: Thread that runs one inference on the wrapper's warm-up images
                 and prints the shape of the first image and the elapsed time.
    """

    def __init__(self, yolov12_wrapper):
        super().__init__()
        self.yolov12_wrapper = yolov12_wrapper

    def run(self):
        wrapper = self.yolov12_wrapper
        warmup_images = wrapper.get_raw_image_zeros()
        batch_image_raw, use_time = wrapper.infer(warmup_images)
        first_shape = batch_image_raw[0].shape
        elapsed_ms = use_time * 1000
        print('warm_up->{}, time->{:.2f}ms'.format(first_shape, elapsed_ms))
wts_name, std::string& engine_name, float& gd, float& gw, int& max_channels, std::string& type) { IBuilder* builder = createInferBuilder(gLogger); IBuilderConfig* config = builder->createBuilderConfig(); IHostMemory* serialized_engine = nullptr; serialized_engine = buildEngineYolov12Det(builder, config, DataType::kFLOAT, wts_name, gd, gw, max_channels, type); assert(serialized_engine); std::ofstream p(engine_name, std::ios::binary); if (!p) { std::cout << "could not open plan output file" << std::endl; assert(false); } p.write(reinterpret_cast(serialized_engine->data()), serialized_engine->size()); delete serialized_engine; delete config; delete builder; } void deserialize_engine(std::string& engine_name, IRuntime** runtime, ICudaEngine** engine, IExecutionContext** context) { std::ifstream file(engine_name, std::ios::binary); if (!file.good()) { std::cerr << "read " << engine_name << " error!" << std::endl; assert(false); } size_t size = 0; file.seekg(0, file.end); size = file.tellg(); file.seekg(0, file.beg); char* serialized_engine = new char[size]; assert(serialized_engine); file.read(serialized_engine, size); file.close(); *runtime = createInferRuntime(gLogger); assert(*runtime); *engine = (*runtime)->deserializeCudaEngine(serialized_engine, size); assert(*engine); *context = (*engine)->createExecutionContext(); assert(*context); delete[] serialized_engine; } void prepare_buffer(ICudaEngine* engine, float** input_buffer_device, float** output_buffer_device, float** output_buffer_host, float** decode_ptr_host, float** decode_ptr_device, std::string cuda_post_process) { assert(engine->getNbBindings() == 2); // In order to bind the buffers, we need to know the names of the input and output tensors. 
// Note that indices are guaranteed to be less than IEngine::getNbBindings() const int inputIndex = engine->getBindingIndex(kInputTensorName); const int outputIndex = engine->getBindingIndex(kOutputTensorName); assert(inputIndex == 0); assert(outputIndex == 1); // Create GPU buffers on device CUDA_CHECK(cudaMalloc((void**)input_buffer_device, kBatchSize * 3 * kInputH * kInputW * sizeof(float))); CUDA_CHECK(cudaMalloc((void**)output_buffer_device, kBatchSize * kOutputSize * sizeof(float))); if (cuda_post_process == "c") { *output_buffer_host = new float[kBatchSize * kOutputSize]; } else if (cuda_post_process == "g") { if (kBatchSize > 1) { std::cerr << "Do not yet support GPU post processing for multiple batches" << std::endl; exit(0); } // Allocate memory for decode_ptr_host and copy to device *decode_ptr_host = new float[1 + kMaxNumOutputBbox * bbox_element]; CUDA_CHECK(cudaMalloc((void**)decode_ptr_device, sizeof(float) * (1 + kMaxNumOutputBbox * bbox_element))); } } void infer(IExecutionContext& context, cudaStream_t& stream, void** buffers, float* output, int batchsize, float* decode_ptr_host, float* decode_ptr_device, int model_bboxes, std::string cuda_post_process) { // infer on the batch asynchronously, and DMA output back to host auto start = std::chrono::system_clock::now(); context.enqueueV2(buffers, stream, nullptr); if (cuda_post_process == "c") { CUDA_CHECK(cudaMemcpyAsync(output, buffers[1], batchsize * kOutputSize * sizeof(float), cudaMemcpyDeviceToHost, stream)); auto end = std::chrono::system_clock::now(); std::cout << "inference time: " << std::chrono::duration_cast(end - start).count() << "ms" << std::endl; } else if (cuda_post_process == "g") { CUDA_CHECK( cudaMemsetAsync(decode_ptr_device, 0, sizeof(float) * (1 + kMaxNumOutputBbox * bbox_element), stream)); cuda_decode((float*)buffers[1], model_bboxes, kConfThresh, decode_ptr_device, kMaxNumOutputBbox, stream); cuda_nms(decode_ptr_device, kNmsThresh, kMaxNumOutputBbox, stream); //cuda nms 
CUDA_CHECK(cudaMemcpyAsync(decode_ptr_host, decode_ptr_device, sizeof(float) * (1 + kMaxNumOutputBbox * bbox_element), cudaMemcpyDeviceToHost, stream)); auto end = std::chrono::system_clock::now(); std::cout << "inference and gpu postprocess time: " << std::chrono::duration_cast(end - start).count() << "ms" << std::endl; } CUDA_CHECK(cudaStreamSynchronize(stream)); } bool parse_args(int argc, char** argv, std::string& wts, std::string& engine, std::string& img_dir, std::string& type, std::string& cuda_post_process, float& gd, float& gw, int& max_channels) { if (argc < 4) return false; if (std::string(argv[1]) == "-s" && (argc == 5)) { wts = std::string(argv[2]); engine = std::string(argv[3]); auto sub_type = std::string(argv[4]); if (sub_type[0] == 'n') { gd = 0.50; gw = 0.25; max_channels = 1024; type = "n"; } else if (sub_type[0] == 's') { gd = 0.50; gw = 0.50; max_channels = 1024; type = "s"; } else if (sub_type[0] == 'm') { gd = 0.50; gw = 1.00; max_channels = 512; type = "m"; } else if (sub_type[0] == 'l') { gd = 1.0; gw = 1.0; max_channels = 512; type = "l"; } else if (sub_type[0] == 'x') { gd = 1.0; gw = 1.50; max_channels = 512; type = "x"; } else { return false; } } else if (std::string(argv[1]) == "-d" && argc == 5) { engine = std::string(argv[2]); img_dir = std::string(argv[3]); cuda_post_process = std::string(argv[4]); } else { return false; } return true; } int main(int argc, char** argv) { // yolov12_det -s ../models/yolov12n.wts ../models/yolov12n.fp32.trt n // yolov12_det -d ../models/yolov12n.fp32.trt ../images c cudaSetDevice(kGpuId); std::string wts_name; std::string engine_name; std::string img_dir; std::string cuda_post_process; std::string type; int model_bboxes; float gd = 0, gw = 0; int max_channels = 0; if (!parse_args(argc, argv, wts_name, engine_name, img_dir, type, cuda_post_process, gd, gw, max_channels)) { std::cerr << "Arguments not right!" 
<< std::endl; std::cerr << "./yolov12_det -s [.wts] [.engine] [n/s/m/l/x] // serialize model to " "plan file" << std::endl; std::cerr << "./yolov12_det -d [.engine] ../images [c/g]// deserialize plan file and run inference" << std::endl; return -1; } // Create a model using the API directly and serialize it to a file if (!wts_name.empty()) { serialize_engine(wts_name, engine_name, gd, gw, max_channels, type); return 0; } // Deserialize the engine from file IRuntime* runtime = nullptr; ICudaEngine* engine = nullptr; IExecutionContext* context = nullptr; deserialize_engine(engine_name, &runtime, &engine, &context); cudaStream_t stream; CUDA_CHECK(cudaStreamCreate(&stream)); cuda_preprocess_init(kMaxInputImageSize); auto out_dims = engine->getBindingDimensions(1); model_bboxes = out_dims.d[0]; // Prepare cpu and gpu buffers float* device_buffers[2]; float* output_buffer_host = nullptr; float* decode_ptr_host = nullptr; float* decode_ptr_device = nullptr; // Read images from directory std::vector file_names; if (read_files_in_dir(img_dir.c_str(), file_names) < 0) { std::cerr << "read_files_in_dir failed." 
<< std::endl; return -1; } prepare_buffer(engine, &device_buffers[0], &device_buffers[1], &output_buffer_host, &decode_ptr_host, &decode_ptr_device, cuda_post_process); // batch predict for (size_t i = 0; i < file_names.size(); i += kBatchSize) { // Get a batch of images std::vector img_batch; std::vector img_name_batch; for (size_t j = i; j < i + kBatchSize && j < file_names.size(); j++) { cv::Mat img = cv::imread(img_dir + "/" + file_names[j]); img_batch.push_back(img); img_name_batch.push_back(file_names[j]); } // Preprocess cuda_batch_preprocess(img_batch, device_buffers[0], kInputW, kInputH, stream); // Run inference infer(*context, stream, (void**)device_buffers, output_buffer_host, kBatchSize, decode_ptr_host, decode_ptr_device, model_bboxes, cuda_post_process); // Save the first 100 values of output_buffer_host, one per line // std::ofstream out("../models/output.txt"); // for (int j = 0; j < 100; j++) { // out << output_buffer_host[j] << std::endl; // } // out.close(); std::vector> res_batch; if (cuda_post_process == "c") { // NMS batch_nms(res_batch, output_buffer_host, img_batch.size(), kOutputSize, kConfThresh, kNmsThresh); } else if (cuda_post_process == "g") { //Process gpu decode and nms results batch_process(res_batch, decode_ptr_host, img_batch.size(), bbox_element, img_batch); } // Draw bounding boxes draw_bbox(img_batch, res_batch); // Save images for (size_t j = 0; j < img_batch.size(); j++) { cv::imwrite("_" + img_name_batch[j], img_batch[j]); } } // Release stream and buffers cudaStreamDestroy(stream); CUDA_CHECK(cudaFree(device_buffers[0])); CUDA_CHECK(cudaFree(device_buffers[1])); CUDA_CHECK(cudaFree(decode_ptr_device)); delete[] decode_ptr_host; delete[] output_buffer_host; cuda_preprocess_destroy(); // Destroy the engine delete context; delete engine; delete runtime; // Print histogram of the output distribution //std::cout << "\nOutput:\n\n"; //for (unsigned int i = 0; i < kOutputSize; i++) //{ // std::cout << prob[i] << ", "; // if (i % 
def plot_one_box(x, img, color=None, label=None, line_thickness=None):
    """
    description: Draw one bounding box, and optionally a filled label tag above its
                 top-left corner, onto an image in place.
    param:
        x:      a box like [x1,y1,x2,y2]
        img:    an opencv image object, drawn on in place
        color:  color to draw rectangle, such as (0,255,0); random when omitted
        label:  str, text of the tag; no tag is drawn when it is empty
        line_thickness: int, derived from the image size when omitted
    return:
        no return
    """
    # line/font thickness
    thickness = line_thickness or round(0.002 * (img.shape[0] + img.shape[1]) / 2) + 1
    if not color:
        color = [random.randint(0, 255) for _ in range(3)]

    top_left = (int(x[0]), int(x[1]))
    bottom_right = (int(x[2]), int(x[3]))
    cv2.rectangle(img, top_left, bottom_right, color, thickness=thickness, lineType=cv2.LINE_AA)

    if not label:
        return

    font_thickness = max(thickness - 1, 1)
    font_scale = thickness / 3
    text_w, text_h = cv2.getTextSize(label, 0, fontScale=font_scale, thickness=font_thickness)[0]
    # The tag sits directly above the box, as wide as the text.
    tag_corner = top_left[0] + text_w, top_left[1] - text_h - 3
    cv2.rectangle(img, top_left, tag_corner, color, -1, cv2.LINE_AA)  # filled
    cv2.putText(
        img,
        label,
        (top_left[0], top_left[1] - 2),
        0,
        font_scale,
        [225, 255, 255],
        thickness=font_thickness,
        lineType=cv2.LINE_AA,
    )
""" def __init__(self, engine_file_path): # Create a Context on this device, self.ctx = cuda.Device(0).make_context() stream = cuda.Stream() TRT_LOGGER = trt.Logger(trt.Logger.INFO) runtime = trt.Runtime(TRT_LOGGER) # Deserialize the engine from file with open(engine_file_path, "rb") as f: engine = runtime.deserialize_cuda_engine(f.read()) context = engine.create_execution_context() host_inputs = [] cuda_inputs = [] host_outputs = [] cuda_outputs = [] bindings = [] for binding in engine: print('bingding:', binding, engine.get_binding_shape(binding)) self.batch_size = engine.get_binding_shape(binding)[0] size = trt.volume(engine.get_binding_shape(binding)) * engine.max_batch_size dtype = trt.nptype(engine.get_binding_dtype(binding)) # Allocate host and device buffers host_mem = cuda.pagelocked_empty(size, dtype) cuda_mem = cuda.mem_alloc(host_mem.nbytes) # Append the device buffer to device bindings. bindings.append(int(cuda_mem)) # Append to the appropriate list. if engine.binding_is_input(binding): self.input_w = engine.get_binding_shape(binding)[-1] self.input_h = engine.get_binding_shape(binding)[-2] host_inputs.append(host_mem) cuda_inputs.append(cuda_mem) else: host_outputs.append(host_mem) cuda_outputs.append(cuda_mem) # Store self.stream = stream self.context = context self.engine = engine self.host_inputs = host_inputs self.cuda_inputs = cuda_inputs self.host_outputs = host_outputs self.cuda_outputs = cuda_outputs self.bindings = bindings self.det_output_length = host_outputs[0].shape[0] def infer(self, raw_image_generator): threading.Thread.__init__(self) # Make self the active context, pushing it on top of the context stack. 
self.ctx.push() # Restore stream = self.stream context = self.context host_inputs = self.host_inputs cuda_inputs = self.cuda_inputs host_outputs = self.host_outputs cuda_outputs = self.cuda_outputs bindings = self.bindings # Do image preprocess batch_image_raw = [] batch_origin_h = [] batch_origin_w = [] batch_input_image = np.empty(shape=[self.batch_size, 3, self.input_h, self.input_w]) for i, image_raw in enumerate(raw_image_generator): input_image, image_raw, origin_h, origin_w = self.preprocess_image(image_raw) batch_image_raw.append(image_raw) batch_origin_h.append(origin_h) batch_origin_w.append(origin_w) np.copyto(batch_input_image[i], input_image) batch_input_image = np.ascontiguousarray(batch_input_image) # Copy input image to host buffer np.copyto(host_inputs[0], batch_input_image.ravel()) start = time.time() # Transfer input data to the GPU. cuda.memcpy_htod_async(cuda_inputs[0], host_inputs[0], stream) # Run inference. context.execute_async_v2(bindings=bindings, stream_handle=stream.handle) # Transfer predictions back from the GPU. cuda.memcpy_dtoh_async(host_outputs[0], cuda_outputs[0], stream) # Synchronize the stream stream.synchronize() end = time.time() # Remove any context from the top of the context stack, deactivating it. self.ctx.pop() # Here we use the first row of output in that batch_size = 1 output = host_outputs[0] # Do postprocess for i in range(self.batch_size): result_boxes, result_scores, result_classid = self.post_process( output[i * self.det_output_length: (i + 1) * self.det_output_length], batch_origin_h[i], batch_origin_w[i] ) # Draw rectangles and labels on the original image for j in range(len(result_boxes)): box = result_boxes[j] plot_one_box( box, batch_image_raw[i], label="{}:{:.2f}".format( categories[int(result_classid[j])], result_scores[j] ), ) return batch_image_raw, end - start def destroy(self): # Remove any context from the top of the context stack, deactivating it. 
def xywh2xyxy(self, origin_h, origin_w, x):
    """
    description: Map boxes from the letterboxed network input back to the original
                 image: remove the padding offset on the padded axis, then divide by
                 the resize ratio. The columns are treated as [x1, y1, x2, y2]; no
                 center/size conversion takes place.
    param:
        origin_h: height of original image
        origin_w: width of original image
        x:        A boxes numpy, each row is a box [x1, y1, x2, y2] in input-image pixels
    return:
        y:        A boxes numpy of the same shape and dtype as x, each row is a box
                  [x1, y1, x2, y2] in original-image pixels
    """
    ratio_w = self.input_w / origin_w
    ratio_h = self.input_h / origin_h

    if ratio_h > ratio_w:
        # Width was the limiting side, so the padding sits above and below the image.
        ratio = ratio_w
        offset = (self.input_h - ratio_w * origin_h) / 2
        plain_cols, shifted_cols = (0, 2), (1, 3)
    else:
        # Height was the limiting side, so the padding sits left and right of the image.
        ratio = ratio_h
        offset = (self.input_w - ratio_h * origin_w) / 2
        plain_cols, shifted_cols = (1, 3), (0, 2)

    restored = np.zeros_like(x)
    for col in plain_cols:
        restored[:, col] = x[:, col]
    for col in shifted_cols:
        restored[:, col] = x[:, col] - offset
    restored /= ratio
    return restored
def non_max_suppression(self, prediction, origin_h, origin_w, conf_thres=0.5, nms_thres=0.4):
    """
    description: Removes detections with lower object confidence score than 'conf_thres' and performs
                 Non-Maximum Suppression per class to further filter detections.
    param:
        prediction: detections, one row each; the first six columns are
                    (x1, y1, x2, y2, conf, cls_id), further columns are carried along unchanged
        origin_h: original image height
        origin_w: original image width
        conf_thres: a confidence threshold to filter detections
        nms_thres: a iou threshold to filter detections
    return:
        boxes: output after nms, rows in the same column layout as prediction sorted by
               descending confidence, or an empty array when nothing is kept
    """
    # Get the boxes that score >= conf_thres (boolean indexing copies, prediction stays untouched)
    boxes = prediction[prediction[:, 4] >= conf_thres]
    # Map the box coordinates from the network input back to the original image
    boxes[:, :4] = self.xywh2xyxy(origin_h, origin_w, boxes[:, :4])
    # clip the coordinates
    boxes[:, 0] = np.clip(boxes[:, 0], 0, origin_w - 1)
    boxes[:, 2] = np.clip(boxes[:, 2], 0, origin_w - 1)
    boxes[:, 1] = np.clip(boxes[:, 1], 0, origin_h - 1)
    boxes[:, 3] = np.clip(boxes[:, 3], 0, origin_h - 1)
    # Object confidence
    confs = boxes[:, 4]
    # Sort by the confs
    boxes = boxes[np.argsort(-confs)]
    # Perform non-maximum suppression
    keep_boxes = []
    while boxes.shape[0]:
        large_overlap = self.bbox_iou(np.expand_dims(boxes[0, :4], 0), boxes[:, :4]) > nms_thres
        # The class id lives in column 5. Rows may carry extra trailing columns, so the
        # last column must not be used as the label.
        label_match = boxes[0, 5] == boxes[:, 5]
        # Indices of boxes with lower confidence scores, large IOUs and matching labels
        invalid = large_overlap & label_match
        # The reference box is always consumed. Without this the loop never ends when the
        # box's IoU with itself does not exceed nms_thres (e.g. nms_thres >= 1).
        invalid[0] = True
        keep_boxes += [boxes[0]]
        boxes = boxes[~invalid]
    boxes = np.stack(keep_boxes, 0) if len(keep_boxes) else np.array([])
    return boxes
yolo11_wrapper): threading.Thread.__init__(self) self.yolo11_wrapper = yolo11_wrapper def run(self): batch_image_raw, use_time = self.yolo11_wrapper.infer(self.yolo11_wrapper.get_raw_image_zeros()) print('warm_up->{}, time->{:.2f}ms'.format(batch_image_raw[0].shape, use_time * 1000)) if __name__ == "__main__": # load custom plugin and engine PLUGIN_LIBRARY = "build/libmyplugins.so" engine_file_path = "build/yolov12n-det.engine" if len(sys.argv) > 1: engine_file_path = sys.argv[1] if len(sys.argv) > 2: PLUGIN_LIBRARY = sys.argv[2] ctypes.CDLL(PLUGIN_LIBRARY) # load coco labels categories = ["object"] # categories = ["person", "bicycle", "car", "motorcycle", "airplane", "bus", "train", "truck", "boat", # "traffic light", # "fire hydrant", "stop sign", "parking meter", "bench", "bird", "cat", "dog", "horse", "sheep", # "cow", "elephant", "bear", "zebra", "giraffe", "backpack", "umbrella", "handbag", "tie", "suitcase", # "frisbee", # "skis", "snowboard", "sports ball", "kite", "baseball bat", "baseball glove", "skateboard", # "surfboard", # "tennis racket", "bottle", "wine glass", "cup", "fork", "knife", "spoon", "bowl", "banana", "apple", # "sandwich", "orange", "broccoli", "carrot", "hot dog", "pizza", "donut", "cake", "chair", "couch", # "potted plant", "bed", "dining table", "toilet", "tv", "laptop", "mouse", "remote", "keyboard", # "cell phone", # "microwave", "oven", "toaster", "sink", "refrigerator", "book", "clock", "vase", "scissors", # "teddy bear", # "hair drier", "toothbrush"] if os.path.exists('output/'): shutil.rmtree('output/') os.makedirs('output/') # a YoLo11TRT instance yolov12_wrapper = YoLo12TRT(engine_file_path) try: print('batch size is', yolov12_wrapper.batch_size) image_dir = "images" image_path_batches = get_img_path_batches(yolov12_wrapper.batch_size, image_dir) for i in range(10): # create a new thread to do warm_up thread1 = warmUpThread(yolov12_wrapper) thread1.start() thread1.join() for batch in image_path_batches: # create a new thread to 
do inference thread1 = inferThread(yolov12_wrapper, batch) thread1.start() thread1.join() finally: # destroy the instance yolov12_wrapper.destroy() ================================================ FILE: yolov12-tubro/yolov12_seg.cpp ================================================ #include #include #include #include "cuda_utils.h" #include "logging.h" #include "model.h" #include "postprocess.h" #include "preprocess.h" #include "utils.h" Logger gLogger; using namespace nvinfer1; const int kOutputSize = kMaxNumOutputBbox * (sizeof(Detection) - sizeof(float) * 51) / sizeof(float) + 1; const static int kOutputSegSize = 32 * (kInputH / 4) * (kInputW / 4); static cv::Rect get_downscale_rect(float bbox[4], float scale) { float left = bbox[0]; float top = bbox[1]; float right = bbox[0] + bbox[2]; float bottom = bbox[1] + bbox[3]; left = left < 0 ? 0 : left; top = top < 0 ? 0 : top; right = right > kInputW ? kInputW : right; bottom = bottom > kInputH ? kInputH : bottom; left /= scale; top /= scale; right /= scale; bottom /= scale; return cv::Rect(int(left), int(top), int(right - left), int(bottom - top)); } std::vector process_mask(const float* proto, int proto_size, std::vector& dets) { std::vector masks; for (size_t i = 0; i < dets.size(); i++) { cv::Mat mask_mat = cv::Mat::zeros(kInputH / 4, kInputW / 4, CV_32FC1); auto r = get_downscale_rect(dets[i].bbox, 4); for (int x = r.x; x < r.x + r.width; x++) { for (int y = r.y; y < r.y + r.height; y++) { float e = 0.0f; for (int j = 0; j < 32; j++) { e += dets[i].mask[j] * proto[j * proto_size / 32 + y * mask_mat.cols + x]; } e = 1.0f / (1.0f + expf(-e)); mask_mat.at(y, x) = e; } } cv::resize(mask_mat, mask_mat, cv::Size(kInputW, kInputH)); masks.push_back(mask_mat); } return masks; } void serialize_engine(std::string& wts_name, std::string& engine_name, std::string& type, float& gd, float& gw, int& max_channels) { IBuilder* builder = createInferBuilder(gLogger); IBuilderConfig* config = builder->createBuilderConfig(); 
IHostMemory* serialized_engine = nullptr; serialized_engine = buildEngineYolov12Seg(builder, config, DataType::kFLOAT, wts_name, gd, gw, max_channels, type); assert(serialized_engine); std::ofstream p(engine_name, std::ios::binary); if (!p) { std::cout << "could not open plan output file" << std::endl; assert(false); } p.write(reinterpret_cast(serialized_engine->data()), serialized_engine->size()); delete serialized_engine; delete config; delete builder; } void deserialize_engine(std::string& engine_name, IRuntime** runtime, ICudaEngine** engine, IExecutionContext** context) { std::ifstream file(engine_name, std::ios::binary); if (!file.good()) { std::cerr << "read " << engine_name << " error!" << std::endl; assert(false); } size_t size = 0; file.seekg(0, file.end); size = file.tellg(); file.seekg(0, file.beg); char* serialized_engine = new char[size]; assert(serialized_engine); file.read(serialized_engine, size); file.close(); *runtime = createInferRuntime(gLogger); assert(*runtime); *engine = (*runtime)->deserializeCudaEngine(serialized_engine, size); assert(*engine); *context = (*engine)->createExecutionContext(); assert(*context); delete[] serialized_engine; } void prepare_buffer(ICudaEngine* engine, float** input_buffer_device, float** output_buffer_device, float** output_seg_buffer_device, float** output_buffer_host, float** output_seg_buffer_host, float** decode_ptr_host, float** decode_ptr_device, std::string cuda_post_process) { assert(engine->getNbBindings() == 3); // In order to bind the buffers, we need to know the names of the input and output tensors. 
// Note that indices are guaranteed to be less than IEngine::getNbBindings() const int inputIndex = engine->getBindingIndex(kInputTensorName); const int outputIndex = engine->getBindingIndex(kOutputTensorName); const int outputIndex_seg = engine->getBindingIndex("proto"); assert(inputIndex == 0); assert(outputIndex == 1); assert(outputIndex_seg == 2); // Create GPU buffers on device CUDA_CHECK(cudaMalloc((void**)input_buffer_device, kBatchSize * 3 * kInputH * kInputW * sizeof(float))); CUDA_CHECK(cudaMalloc((void**)output_buffer_device, kBatchSize * kOutputSize * sizeof(float))); CUDA_CHECK(cudaMalloc((void**)output_seg_buffer_device, kBatchSize * kOutputSegSize * sizeof(float))); if (cuda_post_process == "c") { *output_buffer_host = new float[kBatchSize * kOutputSize]; *output_seg_buffer_host = new float[kBatchSize * kOutputSegSize]; } else if (cuda_post_process == "g") { if (kBatchSize > 1) { std::cerr << "Do not yet support GPU post processing for multiple batches" << std::endl; exit(0); } // Allocate memory for decode_ptr_host and copy to device *decode_ptr_host = new float[1 + kMaxNumOutputBbox * bbox_element]; CUDA_CHECK(cudaMalloc((void**)decode_ptr_device, sizeof(float) * (1 + kMaxNumOutputBbox * bbox_element))); } } void infer(IExecutionContext& context, cudaStream_t& stream, void** buffers, float* output, float* output_seg, int batchsize, float* decode_ptr_host, float* decode_ptr_device, int model_bboxes, std::string cuda_post_process) { // infer on the batch asynchronously, and DMA output back to host auto start = std::chrono::system_clock::now(); context.enqueueV2(buffers, stream, nullptr); if (cuda_post_process == "c") { std::cout << "kOutputSize:" << kOutputSize << std::endl; CUDA_CHECK(cudaMemcpyAsync(output, buffers[1], batchsize * kOutputSize * sizeof(float), cudaMemcpyDeviceToHost, stream)); std::cout << "kOutputSegSize:" << kOutputSegSize << std::endl; CUDA_CHECK(cudaMemcpyAsync(output_seg, buffers[2], batchsize * kOutputSegSize * sizeof(float), 
cudaMemcpyDeviceToHost, stream)); auto end = std::chrono::system_clock::now(); std::cout << "inference time: " << std::chrono::duration_cast(end - start).count() << "ms" << std::endl; } else if (cuda_post_process == "g") { CUDA_CHECK( cudaMemsetAsync(decode_ptr_device, 0, sizeof(float) * (1 + kMaxNumOutputBbox * bbox_element), stream)); cuda_decode((float*)buffers[1], model_bboxes, kConfThresh, decode_ptr_device, kMaxNumOutputBbox, stream); cuda_nms(decode_ptr_device, kNmsThresh, kMaxNumOutputBbox, stream); //cuda nms CUDA_CHECK(cudaMemcpyAsync(decode_ptr_host, decode_ptr_device, sizeof(float) * (1 + kMaxNumOutputBbox * bbox_element), cudaMemcpyDeviceToHost, stream)); auto end = std::chrono::system_clock::now(); std::cout << "inference and gpu postprocess time: " << std::chrono::duration_cast(end - start).count() << "ms" << std::endl; } CUDA_CHECK(cudaStreamSynchronize(stream)); } bool parse_args(int argc, char** argv, std::string& wts, std::string& engine, std::string& img_dir, std::string& type, std::string& cuda_post_process, std::string& labels_filename, float& gd, float& gw, int& max_channels) { if (argc < 4) return false; if (std::string(argv[1]) == "-s" && argc == 5) { wts = std::string(argv[2]); engine = std::string(argv[3]); std::string sub_type = std::string(argv[4]); if (sub_type[0] == 'n') { gd = 0.50; gw = 0.25; max_channels = 1024; type = "n"; } else if (sub_type[0] == 's') { gd = 0.50; gw = 0.50; max_channels = 1024; type = "s"; } else if (sub_type[0] == 'm') { gd = 0.50; gw = 1.00; max_channels = 512; type = "m"; } else if (sub_type[0] == 'l') { gd = 1.0; gw = 1.0; max_channels = 512; type = "l"; } else if (sub_type[0] == 'x') { gd = 1.0; gw = 1.50; max_channels = 512; type = "x"; } else { return false; } } else if (std::string(argv[1]) == "-d" && argc == 6) { engine = std::string(argv[2]); img_dir = std::string(argv[3]); cuda_post_process = std::string(argv[4]); labels_filename = std::string(argv[5]); } else { return false; } return true; } int 
main(int argc, char** argv) { // yolo11_seg -s ../models/yolo11n-seg.wts ../models/yolo11n-seg.fp32.trt n // yolo11_seg -d ../models/yolo11n-seg.fp32.trt ../images c coco.txt cudaSetDevice(kGpuId); std::string wts_name; std::string engine_name; std::string img_dir; std::string type; std::string cuda_post_process; std::string labels_filename = "coco.txt"; int model_bboxes; float gd = 0.0f, gw = 0.0f; int max_channels = 0; if (!parse_args(argc, argv, wts_name, engine_name, img_dir, type, cuda_post_process, labels_filename, gd, gw, max_channels)) { std::cerr << "Arguments not right!" << std::endl; std::cerr << "./yolo11_seg -s [.wts] [.engine] [n/s/m/l/x] // serialize model to plan file" << std::endl; std::cerr << "./yolo11_seg -d [.engine] ../images [c/g] coco_file// deserialize plan file and run inference" << std::endl; return -1; } // Create a model using the API directly and serialize it to a file if (!wts_name.empty()) { serialize_engine(wts_name, engine_name, type, gd, gw, max_channels); return 0; } // Deserialize the engine from file IRuntime* runtime = nullptr; ICudaEngine* engine = nullptr; IExecutionContext* context = nullptr; deserialize_engine(engine_name, &runtime, &engine, &context); cudaStream_t stream; CUDA_CHECK(cudaStreamCreate(&stream)); cuda_preprocess_init(kMaxInputImageSize); auto out_dims = engine->getBindingDimensions(1); model_bboxes = out_dims.d[0]; // Prepare cpu and gpu buffers float* device_buffers[3]; float* output_buffer_host = nullptr; float* output_seg_buffer_host = nullptr; float* decode_ptr_host = nullptr; float* decode_ptr_device = nullptr; // Read images from directory std::vector file_names; if (read_files_in_dir(img_dir.c_str(), file_names) < 0) { std::cerr << "read_files_in_dir failed." 
<< std::endl; return -1; } std::unordered_map labels_map; read_labels(labels_filename, labels_map); assert(kNumClass == labels_map.size()); prepare_buffer(engine, &device_buffers[0], &device_buffers[1], &device_buffers[2], &output_buffer_host, &output_seg_buffer_host, &decode_ptr_host, &decode_ptr_device, cuda_post_process); // // batch predict for (size_t i = 0; i < file_names.size(); i += kBatchSize) { // Get a batch of images std::vector img_batch; std::vector img_name_batch; for (size_t j = i; j < i + kBatchSize && j < file_names.size(); j++) { cv::Mat img = cv::imread(img_dir + "/" + file_names[j]); img_batch.push_back(img); img_name_batch.push_back(file_names[j]); } // Preprocess cuda_batch_preprocess(img_batch, device_buffers[0], kInputW, kInputH, stream); // Run inference infer(*context, stream, (void**)device_buffers, output_buffer_host, output_seg_buffer_host, kBatchSize, decode_ptr_host, decode_ptr_device, model_bboxes, cuda_post_process); std::vector> res_batch; if (cuda_post_process == "c") { // NMS batch_nms(res_batch, output_buffer_host, img_batch.size(), kOutputSize, kConfThresh, kNmsThresh); for (size_t b = 0; b < img_batch.size(); b++) { auto& res = res_batch[b]; cv::Mat img = img_batch[b]; auto masks = process_mask(&output_seg_buffer_host[b * kOutputSegSize], kOutputSegSize, res); draw_mask_bbox(img, res, masks, labels_map); cv::imwrite("_" + img_name_batch[b], img); } } else if (cuda_post_process == "g") { // Process gpu decode and nms results // batch_process(res_batch, decode_ptr_host, img_batch.size(), bbox_element, img_batch); // todo seg in gpu std::cerr << "seg_postprocess is not support in gpu right now" << std::endl; } } // Release stream and buffers cudaStreamDestroy(stream); CUDA_CHECK(cudaFree(device_buffers[0])); CUDA_CHECK(cudaFree(device_buffers[1])); CUDA_CHECK(cudaFree(device_buffers[2])); CUDA_CHECK(cudaFree(decode_ptr_device)); delete[] decode_ptr_host; delete[] output_buffer_host; delete[] output_seg_buffer_host; 
def get_img_path_batches(batch_size, img_dir):
    """
    description: Walk img_dir recursively and group every file path found into batches.
    param:
        batch_size: int, number of paths per batch; the final batch may be shorter
        img_dir:    str, root directory handed to os.walk
    return:
        a list of lists of paths, in the order os.walk yields them; empty when no
        file is found
    """
    every_path = (
        os.path.join(folder, file_name)
        for folder, _subdirs, file_names in os.walk(img_dir)
        for file_name in file_names
    )

    batches = []
    current = []
    for path in every_path:
        # Flush the running batch as soon as it is full, right before adding the next path.
        if len(current) == batch_size:
            batches.append(current)
            current = []
        current.append(path)

    # Keep the trailing, possibly partial, batch.
    if current:
        batches.append(current)
    return batches
def __init__(self, engine_file_path):
    """
    description: Deserialize a TensorRT engine and allocate one page-locked host buffer
                 plus one device buffer for every binding of the engine.
    param:
        engine_file_path: str, path of the serialized engine file (read in binary mode)
    """
    # Create a Context on this device; infer() pushes it and destroy() pops it.
    self.ctx = cuda.Device(0).make_context()
    stream = cuda.Stream()
    TRT_LOGGER = trt.Logger(trt.Logger.INFO)
    runtime = trt.Runtime(TRT_LOGGER)

    # Deserialize the engine from file
    with open(engine_file_path, "rb") as f:
        engine = runtime.deserialize_cuda_engine(f.read())
    context = engine.create_execution_context()

    host_inputs = []
    cuda_inputs = []
    host_outputs = []
    cuda_outputs = []
    bindings = []

    for binding in engine:
        binding_shape = engine.get_binding_shape(binding)
        print('bingding:', binding, binding_shape)
        # Overwritten for every binding, so the first dimension of the last binding wins.
        self.batch_size = binding_shape[0]
        size = trt.volume(binding_shape) * engine.max_batch_size
        dtype = trt.nptype(engine.get_binding_dtype(binding))
        # Allocate host and device buffers
        host_mem = cuda.pagelocked_empty(size, dtype)
        cuda_mem = cuda.mem_alloc(host_mem.nbytes)
        # Append the device buffer to device bindings.
        bindings.append(int(cuda_mem))
        # Append to the appropriate list.
        if engine.binding_is_input(binding):
            # The last two dimensions of the input binding are taken as height, width.
            self.input_w = binding_shape[-1]
            self.input_h = binding_shape[-2]
            host_inputs.append(host_mem)
            cuda_inputs.append(cuda_mem)
        else:
            host_outputs.append(host_mem)
            cuda_outputs.append(cuda_mem)

    # Store
    self.stream = stream
    self.context = context
    self.engine = engine
    self.host_inputs = host_inputs
    self.cuda_inputs = cuda_inputs
    self.host_outputs = host_outputs
    self.cuda_outputs = cuda_outputs
    self.bindings = bindings

    # Data length
    # NOTE(review): both lengths are the full flattened buffer sizes; they look like
    # per-image lengths only when the batch dimension is 1 - confirm before batching.
    self.det_output_length = host_outputs[0].shape[0]
    self.seg_output_length = host_outputs[1].shape[0]
    self.seg_w = int(self.input_w / 4)
    self.seg_h = int(self.input_h / 4)
    # Channel count = total elements / (height * width). The previous code divided by
    # seg_w * seg_w, which is only correct for square inputs.
    self.seg_c = int(self.seg_output_length / (self.seg_h * self.seg_w))
    self.det_row_output_length = self.seg_c + DET_NUM + POSE_NUM + OBB_NUM

    # Draw mask
    self.colors_obj = Colors()
def get_raw_image_zeros(self, image_path_batch=None):
    """
    description: Produce blank images to feed a warm-up inference.
    param:
        image_path_batch: ignored by this method
    return:
        a generator yielding self.batch_size uint8 arrays of shape
        (self.input_h, self.input_w, 3), all filled with zeros
    """
    yield from (
        np.zeros([self.input_h, self.input_w, 3], dtype=np.uint8)
        for _ in range(self.batch_size)
    )
def xywh2xyxy(self, origin_h, origin_w, x):
    """
    description: Map box coordinates from the letterboxed network input back to the
                 original image. Despite the name, no centre/size conversion happens
                 here: columns 0 and 2 are treated as x coordinates and columns 1 and 3
                 as y coordinates; the padding offset is subtracted on the padded axis
                 and everything is divided by the resize ratio that was applied.
    param:
        origin_h: height of original image
        origin_w: width of original image
        x:        A boxes numpy, one box per row, coordinates in network-input pixels
    return:
        y:        A boxes numpy with the same shape and dtype as x, coordinates in
                  original-image pixels
    """
    restored = np.zeros_like(x)
    ratio_w = self.input_w / origin_w
    ratio_h = self.input_h / origin_h

    if ratio_h > ratio_w:
        # Width was the limiting side: padding sits above and below the image.
        scale = ratio_w
        shift_x = 0
        shift_y = (self.input_h - ratio_w * origin_h) / 2
    else:
        # Height was the limiting side: padding sits left and right of the image.
        scale = ratio_h
        shift_x = (self.input_w - ratio_h * origin_w) / 2
        shift_y = 0

    for column, shift in ((0, shift_x), (2, shift_x), (1, shift_y), (3, shift_y)):
        restored[:, column] = x[:, column] - shift
    restored /= scale
    return restored
def bbox_iou(self, box1, box2, x1y1x2y2=True):
    """
    description: Compute the intersection-over-union of two sets of boxes, element-wise
                 with numpy broadcasting. Box extents are measured inclusively (+1).
    param:
        box1:     2-D array, one box per row, (x1, y1, x2, y2) or (cx, cy, w, h)
        box2:     2-D array, one box per row, same format as box1
        x1y1x2y2: True when the rows are corner coordinates, False when they are
                  centre + width/height
    return:
        iou: array of IoU values
    """
    def corners(box):
        if x1y1x2y2:
            # Get the coordinates of bounding boxes
            return box[:, 0], box[:, 1], box[:, 2], box[:, 3]
        # Transform from center and width to exact coordinates
        left, right = box[:, 0] - box[:, 2] / 2, box[:, 0] + box[:, 2] / 2
        top, bottom = box[:, 1] - box[:, 3] / 2, box[:, 1] + box[:, 3] / 2
        return left, top, right, bottom

    a_x1, a_y1, a_x2, a_y2 = corners(box1)
    b_x1, b_y1, b_x2, b_y2 = corners(box2)

    # Intersection rectangle; a negative extent means no overlap and is clipped to 0.
    overlap_w = np.clip(np.minimum(a_x2, b_x2) - np.maximum(a_x1, b_x1) + 1, 0, None)
    overlap_h = np.clip(np.minimum(a_y2, b_y2) - np.maximum(a_y1, b_y1) + 1, 0, None)
    inter_area = overlap_w * overlap_h

    # Union Area
    area_a = (a_x2 - a_x1 + 1) * (a_y2 - a_y1 + 1)
    area_b = (b_x2 - b_x1 + 1) * (b_y2 - b_y1 + 1)

    # The tiny epsilon keeps the division defined when the union is zero.
    return inter_area / (area_a + area_b - inter_area + 1e-16)
def process_mask(self, output_proto_mask, result_proto_coef, result_boxes, ih, iw):
    """
    description: Build one binary mask per detection from the prototype masks and the
                 per-detection coefficients.
    param:
        output_proto_mask: flat prototype buffer, reshaped here to
                           (self.seg_c, self.seg_h, self.seg_w)
        result_proto_coef: coefficients, one row per detection, multiplied against the
                           flattened prototypes
        result_boxes:      one box per detection; elements 0..3 are used as x1, y1, x2, y2
                           in original-image pixels
        ih:                rows of original image
        iw:                cols of original image
    return:
        mask_result: array stacking one (ih, iw) mask per detection; each mask is 1 where
                     the rescaled probability is >= 0.5 inside the box and 0 elsewhere
    """
    prototypes = output_proto_mask.reshape(self.seg_c, self.seg_h, self.seg_w)
    channels, proto_h, proto_w = prototypes.shape
    print(prototypes.shape)
    print(result_proto_coef.shape)

    # Linear combination of prototypes per detection, squashed to probabilities.
    flat_prototypes = prototypes.astype(np.float32).reshape(channels, -1)
    logits = result_proto_coef @ flat_prototypes
    probabilities = self.sigmoid(logits).reshape(-1, proto_h, proto_w)

    per_detection = []
    for probability, box in zip(probabilities, result_boxes):
        canvas = np.zeros((ih, iw))
        full_size = self.scale_mask(probability, ih, iw)
        left, top, right, bottom = (int(box[k]) for k in range(4))
        # Only the area inside the detection box is kept; it is thresholded at 0.5.
        inside_box = full_size[top:bottom, left:right]
        canvas[top:bottom, left:right] = (inside_box >= 0.5).astype(np.uint8)
        per_detection.append(canvas)

    return np.array(per_detection)
class Colors:
    """
    description: Fixed palette of 20 colours addressed by an integer-like index that
                 wraps around the palette length.
    """

    def __init__(self):
        hex_codes = ('FF95C8', 'FF3838', 'FF9D97', 'FF701F', 'FFB21D', 'CFD231', '48F90A', '92CC17', '3DDB86',
                     '1A9334', '00D4BB', '2C99A8', '00C2FF', '344593', '6473FF', '0018EC', '8438FF', '520085',
                     'CB38FF', 'FF37C7')
        palette = []
        for code in hex_codes:
            palette.append(self.hex2rgb('#' + code))
        self.palette = palette
        self.n = len(palette)

    def __call__(self, i, bgr=False):
        """
        param:
            i:   index, converted with int() and taken modulo the palette size
            bgr: when True the channels are returned in reverse (b, g, r) order
        return:
            a 3-tuple of ints
        """
        rgb = self.palette[int(i) % self.n]
        if bgr:
            return (rgb[2], rgb[1], rgb[0])
        return rgb

    @staticmethod
    def hex2rgb(h):  # rgb order (PIL)
        """Convert a '#RRGGBB' string into an (r, g, b) tuple of ints."""
        return tuple(int(h[start:start + 2], 16) for start in range(1, 7, 2))
len(sys.argv) > 2: PLUGIN_LIBRARY = sys.argv[2] ctypes.CDLL(PLUGIN_LIBRARY) # load coco labels categories = ["QT", "CT", "VT", "XT"] # categories = ["person", "bicycle", "car", "motorcycle", "airplane", "bus", "train", "truck", "boat", # "traffic light", # "fire hydrant", "stop sign", "parking meter", "bench", "bird", "cat", "dog", "horse", "sheep", # "cow", "elephant", "bear", "zebra", "giraffe", "backpack", "umbrella", "handbag", "tie", "suitcase", # "frisbee", # "skis", "snowboard", "sports ball", "kite", "baseball bat", "baseball glove", "skateboard", # "surfboard", # "tennis racket", "bottle", "wine glass", "cup", "fork", "knife", "spoon", "bowl", "banana", "apple", # "sandwich", "orange", "broccoli", "carrot", "hot dog", "pizza", "donut", "cake", "chair", "couch", # "potted plant", "bed", "dining table", "toilet", "tv", "laptop", "mouse", "remote", "keyboard", # "cell phone", # "microwave", "oven", "toaster", "sink", "refrigerator", "book", "clock", "vase", "scissors", # "teddy bear", # "hair drier", "toothbrush"] if os.path.exists('output/'): shutil.rmtree('output/') os.makedirs('output/') # a YoLo11TRT instance yolov12_wrapper = YoLo12TRT(engine_file_path) try: print('batch size is', yolov12_wrapper.batch_size) image_dir = "images" image_path_batches = get_img_path_batches(yolov12_wrapper.batch_size, image_dir) for i in range(10): # create a new thread to do warm_up thread1 = warmUpThread(yolov12_wrapper) thread1.start() thread1.join() for batch in image_path_batches: # create a new thread to do inference thread1 = inferThread(yolov12_wrapper, batch) thread1.start() thread1.join() finally: # destroy the instance yolov12_wrapper.destroy() ================================================ FILE: yolov13/CMakeLists.txt ================================================ cmake_minimum_required(VERSION 3.10) project(yolov13) # Set up environment-based paths for CUDA and TensorRT if(DEFINED ENV{CUDA_HOME}) set(CUDA_TOOLKIT_ROOT_DIR $ENV{CUDA_HOME}) else() 
set(CUDA_TOOLKIT_ROOT_DIR "/usr/local/cuda") endif() if(DEFINED ENV{TENSORRT_DIR}) set(TENSORRT_ROOT $ENV{TENSORRT_DIR}) else() set(TENSORRT_ROOT "/opt/TensorRT-8.6.1.6") endif() message(STATUS "Using CUDA from: ${CUDA_TOOLKIT_ROOT_DIR}") message(STATUS "Using TensorRT from: ${TENSORRT_ROOT}") add_definitions(-std=c++11) add_definitions(-DAPI_EXPORTS) set(CMAKE_CXX_STANDARD 11) set(CMAKE_BUILD_TYPE Debug) set(CMAKE_CUDA_COMPILER ${CUDA_TOOLKIT_ROOT_DIR}/bin/nvcc) enable_language(CUDA) include_directories(${PROJECT_SOURCE_DIR}/include) include_directories(${PROJECT_SOURCE_DIR}/plugin) # CUDA and TensorRT configuration if(CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64") message("embed_platform on") include_directories(${CUDA_TOOLKIT_ROOT_DIR}/targets/aarch64-linux/include) link_directories(${CUDA_TOOLKIT_ROOT_DIR}/targets/aarch64-linux/lib) include_directories(${TENSORRT_ROOT}/include) link_directories(${TENSORRT_ROOT}/lib) else() message("embed_platform off") include_directories(${CUDA_TOOLKIT_ROOT_DIR}/include) link_directories(${CUDA_TOOLKIT_ROOT_DIR}/lib64) include_directories(${TENSORRT_ROOT}/include) link_directories(${TENSORRT_ROOT}/lib) endif() add_library(myplugins SHARED ${PROJECT_SOURCE_DIR}/plugin/yololayer.cu) target_link_libraries(myplugins nvinfer cudart) find_package(OpenCV REQUIRED) include_directories(${OpenCV_INCLUDE_DIRS}) file(GLOB_RECURSE SRCS ${PROJECT_SOURCE_DIR}/src/*.cpp ${PROJECT_SOURCE_DIR}/src/*.cu) add_executable(yolov13-det ${PROJECT_SOURCE_DIR}/yolov13_det.cpp ${SRCS}) target_link_libraries(yolov13-det nvinfer) target_link_libraries(yolov13-det cudart) target_link_libraries(yolov13-det myplugins) target_link_libraries(yolov13-det ${OpenCV_LIBS}) ================================================ FILE: yolov13/gen_wts.py ================================================ import sys # noqa: F401 import argparse import os import struct import torch def parse_args(): parser = argparse.ArgumentParser(description='Convert .pt file to .wts') 
parser.add_argument('-w', '--weights', required=True, help='Input weights (.pt) file path (required)') parser.add_argument( '-o', '--output', help='Output (.wts) file path (optional)') args = parser.parse_args() if not os.path.isfile(args.weights): raise SystemExit('Invalid input file') if not args.output: args.output = os.path.splitext(args.weights)[0] + '.wts' elif os.path.isdir(args.output): args.output = os.path.join( args.output, os.path.splitext(os.path.basename(args.weights))[0] + '.wts') return args.weights, args.output pt_file, wts_file = parse_args() print('Generating .wts for detection model') # Load model print(f'Loading {pt_file}') # Initialize device = 'cpu' # Load model model = torch.load(pt_file, map_location=device, weights_only=False)['model'].float() # load to FP32 # Anchor handling for detection model anchor_grid = model.model[-1].anchors * model.model[-1].stride[..., None, None] delattr(model.model[-1], 'anchors') model.to(device).eval() with open(wts_file, 'w') as f: f.write('{}\n'.format(len(model.state_dict().keys()))) for k, v in model.state_dict().items(): vr = v.reshape(-1).cpu().numpy() f.write('{} {} '.format(k, len(vr))) for vv in vr: f.write(' ') f.write(struct.pack('>f', float(vv)).hex()) f.write('\n') # python3 gen_wts.py -w your_model.pt -o output_name.wts ================================================ FILE: yolov13/include/block.h ================================================ #pragma once #include #include #include #include "NvInfer.h" using namespace std; std::map loadWeights(const std::string file); nvinfer1::IScaleLayer* addBatchNorm2d(nvinfer1::INetworkDefinition* network, std::map weightMap, nvinfer1::ITensor& input, std::string lname, float eps); nvinfer1::IElementWiseLayer* convBnSiLU(nvinfer1::INetworkDefinition* network, std::map weightMap, nvinfer1::ITensor& input, int ch, std::vector k, int s, std::string lname, int p = 0, int g = 1, int d = 1); nvinfer1::ILayer* Conv(nvinfer1::INetworkDefinition* network, std::map 
weightMap, nvinfer1::ITensor& input, int c_out, std::string lname, int k = 1, int s = 1, int padding = 0, int g = 1, bool act = true); nvinfer1::IShuffleLayer* DFL(nvinfer1::INetworkDefinition* network, std::map weightMap, nvinfer1::ITensor& input, int ch, int grid, int k, int s, int p, std::string lname); nvinfer1::IPluginV2Layer* addYoLoLayer(nvinfer1::INetworkDefinition* network, std::vector dets, const int* px_arry, int px_arry_num); nvinfer1::IElementWiseLayer* C3k(nvinfer1::INetworkDefinition* network, std::map weightMap, nvinfer1::ITensor& input, int c2, std::string lname, int n = 1, bool shortcut = true, int g = 1, float e = 0.5, int k = 3); nvinfer1::IElementWiseLayer* C3K2(nvinfer1::INetworkDefinition* network, std::map& weightMap, nvinfer1::ITensor& input, int c2, int n, std::string lname, bool c3k = false, float e = 0.5, int g = 1, bool shortcut = true); nvinfer1::ILayer* AAttn(nvinfer1::INetworkDefinition* network, std::map weightMap, nvinfer1::ITensor& input, int dim, int num_heads, std::string lname, int area = 1); nvinfer1::ILayer* DWConv(nvinfer1::INetworkDefinition* network, std::map weightMap, nvinfer1::ITensor& input, int ch, std::vector k, int s, std::string lname); nvinfer1::IElementWiseLayer* ABlock(nvinfer1::INetworkDefinition* network, std::map weightMap, nvinfer1::ITensor& input, int dim, int num_heads, std::string lname, float mlp_ratio = 1.2, int area = 1); nvinfer1::ILayer* A2C2f(nvinfer1::INetworkDefinition* network, std::map, nvinfer1::ITensor& input, int c2, int n, std::string lname, bool a2 = true, int area = 1, bool residual = false, float mlp_ratio = 2.0, float e = 0.5, int g = 1, bool shortcut = true); nvinfer1::IElementWiseLayer* DSConv(nvinfer1::INetworkDefinition* network, std::map weightMap, nvinfer1::ITensor& input, int c_in, int c_out, std::string lname, int k = 3, int s = 1, int p = 0, int d = 1, bool bias = false); nvinfer1::ILayer* DSBottleneck(nvinfer1::INetworkDefinition* network, std::map weightMap, nvinfer1::ITensor& 
input, int c1, int c2, std::string lname, bool shortcut = true, float e = 0.5, int k1 = 3, int k2 = 5, int d2 = 1); nvinfer1::ILayer* DSC3k(nvinfer1::INetworkDefinition* network, std::map weightMap, nvinfer1::ITensor& input, int c2, int n, std::string lname, bool shortcut = true, int g = 1, float e = 0.5, int k1 = 3, int k2 = 5, int d2 = 1); nvinfer1::ILayer* DSC3K2(nvinfer1::INetworkDefinition* network, std::map weightMap, nvinfer1::ITensor& input, int c2, std::string lname, int n = 1, bool dsc3k = false, float e = 0.5, int g = 1, bool shortcut = true, int k1 = 3, int k2 = 7, int d2 = 1); nvinfer1::ILayer* FuseModule(nvinfer1::INetworkDefinition* network, std::map weightMap, std::vector& input, int c_in, bool channel_adjust, std::string lname); // nvinfer1::ILayer* FuseModule(nvinfer1::INetworkDefinition* network, std::map weightMap, // std::vectorinput, int c_in, bool channel_adjust, std::string lname); nvinfer1::ISoftMaxLayer* AdaHyperedgeGen(nvinfer1::INetworkDefinition* network, std::map weightMap, nvinfer1::ITensor& input, int node_dim, int num_hyperedges, std::string lname, int num_heads = 4, std::string context = "both"); nvinfer1::IElementWiseLayer* GELU(nvinfer1::INetworkDefinition* network, nvinfer1::ITensor& input); nvinfer1::IElementWiseLayer* AdaHGConv(nvinfer1::INetworkDefinition* network, std::map weightMap, nvinfer1::ITensor& input, int embed_dim, std::string lname, int num_hyperedges = 16, int num_heads = 4, std::string context = "both"); nvinfer1::IShuffleLayer* AdaHGComputation(nvinfer1::INetworkDefinition* network, std::map weightMap, nvinfer1::ITensor& input, int embed_dim, std::string lname, int num_hyperedges = 16, int num_heads = 8, std::string context = "both"); nvinfer1::ILayer* C3AH(nvinfer1::INetworkDefinition* network, std::map weightMap, nvinfer1::ITensor& input, int c2, std::string lname, float e = 1.0, int num_hyperedges = 8, std::string context = "both"); nvinfer1::ILayer* HyperACE(nvinfer1::INetworkDefinition* network, std::map 
weightMap, std::vector input, int c1, int c2, std::string lname, int n = 1, int num_hyperedges = 8, bool dsc3k = false, bool shortcut = false, float e1 = 0.5, float e2 = 1, std::string context = "both", bool channel_adjust = true); nvinfer1::IElementWiseLayer* FullPad_Tunnel(nvinfer1::INetworkDefinition* network, std::map weightMap, std::vector input, std::string lname); nvinfer1::ILayer* DownsampleConv(nvinfer1::INetworkDefinition* network, std::map weightMap, nvinfer1::ITensor& input, int in_channels, std::string lname, bool channel_adjust = true); void cout_dim(nvinfer1::ITensor& input); ================================================ FILE: yolov13/include/calibrator.h ================================================ #ifndef ENTROPY_CALIBRATOR_H #define ENTROPY_CALIBRATOR_H #include #include #include #include "macros.h" //! \class Int8EntropyCalibrator2 //! //! \brief Implements Entropy calibrator 2. //! CalibrationAlgoType is kENTROPY_CALIBRATION_2. //! class Int8EntropyCalibrator2 : public nvinfer1::IInt8EntropyCalibrator2 { public: Int8EntropyCalibrator2(int batchsize, int input_w, int input_h, const char* img_dir, const char* calib_table_name, const char* input_blob_name, bool read_cache = true); virtual ~Int8EntropyCalibrator2(); int getBatchSize() const TRT_NOEXCEPT override; bool getBatch(void* bindings[], const char* names[], int nbBindings) TRT_NOEXCEPT override; const void* readCalibrationCache(size_t& length) TRT_NOEXCEPT override; void writeCalibrationCache(const void* cache, size_t length) TRT_NOEXCEPT override; private: int batchsize_; int input_w_; int input_h_; int img_idx_; std::string img_dir_; std::vector img_files_; size_t input_count_; std::string calib_table_name_; const char* input_blob_name_; bool read_cache_; void* device_input_; std::vector calib_cache_; }; #endif // ENTROPY_CALIBRATOR_H ================================================ FILE: yolov13/include/config.h ================================================ #define USE_FP16 // 
#define USE_FP32 // #define USE_INT8 const static char* kInputTensorName = "images"; const static char* kOutputTensorName = "output"; const static int kNumClass = 80; const static int kBatchSize = 1; const static int kGpuId = 0; const static int kInputH = 640; const static int kInputW = 640; const static float kNmsThresh = 0.45f; const static float kConfThresh = 0.5f; const static int kMaxInputImageSize = 3000 * 3000; const static int kMaxNumOutputBbox = 1000; //Quantization input image folder path const static char* kInputQuantizationFolder = "./tensorrtx-int8calib-data/coco_calib"; ================================================ FILE: yolov13/include/cuda_utils.h ================================================ #ifndef TRTX_CUDA_UTILS_H_ #define TRTX_CUDA_UTILS_H_ #include #ifndef CUDA_CHECK #define CUDA_CHECK(callstr) \ { \ cudaError_t error_code = callstr; \ if (error_code != cudaSuccess) { \ std::cerr << "CUDA error " << error_code << " at " << __FILE__ << ":" << __LINE__; \ assert(0); \ } \ } #endif // CUDA_CHECK #endif // TRTX_CUDA_UTILS_H_ ================================================ FILE: yolov13/include/logging.h ================================================ /* * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
*/ #ifndef TENSORRT_LOGGING_H #define TENSORRT_LOGGING_H #include #include #include #include #include #include #include #include "NvInferRuntimeCommon.h" #include "macros.h" using Severity = nvinfer1::ILogger::Severity; class LogStreamConsumerBuffer : public std::stringbuf { public: LogStreamConsumerBuffer(std::ostream& stream, const std::string& prefix, bool shouldLog) : mOutput(stream), mPrefix(prefix), mShouldLog(shouldLog) {} LogStreamConsumerBuffer(LogStreamConsumerBuffer&& other) : mOutput(other.mOutput) {} ~LogStreamConsumerBuffer() { // std::streambuf::pbase() gives a pointer to the beginning of the buffered part of the output sequence // std::streambuf::pptr() gives a pointer to the current position of the output sequence // if the pointer to the beginning is not equal to the pointer to the current position, // call putOutput() to log the output to the stream if (pbase() != pptr()) { putOutput(); } } // synchronizes the stream buffer and returns 0 on success // synchronizing the stream buffer consists of inserting the buffer contents into the stream, // resetting the buffer and flushing the stream virtual int sync() { putOutput(); return 0; } void putOutput() { if (mShouldLog) { // prepend timestamp std::time_t timestamp = std::time(nullptr); tm* tm_local = std::localtime(×tamp); std::cout << "["; std::cout << std::setw(2) << std::setfill('0') << 1 + tm_local->tm_mon << "/"; std::cout << std::setw(2) << std::setfill('0') << tm_local->tm_mday << "/"; std::cout << std::setw(4) << std::setfill('0') << 1900 + tm_local->tm_year << "-"; std::cout << std::setw(2) << std::setfill('0') << tm_local->tm_hour << ":"; std::cout << std::setw(2) << std::setfill('0') << tm_local->tm_min << ":"; std::cout << std::setw(2) << std::setfill('0') << tm_local->tm_sec << "] "; // std::stringbuf::str() gets the string contents of the buffer // insert the buffer contents pre-appended by the appropriate prefix into the stream mOutput << mPrefix << str(); // set the buffer to empty 
str(""); // flush the stream mOutput.flush(); } } void setShouldLog(bool shouldLog) { mShouldLog = shouldLog; } private: std::ostream& mOutput; std::string mPrefix; bool mShouldLog; }; //! //! \class LogStreamConsumerBase //! \brief Convenience object used to initialize LogStreamConsumerBuffer before std::ostream in LogStreamConsumer //! class LogStreamConsumerBase { public: LogStreamConsumerBase(std::ostream& stream, const std::string& prefix, bool shouldLog) : mBuffer(stream, prefix, shouldLog) {} protected: LogStreamConsumerBuffer mBuffer; }; //! //! \class LogStreamConsumer //! \brief Convenience object used to facilitate use of C++ stream syntax when logging messages. //! Order of base classes is LogStreamConsumerBase and then std::ostream. //! This is because the LogStreamConsumerBase class is used to initialize the LogStreamConsumerBuffer member field //! in LogStreamConsumer and then the address of the buffer is passed to std::ostream. //! This is necessary to prevent the address of an uninitialized buffer from being passed to std::ostream. //! Please do not change the order of the parent classes. //! class LogStreamConsumer : protected LogStreamConsumerBase, public std::ostream { public: //! \brief Creates a LogStreamConsumer which logs messages with level severity. //! Reportable severity determines if the messages are severe enough to be logged. 
LogStreamConsumer(Severity reportableSeverity, Severity severity) : LogStreamConsumerBase(severityOstream(severity), severityPrefix(severity), severity <= reportableSeverity), std::ostream(&mBuffer) // links the stream buffer with the stream , mShouldLog(severity <= reportableSeverity), mSeverity(severity) {} LogStreamConsumer(LogStreamConsumer&& other) : LogStreamConsumerBase(severityOstream(other.mSeverity), severityPrefix(other.mSeverity), other.mShouldLog), std::ostream(&mBuffer) // links the stream buffer with the stream , mShouldLog(other.mShouldLog), mSeverity(other.mSeverity) {} void setReportableSeverity(Severity reportableSeverity) { mShouldLog = mSeverity <= reportableSeverity; mBuffer.setShouldLog(mShouldLog); } private: static std::ostream& severityOstream(Severity severity) { return severity >= Severity::kINFO ? std::cout : std::cerr; } static std::string severityPrefix(Severity severity) { switch (severity) { case Severity::kINTERNAL_ERROR: return "[F] "; case Severity::kERROR: return "[E] "; case Severity::kWARNING: return "[W] "; case Severity::kINFO: return "[I] "; case Severity::kVERBOSE: return "[V] "; default: assert(0); return ""; } } bool mShouldLog; Severity mSeverity; }; //! \class Logger //! //! \brief Class which manages logging of TensorRT tools and samples //! //! \details This class provides a common interface for TensorRT tools and samples to log information to the console, //! and supports logging two types of messages: //! //! - Debugging messages with an associated severity (info, warning, error, or internal error/fatal) //! - Test pass/fail messages //! //! The advantage of having all samples use this class for logging as opposed to emitting directly to stdout/stderr is //! that the logic for controlling the verbosity and formatting of sample output is centralized in one location. //! //! In the future, this class could be extended to support dumping test results to a file in some standard format //! 
(for example, JUnit XML), and providing additional metadata (e.g. timing the duration of a test run). //! //! TODO: For backwards compatibility with existing samples, this class inherits directly from the nvinfer1::ILogger //! interface, which is problematic since there isn't a clean separation between messages coming from the TensorRT //! library and messages coming from the sample. //! //! In the future (once all samples are updated to use Logger::getTRTLogger() to access the ILogger) we can refactor the //! class to eliminate the inheritance and instead make the nvinfer1::ILogger implementation a member of the Logger //! object. class Logger : public nvinfer1::ILogger { public: Logger(Severity severity = Severity::kWARNING) : mReportableSeverity(severity) {} //! //! \enum TestResult //! \brief Represents the state of a given test //! enum class TestResult { kRUNNING, //!< The test is running kPASSED, //!< The test passed kFAILED, //!< The test failed kWAIVED //!< The test was waived }; //! //! \brief Forward-compatible method for retrieving the nvinfer::ILogger associated with this Logger //! \return The nvinfer1::ILogger associated with this Logger //! //! TODO Once all samples are updated to use this method to register the logger with TensorRT, //! we can eliminate the inheritance of Logger from ILogger //! nvinfer1::ILogger& getTRTLogger() { return *this; } //! //! \brief Implementation of the nvinfer1::ILogger::log() virtual method //! //! Note samples should not be calling this function directly; it will eventually go away once we eliminate the //! inheritance from nvinfer1::ILogger //! void log(Severity severity, const char* msg) TRT_NOEXCEPT override { LogStreamConsumer(mReportableSeverity, severity) << "[TRT] " << std::string(msg) << std::endl; } //! //! \brief Method for controlling the verbosity of logging output //! //! \param severity The logger will only emit messages that have severity of this level or higher. //! 
void setReportableSeverity(Severity severity) { mReportableSeverity = severity; } //! //! \brief Opaque handle that holds logging information for a particular test //! //! This object is an opaque handle to information used by the Logger to print test results. //! The sample must call Logger::defineTest() in order to obtain a TestAtom that can be used //! with Logger::reportTest{Start,End}(). //! class TestAtom { public: TestAtom(TestAtom&&) = default; private: friend class Logger; TestAtom(bool started, const std::string& name, const std::string& cmdline) : mStarted(started), mName(name), mCmdline(cmdline) {} bool mStarted; std::string mName; std::string mCmdline; }; //! //! \brief Define a test for logging //! //! \param[in] name The name of the test. This should be a string starting with //! "TensorRT" and containing dot-separated strings containing //! the characters [A-Za-z0-9_]. //! For example, "TensorRT.sample_googlenet" //! \param[in] cmdline The command line used to reproduce the test // //! \return a TestAtom that can be used in Logger::reportTest{Start,End}(). //! static TestAtom defineTest(const std::string& name, const std::string& cmdline) { return TestAtom(false, name, cmdline); } //! //! \brief A convenience overloaded version of defineTest() that accepts an array of command-line arguments //! as input //! //! \param[in] name The name of the test //! \param[in] argc The number of command-line arguments //! \param[in] argv The array of command-line arguments (given as C strings) //! //! \return a TestAtom that can be used in Logger::reportTest{Start,End}(). static TestAtom defineTest(const std::string& name, int argc, char const* const* argv) { auto cmdline = genCmdlineString(argc, argv); return defineTest(name, cmdline); } //! //! \brief Report that a test has started. //! //! \pre reportTestStart() has not been called yet for the given testAtom //! //! \param[in] testAtom The handle to the test that has started //! 
static void reportTestStart(TestAtom& testAtom) { reportTestResult(testAtom, TestResult::kRUNNING); assert(!testAtom.mStarted); testAtom.mStarted = true; } //! //! \brief Report that a test has ended. //! //! \pre reportTestStart() has been called for the given testAtom //! //! \param[in] testAtom The handle to the test that has ended //! \param[in] result The result of the test. Should be one of TestResult::kPASSED, //! TestResult::kFAILED, TestResult::kWAIVED //! static void reportTestEnd(const TestAtom& testAtom, TestResult result) { assert(result != TestResult::kRUNNING); assert(testAtom.mStarted); reportTestResult(testAtom, result); } static int reportPass(const TestAtom& testAtom) { reportTestEnd(testAtom, TestResult::kPASSED); return EXIT_SUCCESS; } static int reportFail(const TestAtom& testAtom) { reportTestEnd(testAtom, TestResult::kFAILED); return EXIT_FAILURE; } static int reportWaive(const TestAtom& testAtom) { reportTestEnd(testAtom, TestResult::kWAIVED); return EXIT_SUCCESS; } static int reportTest(const TestAtom& testAtom, bool pass) { return pass ? reportPass(testAtom) : reportFail(testAtom); } Severity getReportableSeverity() const { return mReportableSeverity; } private: //! //! \brief returns an appropriate string for prefixing a log message with the given severity //! static const char* severityPrefix(Severity severity) { switch (severity) { case Severity::kINTERNAL_ERROR: return "[F] "; case Severity::kERROR: return "[E] "; case Severity::kWARNING: return "[W] "; case Severity::kINFO: return "[I] "; case Severity::kVERBOSE: return "[V] "; default: assert(0); return ""; } } //! //! \brief returns an appropriate string for prefixing a test result message with the given result //! 
static const char* testResultString(TestResult result) { switch (result) { case TestResult::kRUNNING: return "RUNNING"; case TestResult::kPASSED: return "PASSED"; case TestResult::kFAILED: return "FAILED"; case TestResult::kWAIVED: return "WAIVED"; default: assert(0); return ""; } } //! //! \brief returns an appropriate output stream (cout or cerr) to use with the given severity //! static std::ostream& severityOstream(Severity severity) { return severity >= Severity::kINFO ? std::cout : std::cerr; } //! //! \brief method that implements logging test results //! static void reportTestResult(const TestAtom& testAtom, TestResult result) { severityOstream(Severity::kINFO) << "&&&& " << testResultString(result) << " " << testAtom.mName << " # " << testAtom.mCmdline << std::endl; } //! //! \brief generate a command line string from the given (argc, argv) values //! static std::string genCmdlineString(int argc, char const* const* argv) { std::stringstream ss; for (int i = 0; i < argc; i++) { if (i > 0) ss << " "; ss << argv[i]; } return ss.str(); } Severity mReportableSeverity; }; namespace { //! //! \brief produces a LogStreamConsumer object that can be used to log messages of severity kVERBOSE //! //! Example usage: //! //! LOG_VERBOSE(logger) << "hello world" << std::endl; //! inline LogStreamConsumer LOG_VERBOSE(const Logger& logger) { return LogStreamConsumer(logger.getReportableSeverity(), Severity::kVERBOSE); } //! //! \brief produces a LogStreamConsumer object that can be used to log messages of severity kINFO //! //! Example usage: //! //! LOG_INFO(logger) << "hello world" << std::endl; //! inline LogStreamConsumer LOG_INFO(const Logger& logger) { return LogStreamConsumer(logger.getReportableSeverity(), Severity::kINFO); } //! //! \brief produces a LogStreamConsumer object that can be used to log messages of severity kWARNING //! //! Example usage: //! //! LOG_WARN(logger) << "hello world" << std::endl; //! 
inline LogStreamConsumer LOG_WARN(const Logger& logger) { return LogStreamConsumer(logger.getReportableSeverity(), Severity::kWARNING); } //! //! \brief produces a LogStreamConsumer object that can be used to log messages of severity kERROR //! //! Example usage: //! //! LOG_ERROR(logger) << "hello world" << std::endl; //! inline LogStreamConsumer LOG_ERROR(const Logger& logger) { return LogStreamConsumer(logger.getReportableSeverity(), Severity::kERROR); } //! //! \brief produces a LogStreamConsumer object that can be used to log messages of severity kINTERNAL_ERROR // ("fatal" severity) //! //! Example usage: //! //! LOG_FATAL(logger) << "hello world" << std::endl; //! inline LogStreamConsumer LOG_FATAL(const Logger& logger) { return LogStreamConsumer(logger.getReportableSeverity(), Severity::kINTERNAL_ERROR); } } // anonymous namespace #endif // TENSORRT_LOGGING_H ================================================ FILE: yolov13/include/macros.h ================================================ #ifndef __MACROS_H #define __MACROS_H #include "NvInfer.h" #ifdef API_EXPORTS #if defined(_MSC_VER) #define API __declspec(dllexport) #else #define API __attribute__((visibility("default"))) #endif #else #if defined(_MSC_VER) #define API __declspec(dllimport) #else #define API #endif #endif // API_EXPORTS #if NV_TENSORRT_MAJOR >= 8 #define TRT_NOEXCEPT noexcept #define TRT_CONST_ENQUEUE const #else #define TRT_NOEXCEPT #define TRT_CONST_ENQUEUE #endif #endif // __MACROS_H ================================================ FILE: yolov13/include/model.h ================================================ #pragma once #include #include #include "NvInfer.h" nvinfer1::IHostMemory* buildEngineYolov13Det(nvinfer1::IBuilder* builder, nvinfer1::IBuilderConfig* config, nvinfer1::DataType dt, const std::string& wts_path, float& gd, float& gw, int& max_channels, std::string& type); nvinfer1::IHostMemory* buildEngineYolov13Det_debug(nvinfer1::IBuilder* builder, nvinfer1::IBuilderConfig* 
config, nvinfer1::DataType dt, const std::string& wts_path, float& gd, float& gw, int& max_channels, std::string& type); ================================================ FILE: yolov13/include/postprocess.h ================================================ #pragma once #include #include "NvInfer.h" #include "types.h" // Preprocessing functions cv::Rect get_rect(cv::Mat& img, float bbox[4]); // Processing functions void batch_process(std::vector>& res_batch, const float* decode_ptr_host, int batch_size, int bbox_element, const std::vector& img_batch); void process_decode_ptr_host(std::vector& res, const float* decode_ptr_host, int bbox_element, cv::Mat& img, int count); // NMS functions void nms(std::vector& res, float* output, float conf_thresh, float nms_thresh = 0.5); void batch_nms(std::vector>& batch_res, float* output, int batch_size, int output_size, float conf_thresh, float nms_thresh = 0.5); // CUDA-related functions void cuda_decode(float* predict, int num_bboxes, float confidence_threshold, float* parray, int max_objects, cudaStream_t stream); void cuda_nms(float* parray, float nms_threshold, int max_objects, cudaStream_t stream); // Drawing functions void draw_bbox(std::vector& img_batch, std::vector>& res_batch); ================================================ FILE: yolov13/include/preprocess.h ================================================ #pragma once #include #include #include "NvInfer.h" #include "types.h" void cuda_preprocess_init(int max_image_size); void cuda_preprocess_destroy(); void cuda_preprocess(uint8_t* src, int src_width, int src_height, float* dst, int dst_width, int dst_height, cudaStream_t stream); void cuda_batch_preprocess(std::vector& img_batch, float* dst, int dst_width, int dst_height, cudaStream_t stream); ================================================ FILE: yolov13/include/types.h ================================================ #pragma once #include "config.h" struct alignas(float) Detection { //center_x center_y w h float 
bbox[4]; float conf; // bbox_conf * cls_conf float class_id; }; struct AffineMatrix { float value[3]; }; const int bbox_element = sizeof(AffineMatrix) / sizeof(float) + 1; // left, top, right, bottom, confidence, class, keepflag ================================================ FILE: yolov13/include/utils.h ================================================ #pragma once #include #include #include static inline cv::Mat preprocess_img(cv::Mat& img, int input_w, int input_h) { int w, h, x, y; float r_w = input_w / (img.cols * 1.0); float r_h = input_h / (img.rows * 1.0); if (r_h > r_w) { w = input_w; h = r_w * img.rows; x = 0; y = (input_h - h) / 2; } else { w = r_h * img.cols; h = input_h; x = (input_w - w) / 2; y = 0; } cv::Mat re(h, w, CV_8UC3); cv::resize(img, re, re.size(), 0, 0, cv::INTER_LINEAR); cv::Mat out(input_h, input_w, CV_8UC3, cv::Scalar(128, 128, 128)); re.copyTo(out(cv::Rect(x, y, re.cols, re.rows))); return out; } static inline int read_files_in_dir(const char* p_dir_name, std::vector& file_names) { DIR* p_dir = opendir(p_dir_name); if (p_dir == nullptr) { return -1; } struct dirent* p_file = nullptr; while ((p_file = readdir(p_dir)) != nullptr) { if (strcmp(p_file->d_name, ".") != 0 && strcmp(p_file->d_name, "..") != 0) { //std::string cur_file_name(p_dir_name); //cur_file_name += "/"; //cur_file_name += p_file->d_name; std::string cur_file_name(p_file->d_name); // std::cout << "Found file: " << cur_file_name << std::endl; file_names.push_back(cur_file_name); } } closedir(p_dir); return 0; } // Function to trim leading and trailing whitespace from a string static inline std::string trim_leading_whitespace(const std::string& str) { size_t first = str.find_first_not_of(' '); if (std::string::npos == first) { return str; } size_t last = str.find_last_not_of(' '); return str.substr(first, (last - first + 1)); } // Src: https://stackoverflow.com/questions/16605967 static inline std::string to_string_with_precision(const float a_value, const int n = 2) { 
std::ostringstream out; out.precision(n); out << std::fixed << a_value; return out.str(); } static inline int read_labels(const std::string labels_filename, std::unordered_map& labels_map) { std::ifstream file(labels_filename); // Read each line of the file std::string line; int index = 0; while (std::getline(file, line)) { // Strip the line of any leading or trailing whitespace line = trim_leading_whitespace(line); // Add the stripped line to the labels_map, using the loop index as the key labels_map[index] = line; index++; } // Close the file file.close(); return 0; } ================================================ FILE: yolov13/plugin/geluKernel.cu ================================================ /* * SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. * SPDX-License-Identifier: Apache-2.0 * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
*/ #include #if CUDA_VERSION >= 10010 #include #include #include "NvInfer.h" #include "common/bertCommon.h" #include "common/common.cuh" #include "common/serialize.hpp" #include "geluPlugin.h" using namespace nvinfer1; namespace nvinfer1 { namespace plugin { namespace bert { // constants for approximating the normal cdf constexpr float A = 0.5f; constexpr float B = 0.7978845608028654f; // sqrt(2.0/M_PI) constexpr float C = 0.035677408136300125f; // 0.044715 * sqrt(2.0/M_PI) template __global__ void geluKernel(const T a, const T b, const T c, int n, const T* input, T* output) { const int idx = blockIdx.x * TPB + threadIdx.x; if (idx < n) { const T in = input[idx]; const T cdf = a + a * tanh(in * (c * in * in + b)); output[idx] = in * cdf; } } int computeGelu(cudaStream_t stream, int n, const float* input, float* output) { constexpr int blockSize = 256; const int gridSize = (n + blockSize - 1) / blockSize; geluKernel<<>>(A, B, C, n, input, output); PLUGIN_CHECK(cudaPeekAtLastError()); return 0; } int computeGelu(cudaStream_t stream, int n, const half* input, half* output) { constexpr int blockSize = 256; if (0 == (n & 1)) { const int n2 = n / 2; const int gridSize = (n2 + blockSize - 1) / blockSize; const half2 A2 = __floats2half2_rn(A, A); const half2 B2 = __floats2half2_rn(B, B); const half2 C2 = __floats2half2_rn(C, C); const half2* input2 = reinterpret_cast(input); half2* output2 = reinterpret_cast(output); geluKernel<<>>(A2, B2, C2, n2, input2, output2); } else { const int gridSize = (n + blockSize - 1) / blockSize; geluKernel<<>>(A, B, C, n, input, output); } PLUGIN_CHECK(cudaPeekAtLastError()); return 0; } template __global__ void geluBiasKernel(const T a, const T b, const T c, T* output, const T* input, const T* bias, const int ld) { const int offset = blockIdx.x * ld; for (int it = threadIdx.x; it < ld; it += TPB) { const int idx = it + offset; const T in = input[idx] + bias[it]; const T cdf = a + a * tanh(in * (c * in * in + b)); output[idx] = in * cdf; } } 
// Launches the float bias+GELU kernel with one block per run of `ld` elements
// (`cols` blocks in total). Returns the result of cudaPeekAtLastError().
// NOTE(review): the template arguments and launch configuration were missing
// from the reviewed copy. `cols` and `stream` are taken from the otherwise
// unused parameters; the block size of 256 is a reconstruction - confirm.
int computeGeluBias(float* output, const float* input, const float* bias, const int ld, const int cols,
                    cudaStream_t stream) {
    geluBiasKernel<float, 256><<<cols, 256, 0, stream>>>(A, B, C, output, input, bias, ld);
    return cudaPeekAtLastError();
}

// Half-precision variant. An even `ld` lets the buffers be reinterpreted as
// half2 (two values per thread step); an odd `ld` uses the scalar half kernel.
// Returns the result of cudaPeekAtLastError().
int computeGeluBias(half* output, const half* input, const half* bias, const int ld, const int cols,
                    cudaStream_t stream) {
    if (ld & 1) {
        geluBiasKernel<half, 256><<<cols, 256, 0, stream>>>(A, B, C, output, input, bias, ld);
    } else {
        const half2 A2 = __floats2half2_rn(A, A);
        const half2 B2 = __floats2half2_rn(B, B);
        const half2 C2 = __floats2half2_rn(C, C);
        const int ld2 = ld / 2;
        const half2* input2 = reinterpret_cast<const half2*>(input);
        const half2* bias2 = reinterpret_cast<const half2*>(bias);
        half2* output2 = reinterpret_cast<half2*>(output);
        geluBiasKernel<half2, 256><<<cols, 256, 0, stream>>>(A2, B2, C2, output2, input2, bias2, ld2);
    }
    return cudaPeekAtLastError();
}

}  // namespace bert
}  // namespace plugin
}  // namespace nvinfer1

#endif  // CUDA_VERSION >= 10010

================================================
FILE: yolov13/plugin/yololayer.cu
================================================
// NOTE(review): the four system-header names were missing from the reviewed
// copy; the ones below cover assert, expf and std::vector as used in this
// file - confirm against the upstream file.
#include <assert.h>
#include <math.h>
#include <iostream>
#include <vector>
#include "cuda_utils.h"
#include "types.h"
#include "yololayer.h"

namespace Tn {

// Copies `val` into the byte buffer at `buffer` and advances `buffer` past it.
template <typename T>
void write(char*& buffer, const T& val) {
    *reinterpret_cast<T*>(buffer) = val;
    buffer += sizeof(T);
}

// Reads a T from the byte buffer at `buffer` into `val` and advances `buffer`.
template <typename T>
void read(const char*& buffer, T& val) {
    val = *reinterpret_cast<const T*>(buffer);
    buffer += sizeof(T);
}

}  // namespace Tn

// Logistic function 1 / (1 + e^-x).
__device__ float sigmoid(float x) {
    return 1.0f / (1.0f + exp(-x));
}

namespace nvinfer1 {

// Builds a plugin from explicit parameters. `strides` is copied, so the caller
// keeps ownership of its array.
YoloLayerPlugin::YoloLayerPlugin(int classCount, int netWidth, int netHeight, int maxOut, const int* strides,
                                 int stridesLength) {
    mClassCount = classCount;
    mYoloV13NetWidth = netWidth;
    mYoloV13netHeight = netHeight;
    mMaxOutObject = maxOut;
    mStridesLength = stridesLength;
    mStrides = new int[stridesLength];
    memcpy(mStrides, strides, stridesLength * sizeof(int));
}

YoloLayerPlugin::~YoloLayerPlugin() {
    if (mStrides != nullptr) {
        delete[] mStrides;
        mStrides = nullptr;
    }
}

// Rebuilds a plugin from a buffer produced by serialize(). The field order
// here must stay in step with serialize().
YoloLayerPlugin::YoloLayerPlugin(const void* data, size_t length) {
    using namespace Tn;
    const char *d = reinterpret_cast<const char*>(data), *a = d;
    read(d, mClassCount);
    read(d, mThreadCount);
    read(d, mYoloV13NetWidth);
    read(d, mYoloV13netHeight);
    read(d, mMaxOutObject);
    read(d, mStridesLength);
    mStrides = new int[mStridesLength];
    for (int i = 0; i < mStridesLength; ++i) {
        read(d, mStrides[i]);
    }
    assert(d == a + length);
}

// Writes the plugin state into `buffer`, which must hold at least
// getSerializationSize() bytes.
void YoloLayerPlugin::serialize(void* buffer) const TRT_NOEXCEPT {
    using namespace Tn;
    char *d = static_cast<char*>(buffer), *a = d;
    write(d, mClassCount);
    write(d, mThreadCount);
    write(d, mYoloV13NetWidth);
    write(d, mYoloV13netHeight);
    write(d, mMaxOutObject);
    write(d, mStridesLength);
    for (int i = 0; i < mStridesLength; ++i) {
        write(d, mStrides[i]);
    }
    assert(d == a + getSerializationSize());
}

size_t YoloLayerPlugin::getSerializationSize() const TRT_NOEXCEPT {
    return sizeof(mClassCount) + sizeof(mThreadCount) + sizeof(mYoloV13netHeight) + sizeof(mYoloV13NetWidth) +
           sizeof(mMaxOutObject) + sizeof(mStridesLength) + sizeof(int) * mStridesLength;
}

int YoloLayerPlugin::initialize() TRT_NOEXCEPT {
    return 0;
}

// Output is a flat float vector per image: one counter followed by room for
// mMaxOutObject Detection records.
nvinfer1::Dims YoloLayerPlugin::getOutputDimensions(int index, const nvinfer1::Dims* inputs,
                                                    int nbInputDims) TRT_NOEXCEPT {
    int total_size = mMaxOutObject * sizeof(Detection) / sizeof(float);
    return nvinfer1::Dims3(total_size + 1, 1, 1);
}

// Stores the pointer only; the string is not copied, so the caller must keep
// it alive for as long as the plugin may return it.
void YoloLayerPlugin::setPluginNamespace(const char* pluginNamespace) TRT_NOEXCEPT {
    mPluginNamespace = pluginNamespace;
}

const char* YoloLayerPlugin::getPluginNamespace() const TRT_NOEXCEPT {
    return mPluginNamespace;
}

nvinfer1::DataType YoloLayerPlugin::getOutputDataType(int index, const nvinfer1::DataType* inputTypes,
                                                      int nbInputs) const TRT_NOEXCEPT {
    return nvinfer1::DataType::kFLOAT;
}

bool YoloLayerPlugin::isOutputBroadcastAcrossBatch(int outputIndex, const bool* inputIsBroadcasted,
                                                   int nbInputs) const TRT_NOEXCEPT {
    return false;
}

bool YoloLayerPlugin::canBroadcastInputAcrossBatch(int inputIndex) const TRT_NOEXCEPT {
    return false;
}

void YoloLayerPlugin::configurePlugin(nvinfer1::PluginTensorDesc const* in, int nbInput,
                                      nvinfer1::PluginTensorDesc const* out, int nbOutput) TRT_NOEXCEPT {}

void YoloLayerPlugin::attachToContext(cudnnContext* cudnnContext, cublasContext* cublasContext,
                                      IGpuAllocator* gpuAllocator) TRT_NOEXCEPT {}

void YoloLayerPlugin::detachFromContext() TRT_NOEXCEPT {}

const char* YoloLayerPlugin::getPluginType() const TRT_NOEXCEPT {
    return "YoloLayer_TRT";
}

const char* YoloLayerPlugin::getPluginVersion() const TRT_NOEXCEPT {
    return "1";
}

void YoloLayerPlugin::destroy() TRT_NOEXCEPT {
    delete this;
}

// Returns a new plugin built from the same construction parameters and
// carrying the same namespace pointer. The caller owns the result.
nvinfer1::IPluginV2IOExt* YoloLayerPlugin::clone() const TRT_NOEXCEPT {
    YoloLayerPlugin* p = new YoloLayerPlugin(mClassCount, mYoloV13NetWidth, mYoloV13netHeight, mMaxOutObject,
                                             mStrides, mStridesLength);
    p->setPluginNamespace(mPluginNamespace);
    return p;
}

// Runs the decode kernels for one batch. The qualifiers on `inputs`/`outputs`
// are written exactly as in the in-class declaration; the previous spelling
// put TRT_CONST_ENQUEUE on the other parameter and therefore only matched the
// declaration when that macro expands to `const`.
int YoloLayerPlugin::enqueue(int batchSize, const void* const* inputs, void* TRT_CONST_ENQUEUE* outputs,
                             void* workspace, cudaStream_t stream) TRT_NOEXCEPT {
    forwardGpu((const float* const*)inputs, (float*)outputs[0], stream, mYoloV13netHeight, mYoloV13NetWidth,
               batchSize);
    return 0;
}

// Logistic function 1 / (1 + e^-data), single precision.
__device__ float Logist(float data) {
    return 1.0f / (1.0f + expf(-data));
}

// Decodes one detection-head tensor. One thread handles one grid cell of one
// image (numElements = grid_h * grid_w * batch).
//
// Input layout per image: (4 + classes) planes of grid_h * grid_w values; the
// first four planes are box distances, the rest are class logits.
// Output layout per image: `outputElem` floats - a counter in slot 0 followed
// by Detection records.
//
// NOTE(review): the counter is incremented before the capacity test, so it can
// end up larger than `maxoutobject`; readers of the output presumably need to
// clamp it - verify against the host-side post-processing.
__global__ void CalDetection(const float* input, float* output, int numElements, int maxoutobject, const int grid_h,
                             int grid_w, const int stride, int classes, int outputElem) {
    int idx = threadIdx.x + blockDim.x * blockIdx.x;
    if (idx >= numElements)
        return;

    int total_grid = grid_h * grid_w;
    int info_len = 4 + classes;
    int batchIdx = idx / total_grid;
    int elemIdx = idx % total_grid;
    const float* curInput = input + batchIdx * total_grid * info_len;
    int outputIdx = batchIdx * outputElem;

    // Best class for this cell: arg-max over the sigmoid of the class logits.
    int class_id = 0;
    float max_cls_prob = 0.0;
    for (int i = 4; i < 4 + classes; i++) {
        float p = Logist(curInput[elemIdx + i * total_grid]);
        if (p > max_cls_prob) {
            max_cls_prob = p;
            class_id = i - 4;
        }
    }

    // Cells below this fixed score are dropped before they claim an output slot.
    if (max_cls_prob < 0.1)
        return;

    // Claim the next free record of this image.
    int count = (int)atomicAdd(output + outputIdx, 1);
    if (count >= maxoutobject)
        return;

    char* data = (char*)(output + outputIdx) + sizeof(float) + count * sizeof(Detection);
    Detection* det = (Detection*)(data);

    int row = elemIdx / grid_w;
    int col = elemIdx % grid_w;

    det->conf = max_cls_prob;
    det->class_id = class_id;
    // Box edges: cell centre minus/plus the predicted distances, scaled by the
    // stride of this level.
    det->bbox[0] = (col + 0.5f - curInput[elemIdx + 0 * total_grid]) * stride;
    det->bbox[1] = (row + 0.5f - curInput[elemIdx + 1 * total_grid]) * stride;
    det->bbox[2] = (col + 0.5f + curInput[elemIdx + 2 * total_grid]) * stride;
    det->bbox[3] = (row + 0.5f + curInput[elemIdx + 3 * total_grid]) * stride;
}

// Clears the per-image counters and launches CalDetection once per stride
// level on `stream`.
//
// Changes against the previous version:
//  * the block size is a per-launch local. Before, a level with fewer elements
//    than mThreadCount overwrote the member, which shrank the block size for
//    every later level and call (and was then written out by serialize());
//  * a level with no elements is skipped instead of dividing by zero;
//  * the extra unchecked memset of output[0] is gone - the checked loop below
//    already clears it;
//  * the grid sizes are computed in place instead of via a heap array.
void YoloLayerPlugin::forwardGpu(const float* const* inputs, float* output, cudaStream_t stream,
                                 int mYoloV13netHeight, int mYoloV13NetWidth, int batchSize) {
    int outputElem = 1 + mMaxOutObject * sizeof(Detection) / sizeof(float);

    for (int idx = 0; idx < batchSize; ++idx) {
        CUDA_CHECK(cudaMemsetAsync(output + idx * outputElem, 0, sizeof(float), stream));
    }

    for (int i = 0; i < mStridesLength; ++i) {
        const int stride = mStrides[i];
        const int grid_h = mYoloV13netHeight / stride;
        const int grid_w = mYoloV13NetWidth / stride;
        const int numElem = grid_h * grid_w * batchSize;

        // Never launch more threads per block than there are elements.
        const int threads = numElem < mThreadCount ? numElem : mThreadCount;
        if (threads <= 0) {
            continue;
        }
        const int blocks = (numElem + threads - 1) / threads;

        CalDetection<<<blocks, threads, 0, stream>>>(inputs[i], output, numElem, mMaxOutObject, grid_h, grid_w,
                                                     stride, mClassCount, outputElem);
    }
}

PluginFieldCollection YoloPluginCreator::mFC{};
std::vector<PluginField> YoloPluginCreator::mPluginAttributes;

YoloPluginCreator::YoloPluginCreator() {
    mPluginAttributes.clear();
    mFC.nbFields = mPluginAttributes.size();
    mFC.fields = mPluginAttributes.data();
}

const char* YoloPluginCreator::getPluginName() const TRT_NOEXCEPT {
    return "YoloLayer_TRT";
}

const char* YoloPluginCreator::getPluginVersion() const TRT_NOEXCEPT {
    return "1";
}

const PluginFieldCollection* YoloPluginCreator::getFieldNames() TRT_NOEXCEPT {
    return &mFC;
}

// Creates a plugin from a single int field named "combinedInfo" laid out as
//   [class_num, input_w, input_h, max_out, stride_0, stride_1, ...].
IPluginV2IOExt* YoloPluginCreator::createPlugin(const char* name, const PluginFieldCollection* fc) TRT_NOEXCEPT {
    assert(fc->nbFields == 1);
    assert(strcmp(fc->fields[0].name, "combinedInfo") == 0);
    const int* combinedInfo = static_cast<const int*>(fc->fields[0].data);

    // Clean packed layout: class_num, input_w, input_h, max_out
    int class_count = combinedInfo[0];
    int input_w = combinedInfo[1];
    int input_h = combinedInfo[2];
    int max_output_object_count = combinedInfo[3];

    // Everything after the four header values is the stride list.
    int stride_offset = 4;
    const int* px_arry = combinedInfo + stride_offset;
    int px_arry_length = fc->fields[0].length - stride_offset;

    YoloLayerPlugin* obj =
            new YoloLayerPlugin(class_count, input_w, input_h, max_output_object_count, px_arry, px_arry_length);
    obj->setPluginNamespace(mNamespace.c_str());
    return obj;
}

// Recreates a plugin from the bytes written by YoloLayerPlugin::serialize().
IPluginV2IOExt* YoloPluginCreator::deserializePlugin(const char* name, const void* serialData,
                                                     size_t serialLength) TRT_NOEXCEPT {
    YoloLayerPlugin* obj = new YoloLayerPlugin(serialData, serialLength);
    obj->setPluginNamespace(mNamespace.c_str());
    return obj;
}

}  // namespace nvinfer1

================================================
FILE: yolov13/plugin/yololayer.h
================================================
#pragma once

#include <string>
#include <vector>
#include "NvInfer.h"
#include "macros.h"

namespace nvinfer1 {

// TensorRT plugin that turns the raw detection-head tensors into a flat list
// of Detection records (one counter float followed by the records, per image).
class API YoloLayerPlugin : public IPluginV2IOExt {
   public:
    YoloLayerPlugin(int classCount, int netWidth, int netHeight, int maxOut, const int* strides, int stridesLength);
    YoloLayerPlugin(const void* data, size_t length);
    ~YoloLayerPlugin();

    int getNbOutputs() const TRT_NOEXCEPT override { return 1; }

    nvinfer1::Dims getOutputDimensions(int index, const nvinfer1::Dims* inputs, int nbInputDims) TRT_NOEXCEPT override;

    int initialize() TRT_NOEXCEPT override;

    virtual void terminate() TRT_NOEXCEPT override {}

    virtual size_t getWorkspaceSize(int maxBatchSize) const TRT_NOEXCEPT override { return 0; }

    virtual int enqueue(int batchSize, const void* const* inputs, void* TRT_CONST_ENQUEUE* outputs, void* workspace,
                        cudaStream_t stream) TRT_NOEXCEPT override;

    virtual size_t getSerializationSize() const TRT_NOEXCEPT override;

    virtual void serialize(void* buffer) const TRT_NOEXCEPT override;

    // Only linear-layout float tensors are accepted, for inputs and outputs alike.
    bool supportsFormatCombination(int pos, const PluginTensorDesc* inOut, int nbInputs,
                                   int nbOutputs) const TRT_NOEXCEPT override {
        return inOut[pos].format == TensorFormat::kLINEAR && inOut[pos].type == DataType::kFLOAT;
    }

    const char* getPluginType() const TRT_NOEXCEPT override;

    const char* getPluginVersion() const TRT_NOEXCEPT override;

    void destroy() TRT_NOEXCEPT override;

    IPluginV2IOExt* clone() const TRT_NOEXCEPT override;

    void setPluginNamespace(const char* pluginNamespace) TRT_NOEXCEPT override;

    const char* getPluginNamespace() const TRT_NOEXCEPT override;

    nvinfer1::DataType getOutputDataType(int32_t index, nvinfer1::DataType const* inputTypes,
                                         int32_t nbInputs) const TRT_NOEXCEPT override;

    bool isOutputBroadcastAcrossBatch(int outputIndex, const bool* inputIsBroadcasted,
                                      int nbInputs) const TRT_NOEXCEPT override;

    bool canBroadcastInputAcrossBatch(int inputIndex) const TRT_NOEXCEPT override;

    void attachToContext(cudnnContext* cudnnContext, cublasContext* cublasContext,
                         IGpuAllocator* gpuAllocator) TRT_NOEXCEPT override;

    void configurePlugin(PluginTensorDesc const* in, int32_t nbInput, PluginTensorDesc const* out,
                         int32_t nbOutput) TRT_NOEXCEPT override;

    void detachFromContext() TRT_NOEXCEPT override;

   private:
    void forwardGpu(const float* const* inputs, float* output, cudaStream_t stream, int mYoloV13netHeight,
                    int mYoloV13NetWidth, int batchSize);

    int mThreadCount = 256;  // upper bound on threads per block for the decode kernel
    // Not owned. Starts as an empty string so that clone() and
    // getPluginNamespace() never read an indeterminate pointer when no
    // namespace has been set yet.
    const char* mPluginNamespace = "";
    int mClassCount;  // Removed non-detection members
    int mYoloV13netHeight;
    int mYoloV13NetWidth;
    int mMaxOutObject;
    int* mStrides;  // owned array of mStridesLength entries
    int mStridesLength;
};

// Factory registered with TensorRT under the name "YoloLayer_TRT", version "1".
class API YoloPluginCreator : public IPluginCreator {
   public:
    YoloPluginCreator();
    ~YoloPluginCreator() override = default;

    const char* getPluginName() const TRT_NOEXCEPT override;

    const char* getPluginVersion() const TRT_NOEXCEPT override;

    const nvinfer1::PluginFieldCollection* getFieldNames() TRT_NOEXCEPT override;

    nvinfer1::IPluginV2IOExt* createPlugin(const char* name,
                                           const nvinfer1::PluginFieldCollection* fc) TRT_NOEXCEPT override;

    nvinfer1::IPluginV2IOExt* deserializePlugin(const char* name, const void* serialData,
                                                size_t serialLength) TRT_NOEXCEPT override;

    void setPluginNamespace(const char* libNamespace) TRT_NOEXCEPT override { mNamespace = libNamespace; }

    const char* getPluginNamespace() const TRT_NOEXCEPT override { return mNamespace.c_str(); }

   private:
    std::string mNamespace;
    static PluginFieldCollection mFC;
    static std::vector<PluginField> mPluginAttributes;
};

REGISTER_TENSORRT_PLUGIN(YoloPluginCreator);

}  // namespace nvinfer1

================================================
FILE: yolov13/readme.md
================================================
## Introduction

Yolov13 model supports TensorRT-8.

Detection training code [link](https://github.com/iMoonLab/yolov13/releases/tag/yolov13)

## Environment

* cuda 11.6
* cudnn 8.9.1.23
* tensorrt 8.6.1.6
* opencv 4.8.0
* ultralytics 8.3.63

## Support

* [x] YOLOV13-det support FP32/FP16/INT8 and C++ API

## Config

* Choose the YOLOV13 sub-model n/s/l/x from command line arguments.
* Other configs please check [include/config.h](include/config.h)

## Build and Run (Detection)

1.
Generate a .wts file from the PyTorch .pt checkpoint, or download a .wts file from the model zoo.

```shell
# Download ultralytics
wget https://github.com/iMoonLab/yolov13/releases/tag/yolov13 -O ultralytics-8.3.63.zip
# Unzip ultralytics
unzip ultralytics-8.3.63.zip
cd ultralytics-8.3.63
# Train your own model or download a pretrained one. To use another variant, replace 'yolov13n.pt' with 'yolov13s.pt', 'yolov13l.pt', or 'yolov13x.pt'
# Generate .wts
cp [PATH-TO-TENSORRTX]/yolov13/gen_wts.py .
python3 gen_wts.py -w yolov13n.pt -o yolov13n.wts
# A file 'yolov13n.wts' will be generated.
```

2. Build tensorrtx/yolov13 and run

```shell
cd [PATH-TO-TENSORRTX]/yolov13
mkdir build
cd build
cmake ..
make
```

### Detection

```shell
cp [PATH-TO-ultralytics]/yolov13n.wts .
# Build and serialize TensorRT engine
./yolov13-det -s yolov13n.wts yolov13n-det.engine [n/s/l/x]
# Run inference
./yolov13-det -d yolov13n-det.engine ../images [c/g]
# Results are saved in the build directory
```

## INT8 Quantization

1. Prepare calibration images. You can randomly select about 1000 images from your training set. For COCO, you can also download the calibration images `coco_calib` from [GoogleDrive](https://drive.google.com/drive/folders/1s7jE9DtOngZMzJC1uL307J2MiaGwdRSI?usp=sharing) or [BaiduPan](https://pan.baidu.com/s/1GOm_-JobpyLMAqZWCDUhKg) pwd: a9wh
2. Unzip it in [PATH-TO-TENSORRTX]/yolov13/build
3. Set the macro `USE_INT8` in include/config.h and make again
4. Serialize the model and test

... build successfully in my 4060 ...
## More Information See the readme in [home page.](https://github.com/wang-xinyu/tensorrtx) ================================================ FILE: yolov13/src/block.cpp ================================================ #include "block.h" #include #include #include #include #include "config.h" #include "model.h" #include "yololayer.h" std::map loadWeights(const std::string file) { std::cout << "Loading weights: " << file << std::endl; std::map WeightMap; std::ifstream input(file); assert(input.is_open() && "Unable to load weight file. please check if the .wts file path is right!!!!!!"); int32_t count; input >> count; assert(count > 0 && "Invalid weight map file."); while (count--) { nvinfer1::Weights wt{nvinfer1::DataType::kFLOAT, nullptr, 0}; uint32_t size; std::string name; input >> name >> std::dec >> size; wt.type = nvinfer1::DataType::kFLOAT; uint32_t* val = reinterpret_cast(malloc(sizeof(val) * size)); for (uint32_t x = 0, y = size; x < y; x++) { input >> std::hex >> val[x]; } wt.values = val; wt.count = size; WeightMap[name] = wt; // std::cout << "===========name: " << name << std::endl; } return WeightMap; } nvinfer1::IScaleLayer* addBatchNorm2d(nvinfer1::INetworkDefinition* network, std::map weightMap, nvinfer1::ITensor& input, std::string lname, float eps) { cout << "BatchNorm's name : " << lname << endl; float* gamma = (float*)weightMap[lname + ".weight"].values; float* beta = (float*)weightMap[lname + ".bias"].values; float* mean = (float*)weightMap[lname + ".running_mean"].values; float* var = (float*)weightMap[lname + ".running_var"].values; int len = weightMap[lname + ".running_var"].count; float* scval = reinterpret_cast(malloc(sizeof(float) * len)); for (int i = 0; i < len; i++) { scval[i] = gamma[i] / sqrt(var[i] + eps); } nvinfer1::Weights scale{nvinfer1::DataType::kFLOAT, scval, len}; float* shval = reinterpret_cast(malloc(sizeof(float) * len)); for (int i = 0; i < len; i++) { shval[i] = beta[i] - mean[i] * gamma[i] / sqrt(var[i] + eps); } 
nvinfer1::Weights shift{nvinfer1::DataType::kFLOAT, shval, len}; float* pval = reinterpret_cast(malloc(sizeof(float) * len)); for (int i = 0; i < len; i++) { pval[i] = 1.0; } nvinfer1::Weights power{nvinfer1::DataType::kFLOAT, pval, len}; weightMap[lname + ".scale"] = scale; weightMap[lname + ".shift"] = shift; weightMap[lname + ".power"] = power; nvinfer1::IScaleLayer* output = network->addScale(input, nvinfer1::ScaleMode::kCHANNEL, shift, scale, power); assert(output); return output; } nvinfer1::IElementWiseLayer* convBnSiLU(nvinfer1::INetworkDefinition* network, std::map weightMap, nvinfer1::ITensor& input, int ch, std::vector k, int s, std::string lname, int p, int g, int d) { nvinfer1::Weights bias_empty{nvinfer1::DataType::kFLOAT, nullptr, 0}; nvinfer1::IConvolutionLayer* conv = network->addConvolutionNd(input, ch, nvinfer1::DimsHW{k[0], k[1]}, weightMap[lname + ".conv.weight"], bias_empty); conv->setNbGroups(g); assert(conv); conv->setStrideNd(nvinfer1::DimsHW{s, s}); // auto pad int p0 = k[0] / 2; int p1 = k[1] / 2; conv->setPaddingNd(nvinfer1::DimsHW{p0, p1}); nvinfer1::IScaleLayer* bn = addBatchNorm2d(network, weightMap, *conv->getOutput(0), lname + ".bn", 1e-3); nvinfer1::IActivationLayer* sigmoid = network->addActivation(*bn->getOutput(0), nvinfer1::ActivationType::kSIGMOID); nvinfer1::IElementWiseLayer* ew = network->addElementWise(*bn->getOutput(0), *sigmoid->getOutput(0), nvinfer1::ElementWiseOperation::kPROD); assert(ew); return ew; } static nvinfer1::ILayer* bottleneck(nvinfer1::INetworkDefinition* network, std::map weightMap, nvinfer1::ITensor& input, int c1, int c2, bool shortcut, std::vector k1, std::vector k2, float e, int g, std::string lname) { int c_ = (int)((float)c2 * e); nvinfer1::IElementWiseLayer* conv1 = convBnSiLU(network, weightMap, input, c_, k1, 1, lname + ".cv1"); nvinfer1::IElementWiseLayer* conv2 = convBnSiLU(network, weightMap, *conv1->getOutput(0), c2, k2, 1, lname + ".cv2", 0, g); if (shortcut && c1 == c2) { 
nvinfer1::IElementWiseLayer* ew = network->addElementWise(input, *conv2->getOutput(0), nvinfer1::ElementWiseOperation::kSUM); return ew; } return conv2; } nvinfer1::IShuffleLayer* DFL(nvinfer1::INetworkDefinition* network, std::map weightMap, nvinfer1::ITensor& input, int ch, int grid, int k, int s, int p, std::string lname) { nvinfer1::IShuffleLayer* shuffle1 = network->addShuffle(input); shuffle1->setReshapeDimensions(nvinfer1::Dims4{kBatchSize, 4, 16, grid}); shuffle1->setSecondTranspose(nvinfer1::Permutation{0, 2, 1, 3}); nvinfer1::ISoftMaxLayer* softmax = network->addSoftMax(*shuffle1->getOutput(0)); softmax->setAxes(1 << 1); nvinfer1::Weights bias_empty{nvinfer1::DataType::kFLOAT, nullptr, 0}; nvinfer1::IConvolutionLayer* conv = network->addConvolutionNd(*softmax->getOutput(0), 1, nvinfer1::DimsHW{1, 1}, weightMap[lname], bias_empty); conv->setStrideNd(nvinfer1::DimsHW{s, s}); conv->setPaddingNd(nvinfer1::DimsHW{p, p}); nvinfer1::IShuffleLayer* shuffle2 = network->addShuffle(*conv->getOutput(0)); shuffle2->setReshapeDimensions(nvinfer1::Dims3{kBatchSize, 4, grid}); return shuffle2; } nvinfer1::IPluginV2Layer* addYoLoLayer(nvinfer1::INetworkDefinition* network, std::vector dets, const int* px_arry, int px_arry_num) { auto creator = getPluginRegistry()->getPluginCreator("YoloLayer_TRT", "1"); // Packing: class_num, input_w, input_h, max_out const int netinfo_count = 4; const int total_count = netinfo_count + px_arry_num; std::vector combinedInfo(total_count); // Fill in the first 4 elements combinedInfo[0] = kNumClass; combinedInfo[1] = kInputW; combinedInfo[2] = kInputH; combinedInfo[3] = kMaxNumOutputBbox; // Copy the contents of px_arry into the combinedInfo vector std::copy(px_arry, px_arry + px_arry_num, combinedInfo.begin() + netinfo_count); // Now let's create the PluginField object to hold this combined information. 
nvinfer1::PluginField pluginField; pluginField.name = "combinedInfo"; pluginField.data = combinedInfo.data(); pluginField.type = nvinfer1::PluginFieldType::kINT32; pluginField.length = combinedInfo.size(); // Create the PluginFieldCollection nvinfer1::PluginFieldCollection pluginFieldCollection; pluginFieldCollection.nbFields = 1; pluginFieldCollection.fields = &pluginField; // Create the plugin object nvinfer1::IPluginV2* pluginObject = creator->createPlugin("yololayer", &pluginFieldCollection); // Prepare input tensors for the YOLO Layer. std::vector inputTensors; for (auto det : dets) { inputTensors.push_back(det->getOutput(0)); } // Add the plugin to the network nvinfer1::IPluginV2Layer* yoloLayer = network->addPluginV2(inputTensors.data(), inputTensors.size(), *pluginObject); return yoloLayer; } nvinfer1::ILayer* Conv(nvinfer1::INetworkDefinition* network, std::map weightMap, nvinfer1::ITensor& input, int c_out, std::string lname, int k, int s, int padding, int g, bool act) { nvinfer1::Weights emptywts{nvinfer1::DataType::kFLOAT, nullptr, 0}; cout << "Conv name: " << lname << endl; nvinfer1::IConvolutionLayer* conv = network->addConvolutionNd(input, c_out, nvinfer1::DimsHW{k, k}, weightMap[lname + ".conv.weight"], emptywts); assert(conv); conv->setStrideNd(nvinfer1::DimsHW{s, s}); // auto pad int p0 = k / 2; int p1 = k / 2; conv->setPaddingNd(nvinfer1::DimsHW{p0, p1}); conv->setNbGroups(g); nvinfer1::IScaleLayer* bn = addBatchNorm2d(network, weightMap, *conv->getOutput(0), lname + ".bn", 1e-3); if (act) { nvinfer1::IActivationLayer* sigmoid = network->addActivation(*bn->getOutput(0), nvinfer1::ActivationType::kSIGMOID); nvinfer1::IElementWiseLayer* ew = network->addElementWise(*bn->getOutput(0), *sigmoid->getOutput(0), nvinfer1::ElementWiseOperation::kPROD); assert(ew); return ew; } else return bn; } nvinfer1::ILayer* DWConv(nvinfer1::INetworkDefinition* network, std::map weightMap, nvinfer1::ITensor& input, int ch, std::vector k, int s, std::string lname) { 
nvinfer1::Weights bias_empty{nvinfer1::DataType::kFLOAT, nullptr, 0}; nvinfer1::IConvolutionLayer* conv = network->addConvolutionNd(input, ch, nvinfer1::DimsHW{k[0], k[1]}, weightMap[lname + ".conv.weight"], bias_empty); assert(conv); conv->setStrideNd(nvinfer1::DimsHW{s, s}); conv->setNbGroups(ch); // auto pad int p0 = k[0] / 2; int p1 = k[1] / 2; conv->setPaddingNd(nvinfer1::DimsHW{p0, p1}); nvinfer1::IScaleLayer* bn = addBatchNorm2d(network, weightMap, *conv->getOutput(0), lname + ".bn", 1e-3); nvinfer1::IActivationLayer* sigmoid = network->addActivation(*bn->getOutput(0), nvinfer1::ActivationType::kSIGMOID); nvinfer1::IElementWiseLayer* ew = network->addElementWise(*bn->getOutput(0), *sigmoid->getOutput(0), nvinfer1::ElementWiseOperation::kPROD); assert(ew); return ew; } nvinfer1::IElementWiseLayer* C3k(nvinfer1::INetworkDefinition* network, std::map weightMap, nvinfer1::ITensor& input, int c2, std::string lname, int n, bool shortcut, int g, float e, int k) { int c_ = c2 * float(e); nvinfer1::IElementWiseLayer* cv1 = convBnSiLU(network, weightMap, input, c_, {1, 1}, 1, lname + ".cv1"); nvinfer1::IElementWiseLayer* cv2 = convBnSiLU(network, weightMap, input, c_, {1, 1}, 1, lname + ".cv2"); nvinfer1::ITensor* y = cv1->getOutput(0); for (int i = 0; i < n; i++) { nvinfer1::ILayer* b = bottleneck(network, weightMap, *y, c_, c_, shortcut, {k, k}, {k, k}, 1.0, g, lname + ".m." 
+ std::to_string(i)); y = b->getOutput(0); } nvinfer1::ITensor* inputTensor[] = {y, cv2->getOutput(0)}; nvinfer1::IConcatenationLayer* cat = network->addConcatenation(inputTensor, 2); nvinfer1::IElementWiseLayer* cv3 = convBnSiLU(network, weightMap, *cat->getOutput(0), c2, {1, 1}, 1, lname + ".cv3"); return cv3; } nvinfer1::IElementWiseLayer* C3K2(nvinfer1::INetworkDefinition* network, std::map& weightMap, nvinfer1::ITensor& input, int c2, int n, std::string lname, bool c3k, float e, int g, bool shortcut) { int c = int(c2 * float(e)); nvinfer1::ILayer* cv1 = Conv(network, weightMap, input, 2 * c, lname + ".cv1", 1, 1); nvinfer1::ISliceLayer* sl0 = network->addSlice( *cv1->getOutput(0), nvinfer1::Dims4{0, 0, 0, 0}, nvinfer1::Dims4{cv1->getOutput(0)->getDimensions().d[0], cv1->getOutput(0)->getDimensions().d[1] / 2, cv1->getOutput(0)->getDimensions().d[2], cv1->getOutput(0)->getDimensions().d[3]}, nvinfer1::Dims4{1, 1, 1, 1}); nvinfer1::ISliceLayer* sl1 = network->addSlice( *cv1->getOutput(0), nvinfer1::Dims4{0, cv1->getOutput(0)->getDimensions().d[1] / 2, 0, 0}, nvinfer1::Dims4{cv1->getOutput(0)->getDimensions().d[0], cv1->getOutput(0)->getDimensions().d[1] / 2, cv1->getOutput(0)->getDimensions().d[2], cv1->getOutput(0)->getDimensions().d[3]}, nvinfer1::Dims4{1, 1, 1, 1}); nvinfer1::ITensor* inputTensor0[] = {sl0->getOutput(0), sl1->getOutput(0)}; nvinfer1::IConcatenationLayer* cat = network->addConcatenation(inputTensor0, 2); nvinfer1::ITensor* current = sl1->getOutput(0); for (int i = 0; i < n; i++) { nvinfer1::ILayer* b; if (c3k) { b = C3k(network, weightMap, *current, c, lname + ".m." + std::to_string(i), 2, shortcut, g); } else { b = bottleneck(network, weightMap, *current, c, c, shortcut, {3, 3}, {3, 3}, 0.5, g, lname + ".m." 
+ std::to_string(i)); } current = b->getOutput(0); nvinfer1::ITensor* inputTensors[] = {cat->getOutput(0), b->getOutput(0)}; cat = network->addConcatenation(inputTensors, 2); } nvinfer1::IElementWiseLayer* cv2 = convBnSiLU(network, weightMap, *cat->getOutput(0), c2, {1, 1}, 1, lname + ".cv2"); return cv2; } void cout_dim(nvinfer1::ITensor& input) { nvinfer1::Dims d = input.getDimensions(); std::cout << "======================= Dimensions =================================" << std::endl; std::cout << " " << d.d[0] << std::endl; std::cout << " " << d.d[1] << std::endl; std::cout << " " << d.d[2] << std::endl; std::cout << " " << d.d[3] << std::endl; std::cout << "======================================================================" << std::endl; } nvinfer1::ILayer* AAttn(nvinfer1::INetworkDefinition* network, std::map weightMap, nvinfer1::ITensor& input, int dim, int num_heads, std::string lname, int area) { nvinfer1::Dims d_input = input.getDimensions(); int B = d_input.d[0]; int C = d_input.d[1]; int H = d_input.d[2]; int W = d_input.d[3]; int N = W * H; int head_dim = dim / num_heads; int all_head_dim = head_dim * num_heads; nvinfer1::ILayer* qk = Conv(network, weightMap, input, all_head_dim * 2, lname + ".qk", 1, 1, 0, 1, false); nvinfer1::IShuffleLayer* qk_flatten_t = network->addShuffle(*qk->getOutput(0)); qk_flatten_t->setReshapeDimensions(nvinfer1::Dims3{B, -1, N}); qk_flatten_t->setSecondTranspose(nvinfer1::Permutation{0, 2, 1}); nvinfer1::ILayer* v = Conv(network, weightMap, input, all_head_dim, lname + ".v", 1, 1, 0, 1, false); nvinfer1::IShuffleLayer* v_flatten_t = network->addShuffle(*v->getOutput(0)); v_flatten_t->setReshapeDimensions(nvinfer1::Dims3{B, -1, N}); v_flatten_t->setSecondTranspose(nvinfer1::Permutation{0, 2, 1}); // (1, 6400, 64) nvinfer1::ILayer* pe = Conv(network, weightMap, *v->getOutput(0), dim, lname + ".pe", 5, 1, 2, dim, false); nvinfer1::ITensor* q_k = qk_flatten_t->getOutput(0); nvinfer1::ITensor* v_ = v_flatten_t->getOutput(0); 
if (area > 1) { B = B * area; N = N / area; nvinfer1::IShuffleLayer* qk_reshape = network->addShuffle(*qk_flatten_t->getOutput(0)); qk_reshape->setReshapeDimensions(nvinfer1::Dims3{B, N, C * 2}); nvinfer1::IShuffleLayer* v_reshape = network->addShuffle(*v_flatten_t->getOutput(0)); v_reshape->setReshapeDimensions(nvinfer1::Dims3{B, N, C}); q_k = qk_reshape->getOutput(0); v_ = v_reshape->getOutput(0); } nvinfer1::Dims q_k_dim = q_k->getDimensions(); nvinfer1::ISliceLayer* q = network->addSlice(*q_k, nvinfer1::Dims3{0, 0, 0}, nvinfer1::Dims3{q_k_dim.d[0], q_k_dim.d[1], q_k_dim.d[2] / 2}, nvinfer1::Dims3{1, 1, 1}); nvinfer1::ISliceLayer* k = network->addSlice(*q_k, nvinfer1::Dims3{0, 0, q_k_dim.d[2] / 2}, nvinfer1::Dims3{q_k_dim.d[0], q_k_dim.d[1], q_k_dim.d[2] / 2}, nvinfer1::Dims3{1, 1, 1}); nvinfer1::IShuffleLayer* q_reshape = network->addShuffle(*q->getOutput(0)); q_reshape->setReshapeDimensions(nvinfer1::Dims4{B, N, num_heads, head_dim}); nvinfer1::IShuffleLayer* k_reshape = network->addShuffle(*k->getOutput(0)); k_reshape->setReshapeDimensions(nvinfer1::Dims4{B, N, num_heads, head_dim}); nvinfer1::IShuffleLayer* v_reshape = network->addShuffle(*v_); v_reshape->setReshapeDimensions(nvinfer1::Dims4{B, N, num_heads, head_dim}); // (B, N, num_head, head_dim)--->(B, num_head, head_dim, N) nvinfer1::IShuffleLayer* q_t_view = network->addShuffle(*q_reshape->getOutput(0)); q_t_view->setFirstTranspose(nvinfer1::Permutation{0, 2, 3, 1}); nvinfer1::IShuffleLayer* k_t_view = network->addShuffle(*k_reshape->getOutput(0)); k_t_view->setFirstTranspose(nvinfer1::Permutation{0, 2, 3, 1}); nvinfer1::IShuffleLayer* v_t_view = network->addShuffle(*v_reshape->getOutput(0)); v_t_view->setFirstTranspose(nvinfer1::Permutation{0, 2, 3, 1}); nvinfer1::IShuffleLayer* q_T = network->addShuffle(*q_t_view->getOutput(0)); q_T->setFirstTranspose(nvinfer1::Permutation{0, 1, 3, 2}); // (B, num_head, N, head_dim, N) nvinfer1::IMatrixMultiplyLayer* q_mul_k = 
network->addMatrixMultiply(*q_T->getOutput(0), nvinfer1::MatrixOperation::kNONE, *k_t_view->getOutput(0), nvinfer1::MatrixOperation::kNONE); float scale = 1.0 / sqrt(head_dim); float* scale_val = reinterpret_cast(malloc(sizeof(float) * 1)); scale_val[0] = scale; nvinfer1::Weights s_w{nvinfer1::DataType::kFLOAT, scale_val, 1}; // scale float* shift_val = reinterpret_cast(malloc(sizeof(float) * 1)); shift_val[0] = 0; nvinfer1::Weights sh_w{nvinfer1::DataType::kFLOAT, shift_val, 1}; // shift float* power_val = reinterpret_cast(malloc(sizeof(float) * 1)); power_val[0] = 1; nvinfer1::Weights p_w{nvinfer1::DataType::kFLOAT, power_val, 1}; // power nvinfer1::IScaleLayer* q_mul_k_scale = network->addScale(*q_mul_k->getOutput(0), nvinfer1::ScaleMode::kUNIFORM, sh_w, s_w, p_w); nvinfer1::IReduceLayer* attn_max = network->addReduce(*q_mul_k_scale->getOutput(0), nvinfer1::ReduceOperation::kMAX, 1 << 3, true); nvinfer1::IElementWiseLayer* attn_sub = network->addElementWise( *q_mul_k_scale->getOutput(0), *attn_max->getOutput(0), nvinfer1::ElementWiseOperation::kSUB); nvinfer1::IUnaryLayer* attn_exp = network->addUnary(*attn_sub->getOutput(0), nvinfer1::UnaryOperation::kEXP); nvinfer1::IReduceLayer* attn_sum = network->addReduce(*attn_exp->getOutput(0), nvinfer1::ReduceOperation::kSUM, 1 << 3, true); nvinfer1::IElementWiseLayer* attn_div = network->addElementWise(*attn_exp->getOutput(0), *attn_sum->getOutput(0), nvinfer1::ElementWiseOperation::kDIV); cout_dim(*attn_div->getOutput(0)); nvinfer1::IShuffleLayer* attn_t = network->addShuffle(*attn_div->getOutput(0)); attn_t->setFirstTranspose(nvinfer1::Permutation{0, 1, 3, 2}); nvinfer1::IMatrixMultiplyLayer* attn_v = network->addMatrixMultiply(*v_t_view->getOutput(0), nvinfer1::MatrixOperation::kNONE, *attn_t->getOutput(0), nvinfer1::MatrixOperation::kNONE); nvinfer1::IShuffleLayer* attn_v_t = network->addShuffle(*attn_v->getOutput(0)); attn_v_t->setFirstTranspose(nvinfer1::Permutation{0, 3, 1, 2}); nvinfer1::ITensor* attn_temp = 
attn_v_t->getOutput(0); if (area > 1) { B = B / area; N = N * area; nvinfer1::IShuffleLayer* attn_v_t_r = network->addShuffle(*attn_v_t->getOutput(0)); attn_v_t_r->setReshapeDimensions(nvinfer1::Dims3{B, N, C}); attn_temp = attn_v_t_r->getOutput(0); } nvinfer1::IShuffleLayer* attn_x = network->addShuffle(*attn_temp); attn_x->setReshapeDimensions(nvinfer1::Dims4{B, H, W, C}); attn_x->setSecondTranspose(nvinfer1::Permutation{0, 3, 1, 2}); nvinfer1::IElementWiseLayer* x_add_pp = network->addElementWise(*attn_x->getOutput(0), *pe->getOutput(0), nvinfer1::ElementWiseOperation::kSUM); nvinfer1::ILayer* proj = Conv(network, weightMap, *x_add_pp->getOutput(0), dim, lname + ".proj", 1, 1, 0, 1, false); return proj; } nvinfer1::IElementWiseLayer* ABlock(nvinfer1::INetworkDefinition* network, std::map weightMap, nvinfer1::ITensor& input, int dim, int num_heads, std::string lname, float mlp_ratio, int area) { nvinfer1::ILayer* attn = AAttn(network, weightMap, input, dim, num_heads, lname + ".attn", area); nvinfer1::IElementWiseLayer* add1 = // x = x + self.attn(x) network->addElementWise(input, *attn->getOutput(0), nvinfer1::ElementWiseOperation::kSUM); int mlp_hidden_dim = int(dim * mlp_ratio); nvinfer1::ILayer* mlp_0 = Conv(network, weightMap, *add1->getOutput(0), mlp_hidden_dim, lname + ".mlp.0", 1, 1, 0, 1, true); nvinfer1::ILayer* mlp_1 = Conv(network, weightMap, *mlp_0->getOutput(0), dim, lname + ".mlp.1", 1, 1, 0, 1, false); nvinfer1::IElementWiseLayer* result = network->addElementWise(*add1->getOutput(0), *mlp_1->getOutput(0), nvinfer1::ElementWiseOperation::kSUM); return result; } nvinfer1::ILayer* A2C2f(nvinfer1::INetworkDefinition* network, std::map weightMap, nvinfer1::ITensor& input, int c2, int n, std::string lname, bool a2, int area, bool residual, float mlp_ratio, float e, int g, bool shortcut) { int c_ = static_cast(c2 * e); assert(c_ % 32 == 0 && "Dimension of ABlock must be a multiple of 32"); int num_heads = c_ / 32; nvinfer1::ILayer* cv1 = Conv(network, 
weightMap, input, c_, lname + ".cv1", 1, 1); std::vector y{cv1->getOutput(0)}; nvinfer1::ITensor* current = cv1->getOutput(0); for (int i = 0; i < n; i++) { if (a2) { nvinfer1::ILayer* m_0 = ABlock(network, weightMap, *current, c_, num_heads, lname + ".m." + std::to_string(i) + ".0", mlp_ratio, area); nvinfer1::ILayer* m_1 = ABlock(network, weightMap, *m_0->getOutput(0), c_, num_heads, lname + ".m." + std::to_string(i) + ".1", mlp_ratio, area); current = m_1->getOutput(0); } else { // C3k nvinfer1::ILayer* m = C3k(network, weightMap, *current, c_, lname + ".m." + std::to_string(i), 2, shortcut, g); current = m->getOutput(0); } y.push_back(current); } nvinfer1::IConcatenationLayer* cat = network->addConcatenation(y.data(), static_cast(y.size())); cat->setAxis(1); nvinfer1::ILayer* cv2 = Conv(network, weightMap, *cat->getOutput(0), c2, lname + ".cv2", 1, 1); if (a2 && residual) { std::cout << lname << " applying residual connection with gamma" << std::endl; nvinfer1::Weights gamma = weightMap[lname + ".gamma"]; nvinfer1::IConstantLayer* gamma_layer = network->addConstant(nvinfer1::Dims4{1, c2, 1, 1}, gamma); nvinfer1::IElementWiseLayer* scaled_output = network->addElementWise( *gamma_layer->getOutput(0), *cv2->getOutput(0), nvinfer1::ElementWiseOperation::kPROD); nvinfer1::IElementWiseLayer* result = network->addElementWise(input, *scaled_output->getOutput(0), nvinfer1::ElementWiseOperation::kSUM); return result; } else { return cv2; } } nvinfer1::IElementWiseLayer* DSConv(nvinfer1::INetworkDefinition* network, std::map weightMap, nvinfer1::ITensor& input, int c_in, int c_out, std::string lname, int k, int s, int p, int d, bool bias) { if (p == 0) { p = (d * (k - 1)) / 2; } nvinfer1::Weights emptywts{nvinfer1::DataType::kFLOAT, nullptr, 0}; nvinfer1::IConvolutionLayer* dw = network->addConvolutionNd(input, c_in, nvinfer1::DimsHW{k, k}, weightMap[lname + ".dw.weight"], emptywts); dw->setStrideNd(nvinfer1::DimsHW{s, s}); dw->setPaddingNd(nvinfer1::DimsHW{p, p}); 
dw->setNbGroups(c_in); dw->setDilationNd(nvinfer1::DimsHW{d, d}); nvinfer1::IConvolutionLayer* pw = network->addConvolutionNd(*dw->getOutput(0), c_out, nvinfer1::DimsHW{1, 1}, weightMap[lname + ".pw.weight"], emptywts); pw->setStrideNd(nvinfer1::DimsHW{1, 1}); pw->setPaddingNd(nvinfer1::DimsHW{0, 0}); pw->setNbGroups(1); pw->setDilationNd(nvinfer1::DimsHW{1, 1}); nvinfer1::IScaleLayer* bn = addBatchNorm2d(network, weightMap, *pw->getOutput(0), lname + ".bn", 1e-3); nvinfer1::IActivationLayer* sigmoid = network->addActivation(*bn->getOutput(0), nvinfer1::ActivationType::kSIGMOID); nvinfer1::IElementWiseLayer* ew = network->addElementWise(*bn->getOutput(0), *sigmoid->getOutput(0), nvinfer1::ElementWiseOperation::kPROD); assert(ew); return ew; } nvinfer1::ILayer* DSBottleneck(nvinfer1::INetworkDefinition* network, std::map weightMap, nvinfer1::ITensor& input, int c1, int c2, std::string lname, bool shortcut, float e, int k1, int k2, int d2) { int c_ = float(e) * c2; nvinfer1::IElementWiseLayer* cv1 = DSConv(network, weightMap, input, c1, c_, lname + ".cv1", k1, 1, 0, 1, false); nvinfer1::IElementWiseLayer* y = DSConv(network, weightMap, *cv1->getOutput(0), c_, c2, lname + ".cv2", k2, 1, 0, d2, false); if (c1 == c2 && shortcut) { nvinfer1::IElementWiseLayer* add = network->addElementWise(input, *y->getOutput(0), nvinfer1::ElementWiseOperation::kSUM); return add; } else return y; } nvinfer1::ILayer* DSC3k(nvinfer1::INetworkDefinition* network, std::map weightMap, nvinfer1::ITensor& input, int c2, int n, std::string lname, bool shortcut, int g, float e, int k1, int k2, int d2) { int c_ = float(e) * c2; nvinfer1::ILayer* cv1 = Conv(network, weightMap, input, c_, lname + ".cv1", 1, 1); nvinfer1::ILayer* cv2 = Conv(network, weightMap, input, c_, lname + ".cv2", 1, 1); nvinfer1::ITensor* current = cv1->getOutput(0); for (int i = 0; i < n; i++) { nvinfer1::ILayer* m_ = DSBottleneck(network, weightMap, *current, c_, c_, lname + ".m." 
+ std::to_string(i), shortcut, 1.0, k1, k2, d2); current = m_->getOutput(0); } nvinfer1::ITensor* inputTensors[] = {current, cv2->getOutput(0)}; nvinfer1::IConcatenationLayer* cat = network->addConcatenation(inputTensors, 2); nvinfer1::ILayer* cv3 = Conv(network, weightMap, *cat->getOutput(0), c2, lname + ".cv3", 1, 1); return cv3; } nvinfer1::ILayer* DSC3K2(nvinfer1::INetworkDefinition* network, std::map weightMap, nvinfer1::ITensor& input, int c2, std::string lname, int n, bool dsc3k, float e, int g, bool shortcut, int k1, int k2, int d2) { int c = float(e) * c2; nvinfer1::ILayer* cv1 = Conv(network, weightMap, input, 2 * c, lname + ".cv1"); nvinfer1::Dims dim_cv1 = cv1->getOutput(0)->getDimensions(); nvinfer1::ISliceLayer* sl0 = network->addSlice( *cv1->getOutput(0), nvinfer1::Dims4{0, 0, 0, 0}, nvinfer1::Dims4{dim_cv1.d[0], dim_cv1.d[1] / 2, dim_cv1.d[2], dim_cv1.d[3]}, nvinfer1::Dims4{1, 1, 1, 1}); nvinfer1::ISliceLayer* sl1 = network->addSlice( *cv1->getOutput(0), nvinfer1::Dims4{0, dim_cv1.d[1] / 2, 0, 0}, nvinfer1::Dims4{dim_cv1.d[0], dim_cv1.d[1] / 2, dim_cv1.d[2], dim_cv1.d[3]}, nvinfer1::Dims4{1, 1, 1, 1}); std::vector y = {sl0->getOutput(0), sl1->getOutput(0)}; nvinfer1::ITensor* current = sl1->getOutput(0); for (int i = 0; i < n; i++) { if (dsc3k) { nvinfer1::ILayer* m_ = DSC3k(network, weightMap, *current, c, 2, lname + ".m." + std::to_string(i), shortcut, g, 1.0, k1, k2, d2); current = m_->getOutput(0); y.push_back(current); } else { nvinfer1::ILayer* m_ = DSBottleneck(network, weightMap, *current, c, c, lname + ".m." 
+ std::to_string(i), shortcut, 1.0, k1, k2, d2); current = m_->getOutput(0); y.push_back(current); } } nvinfer1::IConcatenationLayer* cat = network->addConcatenation(y.data(), y.size()); nvinfer1::ILayer* cv2 = Conv(network, weightMap, *cat->getOutput(0), c2, lname + ".cv2"); return cv2; } nvinfer1::ILayer* FuseModule(nvinfer1::INetworkDefinition* network, std::map weightMap, std::vector& input, int c_in, bool channel_adjust, std::string lname) { nvinfer1::IPoolingLayer* x1_ds = network->addPoolingNd(*input[0], nvinfer1::PoolingType::kAVERAGE, nvinfer1::DimsHW{2, 2}); x1_ds->setStrideNd(nvinfer1::DimsHW{2, 2}); x1_ds->setPaddingNd(nvinfer1::DimsHW{0, 0}); nvinfer1::IResizeLayer* x3_up = network->addResize(*input[2]); float scale[] = {1, 1, 2, 2}; x3_up->setResizeMode(nvinfer1::ResizeMode::kNEAREST); x3_up->setScales(scale, 4); nvinfer1::ITensor* inputTensor[] = {x1_ds->getOutput(0), input[1], x3_up->getOutput(0)}; nvinfer1::IConcatenationLayer* cat = network->addConcatenation(inputTensor, 3); cat->setAxis(1); nvinfer1::ILayer* conv_out = Conv(network, weightMap, *cat->getOutput(0), c_in, lname + ".conv_out"); return conv_out; } nvinfer1::ISoftMaxLayer* AdaHyperedgeGen(nvinfer1::INetworkDefinition* network, std::map weightMap, nvinfer1::ITensor& input, int node_dim, int num_hyperedges, std::string lname, int num_heads, std::string context) { nvinfer1::Dims dim_input = input.getDimensions(); int B = dim_input.d[0]; int N = dim_input.d[1]; int D = dim_input.d[2]; int head_dim = node_dim / num_heads; nvinfer1::ITensor* context_cat = nullptr; if (context == "mean") { nvinfer1::IReduceLayer* context_mean = network->addReduce(input, nvinfer1::ReduceOperation::kAVG, 1 << 1, false); context_cat = context_mean->getOutput(0); } else if (context == "max") { nvinfer1::IReduceLayer* context_max = network->addReduce(input, nvinfer1::ReduceOperation::kMAX, 1 << 1, false); context_cat = context_max->getOutput(0); } else { nvinfer1::IReduceLayer* context_mean = 
network->addReduce(input, nvinfer1::ReduceOperation::kAVG, 1 << 1, false); nvinfer1::IReduceLayer* context_max = network->addReduce(input, nvinfer1::ReduceOperation::kMAX, 1 << 1, false); nvinfer1::ITensor* inputTensor[] = {context_mean->getOutput(0), context_max->getOutput(0)}; nvinfer1::IConcatenationLayer* cat = network->addConcatenation(inputTensor, 2); cat->setAxis(1 << 0); context_cat = cat->getOutput(0); } nvinfer1::IShuffleLayer* context_cat_dim4 = network->addShuffle(*context_cat); context_cat_dim4->setReshapeDimensions( nvinfer1::Dims4{context_cat->getDimensions().d[0], context_cat->getDimensions().d[1], 1, 1}); nvinfer1::IFullyConnectedLayer* prototypes_offsets_ = network->addFullyConnected( *context_cat_dim4->getOutput(0), num_hyperedges * node_dim, weightMap[lname + ".context_net.weight"], weightMap[lname + ".context_net.bias"]); nvinfer1::IShuffleLayer* prototypes_offsets = network->addShuffle(*prototypes_offsets_->getOutput(0)); prototypes_offsets->setReshapeDimensions(nvinfer1::Dims3{B, num_hyperedges, D}); // prototype_offsets = self.context_net(context_cat).view(B, self.num_hyperedges, D) nvinfer1::Weights prototype_base_wts = weightMap[lname + ".prototype_base"]; nvinfer1::IConstantLayer* prototype_base = network->addConstant(nvinfer1::Dims3{1, num_hyperedges, node_dim}, prototype_base_wts); nvinfer1::IElementWiseLayer* prototypes = network->addElementWise( *prototype_base->getOutput(0), *prototypes_offsets->getOutput(0), nvinfer1::ElementWiseOperation::kSUM); // prototypes = self.prototype_base.unsqueeze(0) + prototype_offsets nvinfer1::IShuffleLayer* input_dim4 = network->addShuffle(input); input_dim4->setReshapeDimensions(nvinfer1::Dims4{B * N, D, 1, 1}); nvinfer1::IFullyConnectedLayer* X_proj = network->addFullyConnected(*input_dim4->getOutput(0), node_dim, weightMap[lname + ".pre_head_proj.weight"], weightMap[lname + ".pre_head_proj.bias"]); // X_proj = self.pre_head_proj(X) nvinfer1::IShuffleLayer* X_heads = 
network->addShuffle(*X_proj->getOutput(0)); X_heads->setReshapeDimensions(nvinfer1::Dims4{B, N, num_heads, head_dim}); X_heads->setSecondTranspose(nvinfer1::Permutation{0, 2, 1, 3}); // X_heads = X_proj.view(B, N, self.num_heads, self.head_dim).transpose(1, 2) nvinfer1::IShuffleLayer* proto_heads = network->addShuffle(*prototypes->getOutput(0)); proto_heads->setReshapeDimensions(nvinfer1::Dims4{B, num_hyperedges, num_heads, head_dim}); proto_heads->setSecondTranspose(nvinfer1::Permutation{0, 2, 1, 3}); // proto_heads = prototypes.view(B, self.num_hyperedges, self.num_heads, self.head_dim).permute(0, 2, 1, 3) nvinfer1::IShuffleLayer* X_heads_flat = network->addShuffle(*X_heads->getOutput(0)); X_heads_flat->setReshapeDimensions(nvinfer1::Dims3{B * num_heads, N, head_dim}); // X_heads_flat = X_heads.reshape(B * self.num_heads, N, self.head_dim) nvinfer1::IShuffleLayer* proto_heads_flat = network->addShuffle(*proto_heads->getOutput(0)); proto_heads_flat->setReshapeDimensions(nvinfer1::Dims3{B * num_heads, num_hyperedges, head_dim}); proto_heads_flat->setSecondTranspose(nvinfer1::Permutation{0, 2, 1}); //proto_heads_flat = proto_heads.reshape(B * self.num_heads, self.num_hyperedges, self.head_dim).transpose(1, 2) nvinfer1::IMatrixMultiplyLayer* logits = network->addMatrixMultiply(*X_heads_flat->getOutput(0), nvinfer1::MatrixOperation::kNONE, *proto_heads_flat->getOutput(0), nvinfer1::MatrixOperation::kNONE); float* scales_ptr = reinterpret_cast(malloc(sizeof(float))); *scales_ptr = sqrt(static_cast(head_dim)); nvinfer1::Weights scale_wts{nvinfer1::DataType::kFLOAT, scales_ptr, 1}; nvinfer1::IConstantLayer* scale_layer = network->addConstant(nvinfer1::Dims3{1, 1, 1}, scale_wts); // keep weight alive during build weightMap[lname + ".scaling"] = scale_wts; nvinfer1::IElementWiseLayer* logits_scale = network->addElementWise( *logits->getOutput(0), *scale_layer->getOutput(0), nvinfer1::ElementWiseOperation::kDIV); // logits = torch.bmm(X_heads_flat, proto_heads_flat) / 
self.scaling nvinfer1::IShuffleLayer* logits_scale_view = network->addShuffle(*logits_scale->getOutput(0)); logits_scale_view->setReshapeDimensions(nvinfer1::Dims4{B, num_heads, N, num_hyperedges}); nvinfer1::IReduceLayer* logits_scale_view_mean = network->addReduce(*logits_scale_view->getOutput(0), nvinfer1::ReduceOperation::kAVG, 1 << 1, false); nvinfer1::ISoftMaxLayer* softmax = network->addSoftMax(*logits_scale_view_mean->getOutput(0)); softmax->setAxes(1 << 1); return softmax; } nvinfer1::IElementWiseLayer* GELU(nvinfer1::INetworkDefinition* network, nvinfer1::ITensor& input) { static float sqrt_2_over_pi = 0.797885f; // 0.7978845608 static float kappa = 0.044715f; static float one = 1.0f; static float half = 0.5f; nvinfer1::IElementWiseLayer* x3_layer = network->addElementWise(input, input, nvinfer1::ElementWiseOperation::kPROD); nvinfer1::ITensor* x2 = x3_layer->getOutput(0); x3_layer = network->addElementWise(*x2, input, nvinfer1::ElementWiseOperation::kPROD); nvinfer1::ITensor* x3 = x3_layer->getOutput(0); nvinfer1::Weights kappa_weight{nvinfer1::DataType::kFLOAT, &kappa, 1}; nvinfer1::IConstantLayer* kappa_const = network->addConstant(nvinfer1::Dims4{1, 1, 1, 1}, kappa_weight); nvinfer1::IElementWiseLayer* scaled_x3 = network->addElementWise(*x3, *kappa_const->getOutput(0), nvinfer1::ElementWiseOperation::kPROD); nvinfer1::IElementWiseLayer* inner_sum = network->addElementWise(input, *scaled_x3->getOutput(0), nvinfer1::ElementWiseOperation::kSUM); nvinfer1::ITensor* inner = inner_sum->getOutput(0); nvinfer1::Weights sqrt_weight{nvinfer1::DataType::kFLOAT, &sqrt_2_over_pi, 1}; nvinfer1::IConstantLayer* sqrt_const = network->addConstant(nvinfer1::Dims4{1, 1, 1, 1}, sqrt_weight); nvinfer1::IElementWiseLayer* scaled_inner = network->addElementWise(*inner, *sqrt_const->getOutput(0), nvinfer1::ElementWiseOperation::kPROD); nvinfer1::IActivationLayer* tanh_layer = network->addActivation(*scaled_inner->getOutput(0), nvinfer1::ActivationType::kTANH); 
nvinfer1::Weights one_weight{nvinfer1::DataType::kFLOAT, &one, 1}; nvinfer1::IConstantLayer* one_const = network->addConstant(nvinfer1::Dims4{1, 1, 1, 1}, one_weight); nvinfer1::IElementWiseLayer* add_one = network->addElementWise(*tanh_layer->getOutput(0), *one_const->getOutput(0), nvinfer1::ElementWiseOperation::kSUM); nvinfer1::IElementWiseLayer* half_x = network->addElementWise(input, *add_one->getOutput(0), nvinfer1::ElementWiseOperation::kPROD); nvinfer1::Weights half_weight{nvinfer1::DataType::kFLOAT, &half, 1}; nvinfer1::IConstantLayer* half_const = network->addConstant(nvinfer1::Dims4{1, 1, 1, 1}, half_weight); nvinfer1::IElementWiseLayer* gelu = network->addElementWise(*half_x->getOutput(0), *half_const->getOutput(0), nvinfer1::ElementWiseOperation::kPROD); return gelu; } nvinfer1::IElementWiseLayer* AdaHGConv(nvinfer1::INetworkDefinition* network, std::map weightMap, nvinfer1::ITensor& input, int embed_dim, std::string lname, int num_hyperedges, int num_heads, std::string context) { // {B, N, num_hyperedges} nvinfer1::ISoftMaxLayer* A = AdaHyperedgeGen(network, weightMap, input, embed_dim, num_hyperedges, lname + ".edge_generator", num_heads, context); nvinfer1::IMatrixMultiplyLayer* He = network->addMatrixMultiply( // 486 layer *A->getOutput(0), nvinfer1::MatrixOperation::kTRANSPOSE, input, nvinfer1::MatrixOperation::kNONE); nvinfer1::IShuffleLayer* He_dim4 = network->addShuffle(*He->getOutput(0)); He_dim4->setReshapeDimensions(nvinfer1::Dims4{He->getOutput(0)->getDimensions().d[1], He->getOutput(0)->getDimensions().d[0], He->getOutput(0)->getDimensions().d[2], 1}); nvinfer1::IFullyConnectedLayer* He_edge_proj_ = network->addFullyConnected(*He_dim4->getOutput(0), embed_dim, weightMap[lname + ".edge_proj.0.weight"], weightMap[lname + ".edge_proj.0.bias"]); nvinfer1::IElementWiseLayer* He_edge_proj = GELU(network, *He_edge_proj_->getOutput(0)); nvinfer1::IShuffleLayer* He_edge_proj_dim2 = network->addShuffle(*He_edge_proj->getOutput(0)); 
He_edge_proj_dim2->setReshapeDimensions(nvinfer1::Dims2{He_edge_proj->getOutput(0)->getDimensions().d[0], He_edge_proj->getOutput(0)->getDimensions().d[1]}); nvinfer1::IShuffleLayer* A_dim2 = network->addShuffle(*A->getOutput(0)); A_dim2->setReshapeDimensions( nvinfer1::Dims2{A->getOutput(0)->getDimensions().d[1] * A->getOutput(0)->getDimensions().d[0], // keep the batch information A->getOutput(0)->getDimensions().d[2]}); nvinfer1::IMatrixMultiplyLayer* x_new_ = network->addMatrixMultiply(*A_dim2->getOutput(0), nvinfer1::MatrixOperation::kNONE, *He_edge_proj_dim2->getOutput(0), nvinfer1::MatrixOperation::kNONE); nvinfer1::IShuffleLayer* x_new_dim4 = network->addShuffle(*x_new_->getOutput(0)); x_new_dim4->setReshapeDimensions(nvinfer1::Dims4{x_new_->getOutput(0)->getDimensions().d[0], x_new_->getOutput(0)->getDimensions().d[1], 1, 1}); nvinfer1::IFullyConnectedLayer* x_new_node_proj_ = network->addFullyConnected(*x_new_dim4->getOutput(0), embed_dim, weightMap[lname + ".node_proj.0.weight"], weightMap[lname + ".node_proj.0.bias"]); nvinfer1::IElementWiseLayer* x_new_node_proj = GELU(network, *x_new_node_proj_->getOutput(0)); nvinfer1::IShuffleLayer* x_new_finall = network->addShuffle(*x_new_node_proj->getOutput(0)); x_new_finall->setReshapeDimensions(nvinfer1::Dims3{1, x_new_node_proj->getOutput(0)->getDimensions().d[0], x_new_node_proj->getOutput(0)->getDimensions().d[1]}); nvinfer1::IElementWiseLayer* add = network->addElementWise(*x_new_finall->getOutput(0), input, nvinfer1::ElementWiseOperation::kSUM); return add; } nvinfer1::IShuffleLayer* AdaHGComputation(nvinfer1::INetworkDefinition* network, std::map weightMap, nvinfer1::ITensor& input, int embed_dim, std::string lname, int num_hyperedges, int num_heads, std::string context) { nvinfer1::Dims dim = input.getDimensions(); int B = dim.d[0]; int C = dim.d[1]; int H = dim.d[2]; int W = dim.d[3]; nvinfer1::IShuffleLayer* tokens = network->addShuffle(input); tokens->setReshapeDimensions(nvinfer1::Dims3{B, C, H * 
W}); tokens->setSecondTranspose(nvinfer1::Permutation{0, 2, 1}); nvinfer1::IElementWiseLayer* hgnn = AdaHGConv(network, weightMap, *tokens->getOutput(0), embed_dim, lname + ".hgnn", num_hyperedges, num_heads, context); nvinfer1::IShuffleLayer* x_out = network->addShuffle(*hgnn->getOutput(0)); x_out->setFirstTranspose(nvinfer1::Permutation{0, 2, 1}); x_out->setReshapeDimensions(nvinfer1::Dims4{B, C, H, W}); return x_out; } nvinfer1::ILayer* C3AH(nvinfer1::INetworkDefinition* network, std::map weightMap, nvinfer1::ITensor& input, int c2, std::string lname, float e, int num_hyperedges, std::string context) { int c_ = float(e) * c2; assert(c_ % 16 == 0 && "Dimension of AdaHGComputation should be a multiplt of 16"); int num_heads = c_ / 16; nvinfer1::ILayer* cv1 = Conv(network, weightMap, input, c_, lname + ".cv1"); nvinfer1::ILayer* cv2 = Conv(network, weightMap, input, c_, lname + ".cv2"); nvinfer1::IShuffleLayer* m = AdaHGComputation(network, weightMap, *cv1->getOutput(0), c_, lname + ".m", num_hyperedges, num_heads, context); nvinfer1::ITensor* inputTensor[] = {m->getOutput(0), cv2->getOutput(0)}; nvinfer1::IConcatenationLayer* cat = network->addConcatenation(inputTensor, 2); nvinfer1::ILayer* cv3 = Conv(network, weightMap, *cat->getOutput(0), c2, lname + ".cv3"); return cv3; } nvinfer1::ILayer* HyperACE(nvinfer1::INetworkDefinition* network, std::map weightMap, std::vector input, int c1, int c2, std::string lname, int n, int num_hyperedges, bool dsc3k, bool shortcut, float e1, float e2, std::string context, bool channel_adjust) { int c = int(c2 * e1); nvinfer1::ILayer* fuse = FuseModule(network, weightMap, input, c1, channel_adjust, lname + ".fuse"); nvinfer1::ILayer* cv1 = Conv(network, weightMap, *fuse->getOutput(0), 3 * c, lname + ".cv1"); nvinfer1::Dims d_cv1 = cv1->getOutput(0)->getDimensions(); nvinfer1::ISliceLayer* sl0 = network->addSlice(*cv1->getOutput(0), nvinfer1::Dims4{0, 0, 0, 0}, nvinfer1::Dims4{d_cv1.d[0], d_cv1.d[1] / 3, d_cv1.d[2], d_cv1.d[3]}, 
nvinfer1::Dims4{1, 1, 1, 1}); nvinfer1::ISliceLayer* sl1 = network->addSlice(*cv1->getOutput(0), nvinfer1::Dims4{0, d_cv1.d[1] / 3, 0, 0}, nvinfer1::Dims4{d_cv1.d[0], d_cv1.d[1] / 3, d_cv1.d[2], d_cv1.d[3]}, nvinfer1::Dims4{1, 1, 1, 1}); nvinfer1::ISliceLayer* sl2 = network->addSlice(*cv1->getOutput(0), nvinfer1::Dims4{0, d_cv1.d[1] / 3 * 2, 0, 0}, nvinfer1::Dims4{d_cv1.d[0], d_cv1.d[1] / 3, d_cv1.d[2], d_cv1.d[3]}, nvinfer1::Dims4{1, 1, 1, 1}); std::vector y = {sl0->getOutput(0), sl1->getOutput(0), sl2->getOutput(0)}; nvinfer1::ILayer* out1 = C3AH(network, weightMap, *y[1], c, lname + ".branch1", e2, num_hyperedges, context); nvinfer1::ILayer* out2 = C3AH(network, weightMap, *y[1], c, lname + ".branch2", e2, num_hyperedges, context); nvinfer1::ITensor* current = y[2]; for (int i = 0; i < n; i++) { if (dsc3k) { nvinfer1::ILayer* m_ = DSC3k(network, weightMap, *current, c, 2, lname + ".m." + std::to_string(i), shortcut, 1, 0.5, 3, 7, 1); current = m_->getOutput(0); } else { nvinfer1::ILayer* m_ = DSBottleneck(network, weightMap, *current, c, c, lname + ".m." 
+ std::to_string(i), shortcut); current = m_->getOutput(0); } y.push_back(current); } y[1] = out1->getOutput(0); y.push_back(out2->getOutput(0)); nvinfer1::IConcatenationLayer* cat = network->addConcatenation(y.data(), y.size()); nvinfer1::ILayer* cv2 = Conv(network, weightMap, *cat->getOutput(0), c2, lname + ".cv2"); return cv2; } nvinfer1::ILayer* DownsampleConv(nvinfer1::INetworkDefinition* network, std::map weightMap, nvinfer1::ITensor& input, int in_channels, std::string lname, bool channel_adjust) { nvinfer1::IPoolingLayer* downsample = network->addPoolingNd(input, nvinfer1::PoolingType::kAVERAGE, nvinfer1::DimsHW{2, 2}); downsample->setStrideNd(nvinfer1::DimsHW{2, 2}); downsample->setPaddingNd(nvinfer1::DimsHW{0, 0}); if (channel_adjust) { nvinfer1::ILayer* channel_adjust_ = Conv(network, weightMap, *downsample->getOutput(0), in_channels * 2, lname + ".channel_adjust"); return channel_adjust_; } else return downsample; } nvinfer1::IElementWiseLayer* FullPad_Tunnel(nvinfer1::INetworkDefinition* network, std::map weightMap, std::vector input, std::string lname) { nvinfer1::Weights gate = weightMap[lname + ".gate"]; nvinfer1::IConstantLayer* gate_constant = network->addConstant(nvinfer1::Dims4{1, 1, 1, 1}, gate); nvinfer1::IElementWiseLayer* scaled_input_1 = network->addElementWise(*input[1], *gate_constant->getOutput(0), nvinfer1::ElementWiseOperation::kPROD); nvinfer1::IElementWiseLayer* add = network->addElementWise(*input[0], *scaled_input_1->getOutput(0), nvinfer1::ElementWiseOperation::kSUM); return add; } ================================================ FILE: yolov13/src/calibrator.cpp ================================================ #include "calibrator.h" #include #include #include #include #include "cuda_utils.h" #include "utils.h" Int8EntropyCalibrator2::Int8EntropyCalibrator2(int batchsize, int input_w, int input_h, const char* img_dir, const char* calib_table_name, const char* input_blob_name, bool read_cache) : batchsize_(batchsize), 
input_w_(input_w),
      input_h_(input_h),
      img_idx_(0),
      img_dir_(img_dir),
      calib_table_name_(calib_table_name),
      input_blob_name_(input_blob_name),
      read_cache_(read_cache) {
    input_count_ = 3 * input_w * input_h * batchsize;
    CUDA_CHECK(cudaMalloc(&device_input_, input_count_ * sizeof(float)));
    read_files_in_dir(img_dir, img_files_);
}

// Releases the device buffer allocated in the constructor.
Int8EntropyCalibrator2::~Int8EntropyCalibrator2() {
    CUDA_CHECK(cudaFree(device_input_));
}

// Returns the batch size given at construction.
int Int8EntropyCalibrator2::getBatchSize() const TRT_NOEXCEPT {
    return batchsize_;
}

// Loads the next batchsize_ images, preprocesses them, converts them to one blob scaled by
// 1/255 with the first and last channels swapped, copies the blob to the device buffer and
// publishes that buffer as bindings[0].
// Returns false when fewer than batchsize_ images remain or when an image cannot be read.
bool Int8EntropyCalibrator2::getBatch(void* bindings[], const char* names[], int nbBindings) TRT_NOEXCEPT {
    if (img_idx_ + batchsize_ > static_cast<int>(img_files_.size())) {
        return false;
    }

    std::vector<cv::Mat> input_imgs_;
    for (int i = img_idx_; i < img_idx_ + batchsize_; i++) {
        std::cout << img_files_[i] << "  " << i << std::endl;
        cv::Mat temp = cv::imread(img_dir_ + "/" + img_files_[i]);
        if (temp.empty()) {
            std::cerr << "Fatal error: image cannot open!" << std::endl;
            return false;
        }
        cv::Mat pr_img = preprocess_img(temp, input_w_, input_h_);
        input_imgs_.push_back(pr_img);
    }
    img_idx_ += batchsize_;

    cv::Mat blob = cv::dnn::blobFromImages(input_imgs_, 1.0 / 255.0, cv::Size(input_w_, input_h_),
                                           cv::Scalar(0, 0, 0), true, false);
    CUDA_CHECK(cudaMemcpy(device_input_, blob.ptr<float>(0), input_count_ * sizeof(float), cudaMemcpyHostToDevice));
    assert(!strcmp(names[0], input_blob_name_));
    bindings[0] = device_input_;
    return true;
}

// Reads the calibration table into calib_cache_ when read_cache_ is set and the file can be
// opened. Sets `length` to the number of bytes read and returns a pointer to them, or nullptr
// when nothing was read.
const void* Int8EntropyCalibrator2::readCalibrationCache(size_t& length) TRT_NOEXCEPT {
    std::cout << "reading calib cache: " << calib_table_name_ << std::endl;
    calib_cache_.clear();
    std::ifstream input(calib_table_name_, std::ios::binary);
    // Keep whitespace bytes: the cache is copied verbatim.
    input >> std::noskipws;
    if (read_cache_ && input.good()) {
        std::copy(std::istream_iterator<char>(input), std::istream_iterator<char>(),
                  std::back_inserter(calib_cache_));
    }
    length = calib_cache_.size();
    return length ?
calib_cache_.data() : nullptr; } void Int8EntropyCalibrator2::writeCalibrationCache(const void* cache, size_t length) TRT_NOEXCEPT { std::cout << "writing calib cache: " << calib_table_name_ << " size: " << length << std::endl; std::ofstream output(calib_table_name_, std::ios::binary); output.write(reinterpret_cast(cache), length); } ================================================ FILE: yolov13/src/model.cpp ================================================ #include #include #include "block.h" #include "calibrator.h" #include "config.h" #include "model.h" static int get_width(int x, float gw, int max_channels, int divisor = 8) { auto channel = std::min(x, max_channels); channel = int(ceil((channel * gw) / divisor)) * divisor; return channel; } static int get_depth(int x, float gd) { if (x == 1) return 1; int r = round(x * gd); if (x * gd - int(x * gd) == 0.5 && (int(x * gd) % 2) == 0) --r; return std::max(r, 1); } // Unused functions removed: convBnSiLUProto, Proto, cv4_conv_combined void calculateStrides(nvinfer1::IElementWiseLayer* conv_layers[], int size, int reference_size, int strides[]) { for (int i = 0; i < size; ++i) { nvinfer1::ILayer* layer = conv_layers[i]; nvinfer1::Dims dims = layer->getOutput(0)->getDimensions(); int feature_map_size = dims.d[2]; strides[i] = reference_size / feature_map_size; } } void calculateStrides(nvinfer1::ILayer* conv_layers[], int size, int reference_size, int strides[]) { for (int i = 0; i < size; ++i) { nvinfer1::ILayer* layer = conv_layers[i]; nvinfer1::Dims dims = layer->getOutput(0)->getDimensions(); int feature_map_size = dims.d[2]; strides[i] = reference_size / feature_map_size; } } nvinfer1::IHostMemory* buildEngineYolov13Det(nvinfer1::IBuilder* builder, nvinfer1::IBuilderConfig* config, nvinfer1::DataType dt, const std::string& wts_path, float& gd, float& gw, int& max_channels, std::string& type) { std::cout << "The number of the KNumClass is " << kNumClass << std::endl; std::map weightMap = loadWeights(wts_path); 
nvinfer1::INetworkDefinition* network = builder->createNetworkV2( 1U << static_cast(nvinfer1::NetworkDefinitionCreationFlag::kEXPLICIT_BATCH)); // ===================== input =================================================== nvinfer1::ITensor* data = network->addInput(kInputTensorName, dt, nvinfer1::Dims4{kBatchSize, 3, kInputH, kInputW}); assert(data); // ===================== backbone =================================================== nvinfer1::ILayer* conv0 = Conv(network, weightMap, *data, get_width(64, gw, max_channels), "model.0", 3, 2); nvinfer1::ILayer* conv1 = Conv(network, weightMap, *conv0->getOutput(0), get_width(128, gw, max_channels), "model.1", 3, 2, 1, 2); bool dsc3k = false; float mlp_ratio = 2.0; bool residual = false; bool channel_adjust = true; if (type == "l" || type == "x") { mlp_ratio = 1.5; residual = true; dsc3k = true; channel_adjust = false; } nvinfer1::ILayer* conv2 = DSC3K2(network, weightMap, *conv1->getOutput(0), get_width(256, gw, max_channels), "model.2", get_depth(2, gd), dsc3k, 0.25); nvinfer1::IElementWiseLayer* conv3 = convBnSiLU(network, weightMap, *conv2->getOutput(0), get_width(256, gw, max_channels), {3, 3}, 2, "model.3", 1, 4); nvinfer1::ILayer* conv4 = DSC3K2(network, weightMap, *conv3->getOutput(0), get_width(512, gw, max_channels), "model.4", get_depth(2, gd), dsc3k, 0.25); nvinfer1::IElementWiseLayer* conv5 = DSConv(network, weightMap, *conv4->getOutput(0), get_width(512, gw, max_channels), get_width(512, gw, max_channels), "model.5", 3, 2); nvinfer1::ILayer* conv6 = A2C2f(network, weightMap, *conv5->getOutput(0), get_width(512, gw, max_channels), get_depth(4, gd), "model.6", true, 4, residual, mlp_ratio); nvinfer1::IElementWiseLayer* conv7 = DSConv(network, weightMap, *conv6->getOutput(0), get_width(512, gw, max_channels), get_width(1024, gw, max_channels), "model.7", 3, 2); nvinfer1::ILayer* conv8 = A2C2f(network, weightMap, *conv7->getOutput(0), get_width(1024, gw, max_channels), get_depth(4, gd), "model.8", true, 
1, residual, mlp_ratio); //========================= neck ==================================================================== float scale[] = {1.0, 1.0, 2.0, 2.0}; int num_hyperedges = 8; if (type == "n") { num_hyperedges *= 0.5; } else if (type == "x") { num_hyperedges *= 1.5; } nvinfer1::ILayer* conv9 = HyperACE(network, weightMap, {conv4->getOutput(0), conv6->getOutput(0), conv8->getOutput(0)}, get_width(512, gw, max_channels), get_width(512, gw, max_channels), "model.9", get_depth(2, gd), num_hyperedges, true, true, 0.5, 1, "both", channel_adjust); auto input_dims = conv9->getOutput(0)->getDimensions(); nvinfer1::IResizeLayer* upsample10 = network->addResize(*conv9->getOutput(0)); assert(upsample10); upsample10->setResizeMode(nvinfer1::ResizeMode::kNEAREST); upsample10->setOutputDimensions( nvinfer1::Dims4{input_dims.d[0], input_dims.d[1], input_dims.d[2] * 2, input_dims.d[3] * 2}); nvinfer1::ILayer* downsample11 = DownsampleConv(network, weightMap, *conv9->getOutput(0), get_width(512, gw, max_channels), "model.11", channel_adjust); nvinfer1::IElementWiseLayer* conv12 = // conv6:(1, 128, 40, 40) conv9: (1, 128, 40, 40) FullPad_Tunnel(network, weightMap, {conv6->getOutput(0), conv9->getOutput(0)}, "model.12"); nvinfer1::IElementWiseLayer* conv13 = FullPad_Tunnel(network, weightMap, {conv4->getOutput(0), upsample10->getOutput(0)}, "model.13"); nvinfer1::IElementWiseLayer* conv14 = FullPad_Tunnel(network, weightMap, {conv8->getOutput(0), downsample11->getOutput(0)}, "model.14"); nvinfer1::IResizeLayer* upsample15 = network->addResize(*conv14->getOutput(0)); assert(upsample15); upsample15->setResizeMode(nvinfer1::ResizeMode::kNEAREST); upsample15->setScales(scale, 4); nvinfer1::ITensor* inputTensors16[] = {upsample15->getOutput(0), conv12->getOutput(0)}; nvinfer1::IConcatenationLayer* cat16 = network->addConcatenation(inputTensors16, 2); nvinfer1::ILayer* conv17 = DSC3K2(network, weightMap, *cat16->getOutput(0), get_width(512, gw, max_channels), "model.17", 
get_depth(2, gd), true); nvinfer1::IElementWiseLayer* conv18 = FullPad_Tunnel(network, weightMap, {conv17->getOutput(0), conv9->getOutput(0)}, "model.18"); nvinfer1::IResizeLayer* upsample19 = network->addResize(*conv17->getOutput(0)); assert(upsample19); upsample19->setScales(scale, 4); upsample19->setResizeMode(nvinfer1::ResizeMode::kNEAREST); nvinfer1::ITensor* inputTensors20[] = {upsample19->getOutput(0), conv13->getOutput(0)}; nvinfer1::IConcatenationLayer* cat20 = network->addConcatenation(inputTensors20, 2); nvinfer1::ILayer* conv21 = DSC3K2(network, weightMap, *cat20->getOutput(0), get_width(256, gw, max_channels), "model.21", get_depth(2, gd), true); nvinfer1::ILayer* conv22 = Conv(network, weightMap, *upsample10->getOutput(0), get_width(256, gw, max_channels), "model.22"); nvinfer1::IElementWiseLayer* conv23 = FullPad_Tunnel(network, weightMap, {conv21->getOutput(0), conv22->getOutput(0)}, "model.23"); nvinfer1::ILayer* conv24 = Conv(network, weightMap, *conv23->getOutput(0), get_width(256, gw, max_channels), "model.24", 3, 2); nvinfer1::ITensor* inputTensors25[] = {conv24->getOutput(0), conv18->getOutput(0)}; nvinfer1::IConcatenationLayer* cat25 = network->addConcatenation(inputTensors25, 2); nvinfer1::ILayer* conv26 = DSC3K2(network, weightMap, *cat25->getOutput(0), get_width(512, gw, max_channels), "model.26", get_depth(2, gd), true); nvinfer1::IElementWiseLayer* conv27 = FullPad_Tunnel(network, weightMap, {conv26->getOutput(0), conv9->getOutput(0)}, "model.27"); nvinfer1::ILayer* conv28 = Conv(network, weightMap, *conv26->getOutput(0), get_width(512, gw, max_channels), "model.28", 3, 2); nvinfer1::ITensor* inputTensors29[] = {conv28->getOutput(0), conv14->getOutput(0)}; nvinfer1::IConcatenationLayer* cat29 = network->addConcatenation(inputTensors29, 2); nvinfer1::ILayer* conv30 = DSC3K2(network, weightMap, *cat29->getOutput(0), get_width(1024, gw, max_channels), "model.30", get_depth(2, gd), true); nvinfer1::IElementWiseLayer* conv31 = 
FullPad_Tunnel(network, weightMap, {conv30->getOutput(0), downsample11->getOutput(0)}, "model.31"); // =============================== output =================================================================== int c2 = std::max(std::max(16, get_width(256, gw, max_channels) / 4), 16 * 4); int c3 = std::max(get_width(256, gw, max_channels), std::min(kNumClass, 100)); // output0 location nvinfer1::IElementWiseLayer* conv32_cv2_0_0 = convBnSiLU(network, weightMap, *conv23->getOutput(0), c2, {3, 3}, 1, "model.32.cv2.0.0"); nvinfer1::IElementWiseLayer* conv32_cv2_0_1 = convBnSiLU(network, weightMap, *conv32_cv2_0_0->getOutput(0), c2, {3, 3}, 1, "model.32.cv2.0.1"); nvinfer1::IConvolutionLayer* conv32_cv2_0_2 = network->addConvolutionNd(*conv32_cv2_0_1->getOutput(0), 64, nvinfer1::DimsHW{1, 1}, weightMap["model.32.cv2.0.2.weight"], weightMap["model.32.cv2.0.2.bias"]); conv32_cv2_0_2->setStrideNd(nvinfer1::DimsHW{1, 1}); conv32_cv2_0_2->setPaddingNd(nvinfer1::DimsHW{0, 0}); // output0 classes auto* conv32_cv3_0_0_0 = DWConv(network, weightMap, *conv23->getOutput(0), get_width(256, gw, max_channels), {3, 3}, 1, "model.32.cv3.0.0.0"); nvinfer1::IElementWiseLayer* conv32_cv3_0_0_1 = convBnSiLU(network, weightMap, *conv32_cv3_0_0_0->getOutput(0), c3, {1, 1}, 1, "model.32.cv3.0.0.1"); auto* conv32_cv3_0_1_0 = DWConv(network, weightMap, *conv32_cv3_0_0_1->getOutput(0), c3, {3, 3}, 1, "model.32.cv3.0.1.0"); nvinfer1::IElementWiseLayer* conv32_cv3_0_1_1 = convBnSiLU(network, weightMap, *conv32_cv3_0_1_0->getOutput(0), c3, {1, 1}, 1, "model.32.cv3.0.1.1"); nvinfer1::IConvolutionLayer* conv32_cv3_0_1_2 = network->addConvolutionNd(*conv32_cv3_0_1_1->getOutput(0), kNumClass, nvinfer1::DimsHW{1, 1}, weightMap["model.32.cv3.0.2.weight"], weightMap["model.32.cv3.0.2.bias"]); conv32_cv3_0_1_2->setPaddingNd(nvinfer1::DimsHW{0, 0}); conv32_cv3_0_1_2->setStrideNd(nvinfer1::DimsHW{1, 1}); nvinfer1::ITensor* inputTensors32_0[] = {conv32_cv2_0_2->getOutput(0), conv32_cv3_0_1_2->getOutput(0)}; 
nvinfer1::IConcatenationLayer* cat32_0 = network->addConcatenation(inputTensors32_0, 2); // out1 location nvinfer1::IElementWiseLayer* conv32_cv2_1_0 = convBnSiLU(network, weightMap, *conv27->getOutput(0), c2, {3, 3}, 1, "model.32.cv2.1.0"); nvinfer1::IElementWiseLayer* conv32_cv2_1_1 = convBnSiLU(network, weightMap, *conv32_cv2_1_0->getOutput(0), c2, {3, 3}, 1, "model.32.cv2.1.1"); nvinfer1::IConvolutionLayer* conv32_cv2_1_2 = network->addConvolutionNd(*conv32_cv2_1_1->getOutput(0), 64, nvinfer1::DimsHW{1, 1}, weightMap["model.32.cv2.1.2.weight"], weightMap["model.32.cv2.1.2.bias"]); conv32_cv2_1_2->setStrideNd(nvinfer1::DimsHW{1, 1}); conv32_cv2_1_2->setPaddingNd(nvinfer1::DimsHW{0, 0}); // out1 classes auto* conv32_cv3_1_0_0 = DWConv(network, weightMap, *conv27->getOutput(0), get_width(512, gw, max_channels), {3, 3}, 1, "model.32.cv3.1.0.0"); nvinfer1::IElementWiseLayer* conv32_cv3_1_0_1 = convBnSiLU(network, weightMap, *conv32_cv3_1_0_0->getOutput(0), c3, {1, 1}, 1, "model.32.cv3.1.0.1"); auto* conv32_cv3_1_1_0 = DWConv(network, weightMap, *conv32_cv3_1_0_1->getOutput(0), c3, {3, 3}, 1, "model.32.cv3.1.1.0"); nvinfer1::IElementWiseLayer* conv32_cv3_1_1_1 = convBnSiLU(network, weightMap, *conv32_cv3_1_1_0->getOutput(0), c3, {1, 1}, 1, "model.32.cv3.1.1.1"); nvinfer1::IConvolutionLayer* conv32_cv3_1_1_2 = network->addConvolutionNd(*conv32_cv3_1_1_1->getOutput(0), kNumClass, nvinfer1::DimsHW{1, 1}, weightMap["model.32.cv3.1.2.weight"], weightMap["model.32.cv3.1.2.bias"]); conv32_cv3_1_1_2->setPaddingNd(nvinfer1::DimsHW{0, 0}); conv32_cv3_1_1_2->setStrideNd(nvinfer1::DimsHW{1, 1}); nvinfer1::ITensor* inputTensors32_1[] = {conv32_cv2_1_2->getOutput(0), conv32_cv3_1_1_2->getOutput(0)}; nvinfer1::IConcatenationLayer* cat32_1 = network->addConcatenation(inputTensors32_1, 2); // out2 location nvinfer1::IElementWiseLayer* conv32_cv2_2_0 = convBnSiLU(network, weightMap, *conv31->getOutput(0), c2, {3, 3}, 1, "model.32.cv2.2.0"); nvinfer1::IElementWiseLayer* conv32_cv2_2_1 
= convBnSiLU(network, weightMap, *conv32_cv2_2_0->getOutput(0), c2, {3, 3}, 1, "model.32.cv2.2.1"); nvinfer1::IConvolutionLayer* conv32_cv2_2_2 = network->addConvolutionNd(*conv32_cv2_2_1->getOutput(0), 64, nvinfer1::DimsHW{1, 1}, weightMap["model.32.cv2.2.2.weight"], weightMap["model.32.cv2.2.2.bias"]); conv32_cv2_2_2->setStrideNd(nvinfer1::DimsHW{1, 1}); conv32_cv2_2_2->setPaddingNd(nvinfer1::DimsHW{0, 0}); // out2 classes auto* conv32_cv3_2_0_0 = DWConv(network, weightMap, *conv31->getOutput(0), get_width(1024, gw, max_channels), {3, 3}, 1, "model.32.cv3.2.0.0"); nvinfer1::IElementWiseLayer* conv32_cv3_2_0_1 = convBnSiLU(network, weightMap, *conv32_cv3_2_0_0->getOutput(0), c3, {1, 1}, 1, "model.32.cv3.2.0.1"); auto* conv32_cv3_2_1_0 = DWConv(network, weightMap, *conv32_cv3_2_0_1->getOutput(0), c3, {3, 3}, 1, "model.32.cv3.2.1.0"); nvinfer1::IElementWiseLayer* conv32_cv3_2_1_1 = convBnSiLU(network, weightMap, *conv32_cv3_2_1_0->getOutput(0), c3, {1, 1}, 1, "model.32.cv3.2.1.1"); nvinfer1::IConvolutionLayer* conv32_cv3_2_1_2 = network->addConvolutionNd(*conv32_cv3_2_1_1->getOutput(0), kNumClass, nvinfer1::DimsHW{1, 1}, weightMap["model.32.cv3.2.2.weight"], weightMap["model.32.cv3.2.2.bias"]); conv32_cv3_2_1_2->setStrideNd(nvinfer1::DimsHW{1, 1}); conv32_cv3_2_1_2->setPaddingNd(nvinfer1::DimsHW{0, 0}); nvinfer1::ITensor* inputTensor32_2[] = {conv32_cv2_2_2->getOutput(0), conv32_cv3_2_1_2->getOutput(0)}; nvinfer1::IConcatenationLayer* cat32_2 = network->addConcatenation(inputTensor32_2, 2); // ============================================ yolov13 detect ========================================= nvinfer1::IElementWiseLayer* conv_layers[] = {conv3, conv5, conv7}; int strides[sizeof(conv_layers) / sizeof(conv_layers[0])]; calculateStrides(conv_layers, sizeof(conv_layers) / sizeof(conv_layers[0]), kInputH, strides); int stridesLength = sizeof(strides) / sizeof(int); nvinfer1::IShuffleLayer* shuffle32_0 = network->addShuffle(*cat32_0->getOutput(0)); 
shuffle32_0->setReshapeDimensions( nvinfer1::Dims3{kBatchSize, 64 + kNumClass, (kInputH / strides[0]) * (kInputW / strides[0])}); nvinfer1::ISliceLayer* split32_0_0 = network->addSlice( *shuffle32_0->getOutput(0), nvinfer1::Dims3{0, 0, 0}, nvinfer1::Dims3{kBatchSize, 64, (kInputH / strides[0]) * (kInputW / strides[0])}, nvinfer1::Dims3{1, 1, 1}); nvinfer1::ISliceLayer* split32_0_1 = network->addSlice(*shuffle32_0->getOutput(0), nvinfer1::Dims3{0, 64, 0}, nvinfer1::Dims3{kBatchSize, kNumClass, (kInputH / strides[0]) * (kInputW / strides[0])}, nvinfer1::Dims3{1, 1, 1}); nvinfer1::IShuffleLayer* dfl32_0 = DFL(network, weightMap, *split32_0_0->getOutput(0), 4, (kInputH / strides[0]) * (kInputW / strides[0]), 1, 1, 0, "model.32.dfl.conv.weight"); nvinfer1::ITensor* inputTensor32_dfl_0[] = {dfl32_0->getOutput(0), split32_0_1->getOutput(0)}; nvinfer1::IConcatenationLayer* cat32_dfl_0 = network->addConcatenation(inputTensor32_dfl_0, 2); cat32_dfl_0->setAxis(1); nvinfer1::IShuffleLayer* shuffle32_1 = network->addShuffle(*cat32_1->getOutput(0)); shuffle32_1->setReshapeDimensions( nvinfer1::Dims3{kBatchSize, 64 + kNumClass, (kInputH / strides[1]) * (kInputW / strides[1])}); nvinfer1::ISliceLayer* split32_1_0 = network->addSlice( *shuffle32_1->getOutput(0), nvinfer1::Dims3{0, 0, 0}, nvinfer1::Dims3{kBatchSize, 64, (kInputH / strides[1]) * (kInputW / strides[1])}, nvinfer1::Dims3{1, 1, 1}); nvinfer1::ISliceLayer* split32_1_1 = network->addSlice(*shuffle32_1->getOutput(0), nvinfer1::Dims3{0, 64, 0}, nvinfer1::Dims3{kBatchSize, kNumClass, (kInputH / strides[1]) * (kInputW / strides[1])}, nvinfer1::Dims3{1, 1, 1}); nvinfer1::IShuffleLayer* dfl32_1 = DFL(network, weightMap, *split32_1_0->getOutput(0), 4, (kInputH / strides[1]) * (kInputW / strides[1]), 1, 1, 0, "model.32.dfl.conv.weight"); nvinfer1::ITensor* inputTensor32_dfl_1[] = {dfl32_1->getOutput(0), split32_1_1->getOutput(0)}; nvinfer1::IConcatenationLayer* cat32_dfl_1 = network->addConcatenation(inputTensor32_dfl_1, 2); 
cat32_dfl_1->setAxis(1); nvinfer1::IShuffleLayer* shuffle32_2 = network->addShuffle(*cat32_2->getOutput(0)); shuffle32_2->setReshapeDimensions( nvinfer1::Dims3{kBatchSize, 64 + kNumClass, (kInputH / strides[2]) * (kInputW / strides[2])}); nvinfer1::ISliceLayer* split32_2_0 = network->addSlice( *shuffle32_2->getOutput(0), nvinfer1::Dims3{0, 0, 0}, nvinfer1::Dims3{kBatchSize, 64, (kInputH / strides[2]) * (kInputW / strides[2])}, nvinfer1::Dims3{1, 1, 1}); nvinfer1::ISliceLayer* split32_2_1 = network->addSlice(*shuffle32_2->getOutput(0), nvinfer1::Dims3{0, 64, 0}, nvinfer1::Dims3{kBatchSize, kNumClass, (kInputH / strides[2]) * (kInputW / strides[2])}, nvinfer1::Dims3{1, 1, 1}); nvinfer1::IShuffleLayer* dfl32_2 = DFL(network, weightMap, *split32_2_0->getOutput(0), 4, (kInputH / strides[2]) * (kInputW / strides[2]), 1, 1, 0, "model.32.dfl.conv.weight"); nvinfer1::ITensor* inputTensor32_dfl_2[] = {dfl32_2->getOutput(0), split32_2_1->getOutput(0)}; nvinfer1::IConcatenationLayer* cat32_dfl_2 = network->addConcatenation(inputTensor32_dfl_2, 2); cat32_dfl_2->setAxis(1); std::cout << " There are " << weightMap.size() << " layers parameters in the network!!!" << endl; nvinfer1::IPluginV2Layer* yolo = addYoLoLayer(network, std::vector{cat32_dfl_0, cat32_dfl_1, cat32_dfl_2}, strides, stridesLength); yolo->getOutput(0)->setName(kOutputTensorName); network->markOutput(*yolo->getOutput(0)); config->setMemoryPoolLimit(nvinfer1::MemoryPoolType::kWORKSPACE, 16 * (1 << 20)); #if defined(USE_FP16) config->setFlag(nvinfer1::BuilderFlag::kFP16); #elif defined(USE_INT8) std::cout << "Your platform support int8: " << (builder->platformHasFastInt8() ? 
"true" : "false") << std::endl; assert(builder->platformHasFastInt8()); config->setFlag(nvinfer1::BuilderFlag::kINT8); auto* calibrator = new Int8EntropyCalibrator2(kBatchSize, kInputW, kInputH, kInputQuantizationFolder, "int8calib.table", kInputTensorName); config->setInt8Calibrator(calibrator); #endif std::cout << "Building engine, please wait for a while..." << std::endl; nvinfer1::IHostMemory* serialized_model = builder->buildSerializedNetwork(*network, *config); std::cout << "Build engine successfully!" << std::endl; delete network; for (auto& mem : weightMap) { free((void*)(mem.second.values)); } return serialized_model; } ================================================ FILE: yolov13/src/postprocess.cpp ================================================ #include "postprocess.h" #include "utils.h" cv::Rect get_rect(cv::Mat& img, float bbox[4]) { float l, r, t, b; float r_w = kInputW / (img.cols * 1.0); float r_h = kInputH / (img.rows * 1.0); if (r_h > r_w) { l = bbox[0]; r = bbox[2]; t = bbox[1] - (kInputH - r_w * img.rows) / 2; b = bbox[3] - (kInputH - r_w * img.rows) / 2; l = l / r_w; r = r / r_w; t = t / r_w; b = b / r_w; } else { l = bbox[0] - (kInputW - r_h * img.cols) / 2; r = bbox[2] - (kInputW - r_h * img.cols) / 2; t = bbox[1]; b = bbox[3]; l = l / r_h; r = r / r_h; t = t / r_h; b = b / r_h; } l = std::max(0.0f, l); t = std::max(0.0f, t); int width = std::max(0, std::min(int(round(r - l)), img.cols - int(round(l)))); int height = std::max(0, std::min(int(round(b - t)), img.rows - int(round(t)))); return cv::Rect(int(round(l)), int(round(t)), width, height); } static float iou(float lbox[4], float rbox[4]) { float interBox[] = { (std::max)(lbox[0], rbox[0]), (std::min)(lbox[2], rbox[2]), (std::max)(lbox[1], rbox[1]), (std::min)(lbox[3], rbox[3]), }; if (interBox[2] > interBox[3] || interBox[0] > interBox[1]) return 0.0f; float interBoxS = (interBox[1] - interBox[0]) * (interBox[3] - interBox[2]); float unionBoxS = (lbox[2] - lbox[0]) * (lbox[3] - 
lbox[1]) + (rbox[2] - rbox[0]) * (rbox[3] - rbox[1]) - interBoxS; return interBoxS / unionBoxS; } static bool cmp(const Detection& a, const Detection& b) { if (a.conf == b.conf) { return a.bbox[0] < b.bbox[0]; } return a.conf > b.conf; } void nms(std::vector& res, float* output, float conf_thresh, float nms_thresh) { int det_size = sizeof(Detection) / sizeof(float); std::map> m; for (int i = 0; i < output[0]; i++) { if (output[1 + det_size * i + 4] <= conf_thresh || isnan(output[1 + det_size * i + 4])) continue; Detection det; memcpy(&det, &output[1 + det_size * i], det_size * sizeof(float)); if (m.count(det.class_id) == 0) m.emplace(det.class_id, std::vector()); m[det.class_id].push_back(det); } for (auto it = m.begin(); it != m.end(); it++) { auto& dets = it->second; std::sort(dets.begin(), dets.end(), cmp); for (size_t m = 0; m < dets.size(); ++m) { auto& item = dets[m]; res.push_back(item); for (size_t n = m + 1; n < dets.size(); ++n) { if (iou(item.bbox, dets[n].bbox) > nms_thresh) { dets.erase(dets.begin() + n); --n; } } } } } void batch_nms(std::vector>& res_batch, float* output, int batch_size, int output_size, float conf_thresh, float nms_thresh) { res_batch.resize(batch_size); for (int i = 0; i < batch_size; i++) { nms(res_batch[i], &output[i * output_size], conf_thresh, nms_thresh); } } void process_decode_ptr_host(std::vector& res, const float* decode_ptr_host, int bbox_element, cv::Mat& img, int count) { Detection det; for (int i = 0; i < count; i++) { int basic_pos = 1 + i * bbox_element; int keep_flag = decode_ptr_host[basic_pos + 6]; if (keep_flag == 1) { det.bbox[0] = decode_ptr_host[basic_pos + 0]; det.bbox[1] = decode_ptr_host[basic_pos + 1]; det.bbox[2] = decode_ptr_host[basic_pos + 2]; det.bbox[3] = decode_ptr_host[basic_pos + 3]; det.conf = decode_ptr_host[basic_pos + 4]; det.class_id = decode_ptr_host[basic_pos + 5]; res.push_back(det); } } } void batch_process(std::vector>& res_batch, const float* decode_ptr_host, int batch_size, int 
bbox_element, const std::vector& img_batch) { res_batch.resize(batch_size); int count = static_cast(*decode_ptr_host); count = std::min(count, kMaxNumOutputBbox); for (int i = 0; i < batch_size; i++) { auto& img = const_cast(img_batch[i]); process_decode_ptr_host(res_batch[i], &decode_ptr_host[i * count], bbox_element, img, count); } } void draw_bbox(std::vector& img_batch, std::vector>& res_batch) { for (size_t i = 0; i < img_batch.size(); i++) { auto& res = res_batch[i]; cv::Mat img = img_batch[i]; for (size_t j = 0; j < res.size(); j++) { cv::Rect r = get_rect(img, res[j].bbox); cv::rectangle(img, r, cv::Scalar(0x27, 0xC1, 0x36), 2); cv::putText(img, std::to_string((int)res[j].class_id), cv::Point(r.x, r.y - 1), cv::FONT_HERSHEY_PLAIN, 1.2, cv::Scalar(0xFF, 0xFF, 0xFF), 2); } } } ================================================ FILE: yolov13/src/postprocess.cu ================================================ // // Created by lindsay on 23-7-17. // #include "postprocess.h" #include "types.h" static __global__ void decode_kernel(float* predict, int num_bboxes, float confidence_threshold, float* parray, int max_objects) { float count = predict[0]; int position = (blockDim.x * blockIdx.x + threadIdx.x); if (position >= count) return; float* pitem = predict + 1 + position * (sizeof(Detection) / sizeof(float)); int index = atomicAdd(parray, 1); char* pout_item_char = (char*)parray + sizeof(float) + index * bbox_element * sizeof(float); float* pout_item = (float*)pout_item_char; // Wait, let's look at how parray is used. // In original code: // float* pout_item = parray + 1 + index * bbox_element; // But parray[0] is count. So parray + 1 is start of data. // Ensure this matches usage in nms_kernel. if (index >= max_objects) return; float confidence = pitem[4]; if (confidence < confidence_threshold) return; float left = pitem[0]; float top = pitem[1]; float right = pitem[2]; float bottom = pitem[3]; float label = pitem[5]; // Re-verify pointer arithmetic. 
// parray is float*. 1 is float size. // index * bbox_element is float offset. float* out_ptr = parray + 1 + index * bbox_element; *out_ptr++ = left; *out_ptr++ = top; *out_ptr++ = right; *out_ptr++ = bottom; *out_ptr++ = confidence; *out_ptr++ = label; *out_ptr++ = 1; // 1 = keep, 0 = ignore } static __device__ float box_iou(float aleft, float atop, float aright, float abottom, float bleft, float btop, float bright, float bbottom) { float cleft = max(aleft, bleft); float ctop = max(atop, btop); float cright = min(aright, bright); float cbottom = min(abottom, bbottom); float c_area = max(cright - cleft, 0.0f) * max(cbottom - ctop, 0.0f); if (c_area == 0.0f) return 0.0f; float a_area = max(0.0f, aright - aleft) * max(0.0f, abottom - atop); float b_area = max(0.0f, bright - bleft) * max(0.0f, bbottom - btop); return c_area / (a_area + b_area - c_area); } static __global__ void nms_kernel(float* bboxes, int max_objects, float threshold) { int position = (blockDim.x * blockIdx.x + threadIdx.x); int count = min((int)*bboxes, max_objects); if (position >= count) return; float* pcurrent = bboxes + 1 + position * bbox_element; for (int i = 0; i < count; ++i) { float* pitem = bboxes + 1 + i * bbox_element; if (i == position || pcurrent[5] != pitem[5]) continue; if (pitem[4] >= pcurrent[4]) { if (pitem[4] == pcurrent[4] && i < position) continue; float iou = box_iou(pcurrent[0], pcurrent[1], pcurrent[2], pcurrent[3], pitem[0], pitem[1], pitem[2], pitem[3]); if (iou > threshold) { pcurrent[6] = 0; return; } } } } void cuda_decode(float* predict, int num_bboxes, float confidence_threshold, float* parray, int max_objects, cudaStream_t stream) { int block = 256; int grid = ceil(num_bboxes / (float)block); decode_kernel<<>>((float*)predict, num_bboxes, confidence_threshold, parray, max_objects); } void cuda_nms(float* parray, float nms_threshold, int max_objects, cudaStream_t stream) { int block = max_objects < 256 ? 
max_objects : 256; int grid = ceil(max_objects / (float)block); nms_kernel<<>>(parray, max_objects, nms_threshold); } ================================================ FILE: yolov13/src/preprocess.cu ================================================ #include "cuda_utils.h" #include "preprocess.h" static uint8_t* img_buffer_host = nullptr; static uint8_t* img_buffer_device = nullptr; __global__ void warpaffine_kernel(uint8_t* src, int src_line_size, int src_width, int src_height, float* dst, int dst_width, int dst_height, uint8_t const_value_st, AffineMatrix d2s, int edge) { int position = blockDim.x * blockIdx.x + threadIdx.x; if (position >= edge) return; float m_x1 = d2s.value[0]; float m_y1 = d2s.value[1]; float m_z1 = d2s.value[2]; float m_x2 = d2s.value[3]; float m_y2 = d2s.value[4]; float m_z2 = d2s.value[5]; int dx = position % dst_width; int dy = position / dst_width; float src_x = m_x1 * dx + m_y1 * dy + m_z1 + 0.5f; float src_y = m_x2 * dx + m_y2 * dy + m_z2 + 0.5f; float c0, c1, c2; if (src_x <= -1 || src_x >= src_width || src_y <= -1 || src_y >= src_height) { // out of range c0 = const_value_st; c1 = const_value_st; c2 = const_value_st; } else { int y_low = floorf(src_y); int x_low = floorf(src_x); int y_high = y_low + 1; int x_high = x_low + 1; uint8_t const_value[] = {const_value_st, const_value_st, const_value_st}; float ly = src_y - y_low; float lx = src_x - x_low; float hy = 1 - ly; float hx = 1 - lx; float w1 = hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx; uint8_t* v1 = const_value; uint8_t* v2 = const_value; uint8_t* v3 = const_value; uint8_t* v4 = const_value; if (y_low >= 0) { if (x_low >= 0) v1 = src + y_low * src_line_size + x_low * 3; if (x_high < src_width) v2 = src + y_low * src_line_size + x_high * 3; } if (y_high < src_height) { if (x_low >= 0) v3 = src + y_high * src_line_size + x_low * 3; if (x_high < src_width) v4 = src + y_high * src_line_size + x_high * 3; } c0 = w1 * v1[0] + w2 * v2[0] + w3 * v3[0] + w4 * v4[0]; c1 = w1 * v1[1] 
+ w2 * v2[1] + w3 * v3[1] + w4 * v4[1]; c2 = w1 * v1[2] + w2 * v2[2] + w3 * v3[2] + w4 * v4[2]; } // bgr to rgb float t = c2; c2 = c0; c0 = t; // normalization c0 = c0 / 255.0f; c1 = c1 / 255.0f; c2 = c2 / 255.0f; // rgbrgbrgb to rrrgggbbb int area = dst_width * dst_height; float* pdst_c0 = dst + dy * dst_width + dx; float* pdst_c1 = pdst_c0 + area; float* pdst_c2 = pdst_c1 + area; *pdst_c0 = c0; *pdst_c1 = c1; *pdst_c2 = c2; } void cuda_preprocess(uint8_t* src, int src_width, int src_height, float* dst, int dst_width, int dst_height, cudaStream_t stream) { int img_size = src_width * src_height * 3; // copy data to pinned memory memcpy(img_buffer_host, src, img_size); // copy data to device memory CUDA_CHECK(cudaMemcpyAsync(img_buffer_device, img_buffer_host, img_size, cudaMemcpyHostToDevice, stream)); AffineMatrix s2d, d2s; float scale = std::min(dst_height / (float)src_height, dst_width / (float)src_width); s2d.value[0] = scale; s2d.value[1] = 0; s2d.value[2] = -scale * src_width * 0.5 + dst_width * 0.5; s2d.value[3] = 0; s2d.value[4] = scale; s2d.value[5] = -scale * src_height * 0.5 + dst_height * 0.5; cv::Mat m2x3_s2d(2, 3, CV_32F, s2d.value); cv::Mat m2x3_d2s(2, 3, CV_32F, d2s.value); cv::invertAffineTransform(m2x3_s2d, m2x3_d2s); memcpy(d2s.value, m2x3_d2s.ptr(0), sizeof(d2s.value)); int jobs = dst_height * dst_width; int threads = 256; int blocks = ceil(jobs / (float)threads); warpaffine_kernel<<>>(img_buffer_device, src_width * 3, src_width, src_height, dst, dst_width, dst_height, 128, d2s, jobs); } void cuda_batch_preprocess(std::vector& img_batch, float* dst, int dst_width, int dst_height, cudaStream_t stream) { int dst_size = dst_width * dst_height * 3; for (size_t i = 0; i < img_batch.size(); i++) { cuda_preprocess(img_batch[i].ptr(), img_batch[i].cols, img_batch[i].rows, &dst[dst_size * i], dst_width, dst_height, stream); CUDA_CHECK(cudaStreamSynchronize(stream)); } } void cuda_preprocess_init(int max_image_size) { // prepare input data in pinned 
memory CUDA_CHECK(cudaMallocHost((void**)&img_buffer_host, max_image_size * 3)); // prepare input data in device memory CUDA_CHECK(cudaMalloc((void**)&img_buffer_device, max_image_size * 3)); } void cuda_preprocess_destroy() { CUDA_CHECK(cudaFree(img_buffer_device)); CUDA_CHECK(cudaFreeHost(img_buffer_host)); } ================================================ FILE: yolov13/yolov13_det.cpp ================================================ #include #include #include #if defined(_WIN32) #include #include #include #else #include #include #endif #include #include "cuda_utils.h" #include "logging.h" #include "model.h" #include "postprocess.h" #include "preprocess.h" #include "utils.h" Logger gLogger; using namespace nvinfer1; const int kOutputSize = kMaxNumOutputBbox * sizeof(Detection) / sizeof(float) + 1; static std::string get_executable_dir() { #if defined(_WIN32) char buf[MAX_PATH]; DWORD len = GetModuleFileNameA(NULL, buf, MAX_PATH); if (len == 0 || len == MAX_PATH) return std::string("."); std::string path(buf, buf + len); size_t pos = path.find_last_of("\\/"); if (pos != std::string::npos) return path.substr(0, pos); return std::string("."); #else char buf[PATH_MAX]; ssize_t len = readlink("/proc/self/exe", buf, sizeof(buf) - 1); if (len == -1) return std::string("."); buf[len] = '\0'; std::string path(buf); size_t pos = path.find_last_of('/'); if (pos != std::string::npos) return path.substr(0, pos); return std::string("."); #endif } void serialize_engine(std::string& wts_name, std::string& engine_name, float& gd, float& gw, int& max_channels, std::string& type) { IBuilder* builder = createInferBuilder(gLogger); IBuilderConfig* config = builder->createBuilderConfig(); IHostMemory* serialized_engine = nullptr; serialized_engine = buildEngineYolov13Det(builder, config, DataType::kFLOAT, wts_name, gd, gw, max_channels, type); assert(serialized_engine); std::ofstream p(engine_name, std::ios::binary); if (!p) { std::cout << "could not open plan output file" << 
std::endl; assert(false); } p.write(reinterpret_cast(serialized_engine->data()), serialized_engine->size()); delete serialized_engine; delete config; delete builder; } void deserialize_engine(std::string& engine_name, IRuntime** runtime, ICudaEngine** engine, IExecutionContext** context) { std::ifstream file(engine_name, std::ios::binary); if (!file.good()) { std::cerr << "read " << engine_name << " error!" << std::endl; assert(false); } size_t size = 0; file.seekg(0, file.end); size = file.tellg(); file.seekg(0, file.beg); char* serialized_engine = new char[size]; assert(serialized_engine); file.read(serialized_engine, size); file.close(); *runtime = createInferRuntime(gLogger); assert(*runtime); *engine = (*runtime)->deserializeCudaEngine(serialized_engine, size); assert(*engine); *context = (*engine)->createExecutionContext(); assert(*context); delete[] serialized_engine; } void prepare_buffer(ICudaEngine* engine, float** input_buffer_device, float** output_buffer_device, float** output_buffer_host, float** decode_ptr_host, float** decode_ptr_device, std::string cuda_post_process) { assert(engine->getNbBindings() == 2); // In order to bind the buffers, we need to know the names of the input and output tensors. 
// Note that indices are guaranteed to be less than IEngine::getNbBindings() const int inputIndex = engine->getBindingIndex(kInputTensorName); const int outputIndex = engine->getBindingIndex(kOutputTensorName); assert(inputIndex == 0); assert(outputIndex == 1); // Create GPU buffers on device CUDA_CHECK(cudaMalloc((void**)input_buffer_device, kBatchSize * 3 * kInputH * kInputW * sizeof(float))); CUDA_CHECK(cudaMalloc((void**)output_buffer_device, kBatchSize * kOutputSize * sizeof(float))); if (cuda_post_process == "c") { *output_buffer_host = new float[kBatchSize * kOutputSize]; } else if (cuda_post_process == "g") { if (kBatchSize > 1) { std::cerr << "Do not yet support GPU post processing for multiple batches" << std::endl; exit(0); } // Allocate memory for decode_ptr_host and copy to device *decode_ptr_host = new float[1 + kMaxNumOutputBbox * bbox_element]; CUDA_CHECK(cudaMalloc((void**)decode_ptr_device, sizeof(float) * (1 + kMaxNumOutputBbox * bbox_element))); } } void infer(IExecutionContext& context, cudaStream_t& stream, void** buffers, float* output, int batchsize, float* decode_ptr_host, float* decode_ptr_device, int model_bboxes, std::string cuda_post_process) { // infer on the batch asynchronously, and DMA output back to host auto start = std::chrono::system_clock::now(); context.enqueueV2(buffers, stream, nullptr); if (cuda_post_process == "c") { CUDA_CHECK(cudaMemcpyAsync(output, buffers[1], batchsize * kOutputSize * sizeof(float), cudaMemcpyDeviceToHost, stream)); auto end = std::chrono::system_clock::now(); std::cout << "inference time: " << std::chrono::duration_cast(end - start).count() << "ms" << std::endl; } else if (cuda_post_process == "g") { CUDA_CHECK( cudaMemsetAsync(decode_ptr_device, 0, sizeof(float) * (1 + kMaxNumOutputBbox * bbox_element), stream)); cuda_decode((float*)buffers[1], model_bboxes, kConfThresh, decode_ptr_device, kMaxNumOutputBbox, stream); cuda_nms(decode_ptr_device, kNmsThresh, kMaxNumOutputBbox, stream); //cuda nms 
CUDA_CHECK(cudaMemcpyAsync(decode_ptr_host, decode_ptr_device, sizeof(float) * (1 + kMaxNumOutputBbox * bbox_element), cudaMemcpyDeviceToHost, stream)); auto end = std::chrono::system_clock::now(); std::cout << "inference and gpu postprocess time: " << std::chrono::duration_cast(end - start).count() << "ms" << std::endl; } CUDA_CHECK(cudaStreamSynchronize(stream)); } bool parse_args(int argc, char** argv, std::string& wts, std::string& engine, std::string& img_dir, std::string& type, std::string& cuda_post_process, float& gd, float& gw, int& max_channels) { if (argc < 4) return false; if (std::string(argv[1]) == "-s" && (argc == 5)) { wts = std::string(argv[2]); engine = std::string(argv[3]); auto sub_type = std::string(argv[4]); if (sub_type[0] == 'n') { gd = 0.50; gw = 0.25; max_channels = 1024; type = "n"; } else if (sub_type[0] == 's') { gd = 0.50; gw = 0.50; max_channels = 1024; type = "s"; } else if (sub_type[0] == 'l') { gd = 1.0; gw = 1.0; max_channels = 512; type = "l"; } else if (sub_type[0] == 'x') { gd = 1.0; gw = 1.50; max_channels = 512; type = "x"; } else { return false; } } else if (std::string(argv[1]) == "-d" && argc == 5) { engine = std::string(argv[2]); img_dir = std::string(argv[3]); cuda_post_process = std::string(argv[4]); } else { return false; } return true; } int main(int argc, char** argv) { cudaSetDevice(kGpuId); std::string wts_name; std::string engine_name; std::string img_dir; std::string cuda_post_process; std::string type; int model_bboxes; float gd = 0, gw = 0; int max_channels = 0; if (!parse_args(argc, argv, wts_name, engine_name, img_dir, type, cuda_post_process, gd, gw, max_channels)) { std::cerr << "Arguments not right!" 
<< std::endl; std::cerr << "./yolov13-det -s [.wts] [.engine] [n/s/l/x] // serialize model to " "plan file" << std::endl; std::cerr << "./yolov13-det -d [.engine] ../images [c/g]// deserialize plan file and run inference" << std::endl; return -1; } // Create a model using the API directly and serialize it to a file if (!wts_name.empty()) { serialize_engine(wts_name, engine_name, gd, gw, max_channels, type); return 0; } // Deserialize the engine from file IRuntime* runtime = nullptr; ICudaEngine* engine = nullptr; IExecutionContext* context = nullptr; deserialize_engine(engine_name, &runtime, &engine, &context); cudaStream_t stream; CUDA_CHECK(cudaStreamCreate(&stream)); cuda_preprocess_init(kMaxInputImageSize); auto out_dims = engine->getBindingDimensions(1); model_bboxes = out_dims.d[0]; // Prepare cpu and gpu buffers float* device_buffers[2]; float* output_buffer_host = nullptr; float* decode_ptr_host = nullptr; float* decode_ptr_device = nullptr; // Read images from directory std::vector file_names; if (read_files_in_dir(img_dir.c_str(), file_names) < 0) { std::cerr << "read_files_in_dir failed." 
<< std::endl; return -1; } prepare_buffer(engine, &device_buffers[0], &device_buffers[1], &output_buffer_host, &decode_ptr_host, &decode_ptr_device, cuda_post_process); // batch predict for (size_t i = 0; i < file_names.size(); i += kBatchSize) { // Get a batch of images std::vector img_batch; std::vector img_name_batch; for (size_t j = i; j < i + kBatchSize && j < file_names.size(); j++) { cv::Mat img = cv::imread(img_dir + "/" + file_names[j]); img_batch.push_back(img); img_name_batch.push_back(file_names[j]); } // Preprocess cuda_batch_preprocess(img_batch, device_buffers[0], kInputW, kInputH, stream); // Run inference infer(*context, stream, (void**)device_buffers, output_buffer_host, kBatchSize, decode_ptr_host, decode_ptr_device, model_bboxes, cuda_post_process); // Save the first 100 values of output_buffer_host, one per line // std::ofstream out("../models/output.txt"); // for (int j = 0; j < 100; j++) { // out << output_buffer_host[j] << std::endl; // } // out.close(); std::vector> res_batch; if (cuda_post_process == "c") { // NMS batch_nms(res_batch, output_buffer_host, img_batch.size(), kOutputSize, kConfThresh, kNmsThresh); } else if (cuda_post_process == "g") { //Process gpu decode and nms results batch_process(res_batch, decode_ptr_host, img_batch.size(), bbox_element, img_batch); } // Draw bounding boxes draw_bbox(img_batch, res_batch); #if 0 // legacy: save under a "build" subfolder of the working directory const std::string out_dir = "build"; #else // Save results to the directory where the executable resides const std::string exe_dir = get_executable_dir(); const std::string out_dir = exe_dir; #endif #if defined(_WIN32) if (_access(out_dir.c_str(), 0) != 0) { if (_mkdir(out_dir.c_str()) != 0) { std::cerr << "Warning: create directory failed: " << out_dir << std::endl; } } #else if (access(out_dir.c_str(), F_OK) != 0) { if (mkdir(out_dir.c_str(), 0755) != 0) { std::cerr << "Warning: create directory failed: " << out_dir << std::endl; } } #endif for 
def get_img_path_batches(batch_size, img_dir):
    """
    description: Walk img_dir recursively and group every file path found into
                 consecutive batches.
    param:
        batch_size: int, number of paths per batch
        img_dir:    str, root directory to scan
    return:
        a list of lists of paths; the final batch may be shorter than
        batch_size, and the list is empty when no file is found
    """
    # Lazily flatten the directory walk into a single stream of file paths.
    all_paths = (
        os.path.join(folder, fname)
        for folder, _subdirs, fnames in os.walk(img_dir)
        for fname in fnames
    )

    batches = []
    current = []
    for path in all_paths:
        # Close the running batch only when another path is about to be added.
        if len(current) == batch_size:
            batches.append(current)
            current = []
        current.append(path)

    if current:
        batches.append(current)
    return batches
param: x: a box likes [x1,y1,x2,y2] img: a opencv image object color: color to draw rectangle, such as (0,255,0) label: str line_thickness: int return: no return """ tl = ( line_thickness or round(0.002 * (img.shape[0] + img.shape[1]) / 2) + 1 ) # line/font thickness color = color or [random.randint(0, 255) for _ in range(3)] c1, c2 = (int(x[0]), int(x[1])), (int(x[2]), int(x[3])) cv2.rectangle(img, c1, c2, color, thickness=tl, lineType=cv2.LINE_AA) if label: tf = max(tl - 1, 1) # font thickness t_size = cv2.getTextSize(label, 0, fontScale=tl / 3, thickness=tf)[0] c2 = c1[0] + t_size[0], c1[1] - t_size[1] - 3 cv2.rectangle(img, c1, c2, color, -1, cv2.LINE_AA) # filled cv2.putText( img, label, (c1[0], c1[1] - 2), 0, tl / 3, [225, 255, 255], thickness=tf, lineType=cv2.LINE_AA, ) class YoLov13TRT(object): """ description: A YOLOv13 class that warps TensorRT ops, preprocess and postprocess ops. """ def __init__(self, engine_file_path): # Create a Context on this device, self.ctx = cuda.Device(0).make_context() stream = cuda.Stream() TRT_LOGGER = trt.Logger(trt.Logger.INFO) runtime = trt.Runtime(TRT_LOGGER) # Deserialize the engine from file with open(engine_file_path, "rb") as f: engine = runtime.deserialize_cuda_engine(f.read()) context = engine.create_execution_context() host_inputs = [] cuda_inputs = [] host_outputs = [] cuda_outputs = [] bindings = [] for binding in engine: print('bingding:', binding, engine.get_binding_shape(binding)) self.batch_size = engine.get_binding_shape(binding)[0] size = trt.volume(engine.get_binding_shape(binding)) * engine.max_batch_size dtype = trt.nptype(engine.get_binding_dtype(binding)) # Allocate host and device buffers host_mem = cuda.pagelocked_empty(size, dtype) cuda_mem = cuda.mem_alloc(host_mem.nbytes) # Append the device buffer to device bindings. bindings.append(int(cuda_mem)) # Append to the appropriate list. 
if engine.binding_is_input(binding): self.input_w = engine.get_binding_shape(binding)[-1] self.input_h = engine.get_binding_shape(binding)[-2] host_inputs.append(host_mem) cuda_inputs.append(cuda_mem) else: host_outputs.append(host_mem) cuda_outputs.append(cuda_mem) # Store self.stream = stream self.context = context self.engine = engine self.host_inputs = host_inputs self.cuda_inputs = cuda_inputs self.host_outputs = host_outputs self.cuda_outputs = cuda_outputs self.bindings = bindings self.det_output_length = host_outputs[0].shape[0] def infer(self, raw_image_generator): threading.Thread.__init__(self) # Make self the active context, pushing it on top of the context stack. self.ctx.push() # Restore stream = self.stream context = self.context host_inputs = self.host_inputs cuda_inputs = self.cuda_inputs host_outputs = self.host_outputs cuda_outputs = self.cuda_outputs bindings = self.bindings # Do image preprocess batch_image_raw = [] batch_origin_h = [] batch_origin_w = [] batch_input_image = np.empty(shape=[self.batch_size, 3, self.input_h, self.input_w]) for i, image_raw in enumerate(raw_image_generator): input_image, image_raw, origin_h, origin_w = self.preprocess_image(image_raw) batch_image_raw.append(image_raw) batch_origin_h.append(origin_h) batch_origin_w.append(origin_w) np.copyto(batch_input_image[i], input_image) batch_input_image = np.ascontiguousarray(batch_input_image) # Copy input image to host buffer np.copyto(host_inputs[0], batch_input_image.ravel()) start = time.time() # Transfer input data to the GPU. cuda.memcpy_htod_async(cuda_inputs[0], host_inputs[0], stream) # Run inference. context.execute_async_v2(bindings=bindings, stream_handle=stream.handle) # Transfer predictions back from the GPU. cuda.memcpy_dtoh_async(host_outputs[0], cuda_outputs[0], stream) # Synchronize the stream stream.synchronize() end = time.time() # Remove any context from the top of the context stack, deactivating it. 
self.ctx.pop() # Here we use the first row of output in that batch_size = 1 output = host_outputs[0] # print("output: ", output[400:500]) # Do postprocess for i in range(self.batch_size): result_boxes, result_scores, result_classid = self.post_process( output[i * self.det_output_length: (i + 1) * self.det_output_length], batch_origin_h[i], batch_origin_w[i] ) # Draw rectangles and labels on the original image for j in range(len(result_boxes)): box = result_boxes[j] plot_one_box( box, batch_image_raw[i], label="{}:{:.2f}".format( categories[int(result_classid[j])], result_scores[j] ), ) return batch_image_raw, end - start def destroy(self): # Remove any context from the top of the context stack, deactivating it. self.ctx.pop() def get_raw_image(self, image_path_batch): """ description: Read an image from image path """ for img_path in image_path_batch: yield cv2.imread(img_path) def get_raw_image_zeros(self, image_path_batch=None): """ description: Ready data for warmup """ for _ in range(self.batch_size): yield np.zeros([self.input_h, self.input_w, 3], dtype=np.uint8) def preprocess_image(self, raw_bgr_image): """ description: Convert BGR image to RGB, resize and pad it to target size, normalize to [0,1], transform to NCHW format. 
param: input_image_path: str, image path return: image: the processed image image_raw: the original image h: original height w: original width """ image_raw = raw_bgr_image h, w, c = image_raw.shape image = cv2.cvtColor(image_raw, cv2.COLOR_BGR2RGB) # Calculate widht and height and paddings r_w = self.input_w / w r_h = self.input_h / h if r_h > r_w: tw = self.input_w th = int(r_w * h) tx1 = tx2 = 0 ty1 = int((self.input_h - th) / 2) ty2 = self.input_h - th - ty1 else: tw = int(r_h * w) th = self.input_h tx1 = int((self.input_w - tw) / 2) tx2 = self.input_w - tw - tx1 ty1 = ty2 = 0 # Resize the image with long side while maintaining ratio image = cv2.resize(image, (tw, th)) # Pad the short side with (128,128,128) image = cv2.copyMakeBorder( image, ty1, ty2, tx1, tx2, cv2.BORDER_CONSTANT, None, (128, 128, 128) ) image = image.astype(np.float32) # Normalize to [0,1] image /= 255.0 # HWC to CHW format: image = np.transpose(image, [2, 0, 1]) # CHW to NCHW format image = np.expand_dims(image, axis=0) # Convert the image to row-major order, also known as "C order": image = np.ascontiguousarray(image) return image, image_raw, h, w def xywh2xyxy(self, origin_h, origin_w, x): """ description: Convert nx4 boxes from [x, y, w, h] to [x1, y1, x2, y2] where xy1=top-left, xy2=bottom-right param: origin_h: height of original image origin_w: width of original image x: A boxes numpy, each row is a box [center_x, center_y, w, h] return: y: A boxes numpy, each row is a box [x1, y1, x2, y2] """ y = np.zeros_like(x) r_w = self.input_w / origin_w r_h = self.input_h / origin_h if r_h > r_w: y[:, 0] = x[:, 0] y[:, 2] = x[:, 2] y[:, 1] = x[:, 1] - (self.input_h - r_w * origin_h) / 2 y[:, 3] = x[:, 3] - (self.input_h - r_w * origin_h) / 2 y /= r_w else: y[:, 0] = x[:, 0] - (self.input_w - r_h * origin_w) / 2 y[:, 2] = x[:, 2] - (self.input_w - r_h * origin_w) / 2 y[:, 1] = x[:, 1] y[:, 3] = x[:, 3] y /= r_h return y def post_process(self, output, origin_h, origin_w): """ description: 
postprocess the prediction param: output: A numpy likes [num_boxes,cx,cy,w,h,conf,cls_id, cx,cy,w,h,conf,cls_id, ...] origin_h: height of original image origin_w: width of original image return: result_boxes: finally boxes, a boxes numpy, each row is a box [x1, y1, x2, y2] result_scores: finally scores, a numpy, each element is the score correspoing to box result_classid: finally classid, a numpy, each element is the classid correspoing to box """ num_values_per_detection = DET_NUM # Get the num of boxes detected num = int(output[0]) print("There are {} detections in the picture!!!".format(num)) # Reshape to a two dimentional ndarray # pred = np.reshape(output[1:], (-1, 38))[:num, :] pred = np.reshape(output[1:], (-1, num_values_per_detection))[:num, :] # Do nms boxes = self.non_max_suppression(pred, origin_h, origin_w, conf_thres=CONF_THRESH, nms_thres=IOU_THRESHOLD) result_boxes = boxes[:, :4] if len(boxes) else np.array([]) result_scores = boxes[:, 4] if len(boxes) else np.array([]) result_classid = boxes[:, 5] if len(boxes) else np.array([]) return result_boxes, result_scores, result_classid def bbox_iou(self, box1, box2, x1y1x2y2=True): """ description: compute the IoU of two bounding boxes param: box1: A box coordinate (can be (x1, y1, x2, y2) or (x, y, w, h)) box2: A box coordinate (can be (x1, y1, x2, y2) or (x, y, w, h)) x1y1x2y2: select the coordinate format return: iou: computed iou """ if not x1y1x2y2: # Transform from center and width to exact coordinates b1_x1, b1_x2 = box1[:, 0] - box1[:, 2] / 2, box1[:, 0] + box1[:, 2] / 2 b1_y1, b1_y2 = box1[:, 1] - box1[:, 3] / 2, box1[:, 1] + box1[:, 3] / 2 b2_x1, b2_x2 = box2[:, 0] - box2[:, 2] / 2, box2[:, 0] + box2[:, 2] / 2 b2_y1, b2_y2 = box2[:, 1] - box2[:, 3] / 2, box2[:, 1] + box2[:, 3] / 2 else: # Get the coordinates of bounding boxes b1_x1, b1_y1, b1_x2, b1_y2 = box1[:, 0], box1[:, 1], box1[:, 2], box1[:, 3] b2_x1, b2_y1, b2_x2, b2_y2 = box2[:, 0], box2[:, 1], box2[:, 2], box2[:, 3] # Get the 
coordinates of the intersection rectangle inter_rect_x1 = np.maximum(b1_x1, b2_x1) inter_rect_y1 = np.maximum(b1_y1, b2_y1) inter_rect_x2 = np.minimum(b1_x2, b2_x2) inter_rect_y2 = np.minimum(b1_y2, b2_y2) # Intersection area inter_area = (np.clip(inter_rect_x2 - inter_rect_x1 + 1, 0, None) * np.clip(inter_rect_y2 - inter_rect_y1 + 1, 0, None)) # Union Area b1_area = (b1_x2 - b1_x1 + 1) * (b1_y2 - b1_y1 + 1) b2_area = (b2_x2 - b2_x1 + 1) * (b2_y2 - b2_y1 + 1) iou = inter_area / (b1_area + b2_area - inter_area + 1e-16) return iou def non_max_suppression(self, prediction, origin_h, origin_w, conf_thres=0.5, nms_thres=0.4): """ description: Removes detections with lower object confidence score than 'conf_thres' and performs Non-Maximum Suppression to further filter detections. param: prediction: detections, (x1, y1, x2, y2, conf, cls_id) origin_h: original image height origin_w: original image width conf_thres: a confidence threshold to filter detections nms_thres: a iou threshold to filter detections return: boxes: output after nms with the shape (x1, y1, x2, y2, conf, cls_id) """ # Get the boxes that score > CONF_THRESH boxes = prediction[prediction[:, 4] >= conf_thres] # Trandform bbox from [center_x, center_y, w, h] to [x1, y1, x2, y2] boxes[:, :4] = self.xywh2xyxy(origin_h, origin_w, boxes[:, :4]) # clip the coordinates boxes[:, 0] = np.clip(boxes[:, 0], 0, origin_w - 1) boxes[:, 2] = np.clip(boxes[:, 2], 0, origin_w - 1) boxes[:, 1] = np.clip(boxes[:, 1], 0, origin_h - 1) boxes[:, 3] = np.clip(boxes[:, 3], 0, origin_h - 1) # Object confidence confs = boxes[:, 4] # Sort by the confs boxes = boxes[np.argsort(-confs)] # Perform non-maximum suppression keep_boxes = [] while boxes.shape[0]: large_overlap = self.bbox_iou(np.expand_dims(boxes[0, :4], 0), boxes[:, :4]) > nms_thres label_match = boxes[0, -1] == boxes[:, -1] # Indices of boxes with lower confidence scores, large IOUs and matching labels invalid = large_overlap & label_match keep_boxes += [boxes[0]] boxes 
class warmUpThread(threading.Thread):
    """
    description: Thread that pushes one batch of all-zero images through the wrapper
                 and prints the shape of the first returned image and the elapsed time.
    """

    def __init__(self, yolov13_wrapper):
        super().__init__()
        self.yolov13_wrapper = yolov13_wrapper

    def run(self):
        wrapper = self.yolov13_wrapper
        dummy_images = wrapper.get_raw_image_zeros()
        batch_image_raw, use_time = wrapper.infer(dummy_images)
        first_shape = batch_image_raw[0].shape
        elapsed_ms = use_time * 1000
        print('warm_up->{}, time->{:.2f}ms'.format(first_shape, elapsed_ms))
# Build script for the yolov3 sample: a CUDA shared library with the YOLO
# plugin layer (yololayer) and the yolov3 executable that links against it.
cmake_minimum_required(VERSION 2.6)

project(yolov3)

add_definitions(-std=c++11)

# option() expects (<variable> "<help text>" [value]); the previous call passed
# OFF in the help-text position. The resulting default is still OFF.
option(CUDA_USE_STATIC_CUDA_RUNTIME "Link against the static CUDA runtime" OFF)
set(CMAKE_CXX_STANDARD 11)
set(CMAKE_BUILD_TYPE Debug)

find_package(CUDA REQUIRED)

include_directories(${PROJECT_SOURCE_DIR}/include)
# include and link dirs of cuda and tensorrt, you need adapt them if yours are different
# cuda
include_directories(/usr/local/cuda/include)
link_directories(/usr/local/cuda/lib64)
# tensorrt
include_directories(/usr/include/x86_64-linux-gnu/)
link_directories(/usr/lib/x86_64-linux-gnu/)

find_package(OpenCV)
include_directories(${OpenCV_INCLUDE_DIRS})

set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11 -Wall -Ofast -Wfatal-errors -D_MWAITXINTRIN_H_INCLUDED")

# Plugin layer, built as a shared library with nvcc.
#cuda_add_library(leaky ${PROJECT_SOURCE_DIR}/leaky.cu)
cuda_add_library(yololayer SHARED ${PROJECT_SOURCE_DIR}/yololayer.cu)
target_link_libraries(yololayer nvinfer cudart ${OpenCV_LIBS})

# Sample executable: INT8 calibrator + network definition / inference driver.
add_executable(yolov3 ${PROJECT_SOURCE_DIR}/calibrator.cpp ${PROJECT_SOURCE_DIR}/yolov3.cpp)
target_link_libraries(yolov3 nvinfer)
target_link_libraries(yolov3 cudart)
target_link_libraries(yolov3 yololayer)
target_link_libraries(yolov3 ${OpenCV_LIBS})

add_definitions(-O2 -pthread)
'yolov3.engine' sudo ./yolov3 -d ../../yolov3-spp/samples // deserialize plan file and run inference, the images in samples will be processed. ``` 3. check the generated images, e.g. _zidane.jpg and _bus.jpg # INT8 Quantization 1. Prepare calibration images, you can randomly select 1000s of images from your training set. For coco, you can also download my calibration images `coco_calib` from [GoogleDrive](https://drive.google.com/drive/folders/1s7jE9DtOngZMzJC1uL307J2MiaGwdRSI?usp=sharing) or [BaiduPan](https://pan.baidu.com/s/1GOm_-JobpyLMAqZWCDUhKg) pwd: a9wh 2. unzip it in yolov3/build 3. set the macro `USE_INT8` in yolov3.cpp and make 4. serialize the model and test

## More Information See the readme in [home page.](https://github.com/wang-xinyu/tensorrtx) ================================================ FILE: yolov3/calibrator.cpp ================================================ #include #include #include #include #include "calibrator.h" #include "cuda_runtime_api.h" #include "utils.h" Int8EntropyCalibrator2::Int8EntropyCalibrator2(int batchsize, int input_w, int input_h, const char* img_dir, const char* calib_table_name, const char* input_blob_name, bool read_cache) : batchsize_(batchsize) , input_w_(input_w) , input_h_(input_h) , img_idx_(0) , img_dir_(img_dir) , calib_table_name_(calib_table_name) , input_blob_name_(input_blob_name) , read_cache_(read_cache) { input_count_ = 3 * input_w * input_h * batchsize; CUDA_CHECK(cudaMalloc(&device_input_, input_count_ * sizeof(float))); read_files_in_dir(img_dir, img_files_); } Int8EntropyCalibrator2::~Int8EntropyCalibrator2() { CUDA_CHECK(cudaFree(device_input_)); } int Int8EntropyCalibrator2::getBatchSize() const TRT_NOEXCEPT { return batchsize_; } bool Int8EntropyCalibrator2::getBatch(void* bindings[], const char* names[], int nbBindings) TRT_NOEXCEPT { if (img_idx_ + batchsize_ > (int)img_files_.size()) { return false; } std::vector input_imgs_; for (int i = img_idx_; i < img_idx_ + batchsize_; i++) { std::cout << img_files_[i] << " " << i << std::endl; cv::Mat temp = cv::imread(img_dir_ + img_files_[i]); if (temp.empty()){ std::cerr << "Fatal error: image cannot open!" 
<< std::endl; return false; } cv::Mat pr_img = preprocess_img(temp, input_w_, input_h_); input_imgs_.push_back(pr_img); } img_idx_ += batchsize_; cv::Mat blob = cv::dnn::blobFromImages(input_imgs_, 1.0 / 255.0, cv::Size(input_w_, input_h_), cv::Scalar(0, 0, 0), true, false); CUDA_CHECK(cudaMemcpy(device_input_, blob.ptr(0), input_count_ * sizeof(float), cudaMemcpyHostToDevice)); assert(!strcmp(names[0], input_blob_name_)); bindings[0] = device_input_; return true; } const void* Int8EntropyCalibrator2::readCalibrationCache(size_t& length) TRT_NOEXCEPT { std::cout << "reading calib cache: " << calib_table_name_ << std::endl; calib_cache_.clear(); std::ifstream input(calib_table_name_, std::ios::binary); input >> std::noskipws; if (read_cache_ && input.good()) { std::copy(std::istream_iterator(input), std::istream_iterator(), std::back_inserter(calib_cache_)); } length = calib_cache_.size(); return length ? calib_cache_.data() : nullptr; } void Int8EntropyCalibrator2::writeCalibrationCache(const void* cache, size_t length) TRT_NOEXCEPT { std::cout << "writing calib cache: " << calib_table_name_ << " size: " << length << std::endl; std::ofstream output(calib_table_name_, std::ios::binary); output.write(reinterpret_cast(cache), length); } ================================================ FILE: yolov3/calibrator.h ================================================ #ifndef ENTROPY_CALIBRATOR_H #define ENTROPY_CALIBRATOR_H #include "NvInfer.h" #include #include #include "macros.h" //! \class Int8EntropyCalibrator2 //! //! \brief Implements Entropy calibrator 2. //! CalibrationAlgoType is kENTROPY_CALIBRATION_2. //! 
# Export the weights of the yolov3 Darknet model to the plain-text 'yolov3.wts'
# file: the first line is the number of tensors, then one line per tensor with
# its name, its element count and every value as a big-endian float32 in hex.
model = Darknet('cfg/yolov3.cfg', (608, 608))  # noqa: F405
weights = sys.argv[1]
device = torch_utils.select_device('0')  # noqa: F405

is_pytorch_checkpoint = weights.endswith('.pt')
if not is_pytorch_checkpoint:
    # darknet format
    load_darknet_weights(model, weights)  # noqa: F405
else:
    # pytorch format
    checkpoint = torch.load(weights, map_location=device, weights_only=False)
    model.load_state_dict(checkpoint['model'])
model = model.eval()

state = model.state_dict()
with open('yolov3.wts', 'w') as f:
    f.write('{}\n'.format(len(state)))
    for name, tensor in state.items():
        flat = tensor.reshape(-1).cpu().numpy()
        # Every value is preceded by a single space.
        hex_words = ''.join(' ' + struct.pack('>f', float(value)).hex() for value in flat)
        f.write('{} {} {}\n'.format(name, len(flat), hex_words))
//!
//! \class LogStreamConsumerBuffer
//! \brief String buffer that, when synchronized, writes its contents to a target stream
//!        preceded by a "[MM/DD/YYYY-HH:MM:SS] " timestamp and a fixed prefix.
//!
class LogStreamConsumerBuffer : public std::stringbuf {
   public:
    //! \param stream     stream that receives the formatted messages
    //! \param prefix     text inserted between the timestamp and the message
    //! \param shouldLog  when false, buffered messages are discarded instead of written
    LogStreamConsumerBuffer(std::ostream& stream, const std::string& prefix, bool shouldLog)
        : mOutput(stream), mPrefix(prefix), mShouldLog(shouldLog) {}

    //! Copies the target stream, prefix and logging flag of \p other. The flag must be
    //! initialized here: leaving it indeterminate makes putOutput() read garbage.
    //! Buffered text of \p other is not transferred.
    LogStreamConsumerBuffer(LogStreamConsumerBuffer&& other)
        : mOutput(other.mOutput), mPrefix(other.mPrefix), mShouldLog(other.mShouldLog) {}

    ~LogStreamConsumerBuffer() {
        // std::streambuf::pbase() gives a pointer to the beginning of the buffered part of the output sequence
        // std::streambuf::pptr() gives a pointer to the current position of the output sequence
        // if the pointer to the beginning is not equal to the pointer to the current position,
        // call putOutput() to log the output to the stream
        if (pbase() != pptr()) {
            putOutput();
        }
    }

    // synchronizes the stream buffer and returns 0 on success
    // synchronizing the stream buffer consists of inserting the buffer contents into the stream,
    // resetting the buffer and flushing the stream
    int sync() override {
        putOutput();
        return 0;
    }

    //! Writes "<timestamp><prefix><buffered text>" to the target stream when logging is
    //! enabled, then empties the buffer.
    void putOutput() {
        if (mShouldLog) {
            // Format the timestamp into a local array so that it goes to the same stream
            // as the message and no fill/width state is left behind on any stream.
            std::time_t timestamp = std::time(nullptr);
            std::tm* tm_local = std::localtime(&timestamp);
            char stamp[32] = "";
            if (tm_local != nullptr) {
                std::strftime(stamp, sizeof(stamp), "[%m/%d/%Y-%H:%M:%S] ", tm_local);
            }
            // std::stringbuf::str() gets the string contents of the buffer
            // insert the buffer contents pre-appended by the appropriate prefix into the stream
            mOutput << stamp << mPrefix << str();
            // flush the stream
            mOutput.flush();
        }
        // set the buffer to empty; suppressed text is dropped so that it cannot resurface
        // after a later setShouldLog(true)
        str("");
    }

    void setShouldLog(bool shouldLog) { mShouldLog = shouldLog; }

   private:
    std::ostream& mOutput;  // target stream
    std::string mPrefix;    // text written in front of every message
    bool mShouldLog;        // whether messages are written or discarded
};
Reportable severity determines if the messages are severe enough to be logged. LogStreamConsumer(Severity reportableSeverity, Severity severity) : LogStreamConsumerBase(severityOstream(severity), severityPrefix(severity), severity <= reportableSeverity) , std::ostream(&mBuffer) // links the stream buffer with the stream , mShouldLog(severity <= reportableSeverity) , mSeverity(severity) { } LogStreamConsumer(LogStreamConsumer&& other) : LogStreamConsumerBase(severityOstream(other.mSeverity), severityPrefix(other.mSeverity), other.mShouldLog) , std::ostream(&mBuffer) // links the stream buffer with the stream , mShouldLog(other.mShouldLog) , mSeverity(other.mSeverity) { } void setReportableSeverity(Severity reportableSeverity) { mShouldLog = mSeverity <= reportableSeverity; mBuffer.setShouldLog(mShouldLog); } private: static std::ostream& severityOstream(Severity severity) { return severity >= Severity::kINFO ? std::cout : std::cerr; } static std::string severityPrefix(Severity severity) { switch (severity) { case Severity::kINTERNAL_ERROR: return "[F] "; case Severity::kERROR: return "[E] "; case Severity::kWARNING: return "[W] "; case Severity::kINFO: return "[I] "; case Severity::kVERBOSE: return "[V] "; default: assert(0); return ""; } } bool mShouldLog; Severity mSeverity; }; //! \class Logger //! //! \brief Class which manages logging of TensorRT tools and samples //! //! \details This class provides a common interface for TensorRT tools and samples to log information to the console, //! and supports logging two types of messages: //! //! - Debugging messages with an associated severity (info, warning, error, or internal error/fatal) //! - Test pass/fail messages //! //! The advantage of having all samples use this class for logging as opposed to emitting directly to stdout/stderr is //! that the logic for controlling the verbosity and formatting of sample output is centralized in one location. //! //! 
In the future, this class could be extended to support dumping test results to a file in some standard format //! (for example, JUnit XML), and providing additional metadata (e.g. timing the duration of a test run). //! //! TODO: For backwards compatibility with existing samples, this class inherits directly from the nvinfer1::ILogger //! interface, which is problematic since there isn't a clean separation between messages coming from the TensorRT //! library and messages coming from the sample. //! //! In the future (once all samples are updated to use Logger::getTRTLogger() to access the ILogger) we can refactor the //! class to eliminate the inheritance and instead make the nvinfer1::ILogger implementation a member of the Logger //! object. class Logger : public nvinfer1::ILogger { public: Logger(Severity severity = Severity::kWARNING) : mReportableSeverity(severity) { } //! //! \enum TestResult //! \brief Represents the state of a given test //! enum class TestResult { kRUNNING, //!< The test is running kPASSED, //!< The test passed kFAILED, //!< The test failed kWAIVED //!< The test was waived }; //! //! \brief Forward-compatible method for retrieving the nvinfer::ILogger associated with this Logger //! \return The nvinfer1::ILogger associated with this Logger //! //! TODO Once all samples are updated to use this method to register the logger with TensorRT, //! we can eliminate the inheritance of Logger from ILogger //! nvinfer1::ILogger& getTRTLogger() { return *this; } //! //! \brief Implementation of the nvinfer1::ILogger::log() virtual method //! //! Note samples should not be calling this function directly; it will eventually go away once we eliminate the //! inheritance from nvinfer1::ILogger //! void log(Severity severity, const char* msg) TRT_NOEXCEPT override { LogStreamConsumer(mReportableSeverity, severity) << "[TRT] " << std::string(msg) << std::endl; } //! //! \brief Method for controlling the verbosity of logging output //! //! 
\param severity The logger will only emit messages that have severity of this level or higher. //! void setReportableSeverity(Severity severity) { mReportableSeverity = severity; } //! //! \brief Opaque handle that holds logging information for a particular test //! //! This object is an opaque handle to information used by the Logger to print test results. //! The sample must call Logger::defineTest() in order to obtain a TestAtom that can be used //! with Logger::reportTest{Start,End}(). //! class TestAtom { public: TestAtom(TestAtom&&) = default; private: friend class Logger; TestAtom(bool started, const std::string& name, const std::string& cmdline) : mStarted(started) , mName(name) , mCmdline(cmdline) { } bool mStarted; std::string mName; std::string mCmdline; }; //! //! \brief Define a test for logging //! //! \param[in] name The name of the test. This should be a string starting with //! "TensorRT" and containing dot-separated strings containing //! the characters [A-Za-z0-9_]. //! For example, "TensorRT.sample_googlenet" //! \param[in] cmdline The command line used to reproduce the test // //! \return a TestAtom that can be used in Logger::reportTest{Start,End}(). //! static TestAtom defineTest(const std::string& name, const std::string& cmdline) { return TestAtom(false, name, cmdline); } //! //! \brief A convenience overloaded version of defineTest() that accepts an array of command-line arguments //! as input //! //! \param[in] name The name of the test //! \param[in] argc The number of command-line arguments //! \param[in] argv The array of command-line arguments (given as C strings) //! //! \return a TestAtom that can be used in Logger::reportTest{Start,End}(). static TestAtom defineTest(const std::string& name, int argc, char const* const* argv) { auto cmdline = genCmdlineString(argc, argv); return defineTest(name, cmdline); } //! //! \brief Report that a test has started. //! //! 
\pre reportTestStart() has not been called yet for the given testAtom //! //! \param[in] testAtom The handle to the test that has started //! static void reportTestStart(TestAtom& testAtom) { reportTestResult(testAtom, TestResult::kRUNNING); assert(!testAtom.mStarted); testAtom.mStarted = true; } //! //! \brief Report that a test has ended. //! //! \pre reportTestStart() has been called for the given testAtom //! //! \param[in] testAtom The handle to the test that has ended //! \param[in] result The result of the test. Should be one of TestResult::kPASSED, //! TestResult::kFAILED, TestResult::kWAIVED //! static void reportTestEnd(const TestAtom& testAtom, TestResult result) { assert(result != TestResult::kRUNNING); assert(testAtom.mStarted); reportTestResult(testAtom, result); } static int reportPass(const TestAtom& testAtom) { reportTestEnd(testAtom, TestResult::kPASSED); return EXIT_SUCCESS; } static int reportFail(const TestAtom& testAtom) { reportTestEnd(testAtom, TestResult::kFAILED); return EXIT_FAILURE; } static int reportWaive(const TestAtom& testAtom) { reportTestEnd(testAtom, TestResult::kWAIVED); return EXIT_SUCCESS; } static int reportTest(const TestAtom& testAtom, bool pass) { return pass ? reportPass(testAtom) : reportFail(testAtom); } Severity getReportableSeverity() const { return mReportableSeverity; } private: //! //! \brief returns an appropriate string for prefixing a log message with the given severity //! static const char* severityPrefix(Severity severity) { switch (severity) { case Severity::kINTERNAL_ERROR: return "[F] "; case Severity::kERROR: return "[E] "; case Severity::kWARNING: return "[W] "; case Severity::kINFO: return "[I] "; case Severity::kVERBOSE: return "[V] "; default: assert(0); return ""; } } //! //! \brief returns an appropriate string for prefixing a test result message with the given result //! 
static const char* testResultString(TestResult result) { switch (result) { case TestResult::kRUNNING: return "RUNNING"; case TestResult::kPASSED: return "PASSED"; case TestResult::kFAILED: return "FAILED"; case TestResult::kWAIVED: return "WAIVED"; default: assert(0); return ""; } } //! //! \brief returns an appropriate output stream (cout or cerr) to use with the given severity //! static std::ostream& severityOstream(Severity severity) { return severity >= Severity::kINFO ? std::cout : std::cerr; } //! //! \brief method that implements logging test results //! static void reportTestResult(const TestAtom& testAtom, TestResult result) { severityOstream(Severity::kINFO) << "&&&& " << testResultString(result) << " " << testAtom.mName << " # " << testAtom.mCmdline << std::endl; } //! //! \brief generate a command line string from the given (argc, argv) values //! static std::string genCmdlineString(int argc, char const* const* argv) { std::stringstream ss; for (int i = 0; i < argc; i++) { if (i > 0) ss << " "; ss << argv[i]; } return ss.str(); } Severity mReportableSeverity; }; namespace { //! //! \brief produces a LogStreamConsumer object that can be used to log messages of severity kVERBOSE //! //! Example usage: //! //! LOG_VERBOSE(logger) << "hello world" << std::endl; //! inline LogStreamConsumer LOG_VERBOSE(const Logger& logger) { return LogStreamConsumer(logger.getReportableSeverity(), Severity::kVERBOSE); } //! //! \brief produces a LogStreamConsumer object that can be used to log messages of severity kINFO //! //! Example usage: //! //! LOG_INFO(logger) << "hello world" << std::endl; //! inline LogStreamConsumer LOG_INFO(const Logger& logger) { return LogStreamConsumer(logger.getReportableSeverity(), Severity::kINFO); } //! //! \brief produces a LogStreamConsumer object that can be used to log messages of severity kWARNING //! //! Example usage: //! //! LOG_WARN(logger) << "hello world" << std::endl; //! 
inline LogStreamConsumer LOG_WARN(const Logger& logger) { return LogStreamConsumer(logger.getReportableSeverity(), Severity::kWARNING); } //! //! \brief produces a LogStreamConsumer object that can be used to log messages of severity kERROR //! //! Example usage: //! //! LOG_ERROR(logger) << "hello world" << std::endl; //! inline LogStreamConsumer LOG_ERROR(const Logger& logger) { return LogStreamConsumer(logger.getReportableSeverity(), Severity::kERROR); } //! //! \brief produces a LogStreamConsumer object that can be used to log messages of severity kINTERNAL_ERROR // ("fatal" severity) //! //! Example usage: //! //! LOG_FATAL(logger) << "hello world" << std::endl; //! inline LogStreamConsumer LOG_FATAL(const Logger& logger) { return LogStreamConsumer(logger.getReportableSeverity(), Severity::kINTERNAL_ERROR); } } // anonymous namespace #endif // TENSORRT_LOGGING_H ================================================ FILE: yolov3/macros.h ================================================ #ifndef __MACROS_H #define __MACROS_H #if NV_TENSORRT_MAJOR >= 8 #define TRT_NOEXCEPT noexcept #define TRT_CONST_ENQUEUE const #else #define TRT_NOEXCEPT #define TRT_CONST_ENQUEUE #endif #endif // __MACROS_H ================================================ FILE: yolov3/utils.h ================================================ #ifndef __TRT_UTILS_H_ #define __TRT_UTILS_H_ #include #include #include #include #include #include #ifndef CUDA_CHECK #define CUDA_CHECK(callstr) \ { \ cudaError_t error_code = callstr; \ if (error_code != cudaSuccess) { \ std::cerr << "CUDA error " << error_code << " at " << __FILE__ << ":" << __LINE__; \ assert(0); \ } \ } #endif namespace Tn { template void write(char*& buffer, const T& val) { *reinterpret_cast(buffer) = val; buffer += sizeof(T); } template void read(const char*& buffer, T& val) { val = *reinterpret_cast(buffer); buffer += sizeof(T); } } static inline cv::Mat preprocess_img(cv::Mat& img, int input_w, int input_h) { int w, h, x, y; float r_w = 
input_w / (img.cols*1.0); float r_h = input_h / (img.rows*1.0); if (r_h > r_w) { w = input_w; h = r_w * img.rows; x = 0; y = (input_h - h) / 2; } else { w = r_h * img.cols; h = input_h; x = (input_w - w) / 2; y = 0; } cv::Mat re(h, w, CV_8UC3); cv::resize(img, re, re.size(), 0, 0, cv::INTER_LINEAR); cv::Mat out(input_h, input_w, CV_8UC3, cv::Scalar(128, 128, 128)); re.copyTo(out(cv::Rect(x, y, re.cols, re.rows))); return out; } static inline int read_files_in_dir(const char *p_dir_name, std::vector &file_names) { DIR *p_dir = opendir(p_dir_name); if (p_dir == nullptr) { return -1; } struct dirent* p_file = nullptr; while ((p_file = readdir(p_dir)) != nullptr) { if (strcmp(p_file->d_name, ".") != 0 && strcmp(p_file->d_name, "..") != 0) { //std::string cur_file_name(p_dir_name); //cur_file_name += "/"; //cur_file_name += p_file->d_name; std::string cur_file_name(p_file->d_name); file_names.push_back(cur_file_name); } } closedir(p_dir); return 0; } #endif ================================================ FILE: yolov3/yololayer.cu ================================================ #include "yololayer.h" #include "utils.h" #include using namespace Yolo; namespace nvinfer1 { YoloLayerPlugin::YoloLayerPlugin() { mClassCount = CLASS_NUM; mYoloKernel.clear(); mYoloKernel.push_back(yolo1); mYoloKernel.push_back(yolo2); mYoloKernel.push_back(yolo3); mKernelCount = mYoloKernel.size(); } YoloLayerPlugin::~YoloLayerPlugin() { } // create the plugin at runtime from a byte stream YoloLayerPlugin::YoloLayerPlugin(const void* data, size_t length) { using namespace Tn; const char *d = reinterpret_cast(data), *a = d; read(d, mClassCount); read(d, mThreadCount); read(d, mKernelCount); mYoloKernel.resize(mKernelCount); auto kernelSize = mKernelCount*sizeof(YoloKernel); memcpy(mYoloKernel.data(),d,kernelSize); d += kernelSize; assert(d == a + length); } void YoloLayerPlugin::serialize(void* buffer) const TRT_NOEXCEPT { using namespace Tn; char* d = static_cast(buffer), *a = d; write(d, 
mClassCount); write(d, mThreadCount); write(d, mKernelCount); auto kernelSize = mKernelCount*sizeof(YoloKernel); memcpy(d,mYoloKernel.data(),kernelSize); d += kernelSize; assert(d == a + getSerializationSize()); } size_t YoloLayerPlugin::getSerializationSize() const TRT_NOEXCEPT { return sizeof(mClassCount) + sizeof(mThreadCount) + sizeof(mKernelCount) + sizeof(Yolo::YoloKernel) * mYoloKernel.size(); } int YoloLayerPlugin::initialize() TRT_NOEXCEPT { return 0; } Dims YoloLayerPlugin::getOutputDimensions(int index, const Dims* inputs, int nbInputDims) TRT_NOEXCEPT { //output the result to channel int totalsize = MAX_OUTPUT_BBOX_COUNT * sizeof(Detection) / sizeof(float); return Dims3(totalsize + 1, 1, 1); } // Set plugin namespace void YoloLayerPlugin::setPluginNamespace(const char* pluginNamespace) TRT_NOEXCEPT { mPluginNamespace = pluginNamespace; } const char* YoloLayerPlugin::getPluginNamespace() const TRT_NOEXCEPT { return mPluginNamespace; } // Return the DataType of the plugin output at the requested index DataType YoloLayerPlugin::getOutputDataType(int index, const nvinfer1::DataType* inputTypes, int nbInputs) const TRT_NOEXCEPT { return DataType::kFLOAT; } // Return true if output tensor is broadcast across a batch. bool YoloLayerPlugin::isOutputBroadcastAcrossBatch(int outputIndex, const bool* inputIsBroadcasted, int nbInputs) const TRT_NOEXCEPT { return false; } // Return true if plugin can use input that is broadcast across batch without replication. bool YoloLayerPlugin::canBroadcastInputAcrossBatch(int inputIndex) const TRT_NOEXCEPT { return false; } void YoloLayerPlugin::configurePlugin(const PluginTensorDesc* in, int nbInput, const PluginTensorDesc* out, int nbOutput) TRT_NOEXCEPT { } // Attach the plugin object to an execution context and grant the plugin the access to some context resource. 
void YoloLayerPlugin::attachToContext(cudnnContext* cudnnContext, cublasContext* cublasContext, IGpuAllocator* gpuAllocator) TRT_NOEXCEPT { } // Detach the plugin object from its execution context. void YoloLayerPlugin::detachFromContext() TRT_NOEXCEPT {} const char* YoloLayerPlugin::getPluginType() const TRT_NOEXCEPT { return "YoloLayer_TRT"; } const char* YoloLayerPlugin::getPluginVersion() const TRT_NOEXCEPT { return "1"; } void YoloLayerPlugin::destroy() TRT_NOEXCEPT { delete this; } // Clone the plugin IPluginV2IOExt* YoloLayerPlugin::clone() const TRT_NOEXCEPT { YoloLayerPlugin *p = new YoloLayerPlugin(); p->setPluginNamespace(mPluginNamespace); return p; } __device__ float Logist(float data){ return 1.0f / (1.0f + expf(-data)); }; __global__ void CalDetection(const float *input, float *output,int noElements, int yoloWidth,int yoloHeight,const float anchors[CHECK_COUNT*2],int classes,int outputElem) { int idx = threadIdx.x + blockDim.x * blockIdx.x; if (idx >= noElements) return; int total_grid = yoloWidth * yoloHeight; int bnIdx = idx / total_grid; idx = idx - total_grid*bnIdx; int info_len_i = 5 + classes; const float* curInput = input + bnIdx * (info_len_i * total_grid * CHECK_COUNT); for (int k = 0; k < 3; ++k) { int class_id = 0; float max_cls_prob = 0.0; for (int i = 5; i < info_len_i; ++i) { float p = Logist(curInput[idx + k * info_len_i * total_grid + i * total_grid]); if (p > max_cls_prob) { max_cls_prob = p; class_id = i - 5; } } float box_prob = Logist(curInput[idx + k * info_len_i * total_grid + 4 * total_grid]); if (max_cls_prob < IGNORE_THRESH || box_prob < IGNORE_THRESH) continue; float *res_count = output + bnIdx*outputElem; int count = (int)atomicAdd(res_count, 1); if (count >= MAX_OUTPUT_BBOX_COUNT) return; char* data = (char * )res_count + sizeof(float) + count*sizeof(Detection); Detection* det = (Detection*)(data); int row = idx / yoloWidth; int col = idx % yoloWidth; //Location det->bbox[0] = (col + Logist(curInput[idx + k * info_len_i * 
total_grid + 0 * total_grid])) * INPUT_W / yoloWidth; det->bbox[1] = (row + Logist(curInput[idx + k * info_len_i * total_grid + 1 * total_grid])) * INPUT_H / yoloHeight; det->bbox[2] = expf(curInput[idx + k * info_len_i * total_grid + 2 * total_grid]) * anchors[2*k]; det->bbox[3] = expf(curInput[idx + k * info_len_i * total_grid + 3 * total_grid]) * anchors[2*k + 1]; det->det_confidence = box_prob; det->class_id = class_id; det->class_confidence = max_cls_prob; } } void YoloLayerPlugin::forwardGpu(const float *const * inputs, float* output, cudaStream_t stream, int batchSize) { void* devAnchor; size_t AnchorLen = sizeof(float)* CHECK_COUNT*2; CUDA_CHECK(cudaMalloc(&devAnchor,AnchorLen)); int outputElem = 1 + MAX_OUTPUT_BBOX_COUNT * sizeof(Detection) / sizeof(float); for(int idx = 0 ; idx < batchSize; ++idx) { CUDA_CHECK(cudaMemset(output + idx*outputElem, 0, sizeof(float))); } int numElem = 0; for (unsigned int i = 0;i< mYoloKernel.size();++i) { const auto& yolo = mYoloKernel[i]; numElem = yolo.width*yolo.height*batchSize; if (numElem < mThreadCount) mThreadCount = numElem; CUDA_CHECK(cudaMemcpy(devAnchor, yolo.anchors, AnchorLen, cudaMemcpyHostToDevice)); CalDetection<<< (yolo.width*yolo.height*batchSize + mThreadCount - 1) / mThreadCount, mThreadCount>>> (inputs[i],output, numElem, yolo.width, yolo.height, (float *)devAnchor, mClassCount ,outputElem); } CUDA_CHECK(cudaFree(devAnchor)); } int YoloLayerPlugin::enqueue(int batchSize, const void*const * inputs, void* TRT_CONST_ENQUEUE* outputs, void* workspace, cudaStream_t stream) TRT_NOEXCEPT { //assert(batchSize == 1); //GPU //CUDA_CHECK(cudaStreamSynchronize(stream)); forwardGpu((const float *const *)inputs, (float*)outputs[0], stream, batchSize); return 0; } PluginFieldCollection YoloPluginCreator::mFC{}; std::vector YoloPluginCreator::mPluginAttributes; YoloPluginCreator::YoloPluginCreator() { mPluginAttributes.clear(); mFC.nbFields = mPluginAttributes.size(); mFC.fields = mPluginAttributes.data(); } const 
char* YoloPluginCreator::getPluginName() const TRT_NOEXCEPT { return "YoloLayer_TRT"; } const char* YoloPluginCreator::getPluginVersion() const TRT_NOEXCEPT { return "1"; } const PluginFieldCollection* YoloPluginCreator::getFieldNames() TRT_NOEXCEPT { return &mFC; } IPluginV2IOExt* YoloPluginCreator::createPlugin(const char* name, const PluginFieldCollection* fc) TRT_NOEXCEPT { YoloLayerPlugin* obj = new YoloLayerPlugin(); obj->setPluginNamespace(mNamespace.c_str()); return obj; } IPluginV2IOExt* YoloPluginCreator::deserializePlugin(const char* name, const void* serialData, size_t serialLength) TRT_NOEXCEPT { // This object will be deleted when the network is destroyed, which will // call YoloLayerPlugin::destroy() YoloLayerPlugin* obj = new YoloLayerPlugin(serialData, serialLength); obj->setPluginNamespace(mNamespace.c_str()); return obj; } } ================================================ FILE: yolov3/yololayer.h ================================================ #ifndef _YOLO_LAYER_H #define _YOLO_LAYER_H #include #include #include "NvInfer.h" #include "macros.h" namespace Yolo { static constexpr int CHECK_COUNT = 3; static constexpr float IGNORE_THRESH = 0.1f; static constexpr int MAX_OUTPUT_BBOX_COUNT = 1000; static constexpr int CLASS_NUM = 80; static constexpr int INPUT_H = 608; static constexpr int INPUT_W = 608; struct YoloKernel { int width; int height; float anchors[CHECK_COUNT*2]; }; static constexpr YoloKernel yolo1 = { INPUT_W / 32, INPUT_H / 32, {116,90, 156,198, 373,326} }; static constexpr YoloKernel yolo2 = { INPUT_W / 16, INPUT_H / 16, {30,61, 62,45, 59,119} }; static constexpr YoloKernel yolo3 = { INPUT_W / 8, INPUT_H / 8, {10,13, 16,30, 33,23} }; static constexpr int LOCATIONS = 4; struct alignas(float) Detection{ //x y w h float bbox[LOCATIONS]; float det_confidence; float class_id; float class_confidence; }; } namespace nvinfer1 { class YoloLayerPlugin: public IPluginV2IOExt { public: explicit YoloLayerPlugin(); YoloLayerPlugin(const void* data, 
size_t length); ~YoloLayerPlugin(); int getNbOutputs() const TRT_NOEXCEPT override { return 1; } Dims getOutputDimensions(int index, const Dims* inputs, int nbInputDims) TRT_NOEXCEPT override; int initialize() TRT_NOEXCEPT override; virtual void terminate() TRT_NOEXCEPT override {}; virtual size_t getWorkspaceSize(int maxBatchSize) const TRT_NOEXCEPT override { return 0;} virtual int enqueue(int batchSize, const void*const * inputs, void*TRT_CONST_ENQUEUE* outputs, void* workspace, cudaStream_t stream) TRT_NOEXCEPT override; virtual size_t getSerializationSize() const TRT_NOEXCEPT override; virtual void serialize(void* buffer) const TRT_NOEXCEPT override; bool supportsFormatCombination(int pos, const PluginTensorDesc* inOut, int nbInputs, int nbOutputs) const TRT_NOEXCEPT override { return inOut[pos].format == TensorFormat::kLINEAR && inOut[pos].type == DataType::kFLOAT; } const char* getPluginType() const TRT_NOEXCEPT override; const char* getPluginVersion() const TRT_NOEXCEPT override; void destroy() TRT_NOEXCEPT override; IPluginV2IOExt* clone() const TRT_NOEXCEPT override; void setPluginNamespace(const char* pluginNamespace) TRT_NOEXCEPT override; const char* getPluginNamespace() const TRT_NOEXCEPT override; DataType getOutputDataType(int index, const nvinfer1::DataType* inputTypes, int nbInputs) const TRT_NOEXCEPT override; bool isOutputBroadcastAcrossBatch(int outputIndex, const bool* inputIsBroadcasted, int nbInputs) const TRT_NOEXCEPT override; bool canBroadcastInputAcrossBatch(int inputIndex) const TRT_NOEXCEPT override; void attachToContext( cudnnContext* cudnnContext, cublasContext* cublasContext, IGpuAllocator* gpuAllocator) TRT_NOEXCEPT override; void configurePlugin(const PluginTensorDesc* in, int nbInput, const PluginTensorDesc* out, int nbOutput) TRT_NOEXCEPT override; void detachFromContext() TRT_NOEXCEPT override; private: void forwardGpu(const float *const * inputs,float * output, cudaStream_t stream,int batchSize = 1); int mClassCount; int 
mKernelCount; std::vector mYoloKernel; int mThreadCount = 256; const char* mPluginNamespace; }; class YoloPluginCreator : public IPluginCreator { public: YoloPluginCreator(); ~YoloPluginCreator() override = default; const char* getPluginName() const TRT_NOEXCEPT override; const char* getPluginVersion() const TRT_NOEXCEPT override; const PluginFieldCollection* getFieldNames() TRT_NOEXCEPT override; IPluginV2IOExt* createPlugin(const char* name, const PluginFieldCollection* fc) TRT_NOEXCEPT override; IPluginV2IOExt* deserializePlugin(const char* name, const void* serialData, size_t serialLength) TRT_NOEXCEPT override; void setPluginNamespace(const char* libNamespace) TRT_NOEXCEPT override { mNamespace = libNamespace; } const char* getPluginNamespace() const TRT_NOEXCEPT override { return mNamespace.c_str(); } private: std::string mNamespace; static PluginFieldCollection mFC; static std::vector mPluginAttributes; }; REGISTER_TENSORRT_PLUGIN(YoloPluginCreator); }; #endif ================================================ FILE: yolov3/yolov3.cpp ================================================ #include #include #include #include #include #include #include "NvInfer.h" #include "cuda_runtime_api.h" #include "utils.h" #include "logging.h" #include "yololayer.h" #include "calibrator.h" #define USE_FP16 // set USE_INT8 or USE_FP16 or USE_FP32 #define DEVICE 0 // GPU id #define NMS_THRESH 0.4 #define BBOX_CONF_THRESH 0.5 using namespace nvinfer1; // stuff we know about the network and the input/output blobs static const int INPUT_H = Yolo::INPUT_H; static const int INPUT_W = Yolo::INPUT_W; static const int DETECTION_SIZE = sizeof(Yolo::Detection) / sizeof(float); static const int OUTPUT_SIZE = Yolo::MAX_OUTPUT_BBOX_COUNT * DETECTION_SIZE + 1; // we assume the yololayer outputs no more than MAX_OUTPUT_BBOX_COUNT boxes that conf >= 0.1 const char* INPUT_BLOB_NAME = "data"; const char* OUTPUT_BLOB_NAME = "prob"; static Logger gLogger; cv::Rect get_rect(cv::Mat& img, float bbox[4]) 
{ int l, r, t, b; float r_w = INPUT_W / (img.cols * 1.0); float r_h = INPUT_H / (img.rows * 1.0); if (r_h > r_w) { l = bbox[0] - bbox[2]/2.f; r = bbox[0] + bbox[2]/2.f; t = bbox[1] - bbox[3]/2.f - (INPUT_H - r_w * img.rows) / 2; b = bbox[1] + bbox[3]/2.f - (INPUT_H - r_w * img.rows) / 2; l = l / r_w; r = r / r_w; t = t / r_w; b = b / r_w; } else { l = bbox[0] - bbox[2]/2.f - (INPUT_W - r_h * img.cols) / 2; r = bbox[0] + bbox[2]/2.f - (INPUT_W - r_h * img.cols) / 2; t = bbox[1] - bbox[3]/2.f; b = bbox[1] + bbox[3]/2.f; l = l / r_h; r = r / r_h; t = t / r_h; b = b / r_h; } return cv::Rect(l, t, r-l, b-t); } float iou(float lbox[4], float rbox[4]) { float interBox[] = { std::max(lbox[0] - lbox[2]/2.f , rbox[0] - rbox[2]/2.f), //left std::min(lbox[0] + lbox[2]/2.f , rbox[0] + rbox[2]/2.f), //right std::max(lbox[1] - lbox[3]/2.f , rbox[1] - rbox[3]/2.f), //top std::min(lbox[1] + lbox[3]/2.f , rbox[1] + rbox[3]/2.f), //bottom }; if(interBox[2] > interBox[3] || interBox[0] > interBox[1]) return 0.0f; float interBoxS =(interBox[1]-interBox[0])*(interBox[3]-interBox[2]); return interBoxS/(lbox[2]*lbox[3] + rbox[2]*rbox[3] -interBoxS); } bool cmp(const Yolo::Detection& a, const Yolo::Detection& b) { return a.det_confidence > b.det_confidence; } void nms(std::vector& res, float *output, float nms_thresh = NMS_THRESH) { std::map> m; for (int i = 0; i < output[0] && i < 1000; i++) { if (output[1 + 7 * i + 4] <= BBOX_CONF_THRESH) continue; Yolo::Detection det; memcpy(&det, &output[1 + 7 * i], 7 * sizeof(float)); if (m.count(det.class_id) == 0) m.emplace(det.class_id, std::vector()); m[det.class_id].push_back(det); } for (auto it = m.begin(); it != m.end(); it++) { //std::cout << it->second[0].class_id << " --- " << std::endl; auto& dets = it->second; std::sort(dets.begin(), dets.end(), cmp); for (size_t m = 0; m < dets.size(); ++m) { auto& item = dets[m]; res.push_back(item); for (size_t n = m + 1; n < dets.size(); ++n) { if (iou(item.bbox, dets[n].bbox) > nms_thresh) { 
dets.erase(dets.begin()+n); --n; } } } } } // TensorRT weight files have a simple space delimited format: // [type] [size] std::map loadWeights(const std::string file) { std::cout << "Loading weights: " << file << std::endl; std::map weightMap; // Open weights file std::ifstream input(file); assert(input.is_open() && "Unable to load weight file."); // Read number of weight blobs int32_t count; input >> count; assert(count > 0 && "Invalid weight map file."); while (count--) { Weights wt{DataType::kFLOAT, nullptr, 0}; uint32_t size; // Read name and type of blob std::string name; input >> name >> std::dec >> size; wt.type = DataType::kFLOAT; // Load blob uint32_t* val = reinterpret_cast(malloc(sizeof(val) * size)); for (uint32_t x = 0, y = size; x < y; ++x) { input >> std::hex >> val[x]; } wt.values = val; wt.count = size; weightMap[name] = wt; } return weightMap; } IScaleLayer* addBatchNorm2d(INetworkDefinition *network, std::map& weightMap, ITensor& input, std::string lname, float eps) { float *gamma = (float*)weightMap[lname + ".weight"].values; float *beta = (float*)weightMap[lname + ".bias"].values; float *mean = (float*)weightMap[lname + ".running_mean"].values; float *var = (float*)weightMap[lname + ".running_var"].values; int len = weightMap[lname + ".running_var"].count; float *scval = reinterpret_cast(malloc(sizeof(float) * len)); for (int i = 0; i < len; i++) { scval[i] = gamma[i] / sqrt(var[i] + eps); } Weights scale{DataType::kFLOAT, scval, len}; float *shval = reinterpret_cast(malloc(sizeof(float) * len)); for (int i = 0; i < len; i++) { shval[i] = beta[i] - mean[i] * gamma[i] / sqrt(var[i] + eps); } Weights shift{DataType::kFLOAT, shval, len}; float *pval = reinterpret_cast(malloc(sizeof(float) * len)); for (int i = 0; i < len; i++) { pval[i] = 1.0; } Weights power{DataType::kFLOAT, pval, len}; weightMap[lname + ".scale"] = scale; weightMap[lname + ".shift"] = shift; weightMap[lname + ".power"] = power; IScaleLayer* scale_1 = network->addScale(input, 
ScaleMode::kCHANNEL, shift, scale, power); assert(scale_1); return scale_1; } ILayer* convBnLeaky(INetworkDefinition *network, std::map& weightMap, ITensor& input, int outch, int ksize, int s, int p, int linx) { Weights emptywts{DataType::kFLOAT, nullptr, 0}; IConvolutionLayer* conv1 = network->addConvolutionNd(input, outch, DimsHW{ksize, ksize}, weightMap["module_list." + std::to_string(linx) + ".Conv2d.weight"], emptywts); assert(conv1); conv1->setStrideNd(DimsHW{s, s}); conv1->setPaddingNd(DimsHW{p, p}); IScaleLayer* bn1 = addBatchNorm2d(network, weightMap, *conv1->getOutput(0), "module_list." + std::to_string(linx) + ".BatchNorm2d", 1e-5); auto lr = network->addActivation(*bn1->getOutput(0), ActivationType::kLEAKY_RELU); lr->setAlpha(0.1); return lr; } // Creat the engine using only the API and not any parser. ICudaEngine* createEngine(unsigned int maxBatchSize, IBuilder* builder, IBuilderConfig* config, DataType dt) { INetworkDefinition* network = builder->createNetworkV2(0U); // Create input tensor of shape {3, INPUT_H, INPUT_W} with name INPUT_BLOB_NAME ITensor* data = network->addInput(INPUT_BLOB_NAME, dt, Dims3{3, INPUT_H, INPUT_W}); assert(data); std::map weightMap = loadWeights("../yolov3.wts"); Weights emptywts{DataType::kFLOAT, nullptr, 0}; // Yeah I am stupid, I just want to expand the complete arch of darknet.. 
auto lr0 = convBnLeaky(network, weightMap, *data, 32, 3, 1, 1, 0); auto lr1 = convBnLeaky(network, weightMap, *lr0->getOutput(0), 64, 3, 2, 1, 1); auto lr2 = convBnLeaky(network, weightMap, *lr1->getOutput(0), 32, 1, 1, 0, 2); auto lr3 = convBnLeaky(network, weightMap, *lr2->getOutput(0), 64, 3, 1, 1, 3); auto ew4 = network->addElementWise(*lr3->getOutput(0), *lr1->getOutput(0), ElementWiseOperation::kSUM); auto lr5 = convBnLeaky(network, weightMap, *ew4->getOutput(0), 128, 3, 2, 1, 5); auto lr6 = convBnLeaky(network, weightMap, *lr5->getOutput(0), 64, 1, 1, 0, 6); auto lr7 = convBnLeaky(network, weightMap, *lr6->getOutput(0), 128, 3, 1, 1, 7); auto ew8 = network->addElementWise(*lr7->getOutput(0), *lr5->getOutput(0), ElementWiseOperation::kSUM); auto lr9 = convBnLeaky(network, weightMap, *ew8->getOutput(0), 64, 1, 1, 0, 9); auto lr10 = convBnLeaky(network, weightMap, *lr9->getOutput(0), 128, 3, 1, 1, 10); auto ew11 = network->addElementWise(*lr10->getOutput(0), *ew8->getOutput(0), ElementWiseOperation::kSUM); auto lr12 = convBnLeaky(network, weightMap, *ew11->getOutput(0), 256, 3, 2, 1, 12); auto lr13 = convBnLeaky(network, weightMap, *lr12->getOutput(0), 128, 1, 1, 0, 13); auto lr14 = convBnLeaky(network, weightMap, *lr13->getOutput(0), 256, 3, 1, 1, 14); auto ew15 = network->addElementWise(*lr14->getOutput(0), *lr12->getOutput(0), ElementWiseOperation::kSUM); auto lr16 = convBnLeaky(network, weightMap, *ew15->getOutput(0), 128, 1, 1, 0, 16); auto lr17 = convBnLeaky(network, weightMap, *lr16->getOutput(0), 256, 3, 1, 1, 17); auto ew18 = network->addElementWise(*lr17->getOutput(0), *ew15->getOutput(0), ElementWiseOperation::kSUM); auto lr19 = convBnLeaky(network, weightMap, *ew18->getOutput(0), 128, 1, 1, 0, 19); auto lr20 = convBnLeaky(network, weightMap, *lr19->getOutput(0), 256, 3, 1, 1, 20); auto ew21 = network->addElementWise(*lr20->getOutput(0), *ew18->getOutput(0), ElementWiseOperation::kSUM); auto lr22 = convBnLeaky(network, weightMap, *ew21->getOutput(0), 
128, 1, 1, 0, 22); auto lr23 = convBnLeaky(network, weightMap, *lr22->getOutput(0), 256, 3, 1, 1, 23); auto ew24 = network->addElementWise(*lr23->getOutput(0), *ew21->getOutput(0), ElementWiseOperation::kSUM); auto lr25 = convBnLeaky(network, weightMap, *ew24->getOutput(0), 128, 1, 1, 0, 25); auto lr26 = convBnLeaky(network, weightMap, *lr25->getOutput(0), 256, 3, 1, 1, 26); auto ew27 = network->addElementWise(*lr26->getOutput(0), *ew24->getOutput(0), ElementWiseOperation::kSUM); auto lr28 = convBnLeaky(network, weightMap, *ew27->getOutput(0), 128, 1, 1, 0, 28); auto lr29 = convBnLeaky(network, weightMap, *lr28->getOutput(0), 256, 3, 1, 1, 29); auto ew30 = network->addElementWise(*lr29->getOutput(0), *ew27->getOutput(0), ElementWiseOperation::kSUM); auto lr31 = convBnLeaky(network, weightMap, *ew30->getOutput(0), 128, 1, 1, 0, 31); auto lr32 = convBnLeaky(network, weightMap, *lr31->getOutput(0), 256, 3, 1, 1, 32); auto ew33 = network->addElementWise(*lr32->getOutput(0), *ew30->getOutput(0), ElementWiseOperation::kSUM); auto lr34 = convBnLeaky(network, weightMap, *ew33->getOutput(0), 128, 1, 1, 0, 34); auto lr35 = convBnLeaky(network, weightMap, *lr34->getOutput(0), 256, 3, 1, 1, 35); auto ew36 = network->addElementWise(*lr35->getOutput(0), *ew33->getOutput(0), ElementWiseOperation::kSUM); auto lr37 = convBnLeaky(network, weightMap, *ew36->getOutput(0), 512, 3, 2, 1, 37); auto lr38 = convBnLeaky(network, weightMap, *lr37->getOutput(0), 256, 1, 1, 0, 38); auto lr39 = convBnLeaky(network, weightMap, *lr38->getOutput(0), 512, 3, 1, 1, 39); auto ew40 = network->addElementWise(*lr39->getOutput(0), *lr37->getOutput(0), ElementWiseOperation::kSUM); auto lr41 = convBnLeaky(network, weightMap, *ew40->getOutput(0), 256, 1, 1, 0, 41); auto lr42 = convBnLeaky(network, weightMap, *lr41->getOutput(0), 512, 3, 1, 1, 42); auto ew43 = network->addElementWise(*lr42->getOutput(0), *ew40->getOutput(0), ElementWiseOperation::kSUM); auto lr44 = convBnLeaky(network, weightMap, 
*ew43->getOutput(0), 256, 1, 1, 0, 44); auto lr45 = convBnLeaky(network, weightMap, *lr44->getOutput(0), 512, 3, 1, 1, 45); auto ew46 = network->addElementWise(*lr45->getOutput(0), *ew43->getOutput(0), ElementWiseOperation::kSUM); auto lr47 = convBnLeaky(network, weightMap, *ew46->getOutput(0), 256, 1, 1, 0, 47); auto lr48 = convBnLeaky(network, weightMap, *lr47->getOutput(0), 512, 3, 1, 1, 48); auto ew49 = network->addElementWise(*lr48->getOutput(0), *ew46->getOutput(0), ElementWiseOperation::kSUM); auto lr50 = convBnLeaky(network, weightMap, *ew49->getOutput(0), 256, 1, 1, 0, 50); auto lr51 = convBnLeaky(network, weightMap, *lr50->getOutput(0), 512, 3, 1, 1, 51); auto ew52 = network->addElementWise(*lr51->getOutput(0), *ew49->getOutput(0), ElementWiseOperation::kSUM); auto lr53 = convBnLeaky(network, weightMap, *ew52->getOutput(0), 256, 1, 1, 0, 53); auto lr54 = convBnLeaky(network, weightMap, *lr53->getOutput(0), 512, 3, 1, 1, 54); auto ew55 = network->addElementWise(*lr54->getOutput(0), *ew52->getOutput(0), ElementWiseOperation::kSUM); auto lr56 = convBnLeaky(network, weightMap, *ew55->getOutput(0), 256, 1, 1, 0, 56); auto lr57 = convBnLeaky(network, weightMap, *lr56->getOutput(0), 512, 3, 1, 1, 57); auto ew58 = network->addElementWise(*lr57->getOutput(0), *ew55->getOutput(0), ElementWiseOperation::kSUM); auto lr59 = convBnLeaky(network, weightMap, *ew58->getOutput(0), 256, 1, 1, 0, 59); auto lr60 = convBnLeaky(network, weightMap, *lr59->getOutput(0), 512, 3, 1, 1, 60); auto ew61 = network->addElementWise(*lr60->getOutput(0), *ew58->getOutput(0), ElementWiseOperation::kSUM); auto lr62 = convBnLeaky(network, weightMap, *ew61->getOutput(0), 1024, 3, 2, 1, 62); auto lr63 = convBnLeaky(network, weightMap, *lr62->getOutput(0), 512, 1, 1, 0, 63); auto lr64 = convBnLeaky(network, weightMap, *lr63->getOutput(0), 1024, 3, 1, 1, 64); auto ew65 = network->addElementWise(*lr64->getOutput(0), *lr62->getOutput(0), ElementWiseOperation::kSUM); auto lr66 = convBnLeaky(network, 
weightMap, *ew65->getOutput(0), 512, 1, 1, 0, 66); auto lr67 = convBnLeaky(network, weightMap, *lr66->getOutput(0), 1024, 3, 1, 1, 67); auto ew68 = network->addElementWise(*lr67->getOutput(0), *ew65->getOutput(0), ElementWiseOperation::kSUM); auto lr69 = convBnLeaky(network, weightMap, *ew68->getOutput(0), 512, 1, 1, 0, 69); auto lr70 = convBnLeaky(network, weightMap, *lr69->getOutput(0), 1024, 3, 1, 1, 70); auto ew71 = network->addElementWise(*lr70->getOutput(0), *ew68->getOutput(0), ElementWiseOperation::kSUM); auto lr72 = convBnLeaky(network, weightMap, *ew71->getOutput(0), 512, 1, 1, 0, 72); auto lr73 = convBnLeaky(network, weightMap, *lr72->getOutput(0), 1024, 3, 1, 1, 73); auto ew74 = network->addElementWise(*lr73->getOutput(0), *ew71->getOutput(0), ElementWiseOperation::kSUM); auto lr75 = convBnLeaky(network, weightMap, *ew74->getOutput(0), 512, 1, 1, 0, 75); auto lr76 = convBnLeaky(network, weightMap, *lr75->getOutput(0), 1024, 3, 1, 1, 76); auto lr77 = convBnLeaky(network, weightMap, *lr76->getOutput(0), 512, 1, 1, 0, 77); auto lr78 = convBnLeaky(network, weightMap, *lr77->getOutput(0), 1024, 3, 1, 1, 78); auto lr79 = convBnLeaky(network, weightMap, *lr78->getOutput(0), 512, 1, 1, 0, 79); auto lr80 = convBnLeaky(network, weightMap, *lr79->getOutput(0), 1024, 3, 1, 1, 80); IConvolutionLayer* conv81 = network->addConvolutionNd(*lr80->getOutput(0), 3 * (Yolo::CLASS_NUM + 5), DimsHW{1, 1}, weightMap["module_list.81.Conv2d.weight"], weightMap["module_list.81.Conv2d.bias"]); assert(conv81); // 82 is yolo auto l83 = lr79; auto lr84 = convBnLeaky(network, weightMap, *l83->getOutput(0), 256, 1, 1, 0, 84); float *deval = reinterpret_cast(malloc(sizeof(float) * 256 * 2 * 2)); for (int i = 0; i < 256 * 2 * 2; i++) { deval[i] = 1.0; } Weights deconvwts85{DataType::kFLOAT, deval, 256 * 2 * 2}; IDeconvolutionLayer* deconv85 = network->addDeconvolutionNd(*lr84->getOutput(0), 256, DimsHW{2, 2}, deconvwts85, emptywts); assert(deconv85); deconv85->setStrideNd(DimsHW{2, 2}); 
deconv85->setNbGroups(256); weightMap["deconv85"] = deconvwts85; ITensor* inputTensors[] = {deconv85->getOutput(0), ew61->getOutput(0)}; auto cat86 = network->addConcatenation(inputTensors, 2); auto lr87 = convBnLeaky(network, weightMap, *cat86->getOutput(0), 256, 1, 1, 0, 87); auto lr88 = convBnLeaky(network, weightMap, *lr87->getOutput(0), 512, 3, 1, 1, 88); auto lr89 = convBnLeaky(network, weightMap, *lr88->getOutput(0), 256, 1, 1, 0, 89); auto lr90 = convBnLeaky(network, weightMap, *lr89->getOutput(0), 512, 3, 1, 1, 90); auto lr91 = convBnLeaky(network, weightMap, *lr90->getOutput(0), 256, 1, 1, 0, 91); auto lr92 = convBnLeaky(network, weightMap, *lr91->getOutput(0), 512, 3, 1, 1, 92); IConvolutionLayer* conv93 = network->addConvolutionNd(*lr92->getOutput(0), 3 * (Yolo::CLASS_NUM + 5), DimsHW{1, 1}, weightMap["module_list.93.Conv2d.weight"], weightMap["module_list.93.Conv2d.bias"]); assert(conv93); // 94 is yolo auto l95 = lr91; auto lr96 = convBnLeaky(network, weightMap, *l95->getOutput(0), 128, 1, 1, 0, 96); Weights deconvwts97{DataType::kFLOAT, deval, 128 * 2 * 2}; IDeconvolutionLayer* deconv97 = network->addDeconvolutionNd(*lr96->getOutput(0), 128, DimsHW{2, 2}, deconvwts97, emptywts); assert(deconv97); deconv97->setStrideNd(DimsHW{2, 2}); deconv97->setNbGroups(128); ITensor* inputTensors1[] = {deconv97->getOutput(0), ew36->getOutput(0)}; auto cat98 = network->addConcatenation(inputTensors1, 2); auto lr99 = convBnLeaky(network, weightMap, *cat98->getOutput(0), 128, 1, 1, 0, 99); auto lr100 = convBnLeaky(network, weightMap, *lr99->getOutput(0), 256, 3, 1, 1, 100); auto lr101 = convBnLeaky(network, weightMap, *lr100->getOutput(0), 128, 1, 1, 0, 101); auto lr102 = convBnLeaky(network, weightMap, *lr101->getOutput(0), 256, 3, 1, 1, 102); auto lr103 = convBnLeaky(network, weightMap, *lr102->getOutput(0), 128, 1, 1, 0, 103); auto lr104 = convBnLeaky(network, weightMap, *lr103->getOutput(0), 256, 3, 1, 1, 104); IConvolutionLayer* conv105 = 
network->addConvolutionNd(*lr104->getOutput(0), 3 * (Yolo::CLASS_NUM + 5), DimsHW{1, 1}, weightMap["module_list.105.Conv2d.weight"], weightMap["module_list.105.Conv2d.bias"]); assert(conv105); auto creator = getPluginRegistry()->getPluginCreator("YoloLayer_TRT", "1"); const PluginFieldCollection* pluginData = creator->getFieldNames(); IPluginV2 *pluginObj = creator->createPlugin("yololayer", pluginData); ITensor* inputTensors_yolo[] = {conv81->getOutput(0), conv93->getOutput(0), conv105->getOutput(0)}; auto yolo = network->addPluginV2(inputTensors_yolo, 3, *pluginObj); yolo->getOutput(0)->setName(OUTPUT_BLOB_NAME); network->markOutput(*yolo->getOutput(0)); // Build engine builder->setMaxBatchSize(maxBatchSize); config->setMaxWorkspaceSize(16 * (1 << 20)); // 16MB #if defined(USE_FP16) config->setFlag(BuilderFlag::kFP16); #elif defined(USE_INT8) std::cout << "Your platform support int8: " << (builder->platformHasFastInt8() ? "true" : "false") << std::endl; assert(builder->platformHasFastInt8()); config->setFlag(BuilderFlag::kINT8); Int8EntropyCalibrator2 *calibrator = new Int8EntropyCalibrator2(1, INPUT_W, INPUT_H, "./coco_calib/", "int8calib.table", INPUT_BLOB_NAME); config->setInt8Calibrator(calibrator); #endif std::cout << "Building engine, please wait for a while..." << std::endl; ICudaEngine* engine = builder->buildEngineWithConfig(*network, *config); std::cout << "Build engine successfully!" 
<< std::endl; // Don't need the network any more network->destroy(); // Release host memory for (auto& mem : weightMap) { free((void*) (mem.second.values)); } return engine; } void APIToModel(unsigned int maxBatchSize, IHostMemory** modelStream) { // Create builder IBuilder* builder = createInferBuilder(gLogger); IBuilderConfig* config = builder->createBuilderConfig(); // Create model to populate the network, then set the outputs and create an engine ICudaEngine* engine = createEngine(maxBatchSize, builder, config, DataType::kFLOAT); assert(engine != nullptr); // Serialize the engine (*modelStream) = engine->serialize(); // Close everything down engine->destroy(); builder->destroy(); } void doInference(IExecutionContext& context, float* input, float* output, int batchSize) { const ICudaEngine& engine = context.getEngine(); // Pointers to input and output device buffers to pass to engine. // Engine requires exactly IEngine::getNbBindings() number of buffers. assert(engine.getNbBindings() == 2); void* buffers[2]; // In order to bind the buffers, we need to know the names of the input and output tensors. 
// Note that indices are guaranteed to be less than IEngine::getNbBindings() const int inputIndex = engine.getBindingIndex(INPUT_BLOB_NAME); const int outputIndex = engine.getBindingIndex(OUTPUT_BLOB_NAME); // Create GPU buffers on device CUDA_CHECK(cudaMalloc(&buffers[inputIndex], batchSize * 3 * INPUT_H * INPUT_W * sizeof(float))); CUDA_CHECK(cudaMalloc(&buffers[outputIndex], batchSize * OUTPUT_SIZE * sizeof(float))); // Create stream cudaStream_t stream; CUDA_CHECK(cudaStreamCreate(&stream)); // DMA input batch data to device, infer on the batch asynchronously, and DMA output back to host CUDA_CHECK(cudaMemcpyAsync(buffers[inputIndex], input, batchSize * 3 * INPUT_H * INPUT_W * sizeof(float), cudaMemcpyHostToDevice, stream)); context.enqueue(batchSize, buffers, stream, nullptr); CUDA_CHECK(cudaMemcpyAsync(output, buffers[outputIndex], batchSize * OUTPUT_SIZE * sizeof(float), cudaMemcpyDeviceToHost, stream)); cudaStreamSynchronize(stream); // Release stream and buffers cudaStreamDestroy(stream); CUDA_CHECK(cudaFree(buffers[inputIndex])); CUDA_CHECK(cudaFree(buffers[outputIndex])); } int main(int argc, char** argv) { cudaSetDevice(DEVICE); // create a model using the API directly and serialize it to a stream char *trtModelStream{nullptr}; size_t size{0}; if (argc == 2 && std::string(argv[1]) == "-s") { IHostMemory* modelStream{nullptr}; APIToModel(1, &modelStream); assert(modelStream != nullptr); std::ofstream p("yolov3.engine", std::ios::binary); if (!p) { std::cerr << "could not open plan output file" << std::endl; return -1; } p.write(reinterpret_cast(modelStream->data()), modelStream->size()); modelStream->destroy(); return 0; } else if (argc == 3 && std::string(argv[1]) == "-d") { std::ifstream file("yolov3.engine", std::ios::binary); if (file.good()) { file.seekg(0, file.end); size = file.tellg(); file.seekg(0, file.beg); trtModelStream = new char[size]; assert(trtModelStream); file.read(trtModelStream, size); file.close(); } } else { std::cerr << "arguments 
not right!" << std::endl; std::cerr << "./yolov3 -s // serialize model to plan file" << std::endl; std::cerr << "./yolov3 -d ../samples // deserialize plan file and run inference" << std::endl; return -1; } std::vector file_names; if (read_files_in_dir(argv[2], file_names) < 0) { std::cout << "read_files_in_dir failed." << std::endl; return -1; } // prepare input data --------------------------- static float data[3 * INPUT_H * INPUT_W]; //for (int i = 0; i < 3 * INPUT_H * INPUT_W; i++) // data[i] = 1.0; static float prob[OUTPUT_SIZE]; IRuntime* runtime = createInferRuntime(gLogger); assert(runtime != nullptr); ICudaEngine* engine = runtime->deserializeCudaEngine(trtModelStream, size); assert(engine != nullptr); IExecutionContext* context = engine->createExecutionContext(); assert(context != nullptr); delete[] trtModelStream; int fcount = 0; for (auto f: file_names) { fcount++; std::cout << fcount << " " << f << std::endl; cv::Mat img = cv::imread(std::string(argv[2]) + "/" + f); if (img.empty()) continue; cv::Mat pr_img = preprocess_img(img, INPUT_W, INPUT_H); for (int i = 0; i < INPUT_H * INPUT_W; i++) { data[i] = pr_img.at(i)[2] / 255.0; data[i + INPUT_H * INPUT_W] = pr_img.at(i)[1] / 255.0; data[i + 2 * INPUT_H * INPUT_W] = pr_img.at(i)[0] / 255.0; } // Run inference auto start = std::chrono::system_clock::now(); doInference(*context, data, prob, 1); auto end = std::chrono::system_clock::now(); std::cout << std::chrono::duration_cast(end - start).count() << "ms" << std::endl; std::vector res; nms(res, prob); for (size_t j = 0; j < res.size(); j++) { cv::Rect r = get_rect(img, res[j].bbox); cv::rectangle(img, r, cv::Scalar(0x27, 0xC1, 0x36), 2); cv::putText(img, std::to_string((int)res[j].class_id), cv::Point(r.x, r.y - 1), cv::FONT_HERSHEY_PLAIN, 1.2, cv::Scalar(0xFF, 0xFF, 0xFF), 2); } cv::imwrite("_" + f, img); } // Destroy the engine context->destroy(); engine->destroy(); runtime->destroy(); return 0; } ================================================ FILE: 
class YoLov3TRT(object):
    """
    description: A YOLOv3 class that wraps TensorRT ops, preprocess and postprocess ops.
    """

    def __init__(self, engine_file_path):
        """
        description: Create a CUDA context, deserialize the engine and allocate
                     one host/device buffer pair per engine binding.
        param:
            engine_file_path: str, path of the serialized TensorRT engine
        """
        # Create a Context on this device,
        self.ctx = cuda.Device(0).make_context()
        stream = cuda.Stream()
        TRT_LOGGER = trt.Logger(trt.Logger.INFO)
        runtime = trt.Runtime(TRT_LOGGER)

        # Deserialize the engine from file
        with open(engine_file_path, "rb") as f:
            engine = runtime.deserialize_cuda_engine(f.read())
        context = engine.create_execution_context()

        host_inputs = []
        cuda_inputs = []
        host_outputs = []
        cuda_outputs = []
        bindings = []

        for binding in engine:
            print('bingding:', binding, engine.get_binding_shape(binding))
            # Buffers are sized for a full batch
            size = trt.volume(engine.get_binding_shape(binding)) * engine.max_batch_size
            dtype = trt.nptype(engine.get_binding_dtype(binding))
            # Allocate host and device buffers
            host_mem = cuda.pagelocked_empty(size, dtype)
            cuda_mem = cuda.mem_alloc(host_mem.nbytes)
            # Append the device buffer to device bindings.
            bindings.append(int(cuda_mem))
            # Append to the appropriate list.
            if engine.binding_is_input(binding):
                # The last two dimensions of the input binding are used as H and W
                self.input_w = engine.get_binding_shape(binding)[-1]
                self.input_h = engine.get_binding_shape(binding)[-2]
                host_inputs.append(host_mem)
                cuda_inputs.append(cuda_mem)
            else:
                host_outputs.append(host_mem)
                cuda_outputs.append(cuda_mem)

        # Store
        self.stream = stream
        self.context = context
        self.engine = engine
        self.host_inputs = host_inputs
        self.cuda_inputs = cuda_inputs
        self.host_outputs = host_outputs
        self.cuda_outputs = cuda_outputs
        self.bindings = bindings
        self.batch_size = engine.max_batch_size

    def infer(self, raw_image_generator):
        """
        description: Run detection on up to batch_size images and draw the
                     detections on them.
        param:
            raw_image_generator: iterable of BGR images (at most batch_size)
        return:
            batch_image_raw: list of the input images with boxes drawn on them
            use_time: seconds spent on host->device copy, inference and
                      device->host copy
        note:
            Labels are looked up in the module-level `categories` list, which
            this file defines under `if __name__ == "__main__":`.
        """
        # Make self the active context, pushing it on top of the context stack.
        self.ctx.push()
        try:
            # Do image preprocess
            batch_image_raw = []
            batch_origin_h = []
            batch_origin_w = []
            # Zero-filled so that unused slots of a partially filled batch
            # hold defined values instead of uninitialized memory.
            batch_input_image = np.zeros(shape=[self.batch_size, 3, self.input_h, self.input_w])
            for i, image_raw in enumerate(raw_image_generator):
                input_image, image_raw, origin_h, origin_w = self.preprocess_image(image_raw)
                batch_image_raw.append(image_raw)
                batch_origin_h.append(origin_h)
                batch_origin_w.append(origin_w)
                np.copyto(batch_input_image[i], input_image)
            batch_input_image = np.ascontiguousarray(batch_input_image)

            # Copy input image to host buffer
            np.copyto(self.host_inputs[0], batch_input_image.ravel())
            start = time.time()
            # Transfer input data to the GPU.
            cuda.memcpy_htod_async(self.cuda_inputs[0], self.host_inputs[0], self.stream)
            # Run inference.
            self.context.execute_async(
                batch_size=self.batch_size, bindings=self.bindings, stream_handle=self.stream.handle
            )
            # Transfer predictions back from the GPU.
            cuda.memcpy_dtoh_async(self.host_outputs[0], self.cuda_outputs[0], self.stream)
            # Synchronize the stream
            self.stream.synchronize()
            end = time.time()
        finally:
            # Remove the context from the top of the context stack even when
            # preprocessing or inference raised, so the stack stays balanced.
            self.ctx.pop()

        output = self.host_outputs[0]
        # Floats per image in the output buffer: 1 detection count followed by
        # 1000 rows of 7 values (see post_process).
        stride = 7001
        # Do postprocess, only for the images that were actually supplied; the
        # last batch of a directory may be shorter than batch_size.
        for i in range(len(batch_image_raw)):
            result_boxes, result_scores, result_classid = self.post_process(
                output[i * stride: (i + 1) * stride], batch_origin_h[i], batch_origin_w[i]
            )
            # Draw rectangles and labels on the original image
            for j in range(len(result_boxes)):
                box = result_boxes[j]
                plot_one_box(
                    box,
                    batch_image_raw[i],
                    label="{}:{:.2f}".format(
                        categories[int(result_classid[j])], result_scores[j]
                    ),
                )
        return batch_image_raw, end - start

    def destroy(self):
        """
        description: Pop the CUDA context created in __init__ off the context stack.
        """
        # Remove any context from the top of the context stack, deactivating it.
        self.ctx.pop()

    def get_raw_image(self, image_path_batch):
        """
        description: Read an image from image path
        param:
            image_path_batch: iterable of image paths
        return:
            generator yielding what cv2.imread returns for each path
        """
        for img_path in image_path_batch:
            yield cv2.imread(img_path)

    def get_raw_image_zeros(self, image_path_batch=None):
        """
        description: Ready data for warmup
        param:
            image_path_batch: unused, accepted for symmetry with get_raw_image
        return:
            generator yielding batch_size black images of the network input size
        """
        for _ in range(self.batch_size):
            yield np.zeros([self.input_h, self.input_w, 3], dtype=np.uint8)

    def preprocess_image(self, raw_bgr_image):
        """
        description: Convert BGR image to RGB,
                     resize and pad it to target size, normalize to [0,1],
                     transform to NCHW format.
        param:
            raw_bgr_image: a BGR image as a numpy array of shape (h, w, c)
        return:
            image:  the processed image
            image_raw: the original image
            h: original height
            w: original width
        raise:
            ValueError: if raw_bgr_image is None (e.g. an unreadable image file)
        """
        if raw_bgr_image is None:
            raise ValueError("preprocess_image got None instead of an image")
        image_raw = raw_bgr_image
        h, w, c = image_raw.shape
        image = cv2.cvtColor(image_raw, cv2.COLOR_BGR2RGB)
        # Calculate width and height and paddings
        r_w = self.input_w / w
        r_h = self.input_h / h
        if r_h > r_w:
            # Width is the limiting side: pad top and bottom
            tw = self.input_w
            th = int(r_w * h)
            tx1 = tx2 = 0
            ty1 = int((self.input_h - th) / 2)
            ty2 = self.input_h - th - ty1
        else:
            # Height is the limiting side: pad left and right
            tw = int(r_h * w)
            th = self.input_h
            tx1 = int((self.input_w - tw) / 2)
            tx2 = self.input_w - tw - tx1
            ty1 = ty2 = 0
        # Resize the image with long side while maintaining ratio
        image = cv2.resize(image, (tw, th))
        # Pad the short side with (128,128,128). The colour must be passed as
        # `value`: the positional slot after borderType is `dst`.
        image = cv2.copyMakeBorder(
            image, ty1, ty2, tx1, tx2, cv2.BORDER_CONSTANT, value=(128, 128, 128)
        )
        image = image.astype(np.float32)
        # Normalize to [0,1]
        image /= 255.0
        # HWC to CHW format:
        image = np.transpose(image, [2, 0, 1])
        # CHW to NCHW format
        image = np.expand_dims(image, axis=0)
        # Convert the image to row-major order, also known as "C order":
        image = np.ascontiguousarray(image)
        return image, image_raw, h, w

    def xywh2xyxy(self, origin_h, origin_w, x):
        """
        description:    Convert nx4 boxes from [x, y, w, h] to [x1, y1, x2, y2] where xy1=top-left, xy2=bottom-right,
                        undoing the padding and scaling applied by preprocess_image
        param:
            origin_h:   height of original image
            origin_w:   width of original image
            x:          A boxes numpy, each row is a box [center_x, center_y, w, h]
        return:
            y:          A boxes numpy, each row is a box [x1, y1, x2, y2]
        """
        y = np.zeros_like(x)
        r_w = self.input_w / origin_w
        r_h = self.input_h / origin_h
        if r_h > r_w:
            # Image was padded vertically: remove the top padding from y
            y[:, 0] = x[:, 0] - x[:, 2] / 2
            y[:, 2] = x[:, 0] + x[:, 2] / 2
            y[:, 1] = x[:, 1] - x[:, 3] / 2 - (self.input_h - r_w * origin_h) / 2
            y[:, 3] = x[:, 1] + x[:, 3] / 2 - (self.input_h - r_w * origin_h) / 2
            y /= r_w
        else:
            # Image was padded horizontally: remove the left padding from x
            y[:, 0] = x[:, 0] - x[:, 2] / 2 - (self.input_w - r_h * origin_w) / 2
            y[:, 2] = x[:, 0] + x[:, 2] / 2 - (self.input_w - r_h * origin_w) / 2
            y[:, 1] = x[:, 1] - x[:, 3] / 2
            y[:, 3] = x[:, 1] + x[:, 3] / 2
            y /= r_h

        return y

    def post_process(self, output, origin_h, origin_w):
        """
        description: postprocess the prediction
        param:
            output:     A flat numpy array: output[0] is the number of boxes, followed by
                        7 values per box. Values 0-3 are cx,cy,w,h, value 4 is a confidence,
                        value 5 the class id and value 6 a second score that is multiplied
                        into the confidence (presumably the class confidence).
                        The array is not modified.
            origin_h:   height of original image
            origin_w:   width of original image
        return:
            result_boxes: finally boxes, a boxes numpy, each row is a box [x1, y1, x2, y2]
            result_scores: finally scores, a numpy, each element is the score corresponding to box
            result_classid: finally classid, a numpy, each element is the classid corresponding to box
        """
        # Get the num of boxes detected
        num = int(output[0])
        # Reshape to a two dimensional ndarray. np.array copies the rows, so
        # the in-place multiplication below no longer writes into the caller's
        # buffer (the reshape/slice alone is only a view of it).
        pred = np.array(np.reshape(output[1:], (-1, 7))[:num, :])
        # Combine the two scores into one confidence, then drop the last column
        pred[:, 4] *= pred[:, 6]
        pred = pred[:, :-1]
        # Do nms
        boxes = self.non_max_suppression(pred, origin_h, origin_w, conf_thres=CONF_THRESH, nms_thres=IOU_THRESHOLD)
        result_boxes = boxes[:, :4] if len(boxes) else np.array([])
        result_scores = boxes[:, 4] if len(boxes) else np.array([])
        result_classid = boxes[:, 5] if len(boxes) else np.array([])
        return result_boxes, result_scores, result_classid

    def bbox_iou(self, box1, box2, x1y1x2y2=True):
        """
        description: compute the IoU of two bounding boxes
        param:
            box1: A box coordinate (can be (x1, y1, x2, y2) or (x, y, w, h))
            box2: A box coordinate (can be (x1, y1, x2, y2) or (x, y, w, h))
            x1y1x2y2: select the coordinate format
        return:
            iou: computed iou
        """
        if not x1y1x2y2:
            # Transform from center and width to exact coordinates
            b1_x1, b1_x2 = box1[:, 0] - box1[:, 2] / 2, box1[:, 0] + box1[:, 2] / 2
            b1_y1, b1_y2 = box1[:, 1] - box1[:, 3] / 2, box1[:, 1] + box1[:, 3] / 2
            b2_x1, b2_x2 = box2[:, 0] - box2[:, 2] / 2, box2[:, 0] + box2[:, 2] / 2
            b2_y1, b2_y2 = box2[:, 1] - box2[:, 3] / 2, box2[:, 1] + box2[:, 3] / 2
        else:
            # Get the coordinates of bounding boxes
            b1_x1, b1_y1, b1_x2, b1_y2 = box1[:, 0], box1[:, 1], box1[:, 2], box1[:, 3]
            b2_x1, b2_y1, b2_x2, b2_y2 = box2[:, 0], box2[:, 1], box2[:, 2], box2[:, 3]

        # Get the coordinates of the intersection rectangle
        inter_rect_x1 = np.maximum(b1_x1, b2_x1)
        inter_rect_y1 = np.maximum(b1_y1, b2_y1)
        inter_rect_x2 = np.minimum(b1_x2, b2_x2)
        inter_rect_y2 = np.minimum(b1_y2, b2_y2)
        # Intersection area (the +1 counts both end pixels; negative extents clip to 0)
        inter_area = np.clip(inter_rect_x2 - inter_rect_x1 + 1, 0, None) * \
                     np.clip(inter_rect_y2 - inter_rect_y1 + 1, 0, None)
        # Union Area
        b1_area = (b1_x2 - b1_x1 + 1) * (b1_y2 - b1_y1 + 1)
        b2_area = (b2_x2 - b2_x1 + 1) * (b2_y2 - b2_y1 + 1)

        # The small epsilon avoids a division by zero
        iou = inter_area / (b1_area + b2_area - inter_area + 1e-16)

        return iou

    def non_max_suppression(self, prediction, origin_h, origin_w, conf_thres=0.5, nms_thres=0.4):
        """
        description: Removes detections with lower object confidence score than 'conf_thres' and performs
        Non-Maximum Suppression to further filter detections.
        param:
            prediction: detections, one row per box: (center_x, center_y, w, h, conf, cls_id)
            origin_h: original image height
            origin_w: original image width
            conf_thres: a confidence threshold to filter detections
            nms_thres: a iou threshold to filter detections
        return:
            boxes: output after nms with the shape (x1, y1, x2, y2, conf, cls_id),
                   or an empty array when nothing is kept
        """
        # Get the boxes that score >= conf_thres (boolean indexing returns a copy)
        boxes = prediction[prediction[:, 4] >= conf_thres]
        # Transform bbox from [center_x, center_y, w, h] to [x1, y1, x2, y2]
        boxes[:, :4] = self.xywh2xyxy(origin_h, origin_w, boxes[:, :4])
        # clip the coordinates to the original image
        boxes[:, 0] = np.clip(boxes[:, 0], 0, origin_w - 1)
        boxes[:, 2] = np.clip(boxes[:, 2], 0, origin_w - 1)
        boxes[:, 1] = np.clip(boxes[:, 1], 0, origin_h - 1)
        boxes[:, 3] = np.clip(boxes[:, 3], 0, origin_h - 1)
        # Object confidence
        confs = boxes[:, 4]
        # Sort by the confs, highest first
        boxes = boxes[np.argsort(-confs)]
        # Perform non-maximum suppression
        keep_boxes = []
        while boxes.shape[0]:
            large_overlap = self.bbox_iou(np.expand_dims(boxes[0, :4], 0), boxes[:, :4]) > nms_thres
            label_match = boxes[0, -1] == boxes[:, -1]
            # Indices of boxes with lower confidence scores, large IOUs and matching labels
            # (this always includes boxes[0] itself, so the loop makes progress)
            invalid = large_overlap & label_match
            keep_boxes += [boxes[0]]
            boxes = boxes[~invalid]
        boxes = np.stack(keep_boxes, 0) if len(keep_boxes) else np.array([])
        return boxes
if __name__ == "__main__":
    # Defaults for the custom plugin library and the serialized engine; both
    # can be overridden on the command line: <engine_file_path> [<plugin_library>]
    PLUGIN_LIBRARY = "build/libyololayer.so"
    engine_file_path = "build/yolov3.engine"

    cli_args = sys.argv[1:]
    if len(cli_args) > 0:
        engine_file_path = cli_args[0]
    if len(cli_args) > 1:
        PLUGIN_LIBRARY = cli_args[1]

    # Load the plugin library before the engine is deserialized
    ctypes.CDLL(PLUGIN_LIBRARY)

    # load coco labels (module-level name: YoLov3TRT.infer reads it for labels)
    categories = [
        "person", "bicycle", "car", "motorcycle", "airplane", "bus", "train", "truck", "boat",
        "traffic light", "fire hydrant", "stop sign", "parking meter", "bench", "bird", "cat",
        "dog", "horse", "sheep", "cow", "elephant", "bear", "zebra", "giraffe", "backpack",
        "umbrella", "handbag", "tie", "suitcase", "frisbee", "skis", "snowboard", "sports ball",
        "kite", "baseball bat", "baseball glove", "skateboard", "surfboard", "tennis racket",
        "bottle", "wine glass", "cup", "fork", "knife", "spoon", "bowl", "banana", "apple",
        "sandwich", "orange", "broccoli", "carrot", "hot dog", "pizza", "donut", "cake", "chair",
        "couch", "potted plant", "bed", "dining table", "toilet", "tv", "laptop", "mouse",
        "remote", "keyboard", "cell phone", "microwave", "oven", "toaster", "sink",
        "refrigerator", "book", "clock", "vase", "scissors", "teddy bear", "hair drier",
        "toothbrush",
    ]

    # Start from an empty output directory
    if os.path.exists('output/'):
        shutil.rmtree('output/')
    os.makedirs('output/')

    # a YoLov3TRT instance
    yolov3_wrapper = YoLov3TRT(engine_file_path)
    try:
        print('batch size is', yolov3_wrapper.batch_size)

        image_dir = "samples/"
        image_path_batches = get_img_path_batches(yolov3_wrapper.batch_size, image_dir)

        # Ten warm-up runs, each in its own thread and run to completion
        for _ in range(10):
            warmup_thread = warmUpThread(yolov3_wrapper)
            warmup_thread.start()
            warmup_thread.join()

        # One inference thread per batch, again run one after another
        for path_batch in image_path_batches:
            infer_thread = inferThread(yolov3_wrapper, path_batch)
            infer_thread.start()
            infer_thread.join()
    finally:
        # destroy the instance
        yolov3_wrapper.destroy()
## Config

- The number of classes is defined in yololayer.h
- FP16/FP32 is selected by a macro in yolov3-spp.cpp
- The GPU id is selected by a macro in yolov3-spp.cpp
- The NMS threshold is defined in yolov3-spp.cpp
- The bbox confidence threshold is defined in yolov3-spp.cpp
- The MIN and MAX input sizes are defined in yolov3-spp.cpp
- The optimization width and height for the IOptimizationProfile are defined in yolov3-spp.cpp

## How to Run

1. Generate yolov3-spp_ultralytics68.wts from the PyTorch implementation with yolov3-spp.cfg and yolov3-spp-ultralytics.pt, or download the .wts file from the model zoo.

```
git clone https://github.com/wang-xinyu/tensorrtx.git
git clone -b archive https://github.com/ultralytics/yolov3.git
// download its weights 'yolov3-spp-ultralytics.pt'
// copy gen_wts.py from tensorrtx/yolov3-spp/ to ultralytics/yolov3/
// go to ultralytics/yolov3/
python gen_wts.py yolov3-spp-ultralytics.pt
// a file 'yolov3-spp_ultralytics68.wts' will be generated.
// the master branch of yolov3 should work; if not, you can check out 4ac60018f6e6c1e24b496485f126a660d9c793d8
```

2. Build tensorrtx/yolov3-spp and run it.

```
// put yolov3-spp_ultralytics68.wts into tensorrtx/yolov3-spp/
// go to tensorrtx/yolov3-spp/
mkdir build
cd build
cmake ..
make
sudo ./yolov3-spp -s             // serialize model to plan file i.e. 'yolov3-spp.engine'
sudo ./yolov3-spp -d ../samples  // deserialize plan file and run inference, the images in samples will be processed.
```

3. Check the generated images, _zidane.jpg and _bus.jpg, shown below.

## More Information See the readme in [home page.](https://github.com/wang-xinyu/tensorrtx) ================================================ FILE: yolov3-spp/Utils.h ================================================ #ifndef __TRT_UTILS_H_ #define __TRT_UTILS_H_ #include #include #include #include #ifndef CUDA_CHECK #define CUDA_CHECK(callstr) \ { \ cudaError_t error_code = callstr; \ if (error_code != cudaSuccess) { \ std::cerr << "CUDA error " << error_code << " at " << __FILE__ << ":" << __LINE__; \ assert(0); \ } \ } #endif namespace Tn { class Profiler : public nvinfer1::IProfiler { public: void printLayerTimes(int itrationsTimes) { float totalTime = 0; for (size_t i = 0; i < mProfile.size(); i++) { printf("%-40.40s %4.3fms\n", mProfile[i].first.c_str(), mProfile[i].second / itrationsTimes); totalTime += mProfile[i].second; } printf("Time over all layers: %4.3f\n", totalTime / itrationsTimes); } private: typedef std::pair Record; std::vector mProfile; virtual void reportLayerTime(const char* layerName, float ms) { auto record = std::find_if(mProfile.begin(), mProfile.end(), [&](const Record& r){ return r.first == layerName; }); if (record == mProfile.end()) mProfile.push_back(std::make_pair(layerName, ms)); else record->second += ms; } }; //Logger for TensorRT info/warning/errors class Logger : public nvinfer1::ILogger { public: Logger(): Logger(Severity::kWARNING) {} Logger(Severity severity): reportableSeverity(severity) {} void log(Severity severity, const char* msg) override { // suppress messages with severity enum value greater than the reportable if (severity > reportableSeverity) return; switch (severity) { case Severity::kINTERNAL_ERROR: std::cerr << "INTERNAL_ERROR: "; break; case Severity::kERROR: std::cerr << "ERROR: "; break; case Severity::kWARNING: std::cerr << "WARNING: "; break; case Severity::kINFO: std::cerr << "INFO: "; break; default: std::cerr << "UNKNOWN: "; break; } std::cerr << msg << std::endl; } Severity 
reportableSeverity{Severity::kWARNING}; }; template void write(char*& buffer, const T& val) { *reinterpret_cast(buffer) = val; buffer += sizeof(T); } template void read(const char*& buffer, T& val) { val = *reinterpret_cast(buffer); buffer += sizeof(T); } } #endif ================================================ FILE: yolov3-spp/gen_wts.py ================================================ import struct import sys import torch from models import * # noqa: F403 from utils.utils import * # noqa: F403 model = Darknet('cfg/yolov3-spp.cfg', (416, 416)) # noqa: F405 weights = sys.argv[1] dev = '0' device = torch_utils.select_device(dev) # noqa: F405 model.load_state_dict(torch.load(weights, map_location=device, weights_only=False)['model']) with open('yolov3-spp_ultralytics68.wts', 'w') as f: f.write('{}\n'.format(len(model.state_dict().keys()))) for k, v in model.state_dict().items(): vr = v.reshape(-1).cpu().numpy() f.write('{} {} '.format(k, len(vr))) for vv in vr: f.write(' ') f.write(struct.pack('>f', float(vv)).hex()) f.write('\n') ================================================ FILE: yolov3-spp/logging.h ================================================ /* * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
*/ #ifndef TENSORRT_LOGGING_H #define TENSORRT_LOGGING_H #include "NvInferRuntimeCommon.h" #include #include #include #include #include #include #include using Severity = nvinfer1::ILogger::Severity; class LogStreamConsumerBuffer : public std::stringbuf { public: LogStreamConsumerBuffer(std::ostream& stream, const std::string& prefix, bool shouldLog) : mOutput(stream) , mPrefix(prefix) , mShouldLog(shouldLog) { } LogStreamConsumerBuffer(LogStreamConsumerBuffer&& other) : mOutput(other.mOutput) { } ~LogStreamConsumerBuffer() { // std::streambuf::pbase() gives a pointer to the beginning of the buffered part of the output sequence // std::streambuf::pptr() gives a pointer to the current position of the output sequence // if the pointer to the beginning is not equal to the pointer to the current position, // call putOutput() to log the output to the stream if (pbase() != pptr()) { putOutput(); } } // synchronizes the stream buffer and returns 0 on success // synchronizing the stream buffer consists of inserting the buffer contents into the stream, // resetting the buffer and flushing the stream virtual int sync() { putOutput(); return 0; } void putOutput() { if (mShouldLog) { // prepend timestamp std::time_t timestamp = std::time(nullptr); tm* tm_local = std::localtime(×tamp); std::cout << "["; std::cout << std::setw(2) << std::setfill('0') << 1 + tm_local->tm_mon << "/"; std::cout << std::setw(2) << std::setfill('0') << tm_local->tm_mday << "/"; std::cout << std::setw(4) << std::setfill('0') << 1900 + tm_local->tm_year << "-"; std::cout << std::setw(2) << std::setfill('0') << tm_local->tm_hour << ":"; std::cout << std::setw(2) << std::setfill('0') << tm_local->tm_min << ":"; std::cout << std::setw(2) << std::setfill('0') << tm_local->tm_sec << "] "; // std::stringbuf::str() gets the string contents of the buffer // insert the buffer contents pre-appended by the appropriate prefix into the stream mOutput << mPrefix << str(); // set the buffer to empty str(""); // flush 
the stream mOutput.flush(); } } void setShouldLog(bool shouldLog) { mShouldLog = shouldLog; } private: std::ostream& mOutput; std::string mPrefix; bool mShouldLog; }; //! //! \class LogStreamConsumerBase //! \brief Convenience object used to initialize LogStreamConsumerBuffer before std::ostream in LogStreamConsumer //! class LogStreamConsumerBase { public: LogStreamConsumerBase(std::ostream& stream, const std::string& prefix, bool shouldLog) : mBuffer(stream, prefix, shouldLog) { } protected: LogStreamConsumerBuffer mBuffer; }; //! //! \class LogStreamConsumer //! \brief Convenience object used to facilitate use of C++ stream syntax when logging messages. //! Order of base classes is LogStreamConsumerBase and then std::ostream. //! This is because the LogStreamConsumerBase class is used to initialize the LogStreamConsumerBuffer member field //! in LogStreamConsumer and then the address of the buffer is passed to std::ostream. //! This is necessary to prevent the address of an uninitialized buffer from being passed to std::ostream. //! Please do not change the order of the parent classes. //! class LogStreamConsumer : protected LogStreamConsumerBase, public std::ostream { public: //! \brief Creates a LogStreamConsumer which logs messages with level severity. //! Reportable severity determines if the messages are severe enough to be logged. 
LogStreamConsumer(Severity reportableSeverity, Severity severity) : LogStreamConsumerBase(severityOstream(severity), severityPrefix(severity), severity <= reportableSeverity) , std::ostream(&mBuffer) // links the stream buffer with the stream , mShouldLog(severity <= reportableSeverity) , mSeverity(severity) { } LogStreamConsumer(LogStreamConsumer&& other) : LogStreamConsumerBase(severityOstream(other.mSeverity), severityPrefix(other.mSeverity), other.mShouldLog) , std::ostream(&mBuffer) // links the stream buffer with the stream , mShouldLog(other.mShouldLog) , mSeverity(other.mSeverity) { } void setReportableSeverity(Severity reportableSeverity) { mShouldLog = mSeverity <= reportableSeverity; mBuffer.setShouldLog(mShouldLog); } private: static std::ostream& severityOstream(Severity severity) { return severity >= Severity::kINFO ? std::cout : std::cerr; } static std::string severityPrefix(Severity severity) { switch (severity) { case Severity::kINTERNAL_ERROR: return "[F] "; case Severity::kERROR: return "[E] "; case Severity::kWARNING: return "[W] "; case Severity::kINFO: return "[I] "; case Severity::kVERBOSE: return "[V] "; default: assert(0); return ""; } } bool mShouldLog; Severity mSeverity; }; //! \class Logger //! //! \brief Class which manages logging of TensorRT tools and samples //! //! \details This class provides a common interface for TensorRT tools and samples to log information to the console, //! and supports logging two types of messages: //! //! - Debugging messages with an associated severity (info, warning, error, or internal error/fatal) //! - Test pass/fail messages //! //! The advantage of having all samples use this class for logging as opposed to emitting directly to stdout/stderr is //! that the logic for controlling the verbosity and formatting of sample output is centralized in one location. //! //! In the future, this class could be extended to support dumping test results to a file in some standard format //! 
(for example, JUnit XML), and providing additional metadata (e.g. timing the duration of a test run). //! //! TODO: For backwards compatibility with existing samples, this class inherits directly from the nvinfer1::ILogger //! interface, which is problematic since there isn't a clean separation between messages coming from the TensorRT //! library and messages coming from the sample. //! //! In the future (once all samples are updated to use Logger::getTRTLogger() to access the ILogger) we can refactor the //! class to eliminate the inheritance and instead make the nvinfer1::ILogger implementation a member of the Logger //! object. class Logger : public nvinfer1::ILogger { public: Logger(Severity severity = Severity::kWARNING) : mReportableSeverity(severity) { } //! //! \enum TestResult //! \brief Represents the state of a given test //! enum class TestResult { kRUNNING, //!< The test is running kPASSED, //!< The test passed kFAILED, //!< The test failed kWAIVED //!< The test was waived }; //! //! \brief Forward-compatible method for retrieving the nvinfer::ILogger associated with this Logger //! \return The nvinfer1::ILogger associated with this Logger //! //! TODO Once all samples are updated to use this method to register the logger with TensorRT, //! we can eliminate the inheritance of Logger from ILogger //! nvinfer1::ILogger& getTRTLogger() { return *this; } //! //! \brief Implementation of the nvinfer1::ILogger::log() virtual method //! //! Note samples should not be calling this function directly; it will eventually go away once we eliminate the //! inheritance from nvinfer1::ILogger //! void log(Severity severity, const char* msg) override { LogStreamConsumer(mReportableSeverity, severity) << "[TRT] " << std::string(msg) << std::endl; } //! //! \brief Method for controlling the verbosity of logging output //! //! \param severity The logger will only emit messages that have severity of this level or higher. //! 
void setReportableSeverity(Severity severity) { mReportableSeverity = severity; } //! //! \brief Opaque handle that holds logging information for a particular test //! //! This object is an opaque handle to information used by the Logger to print test results. //! The sample must call Logger::defineTest() in order to obtain a TestAtom that can be used //! with Logger::reportTest{Start,End}(). //! class TestAtom { public: TestAtom(TestAtom&&) = default; private: friend class Logger; TestAtom(bool started, const std::string& name, const std::string& cmdline) : mStarted(started) , mName(name) , mCmdline(cmdline) { } bool mStarted; std::string mName; std::string mCmdline; }; //! //! \brief Define a test for logging //! //! \param[in] name The name of the test. This should be a string starting with //! "TensorRT" and containing dot-separated strings containing //! the characters [A-Za-z0-9_]. //! For example, "TensorRT.sample_googlenet" //! \param[in] cmdline The command line used to reproduce the test // //! \return a TestAtom that can be used in Logger::reportTest{Start,End}(). //! static TestAtom defineTest(const std::string& name, const std::string& cmdline) { return TestAtom(false, name, cmdline); } //! //! \brief A convenience overloaded version of defineTest() that accepts an array of command-line arguments //! as input //! //! \param[in] name The name of the test //! \param[in] argc The number of command-line arguments //! \param[in] argv The array of command-line arguments (given as C strings) //! //! \return a TestAtom that can be used in Logger::reportTest{Start,End}(). static TestAtom defineTest(const std::string& name, int argc, char const* const* argv) { auto cmdline = genCmdlineString(argc, argv); return defineTest(name, cmdline); } //! //! \brief Report that a test has started. //! //! \pre reportTestStart() has not been called yet for the given testAtom //! //! \param[in] testAtom The handle to the test that has started //! 
static void reportTestStart(TestAtom& testAtom) { reportTestResult(testAtom, TestResult::kRUNNING); assert(!testAtom.mStarted); testAtom.mStarted = true; } //! //! \brief Report that a test has ended. //! //! \pre reportTestStart() has been called for the given testAtom //! //! \param[in] testAtom The handle to the test that has ended //! \param[in] result The result of the test. Should be one of TestResult::kPASSED, //! TestResult::kFAILED, TestResult::kWAIVED //! static void reportTestEnd(const TestAtom& testAtom, TestResult result) { assert(result != TestResult::kRUNNING); assert(testAtom.mStarted); reportTestResult(testAtom, result); } static int reportPass(const TestAtom& testAtom) { reportTestEnd(testAtom, TestResult::kPASSED); return EXIT_SUCCESS; } static int reportFail(const TestAtom& testAtom) { reportTestEnd(testAtom, TestResult::kFAILED); return EXIT_FAILURE; } static int reportWaive(const TestAtom& testAtom) { reportTestEnd(testAtom, TestResult::kWAIVED); return EXIT_SUCCESS; } static int reportTest(const TestAtom& testAtom, bool pass) { return pass ? reportPass(testAtom) : reportFail(testAtom); } Severity getReportableSeverity() const { return mReportableSeverity; } private: //! //! \brief returns an appropriate string for prefixing a log message with the given severity //! static const char* severityPrefix(Severity severity) { switch (severity) { case Severity::kINTERNAL_ERROR: return "[F] "; case Severity::kERROR: return "[E] "; case Severity::kWARNING: return "[W] "; case Severity::kINFO: return "[I] "; case Severity::kVERBOSE: return "[V] "; default: assert(0); return ""; } } //! //! \brief returns an appropriate string for prefixing a test result message with the given result //! 
static const char* testResultString(TestResult result) { switch (result) { case TestResult::kRUNNING: return "RUNNING"; case TestResult::kPASSED: return "PASSED"; case TestResult::kFAILED: return "FAILED"; case TestResult::kWAIVED: return "WAIVED"; default: assert(0); return ""; } } //! //! \brief returns an appropriate output stream (cout or cerr) to use with the given severity //! static std::ostream& severityOstream(Severity severity) { return severity >= Severity::kINFO ? std::cout : std::cerr; } //! //! \brief method that implements logging test results //! static void reportTestResult(const TestAtom& testAtom, TestResult result) { severityOstream(Severity::kINFO) << "&&&& " << testResultString(result) << " " << testAtom.mName << " # " << testAtom.mCmdline << std::endl; } //! //! \brief generate a command line string from the given (argc, argv) values //! static std::string genCmdlineString(int argc, char const* const* argv) { std::stringstream ss; for (int i = 0; i < argc; i++) { if (i > 0) ss << " "; ss << argv[i]; } return ss.str(); } Severity mReportableSeverity; }; namespace { //! //! \brief produces a LogStreamConsumer object that can be used to log messages of severity kVERBOSE //! //! Example usage: //! //! LOG_VERBOSE(logger) << "hello world" << std::endl; //! inline LogStreamConsumer LOG_VERBOSE(const Logger& logger) { return LogStreamConsumer(logger.getReportableSeverity(), Severity::kVERBOSE); } //! //! \brief produces a LogStreamConsumer object that can be used to log messages of severity kINFO //! //! Example usage: //! //! LOG_INFO(logger) << "hello world" << std::endl; //! inline LogStreamConsumer LOG_INFO(const Logger& logger) { return LogStreamConsumer(logger.getReportableSeverity(), Severity::kINFO); } //! //! \brief produces a LogStreamConsumer object that can be used to log messages of severity kWARNING //! //! Example usage: //! //! LOG_WARN(logger) << "hello world" << std::endl; //! 
inline LogStreamConsumer LOG_WARN(const Logger& logger) { return LogStreamConsumer(logger.getReportableSeverity(), Severity::kWARNING); } //! //! \brief produces a LogStreamConsumer object that can be used to log messages of severity kERROR //! //! Example usage: //! //! LOG_ERROR(logger) << "hello world" << std::endl; //! inline LogStreamConsumer LOG_ERROR(const Logger& logger) { return LogStreamConsumer(logger.getReportableSeverity(), Severity::kERROR); } //! //! \brief produces a LogStreamConsumer object that can be used to log messages of severity kINTERNAL_ERROR // ("fatal" severity) //! //! Example usage: //! //! LOG_FATAL(logger) << "hello world" << std::endl; //! inline LogStreamConsumer LOG_FATAL(const Logger& logger) { return LogStreamConsumer(logger.getReportableSeverity(), Severity::kINTERNAL_ERROR); } } // anonymous namespace #endif // TENSORRT_LOGGING_H ================================================ FILE: yolov3-spp/yololayer.cu ================================================ #include "yololayer.h" using namespace Yolo; namespace nvinfer1 { YoloLayerPlugin::YoloLayerPlugin() { mClassCount = CLASS_NUM; mYoloKernel.clear(); mYoloKernel.push_back(yolo1); mYoloKernel.push_back(yolo2); mYoloKernel.push_back(yolo3); mKernelCount = mYoloKernel.size(); CUDA_CHECK(cudaMallocHost(&mAnchor, mKernelCount * sizeof(void*))); size_t anchorLen = sizeof(float) * CHECK_COUNT * 2; for (int i = 0; i < mKernelCount; i++) { CUDA_CHECK(cudaMalloc(&mAnchor[i], anchorLen)); const auto& yolo = mYoloKernel[i]; CUDA_CHECK(cudaMemcpy(mAnchor[i], yolo.anchors, anchorLen, cudaMemcpyHostToDevice)); } } YoloLayerPlugin::~YoloLayerPlugin() { for (int i = 0; i < mKernelCount; i++) { CUDA_CHECK(cudaFree(mAnchor[i])); } CUDA_CHECK(cudaFreeHost(mAnchor)); } // create the plugin at runtime from a byte stream YoloLayerPlugin::YoloLayerPlugin(const void* data, size_t length) { using namespace Tn; const char *d = reinterpret_cast(data), *a = d; read(d, mClassCount); read(d, mThreadCount); 
read(d, mKernelCount); mYoloKernel.resize(mKernelCount); auto kernelSize = mKernelCount * sizeof(YoloKernel); memcpy(mYoloKernel.data(), d, kernelSize); d += kernelSize; assert(d == a + length); CUDA_CHECK(cudaMallocHost(&mAnchor, mKernelCount * sizeof(void*))); size_t anchorLen = sizeof(float) * CHECK_COUNT * 2; for (int i = 0; i < mKernelCount; i++) { CUDA_CHECK(cudaMalloc(&mAnchor[i], anchorLen)); const auto& yolo = mYoloKernel[i]; CUDA_CHECK(cudaMemcpy(mAnchor[i], yolo.anchors, anchorLen, cudaMemcpyHostToDevice)); } } void YoloLayerPlugin::serialize(void* buffer) const { using namespace Tn; char* d = static_cast(buffer), *a = d; write(d, mClassCount); write(d, mThreadCount); write(d, mKernelCount); auto kernelSize = mKernelCount * sizeof(YoloKernel); memcpy(d,mYoloKernel.data(), kernelSize); d += kernelSize; assert(d == a + getSerializationSize()); } size_t YoloLayerPlugin::getSerializationSize() const { return sizeof(mClassCount) + sizeof(mThreadCount) + sizeof(mKernelCount) + sizeof(Yolo::YoloKernel) * mYoloKernel.size(); } int YoloLayerPlugin::initialize() { return 0; } DimsExprs YoloLayerPlugin::getOutputDimensions(int outputIndex, const DimsExprs* inputs, int nbInputs, IExprBuilder& exprBuilder) { //output the result to channel int totalsize = MAX_OUTPUT_BBOX_COUNT * sizeof(Detection) / sizeof(float); DimsExprs de; de.nbDims = 2; de.d[0] = exprBuilder.constant(inputs[0].d[0]->getConstantValue()); // batchsize de.d[1] = exprBuilder.constant(totalsize + 1); // outputsize return de; } // Set plugin namespace void YoloLayerPlugin::setPluginNamespace(const char* pluginNamespace) { mPluginNamespace = pluginNamespace; } const char* YoloLayerPlugin::getPluginNamespace() const { return mPluginNamespace; } // Return the DataType of the plugin output at the requested index DataType YoloLayerPlugin::getOutputDataType(int index, const nvinfer1::DataType* inputTypes, int nbInputs) const { return DataType::kFLOAT; } void YoloLayerPlugin::configurePlugin(const 
DynamicPluginTensorDesc* in, int nbInputs, const DynamicPluginTensorDesc* out, int nbOutputs) { } // Attach the plugin object to an execution context and grant the plugin the access to some context resource. void YoloLayerPlugin::attachToContext(cudnnContext* cudnnContext, cublasContext* cublasContext, IGpuAllocator* gpuAllocator) { } // Detach the plugin object from its execution context. void YoloLayerPlugin::detachFromContext() {} const char* YoloLayerPlugin::getPluginType() const { return "YoloLayer_TRT"; } const char* YoloLayerPlugin::getPluginVersion() const { return "1"; } void YoloLayerPlugin::destroy() { delete this; } // Clone the plugin IPluginV2DynamicExt* YoloLayerPlugin::clone() const { YoloLayerPlugin *p = new YoloLayerPlugin(); p->setPluginNamespace(mPluginNamespace); return p; } __device__ float Logist(float data){ return 1.0f / (1.0f + expf(-data)); }; __global__ void CalDetection(const float *input, float *output, int noElements, int yoloWidth, int yoloHeight, int yoloStride, const float anchors[CHECK_COUNT * 2], int classes, int outputElem) { int idx = threadIdx.x + blockDim.x * blockIdx.x; if (idx >= noElements) return; int total_grid = yoloWidth * yoloHeight; int bnIdx = idx / total_grid; idx = idx - total_grid*bnIdx; int info_len_i = 5 + classes; const float* curInput = input + bnIdx * (info_len_i * total_grid * CHECK_COUNT); for (int k = 0; k < 3; ++k) { int class_id = 0; float max_cls_prob = 0.0; for (int i = 5; i < info_len_i; ++i) { float p = Logist(curInput[idx + k * info_len_i * total_grid + i * total_grid]); if (p > max_cls_prob) { max_cls_prob = p; class_id = i - 5; } } float box_prob = Logist(curInput[idx + k * info_len_i * total_grid + 4 * total_grid]); if (max_cls_prob < IGNORE_THRESH || box_prob < IGNORE_THRESH) continue; float *res_count = output + bnIdx * outputElem; int count = (int)atomicAdd(res_count, 1); if (count >= MAX_OUTPUT_BBOX_COUNT) return; char* data = (char*)res_count + sizeof(float) + count * sizeof(Detection); 
Detection* det = (Detection*)(data); int row = idx / yoloWidth; int col = idx % yoloWidth; //Location det->bbox[0] = (col + Logist(curInput[idx + k * info_len_i * total_grid + 0 * total_grid])) * yoloStride; det->bbox[1] = (row + Logist(curInput[idx + k * info_len_i * total_grid + 1 * total_grid])) * yoloStride; det->bbox[2] = expf(curInput[idx + k * info_len_i * total_grid + 2 * total_grid]) * anchors[2 * k]; det->bbox[3] = expf(curInput[idx + k * info_len_i * total_grid + 3 * total_grid]) * anchors[2 * k + 1]; det->det_confidence = box_prob; det->class_id = class_id; det->class_confidence = max_cls_prob; } } void YoloLayerPlugin::forwardGpu(const float *const * inputs, float* output, cudaStream_t stream, int batchSize) { int outputElem = 1 + MAX_OUTPUT_BBOX_COUNT * sizeof(Detection) / sizeof(float); for(int idx = 0 ; idx < batchSize; ++idx) { CUDA_CHECK(cudaMemset(output + idx * outputElem, 0, sizeof(float))); } int numElem = 0; for (size_t i = 0; i < mYoloKernel.size(); ++i) { const auto& yolo = mYoloKernel[i]; numElem = yolo.width * yolo.height * batchSize; CalDetection<<<(yolo.width * yolo.height * batchSize + mThreadCount - 1) / mThreadCount, mThreadCount>>> (inputs[i], output, numElem, yolo.width, yolo.height, yolo.stride, (float*)mAnchor[i], mClassCount, outputElem); } } int YoloLayerPlugin::enqueue(const PluginTensorDesc* inputDesc, const PluginTensorDesc* outputDesc, const void* const* inputs, void* const* outputs, void* workspace, cudaStream_t stream) { int batchSize = inputDesc[0].dims.d[0]; for (size_t i = 0; i < mYoloKernel.size(); ++i) { mYoloKernel[i].width = inputDesc[i].dims.d[3]; mYoloKernel[i].height = inputDesc[i].dims.d[2]; } forwardGpu((const float *const *)inputs, (float*)outputs[0], stream, batchSize); return 0; } PluginFieldCollection YoloPluginCreator::mFC{}; std::vector YoloPluginCreator::mPluginAttributes; YoloPluginCreator::YoloPluginCreator() { mPluginAttributes.clear(); mFC.nbFields = mPluginAttributes.size(); mFC.fields = 
mPluginAttributes.data(); } const char* YoloPluginCreator::getPluginName() const { return "YoloLayer_TRT"; } const char* YoloPluginCreator::getPluginVersion() const { return "1"; } const PluginFieldCollection* YoloPluginCreator::getFieldNames() { return &mFC; } IPluginV2DynamicExt* YoloPluginCreator::createPlugin(const char* name, const PluginFieldCollection* fc) { YoloLayerPlugin* obj = new YoloLayerPlugin(); obj->setPluginNamespace(mNamespace.c_str()); return obj; } IPluginV2DynamicExt* YoloPluginCreator::deserializePlugin(const char* name, const void* serialData, size_t serialLength) { // This object will be deleted when the network is destroyed, which will // call YoloLayerPlugin::destroy() YoloLayerPlugin* obj = new YoloLayerPlugin(serialData, serialLength); obj->setPluginNamespace(mNamespace.c_str()); return obj; } } ================================================ FILE: yolov3-spp/yololayer.h ================================================ #ifndef _YOLO_LAYER_H #define _YOLO_LAYER_H #include #include #include #include #include "NvInfer.h" #include "Utils.h" #include namespace Yolo { static constexpr int CHECK_COUNT = 3; static constexpr float IGNORE_THRESH = 0.1f; static constexpr int MAX_OUTPUT_BBOX_COUNT = 1000; static constexpr int CLASS_NUM = 80; struct YoloKernel { int width; int height; int stride; float anchors[CHECK_COUNT*2]; }; static constexpr YoloKernel yolo1 = { -1, // dynamic width and height -1, 32, {116,90, 156,198, 373,326} }; static constexpr YoloKernel yolo2 = { -1, -1, 16, {30,61, 62,45, 59,119} }; static constexpr YoloKernel yolo3 = { -1, -1, 8, {10,13, 16,30, 33,23} }; static constexpr int LOCATIONS = 4; struct alignas(float) Detection{ //x y w h float bbox[LOCATIONS]; float det_confidence; float class_id; float class_confidence; }; } namespace nvinfer1 { class YoloLayerPlugin: public IPluginV2DynamicExt { public: explicit YoloLayerPlugin(); YoloLayerPlugin(const void* data, size_t length); ~YoloLayerPlugin(); int getNbOutputs() const 
override { return 1; } //virtual Dims getOutputDimensions(int index, const Dims* inputs, int nbInputDims) final; virtual DimsExprs getOutputDimensions(int outputIndex, const DimsExprs* inputs, int nbInputs, IExprBuilder& exprBuilder) override; int initialize() override; virtual void terminate() override {}; //virtual size_t getWorkspaceSize(int maxBatchSize) const override { return 0;} size_t getWorkspaceSize(const PluginTensorDesc* inputs, int nbInputs, const PluginTensorDesc* outputs, int nbOutputs) const override { return 0; } //virtual int enqueue(int batchSize, const void*const * inputs, void** outputs, void* workspace, cudaStream_t stream) override; int enqueue(const PluginTensorDesc* inputDesc, const PluginTensorDesc* outputDesc, const void* const* inputs, void* const* outputs, void* workspace, cudaStream_t stream) override; virtual size_t getSerializationSize() const override; virtual void serialize(void* buffer) const override; bool supportsFormatCombination(int pos, const PluginTensorDesc* inOut, int nbInputs, int nbOutputs) override { return inOut[pos].format == TensorFormat::kLINEAR && inOut[pos].type == DataType::kFLOAT; } const char* getPluginType() const override; const char* getPluginVersion() const override; void destroy() override; IPluginV2DynamicExt* clone() const override; void setPluginNamespace(const char* pluginNamespace) override; const char* getPluginNamespace() const override; DataType getOutputDataType(int index, const nvinfer1::DataType* inputTypes, int nbInputs) const override; void attachToContext( cudnnContext* cudnnContext, cublasContext* cublasContext, IGpuAllocator* gpuAllocator) override; void configurePlugin(const DynamicPluginTensorDesc* in, int nbInputs, const DynamicPluginTensorDesc* out, int nbOutputs) override; void detachFromContext() override; private: void forwardGpu(const float *const * inputs,float * output, cudaStream_t stream,int batchSize = 1); int mClassCount; int mKernelCount; std::vector mYoloKernel; int 
mThreadCount = 256; void** mAnchor; const char* mPluginNamespace; }; class YoloPluginCreator : public IPluginCreator { public: YoloPluginCreator(); ~YoloPluginCreator() override = default; const char* getPluginName() const override; const char* getPluginVersion() const override; const PluginFieldCollection* getFieldNames() override; IPluginV2DynamicExt* createPlugin(const char* name, const PluginFieldCollection* fc) override; IPluginV2DynamicExt* deserializePlugin(const char* name, const void* serialData, size_t serialLength) override; void setPluginNamespace(const char* libNamespace) override { mNamespace = libNamespace; } const char* getPluginNamespace() const override { return mNamespace.c_str(); } private: std::string mNamespace; static PluginFieldCollection mFC; static std::vector mPluginAttributes; }; REGISTER_TENSORRT_PLUGIN(YoloPluginCreator); }; #endif ================================================ FILE: yolov3-spp/yolov3-spp.cpp ================================================ #include #include #include #include #include #include #include #include #include #include "NvInfer.h" #include "cuda_runtime_api.h" #include "logging.h" #include "yololayer.h" #define CHECK(status) \ do\ {\ auto ret = (status);\ if (ret != 0)\ {\ std::cerr << "Cuda failure: " << ret << std::endl;\ abort();\ }\ } while (0) #define USE_FP16 // comment out this if want to use FP32 #define DEVICE 0 // GPU id #define NMS_THRESH 0.4 #define BBOX_CONF_THRESH 0.5 using namespace nvinfer1; // stuff we know about the network and the input/output blobs static const int MAX_INPUT_SIZE = 608; static const int MIN_INPUT_SIZE = 128; static const int OPT_INPUT_W = 608; static const int OPT_INPUT_H = 608; static const int DET_LEN = sizeof(Yolo::Detection) / sizeof(float); static const int OUTPUT_SIZE = Yolo::MAX_OUTPUT_BBOX_COUNT * DET_LEN + 1; // we limit the yololayer to output no more than MAX_OUTPUT_BBOX_COUNT bboxes const char* INPUT_BLOB_NAME = "data"; const char* OUTPUT_BLOB_NAME = "prob"; 
static Logger gLogger; cv::Mat letterbox(cv::Mat& img) { float r = std::min(MAX_INPUT_SIZE / (img.cols*1.0), MAX_INPUT_SIZE / (img.rows*1.0)); r = std::min(r, 1.0f); int unpad_w = r * img.cols; int unpad_h = r * img.rows; int dw = (MAX_INPUT_SIZE - unpad_w) % 32; int dh = (MAX_INPUT_SIZE - unpad_h) % 32; cv::Mat re(unpad_h, unpad_w, CV_8UC3); cv::resize(img, re, re.size()); cv::Mat out(unpad_h + dh, unpad_w + dw, CV_8UC3, cv::Scalar(128, 128, 128)); re.copyTo(out(cv::Rect(dw / 2, dh / 2, re.cols, re.rows))); return out; } cv::Rect get_rect(cv::Size src_shape, cv::Size pre_shape, float bbox[4]) { float ra = std::min(MAX_INPUT_SIZE / (src_shape.width * 1.0), MAX_INPUT_SIZE / (src_shape.height * 1.0)); ra = std::min(ra, 1.0f); int unpad_w = ra * src_shape.width; int unpad_h = ra * src_shape.height; int dw = (MAX_INPUT_SIZE - unpad_w) % 32; int dh = (MAX_INPUT_SIZE - unpad_h) % 32; int l = bbox[0] - bbox[2]/2.f - dw / 2; int r = bbox[0] + bbox[2]/2.f - dw / 2; int t = bbox[1] - bbox[3]/2.f - dh / 2; int b = bbox[1] + bbox[3]/2.f - dh / 2; l /= ra; r /= ra; t /= ra; b /= ra; return cv::Rect(l, t, r-l, b-t); } float iou(float lbox[4], float rbox[4]) { float interBox[] = { std::max(lbox[0] - lbox[2]/2.f , rbox[0] - rbox[2]/2.f), //left std::min(lbox[0] + lbox[2]/2.f , rbox[0] + rbox[2]/2.f), //right std::max(lbox[1] - lbox[3]/2.f , rbox[1] - rbox[3]/2.f), //top std::min(lbox[1] + lbox[3]/2.f , rbox[1] + rbox[3]/2.f), //bottom }; if(interBox[2] > interBox[3] || interBox[0] > interBox[1]) return 0.0f; float interBoxS =(interBox[1]-interBox[0])*(interBox[3]-interBox[2]); return interBoxS/(lbox[2]*lbox[3] + rbox[2]*rbox[3] -interBoxS); } bool cmp(const Yolo::Detection& a, const Yolo::Detection& b) { return a.det_confidence > b.det_confidence; } void nms(std::vector& res, float *output, float nms_thresh = NMS_THRESH) { std::map> m; for (int i = 0; i < output[0] && i < Yolo::MAX_OUTPUT_BBOX_COUNT; i++) { if (output[1 + DET_LEN * i + 4] <= BBOX_CONF_THRESH) continue; 
Yolo::Detection det; memcpy(&det, &output[1 + DET_LEN * i], DET_LEN * sizeof(float)); if (m.count(det.class_id) == 0) m.emplace(det.class_id, std::vector()); m[det.class_id].push_back(det); } for (auto it = m.begin(); it != m.end(); it++) { //std::cout << it->second[0].class_id << " --- " << std::endl; auto& dets = it->second; std::sort(dets.begin(), dets.end(), cmp); for (size_t m = 0; m < dets.size(); ++m) { auto& item = dets[m]; res.push_back(item); for (size_t n = m + 1; n < dets.size(); ++n) { if (iou(item.bbox, dets[n].bbox) > nms_thresh) { dets.erase(dets.begin()+n); --n; } } } } } // TensorRT weight files have a simple space delimited format: // [type] [size] std::map loadWeights(const std::string file) { std::cout << "Loading weights: " << file << std::endl; std::map weightMap; // Open weights file std::ifstream input(file); assert(input.is_open() && "Unable to load weight file."); // Read number of weight blobs int32_t count; input >> count; assert(count > 0 && "Invalid weight map file."); while (count--) { Weights wt{DataType::kFLOAT, nullptr, 0}; uint32_t size; // Read name and type of blob std::string name; input >> name >> std::dec >> size; wt.type = DataType::kFLOAT; // Load blob uint32_t* val = reinterpret_cast(malloc(sizeof(val) * size)); for (uint32_t x = 0, y = size; x < y; ++x) { input >> std::hex >> val[x]; } wt.values = val; wt.count = size; weightMap[name] = wt; } return weightMap; } IScaleLayer* addBatchNorm2d(INetworkDefinition *network, std::map& weightMap, ITensor& input, std::string lname, float eps) { float *gamma = (float*)weightMap[lname + ".weight"].values; float *beta = (float*)weightMap[lname + ".bias"].values; float *mean = (float*)weightMap[lname + ".running_mean"].values; float *var = (float*)weightMap[lname + ".running_var"].values; int len = weightMap[lname + ".running_var"].count; float *scval = reinterpret_cast(malloc(sizeof(float) * len)); for (int i = 0; i < len; i++) { scval[i] = gamma[i] / sqrt(var[i] + eps); } Weights 
scale{DataType::kFLOAT, scval, len}; float *shval = reinterpret_cast(malloc(sizeof(float) * len)); for (int i = 0; i < len; i++) { shval[i] = beta[i] - mean[i] * gamma[i] / sqrt(var[i] + eps); } Weights shift{DataType::kFLOAT, shval, len}; float *pval = reinterpret_cast(malloc(sizeof(float) * len)); for (int i = 0; i < len; i++) { pval[i] = 1.0; } Weights power{DataType::kFLOAT, pval, len}; weightMap[lname + ".scale"] = scale; weightMap[lname + ".shift"] = shift; weightMap[lname + ".power"] = power; IScaleLayer* scale_1 = network->addScale(input, ScaleMode::kCHANNEL, shift, scale, power); assert(scale_1); return scale_1; } ILayer* convBnLeaky(INetworkDefinition *network, std::map& weightMap, ITensor& input, int outch, int ksize, int s, int p, int linx) { Weights emptywts{DataType::kFLOAT, nullptr, 0}; IConvolutionLayer* conv1 = network->addConvolutionNd(input, outch, DimsHW{ksize, ksize}, weightMap["module_list." + std::to_string(linx) + ".Conv2d.weight"], emptywts); assert(conv1); conv1->setStrideNd(DimsHW{s, s}); conv1->setPaddingNd(DimsHW{p, p}); IScaleLayer* bn1 = addBatchNorm2d(network, weightMap, *conv1->getOutput(0), "module_list." + std::to_string(linx) + ".BatchNorm2d", 1e-5); auto lr = network->addActivation(*bn1->getOutput(0), ActivationType::kLEAKY_RELU); lr->setAlpha(0.1); return lr; } // Creat the engine using only the API and not any parser. ICudaEngine* createEngine(unsigned int maxBatchSize, IBuilder* builder, IBuilderConfig* config, DataType dt) { const auto explicitBatch = 1U << static_cast(NetworkDefinitionCreationFlag::kEXPLICIT_BATCH); auto network = builder->createNetworkV2(explicitBatch); ITensor* data = network->addInput(INPUT_BLOB_NAME, dt, Dims4{1, 3, -1, -1}); assert(data); std::map weightMap = loadWeights("../yolov3-spp_ultralytics68.wts"); Weights emptywts{DataType::kFLOAT, nullptr, 0}; // Yeah I am stupid, I just want to expand the complete arch of darknet.. 
auto lr0 = convBnLeaky(network, weightMap, *data, 32, 3, 1, 1, 0); auto lr1 = convBnLeaky(network, weightMap, *lr0->getOutput(0), 64, 3, 2, 1, 1); auto lr2 = convBnLeaky(network, weightMap, *lr1->getOutput(0), 32, 1, 1, 0, 2); auto lr3 = convBnLeaky(network, weightMap, *lr2->getOutput(0), 64, 3, 1, 1, 3); auto ew4 = network->addElementWise(*lr3->getOutput(0), *lr1->getOutput(0), ElementWiseOperation::kSUM); auto lr5 = convBnLeaky(network, weightMap, *ew4->getOutput(0), 128, 3, 2, 1, 5); auto lr6 = convBnLeaky(network, weightMap, *lr5->getOutput(0), 64, 1, 1, 0, 6); auto lr7 = convBnLeaky(network, weightMap, *lr6->getOutput(0), 128, 3, 1, 1, 7); auto ew8 = network->addElementWise(*lr7->getOutput(0), *lr5->getOutput(0), ElementWiseOperation::kSUM); auto lr9 = convBnLeaky(network, weightMap, *ew8->getOutput(0), 64, 1, 1, 0, 9); auto lr10 = convBnLeaky(network, weightMap, *lr9->getOutput(0), 128, 3, 1, 1, 10); auto ew11 = network->addElementWise(*lr10->getOutput(0), *ew8->getOutput(0), ElementWiseOperation::kSUM); auto lr12 = convBnLeaky(network, weightMap, *ew11->getOutput(0), 256, 3, 2, 1, 12); auto lr13 = convBnLeaky(network, weightMap, *lr12->getOutput(0), 128, 1, 1, 0, 13); auto lr14 = convBnLeaky(network, weightMap, *lr13->getOutput(0), 256, 3, 1, 1, 14); auto ew15 = network->addElementWise(*lr14->getOutput(0), *lr12->getOutput(0), ElementWiseOperation::kSUM); auto lr16 = convBnLeaky(network, weightMap, *ew15->getOutput(0), 128, 1, 1, 0, 16); auto lr17 = convBnLeaky(network, weightMap, *lr16->getOutput(0), 256, 3, 1, 1, 17); auto ew18 = network->addElementWise(*lr17->getOutput(0), *ew15->getOutput(0), ElementWiseOperation::kSUM); auto lr19 = convBnLeaky(network, weightMap, *ew18->getOutput(0), 128, 1, 1, 0, 19); auto lr20 = convBnLeaky(network, weightMap, *lr19->getOutput(0), 256, 3, 1, 1, 20); auto ew21 = network->addElementWise(*lr20->getOutput(0), *ew18->getOutput(0), ElementWiseOperation::kSUM); auto lr22 = convBnLeaky(network, weightMap, *ew21->getOutput(0), 
128, 1, 1, 0, 22); auto lr23 = convBnLeaky(network, weightMap, *lr22->getOutput(0), 256, 3, 1, 1, 23); auto ew24 = network->addElementWise(*lr23->getOutput(0), *ew21->getOutput(0), ElementWiseOperation::kSUM); auto lr25 = convBnLeaky(network, weightMap, *ew24->getOutput(0), 128, 1, 1, 0, 25); auto lr26 = convBnLeaky(network, weightMap, *lr25->getOutput(0), 256, 3, 1, 1, 26); auto ew27 = network->addElementWise(*lr26->getOutput(0), *ew24->getOutput(0), ElementWiseOperation::kSUM); auto lr28 = convBnLeaky(network, weightMap, *ew27->getOutput(0), 128, 1, 1, 0, 28); auto lr29 = convBnLeaky(network, weightMap, *lr28->getOutput(0), 256, 3, 1, 1, 29); auto ew30 = network->addElementWise(*lr29->getOutput(0), *ew27->getOutput(0), ElementWiseOperation::kSUM); auto lr31 = convBnLeaky(network, weightMap, *ew30->getOutput(0), 128, 1, 1, 0, 31); auto lr32 = convBnLeaky(network, weightMap, *lr31->getOutput(0), 256, 3, 1, 1, 32); auto ew33 = network->addElementWise(*lr32->getOutput(0), *ew30->getOutput(0), ElementWiseOperation::kSUM); auto lr34 = convBnLeaky(network, weightMap, *ew33->getOutput(0), 128, 1, 1, 0, 34); auto lr35 = convBnLeaky(network, weightMap, *lr34->getOutput(0), 256, 3, 1, 1, 35); auto ew36 = network->addElementWise(*lr35->getOutput(0), *ew33->getOutput(0), ElementWiseOperation::kSUM); auto lr37 = convBnLeaky(network, weightMap, *ew36->getOutput(0), 512, 3, 2, 1, 37); auto lr38 = convBnLeaky(network, weightMap, *lr37->getOutput(0), 256, 1, 1, 0, 38); auto lr39 = convBnLeaky(network, weightMap, *lr38->getOutput(0), 512, 3, 1, 1, 39); auto ew40 = network->addElementWise(*lr39->getOutput(0), *lr37->getOutput(0), ElementWiseOperation::kSUM); auto lr41 = convBnLeaky(network, weightMap, *ew40->getOutput(0), 256, 1, 1, 0, 41); auto lr42 = convBnLeaky(network, weightMap, *lr41->getOutput(0), 512, 3, 1, 1, 42); auto ew43 = network->addElementWise(*lr42->getOutput(0), *ew40->getOutput(0), ElementWiseOperation::kSUM); auto lr44 = convBnLeaky(network, weightMap, 
*ew43->getOutput(0), 256, 1, 1, 0, 44); auto lr45 = convBnLeaky(network, weightMap, *lr44->getOutput(0), 512, 3, 1, 1, 45); auto ew46 = network->addElementWise(*lr45->getOutput(0), *ew43->getOutput(0), ElementWiseOperation::kSUM); auto lr47 = convBnLeaky(network, weightMap, *ew46->getOutput(0), 256, 1, 1, 0, 47); auto lr48 = convBnLeaky(network, weightMap, *lr47->getOutput(0), 512, 3, 1, 1, 48); auto ew49 = network->addElementWise(*lr48->getOutput(0), *ew46->getOutput(0), ElementWiseOperation::kSUM); auto lr50 = convBnLeaky(network, weightMap, *ew49->getOutput(0), 256, 1, 1, 0, 50); auto lr51 = convBnLeaky(network, weightMap, *lr50->getOutput(0), 512, 3, 1, 1, 51); auto ew52 = network->addElementWise(*lr51->getOutput(0), *ew49->getOutput(0), ElementWiseOperation::kSUM); auto lr53 = convBnLeaky(network, weightMap, *ew52->getOutput(0), 256, 1, 1, 0, 53); auto lr54 = convBnLeaky(network, weightMap, *lr53->getOutput(0), 512, 3, 1, 1, 54); auto ew55 = network->addElementWise(*lr54->getOutput(0), *ew52->getOutput(0), ElementWiseOperation::kSUM); auto lr56 = convBnLeaky(network, weightMap, *ew55->getOutput(0), 256, 1, 1, 0, 56); auto lr57 = convBnLeaky(network, weightMap, *lr56->getOutput(0), 512, 3, 1, 1, 57); auto ew58 = network->addElementWise(*lr57->getOutput(0), *ew55->getOutput(0), ElementWiseOperation::kSUM); auto lr59 = convBnLeaky(network, weightMap, *ew58->getOutput(0), 256, 1, 1, 0, 59); auto lr60 = convBnLeaky(network, weightMap, *lr59->getOutput(0), 512, 3, 1, 1, 60); auto ew61 = network->addElementWise(*lr60->getOutput(0), *ew58->getOutput(0), ElementWiseOperation::kSUM); auto lr62 = convBnLeaky(network, weightMap, *ew61->getOutput(0), 1024, 3, 2, 1, 62); auto lr63 = convBnLeaky(network, weightMap, *lr62->getOutput(0), 512, 1, 1, 0, 63); auto lr64 = convBnLeaky(network, weightMap, *lr63->getOutput(0), 1024, 3, 1, 1, 64); auto ew65 = network->addElementWise(*lr64->getOutput(0), *lr62->getOutput(0), ElementWiseOperation::kSUM); auto lr66 = convBnLeaky(network, 
weightMap, *ew65->getOutput(0), 512, 1, 1, 0, 66); auto lr67 = convBnLeaky(network, weightMap, *lr66->getOutput(0), 1024, 3, 1, 1, 67); auto ew68 = network->addElementWise(*lr67->getOutput(0), *ew65->getOutput(0), ElementWiseOperation::kSUM); auto lr69 = convBnLeaky(network, weightMap, *ew68->getOutput(0), 512, 1, 1, 0, 69); auto lr70 = convBnLeaky(network, weightMap, *lr69->getOutput(0), 1024, 3, 1, 1, 70); auto ew71 = network->addElementWise(*lr70->getOutput(0), *ew68->getOutput(0), ElementWiseOperation::kSUM); auto lr72 = convBnLeaky(network, weightMap, *ew71->getOutput(0), 512, 1, 1, 0, 72); auto lr73 = convBnLeaky(network, weightMap, *lr72->getOutput(0), 1024, 3, 1, 1, 73); auto ew74 = network->addElementWise(*lr73->getOutput(0), *ew71->getOutput(0), ElementWiseOperation::kSUM); auto lr75 = convBnLeaky(network, weightMap, *ew74->getOutput(0), 512, 1, 1, 0, 75); auto lr76 = convBnLeaky(network, weightMap, *lr75->getOutput(0), 1024, 3, 1, 1, 76); auto lr77 = convBnLeaky(network, weightMap, *lr76->getOutput(0), 512, 1, 1, 0, 77); auto pool78 = network->addPoolingNd(*lr77->getOutput(0), PoolingType::kMAX, DimsHW{5,5}); pool78->setPaddingNd(DimsHW{2, 2}); pool78->setStrideNd(DimsHW{1, 1}); auto pool80 = network->addPoolingNd(*lr77->getOutput(0), PoolingType::kMAX, DimsHW{9,9}); pool80->setPaddingNd(DimsHW{4, 4}); pool80->setStrideNd(DimsHW{1, 1}); auto pool82 = network->addPoolingNd(*lr77->getOutput(0), PoolingType::kMAX, DimsHW{13,13}); pool82->setPaddingNd(DimsHW{6, 6}); pool82->setStrideNd(DimsHW{1, 1}); ITensor* inputTensors83[] = {pool82->getOutput(0), pool80->getOutput(0), pool78->getOutput(0), lr77->getOutput(0)}; auto cat83 = network->addConcatenation(inputTensors83, 4); auto lr84 = convBnLeaky(network, weightMap, *cat83->getOutput(0), 512, 1, 1, 0, 84); auto lr85 = convBnLeaky(network, weightMap, *lr84->getOutput(0), 1024, 3, 1, 1, 85); auto lr86 = convBnLeaky(network, weightMap, *lr85->getOutput(0), 512, 1, 1, 0, 86); auto lr87 = convBnLeaky(network, 
weightMap, *lr86->getOutput(0), 1024, 3, 1, 1, 87); IConvolutionLayer* conv88 = network->addConvolutionNd(*lr87->getOutput(0), 3 * (Yolo::CLASS_NUM + 5), DimsHW{1, 1}, weightMap["module_list.88.Conv2d.weight"], weightMap["module_list.88.Conv2d.bias"]); assert(conv88); auto lr91 = convBnLeaky(network, weightMap, *lr86->getOutput(0), 256, 1, 1, 0, 91); float *deval = reinterpret_cast(malloc(sizeof(float) * 256 * 2 * 2)); for (int i = 0; i < 256 * 2 * 2; i++) { deval[i] = 1.0; } Weights deconvwts92{DataType::kFLOAT, deval, 256 * 2 * 2}; IDeconvolutionLayer* deconv92 = network->addDeconvolutionNd(*lr91->getOutput(0), 256, DimsHW{2, 2}, deconvwts92, emptywts); assert(deconv92); deconv92->setStrideNd(DimsHW{2, 2}); deconv92->setNbGroups(256); weightMap["deconv92"] = deconvwts92; ITensor* inputTensors[] = {deconv92->getOutput(0), ew61->getOutput(0)}; auto cat93 = network->addConcatenation(inputTensors, 2); auto lr94 = convBnLeaky(network, weightMap, *cat93->getOutput(0), 256, 1, 1, 0, 94); auto lr95 = convBnLeaky(network, weightMap, *lr94->getOutput(0), 512, 3, 1, 1, 95); auto lr96 = convBnLeaky(network, weightMap, *lr95->getOutput(0), 256, 1, 1, 0, 96); auto lr97 = convBnLeaky(network, weightMap, *lr96->getOutput(0), 512, 3, 1, 1, 97); auto lr98 = convBnLeaky(network, weightMap, *lr97->getOutput(0), 256, 1, 1, 0, 98); auto lr99 = convBnLeaky(network, weightMap, *lr98->getOutput(0), 512, 3, 1, 1, 99); IConvolutionLayer* conv100 = network->addConvolutionNd(*lr99->getOutput(0), 3 * (Yolo::CLASS_NUM + 5), DimsHW{1, 1}, weightMap["module_list.100.Conv2d.weight"], weightMap["module_list.100.Conv2d.bias"]); assert(conv100); auto lr103 = convBnLeaky(network, weightMap, *lr98->getOutput(0), 128, 1, 1, 0, 103); Weights deconvwts104{DataType::kFLOAT, deval, 128 * 2 * 2}; IDeconvolutionLayer* deconv104 = network->addDeconvolutionNd(*lr103->getOutput(0), 128, DimsHW{2, 2}, deconvwts104, emptywts); assert(deconv104); deconv104->setStrideNd(DimsHW{2, 2}); deconv104->setNbGroups(128); 
ITensor* inputTensors1[] = {deconv104->getOutput(0), ew36->getOutput(0)}; auto cat105 = network->addConcatenation(inputTensors1, 2); auto lr106 = convBnLeaky(network, weightMap, *cat105->getOutput(0), 128, 1, 1, 0, 106); auto lr107 = convBnLeaky(network, weightMap, *lr106->getOutput(0), 256, 3, 1, 1, 107); auto lr108 = convBnLeaky(network, weightMap, *lr107->getOutput(0), 128, 1, 1, 0, 108); auto lr109 = convBnLeaky(network, weightMap, *lr108->getOutput(0), 256, 3, 1, 1, 109); auto lr110 = convBnLeaky(network, weightMap, *lr109->getOutput(0), 128, 1, 1, 0, 110); auto lr111 = convBnLeaky(network, weightMap, *lr110->getOutput(0), 256, 3, 1, 1, 111); IConvolutionLayer* conv112 = network->addConvolutionNd(*lr111->getOutput(0), 3 * (Yolo::CLASS_NUM + 5), DimsHW{1, 1}, weightMap["module_list.112.Conv2d.weight"], weightMap["module_list.112.Conv2d.bias"]); assert(conv112); auto creator = getPluginRegistry()->getPluginCreator("YoloLayer_TRT", "1"); const PluginFieldCollection* pluginData = creator->getFieldNames(); IPluginV2 *pluginObj = creator->createPlugin("yololayer", pluginData); ITensor* inputTensors_yolo[] = {conv88->getOutput(0), conv100->getOutput(0), conv112->getOutput(0)}; auto yolo = network->addPluginV2(inputTensors_yolo, 3, *pluginObj); auto dim = yolo->getOutput(0)->getDimensions(); std::cout << "yololayer output shape: "; for (int i = 0; i < dim.nbDims; i++) { std::cout << dim.d[i] << " "; } std::cout << std::endl; yolo->getOutput(0)->setName(OUTPUT_BLOB_NAME); network->markOutput(*yolo->getOutput(0)); IOptimizationProfile* profile = builder->createOptimizationProfile(); profile->setDimensions(INPUT_BLOB_NAME, OptProfileSelector::kMIN, Dims4(1, 3, MIN_INPUT_SIZE, MIN_INPUT_SIZE)); profile->setDimensions(INPUT_BLOB_NAME, OptProfileSelector::kOPT, Dims4(1, 3, OPT_INPUT_H, OPT_INPUT_W)); profile->setDimensions(INPUT_BLOB_NAME, OptProfileSelector::kMAX, Dims4(1, 3, MAX_INPUT_SIZE, MAX_INPUT_SIZE)); config->addOptimizationProfile(profile); // Build engine 
config->setMaxWorkspaceSize(16 * (1 << 20)); // 16MB #ifdef USE_FP16 config->setFlag(BuilderFlag::kFP16); #endif std::cout << "Building engine, please wait for a while..." << std::endl; ICudaEngine* engine = builder->buildEngineWithConfig(*network, *config); std::cout << "Build engine successfully!" << std::endl; // Don't need the network any more network->destroy(); // Release host memory for (auto& mem : weightMap) { free((void*) (mem.second.values)); } return engine; } void APIToModel(unsigned int maxBatchSize, IHostMemory** modelStream) { // Create builder IBuilder* builder = createInferBuilder(gLogger); IBuilderConfig* config = builder->createBuilderConfig(); // Create model to populate the network, then set the outputs and create an engine ICudaEngine* engine = createEngine(maxBatchSize, builder, config, DataType::kFLOAT); assert(engine != nullptr); // Serialize the engine (*modelStream) = engine->serialize(); // Close everything down engine->destroy(); builder->destroy(); } void doInference(IExecutionContext& context, float* input, float* output, cv::Size input_shape) { const ICudaEngine& engine = context.getEngine(); // Pointers to input and output device buffers to pass to engine. // Engine requires exactly IEngine::getNbBindings() number of buffers. assert(engine.getNbBindings() == 2); void* buffers[2]; // In order to bind the buffers, we need to know the names of the input and output tensors. 
// Note that indices are guaranteed to be less than IEngine::getNbBindings() const int inputIndex = engine.getBindingIndex(INPUT_BLOB_NAME); const int outputIndex = engine.getBindingIndex(OUTPUT_BLOB_NAME); context.setBindingDimensions(inputIndex, Dims4(1, 3, input_shape.height, input_shape.width)); // Create GPU buffers on device CHECK(cudaMalloc(&buffers[inputIndex], 3 * input_shape.height * input_shape.width * sizeof(float))); CHECK(cudaMalloc(&buffers[outputIndex], OUTPUT_SIZE * sizeof(float))); // Create stream cudaStream_t stream; CHECK(cudaStreamCreate(&stream)); // DMA input batch data to device, infer on the batch asynchronously, and DMA output back to host CHECK(cudaMemcpyAsync(buffers[inputIndex], input, 3 * input_shape.height * input_shape.width * sizeof(float), cudaMemcpyHostToDevice, stream)); context.enqueueV2(buffers, stream, nullptr); CHECK(cudaMemcpyAsync(output, buffers[outputIndex], OUTPUT_SIZE * sizeof(float), cudaMemcpyDeviceToHost, stream)); cudaStreamSynchronize(stream); // Release stream and buffers cudaStreamDestroy(stream); CHECK(cudaFree(buffers[inputIndex])); CHECK(cudaFree(buffers[outputIndex])); } int read_files_in_dir(const char *p_dir_name, std::vector &file_names) { DIR *p_dir = opendir(p_dir_name); if (p_dir == nullptr) { return -1; } struct dirent* p_file = nullptr; while ((p_file = readdir(p_dir)) != nullptr) { if (strcmp(p_file->d_name, ".") != 0 && strcmp(p_file->d_name, "..") != 0) { //std::string cur_file_name(p_dir_name); //cur_file_name += "/"; //cur_file_name += p_file->d_name; std::string cur_file_name(p_file->d_name); file_names.push_back(cur_file_name); } } closedir(p_dir); return 0; } int main(int argc, char** argv) { cudaSetDevice(DEVICE); // create a model using the API directly and serialize it to a stream char *trtModelStream{nullptr}; size_t size{0}; if (argc == 2 && std::string(argv[1]) == "-s") { IHostMemory* modelStream{nullptr}; APIToModel(1, &modelStream); assert(modelStream != nullptr); std::ofstream 
p("yolov3-spp.engine", std::ios::binary); if (!p) { std::cerr << "could not open plan output file" << std::endl; return -1; } p.write(reinterpret_cast(modelStream->data()), modelStream->size()); modelStream->destroy(); return 0; } else if (argc == 3 && std::string(argv[1]) == "-d") { std::ifstream file("yolov3-spp.engine", std::ios::binary); if (file.good()) { file.seekg(0, file.end); size = file.tellg(); file.seekg(0, file.beg); trtModelStream = new char[size]; assert(trtModelStream); file.read(trtModelStream, size); file.close(); } } else { std::cerr << "arguments not right!" << std::endl; std::cerr << "./yolov3-spp -s // serialize model to plan file" << std::endl; std::cerr << "./yolov3-spp -d ../samples // deserialize plan file and run inference" << std::endl; return -1; } std::vector file_names; if (read_files_in_dir(argv[2], file_names) < 0) { std::cout << "read_files_in_dir failed." << std::endl; return -1; } static float prob[OUTPUT_SIZE]; IRuntime* runtime = createInferRuntime(gLogger); assert(runtime != nullptr); ICudaEngine* engine = runtime->deserializeCudaEngine(trtModelStream, size); assert(engine != nullptr); IExecutionContext* context = engine->createExecutionContext(); assert(context != nullptr); delete[] trtModelStream; context->setOptimizationProfile(0); int fcount = 0; for (auto f: file_names) { fcount++; std::cout << fcount << " " << f << std::endl; cv::Mat img = cv::imread(std::string(argv[2]) + "/" + f); if (img.empty()) continue; cv::Mat pr_img = letterbox(img); std::cout << "letterbox shape: " << pr_img.cols << ", " << pr_img.rows << std::endl; if (pr_img.cols < MIN_INPUT_SIZE || pr_img.rows < MIN_INPUT_SIZE) continue; cv::Mat blob = cv::dnn::blobFromImage(pr_img, 1.0 / 255.0, pr_img.size(), cv::Scalar(0, 0, 0), true, false); // Run inference auto start = std::chrono::system_clock::now(); doInference(*context, blob.ptr(0), prob, pr_img.size()); auto end = std::chrono::system_clock::now(); std::cout << std::chrono::duration_cast(end - 
start).count() << "ms" << std::endl; std::vector res; nms(res, prob); std::cout << "num of bbox: " << res.size() << std::endl; for (size_t j = 0; j < res.size(); j++) { cv::Rect r = get_rect(img.size(), pr_img.size(), res[j].bbox); cv::rectangle(img, r, cv::Scalar(0x27, 0xC1, 0x36), 2); cv::putText(img, std::to_string((int)res[j].class_id), cv::Point(r.x, r.y - 1), cv::FONT_HERSHEY_PLAIN, 1.2, cv::Scalar(0xFF, 0xFF, 0xFF), 2); } cv::imwrite("_" + f, img); } // Destroy the engine context->destroy(); engine->destroy(); runtime->destroy(); return 0; } ================================================ FILE: yolov3-tiny/CMakeLists.txt ================================================ cmake_minimum_required(VERSION 2.6) project(yolov3-tiny) add_definitions(-std=c++11) option(CUDA_USE_STATIC_CUDA_RUNTIME OFF) set(CMAKE_CXX_STANDARD 11) set(CMAKE_BUILD_TYPE Debug) find_package(CUDA REQUIRED) include_directories(${PROJECT_SOURCE_DIR}/include) if (CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64") message("embed_platform on") include_directories(/usr/local/cuda/targets/aarch64-linux/include) link_directories(/usr/local/cuda/targets/aarch64-linux/lib) else() message("embed_platform off") include_directories(/usr/local/cuda/include) link_directories(/usr/local/cuda/lib64) endif() set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11 -Wall -Ofast -Wfatal-errors -D_MWAITXINTRIN_H_INCLUDED") #cuda_add_library(leaky ${PROJECT_SOURCE_DIR}/leaky.cu) cuda_add_library(yololayer SHARED ${PROJECT_SOURCE_DIR}/yololayer.cu) target_link_libraries(yololayer nvinfer cudart) find_package(OpenCV) include_directories(${OpenCV_INCLUDE_DIRS}) add_executable(yolov3-tiny ${PROJECT_SOURCE_DIR}/yolov3-tiny.cpp) target_link_libraries(yolov3-tiny nvinfer) target_link_libraries(yolov3-tiny cudart) target_link_libraries(yolov3-tiny yololayer) target_link_libraries(yolov3-tiny ${OpenCV_LIBS}) add_definitions(-O2 -pthread) ================================================ FILE: yolov3-tiny/README.md 
================================================ # yolov3-tiny The PyTorch implementation is [ultralytics/yolov3 archive branch](https://github.com/ultralytics/yolov3/tree/archive). ## Execute: ``` 1. generate yolov3-tiny.wts from pytorch implementation with yolov3-tiny.cfg and yolov3-tiny.weights, or download .wts from model zoo git clone -b archive https://github.com/ultralytics/yolov3.git // download its weights 'yolov3-tiny.pt' or 'yolov3-tiny.weights' // put tensorrtx/yolov3-tiny/gen_wts.py into ultralytics/yolov3 and run python gen_wts.py yolov3-tiny.weights // a file 'yolov3-tiny.wts' will be generated. 2. put yolov3-tiny.wts into tensorrtx/yolov3-tiny, build and run // go to tensorrtx/yolov3-tiny mkdir build cd build cmake .. make sudo ./yolov3-tiny -s // serialize model to plan file i.e. 'yolov3-tiny.engine' sudo ./yolov3-tiny -d ../../yolov3-spp/samples // deserialize plan file and run inference, the images in samples will be processed. 3. check the images generated, as follows. _zidane.jpg and _bus.jpg ```

## Config - Input shape defined in yololayer.h - Number of classes defined in yololayer.h - FP16/FP32 can be selected by the macro in yolov3-tiny.cpp - GPU id can be selected by the macro in yolov3-tiny.cpp - NMS thresh in yolov3-tiny.cpp - BBox confidence thresh in yolov3-tiny.cpp ## More Information See the readme in [home page.](https://github.com/wang-xinyu/tensorrtx) ================================================ FILE: yolov3-tiny/gen_wts.py ================================================ import struct import sys import torch from models import * # noqa: F403 from utils.utils import * # noqa: F403 model = Darknet('cfg/yolov3-tiny.cfg', (608, 608)) # noqa: F405 weights = sys.argv[1] device = torch_utils.select_device('0') # noqa: F405 if weights.endswith('.pt'): # pytorch format model.load_state_dict(torch.load(weights, map_location=device, weights_only=False)['model']) else: # darknet format load_darknet_weights(model, weights) # noqa: F405 model = model.eval() with open('yolov3-tiny.wts', 'w') as f: f.write('{}\n'.format(len(model.state_dict().keys()))) for k, v in model.state_dict().items(): vr = v.reshape(-1).cpu().numpy() f.write('{} {} '.format(k, len(vr))) for vv in vr: f.write(' ') f.write(struct.pack('>f', float(vv)).hex()) f.write('\n') ================================================ FILE: yolov3-tiny/logging.h ================================================ /* * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
* See the License for the specific language governing permissions and * limitations under the License. */ #ifndef TENSORRT_LOGGING_H #define TENSORRT_LOGGING_H #include "NvInferRuntimeCommon.h" #include #include #include #include #include #include #include #include "macros.h" using Severity = nvinfer1::ILogger::Severity; class LogStreamConsumerBuffer : public std::stringbuf { public: LogStreamConsumerBuffer(std::ostream& stream, const std::string& prefix, bool shouldLog) : mOutput(stream) , mPrefix(prefix) , mShouldLog(shouldLog) { } LogStreamConsumerBuffer(LogStreamConsumerBuffer&& other) : mOutput(other.mOutput) { } ~LogStreamConsumerBuffer() { // std::streambuf::pbase() gives a pointer to the beginning of the buffered part of the output sequence // std::streambuf::pptr() gives a pointer to the current position of the output sequence // if the pointer to the beginning is not equal to the pointer to the current position, // call putOutput() to log the output to the stream if (pbase() != pptr()) { putOutput(); } } // synchronizes the stream buffer and returns 0 on success // synchronizing the stream buffer consists of inserting the buffer contents into the stream, // resetting the buffer and flushing the stream virtual int sync() { putOutput(); return 0; } void putOutput() { if (mShouldLog) { // prepend timestamp std::time_t timestamp = std::time(nullptr); tm* tm_local = std::localtime(×tamp); std::cout << "["; std::cout << std::setw(2) << std::setfill('0') << 1 + tm_local->tm_mon << "/"; std::cout << std::setw(2) << std::setfill('0') << tm_local->tm_mday << "/"; std::cout << std::setw(4) << std::setfill('0') << 1900 + tm_local->tm_year << "-"; std::cout << std::setw(2) << std::setfill('0') << tm_local->tm_hour << ":"; std::cout << std::setw(2) << std::setfill('0') << tm_local->tm_min << ":"; std::cout << std::setw(2) << std::setfill('0') << tm_local->tm_sec << "] "; // std::stringbuf::str() gets the string contents of the buffer // insert the buffer contents 
pre-appended by the appropriate prefix into the stream mOutput << mPrefix << str(); // set the buffer to empty str(""); // flush the stream mOutput.flush(); } } void setShouldLog(bool shouldLog) { mShouldLog = shouldLog; } private: std::ostream& mOutput; std::string mPrefix; bool mShouldLog; }; //! //! \class LogStreamConsumerBase //! \brief Convenience object used to initialize LogStreamConsumerBuffer before std::ostream in LogStreamConsumer //! class LogStreamConsumerBase { public: LogStreamConsumerBase(std::ostream& stream, const std::string& prefix, bool shouldLog) : mBuffer(stream, prefix, shouldLog) { } protected: LogStreamConsumerBuffer mBuffer; }; //! //! \class LogStreamConsumer //! \brief Convenience object used to facilitate use of C++ stream syntax when logging messages. //! Order of base classes is LogStreamConsumerBase and then std::ostream. //! This is because the LogStreamConsumerBase class is used to initialize the LogStreamConsumerBuffer member field //! in LogStreamConsumer and then the address of the buffer is passed to std::ostream. //! This is necessary to prevent the address of an uninitialized buffer from being passed to std::ostream. //! Please do not change the order of the parent classes. //! class LogStreamConsumer : protected LogStreamConsumerBase, public std::ostream { public: //! \brief Creates a LogStreamConsumer which logs messages with level severity. //! Reportable severity determines if the messages are severe enough to be logged. 
LogStreamConsumer(Severity reportableSeverity, Severity severity) : LogStreamConsumerBase(severityOstream(severity), severityPrefix(severity), severity <= reportableSeverity) , std::ostream(&mBuffer) // links the stream buffer with the stream , mShouldLog(severity <= reportableSeverity) , mSeverity(severity) { } LogStreamConsumer(LogStreamConsumer&& other) : LogStreamConsumerBase(severityOstream(other.mSeverity), severityPrefix(other.mSeverity), other.mShouldLog) , std::ostream(&mBuffer) // links the stream buffer with the stream , mShouldLog(other.mShouldLog) , mSeverity(other.mSeverity) { } void setReportableSeverity(Severity reportableSeverity) { mShouldLog = mSeverity <= reportableSeverity; mBuffer.setShouldLog(mShouldLog); } private: static std::ostream& severityOstream(Severity severity) { return severity >= Severity::kINFO ? std::cout : std::cerr; } static std::string severityPrefix(Severity severity) { switch (severity) { case Severity::kINTERNAL_ERROR: return "[F] "; case Severity::kERROR: return "[E] "; case Severity::kWARNING: return "[W] "; case Severity::kINFO: return "[I] "; case Severity::kVERBOSE: return "[V] "; default: assert(0); return ""; } } bool mShouldLog; Severity mSeverity; }; //! \class Logger //! //! \brief Class which manages logging of TensorRT tools and samples //! //! \details This class provides a common interface for TensorRT tools and samples to log information to the console, //! and supports logging two types of messages: //! //! - Debugging messages with an associated severity (info, warning, error, or internal error/fatal) //! - Test pass/fail messages //! //! The advantage of having all samples use this class for logging as opposed to emitting directly to stdout/stderr is //! that the logic for controlling the verbosity and formatting of sample output is centralized in one location. //! //! In the future, this class could be extended to support dumping test results to a file in some standard format //! 
(for example, JUnit XML), and providing additional metadata (e.g. timing the duration of a test run). //! //! TODO: For backwards compatibility with existing samples, this class inherits directly from the nvinfer1::ILogger //! interface, which is problematic since there isn't a clean separation between messages coming from the TensorRT //! library and messages coming from the sample. //! //! In the future (once all samples are updated to use Logger::getTRTLogger() to access the ILogger) we can refactor the //! class to eliminate the inheritance and instead make the nvinfer1::ILogger implementation a member of the Logger //! object. class Logger : public nvinfer1::ILogger { public: Logger(Severity severity = Severity::kWARNING) : mReportableSeverity(severity) { } //! //! \enum TestResult //! \brief Represents the state of a given test //! enum class TestResult { kRUNNING, //!< The test is running kPASSED, //!< The test passed kFAILED, //!< The test failed kWAIVED //!< The test was waived }; //! //! \brief Forward-compatible method for retrieving the nvinfer::ILogger associated with this Logger //! \return The nvinfer1::ILogger associated with this Logger //! //! TODO Once all samples are updated to use this method to register the logger with TensorRT, //! we can eliminate the inheritance of Logger from ILogger //! nvinfer1::ILogger& getTRTLogger() { return *this; } //! //! \brief Implementation of the nvinfer1::ILogger::log() virtual method //! //! Note samples should not be calling this function directly; it will eventually go away once we eliminate the //! inheritance from nvinfer1::ILogger //! void log(Severity severity, const char* msg) TRT_NOEXCEPT override { LogStreamConsumer(mReportableSeverity, severity) << "[TRT] " << std::string(msg) << std::endl; } //! //! \brief Method for controlling the verbosity of logging output //! //! \param severity The logger will only emit messages that have severity of this level or higher. //! 
void setReportableSeverity(Severity severity) { mReportableSeverity = severity; } //! //! \brief Opaque handle that holds logging information for a particular test //! //! This object is an opaque handle to information used by the Logger to print test results. //! The sample must call Logger::defineTest() in order to obtain a TestAtom that can be used //! with Logger::reportTest{Start,End}(). //! class TestAtom { public: TestAtom(TestAtom&&) = default; private: friend class Logger; TestAtom(bool started, const std::string& name, const std::string& cmdline) : mStarted(started) , mName(name) , mCmdline(cmdline) { } bool mStarted; std::string mName; std::string mCmdline; }; //! //! \brief Define a test for logging //! //! \param[in] name The name of the test. This should be a string starting with //! "TensorRT" and containing dot-separated strings containing //! the characters [A-Za-z0-9_]. //! For example, "TensorRT.sample_googlenet" //! \param[in] cmdline The command line used to reproduce the test // //! \return a TestAtom that can be used in Logger::reportTest{Start,End}(). //! static TestAtom defineTest(const std::string& name, const std::string& cmdline) { return TestAtom(false, name, cmdline); } //! //! \brief A convenience overloaded version of defineTest() that accepts an array of command-line arguments //! as input //! //! \param[in] name The name of the test //! \param[in] argc The number of command-line arguments //! \param[in] argv The array of command-line arguments (given as C strings) //! //! \return a TestAtom that can be used in Logger::reportTest{Start,End}(). static TestAtom defineTest(const std::string& name, int argc, char const* const* argv) { auto cmdline = genCmdlineString(argc, argv); return defineTest(name, cmdline); } //! //! \brief Report that a test has started. //! //! \pre reportTestStart() has not been called yet for the given testAtom //! //! \param[in] testAtom The handle to the test that has started //! 
static void reportTestStart(TestAtom& testAtom) { reportTestResult(testAtom, TestResult::kRUNNING); assert(!testAtom.mStarted); testAtom.mStarted = true; } //! //! \brief Report that a test has ended. //! //! \pre reportTestStart() has been called for the given testAtom //! //! \param[in] testAtom The handle to the test that has ended //! \param[in] result The result of the test. Should be one of TestResult::kPASSED, //! TestResult::kFAILED, TestResult::kWAIVED //! static void reportTestEnd(const TestAtom& testAtom, TestResult result) { assert(result != TestResult::kRUNNING); assert(testAtom.mStarted); reportTestResult(testAtom, result); } static int reportPass(const TestAtom& testAtom) { reportTestEnd(testAtom, TestResult::kPASSED); return EXIT_SUCCESS; } static int reportFail(const TestAtom& testAtom) { reportTestEnd(testAtom, TestResult::kFAILED); return EXIT_FAILURE; } static int reportWaive(const TestAtom& testAtom) { reportTestEnd(testAtom, TestResult::kWAIVED); return EXIT_SUCCESS; } static int reportTest(const TestAtom& testAtom, bool pass) { return pass ? reportPass(testAtom) : reportFail(testAtom); } Severity getReportableSeverity() const { return mReportableSeverity; } private: //! //! \brief returns an appropriate string for prefixing a log message with the given severity //! static const char* severityPrefix(Severity severity) { switch (severity) { case Severity::kINTERNAL_ERROR: return "[F] "; case Severity::kERROR: return "[E] "; case Severity::kWARNING: return "[W] "; case Severity::kINFO: return "[I] "; case Severity::kVERBOSE: return "[V] "; default: assert(0); return ""; } } //! //! \brief returns an appropriate string for prefixing a test result message with the given result //! 
static const char* testResultString(TestResult result) { switch (result) { case TestResult::kRUNNING: return "RUNNING"; case TestResult::kPASSED: return "PASSED"; case TestResult::kFAILED: return "FAILED"; case TestResult::kWAIVED: return "WAIVED"; default: assert(0); return ""; } } //! //! \brief returns an appropriate output stream (cout or cerr) to use with the given severity //! static std::ostream& severityOstream(Severity severity) { return severity >= Severity::kINFO ? std::cout : std::cerr; } //! //! \brief method that implements logging test results //! static void reportTestResult(const TestAtom& testAtom, TestResult result) { severityOstream(Severity::kINFO) << "&&&& " << testResultString(result) << " " << testAtom.mName << " # " << testAtom.mCmdline << std::endl; } //! //! \brief generate a command line string from the given (argc, argv) values //! static std::string genCmdlineString(int argc, char const* const* argv) { std::stringstream ss; for (int i = 0; i < argc; i++) { if (i > 0) ss << " "; ss << argv[i]; } return ss.str(); } Severity mReportableSeverity; }; namespace { //! //! \brief produces a LogStreamConsumer object that can be used to log messages of severity kVERBOSE //! //! Example usage: //! //! LOG_VERBOSE(logger) << "hello world" << std::endl; //! inline LogStreamConsumer LOG_VERBOSE(const Logger& logger) { return LogStreamConsumer(logger.getReportableSeverity(), Severity::kVERBOSE); } //! //! \brief produces a LogStreamConsumer object that can be used to log messages of severity kINFO //! //! Example usage: //! //! LOG_INFO(logger) << "hello world" << std::endl; //! inline LogStreamConsumer LOG_INFO(const Logger& logger) { return LogStreamConsumer(logger.getReportableSeverity(), Severity::kINFO); } //! //! \brief produces a LogStreamConsumer object that can be used to log messages of severity kWARNING //! //! Example usage: //! //! LOG_WARN(logger) << "hello world" << std::endl; //! 
inline LogStreamConsumer LOG_WARN(const Logger& logger) { return LogStreamConsumer(logger.getReportableSeverity(), Severity::kWARNING); } //! //! \brief produces a LogStreamConsumer object that can be used to log messages of severity kERROR //! //! Example usage: //! //! LOG_ERROR(logger) << "hello world" << std::endl; //! inline LogStreamConsumer LOG_ERROR(const Logger& logger) { return LogStreamConsumer(logger.getReportableSeverity(), Severity::kERROR); } //! //! \brief produces a LogStreamConsumer object that can be used to log messages of severity kINTERNAL_ERROR // ("fatal" severity) //! //! Example usage: //! //! LOG_FATAL(logger) << "hello world" << std::endl; //! inline LogStreamConsumer LOG_FATAL(const Logger& logger) { return LogStreamConsumer(logger.getReportableSeverity(), Severity::kINTERNAL_ERROR); } } // anonymous namespace #endif // TENSORRT_LOGGING_H ================================================ FILE: yolov3-tiny/macros.h ================================================ #ifndef __MACROS_H #define __MACROS_H #if NV_TENSORRT_MAJOR >= 8 #define TRT_NOEXCEPT noexcept #define TRT_CONST_ENQUEUE const #else #define TRT_NOEXCEPT #define TRT_CONST_ENQUEUE #endif #endif // __MACROS_H ================================================ FILE: yolov3-tiny/utils.h ================================================ #ifndef __TRT_UTILS_H_ #define __TRT_UTILS_H_ #include #include #include #include #include "macros.h" #ifndef CUDA_CHECK #define CUDA_CHECK(callstr) \ { \ cudaError_t error_code = callstr; \ if (error_code != cudaSuccess) { \ std::cerr << "CUDA error " << error_code << " at " << __FILE__ << ":" << __LINE__; \ assert(0); \ } \ } #endif namespace Tn { class Profiler : public nvinfer1::IProfiler { public: void printLayerTimes(int itrationsTimes) { float totalTime = 0; for (size_t i = 0; i < mProfile.size(); i++) { printf("%-40.40s %4.3fms\n", mProfile[i].first.c_str(), mProfile[i].second / itrationsTimes); totalTime += mProfile[i].second; } printf("Time 
over all layers: %4.3f\n", totalTime / itrationsTimes); } private: typedef std::pair Record; std::vector mProfile; virtual void reportLayerTime(const char* layerName, float ms) TRT_NOEXCEPT { auto record = std::find_if(mProfile.begin(), mProfile.end(), [&](const Record& r){ return r.first == layerName; }); if (record == mProfile.end()) mProfile.push_back(std::make_pair(layerName, ms)); else record->second += ms; } }; //Logger for TensorRT info/warning/errors class Logger : public nvinfer1::ILogger { public: Logger(): Logger(Severity::kWARNING) {} Logger(Severity severity): reportableSeverity(severity) {} void log(Severity severity, const char* msg) TRT_NOEXCEPT override { // suppress messages with severity enum value greater than the reportable if (severity > reportableSeverity) return; switch (severity) { case Severity::kINTERNAL_ERROR: std::cerr << "INTERNAL_ERROR: "; break; case Severity::kERROR: std::cerr << "ERROR: "; break; case Severity::kWARNING: std::cerr << "WARNING: "; break; case Severity::kINFO: std::cerr << "INFO: "; break; default: std::cerr << "UNKNOWN: "; break; } std::cerr << msg << std::endl; } Severity reportableSeverity{Severity::kWARNING}; }; template void write(char*& buffer, const T& val) { *reinterpret_cast(buffer) = val; buffer += sizeof(T); } template void read(const char*& buffer, T& val) { val = *reinterpret_cast(buffer); buffer += sizeof(T); } } #endif ================================================ FILE: yolov3-tiny/yololayer.cu ================================================ #include #include "yololayer.h" #include "utils.h" using namespace Yolo; namespace nvinfer1 { YoloLayerPlugin::YoloLayerPlugin() { mClassCount = CLASS_NUM; mYoloKernel.clear(); mYoloKernel.push_back(yolo1); mYoloKernel.push_back(yolo2); mKernelCount = mYoloKernel.size(); } YoloLayerPlugin::~YoloLayerPlugin() { } // create the plugin at runtime from a byte stream YoloLayerPlugin::YoloLayerPlugin(const void* data, size_t length) { using namespace Tn; const char 
*d = reinterpret_cast(data), *a = d; read(d, mClassCount); read(d, mThreadCount); read(d, mKernelCount); mYoloKernel.resize(mKernelCount); auto kernelSize = mKernelCount*sizeof(YoloKernel); memcpy(mYoloKernel.data(),d,kernelSize); d += kernelSize; assert(d == a + length); } void YoloLayerPlugin::serialize(void* buffer) const TRT_NOEXCEPT { using namespace Tn; char* d = static_cast(buffer), *a = d; write(d, mClassCount); write(d, mThreadCount); write(d, mKernelCount); auto kernelSize = mKernelCount*sizeof(YoloKernel); memcpy(d,mYoloKernel.data(),kernelSize); d += kernelSize; assert(d == a + getSerializationSize()); } size_t YoloLayerPlugin::getSerializationSize() const TRT_NOEXCEPT { return sizeof(mClassCount) + sizeof(mThreadCount) + sizeof(mKernelCount) + sizeof(Yolo::YoloKernel) * mYoloKernel.size(); } int YoloLayerPlugin::initialize() TRT_NOEXCEPT { return 0; } Dims YoloLayerPlugin::getOutputDimensions(int index, const Dims* inputs, int nbInputDims) TRT_NOEXCEPT { //output the result to channel int totalsize = MAX_OUTPUT_BBOX_COUNT * sizeof(Detection) / sizeof(float); return Dims3(totalsize + 1, 1, 1); } // Set plugin namespace void YoloLayerPlugin::setPluginNamespace(const char* pluginNamespace) TRT_NOEXCEPT { mPluginNamespace = pluginNamespace; } const char* YoloLayerPlugin::getPluginNamespace() const TRT_NOEXCEPT { return mPluginNamespace; } // Return the DataType of the plugin output at the requested index DataType YoloLayerPlugin::getOutputDataType(int index, const nvinfer1::DataType* inputTypes, int nbInputs) const TRT_NOEXCEPT { return DataType::kFLOAT; } // Return true if output tensor is broadcast across a batch. bool YoloLayerPlugin::isOutputBroadcastAcrossBatch(int outputIndex, const bool* inputIsBroadcasted, int nbInputs) const TRT_NOEXCEPT { return false; } // Return true if plugin can use input that is broadcast across batch without replication. 
bool YoloLayerPlugin::canBroadcastInputAcrossBatch(int inputIndex) const TRT_NOEXCEPT { return false; } void YoloLayerPlugin::configurePlugin(const PluginTensorDesc* in, int nbInput, const PluginTensorDesc* out, int nbOutput) TRT_NOEXCEPT { } // Attach the plugin object to an execution context and grant the plugin the access to some context resource. void YoloLayerPlugin::attachToContext(cudnnContext* cudnnContext, cublasContext* cublasContext, IGpuAllocator* gpuAllocator) TRT_NOEXCEPT { } // Detach the plugin object from its execution context. void YoloLayerPlugin::detachFromContext() TRT_NOEXCEPT {} const char* YoloLayerPlugin::getPluginType() const TRT_NOEXCEPT { return "YoloLayer_TRT"; } const char* YoloLayerPlugin::getPluginVersion() const TRT_NOEXCEPT { return "1"; } void YoloLayerPlugin::destroy() TRT_NOEXCEPT { delete this; } // Clone the plugin IPluginV2IOExt* YoloLayerPlugin::clone() const TRT_NOEXCEPT { YoloLayerPlugin *p = new YoloLayerPlugin(); p->setPluginNamespace(mPluginNamespace); return p; } __device__ float Logist(float data){ return 1.0f / (1.0f + expf(-data)); }; __global__ void CalDetection(const float *input, float *output,int noElements, int yoloWidth,int yoloHeight,const float anchors[CHECK_COUNT*2],int classes,int outputElem) { int idx = threadIdx.x + blockDim.x * blockIdx.x; if (idx >= noElements) return; int total_grid = yoloWidth * yoloHeight; int bnIdx = idx / total_grid; idx = idx - total_grid*bnIdx; int info_len_i = 5 + classes; const float* curInput = input + bnIdx * (info_len_i * total_grid * CHECK_COUNT); for (int k = 0; k < 3; ++k) { int class_id = 0; float max_cls_prob = 0.0; for (int i = 5; i < info_len_i; ++i) { float p = Logist(curInput[idx + k * info_len_i * total_grid + i * total_grid]); if (p > max_cls_prob) { max_cls_prob = p; class_id = i - 5; } } float box_prob = Logist(curInput[idx + k * info_len_i * total_grid + 4 * total_grid]); if (max_cls_prob < IGNORE_THRESH || box_prob < IGNORE_THRESH) continue; float *res_count 
= output + bnIdx*outputElem; int count = (int)atomicAdd(res_count, 1); if (count >= MAX_OUTPUT_BBOX_COUNT) return; char* data = (char * )res_count + sizeof(float) + count*sizeof(Detection); Detection* det = (Detection*)(data); int row = idx / yoloWidth; int col = idx % yoloWidth; //Location det->bbox[0] = (col + Logist(curInput[idx + k * info_len_i * total_grid + 0 * total_grid])) * INPUT_W / yoloWidth; det->bbox[1] = (row + Logist(curInput[idx + k * info_len_i * total_grid + 1 * total_grid])) * INPUT_H / yoloHeight; det->bbox[2] = expf(curInput[idx + k * info_len_i * total_grid + 2 * total_grid]) * anchors[2*k]; det->bbox[3] = expf(curInput[idx + k * info_len_i * total_grid + 3 * total_grid]) * anchors[2*k + 1]; det->det_confidence = box_prob; det->class_id = class_id; det->class_confidence = max_cls_prob; } } void YoloLayerPlugin::forwardGpu(const float *const * inputs, float* output, cudaStream_t stream, int batchSize) { void* devAnchor; size_t AnchorLen = sizeof(float)* CHECK_COUNT*2; CUDA_CHECK(cudaMalloc(&devAnchor,AnchorLen)); int outputElem = 1 + MAX_OUTPUT_BBOX_COUNT * sizeof(Detection) / sizeof(float); for(int idx = 0 ; idx < batchSize; ++idx) { CUDA_CHECK(cudaMemset(output + idx*outputElem, 0, sizeof(float))); } int numElem = 0; for (unsigned int i = 0;i< mYoloKernel.size();++i) { const auto& yolo = mYoloKernel[i]; numElem = yolo.width*yolo.height*batchSize; if (numElem < mThreadCount) mThreadCount = numElem; CUDA_CHECK(cudaMemcpy(devAnchor, yolo.anchors, AnchorLen, cudaMemcpyHostToDevice)); CalDetection<<< (yolo.width*yolo.height*batchSize + mThreadCount - 1) / mThreadCount, mThreadCount>>> (inputs[i],output, numElem, yolo.width, yolo.height, (float *)devAnchor, mClassCount ,outputElem); } CUDA_CHECK(cudaFree(devAnchor)); } int YoloLayerPlugin::enqueue(int batchSize, const void*const * inputs, void*TRT_CONST_ENQUEUE* outputs, void* workspace, cudaStream_t stream) TRT_NOEXCEPT { //assert(batchSize == 1); //GPU //CUDA_CHECK(cudaStreamSynchronize(stream)); 
forwardGpu((const float *const *)inputs, (float*)outputs[0], stream, batchSize); return 0; } PluginFieldCollection YoloPluginCreator::mFC{}; std::vector YoloPluginCreator::mPluginAttributes; YoloPluginCreator::YoloPluginCreator() { mPluginAttributes.clear(); mFC.nbFields = mPluginAttributes.size(); mFC.fields = mPluginAttributes.data(); } const char* YoloPluginCreator::getPluginName() const TRT_NOEXCEPT { return "YoloLayer_TRT"; } const char* YoloPluginCreator::getPluginVersion() const TRT_NOEXCEPT { return "1"; } const PluginFieldCollection* YoloPluginCreator::getFieldNames() TRT_NOEXCEPT { return &mFC; } IPluginV2IOExt* YoloPluginCreator::createPlugin(const char* name, const PluginFieldCollection* fc) TRT_NOEXCEPT { YoloLayerPlugin* obj = new YoloLayerPlugin(); obj->setPluginNamespace(mNamespace.c_str()); return obj; } IPluginV2IOExt* YoloPluginCreator::deserializePlugin(const char* name, const void* serialData, size_t serialLength) TRT_NOEXCEPT { // This object will be deleted when the network is destroyed, which will // call MishPlugin::destroy() YoloLayerPlugin* obj = new YoloLayerPlugin(serialData, serialLength); obj->setPluginNamespace(mNamespace.c_str()); return obj; } } ================================================ FILE: yolov3-tiny/yololayer.h ================================================ #ifndef _YOLO_LAYER_H #define _YOLO_LAYER_H #include #include #include "NvInfer.h" #include "macros.h" namespace Yolo { static constexpr int CHECK_COUNT = 3; static constexpr float IGNORE_THRESH = 0.1f; static constexpr int MAX_OUTPUT_BBOX_COUNT = 1000; static constexpr int CLASS_NUM = 80; static constexpr int INPUT_H = 608; static constexpr int INPUT_W = 608; struct YoloKernel { int width; int height; float anchors[CHECK_COUNT*2]; }; static constexpr YoloKernel yolo1 = { INPUT_W / 32, INPUT_H / 32, {81,82, 135,169, 344,319} }; static constexpr YoloKernel yolo2 = { INPUT_W / 16, INPUT_H / 16, {23,27, 37,58, 81,82} }; static constexpr int LOCATIONS = 4; struct 
alignas(float) Detection{ //x y w h float bbox[LOCATIONS]; float det_confidence; float class_id; float class_confidence; }; } namespace nvinfer1 { class YoloLayerPlugin: public IPluginV2IOExt { public: explicit YoloLayerPlugin(); YoloLayerPlugin(const void* data, size_t length); ~YoloLayerPlugin(); int getNbOutputs() const TRT_NOEXCEPT override { return 1; } Dims getOutputDimensions(int index, const Dims* inputs, int nbInputDims) TRT_NOEXCEPT override; int initialize() TRT_NOEXCEPT override; virtual void terminate() TRT_NOEXCEPT override {}; virtual size_t getWorkspaceSize(int maxBatchSize) const TRT_NOEXCEPT override { return 0;} virtual int enqueue(int batchSize, const void*const * inputs, void*TRT_CONST_ENQUEUE* outputs, void* workspace, cudaStream_t stream) TRT_NOEXCEPT override; virtual size_t getSerializationSize() const TRT_NOEXCEPT override; virtual void serialize(void* buffer) const TRT_NOEXCEPT override; bool supportsFormatCombination(int pos, const PluginTensorDesc* inOut, int nbInputs, int nbOutputs) const TRT_NOEXCEPT override { return inOut[pos].format == TensorFormat::kLINEAR && inOut[pos].type == DataType::kFLOAT; } const char* getPluginType() const TRT_NOEXCEPT override; const char* getPluginVersion() const TRT_NOEXCEPT override; void destroy() TRT_NOEXCEPT override; IPluginV2IOExt* clone() const TRT_NOEXCEPT override; void setPluginNamespace(const char* pluginNamespace) TRT_NOEXCEPT override; const char* getPluginNamespace() const TRT_NOEXCEPT override; DataType getOutputDataType(int index, const nvinfer1::DataType* inputTypes, int nbInputs) const TRT_NOEXCEPT override; bool isOutputBroadcastAcrossBatch(int outputIndex, const bool* inputIsBroadcasted, int nbInputs) const TRT_NOEXCEPT override; bool canBroadcastInputAcrossBatch(int inputIndex) const TRT_NOEXCEPT override; void attachToContext( cudnnContext* cudnnContext, cublasContext* cublasContext, IGpuAllocator* gpuAllocator) TRT_NOEXCEPT override; void configurePlugin(const PluginTensorDesc* 
in, int nbInput, const PluginTensorDesc* out, int nbOutput) TRT_NOEXCEPT override; void detachFromContext() TRT_NOEXCEPT override; private: void forwardGpu(const float *const * inputs,float * output, cudaStream_t stream,int batchSize = 1); int mClassCount; int mKernelCount; std::vector mYoloKernel; int mThreadCount = 256; const char* mPluginNamespace; }; class YoloPluginCreator : public IPluginCreator { public: YoloPluginCreator(); ~YoloPluginCreator() override = default; const char* getPluginName() const TRT_NOEXCEPT override; const char* getPluginVersion() const TRT_NOEXCEPT override; const PluginFieldCollection* getFieldNames() TRT_NOEXCEPT override; IPluginV2IOExt* createPlugin(const char* name, const PluginFieldCollection* fc) TRT_NOEXCEPT override; IPluginV2IOExt* deserializePlugin(const char* name, const void* serialData, size_t serialLength) TRT_NOEXCEPT override; void setPluginNamespace(const char* libNamespace) TRT_NOEXCEPT override { mNamespace = libNamespace; } const char* getPluginNamespace() const TRT_NOEXCEPT override { return mNamespace.c_str(); } private: std::string mNamespace; static PluginFieldCollection mFC; static std::vector mPluginAttributes; }; REGISTER_TENSORRT_PLUGIN(YoloPluginCreator); }; #endif ================================================ FILE: yolov3-tiny/yolov3-tiny.cpp ================================================ #include #include #include #include #include #include #include #include #include "NvInfer.h" #include "cuda_runtime_api.h" #include "logging.h" #include "yololayer.h" #define CHECK(status) \ do\ {\ auto ret = (status);\ if (ret != 0)\ {\ std::cerr << "Cuda failure: " << ret << std::endl;\ abort();\ }\ } while (0) #define USE_FP16 // comment out this if want to use FP32 #define DEVICE 0 // GPU id #define NMS_THRESH 0.5 #define BBOX_CONF_THRESH 0.4 using namespace nvinfer1; // stuff we know about the network and the input/output blobs static const int INPUT_H = Yolo::INPUT_H; static const int INPUT_W = Yolo::INPUT_W; 
static const int OUTPUT_SIZE = 1000 * 7 + 1; // we assume the yololayer outputs no more than 1000 boxes that conf >= 0.1 const char* INPUT_BLOB_NAME = "data"; const char* OUTPUT_BLOB_NAME = "prob"; static Logger gLogger; cv::Mat preprocess_img(cv::Mat& img) { int w, h, x, y; float r_w = INPUT_W / (img.cols*1.0); float r_h = INPUT_H / (img.rows*1.0); if (r_h > r_w) { w = INPUT_W; h = r_w * img.rows; x = 0; y = (INPUT_H - h) / 2; } else { w = r_h* img.cols; h = INPUT_H; x = (INPUT_W - w) / 2; y = 0; } cv::Mat re(h, w, CV_8UC3); cv::resize(img, re, re.size(), 0, 0, cv::INTER_CUBIC); cv::Mat out(INPUT_H, INPUT_W, CV_8UC3, cv::Scalar(128, 128, 128)); re.copyTo(out(cv::Rect(x, y, re.cols, re.rows))); return out; } cv::Rect get_rect(cv::Mat& img, float bbox[4]) { int l, r, t, b; float r_w = INPUT_W / (img.cols * 1.0); float r_h = INPUT_H / (img.rows * 1.0); if (r_h > r_w) { l = bbox[0] - bbox[2]/2.f; r = bbox[0] + bbox[2]/2.f; t = bbox[1] - bbox[3]/2.f - (INPUT_H - r_w * img.rows) / 2; b = bbox[1] + bbox[3]/2.f - (INPUT_H - r_w * img.rows) / 2; l = l / r_w; r = r / r_w; t = t / r_w; b = b / r_w; } else { l = bbox[0] - bbox[2]/2.f - (INPUT_W - r_h * img.cols) / 2; r = bbox[0] + bbox[2]/2.f - (INPUT_W - r_h * img.cols) / 2; t = bbox[1] - bbox[3]/2.f; b = bbox[1] + bbox[3]/2.f; l = l / r_h; r = r / r_h; t = t / r_h; b = b / r_h; } return cv::Rect(l, t, r-l, b-t); } float iou(float lbox[4], float rbox[4]) { float interBox[] = { std::max(lbox[0] - lbox[2]/2.f , rbox[0] - rbox[2]/2.f), //left std::min(lbox[0] + lbox[2]/2.f , rbox[0] + rbox[2]/2.f), //right std::max(lbox[1] - lbox[3]/2.f , rbox[1] - rbox[3]/2.f), //top std::min(lbox[1] + lbox[3]/2.f , rbox[1] + rbox[3]/2.f), //bottom }; if(interBox[2] > interBox[3] || interBox[0] > interBox[1]) return 0.0f; float interBoxS =(interBox[1]-interBox[0])*(interBox[3]-interBox[2]); return interBoxS/(lbox[2]*lbox[3] + rbox[2]*rbox[3] -interBoxS); } bool cmp(const Yolo::Detection& a, const Yolo::Detection& b) { return a.det_confidence > 
b.det_confidence; } void nms(std::vector& res, float *output, float nms_thresh = NMS_THRESH) { std::map> m; for (int i = 0; i < output[0] && i < 1000; i++) { if (output[1 + 7 * i + 4] <= BBOX_CONF_THRESH) continue; Yolo::Detection det; memcpy(&det, &output[1 + 7 * i], 7 * sizeof(float)); if (m.count(det.class_id) == 0) m.emplace(det.class_id, std::vector()); m[det.class_id].push_back(det); } for (auto it = m.begin(); it != m.end(); it++) { //std::cout << it->second[0].class_id << " --- " << std::endl; auto& dets = it->second; std::sort(dets.begin(), dets.end(), cmp); for (size_t m = 0; m < dets.size(); ++m) { auto& item = dets[m]; res.push_back(item); for (size_t n = m + 1; n < dets.size(); ++n) { if (iou(item.bbox, dets[n].bbox) > nms_thresh) { dets.erase(dets.begin()+n); --n; } } } } } // TensorRT weight files have a simple space delimited format: // [type] [size] std::map loadWeights(const std::string file) { std::cout << "Loading weights: " << file << std::endl; std::map weightMap; // Open weights file std::ifstream input(file); assert(input.is_open() && "Unable to load weight file."); // Read number of weight blobs int32_t count; input >> count; assert(count > 0 && "Invalid weight map file."); while (count--) { Weights wt{DataType::kFLOAT, nullptr, 0}; uint32_t size; // Read name and type of blob std::string name; input >> name >> std::dec >> size; wt.type = DataType::kFLOAT; // Load blob uint32_t* val = reinterpret_cast(malloc(sizeof(val) * size)); for (uint32_t x = 0, y = size; x < y; ++x) { input >> std::hex >> val[x]; } wt.values = val; wt.count = size; weightMap[name] = wt; } return weightMap; } IScaleLayer* addBatchNorm2d(INetworkDefinition *network, std::map& weightMap, ITensor& input, std::string lname, float eps) { float *gamma = (float*)weightMap[lname + ".weight"].values; float *beta = (float*)weightMap[lname + ".bias"].values; float *mean = (float*)weightMap[lname + ".running_mean"].values; float *var = (float*)weightMap[lname + 
".running_var"].values; int len = weightMap[lname + ".running_var"].count; float *scval = reinterpret_cast(malloc(sizeof(float) * len)); for (int i = 0; i < len; i++) { scval[i] = gamma[i] / sqrt(var[i] + eps); } Weights scale{DataType::kFLOAT, scval, len}; float *shval = reinterpret_cast(malloc(sizeof(float) * len)); for (int i = 0; i < len; i++) { shval[i] = beta[i] - mean[i] * gamma[i] / sqrt(var[i] + eps); } Weights shift{DataType::kFLOAT, shval, len}; float *pval = reinterpret_cast(malloc(sizeof(float) * len)); for (int i = 0; i < len; i++) { pval[i] = 1.0; } Weights power{DataType::kFLOAT, pval, len}; weightMap[lname + ".scale"] = scale; weightMap[lname + ".shift"] = shift; weightMap[lname + ".power"] = power; IScaleLayer* scale_1 = network->addScale(input, ScaleMode::kCHANNEL, shift, scale, power); assert(scale_1); return scale_1; } ILayer* convBnLeaky(INetworkDefinition *network, std::map& weightMap, ITensor& input, int outch, int ksize, int s, int p, int linx) { Weights emptywts{DataType::kFLOAT, nullptr, 0}; IConvolutionLayer* conv1 = network->addConvolutionNd(input, outch, DimsHW{ksize, ksize}, weightMap["module_list." + std::to_string(linx) + ".Conv2d.weight"], emptywts); assert(conv1); conv1->setStrideNd(DimsHW{s, s}); conv1->setPaddingNd(DimsHW{p, p}); IScaleLayer* bn1 = addBatchNorm2d(network, weightMap, *conv1->getOutput(0), "module_list." + std::to_string(linx) + ".BatchNorm2d", 1e-4); auto lr = network->addActivation(*bn1->getOutput(0), ActivationType::kLEAKY_RELU); lr->setAlpha(0.1); return lr; } // Creat the engine using only the API and not any parser. 
ICudaEngine* createEngine(unsigned int maxBatchSize, IBuilder* builder, IBuilderConfig* config, DataType dt) { INetworkDefinition* network = builder->createNetworkV2(0U); // Create input tensor of shape {3, INPUT_H, INPUT_W} with name INPUT_BLOB_NAME ITensor* data = network->addInput(INPUT_BLOB_NAME, dt, Dims3{3, INPUT_H, INPUT_W}); assert(data); std::map weightMap = loadWeights("../yolov3-tiny.wts"); Weights emptywts{DataType::kFLOAT, nullptr, 0}; auto lr0 = convBnLeaky(network, weightMap, *data, 16, 3, 1, 1, 0); auto pool1 = network->addPoolingNd(*lr0->getOutput(0), PoolingType::kMAX, DimsHW{2, 2}); pool1->setStrideNd(DimsHW{2, 2}); auto lr2 = convBnLeaky(network, weightMap, *pool1->getOutput(0), 32, 3, 1, 1, 2); auto pool3 = network->addPoolingNd(*lr2->getOutput(0), PoolingType::kMAX, DimsHW{2, 2}); pool3->setStrideNd(DimsHW{2, 2}); auto lr4 = convBnLeaky(network, weightMap, *pool3->getOutput(0), 64, 3, 1, 1, 4); auto pool5 = network->addPoolingNd(*lr4->getOutput(0), PoolingType::kMAX, DimsHW{2, 2}); pool5->setStrideNd(DimsHW{2, 2}); auto lr6 = convBnLeaky(network, weightMap, *pool5->getOutput(0), 128, 3, 1, 1, 6); auto pool7 = network->addPoolingNd(*lr6->getOutput(0), PoolingType::kMAX, DimsHW{2, 2}); pool7->setStrideNd(DimsHW{2, 2}); auto lr8 = convBnLeaky(network, weightMap, *pool7->getOutput(0), 256, 3, 1, 1, 8); auto pool9 = network->addPoolingNd(*lr8->getOutput(0), PoolingType::kMAX, DimsHW{2, 2}); pool9->setStrideNd(DimsHW{2, 2}); auto lr10 = convBnLeaky(network, weightMap, *pool9->getOutput(0), 512, 3, 1, 1, 10); auto pad11 = network->addPaddingNd(*lr10->getOutput(0), DimsHW{0, 0}, DimsHW{1, 1}); auto pool11 = network->addPoolingNd(*pad11->getOutput(0), PoolingType::kMAX, DimsHW{2, 2}); pool11->setStrideNd(DimsHW{1, 1}); auto lr12 = convBnLeaky(network, weightMap, *pool11->getOutput(0), 1024, 3, 1, 1, 12); auto lr13 = convBnLeaky(network, weightMap, *lr12->getOutput(0), 256, 1, 1, 0, 13); auto lr14 = convBnLeaky(network, weightMap, *lr13->getOutput(0), 
512, 3, 1, 1, 14); IConvolutionLayer* conv15 = network->addConvolutionNd(*lr14->getOutput(0), 3 * (Yolo::CLASS_NUM + 5), DimsHW{1, 1}, weightMap["module_list.15.Conv2d.weight"], weightMap["module_list.15.Conv2d.bias"]); // 16 is yolo auto l17 = lr13; auto lr18 = convBnLeaky(network, weightMap, *l17->getOutput(0), 128, 1, 1, 0, 18); float *deval = reinterpret_cast(malloc(sizeof(float) * 128 * 2 * 2)); for (int i = 0; i < 128 * 2 * 2; i++) { deval[i] = 1.0; } Weights deconvwts19{DataType::kFLOAT, deval, 128 * 2 * 2}; IDeconvolutionLayer* deconv19 = network->addDeconvolutionNd(*lr18->getOutput(0), 128, DimsHW{2, 2}, deconvwts19, emptywts); assert(deconv19); deconv19->setStrideNd(DimsHW{2, 2}); deconv19->setNbGroups(128); weightMap["deconv19"] = deconvwts19; ITensor* inputTensors[] = {deconv19->getOutput(0), lr8->getOutput(0)}; auto cat20 = network->addConcatenation(inputTensors, 2); auto lr21 = convBnLeaky(network, weightMap, *cat20->getOutput(0), 256, 3, 1, 1, 21); IConvolutionLayer* conv22 = network->addConvolutionNd(*lr21->getOutput(0), 3 * (Yolo::CLASS_NUM + 5), DimsHW{1, 1}, weightMap["module_list.22.Conv2d.weight"], weightMap["module_list.22.Conv2d.bias"]); // 22 is yolo auto creator = getPluginRegistry()->getPluginCreator("YoloLayer_TRT", "1"); const PluginFieldCollection* pluginData = creator->getFieldNames(); IPluginV2 *pluginObj = creator->createPlugin("yololayer", pluginData); ITensor* inputTensors_yolo[] = {conv15->getOutput(0), conv22->getOutput(0)}; auto yolo = network->addPluginV2(inputTensors_yolo, 2, *pluginObj); yolo->getOutput(0)->setName(OUTPUT_BLOB_NAME); network->markOutput(*yolo->getOutput(0)); // Build engine builder->setMaxBatchSize(maxBatchSize); config->setMaxWorkspaceSize(16 * (1 << 20)); // 16MB #ifdef USE_FP16 config->setFlag(BuilderFlag::kFP16); #endif std::cout << "Building engine, please wait for a while..." << std::endl; ICudaEngine* engine = builder->buildEngineWithConfig(*network, *config); std::cout << "Build engine successfully!" 
<< std::endl; // Don't need the network any more network->destroy(); // Release host memory for (auto& mem : weightMap) { free((void*) (mem.second.values)); } return engine; } void APIToModel(unsigned int maxBatchSize, IHostMemory** modelStream) { // Create builder IBuilder* builder = createInferBuilder(gLogger); IBuilderConfig* config = builder->createBuilderConfig(); // Create model to populate the network, then set the outputs and create an engine ICudaEngine* engine = createEngine(maxBatchSize, builder, config, DataType::kFLOAT); assert(engine != nullptr); // Serialize the engine (*modelStream) = engine->serialize(); // Close everything down engine->destroy(); builder->destroy(); } void doInference(IExecutionContext& context, float* input, float* output, int batchSize) { const ICudaEngine& engine = context.getEngine(); // Pointers to input and output device buffers to pass to engine. // Engine requires exactly IEngine::getNbBindings() number of buffers. assert(engine.getNbBindings() == 2); void* buffers[2]; // In order to bind the buffers, we need to know the names of the input and output tensors. 
// Note that indices are guaranteed to be less than IEngine::getNbBindings() const int inputIndex = engine.getBindingIndex(INPUT_BLOB_NAME); const int outputIndex = engine.getBindingIndex(OUTPUT_BLOB_NAME); // Create GPU buffers on device CHECK(cudaMalloc(&buffers[inputIndex], batchSize * 3 * INPUT_H * INPUT_W * sizeof(float))); CHECK(cudaMalloc(&buffers[outputIndex], batchSize * OUTPUT_SIZE * sizeof(float))); // Create stream cudaStream_t stream; CHECK(cudaStreamCreate(&stream)); // DMA input batch data to device, infer on the batch asynchronously, and DMA output back to host CHECK(cudaMemcpyAsync(buffers[inputIndex], input, batchSize * 3 * INPUT_H * INPUT_W * sizeof(float), cudaMemcpyHostToDevice, stream)); context.enqueue(batchSize, buffers, stream, nullptr); CHECK(cudaMemcpyAsync(output, buffers[outputIndex], batchSize * OUTPUT_SIZE * sizeof(float), cudaMemcpyDeviceToHost, stream)); cudaStreamSynchronize(stream); // Release stream and buffers cudaStreamDestroy(stream); CHECK(cudaFree(buffers[inputIndex])); CHECK(cudaFree(buffers[outputIndex])); } int read_files_in_dir(const char *p_dir_name, std::vector &file_names) { DIR *p_dir = opendir(p_dir_name); if (p_dir == nullptr) { return -1; } struct dirent* p_file = nullptr; while ((p_file = readdir(p_dir)) != nullptr) { if (strcmp(p_file->d_name, ".") != 0 && strcmp(p_file->d_name, "..") != 0) { //std::string cur_file_name(p_dir_name); //cur_file_name += "/"; //cur_file_name += p_file->d_name; std::string cur_file_name(p_file->d_name); file_names.push_back(cur_file_name); } } closedir(p_dir); return 0; } int main(int argc, char** argv) { cudaSetDevice(DEVICE); // create a model using the API directly and serialize it to a stream char *trtModelStream{nullptr}; size_t size{0}; if (argc == 2 && std::string(argv[1]) == "-s") { IHostMemory* modelStream{nullptr}; APIToModel(1, &modelStream); assert(modelStream != nullptr); std::ofstream p("yolov3-tiny.engine", std::ios::binary); if (!p) { std::cerr << "could not open 
plan output file" << std::endl; return -1; } p.write(reinterpret_cast(modelStream->data()), modelStream->size()); modelStream->destroy(); return 0; } else if (argc == 3 && std::string(argv[1]) == "-d") { std::ifstream file("yolov3-tiny.engine", std::ios::binary); if (file.good()) { file.seekg(0, file.end); size = file.tellg(); file.seekg(0, file.beg); trtModelStream = new char[size]; assert(trtModelStream); file.read(trtModelStream, size); file.close(); } } else { std::cerr << "arguments not right!" << std::endl; std::cerr << "./yolov3-tiny -s // serialize model to plan file" << std::endl; std::cerr << "./yolov3-tiny -d ../samples // deserialize plan file and run inference" << std::endl; return -1; } std::vector file_names; if (read_files_in_dir(argv[2], file_names) < 0) { std::cout << "read_files_in_dir failed." << std::endl; return -1; } // prepare input data --------------------------- static float data[3 * INPUT_H * INPUT_W]; //for (int i = 0; i < 3 * INPUT_H * INPUT_W; i++) // data[i] = 1.0; static float prob[OUTPUT_SIZE]; IRuntime* runtime = createInferRuntime(gLogger); assert(runtime != nullptr); ICudaEngine* engine = runtime->deserializeCudaEngine(trtModelStream, size); assert(engine != nullptr); IExecutionContext* context = engine->createExecutionContext(); assert(context != nullptr); delete[] trtModelStream; int fcount = 0; for (auto f: file_names) { fcount++; std::cout << fcount << " " << f << std::endl; cv::Mat img = cv::imread(std::string(argv[2]) + "/" + f); if (img.empty()) continue; cv::Mat pr_img = preprocess_img(img); for (int i = 0; i < INPUT_H * INPUT_W; i++) { data[i] = pr_img.at(i)[2] / 255.0; data[i + INPUT_H * INPUT_W] = pr_img.at(i)[1] / 255.0; data[i + 2 * INPUT_H * INPUT_W] = pr_img.at(i)[0] / 255.0; } // Run inference auto start = std::chrono::system_clock::now(); doInference(*context, data, prob, 1); auto end = std::chrono::system_clock::now(); std::cout << std::chrono::duration_cast(end - start).count() << "ms" << std::endl; 
std::vector res; nms(res, prob); for (int i=0; i<20; i++) { std::cout << prob[i] << ","; } std::cout << res.size() << std::endl; for (size_t j = 0; j < res.size(); j++) { float *p = (float*)&res[j]; for (size_t k = 0; k < 7; k++) { std::cout << p[k] << ", "; } std::cout << std::endl; cv::Rect r = get_rect(img, res[j].bbox); cv::rectangle(img, r, cv::Scalar(0x27, 0xC1, 0x36), 2); cv::putText(img, std::to_string((int)res[j].class_id), cv::Point(r.x, r.y - 1), cv::FONT_HERSHEY_PLAIN, 1.2, cv::Scalar(0xFF, 0xFF, 0xFF), 2); } cv::imwrite("_" + f, img); } // Destroy the engine context->destroy(); engine->destroy(); runtime->destroy(); return 0; } ================================================ FILE: yolov4/CMakeLists.txt ================================================ cmake_minimum_required(VERSION 2.6) project(yolov4) add_definitions(-std=c++11) option(CUDA_USE_STATIC_CUDA_RUNTIME OFF) set(CMAKE_CXX_STANDARD 11) set(CMAKE_BUILD_TYPE Debug) find_package(CUDA REQUIRED) include_directories(${PROJECT_SOURCE_DIR}/include) # include and link dirs of cuda and tensorrt, you need adapt them if yours are different # cuda include_directories(/usr/local/cuda/include) link_directories(/usr/local/cuda/lib64) # tensorrt include_directories(/usr/include/x86_64-linux-gnu/) link_directories(/usr/lib/x86_64-linux-gnu/) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11 -Wall -Ofast -Wfatal-errors -D_MWAITXINTRIN_H_INCLUDED") cuda_add_library(myplugins SHARED ${PROJECT_SOURCE_DIR}/yololayer.cu ${PROJECT_SOURCE_DIR}/mish.cu) target_link_libraries(myplugins nvinfer cudart) find_package(OpenCV) include_directories(${OpenCV_INCLUDE_DIRS}) add_executable(yolov4 ${PROJECT_SOURCE_DIR}/yolov4.cpp) target_link_libraries(yolov4 nvinfer) target_link_libraries(yolov4 cudart) target_link_libraries(yolov4 myplugins) target_link_libraries(yolov4 ${OpenCV_LIBS}) add_definitions(-O2 -pthread) ================================================ FILE: yolov4/README.md 
================================================
# yolov4

The PyTorch implementation is from [ultralytics/yolov3 archive branch](https://github.com/ultralytics/yolov3/tree/archive). It can load yolov4.cfg and yolov4.weights (from AlexeyAB/darknet).

## Config

- Input shape `INPUT_H`, `INPUT_W` defined in yololayer.h
- Number of classes `CLASS_NUM` defined in yololayer.h
- FP16/FP32 can be selected by the macro `USE_FP16` in yolov4.cpp
- GPU id can be selected by the macro `DEVICE` in yolov4.cpp
- NMS threshold `NMS_THRESH` in yolov4.cpp
- Bbox confidence threshold `BBOX_CONF_THRESH` in yolov4.cpp
- Batch size `BATCH_SIZE` in yolov4.cpp

## How to run

1. Generate yolov4.wts from the PyTorch implementation with yolov4.cfg and yolov4.weights, or download the .wts from the model zoo. Note that gen_wts.py reads the network definition from `cfg/yolov4.cfg`, relative to the directory it is run from.

```
git clone https://github.com/wang-xinyu/tensorrtx.git
git clone -b archive https://github.com/ultralytics/yolov3.git
// download yolov4.weights from https://github.com/AlexeyAB/darknet#pre-trained-models
cp {tensorrtx}/yolov4/gen_wts.py {ultralytics/yolov3/}
cd {ultralytics/yolov3/}
python gen_wts.py yolov4.weights
// a file 'yolov4.wts' will be generated.
// the master branch of yolov3 should work, if not, you can checkout be87b41aa2fe59be8e62f4b488052b24ad0bd450
```

2. Put yolov4.wts into {tensorrtx}/yolov4, then build and run.

```
mv yolov4.wts {tensorrtx}/yolov4/
cd {tensorrtx}/yolov4
mkdir build
cd build
cmake ..
make
sudo ./yolov4 -s                          // serialize model to plan file i.e. 'yolov4.engine'
sudo ./yolov4 -d ../../yolov3-spp/samples // deserialize plan file and run inference, the images in samples will be processed.
```

3. Check the generated images, as follows: _zidane.jpg and _bus.jpg

## More Information See the readme in [home page.](https://github.com/wang-xinyu/tensorrtx) ================================================ FILE: yolov4/gen_wts.py ================================================ import struct import sys import torch from models import * # noqa: F403 from utils.utils import * # noqa: F403 model = Darknet('cfg/yolov4.cfg', (608, 608)) # noqa: F405 weights = sys.argv[1] device = torch_utils.select_device('0') # noqa: F405 if weights.endswith('.pt'): # pytorch format model.load_state_dict(torch.load(weights, map_location=device, weights_only=False)['model']) else: # darknet format load_darknet_weights(model, weights) # noqa: F405 with open('yolov4.wts', 'w') as f: f.write('{}\n'.format(len(model.state_dict().keys()))) for k, v in model.state_dict().items(): vr = v.reshape(-1).cpu().numpy() f.write('{} {} '.format(k, len(vr))) for vv in vr: f.write(' ') f.write(struct.pack('>f', float(vv)).hex()) f.write('\n') ================================================ FILE: yolov4/logging.h ================================================ /* * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
*/ #ifndef TENSORRT_LOGGING_H #define TENSORRT_LOGGING_H #include "NvInferRuntimeCommon.h" #include #include #include #include #include #include #include using Severity = nvinfer1::ILogger::Severity; class LogStreamConsumerBuffer : public std::stringbuf { public: LogStreamConsumerBuffer(std::ostream& stream, const std::string& prefix, bool shouldLog) : mOutput(stream) , mPrefix(prefix) , mShouldLog(shouldLog) { } LogStreamConsumerBuffer(LogStreamConsumerBuffer&& other) : mOutput(other.mOutput) { } ~LogStreamConsumerBuffer() { // std::streambuf::pbase() gives a pointer to the beginning of the buffered part of the output sequence // std::streambuf::pptr() gives a pointer to the current position of the output sequence // if the pointer to the beginning is not equal to the pointer to the current position, // call putOutput() to log the output to the stream if (pbase() != pptr()) { putOutput(); } } // synchronizes the stream buffer and returns 0 on success // synchronizing the stream buffer consists of inserting the buffer contents into the stream, // resetting the buffer and flushing the stream virtual int sync() { putOutput(); return 0; } void putOutput() { if (mShouldLog) { // prepend timestamp std::time_t timestamp = std::time(nullptr); tm* tm_local = std::localtime(×tamp); std::cout << "["; std::cout << std::setw(2) << std::setfill('0') << 1 + tm_local->tm_mon << "/"; std::cout << std::setw(2) << std::setfill('0') << tm_local->tm_mday << "/"; std::cout << std::setw(4) << std::setfill('0') << 1900 + tm_local->tm_year << "-"; std::cout << std::setw(2) << std::setfill('0') << tm_local->tm_hour << ":"; std::cout << std::setw(2) << std::setfill('0') << tm_local->tm_min << ":"; std::cout << std::setw(2) << std::setfill('0') << tm_local->tm_sec << "] "; // std::stringbuf::str() gets the string contents of the buffer // insert the buffer contents pre-appended by the appropriate prefix into the stream mOutput << mPrefix << str(); // set the buffer to empty str(""); // flush 
the stream mOutput.flush(); } } void setShouldLog(bool shouldLog) { mShouldLog = shouldLog; } private: std::ostream& mOutput; std::string mPrefix; bool mShouldLog; }; //! //! \class LogStreamConsumerBase //! \brief Convenience object used to initialize LogStreamConsumerBuffer before std::ostream in LogStreamConsumer //! class LogStreamConsumerBase { public: LogStreamConsumerBase(std::ostream& stream, const std::string& prefix, bool shouldLog) : mBuffer(stream, prefix, shouldLog) { } protected: LogStreamConsumerBuffer mBuffer; }; //! //! \class LogStreamConsumer //! \brief Convenience object used to facilitate use of C++ stream syntax when logging messages. //! Order of base classes is LogStreamConsumerBase and then std::ostream. //! This is because the LogStreamConsumerBase class is used to initialize the LogStreamConsumerBuffer member field //! in LogStreamConsumer and then the address of the buffer is passed to std::ostream. //! This is necessary to prevent the address of an uninitialized buffer from being passed to std::ostream. //! Please do not change the order of the parent classes. //! class LogStreamConsumer : protected LogStreamConsumerBase, public std::ostream { public: //! \brief Creates a LogStreamConsumer which logs messages with level severity. //! Reportable severity determines if the messages are severe enough to be logged. 
LogStreamConsumer(Severity reportableSeverity, Severity severity) : LogStreamConsumerBase(severityOstream(severity), severityPrefix(severity), severity <= reportableSeverity) , std::ostream(&mBuffer) // links the stream buffer with the stream , mShouldLog(severity <= reportableSeverity) , mSeverity(severity) { } LogStreamConsumer(LogStreamConsumer&& other) : LogStreamConsumerBase(severityOstream(other.mSeverity), severityPrefix(other.mSeverity), other.mShouldLog) , std::ostream(&mBuffer) // links the stream buffer with the stream , mShouldLog(other.mShouldLog) , mSeverity(other.mSeverity) { } void setReportableSeverity(Severity reportableSeverity) { mShouldLog = mSeverity <= reportableSeverity; mBuffer.setShouldLog(mShouldLog); } private: static std::ostream& severityOstream(Severity severity) { return severity >= Severity::kINFO ? std::cout : std::cerr; } static std::string severityPrefix(Severity severity) { switch (severity) { case Severity::kINTERNAL_ERROR: return "[F] "; case Severity::kERROR: return "[E] "; case Severity::kWARNING: return "[W] "; case Severity::kINFO: return "[I] "; case Severity::kVERBOSE: return "[V] "; default: assert(0); return ""; } } bool mShouldLog; Severity mSeverity; }; //! \class Logger //! //! \brief Class which manages logging of TensorRT tools and samples //! //! \details This class provides a common interface for TensorRT tools and samples to log information to the console, //! and supports logging two types of messages: //! //! - Debugging messages with an associated severity (info, warning, error, or internal error/fatal) //! - Test pass/fail messages //! //! The advantage of having all samples use this class for logging as opposed to emitting directly to stdout/stderr is //! that the logic for controlling the verbosity and formatting of sample output is centralized in one location. //! //! In the future, this class could be extended to support dumping test results to a file in some standard format //! 
(for example, JUnit XML), and providing additional metadata (e.g. timing the duration of a test run). //! //! TODO: For backwards compatibility with existing samples, this class inherits directly from the nvinfer1::ILogger //! interface, which is problematic since there isn't a clean separation between messages coming from the TensorRT //! library and messages coming from the sample. //! //! In the future (once all samples are updated to use Logger::getTRTLogger() to access the ILogger) we can refactor the //! class to eliminate the inheritance and instead make the nvinfer1::ILogger implementation a member of the Logger //! object. class Logger : public nvinfer1::ILogger { public: Logger(Severity severity = Severity::kWARNING) : mReportableSeverity(severity) { } //! //! \enum TestResult //! \brief Represents the state of a given test //! enum class TestResult { kRUNNING, //!< The test is running kPASSED, //!< The test passed kFAILED, //!< The test failed kWAIVED //!< The test was waived }; //! //! \brief Forward-compatible method for retrieving the nvinfer::ILogger associated with this Logger //! \return The nvinfer1::ILogger associated with this Logger //! //! TODO Once all samples are updated to use this method to register the logger with TensorRT, //! we can eliminate the inheritance of Logger from ILogger //! nvinfer1::ILogger& getTRTLogger() { return *this; } //! //! \brief Implementation of the nvinfer1::ILogger::log() virtual method //! //! Note samples should not be calling this function directly; it will eventually go away once we eliminate the //! inheritance from nvinfer1::ILogger //! void log(Severity severity, const char* msg) override { LogStreamConsumer(mReportableSeverity, severity) << "[TRT] " << std::string(msg) << std::endl; } //! //! \brief Method for controlling the verbosity of logging output //! //! \param severity The logger will only emit messages that have severity of this level or higher. //! 
void setReportableSeverity(Severity severity) { mReportableSeverity = severity; } //! //! \brief Opaque handle that holds logging information for a particular test //! //! This object is an opaque handle to information used by the Logger to print test results. //! The sample must call Logger::defineTest() in order to obtain a TestAtom that can be used //! with Logger::reportTest{Start,End}(). //! class TestAtom { public: TestAtom(TestAtom&&) = default; private: friend class Logger; TestAtom(bool started, const std::string& name, const std::string& cmdline) : mStarted(started) , mName(name) , mCmdline(cmdline) { } bool mStarted; std::string mName; std::string mCmdline; }; //! //! \brief Define a test for logging //! //! \param[in] name The name of the test. This should be a string starting with //! "TensorRT" and containing dot-separated strings containing //! the characters [A-Za-z0-9_]. //! For example, "TensorRT.sample_googlenet" //! \param[in] cmdline The command line used to reproduce the test // //! \return a TestAtom that can be used in Logger::reportTest{Start,End}(). //! static TestAtom defineTest(const std::string& name, const std::string& cmdline) { return TestAtom(false, name, cmdline); } //! //! \brief A convenience overloaded version of defineTest() that accepts an array of command-line arguments //! as input //! //! \param[in] name The name of the test //! \param[in] argc The number of command-line arguments //! \param[in] argv The array of command-line arguments (given as C strings) //! //! \return a TestAtom that can be used in Logger::reportTest{Start,End}(). static TestAtom defineTest(const std::string& name, int argc, char const* const* argv) { auto cmdline = genCmdlineString(argc, argv); return defineTest(name, cmdline); } //! //! \brief Report that a test has started. //! //! \pre reportTestStart() has not been called yet for the given testAtom //! //! \param[in] testAtom The handle to the test that has started //! 
static void reportTestStart(TestAtom& testAtom) { reportTestResult(testAtom, TestResult::kRUNNING); assert(!testAtom.mStarted); testAtom.mStarted = true; } //! //! \brief Report that a test has ended. //! //! \pre reportTestStart() has been called for the given testAtom //! //! \param[in] testAtom The handle to the test that has ended //! \param[in] result The result of the test. Should be one of TestResult::kPASSED, //! TestResult::kFAILED, TestResult::kWAIVED //! static void reportTestEnd(const TestAtom& testAtom, TestResult result) { assert(result != TestResult::kRUNNING); assert(testAtom.mStarted); reportTestResult(testAtom, result); } static int reportPass(const TestAtom& testAtom) { reportTestEnd(testAtom, TestResult::kPASSED); return EXIT_SUCCESS; } static int reportFail(const TestAtom& testAtom) { reportTestEnd(testAtom, TestResult::kFAILED); return EXIT_FAILURE; } static int reportWaive(const TestAtom& testAtom) { reportTestEnd(testAtom, TestResult::kWAIVED); return EXIT_SUCCESS; } static int reportTest(const TestAtom& testAtom, bool pass) { return pass ? reportPass(testAtom) : reportFail(testAtom); } Severity getReportableSeverity() const { return mReportableSeverity; } private: //! //! \brief returns an appropriate string for prefixing a log message with the given severity //! static const char* severityPrefix(Severity severity) { switch (severity) { case Severity::kINTERNAL_ERROR: return "[F] "; case Severity::kERROR: return "[E] "; case Severity::kWARNING: return "[W] "; case Severity::kINFO: return "[I] "; case Severity::kVERBOSE: return "[V] "; default: assert(0); return ""; } } //! //! \brief returns an appropriate string for prefixing a test result message with the given result //! 
static const char* testResultString(TestResult result) { switch (result) { case TestResult::kRUNNING: return "RUNNING"; case TestResult::kPASSED: return "PASSED"; case TestResult::kFAILED: return "FAILED"; case TestResult::kWAIVED: return "WAIVED"; default: assert(0); return ""; } } //! //! \brief returns an appropriate output stream (cout or cerr) to use with the given severity //! static std::ostream& severityOstream(Severity severity) { return severity >= Severity::kINFO ? std::cout : std::cerr; } //! //! \brief method that implements logging test results //! static void reportTestResult(const TestAtom& testAtom, TestResult result) { severityOstream(Severity::kINFO) << "&&&& " << testResultString(result) << " " << testAtom.mName << " # " << testAtom.mCmdline << std::endl; } //! //! \brief generate a command line string from the given (argc, argv) values //! static std::string genCmdlineString(int argc, char const* const* argv) { std::stringstream ss; for (int i = 0; i < argc; i++) { if (i > 0) ss << " "; ss << argv[i]; } return ss.str(); } Severity mReportableSeverity; }; namespace { //! //! \brief produces a LogStreamConsumer object that can be used to log messages of severity kVERBOSE //! //! Example usage: //! //! LOG_VERBOSE(logger) << "hello world" << std::endl; //! inline LogStreamConsumer LOG_VERBOSE(const Logger& logger) { return LogStreamConsumer(logger.getReportableSeverity(), Severity::kVERBOSE); } //! //! \brief produces a LogStreamConsumer object that can be used to log messages of severity kINFO //! //! Example usage: //! //! LOG_INFO(logger) << "hello world" << std::endl; //! inline LogStreamConsumer LOG_INFO(const Logger& logger) { return LogStreamConsumer(logger.getReportableSeverity(), Severity::kINFO); } //! //! \brief produces a LogStreamConsumer object that can be used to log messages of severity kWARNING //! //! Example usage: //! //! LOG_WARN(logger) << "hello world" << std::endl; //! 
inline LogStreamConsumer LOG_WARN(const Logger& logger) { return LogStreamConsumer(logger.getReportableSeverity(), Severity::kWARNING); } //! //! \brief produces a LogStreamConsumer object that can be used to log messages of severity kERROR //! //! Example usage: //! //! LOG_ERROR(logger) << "hello world" << std::endl; //! inline LogStreamConsumer LOG_ERROR(const Logger& logger) { return LogStreamConsumer(logger.getReportableSeverity(), Severity::kERROR); } //! //! \brief produces a LogStreamConsumer object that can be used to log messages of severity kINTERNAL_ERROR // ("fatal" severity) //! //! Example usage: //! //! LOG_FATAL(logger) << "hello world" << std::endl; //! inline LogStreamConsumer LOG_FATAL(const Logger& logger) { return LogStreamConsumer(logger.getReportableSeverity(), Severity::kINTERNAL_ERROR); } } // anonymous namespace #endif // TENSORRT_LOGGING_H ================================================ FILE: yolov4/mish.cu ================================================ #include #include #include #include #include "mish.h" namespace nvinfer1 { MishPlugin::MishPlugin() { } MishPlugin::~MishPlugin() { } // create the plugin at runtime from a byte stream MishPlugin::MishPlugin(const void* data, size_t length) { assert(length == sizeof(input_size_)); input_size_ = *reinterpret_cast(data); } void MishPlugin::serialize(void* buffer) const { *reinterpret_cast(buffer) = input_size_; } size_t MishPlugin::getSerializationSize() const { return sizeof(input_size_); } int MishPlugin::initialize() { return 0; } Dims MishPlugin::getOutputDimensions(int index, const Dims* inputs, int nbInputDims) { assert(nbInputDims == 1); assert(index == 0); input_size_ = inputs[0].d[0] * inputs[0].d[1] * inputs[0].d[2]; // Output dimensions return Dims3(inputs[0].d[0], inputs[0].d[1], inputs[0].d[2]); } // Set plugin namespace void MishPlugin::setPluginNamespace(const char* pluginNamespace) { mPluginNamespace = pluginNamespace; } const char* MishPlugin::getPluginNamespace() const 
{ return mPluginNamespace; } // Return the DataType of the plugin output at the requested index DataType MishPlugin::getOutputDataType(int index, const nvinfer1::DataType* inputTypes, int nbInputs) const { return DataType::kFLOAT; } // Return true if output tensor is broadcast across a batch. bool MishPlugin::isOutputBroadcastAcrossBatch(int outputIndex, const bool* inputIsBroadcasted, int nbInputs) const { return false; } // Return true if plugin can use input that is broadcast across batch without replication. bool MishPlugin::canBroadcastInputAcrossBatch(int inputIndex) const { return false; } void MishPlugin::configurePlugin(const PluginTensorDesc* in, int nbInput, const PluginTensorDesc* out, int nbOutput) { } // Attach the plugin object to an execution context and grant the plugin the access to some context resource. void MishPlugin::attachToContext(cudnnContext* cudnnContext, cublasContext* cublasContext, IGpuAllocator* gpuAllocator) { } // Detach the plugin object from its execution context. 
void MishPlugin::detachFromContext() {}

const char* MishPlugin::getPluginType() const {
    return "Mish_TRT";
}

const char* MishPlugin::getPluginVersion() const {
    return "1";
}

void MishPlugin::destroy() {
    delete this;
}

// Clone the plugin
IPluginV2IOExt* MishPlugin::clone() const {
    MishPlugin *p = new MishPlugin();
    p->input_size_ = input_size_;
    p->setPluginNamespace(mPluginNamespace);
    return p;
}

__device__ float tanh_activate_kernel(float x){return (2/(1 + expf(-2*x)) - 1);}

__device__ float softplus_kernel(float x, float threshold = 20) {
    if (x > threshold) return x;                // too large
    else if (x < -threshold) return expf(x);    // too small
    return logf(expf(x) + 1);
}

__global__ void mish_kernel(const float *input, float *output, int num_elem) {

    int idx = threadIdx.x + blockDim.x * blockIdx.x;
    if (idx >= num_elem) return;

    //float t = exp(input[idx]);
    //if (input[idx] > 20.0) {
    //    t *= t;
    //    output[idx] = (t - 1.0) / (t + 1.0);
    //} else {
    //    float tt = t * t;
    //    output[idx] = (tt + 2.0 * t) / (tt + 2.0 * t + 2.0);
    //}
    //output[idx] *= input[idx];
    output[idx] = input[idx] * tanh_activate_kernel(softplus_kernel(input[idx]));
}

// Launch mish_kernel over input_size_ * batchSize elements of inputs[0],
// writing the result to output.
//
// The launch is issued on `stream`, the stream passed to enqueue(), so it is
// ordered with the other work queued on that stream instead of running on the
// default stream.
void MishPlugin::forwardGpu(const float *const * inputs, float* output, cudaStream_t stream, int batchSize) {
    int block_size = thread_count_;
    int grid_size = (input_size_ * batchSize + block_size - 1) / block_size;
    mish_kernel<<<grid_size, block_size, 0, stream>>>(inputs[0], output, input_size_ * batchSize);
}

int MishPlugin::enqueue(int batchSize, const void*const * inputs, void** outputs, void* workspace, cudaStream_t stream)
{
    //assert(batchSize == 1);
    //GPU
    //CUDA_CHECK(cudaStreamSynchronize(stream));
    forwardGpu((const float *const *)inputs, (float*)outputs[0], stream, batchSize);
    return 0;
}

PluginFieldCollection MishPluginCreator::mFC{};
std::vector<PluginField> MishPluginCreator::mPluginAttributes;

MishPluginCreator::MishPluginCreator() {
    mPluginAttributes.clear();

    mFC.nbFields = mPluginAttributes.size();
    mFC.fields = mPluginAttributes.data();
}

const char* MishPluginCreator::getPluginName()
const { return "Mish_TRT"; } const char* MishPluginCreator::getPluginVersion() const { return "1"; } const PluginFieldCollection* MishPluginCreator::getFieldNames() { return &mFC; } IPluginV2IOExt* MishPluginCreator::createPlugin(const char* name, const PluginFieldCollection* fc) { MishPlugin* obj = new MishPlugin(); obj->setPluginNamespace(mNamespace.c_str()); return obj; } IPluginV2IOExt* MishPluginCreator::deserializePlugin(const char* name, const void* serialData, size_t serialLength) { // This object will be deleted when the network is destroyed, which will // call MishPlugin::destroy() MishPlugin* obj = new MishPlugin(serialData, serialLength); obj->setPluginNamespace(mNamespace.c_str()); return obj; } } ================================================ FILE: yolov4/mish.h ================================================ #ifndef _MISH_PLUGIN_H #define _MISH_PLUGIN_H #include #include #include "NvInfer.h" namespace nvinfer1 { class MishPlugin: public IPluginV2IOExt { public: explicit MishPlugin(); MishPlugin(const void* data, size_t length); ~MishPlugin(); int getNbOutputs() const override { return 1; } Dims getOutputDimensions(int index, const Dims* inputs, int nbInputDims) override; int initialize() override; virtual void terminate() override {}; virtual size_t getWorkspaceSize(int maxBatchSize) const override { return 0;} virtual int enqueue(int batchSize, const void*const * inputs, void** outputs, void* workspace, cudaStream_t stream) override; virtual size_t getSerializationSize() const override; virtual void serialize(void* buffer) const override; bool supportsFormatCombination(int pos, const PluginTensorDesc* inOut, int nbInputs, int nbOutputs) const override { return inOut[pos].format == TensorFormat::kLINEAR && inOut[pos].type == DataType::kFLOAT; } const char* getPluginType() const override; const char* getPluginVersion() const override; void destroy() override; IPluginV2IOExt* clone() const override; void setPluginNamespace(const char* 
pluginNamespace) override; const char* getPluginNamespace() const override; DataType getOutputDataType(int index, const nvinfer1::DataType* inputTypes, int nbInputs) const override; bool isOutputBroadcastAcrossBatch(int outputIndex, const bool* inputIsBroadcasted, int nbInputs) const override; bool canBroadcastInputAcrossBatch(int inputIndex) const override; void attachToContext( cudnnContext* cudnnContext, cublasContext* cublasContext, IGpuAllocator* gpuAllocator) override; void configurePlugin(const PluginTensorDesc* in, int nbInput, const PluginTensorDesc* out, int nbOutput) override; void detachFromContext() override; int input_size_; private: void forwardGpu(const float *const * inputs, float* output, cudaStream_t stream, int batchSize = 1); int thread_count_ = 256; const char* mPluginNamespace; }; class MishPluginCreator : public IPluginCreator { public: MishPluginCreator(); ~MishPluginCreator() override = default; const char* getPluginName() const override; const char* getPluginVersion() const override; const PluginFieldCollection* getFieldNames() override; IPluginV2IOExt* createPlugin(const char* name, const PluginFieldCollection* fc) override; IPluginV2IOExt* deserializePlugin(const char* name, const void* serialData, size_t serialLength) override; void setPluginNamespace(const char* libNamespace) override { mNamespace = libNamespace; } const char* getPluginNamespace() const override { return mNamespace.c_str(); } private: std::string mNamespace; static PluginFieldCollection mFC; static std::vector mPluginAttributes; }; REGISTER_TENSORRT_PLUGIN(MishPluginCreator); }; #endif ================================================ FILE: yolov4/utils.h ================================================ #ifndef __TRT_UTILS_H_ #define __TRT_UTILS_H_ #include #include #include #include #ifndef CUDA_CHECK #define CUDA_CHECK(callstr) \ { \ cudaError_t error_code = callstr; \ if (error_code != cudaSuccess) { \ std::cerr << "CUDA error " << error_code << " at " << __FILE__ 
<< ":" << __LINE__; \
        assert(0); \
    } \
}
#endif

namespace Tn {
    template<typename T>
    void write(char*& buffer, const T& val) {
        *reinterpret_cast<T*>(buffer) = val;
        buffer += sizeof(T);
    }

    template<typename T>
    void read(const char*& buffer, T& val) {
        val = *reinterpret_cast<const T*>(buffer);
        buffer += sizeof(T);
    }
}

#endif

================================================
FILE: yolov4/yololayer.cu
================================================
#include <assert.h>
#include "yololayer.h"
#include "utils.h"

using namespace Yolo;

namespace nvinfer1 {

YoloLayerPlugin::YoloLayerPlugin() {
    mClassCount = CLASS_NUM;
    mYoloKernel.clear();
    mYoloKernel.push_back(yolo1);
    mYoloKernel.push_back(yolo2);
    mYoloKernel.push_back(yolo3);

    mKernelCount = mYoloKernel.size();

    CUDA_CHECK(cudaMallocHost(&mAnchor, mKernelCount * sizeof(void*)));
    size_t AnchorLen = sizeof(float)* CHECK_COUNT*2;
    for(int ii = 0; ii < mKernelCount; ii ++) {
        CUDA_CHECK(cudaMalloc(&mAnchor[ii],AnchorLen));
        const auto& yolo = mYoloKernel[ii];
        CUDA_CHECK(cudaMemcpy(mAnchor[ii], yolo.anchors, AnchorLen, cudaMemcpyHostToDevice));
    }
}

// Release what the constructors allocate: one device anchor buffer per yolo
// kernel, plus the pinned host table holding those device pointers.
// Return codes are deliberately ignored; asserting inside a destructor would
// abort the process on any late CUDA error.
YoloLayerPlugin::~YoloLayerPlugin() {
    if (mAnchor != nullptr) {
        for (int ii = 0; ii < mKernelCount; ii++) {
            cudaFree(mAnchor[ii]);
        }
        cudaFreeHost(mAnchor);
        mAnchor = nullptr;
    }
}

// create the plugin at runtime from a byte stream
YoloLayerPlugin::YoloLayerPlugin(const void* data, size_t length) {
    using namespace Tn;
    const char *d = reinterpret_cast<const char *>(data), *a = d;
    read(d, mClassCount);
    read(d, mThreadCount);
    read(d, mKernelCount);
    mYoloKernel.resize(mKernelCount);
    auto kernelSize = mKernelCount*sizeof(YoloKernel);
    memcpy(mYoloKernel.data(),d,kernelSize);
    d += kernelSize;

    CUDA_CHECK(cudaMallocHost(&mAnchor, mKernelCount * sizeof(void*)));
    size_t AnchorLen = sizeof(float)* CHECK_COUNT*2;
    for(int ii = 0; ii < mKernelCount; ii ++) {
        CUDA_CHECK(cudaMalloc(&mAnchor[ii],AnchorLen));
        const auto& yolo = mYoloKernel[ii];
        CUDA_CHECK(cudaMemcpy(mAnchor[ii], yolo.anchors, AnchorLen, cudaMemcpyHostToDevice));
    }

    assert(d == a + length);
}

void YoloLayerPlugin::serialize(void* buffer) const {
    using namespace Tn;
    char* d = static_cast<char*>(buffer), *a = d;
    write(d, mClassCount);
    write(d,
mThreadCount); write(d, mKernelCount); auto kernelSize = mKernelCount*sizeof(YoloKernel); memcpy(d,mYoloKernel.data(),kernelSize); d += kernelSize; assert(d == a + getSerializationSize()); } size_t YoloLayerPlugin::getSerializationSize() const { return sizeof(mClassCount) + sizeof(mThreadCount) + sizeof(mKernelCount) + sizeof(Yolo::YoloKernel) * mYoloKernel.size(); } int YoloLayerPlugin::initialize() { return 0; } Dims YoloLayerPlugin::getOutputDimensions(int index, const Dims* inputs, int nbInputDims) { //output the result to channel int totalsize = MAX_OUTPUT_BBOX_COUNT * sizeof(Detection) / sizeof(float); return Dims3(totalsize + 1, 1, 1); } // Set plugin namespace void YoloLayerPlugin::setPluginNamespace(const char* pluginNamespace) { mPluginNamespace = pluginNamespace; } const char* YoloLayerPlugin::getPluginNamespace() const { return mPluginNamespace; } // Return the DataType of the plugin output at the requested index DataType YoloLayerPlugin::getOutputDataType(int index, const nvinfer1::DataType* inputTypes, int nbInputs) const { return DataType::kFLOAT; } // Return true if output tensor is broadcast across a batch. bool YoloLayerPlugin::isOutputBroadcastAcrossBatch(int outputIndex, const bool* inputIsBroadcasted, int nbInputs) const { return false; } // Return true if plugin can use input that is broadcast across batch without replication. bool YoloLayerPlugin::canBroadcastInputAcrossBatch(int inputIndex) const { return false; } void YoloLayerPlugin::configurePlugin(const PluginTensorDesc* in, int nbInput, const PluginTensorDesc* out, int nbOutput) { } // Attach the plugin object to an execution context and grant the plugin the access to some context resource. void YoloLayerPlugin::attachToContext(cudnnContext* cudnnContext, cublasContext* cublasContext, IGpuAllocator* gpuAllocator) { } // Detach the plugin object from its execution context. 
void YoloLayerPlugin::detachFromContext() {} const char* YoloLayerPlugin::getPluginType() const { return "YoloLayer_TRT"; } const char* YoloLayerPlugin::getPluginVersion() const { return "1"; } void YoloLayerPlugin::destroy() { delete this; } // Clone the plugin IPluginV2IOExt* YoloLayerPlugin::clone() const { YoloLayerPlugin *p = new YoloLayerPlugin(); p->setPluginNamespace(mPluginNamespace); return p; } __device__ float Logist(float data){ return 1./(1. + exp(-data)); }; __global__ void CalDetection(const float *input, float *output,int noElements, int yoloWidth,int yoloHeight,const float anchors[CHECK_COUNT*2],int classes,int outputElem) { int idx = threadIdx.x + blockDim.x * blockIdx.x; if (idx >= noElements) return; int total_grid = yoloWidth * yoloHeight; int bnIdx = idx / total_grid; idx = idx - total_grid*bnIdx; int info_len_i = 5 + classes; const float* curInput = input + bnIdx * (info_len_i * total_grid * CHECK_COUNT); for (int k = 0; k < 3; ++k) { int class_id = 0; float max_cls_prob = 0.0; for (int i = 5; i < info_len_i; ++i) { float p = Logist(curInput[idx + k * info_len_i * total_grid + i * total_grid]); if (p > max_cls_prob) { max_cls_prob = p; class_id = i - 5; } } float box_prob = Logist(curInput[idx + k * info_len_i * total_grid + 4 * total_grid]); if (max_cls_prob < IGNORE_THRESH || box_prob < IGNORE_THRESH) continue; float *res_count = output + bnIdx*outputElem; int count = (int)atomicAdd(res_count, 1); if (count >= MAX_OUTPUT_BBOX_COUNT) return; char* data = (char * )res_count + sizeof(float) + count*sizeof(Detection); Detection* det = (Detection*)(data); int row = idx / yoloWidth; int col = idx % yoloWidth; //Location det->bbox[0] = (col + Logist(curInput[idx + k * info_len_i * total_grid + 0 * total_grid])) * INPUT_W / yoloWidth; det->bbox[1] = (row + Logist(curInput[idx + k * info_len_i * total_grid + 1 * total_grid])) * INPUT_H / yoloHeight; det->bbox[2] = exp(curInput[idx + k * info_len_i * total_grid + 2 * total_grid]) * anchors[2*k]; 
det->bbox[3] = exp(curInput[idx + k * info_len_i * total_grid + 3 * total_grid]) * anchors[2*k + 1]; det->det_confidence = box_prob; det->class_id = class_id; det->class_confidence = max_cls_prob; } } void YoloLayerPlugin::forwardGpu(const float *const * inputs, float* output, cudaStream_t stream, int batchSize) { int outputElem = 1 + MAX_OUTPUT_BBOX_COUNT * sizeof(Detection) / sizeof(float); for(int idx = 0 ; idx < batchSize; ++idx) { CUDA_CHECK(cudaMemset(output + idx*outputElem, 0, sizeof(float))); } int numElem = 0; for (unsigned int i = 0;i< mYoloKernel.size();++i) { const auto& yolo = mYoloKernel[i]; numElem = yolo.width*yolo.height*batchSize; if (numElem < mThreadCount) mThreadCount = numElem; CalDetection<<< (yolo.width*yolo.height*batchSize + mThreadCount - 1) / mThreadCount, mThreadCount>>> (inputs[i],output, numElem, yolo.width, yolo.height, (float *)mAnchor[i], mClassCount ,outputElem); } } int YoloLayerPlugin::enqueue(int batchSize, const void*const * inputs, void** outputs, void* workspace, cudaStream_t stream) { //assert(batchSize == 1); //GPU //CUDA_CHECK(cudaStreamSynchronize(stream)); forwardGpu((const float *const *)inputs, (float*)outputs[0], stream, batchSize); return 0; } PluginFieldCollection YoloPluginCreator::mFC{}; std::vector YoloPluginCreator::mPluginAttributes; YoloPluginCreator::YoloPluginCreator() { mPluginAttributes.clear(); mFC.nbFields = mPluginAttributes.size(); mFC.fields = mPluginAttributes.data(); } const char* YoloPluginCreator::getPluginName() const { return "YoloLayer_TRT"; } const char* YoloPluginCreator::getPluginVersion() const { return "1"; } const PluginFieldCollection* YoloPluginCreator::getFieldNames() { return &mFC; } IPluginV2IOExt* YoloPluginCreator::createPlugin(const char* name, const PluginFieldCollection* fc) { YoloLayerPlugin* obj = new YoloLayerPlugin(); obj->setPluginNamespace(mNamespace.c_str()); return obj; } IPluginV2IOExt* YoloPluginCreator::deserializePlugin(const char* name, const void* serialData, 
size_t serialLength) { // This object will be deleted when the network is destroyed, which will // call MishPlugin::destroy() YoloLayerPlugin* obj = new YoloLayerPlugin(serialData, serialLength); obj->setPluginNamespace(mNamespace.c_str()); return obj; } } ================================================ FILE: yolov4/yololayer.h ================================================ #ifndef _YOLO_LAYER_H #define _YOLO_LAYER_H #include #include #include "NvInfer.h" namespace Yolo { static constexpr int CHECK_COUNT = 3; static constexpr float IGNORE_THRESH = 0.1f; static constexpr int MAX_OUTPUT_BBOX_COUNT = 1000; static constexpr int CLASS_NUM = 80; static constexpr int INPUT_H = 608; static constexpr int INPUT_W = 608; struct YoloKernel { int width; int height; float anchors[CHECK_COUNT*2]; }; static constexpr YoloKernel yolo1 = { INPUT_W / 8, INPUT_H / 8, {12,16, 19,36, 40,28} }; static constexpr YoloKernel yolo2 = { INPUT_W / 16, INPUT_H / 16, {36,75, 76,55, 72,146} }; static constexpr YoloKernel yolo3 = { INPUT_W / 32, INPUT_H / 32, {142,110, 192,243, 459,401} }; static constexpr int LOCATIONS = 4; struct alignas(float) Detection{ //x y w h float bbox[LOCATIONS]; float det_confidence; float class_id; float class_confidence; }; } namespace nvinfer1 { class YoloLayerPlugin: public IPluginV2IOExt { public: explicit YoloLayerPlugin(); YoloLayerPlugin(const void* data, size_t length); ~YoloLayerPlugin(); int getNbOutputs() const override { return 1; } Dims getOutputDimensions(int index, const Dims* inputs, int nbInputDims) override; int initialize() override; virtual void terminate() override {}; virtual size_t getWorkspaceSize(int maxBatchSize) const override { return 0;} virtual int enqueue(int batchSize, const void*const * inputs, void** outputs, void* workspace, cudaStream_t stream) override; virtual size_t getSerializationSize() const override; virtual void serialize(void* buffer) const override; bool supportsFormatCombination(int pos, const PluginTensorDesc* inOut, 
int nbInputs, int nbOutputs) const override { return inOut[pos].format == TensorFormat::kLINEAR && inOut[pos].type == DataType::kFLOAT; } const char* getPluginType() const override; const char* getPluginVersion() const override; void destroy() override; IPluginV2IOExt* clone() const override; void setPluginNamespace(const char* pluginNamespace) override; const char* getPluginNamespace() const override; DataType getOutputDataType(int index, const nvinfer1::DataType* inputTypes, int nbInputs) const override; bool isOutputBroadcastAcrossBatch(int outputIndex, const bool* inputIsBroadcasted, int nbInputs) const override; bool canBroadcastInputAcrossBatch(int inputIndex) const override; void attachToContext( cudnnContext* cudnnContext, cublasContext* cublasContext, IGpuAllocator* gpuAllocator) override; void configurePlugin(const PluginTensorDesc* in, int nbInput, const PluginTensorDesc* out, int nbOutput) override; void detachFromContext() override; private: void forwardGpu(const float *const * inputs,float * output, cudaStream_t stream,int batchSize = 1); int mClassCount; int mKernelCount; std::vector mYoloKernel; int mThreadCount = 256; void** mAnchor; const char* mPluginNamespace; }; class YoloPluginCreator : public IPluginCreator { public: YoloPluginCreator(); ~YoloPluginCreator() override = default; const char* getPluginName() const override; const char* getPluginVersion() const override; const PluginFieldCollection* getFieldNames() override; IPluginV2IOExt* createPlugin(const char* name, const PluginFieldCollection* fc) override; IPluginV2IOExt* deserializePlugin(const char* name, const void* serialData, size_t serialLength) override; void setPluginNamespace(const char* libNamespace) override { mNamespace = libNamespace; } const char* getPluginNamespace() const override { return mNamespace.c_str(); } private: std::string mNamespace; static PluginFieldCollection mFC; static std::vector mPluginAttributes; }; REGISTER_TENSORRT_PLUGIN(YoloPluginCreator); }; #endif 
================================================ FILE: yolov4/yolov4.cpp ================================================ #include #include #include #include #include #include #include #include #include "NvInfer.h" #include "utils.h" #include "cuda_runtime_api.h" #include "logging.h" #include "yololayer.h" #include "mish.h" #define USE_FP16 // comment out this if want to use FP32 #define DEVICE 0 // GPU id #define NMS_THRESH 0.4 #define BBOX_CONF_THRESH 0.5 #define BATCH_SIZE 1 using namespace nvinfer1; // stuff we know about the network and the input/output blobs static const int INPUT_H = Yolo::INPUT_H; static const int INPUT_W = Yolo::INPUT_W; static const int DETECTION_SIZE = sizeof(Yolo::Detection) / sizeof(float); static const int OUTPUT_SIZE = Yolo::MAX_OUTPUT_BBOX_COUNT * DETECTION_SIZE + 1; // we assume the yololayer outputs no more than MAX_OUTPUT_BBOX_COUNT boxes that conf >= 0.1 const char* INPUT_BLOB_NAME = "data"; const char* OUTPUT_BLOB_NAME = "prob"; static Logger gLogger; cv::Mat preprocess_img(cv::Mat& img) { int w, h, x, y; float r_w = INPUT_W / (img.cols*1.0); float r_h = INPUT_H / (img.rows*1.0); if (r_h > r_w) { w = INPUT_W; h = r_w * img.rows; x = 0; y = (INPUT_H - h) / 2; } else { w = r_h* img.cols; h = INPUT_H; x = (INPUT_W - w) / 2; y = 0; } cv::Mat re(h, w, CV_8UC3); cv::resize(img, re, re.size()); cv::Mat out(INPUT_H, INPUT_W, CV_8UC3, cv::Scalar(128, 128, 128)); re.copyTo(out(cv::Rect(x, y, re.cols, re.rows))); return out; } cv::Rect get_rect(cv::Mat& img, float bbox[4]) { int l, r, t, b; float r_w = INPUT_W / (img.cols * 1.0); float r_h = INPUT_H / (img.rows * 1.0); if (r_h > r_w) { l = bbox[0] - bbox[2]/2.f; r = bbox[0] + bbox[2]/2.f; t = bbox[1] - bbox[3]/2.f - (INPUT_H - r_w * img.rows) / 2; b = bbox[1] + bbox[3]/2.f - (INPUT_H - r_w * img.rows) / 2; l = l / r_w; r = r / r_w; t = t / r_w; b = b / r_w; } else { l = bbox[0] - bbox[2]/2.f - (INPUT_W - r_h * img.cols) / 2; r = bbox[0] + bbox[2]/2.f - (INPUT_W - r_h * img.cols) / 2; t = 
bbox[1] - bbox[3]/2.f; b = bbox[1] + bbox[3]/2.f; l = l / r_h; r = r / r_h; t = t / r_h; b = b / r_h; } return cv::Rect(l, t, r-l, b-t); } float iou(float lbox[4], float rbox[4]) { float interBox[] = { std::max(lbox[0] - lbox[2]/2.f , rbox[0] - rbox[2]/2.f), //left std::min(lbox[0] + lbox[2]/2.f , rbox[0] + rbox[2]/2.f), //right std::max(lbox[1] - lbox[3]/2.f , rbox[1] - rbox[3]/2.f), //top std::min(lbox[1] + lbox[3]/2.f , rbox[1] + rbox[3]/2.f), //bottom }; if(interBox[2] > interBox[3] || interBox[0] > interBox[1]) return 0.0f; float interBoxS =(interBox[1]-interBox[0])*(interBox[3]-interBox[2]); return interBoxS/(lbox[2]*lbox[3] + rbox[2]*rbox[3] -interBoxS); } bool cmp(const Yolo::Detection& a, const Yolo::Detection& b) { return a.det_confidence > b.det_confidence; } void nms(std::vector& res, float *output, float nms_thresh = NMS_THRESH) { std::map> m; for (int i = 0; i < output[0] && i < Yolo::MAX_OUTPUT_BBOX_COUNT; i++) { if (output[1 + DETECTION_SIZE * i + 4] <= BBOX_CONF_THRESH) continue; Yolo::Detection det; memcpy(&det, &output[1 + DETECTION_SIZE * i], DETECTION_SIZE * sizeof(float)); if (m.count(det.class_id) == 0) m.emplace(det.class_id, std::vector()); m[det.class_id].push_back(det); } for (auto it = m.begin(); it != m.end(); it++) { //std::cout << it->second[0].class_id << " --- " << std::endl; auto& dets = it->second; std::sort(dets.begin(), dets.end(), cmp); for (size_t m = 0; m < dets.size(); ++m) { auto& item = dets[m]; res.push_back(item); for (size_t n = m + 1; n < dets.size(); ++n) { if (iou(item.bbox, dets[n].bbox) > nms_thresh) { dets.erase(dets.begin()+n); --n; } } } } } // TensorRT weight files have a simple space delimited format: // [type] [size] std::map loadWeights(const std::string file) { std::cout << "Loading weights: " << file << std::endl; std::map weightMap; // Open weights file std::ifstream input(file); assert(input.is_open() && "Unable to load weight file."); // Read number of weight blobs int32_t count; input >> count; 
assert(count > 0 && "Invalid weight map file."); while (count--) { Weights wt{DataType::kFLOAT, nullptr, 0}; uint32_t size; // Read name and type of blob std::string name; input >> name >> std::dec >> size; wt.type = DataType::kFLOAT; // Load blob uint32_t* val = reinterpret_cast(malloc(sizeof(val) * size)); for (uint32_t x = 0, y = size; x < y; ++x) { input >> std::hex >> val[x]; } wt.values = val; wt.count = size; weightMap[name] = wt; } return weightMap; } IScaleLayer* addBatchNorm2d(INetworkDefinition *network, std::map& weightMap, ITensor& input, std::string lname, float eps) { float *gamma = (float*)weightMap[lname + ".weight"].values; float *beta = (float*)weightMap[lname + ".bias"].values; float *mean = (float*)weightMap[lname + ".running_mean"].values; float *var = (float*)weightMap[lname + ".running_var"].values; int len = weightMap[lname + ".running_var"].count; float *scval = reinterpret_cast(malloc(sizeof(float) * len)); for (int i = 0; i < len; i++) { scval[i] = gamma[i] / sqrt(var[i] + eps); } Weights scale{DataType::kFLOAT, scval, len}; float *shval = reinterpret_cast(malloc(sizeof(float) * len)); for (int i = 0; i < len; i++) { shval[i] = beta[i] - mean[i] * gamma[i] / sqrt(var[i] + eps); } Weights shift{DataType::kFLOAT, shval, len}; float *pval = reinterpret_cast(malloc(sizeof(float) * len)); for (int i = 0; i < len; i++) { pval[i] = 1.0; } Weights power{DataType::kFLOAT, pval, len}; weightMap[lname + ".scale"] = scale; weightMap[lname + ".shift"] = shift; weightMap[lname + ".power"] = power; IScaleLayer* scale_1 = network->addScale(input, ScaleMode::kCHANNEL, shift, scale, power); assert(scale_1); return scale_1; } ILayer* convBnMish(INetworkDefinition *network, std::map& weightMap, ITensor& input, int outch, int ksize, int s, int p, int linx) { Weights emptywts{DataType::kFLOAT, nullptr, 0}; IConvolutionLayer* conv1 = network->addConvolutionNd(input, outch, DimsHW{ksize, ksize}, weightMap["module_list." 
+ std::to_string(linx) + ".Conv2d.weight"], emptywts); assert(conv1); conv1->setStrideNd(DimsHW{s, s}); conv1->setPaddingNd(DimsHW{p, p}); IScaleLayer* bn1 = addBatchNorm2d(network, weightMap, *conv1->getOutput(0), "module_list." + std::to_string(linx) + ".BatchNorm2d", 1e-4); auto creator = getPluginRegistry()->getPluginCreator("Mish_TRT", "1"); const PluginFieldCollection* pluginData = creator->getFieldNames(); IPluginV2 *pluginObj = creator->createPlugin(("mish" + std::to_string(linx)).c_str(), pluginData); ITensor* inputTensors[] = {bn1->getOutput(0)}; auto mish = network->addPluginV2(&inputTensors[0], 1, *pluginObj); return mish; } ILayer* convBnLeaky(INetworkDefinition *network, std::map& weightMap, ITensor& input, int outch, int ksize, int s, int p, int linx) { Weights emptywts{DataType::kFLOAT, nullptr, 0}; IConvolutionLayer* conv1 = network->addConvolutionNd(input, outch, DimsHW{ksize, ksize}, weightMap["module_list." + std::to_string(linx) + ".Conv2d.weight"], emptywts); assert(conv1); conv1->setStrideNd(DimsHW{s, s}); conv1->setPaddingNd(DimsHW{p, p}); IScaleLayer* bn1 = addBatchNorm2d(network, weightMap, *conv1->getOutput(0), "module_list." + std::to_string(linx) + ".BatchNorm2d", 1e-4); auto lr = network->addActivation(*bn1->getOutput(0), ActivationType::kLEAKY_RELU); lr->setAlpha(0.1); return lr; } // Creat the engine using only the API and not any parser. ICudaEngine* createEngine(unsigned int maxBatchSize, IBuilder* builder, IBuilderConfig* config, DataType dt) { INetworkDefinition* network = builder->createNetworkV2(0U); // Create input tensor of shape {3, INPUT_H, INPUT_W} with name INPUT_BLOB_NAME ITensor* data = network->addInput(INPUT_BLOB_NAME, dt, Dims3{3, INPUT_H, INPUT_W}); assert(data); std::map weightMap = loadWeights("../yolov4.wts"); Weights emptywts{DataType::kFLOAT, nullptr, 0}; // define each layer. 
auto l0 = convBnMish(network, weightMap, *data, 32, 3, 1, 1, 0); auto l1 = convBnMish(network, weightMap, *l0->getOutput(0), 64, 3, 2, 1, 1); auto l2 = convBnMish(network, weightMap, *l1->getOutput(0), 64, 1, 1, 0, 2); auto l3 = l1; auto l4 = convBnMish(network, weightMap, *l3->getOutput(0), 64, 1, 1, 0, 4); auto l5 = convBnMish(network, weightMap, *l4->getOutput(0), 32, 1, 1, 0, 5); auto l6 = convBnMish(network, weightMap, *l5->getOutput(0), 64, 3, 1, 1, 6); auto ew7 = network->addElementWise(*l6->getOutput(0), *l4->getOutput(0), ElementWiseOperation::kSUM); auto l8 = convBnMish(network, weightMap, *ew7->getOutput(0), 64, 1, 1, 0, 8); ITensor* inputTensors9[] = {l8->getOutput(0), l2->getOutput(0)}; auto cat9 = network->addConcatenation(inputTensors9, 2); auto l10 = convBnMish(network, weightMap, *cat9->getOutput(0), 64, 1, 1, 0, 10); auto l11 = convBnMish(network, weightMap, *l10->getOutput(0), 128, 3, 2, 1, 11); auto l12 = convBnMish(network, weightMap, *l11->getOutput(0), 64, 1, 1, 0, 12); auto l13 = l11; auto l14 = convBnMish(network, weightMap, *l13->getOutput(0), 64, 1, 1, 0, 14); auto l15 = convBnMish(network, weightMap, *l14->getOutput(0), 64, 1, 1, 0, 15); auto l16 = convBnMish(network, weightMap, *l15->getOutput(0), 64, 3, 1, 1, 16); auto ew17 = network->addElementWise(*l16->getOutput(0), *l14->getOutput(0), ElementWiseOperation::kSUM); auto l18 = convBnMish(network, weightMap, *ew17->getOutput(0), 64, 1, 1, 0, 18); auto l19 = convBnMish(network, weightMap, *l18->getOutput(0), 64, 3, 1, 1, 19); auto ew20 = network->addElementWise(*l19->getOutput(0), *ew17->getOutput(0), ElementWiseOperation::kSUM); auto l21 = convBnMish(network, weightMap, *ew20->getOutput(0), 64, 1, 1, 0, 21); ITensor* inputTensors22[] = {l21->getOutput(0), l12->getOutput(0)}; auto cat22 = network->addConcatenation(inputTensors22, 2); auto l23 = convBnMish(network, weightMap, *cat22->getOutput(0), 128, 1, 1, 0, 23); auto l24 = convBnMish(network, weightMap, *l23->getOutput(0), 256, 3, 2, 
1, 24); auto l25 = convBnMish(network, weightMap, *l24->getOutput(0), 128, 1, 1, 0, 25); auto l26 = l24; auto l27 = convBnMish(network, weightMap, *l26->getOutput(0), 128, 1, 1, 0, 27); auto l28 = convBnMish(network, weightMap, *l27->getOutput(0), 128, 1, 1, 0, 28); auto l29 = convBnMish(network, weightMap, *l28->getOutput(0), 128, 3, 1, 1, 29); auto ew30 = network->addElementWise(*l29->getOutput(0), *l27->getOutput(0), ElementWiseOperation::kSUM); auto l31 = convBnMish(network, weightMap, *ew30->getOutput(0), 128, 1, 1, 0, 31); auto l32 = convBnMish(network, weightMap, *l31->getOutput(0), 128, 3, 1, 1, 32); auto ew33 = network->addElementWise(*l32->getOutput(0), *ew30->getOutput(0), ElementWiseOperation::kSUM); auto l34 = convBnMish(network, weightMap, *ew33->getOutput(0), 128, 1, 1, 0, 34); auto l35 = convBnMish(network, weightMap, *l34->getOutput(0), 128, 3, 1, 1, 35); auto ew36 = network->addElementWise(*l35->getOutput(0), *ew33->getOutput(0), ElementWiseOperation::kSUM); auto l37 = convBnMish(network, weightMap, *ew36->getOutput(0), 128, 1, 1, 0, 37); auto l38 = convBnMish(network, weightMap, *l37->getOutput(0), 128, 3, 1, 1, 38); auto ew39 = network->addElementWise(*l38->getOutput(0), *ew36->getOutput(0), ElementWiseOperation::kSUM); auto l40 = convBnMish(network, weightMap, *ew39->getOutput(0), 128, 1, 1, 0, 40); auto l41 = convBnMish(network, weightMap, *l40->getOutput(0), 128, 3, 1, 1, 41); auto ew42 = network->addElementWise(*l41->getOutput(0), *ew39->getOutput(0), ElementWiseOperation::kSUM); auto l43 = convBnMish(network, weightMap, *ew42->getOutput(0), 128, 1, 1, 0, 43); auto l44 = convBnMish(network, weightMap, *l43->getOutput(0), 128, 3, 1, 1, 44); auto ew45 = network->addElementWise(*l44->getOutput(0), *ew42->getOutput(0), ElementWiseOperation::kSUM); auto l46 = convBnMish(network, weightMap, *ew45->getOutput(0), 128, 1, 1, 0, 46); auto l47 = convBnMish(network, weightMap, *l46->getOutput(0), 128, 3, 1, 1, 47); auto ew48 = 
network->addElementWise(*l47->getOutput(0), *ew45->getOutput(0), ElementWiseOperation::kSUM); auto l49 = convBnMish(network, weightMap, *ew48->getOutput(0), 128, 1, 1, 0, 49); auto l50 = convBnMish(network, weightMap, *l49->getOutput(0), 128, 3, 1, 1, 50); auto ew51 = network->addElementWise(*l50->getOutput(0), *ew48->getOutput(0), ElementWiseOperation::kSUM); auto l52 = convBnMish(network, weightMap, *ew51->getOutput(0), 128, 1, 1, 0, 52); ITensor* inputTensors53[] = {l52->getOutput(0), l25->getOutput(0)}; auto cat53 = network->addConcatenation(inputTensors53, 2); auto l54 = convBnMish(network, weightMap, *cat53->getOutput(0), 256, 1, 1, 0, 54); auto l55 = convBnMish(network, weightMap, *l54->getOutput(0), 512, 3, 2, 1, 55); auto l56 = convBnMish(network, weightMap, *l55->getOutput(0), 256, 1, 1, 0, 56); auto l57 = l55; auto l58 = convBnMish(network, weightMap, *l57->getOutput(0), 256, 1, 1, 0, 58); auto l59 = convBnMish(network, weightMap, *l58->getOutput(0), 256, 1, 1, 0, 59); auto l60 = convBnMish(network, weightMap, *l59->getOutput(0), 256, 3, 1, 1, 60); auto ew61 = network->addElementWise(*l60->getOutput(0), *l58->getOutput(0), ElementWiseOperation::kSUM); auto l62 = convBnMish(network, weightMap, *ew61->getOutput(0), 256, 1, 1, 0, 62); auto l63 = convBnMish(network, weightMap, *l62->getOutput(0), 256, 3, 1, 1, 63); auto ew64 = network->addElementWise(*l63->getOutput(0), *ew61->getOutput(0), ElementWiseOperation::kSUM); auto l65 = convBnMish(network, weightMap, *ew64->getOutput(0), 256, 1, 1, 0, 65); auto l66 = convBnMish(network, weightMap, *l65->getOutput(0), 256, 3, 1, 1, 66); auto ew67 = network->addElementWise(*l66->getOutput(0), *ew64->getOutput(0), ElementWiseOperation::kSUM); auto l68 = convBnMish(network, weightMap, *ew67->getOutput(0), 256, 1, 1, 0, 68); auto l69 = convBnMish(network, weightMap, *l68->getOutput(0), 256, 3, 1, 1, 69); auto ew70 = network->addElementWise(*l69->getOutput(0), *ew67->getOutput(0), ElementWiseOperation::kSUM); auto l71 = 
convBnMish(network, weightMap, *ew70->getOutput(0), 256, 1, 1, 0, 71); auto l72 = convBnMish(network, weightMap, *l71->getOutput(0), 256, 3, 1, 1, 72); auto ew73 = network->addElementWise(*l72->getOutput(0), *ew70->getOutput(0), ElementWiseOperation::kSUM); auto l74 = convBnMish(network, weightMap, *ew73->getOutput(0), 256, 1, 1, 0, 74); auto l75 = convBnMish(network, weightMap, *l74->getOutput(0), 256, 3, 1, 1, 75); auto ew76 = network->addElementWise(*l75->getOutput(0), *ew73->getOutput(0), ElementWiseOperation::kSUM); auto l77 = convBnMish(network, weightMap, *ew76->getOutput(0), 256, 1, 1, 0, 77); auto l78 = convBnMish(network, weightMap, *l77->getOutput(0), 256, 3, 1, 1, 78); auto ew79 = network->addElementWise(*l78->getOutput(0), *ew76->getOutput(0), ElementWiseOperation::kSUM); auto l80 = convBnMish(network, weightMap, *ew79->getOutput(0), 256, 1, 1, 0, 80); auto l81 = convBnMish(network, weightMap, *l80->getOutput(0), 256, 3, 1, 1, 81); auto ew82 = network->addElementWise(*l81->getOutput(0), *ew79->getOutput(0), ElementWiseOperation::kSUM); auto l83 = convBnMish(network, weightMap, *ew82->getOutput(0), 256, 1, 1, 0, 83); ITensor* inputTensors84[] = {l83->getOutput(0), l56->getOutput(0)}; auto cat84 = network->addConcatenation(inputTensors84, 2); auto l85 = convBnMish(network, weightMap, *cat84->getOutput(0), 512, 1, 1, 0, 85); auto l86 = convBnMish(network, weightMap, *l85->getOutput(0), 1024, 3, 2, 1, 86); auto l87 = convBnMish(network, weightMap, *l86->getOutput(0), 512, 1, 1, 0, 87); auto l88 = l86; auto l89 = convBnMish(network, weightMap, *l88->getOutput(0), 512, 1, 1, 0, 89); auto l90 = convBnMish(network, weightMap, *l89->getOutput(0), 512, 1, 1, 0, 90); auto l91 = convBnMish(network, weightMap, *l90->getOutput(0), 512, 3, 1, 1, 91); auto ew92 = network->addElementWise(*l91->getOutput(0), *l89->getOutput(0), ElementWiseOperation::kSUM); auto l93 = convBnMish(network, weightMap, *ew92->getOutput(0), 512, 1, 1, 0, 93); auto l94 = convBnMish(network, 
weightMap, *l93->getOutput(0), 512, 3, 1, 1, 94); auto ew95 = network->addElementWise(*l94->getOutput(0), *ew92->getOutput(0), ElementWiseOperation::kSUM); auto l96 = convBnMish(network, weightMap, *ew95->getOutput(0), 512, 1, 1, 0, 96); auto l97 = convBnMish(network, weightMap, *l96->getOutput(0), 512, 3, 1, 1, 97); auto ew98 = network->addElementWise(*l97->getOutput(0), *ew95->getOutput(0), ElementWiseOperation::kSUM); auto l99 = convBnMish(network, weightMap, *ew98->getOutput(0), 512, 1, 1, 0, 99); auto l100 = convBnMish(network, weightMap, *l99->getOutput(0), 512, 3, 1, 1, 100); auto ew101 = network->addElementWise(*l100->getOutput(0), *ew98->getOutput(0), ElementWiseOperation::kSUM); auto l102 = convBnMish(network, weightMap, *ew101->getOutput(0), 512, 1, 1, 0, 102); ITensor* inputTensors103[] = {l102->getOutput(0), l87->getOutput(0)}; auto cat103 = network->addConcatenation(inputTensors103, 2); auto l104 = convBnMish(network, weightMap, *cat103->getOutput(0), 1024, 1, 1, 0, 104); // --------- auto l105 = convBnLeaky(network, weightMap, *l104->getOutput(0), 512, 1, 1, 0, 105); auto l106 = convBnLeaky(network, weightMap, *l105->getOutput(0), 1024, 3, 1, 1, 106); auto l107 = convBnLeaky(network, weightMap, *l106->getOutput(0), 512, 1, 1, 0, 107); auto pool108 = network->addPoolingNd(*l107->getOutput(0), PoolingType::kMAX, DimsHW{5, 5}); pool108->setPaddingNd(DimsHW{2, 2}); pool108->setStrideNd(DimsHW{1, 1}); auto l109 = l107; auto pool110 = network->addPoolingNd(*l109->getOutput(0), PoolingType::kMAX, DimsHW{9, 9}); pool110->setPaddingNd(DimsHW{4, 4}); pool110->setStrideNd(DimsHW{1, 1}); auto l111 = l107; auto pool112 = network->addPoolingNd(*l111->getOutput(0), PoolingType::kMAX, DimsHW{13, 13}); pool112->setPaddingNd(DimsHW{6, 6}); pool112->setStrideNd(DimsHW{1, 1}); ITensor* inputTensors113[] = {pool112->getOutput(0), pool110->getOutput(0), pool108->getOutput(0), l107->getOutput(0)}; auto cat113 = network->addConcatenation(inputTensors113, 4); auto l114 = 
convBnLeaky(network, weightMap, *cat113->getOutput(0), 512, 1, 1, 0, 114); auto l115 = convBnLeaky(network, weightMap, *l114->getOutput(0), 1024, 3, 1, 1, 115); auto l116 = convBnLeaky(network, weightMap, *l115->getOutput(0), 512, 1, 1, 0, 116); auto l117 = convBnLeaky(network, weightMap, *l116->getOutput(0), 256, 1, 1, 0, 117); float *deval = reinterpret_cast(malloc(sizeof(float) * 256 * 2 * 2)); for (int i = 0; i < 256 * 2 * 2; i++) { deval[i] = 1.0; } Weights deconvwts118{DataType::kFLOAT, deval, 256 * 2 * 2}; IDeconvolutionLayer* deconv118 = network->addDeconvolutionNd(*l117->getOutput(0), 256, DimsHW{2, 2}, deconvwts118, emptywts); assert(deconv118); deconv118->setStrideNd(DimsHW{2, 2}); deconv118->setNbGroups(256); weightMap["deconv118"] = deconvwts118; auto l119 = l85; auto l120 = convBnLeaky(network, weightMap, *l119->getOutput(0), 256, 1, 1, 0, 120); ITensor* inputTensors121[] = {l120->getOutput(0), deconv118->getOutput(0)}; auto cat121 = network->addConcatenation(inputTensors121, 2); auto l122 = convBnLeaky(network, weightMap, *cat121->getOutput(0), 256, 1, 1, 0, 122); auto l123 = convBnLeaky(network, weightMap, *l122->getOutput(0), 512, 3, 1, 1, 123); auto l124 = convBnLeaky(network, weightMap, *l123->getOutput(0), 256, 1, 1, 0, 124); auto l125 = convBnLeaky(network, weightMap, *l124->getOutput(0), 512, 3, 1, 1, 125); auto l126 = convBnLeaky(network, weightMap, *l125->getOutput(0), 256, 1, 1, 0, 126); auto l127 = convBnLeaky(network, weightMap, *l126->getOutput(0), 128, 1, 1, 0, 127); Weights deconvwts128{DataType::kFLOAT, deval, 128 * 2 * 2}; IDeconvolutionLayer* deconv128 = network->addDeconvolutionNd(*l127->getOutput(0), 128, DimsHW{2, 2}, deconvwts128, emptywts); assert(deconv128); deconv128->setStrideNd(DimsHW{2, 2}); deconv128->setNbGroups(128); auto l129 = l54; auto l130 = convBnLeaky(network, weightMap, *l129->getOutput(0), 128, 1, 1, 0, 130); ITensor* inputTensors131[] = {l130->getOutput(0), deconv128->getOutput(0)}; auto cat131 = 
network->addConcatenation(inputTensors131, 2); auto l132 = convBnLeaky(network, weightMap, *cat131->getOutput(0), 128, 1, 1, 0, 132); auto l133 = convBnLeaky(network, weightMap, *l132->getOutput(0), 256, 3, 1, 1, 133); auto l134 = convBnLeaky(network, weightMap, *l133->getOutput(0), 128, 1, 1, 0, 134); auto l135 = convBnLeaky(network, weightMap, *l134->getOutput(0), 256, 3, 1, 1, 135); auto l136 = convBnLeaky(network, weightMap, *l135->getOutput(0), 128, 1, 1, 0, 136); auto l137 = convBnLeaky(network, weightMap, *l136->getOutput(0), 256, 3, 1, 1, 137); IConvolutionLayer* conv138 = network->addConvolutionNd(*l137->getOutput(0), 3 * (Yolo::CLASS_NUM + 5), DimsHW{1, 1}, weightMap["module_list.138.Conv2d.weight"], weightMap["module_list.138.Conv2d.bias"]); assert(conv138); // 139 is yolo layer auto l140 = l136; auto l141 = convBnLeaky(network, weightMap, *l140->getOutput(0), 256, 3, 2, 1, 141); ITensor* inputTensors142[] = {l141->getOutput(0), l126->getOutput(0)}; auto cat142 = network->addConcatenation(inputTensors142, 2); auto l143 = convBnLeaky(network, weightMap, *cat142->getOutput(0), 256, 1, 1, 0, 143); auto l144 = convBnLeaky(network, weightMap, *l143->getOutput(0), 512, 3, 1, 1, 144); auto l145 = convBnLeaky(network, weightMap, *l144->getOutput(0), 256, 1, 1, 0, 145); auto l146 = convBnLeaky(network, weightMap, *l145->getOutput(0), 512, 3, 1, 1, 146); auto l147 = convBnLeaky(network, weightMap, *l146->getOutput(0), 256, 1, 1, 0, 147); auto l148 = convBnLeaky(network, weightMap, *l147->getOutput(0), 512, 3, 1, 1, 148); IConvolutionLayer* conv149 = network->addConvolutionNd(*l148->getOutput(0), 3 * (Yolo::CLASS_NUM + 5), DimsHW{1, 1}, weightMap["module_list.149.Conv2d.weight"], weightMap["module_list.149.Conv2d.bias"]); assert(conv149); // 150 is yolo layer auto l151 = l147; auto l152 = convBnLeaky(network, weightMap, *l151->getOutput(0), 512, 3, 2, 1, 152); ITensor* inputTensors153[] = {l152->getOutput(0), l116->getOutput(0)}; auto cat153 = 
network->addConcatenation(inputTensors153, 2); auto l154 = convBnLeaky(network, weightMap, *cat153->getOutput(0), 512, 1, 1, 0, 154); auto l155 = convBnLeaky(network, weightMap, *l154->getOutput(0), 1024, 3, 1, 1, 155); auto l156 = convBnLeaky(network, weightMap, *l155->getOutput(0), 512, 1, 1, 0, 156); auto l157 = convBnLeaky(network, weightMap, *l156->getOutput(0), 1024, 3, 1, 1, 157); auto l158 = convBnLeaky(network, weightMap, *l157->getOutput(0), 512, 1, 1, 0, 158); auto l159 = convBnLeaky(network, weightMap, *l158->getOutput(0), 1024, 3, 1, 1, 159); IConvolutionLayer* conv160 = network->addConvolutionNd(*l159->getOutput(0), 3 * (Yolo::CLASS_NUM + 5), DimsHW{1, 1}, weightMap["module_list.160.Conv2d.weight"], weightMap["module_list.160.Conv2d.bias"]); assert(conv160); // 161 is yolo layer auto creator = getPluginRegistry()->getPluginCreator("YoloLayer_TRT", "1"); const PluginFieldCollection* pluginData = creator->getFieldNames(); IPluginV2 *pluginObj = creator->createPlugin("yololayer", pluginData); ITensor* inputTensors_yolo[] = {conv138->getOutput(0), conv149->getOutput(0), conv160->getOutput(0)}; auto yolo = network->addPluginV2(inputTensors_yolo, 3, *pluginObj); yolo->getOutput(0)->setName(OUTPUT_BLOB_NAME); network->markOutput(*yolo->getOutput(0)); // Build engine builder->setMaxBatchSize(maxBatchSize); config->setMaxWorkspaceSize(16 * (1 << 20)); // 16MB #ifdef USE_FP16 config->setFlag(BuilderFlag::kFP16); #endif std::cout << "Building tensorrt engine, please wait for a while..." << std::endl; ICudaEngine* engine = builder->buildEngineWithConfig(*network, *config); std::cout << "Build engine successfully!" 
<< std::endl; // Don't need the network any more network->destroy(); // Release host memory for (auto& mem : weightMap) { free((void*) (mem.second.values)); } return engine; } void APIToModel(unsigned int maxBatchSize, IHostMemory** modelStream) { // Create builder IBuilder* builder = createInferBuilder(gLogger); IBuilderConfig* config = builder->createBuilderConfig(); // Create model to populate the network, then set the outputs and create an engine ICudaEngine* engine = createEngine(maxBatchSize, builder, config, DataType::kFLOAT); assert(engine != nullptr); // Serialize the engine (*modelStream) = engine->serialize(); // Close everything down engine->destroy(); builder->destroy(); config->destroy(); } void doInference(IExecutionContext& context, float* input, float* output, int batchSize) { const ICudaEngine& engine = context.getEngine(); // Pointers to input and output device buffers to pass to engine. // Engine requires exactly IEngine::getNbBindings() number of buffers. assert(engine.getNbBindings() == 2); void* buffers[2]; // In order to bind the buffers, we need to know the names of the input and output tensors. 
// Note that indices are guaranteed to be less than IEngine::getNbBindings() const int inputIndex = engine.getBindingIndex(INPUT_BLOB_NAME); const int outputIndex = engine.getBindingIndex(OUTPUT_BLOB_NAME); // Create GPU buffers on device CUDA_CHECK(cudaMalloc(&buffers[inputIndex], batchSize * 3 * INPUT_H * INPUT_W * sizeof(float))); CUDA_CHECK(cudaMalloc(&buffers[outputIndex], batchSize * OUTPUT_SIZE * sizeof(float))); // Create stream cudaStream_t stream; CUDA_CHECK(cudaStreamCreate(&stream)); // DMA input batch data to device, infer on the batch asynchronously, and DMA output back to host CUDA_CHECK(cudaMemcpyAsync(buffers[inputIndex], input, batchSize * 3 * INPUT_H * INPUT_W * sizeof(float), cudaMemcpyHostToDevice, stream)); context.enqueue(batchSize, buffers, stream, nullptr); CUDA_CHECK(cudaMemcpyAsync(output, buffers[outputIndex], batchSize * OUTPUT_SIZE * sizeof(float), cudaMemcpyDeviceToHost, stream)); cudaStreamSynchronize(stream); // Release stream and buffers cudaStreamDestroy(stream); CUDA_CHECK(cudaFree(buffers[inputIndex])); CUDA_CHECK(cudaFree(buffers[outputIndex])); } int read_files_in_dir(const char *p_dir_name, std::vector &file_names) { DIR *p_dir = opendir(p_dir_name); if (p_dir == nullptr) { return -1; } struct dirent* p_file = nullptr; while ((p_file = readdir(p_dir)) != nullptr) { if (strcmp(p_file->d_name, ".") != 0 && strcmp(p_file->d_name, "..") != 0) { //std::string cur_file_name(p_dir_name); //cur_file_name += "/"; //cur_file_name += p_file->d_name; std::string cur_file_name(p_file->d_name); file_names.push_back(cur_file_name); } } closedir(p_dir); return 0; } int main(int argc, char** argv) { cudaSetDevice(DEVICE); // create a model using the API directly and serialize it to a stream char *trtModelStream{nullptr}; size_t size{0}; if (argc == 2 && std::string(argv[1]) == "-s") { IHostMemory* modelStream{nullptr}; APIToModel(BATCH_SIZE, &modelStream); assert(modelStream != nullptr); std::ofstream p("yolov4.engine", std::ios::binary); if 
(!p) { std::cerr << "could not open plan output file" << std::endl; return -1; } p.write(reinterpret_cast(modelStream->data()), modelStream->size()); modelStream->destroy(); return 0; } else if (argc == 3 && std::string(argv[1]) == "-d") { std::ifstream file("yolov4.engine", std::ios::binary); if (file.good()) { file.seekg(0, file.end); size = file.tellg(); file.seekg(0, file.beg); trtModelStream = new char[size]; assert(trtModelStream); file.read(trtModelStream, size); file.close(); } } else { std::cerr << "arguments not right!" << std::endl; std::cerr << "./yolov4 -s // serialize model to plan file" << std::endl; std::cerr << "./yolov4 -d ../samples // deserialize plan file and run inference" << std::endl; return -1; } std::vector file_names; if (read_files_in_dir(argv[2], file_names) < 0) { std::cout << "read_files_in_dir failed." << std::endl; return -1; } // prepare input data --------------------------- static float data[BATCH_SIZE * 3 * INPUT_H * INPUT_W]; //for (int i = 0; i < 3 * INPUT_H * INPUT_W; i++) // data[i] = 1.0; static float prob[BATCH_SIZE * OUTPUT_SIZE]; IRuntime* runtime = createInferRuntime(gLogger); assert(runtime != nullptr); ICudaEngine* engine = runtime->deserializeCudaEngine(trtModelStream, size); assert(engine != nullptr); IExecutionContext* context = engine->createExecutionContext(); assert(context != nullptr); delete[] trtModelStream; int fcount = 0; for (int f = 0; f < (int)file_names.size(); f++) { fcount++; if (fcount < BATCH_SIZE && f + 1 != (int)file_names.size()) continue; for (int b = 0; b < fcount; b++) { cv::Mat img = cv::imread(std::string(argv[2]) + "/" + file_names[f - fcount + 1 + b]); if (img.empty()) continue; cv::Mat pr_img = preprocess_img(img); for (int i = 0; i < INPUT_H * INPUT_W; i++) { data[b * 3 * INPUT_H * INPUT_W + i] = pr_img.at(i)[2] / 255.0; data[b * 3 * INPUT_H * INPUT_W + i + INPUT_H * INPUT_W] = pr_img.at(i)[1] / 255.0; data[b * 3 * INPUT_H * INPUT_W + i + 2 * INPUT_H * INPUT_W] = pr_img.at(i)[0] / 255.0; 
} } // Run inference auto start = std::chrono::system_clock::now(); doInference(*context, data, prob, BATCH_SIZE); auto end = std::chrono::system_clock::now(); std::cout << std::chrono::duration_cast(end - start).count() << "ms" << std::endl; std::vector> batch_res(fcount); for (int b = 0; b < fcount; b++) { auto& res = batch_res[b]; nms(res, &prob[b * OUTPUT_SIZE]); } for (int b = 0; b < fcount; b++) { auto& res = batch_res[b]; //std::cout << res.size() << std::endl; cv::Mat img = cv::imread(std::string(argv[2]) + "/" + file_names[f - fcount + 1 + b]); for (size_t j = 0; j < res.size(); j++) { //float *p = (float*)&res[j]; //for (size_t k = 0; k < 7; k++) { // std::cout << p[k] << ", "; //} //std::cout << std::endl; cv::Rect r = get_rect(img, res[j].bbox); cv::rectangle(img, r, cv::Scalar(0x27, 0xC1, 0x36), 2); cv::putText(img, std::to_string((int)res[j].class_id), cv::Point(r.x, r.y - 1), cv::FONT_HERSHEY_PLAIN, 1.2, cv::Scalar(0xFF, 0xFF, 0xFF), 2); } cv::imwrite("_" + file_names[f - fcount + 1 + b], img); } fcount = 0; } // Destroy the engine context->destroy(); engine->destroy(); runtime->destroy(); //Print histogram of the output distribution //std::cout << "\nOutput:\n\n"; //for (unsigned int i = 0; i < OUTPUT_SIZE; i++) //{ // std::cout << prob[i] << ", "; // if (i % 10 == 0) std::cout << i / 10 << std::endl; //} //std::cout << std::endl; return 0; } ================================================ FILE: yolov5/CMakeLists.txt ================================================ cmake_minimum_required(VERSION 3.10) project(yolov5) add_definitions(-std=c++11) add_definitions(-DAPI_EXPORTS) option(CUDA_USE_STATIC_CUDA_RUNTIME OFF) set(CMAKE_CXX_STANDARD 11) set(CMAKE_BUILD_TYPE Debug) # TODO(Call for PR): make cmake compatible with Windows set(CMAKE_CUDA_COMPILER /usr/local/cuda/bin/nvcc) enable_language(CUDA) # include and link dirs of cuda and tensorrt, you need adapt them if yours are different # cuda include_directories(/usr/local/cuda/include) 
link_directories(/usr/local/cuda/lib64) # tensorrt # TODO(Call for PR): make TRT path configurable from command line include_directories(/home/nvidia/TensorRT-8.2.5.1/include/) link_directories(/home/nvidia/TensorRT-8.2.5.1/lib/) include_directories(${PROJECT_SOURCE_DIR}/src/) include_directories(${PROJECT_SOURCE_DIR}/plugin/) file(GLOB_RECURSE SRCS ${PROJECT_SOURCE_DIR}/src/*.cpp ${PROJECT_SOURCE_DIR}/src/*.cu) file(GLOB_RECURSE PLUGIN_SRCS ${PROJECT_SOURCE_DIR}/plugin/*.cu) add_library(myplugins SHARED ${PLUGIN_SRCS}) target_link_libraries(myplugins nvinfer cudart) find_package(OpenCV) include_directories(${OpenCV_INCLUDE_DIRS}) add_executable(yolov5_det yolov5_det.cpp ${SRCS}) target_link_libraries(yolov5_det nvinfer) target_link_libraries(yolov5_det cudart) target_link_libraries(yolov5_det myplugins) target_link_libraries(yolov5_det ${OpenCV_LIBS}) add_executable(yolov5_cls yolov5_cls.cpp ${SRCS}) target_link_libraries(yolov5_cls nvinfer) target_link_libraries(yolov5_cls cudart) target_link_libraries(yolov5_cls myplugins) target_link_libraries(yolov5_cls ${OpenCV_LIBS}) add_executable(yolov5_seg yolov5_seg.cpp ${SRCS}) target_link_libraries(yolov5_seg nvinfer) target_link_libraries(yolov5_seg cudart) target_link_libraries(yolov5_seg myplugins) target_link_libraries(yolov5_seg ${OpenCV_LIBS}) ================================================ FILE: yolov5/README.md ================================================ # YOLOv5 TensorRTx inference code base for [ultralytics/yolov5](https://github.com/ultralytics/yolov5). 
## Contributors

## Different versions of yolov5

Currently, we support yolov5 v1.0, v2.0, v3.0, v3.1, v4.0, v5.0, v6.0, v6.2, v7.0

- For yolov5 v7.0, download .pt from [yolov5 release v7.0](https://github.com/ultralytics/yolov5/releases/tag/v7.0), `git clone -b v7.0 https://github.com/ultralytics/yolov5.git` and `git clone -b yolov5-v7.0 https://github.com/wang-xinyu/tensorrtx.git`, then follow how-to-run in [tensorrtx/yolov5-v7.0](https://github.com/wang-xinyu/tensorrtx/tree/yolov5-v7.0/yolov5).
- For yolov5 v6.2, download .pt from [yolov5 release v6.2](https://github.com/ultralytics/yolov5/releases/tag/v6.2), `git clone -b v6.2 https://github.com/ultralytics/yolov5.git` and `git clone -b yolov5-v6.2 https://github.com/wang-xinyu/tensorrtx.git`, then follow how-to-run in [tensorrtx/yolov5-v6.2](https://github.com/wang-xinyu/tensorrtx/tree/yolov5-v6.2/yolov5).
- For yolov5 v6.0, download .pt from [yolov5 release v6.0](https://github.com/ultralytics/yolov5/releases/tag/v6.0), `git clone -b v6.0 https://github.com/ultralytics/yolov5.git` and `git clone -b yolov5-v6.0 https://github.com/wang-xinyu/tensorrtx.git`, then follow how-to-run in [tensorrtx/yolov5-v6.0](https://github.com/wang-xinyu/tensorrtx/tree/yolov5-v6.0/yolov5).
- For yolov5 v5.0, download .pt from [yolov5 release v5.0](https://github.com/ultralytics/yolov5/releases/tag/v5.0), `git clone -b v5.0 https://github.com/ultralytics/yolov5.git` and `git clone -b yolov5-v5.0 https://github.com/wang-xinyu/tensorrtx.git`, then follow how-to-run in [tensorrtx/yolov5-v5.0](https://github.com/wang-xinyu/tensorrtx/tree/yolov5-v5.0/yolov5).
- For yolov5 v4.0, download .pt from [yolov5 release v4.0](https://github.com/ultralytics/yolov5/releases/tag/v4.0), `git clone -b v4.0 https://github.com/ultralytics/yolov5.git` and `git clone -b yolov5-v4.0 https://github.com/wang-xinyu/tensorrtx.git`, then follow how-to-run in [tensorrtx/yolov5-v4.0](https://github.com/wang-xinyu/tensorrtx/tree/yolov5-v4.0/yolov5).
- For yolov5 v3.1, download .pt from [yolov5 release v3.1](https://github.com/ultralytics/yolov5/releases/tag/v3.1), `git clone -b v3.1 https://github.com/ultralytics/yolov5.git` and `git clone -b yolov5-v3.1 https://github.com/wang-xinyu/tensorrtx.git`, then follow how-to-run in [tensorrtx/yolov5-v3.1](https://github.com/wang-xinyu/tensorrtx/tree/yolov5-v3.1/yolov5).
- For yolov5 v3.0, download .pt from [yolov5 release v3.0](https://github.com/ultralytics/yolov5/releases/tag/v3.0), `git clone -b v3.0 https://github.com/ultralytics/yolov5.git` and `git clone -b yolov5-v3.0 https://github.com/wang-xinyu/tensorrtx.git`, then follow how-to-run in [tensorrtx/yolov5-v3.0](https://github.com/wang-xinyu/tensorrtx/tree/yolov5-v3.0/yolov5).
- For yolov5 v2.0, download .pt from [yolov5 release v2.0](https://github.com/ultralytics/yolov5/releases/tag/v2.0), `git clone -b v2.0 https://github.com/ultralytics/yolov5.git` and `git clone -b yolov5-v2.0 https://github.com/wang-xinyu/tensorrtx.git`, then follow how-to-run in [tensorrtx/yolov5-v2.0](https://github.com/wang-xinyu/tensorrtx/tree/yolov5-v2.0/yolov5).
- For yolov5 v1.0, download .pt from [yolov5 release v1.0](https://github.com/ultralytics/yolov5/releases/tag/v1.0), `git clone -b v1.0 https://github.com/ultralytics/yolov5.git` and `git clone -b yolov5-v1.0 https://github.com/wang-xinyu/tensorrtx.git`, then follow how-to-run in [tensorrtx/yolov5-v1.0](https://github.com/wang-xinyu/tensorrtx/tree/yolov5-v1.0/yolov5).

## Config

- Choose the YOLOv5 sub-model n/s/m/l/x/n6/s6/m6/l6/x6 from command line arguments.
- For other configs, please check [src/config.h](src/config.h)

## Build and Run

### Detection

1.
generate .wts from pytorch with .pt, or download .wts from model zoo

```
git clone -b v7.0 https://github.com/ultralytics/yolov5.git
git clone -b yolov5-v7.0 https://github.com/wang-xinyu/tensorrtx.git
cd yolov5/
wget https://github.com/ultralytics/yolov5/releases/download/v7.0/yolov5s.pt
cp [PATH-TO-TENSORRTX]/yolov5/gen_wts.py .
python gen_wts.py -w yolov5s.pt -o yolov5s.wts
# A file 'yolov5s.wts' will be generated.
```

2. build tensorrtx/yolov5 and run

```
cd [PATH-TO-TENSORRTX]/yolov5/
# Update kNumClass in src/config.h if your model is trained on a custom dataset
mkdir build
cd build
cp [PATH-TO-ultralytics-yolov5]/yolov5s.wts .
cmake ..
make

./yolov5_det -s [.wts] [.engine] [n/s/m/l/x/n6/s6/m6/l6/x6 or c/c6 gd gw]  // serialize model to plan file
./yolov5_det -d [.engine] [image folder]  // deserialize and run inference, the images in [image folder] will be processed.

# For example yolov5s
./yolov5_det -s yolov5s.wts yolov5s.engine s
./yolov5_det -d yolov5s.engine ../images

# For example Custom model with depth_multiple=0.17, width_multiple=0.25 in yolov5.yaml
./yolov5_det -s yolov5_custom.wts yolov5.engine c 0.17 0.25
./yolov5_det -d yolov5.engine ../images
```

3. Check the generated images, e.g. `_zidane.jpg` and `_bus.jpg`

4. Optional, load and run the tensorrt model in Python

```
// Install python-tensorrt, pycuda, etc.
// Ensure the yolov5s.engine and libmyplugins.so have been built
python yolov5_det_trt.py

// Another version of the python script, which uses CUDA Python instead of pycuda.
python yolov5_det_trt_cuda_python.py
```

### Classification

```
# Download ImageNet labels
wget https://raw.githubusercontent.com/joannzhang00/ImageNet-dataset-classes-labels/main/imagenet_classes.txt

# Build and serialize TensorRT engine
./yolov5_cls -s yolov5s-cls.wts yolov5s-cls.engine s

# Run inference
./yolov5_cls -d yolov5s-cls.engine ../images
```

### Instance Segmentation

```
# Build and serialize TensorRT engine
./yolov5_seg -s yolov5s-seg.wts yolov5s-seg.engine s

# Download the labels file
wget -O coco.txt https://raw.githubusercontent.com/amikelive/coco-labels/master/coco-labels-2014_2017.txt

# Run inference with labels file
./yolov5_seg -d yolov5s-seg.engine ../images coco.txt
```

# INT8 Quantization 1. Prepare calibration images, you can randomly select 1000s images from your train set. For coco, you can also download my calibration images `coco_calib` from [GoogleDrive](https://drive.google.com/drive/folders/1s7jE9DtOngZMzJC1uL307J2MiaGwdRSI?usp=sharing) or [BaiduPan](https://pan.baidu.com/s/1GOm_-JobpyLMAqZWCDUhKg) pwd: a9wh 2. unzip it in yolov5/build 3. set the macro `USE_INT8` in src/config.h and make 4. serialize the model and test ## More Information See the readme in [home page.](https://github.com/wang-xinyu/tensorrtx) ================================================ FILE: yolov5/gen_wts.py ================================================ import argparse import os import struct import torch from utils.torch_utils import select_device def parse_args(): parser = argparse.ArgumentParser(description='Convert .pt file to .wts') parser.add_argument('-w', '--weights', required=True, help='Input weights (.pt) file path (required)') parser.add_argument( '-o', '--output', help='Output (.wts) file path (optional)') parser.add_argument( '-t', '--type', type=str, default='detect', choices=['detect', 'cls', 'seg'], help='determines the model is detection/classification') args = parser.parse_args() if not os.path.isfile(args.weights): raise SystemExit('Invalid input file') if not args.output: args.output = os.path.splitext(args.weights)[0] + '.wts' elif os.path.isdir(args.output): args.output = os.path.join( args.output, os.path.splitext(os.path.basename(args.weights))[0] + '.wts') return args.weights, args.output, args.type pt_file, wts_file, m_type = parse_args() print(f'Generating .wts for {m_type} model') # Load model print(f'Loading {pt_file}') device = select_device('cpu') model = torch.load(pt_file, map_location=device, weights_only=False) # Load FP32 weights model = model['ema' if model.get('ema') else 'model'].float() if m_type in ['detect', 'seg']: # update anchor_grid info anchor_grid = model.model[-1].anchors * 
model.model[-1].stride[..., None, None] # model.model[-1].anchor_grid = anchor_grid delattr(model.model[-1], 'anchor_grid') # model.model[-1] is detect layer # The parameters are saved in the OrderDict through the "register_buffer" method, and then saved to the weight. model.model[-1].register_buffer("anchor_grid", anchor_grid) model.model[-1].register_buffer("strides", model.model[-1].stride) model.to(device).eval() print(f'Writing into {wts_file}') with open(wts_file, 'w') as f: f.write('{}\n'.format(len(model.state_dict().keys()))) for k, v in model.state_dict().items(): vr = v.reshape(-1).cpu().numpy() f.write('{} {} '.format(k, len(vr))) for vv in vr: f.write(' ') f.write(struct.pack('>f', float(vv)).hex()) f.write('\n') ================================================ FILE: yolov5/plugin/yololayer.cu ================================================ #include "yololayer.h" #include "cuda_utils.h" #include #include #include namespace Tn { template void write(char*& buffer, const T& val) { *reinterpret_cast(buffer) = val; buffer += sizeof(T); } template void read(const char*& buffer, T& val) { val = *reinterpret_cast(buffer); buffer += sizeof(T); } } namespace nvinfer1 { YoloLayerPlugin::YoloLayerPlugin(int classCount, int netWidth, int netHeight, int maxOut, bool is_segmentation, const std::vector& vYoloKernel) { mClassCount = classCount; mYoloV5NetWidth = netWidth; mYoloV5NetHeight = netHeight; mMaxOutObject = maxOut; is_segmentation_ = is_segmentation; mYoloKernel = vYoloKernel; mKernelCount = vYoloKernel.size(); CUDA_CHECK(cudaMallocHost(&mAnchor, mKernelCount * sizeof(void*))); size_t AnchorLen = sizeof(float)* kNumAnchor * 2; for (int ii = 0; ii < mKernelCount; ii++) { CUDA_CHECK(cudaMalloc(&mAnchor[ii], AnchorLen)); const auto& yolo = mYoloKernel[ii]; CUDA_CHECK(cudaMemcpy(mAnchor[ii], yolo.anchors, AnchorLen, cudaMemcpyHostToDevice)); } } YoloLayerPlugin::~YoloLayerPlugin() { for (int ii = 0; ii < mKernelCount; ii++) { CUDA_CHECK(cudaFree(mAnchor[ii])); } 
CUDA_CHECK(cudaFreeHost(mAnchor)); } // create the plugin at runtime from a byte stream YoloLayerPlugin::YoloLayerPlugin(const void* data, size_t length) { using namespace Tn; const char *d = reinterpret_cast(data), *a = d; read(d, mClassCount); read(d, mThreadCount); read(d, mKernelCount); read(d, mYoloV5NetWidth); read(d, mYoloV5NetHeight); read(d, mMaxOutObject); read(d, is_segmentation_); mYoloKernel.resize(mKernelCount); auto kernelSize = mKernelCount * sizeof(YoloKernel); memcpy(mYoloKernel.data(), d, kernelSize); d += kernelSize; CUDA_CHECK(cudaMallocHost(&mAnchor, mKernelCount * sizeof(void*))); size_t AnchorLen = sizeof(float)* kNumAnchor * 2; for (int ii = 0; ii < mKernelCount; ii++) { CUDA_CHECK(cudaMalloc(&mAnchor[ii], AnchorLen)); const auto& yolo = mYoloKernel[ii]; CUDA_CHECK(cudaMemcpy(mAnchor[ii], yolo.anchors, AnchorLen, cudaMemcpyHostToDevice)); } assert(d == a + length); } void YoloLayerPlugin::serialize(void* buffer) const TRT_NOEXCEPT { using namespace Tn; char* d = static_cast(buffer), *a = d; write(d, mClassCount); write(d, mThreadCount); write(d, mKernelCount); write(d, mYoloV5NetWidth); write(d, mYoloV5NetHeight); write(d, mMaxOutObject); write(d, is_segmentation_); auto kernelSize = mKernelCount * sizeof(YoloKernel); memcpy(d, mYoloKernel.data(), kernelSize); d += kernelSize; assert(d == a + getSerializationSize()); } size_t YoloLayerPlugin::getSerializationSize() const TRT_NOEXCEPT { size_t s = sizeof(mClassCount) + sizeof(mThreadCount) + sizeof(mKernelCount); s += sizeof(YoloKernel) * mYoloKernel.size(); s += sizeof(mYoloV5NetWidth) + sizeof(mYoloV5NetHeight); s += sizeof(mMaxOutObject) + sizeof(is_segmentation_); return s; } int YoloLayerPlugin::initialize() TRT_NOEXCEPT { return 0; } Dims YoloLayerPlugin::getOutputDimensions(int index, const Dims* inputs, int nbInputDims) TRT_NOEXCEPT { //output the result to channel int totalsize = mMaxOutObject * sizeof(Detection) / sizeof(float); return Dims3(totalsize + 1, 1, 1); } // Set plugin 
// namespace
// Remembers the namespace pointer passed in by the caller. Only the pointer is
// assigned to mPluginNamespace; the characters are not copied here.
// NOTE(review): presumably the caller has to keep the string alive for as long
// as this plugin may hand it back out - confirm against the callers.
void YoloLayerPlugin::setPluginNamespace(const char* pluginNamespace) TRT_NOEXCEPT {
    mPluginNamespace = pluginNamespace;
}

// Hands back whatever pointer was last stored by setPluginNamespace().
const char* YoloLayerPlugin::getPluginNamespace() const TRT_NOEXCEPT {
    return mPluginNamespace;
}

// Return the DataType of the plugin output at the requested index.
// All arguments are ignored: the answer is always DataType::kFLOAT.
DataType YoloLayerPlugin::getOutputDataType(int index, const nvinfer1::DataType* inputTypes,
                                            int nbInputs) const TRT_NOEXCEPT {
    return DataType::kFLOAT;
}

// Return true if output tensor is broadcast across a batch.
// This plugin unconditionally answers false.
bool YoloLayerPlugin::isOutputBroadcastAcrossBatch(int outputIndex, const bool* inputIsBroadcasted,
                                                   int nbInputs) const TRT_NOEXCEPT {
    return false;
}

// Return true if plugin can use input that is broadcast across batch without replication.
// This plugin unconditionally answers false.
bool YoloLayerPlugin::canBroadcastInputAcrossBatch(int inputIndex) const TRT_NOEXCEPT {
    return false;
}

// Configuration hook; the body is empty, so the tensor descriptions are not inspected.
void YoloLayerPlugin::configurePlugin(const PluginTensorDesc* in, int nbInput,
                                      const PluginTensorDesc* out, int nbOutput) TRT_NOEXCEPT {
}

// Attach the plugin object to an execution context and grant the plugin the access to some
// context resource. The body is empty: none of the three handles is stored or used.
void YoloLayerPlugin::attachToContext(cudnnContext* cudnnContext, cublasContext* cublasContext,
                                      IGpuAllocator* gpuAllocator) TRT_NOEXCEPT {
}

// Detach the plugin object from its execution context.
void YoloLayerPlugin::detachFromContext() TRT_NOEXCEPT {}  // intentionally empty

// Type string of this plugin.
const char* YoloLayerPlugin::getPluginType() const TRT_NOEXCEPT {
    return "YoloLayer_TRT";
}

// Version string of this plugin.
const char* YoloLayerPlugin::getPluginVersion() const TRT_NOEXCEPT {
    return "1";
}

// Self-deleting release: the object must not be touched after this call.
void YoloLayerPlugin::destroy() TRT_NOEXCEPT {
    delete this;
}

// Clone the plugin.
// A fresh plugin is built through the regular constructor from the current
// settings and kernel list (so it allocates its own device-side anchor copies),
// then the namespace pointer is carried over.
IPluginV2IOExt* YoloLayerPlugin::clone() const TRT_NOEXCEPT {
    YoloLayerPlugin* p = new YoloLayerPlugin(mClassCount, mYoloV5NetWidth, mYoloV5NetHeight,
                                             mMaxOutObject, is_segmentation_, mYoloKernel);
    p->setPluginNamespace(mPluginNamespace);
    return p;
}

// Logistic sigmoid: 1 / (1 + e^-x).
__device__ float Logist(float data) {
    return 1.0f / (1.0f + expf(-data));
}

// Decodes one YOLO output scale into Detection records.
//
// One thread handles one (batch item, grid cell) pair; threads whose global
// index is >= noElements exit immediately. For anchor k and value index c the
// input element of grid cell `idx` is read at
//     idx + k * info_len_i * total_grid + c * total_grid
// where info_len_i = 5 + classes (+ 32 mask values when is_segmentation).
//
// Output layout per batch item (outputElem floats): the first float is a
// running detection counter, followed by an array of Detection structs.
//
// input        raw values of this scale for the whole batch
// output       result buffer described above
// noElements   yoloWidth * yoloHeight * batch size
// netwidth/netheight   network input size used to scale box centres
// maxoutobject maximum number of Detection slots per batch item
// yoloWidth/yoloHeight grid size of this scale
// anchors      kNumAnchor (width, height) pairs for this scale
// classes      number of class scores per anchor
// outputElem   number of floats per batch item in `output`
// is_segmentation  when true, 32 extra values per anchor are copied to det->mask
__global__ void CalDetection(const float *input, float *output, int noElements,
                             const int netwidth, const int netheight, int maxoutobject,
                             int yoloWidth, int yoloHeight, const float anchors[kNumAnchor * 2],
                             int classes, int outputElem, bool is_segmentation) {
    int idx = threadIdx.x + blockDim.x * blockIdx.x;
    if (idx >= noElements) return;

    // Split the flat index into batch item (bnIdx) and grid cell (idx).
    int total_grid = yoloWidth * yoloHeight;
    int bnIdx = idx / total_grid;
    idx = idx - total_grid * bnIdx;

    int info_len_i = 5 + classes;
    if (is_segmentation) info_len_i += 32;
    const float* curInput = input + bnIdx * (info_len_i * total_grid * kNumAnchor);

    for (int k = 0; k < kNumAnchor; ++k) {
        // Value 4 is the objectness score; weak boxes are dropped early.
        float box_prob = Logist(curInput[idx + k * info_len_i * total_grid + 4 * total_grid]);
        if (box_prob < kIgnoreThresh) continue;

        // Pick the class with the highest sigmoid score.
        int class_id = 0;
        float max_cls_prob = 0.0;
        for (int i = 5; i < 5 + classes; ++i) {
            float p = Logist(curInput[idx + k * info_len_i * total_grid + i * total_grid]);
            if (p > max_cls_prob) {
                max_cls_prob = p;
                class_id = i - 5;
            }
        }

        // Reserve an output slot. The counter keeps growing even once the
        // buffer is full, so it can end up larger than maxoutobject.
        float *res_count = output + bnIdx * outputElem;
        int count = (int)atomicAdd(res_count, 1);
        if (count >= maxoutobject) return;
        char *data = (char*)res_count + sizeof(float) + count * sizeof(Detection);
        Detection *det = (Detection*)(data);

        int row = idx / yoloWidth;
        int col = idx % yoloWidth;

        // Box centre: cell offset plus (2 * sigmoid - 0.5), scaled to network input pixels.
        det->bbox[0] = (col - 0.5f + 2.0f * Logist(curInput[idx + k * info_len_i * total_grid + 0 * total_grid])) * netwidth / yoloWidth;
        det->bbox[1] = (row - 0.5f + 2.0f * Logist(curInput[idx + k * info_len_i * total_grid + 1 * total_grid])) * netheight / yoloHeight;

        // Box size: (2 * sigmoid)^2 times the anchor width / height.
        det->bbox[2] = 2.0f * Logist(curInput[idx + k * info_len_i * total_grid + 2 * total_grid]);
        det->bbox[2] = det->bbox[2] * det->bbox[2] * anchors[2 * k];
        det->bbox[3] = 2.0f * Logist(curInput[idx + k * info_len_i * total_grid + 3 * total_grid]);
        det->bbox[3] = det->bbox[3] * det->bbox[3] * anchors[2 * k + 1];

        det->conf = box_prob * max_cls_prob;
        det->class_id = class_id;

        // The 32 values after the class scores are copied unchanged.
        for (int i = 0; is_segmentation && i < 32; i++) {
            det->mask[i] = curInput[idx + k * info_len_i * total_grid + (i + 5 + classes) * total_grid];
        }
    }
}

// Launches CalDetection once per YOLO scale on `stream`.
//
// inputs     one device pointer per entry of mYoloKernel
// output     device buffer holding, per batch item, one counter float followed
//            by mMaxOutObject Detection structs
// stream     CUDA stream used for the memsets and the kernel launches
// batchSize  number of batch items to process
//
// The block size is chosen per launch in a local variable. mThreadCount is only
// an upper bound and is never modified, so a small layer can no longer shrink
// the block size of later layers or later calls, and a layer with no elements
// is skipped instead of producing a zero block size (division by zero).
void YoloLayerPlugin::forwardGpu(const float* const* inputs, float *output, cudaStream_t stream, int batchSize) {
    int outputElem = 1 + mMaxOutObject * sizeof(Detection) / sizeof(float);

    // Reset the detection counter at the front of every batch item's slice.
    for (int idx = 0; idx < batchSize; ++idx) {
        CUDA_CHECK(cudaMemsetAsync(output + idx * outputElem, 0, sizeof(float), stream));
    }

    for (unsigned int i = 0; i < mYoloKernel.size(); ++i) {
        const auto& yolo = mYoloKernel[i];
        const int numElem = yolo.width * yolo.height * batchSize;
        if (numElem <= 0) continue;  // nothing to decode for this scale

        const int threads = numElem < mThreadCount ? numElem : mThreadCount;
        const int blocks = (numElem + threads - 1) / threads;
        CalDetection<<<blocks, threads, 0, stream>>>(
            inputs[i], output, numElem, mYoloV5NetWidth, mYoloV5NetHeight, mMaxOutObject,
            yolo.width, yolo.height, (float*)mAnchor[i], mClassCount, outputElem, is_segmentation_);
    }
}

// Plugin execution entry point: forwards to forwardGpu() using outputs[0] as
// the single output buffer and always reports success (0). `workspace` is unused.
int YoloLayerPlugin::enqueue(int batchSize, const void* const* inputs, void* TRT_CONST_ENQUEUE* outputs,
                             void* workspace, cudaStream_t stream) TRT_NOEXCEPT {
    forwardGpu((const float* const*)inputs, (float*)outputs[0], stream, batchSize);
    return 0;
}

// Storage for the creator's static members.
PluginFieldCollection YoloPluginCreator::mFC{};
std::vector<PluginField> YoloPluginCreator::mPluginAttributes;

// Publishes an empty attribute list through mFC.
YoloPluginCreator::YoloPluginCreator() {
    mPluginAttributes.clear();
    mFC.nbFields = mPluginAttributes.size();
    mFC.fields = mPluginAttributes.data();
}

const  // return-type qualifier of the definition that continues on the next line
char* YoloPluginCreator::getPluginName() const TRT_NOEXCEPT { return "YoloLayer_TRT"; } const char* YoloPluginCreator::getPluginVersion() const TRT_NOEXCEPT { return "1"; } const PluginFieldCollection* YoloPluginCreator::getFieldNames() TRT_NOEXCEPT { return &mFC; } IPluginV2IOExt* YoloPluginCreator::createPlugin(const char* name, const PluginFieldCollection* fc) TRT_NOEXCEPT { assert(fc->nbFields == 2); assert(strcmp(fc->fields[0].name, "netinfo") == 0); assert(strcmp(fc->fields[1].name, "kernels") == 0); int *p_netinfo = (int*)(fc->fields[0].data); int class_count = p_netinfo[0]; int input_w = p_netinfo[1]; int input_h = p_netinfo[2]; int max_output_object_count = p_netinfo[3]; bool is_segmentation = (bool)p_netinfo[4]; std::vector kernels(fc->fields[1].length); memcpy(&kernels[0], fc->fields[1].data, kernels.size() * sizeof(YoloKernel)); YoloLayerPlugin* obj = new YoloLayerPlugin(class_count, input_w, input_h, max_output_object_count, is_segmentation, kernels); obj->setPluginNamespace(mNamespace.c_str()); return obj; } IPluginV2IOExt* YoloPluginCreator::deserializePlugin(const char* name, const void* serialData, size_t serialLength) TRT_NOEXCEPT { // This object will be deleted when the network is destroyed, which will // call YoloLayerPlugin::destroy() YoloLayerPlugin* obj = new YoloLayerPlugin(serialData, serialLength); obj->setPluginNamespace(mNamespace.c_str()); return obj; } } ================================================ FILE: yolov5/plugin/yololayer.h ================================================ #pragma once #include "types.h" #include "macros.h" #include #include namespace nvinfer1 { class API YoloLayerPlugin : public IPluginV2IOExt { public: YoloLayerPlugin(int classCount, int netWidth, int netHeight, int maxOut, bool is_segmentation, const std::vector& vYoloKernel); YoloLayerPlugin(const void* data, size_t length); ~YoloLayerPlugin(); int getNbOutputs() const TRT_NOEXCEPT override { return 1; } Dims getOutputDimensions(int index, const Dims* 
inputs, int nbInputDims) TRT_NOEXCEPT override; int initialize() TRT_NOEXCEPT override; virtual void terminate() TRT_NOEXCEPT override {}; virtual size_t getWorkspaceSize(int maxBatchSize) const TRT_NOEXCEPT override { return 0; } virtual int enqueue(int batchSize, const void* const* inputs, void*TRT_CONST_ENQUEUE* outputs, void* workspace, cudaStream_t stream) TRT_NOEXCEPT override; virtual size_t getSerializationSize() const TRT_NOEXCEPT override; virtual void serialize(void* buffer) const TRT_NOEXCEPT override; bool supportsFormatCombination(int pos, const PluginTensorDesc* inOut, int nbInputs, int nbOutputs) const TRT_NOEXCEPT override { return inOut[pos].format == TensorFormat::kLINEAR && inOut[pos].type == DataType::kFLOAT; } const char* getPluginType() const TRT_NOEXCEPT override; const char* getPluginVersion() const TRT_NOEXCEPT override; void destroy() TRT_NOEXCEPT override; IPluginV2IOExt* clone() const TRT_NOEXCEPT override; void setPluginNamespace(const char* pluginNamespace) TRT_NOEXCEPT override; const char* getPluginNamespace() const TRT_NOEXCEPT override; DataType getOutputDataType(int index, const nvinfer1::DataType* inputTypes, int nbInputs) const TRT_NOEXCEPT override; bool isOutputBroadcastAcrossBatch(int outputIndex, const bool* inputIsBroadcasted, int nbInputs) const TRT_NOEXCEPT override; bool canBroadcastInputAcrossBatch(int inputIndex) const TRT_NOEXCEPT override; void attachToContext( cudnnContext* cudnnContext, cublasContext* cublasContext, IGpuAllocator* gpuAllocator) TRT_NOEXCEPT override; void configurePlugin(const PluginTensorDesc* in, int nbInput, const PluginTensorDesc* out, int nbOutput) TRT_NOEXCEPT override; void detachFromContext() TRT_NOEXCEPT override; private: void forwardGpu(const float* const* inputs, float *output, cudaStream_t stream, int batchSize = 1); int mThreadCount = 256; const char* mPluginNamespace; int mKernelCount; int mClassCount; int mYoloV5NetWidth; int mYoloV5NetHeight; int mMaxOutObject; bool 
is_segmentation_; std::vector mYoloKernel; void** mAnchor; }; class API YoloPluginCreator : public IPluginCreator { public: YoloPluginCreator(); ~YoloPluginCreator() override = default; const char* getPluginName() const TRT_NOEXCEPT override; const char* getPluginVersion() const TRT_NOEXCEPT override; const PluginFieldCollection* getFieldNames() TRT_NOEXCEPT override; IPluginV2IOExt* createPlugin(const char* name, const PluginFieldCollection* fc) TRT_NOEXCEPT override; IPluginV2IOExt* deserializePlugin(const char* name, const void* serialData, size_t serialLength) TRT_NOEXCEPT override; void setPluginNamespace(const char* libNamespace) TRT_NOEXCEPT override { mNamespace = libNamespace; } const char* getPluginNamespace() const TRT_NOEXCEPT override { return mNamespace.c_str(); } private: std::string mNamespace; static PluginFieldCollection mFC; static std::vector mPluginAttributes; }; REGISTER_TENSORRT_PLUGIN(YoloPluginCreator); }; ================================================ FILE: yolov5/src/calibrator.cpp ================================================ #include "calibrator.h" #include "cuda_utils.h" #include "utils.h" #include #include #include #include #include cv::Mat preprocess_img(cv::Mat& img, int input_w, int input_h) { int w, h, x, y; float r_w = input_w / (img.cols * 1.0); float r_h = input_h / (img.rows * 1.0); if (r_h > r_w) { w = input_w; h = r_w * img.rows; x = 0; y = (input_h - h) / 2; } else { w = r_h * img.cols; h = input_h; x = (input_w - w) / 2; y = 0; } cv::Mat re(h, w, CV_8UC3); cv::resize(img, re, re.size(), 0, 0, cv::INTER_LINEAR); cv::Mat out(input_h, input_w, CV_8UC3, cv::Scalar(128, 128, 128)); re.copyTo(out(cv::Rect(x, y, re.cols, re.rows))); return out; } Int8EntropyCalibrator2::Int8EntropyCalibrator2(int batchsize, int input_w, int input_h, const char* img_dir, const char* calib_table_name, const char* input_blob_name, bool read_cache) : batchsize_(batchsize), input_w_(input_w), input_h_(input_h), img_idx_(0), img_dir_(img_dir), 
calib_table_name_(calib_table_name), input_blob_name_(input_blob_name), read_cache_(read_cache) { input_count_ = 3 * input_w * input_h * batchsize; CUDA_CHECK(cudaMalloc(&device_input_, input_count_ * sizeof(float))); read_files_in_dir(img_dir, img_files_); } Int8EntropyCalibrator2::~Int8EntropyCalibrator2() { CUDA_CHECK(cudaFree(device_input_)); } int Int8EntropyCalibrator2::getBatchSize() const TRT_NOEXCEPT { return batchsize_; } bool Int8EntropyCalibrator2::getBatch(void* bindings[], const char* names[], int nbBindings) TRT_NOEXCEPT { if (img_idx_ + batchsize_ > (int)img_files_.size()) { return false; } std::vector input_imgs_; for (int i = img_idx_; i < img_idx_ + batchsize_; i++) { std::cout << img_files_[i] << " " << i << std::endl; cv::Mat temp = cv::imread(img_dir_ + img_files_[i]); if (temp.empty()) { std::cerr << "Fatal error: image cannot open!" << std::endl; return false; } cv::Mat pr_img = preprocess_img(temp, input_w_, input_h_); input_imgs_.push_back(pr_img); } img_idx_ += batchsize_; cv::Mat blob = cv::dnn::blobFromImages(input_imgs_, 1.0 / 255.0, cv::Size(input_w_, input_h_), cv::Scalar(0, 0, 0), true, false); CUDA_CHECK(cudaMemcpy(device_input_, blob.ptr(0), input_count_ * sizeof(float), cudaMemcpyHostToDevice)); assert(!strcmp(names[0], input_blob_name_)); bindings[0] = device_input_; return true; } const void* Int8EntropyCalibrator2::readCalibrationCache(size_t& length) TRT_NOEXCEPT { std::cout << "reading calib cache: " << calib_table_name_ << std::endl; calib_cache_.clear(); std::ifstream input(calib_table_name_, std::ios::binary); input >> std::noskipws; if (read_cache_ && input.good()) { std::copy(std::istream_iterator(input), std::istream_iterator(), std::back_inserter(calib_cache_)); } length = calib_cache_.size(); return length ? 
calib_cache_.data() : nullptr; } void Int8EntropyCalibrator2::writeCalibrationCache(const void* cache, size_t length) TRT_NOEXCEPT { std::cout << "writing calib cache: " << calib_table_name_ << " size: " << length << std::endl; std::ofstream output(calib_table_name_, std::ios::binary); output.write(reinterpret_cast(cache), length); } ================================================ FILE: yolov5/src/calibrator.h ================================================ #pragma once #include "macros.h" #include #include #include cv::Mat preprocess_img(cv::Mat& img, int input_w, int input_h); //! \class Int8EntropyCalibrator2 //! //! \brief Implements Entropy calibrator 2. //! CalibrationAlgoType is kENTROPY_CALIBRATION_2. //! class Int8EntropyCalibrator2 : public nvinfer1::IInt8EntropyCalibrator2 { public: Int8EntropyCalibrator2(int batchsize, int input_w, int input_h, const char* img_dir, const char* calib_table_name, const char* input_blob_name, bool read_cache = true); virtual ~Int8EntropyCalibrator2(); int getBatchSize() const TRT_NOEXCEPT override; bool getBatch(void* bindings[], const char* names[], int nbBindings) TRT_NOEXCEPT override; const void* readCalibrationCache(size_t& length) TRT_NOEXCEPT override; void writeCalibrationCache(const void* cache, size_t length) TRT_NOEXCEPT override; private: int batchsize_; int input_w_; int input_h_; int img_idx_; std::string img_dir_; std::vector img_files_; size_t input_count_; std::string calib_table_name_; const char* input_blob_name_; bool read_cache_; void* device_input_; std::vector calib_cache_; }; ================================================ FILE: yolov5/src/config.h ================================================ #pragma once /* -------------------------------------------------------- * These configs are related to tensorrt model, if these are changed, * please re-compile and re-serialize the tensorrt model. 
* --------------------------------------------------------*/ // For INT8, you need to prepare the calibration dataset, please refer to // https://github.com/wang-xinyu/tensorrtx/tree/master/yolov5#int8-quantization #define USE_FP16 // set USE_INT8 or USE_FP16 or USE_FP32 // These are used to define input/output tensor names, // you can set them to whatever you want. const static char* kInputTensorName = "data"; const static char* kOutputTensorName = "prob"; // Number of classes of the detection and segmentation models constexpr static int kNumClass = 80; // Classification model's number of classes constexpr static int kClsNumClass = 1000; constexpr static int kBatchSize = 1; // Yolo's input width and height must be divisible by 32 constexpr static int kInputH = 640; constexpr static int kInputW = 640; // Classification model's input shape constexpr static int kClsInputH = 224; constexpr static int kClsInputW = 224; // Maximum number of output bounding boxes from yololayer plugin. // That is the maximum number of output bounding boxes before NMS. constexpr static int kMaxNumOutputBbox = 1000; constexpr static int kNumAnchor = 3; // The bboxes whose confidence is lower than kIgnoreThresh will be ignored in yololayer plugin. constexpr static float kIgnoreThresh = 0.1f; /* -------------------------------------------------------- * These configs are NOT related to tensorrt model, if these are changed, * please re-compile, but no need to re-serialize the tensorrt model.
* --------------------------------------------------------*/ // NMS overlapping thresh and final detection confidence thresh const static float kNmsThresh = 0.45f; const static float kConfThresh = 0.5f; const static int kGpuId = 0; // If your image size is larger than 4096 * 3112, please increase this value const static int kMaxInputImageSize = 4096 * 3112; ================================================ FILE: yolov5/src/cuda_utils.h ================================================ #ifndef TRTX_CUDA_UTILS_H_ #define TRTX_CUDA_UTILS_H_ #include #ifndef CUDA_CHECK #define CUDA_CHECK(callstr)\ {\ cudaError_t error_code = callstr;\ if (error_code != cudaSuccess) {\ std::cerr << "CUDA error " << error_code << " at " << __FILE__ << ":" << __LINE__;\ assert(0);\ }\ } #endif // CUDA_CHECK #endif // TRTX_CUDA_UTILS_H_ ================================================ FILE: yolov5/src/logging.h ================================================ /* * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
*/ #ifndef TENSORRT_LOGGING_H #define TENSORRT_LOGGING_H #include "NvInferRuntimeCommon.h" #include #include #include #include #include #include #include #include "macros.h" using Severity = nvinfer1::ILogger::Severity; class LogStreamConsumerBuffer : public std::stringbuf { public: LogStreamConsumerBuffer(std::ostream& stream, const std::string& prefix, bool shouldLog) : mOutput(stream) , mPrefix(prefix) , mShouldLog(shouldLog) { } LogStreamConsumerBuffer(LogStreamConsumerBuffer&& other) : mOutput(other.mOutput) { } ~LogStreamConsumerBuffer() { // std::streambuf::pbase() gives a pointer to the beginning of the buffered part of the output sequence // std::streambuf::pptr() gives a pointer to the current position of the output sequence // if the pointer to the beginning is not equal to the pointer to the current position, // call putOutput() to log the output to the stream if (pbase() != pptr()) { putOutput(); } } // synchronizes the stream buffer and returns 0 on success // synchronizing the stream buffer consists of inserting the buffer contents into the stream, // resetting the buffer and flushing the stream virtual int sync() { putOutput(); return 0; } void putOutput() { if (mShouldLog) { // prepend timestamp std::time_t timestamp = std::time(nullptr); tm* tm_local = std::localtime(×tamp); std::cout << "["; std::cout << std::setw(2) << std::setfill('0') << 1 + tm_local->tm_mon << "/"; std::cout << std::setw(2) << std::setfill('0') << tm_local->tm_mday << "/"; std::cout << std::setw(4) << std::setfill('0') << 1900 + tm_local->tm_year << "-"; std::cout << std::setw(2) << std::setfill('0') << tm_local->tm_hour << ":"; std::cout << std::setw(2) << std::setfill('0') << tm_local->tm_min << ":"; std::cout << std::setw(2) << std::setfill('0') << tm_local->tm_sec << "] "; // std::stringbuf::str() gets the string contents of the buffer // insert the buffer contents pre-appended by the appropriate prefix into the stream mOutput << mPrefix << str(); // set the buffer to 
empty str(""); // flush the stream mOutput.flush(); } } void setShouldLog(bool shouldLog) { mShouldLog = shouldLog; } private: std::ostream& mOutput; std::string mPrefix; bool mShouldLog; }; //! //! \class LogStreamConsumerBase //! \brief Convenience object used to initialize LogStreamConsumerBuffer before std::ostream in LogStreamConsumer //! class LogStreamConsumerBase { public: LogStreamConsumerBase(std::ostream& stream, const std::string& prefix, bool shouldLog) : mBuffer(stream, prefix, shouldLog) { } protected: LogStreamConsumerBuffer mBuffer; }; //! //! \class LogStreamConsumer //! \brief Convenience object used to facilitate use of C++ stream syntax when logging messages. //! Order of base classes is LogStreamConsumerBase and then std::ostream. //! This is because the LogStreamConsumerBase class is used to initialize the LogStreamConsumerBuffer member field //! in LogStreamConsumer and then the address of the buffer is passed to std::ostream. //! This is necessary to prevent the address of an uninitialized buffer from being passed to std::ostream. //! Please do not change the order of the parent classes. //! class LogStreamConsumer : protected LogStreamConsumerBase, public std::ostream { public: //! \brief Creates a LogStreamConsumer which logs messages with level severity. //! Reportable severity determines if the messages are severe enough to be logged. 
LogStreamConsumer(Severity reportableSeverity, Severity severity) : LogStreamConsumerBase(severityOstream(severity), severityPrefix(severity), severity <= reportableSeverity) , std::ostream(&mBuffer) // links the stream buffer with the stream , mShouldLog(severity <= reportableSeverity) , mSeverity(severity) { } LogStreamConsumer(LogStreamConsumer&& other) : LogStreamConsumerBase(severityOstream(other.mSeverity), severityPrefix(other.mSeverity), other.mShouldLog) , std::ostream(&mBuffer) // links the stream buffer with the stream , mShouldLog(other.mShouldLog) , mSeverity(other.mSeverity) { } void setReportableSeverity(Severity reportableSeverity) { mShouldLog = mSeverity <= reportableSeverity; mBuffer.setShouldLog(mShouldLog); } private: static std::ostream& severityOstream(Severity severity) { return severity >= Severity::kINFO ? std::cout : std::cerr; } static std::string severityPrefix(Severity severity) { switch (severity) { case Severity::kINTERNAL_ERROR: return "[F] "; case Severity::kERROR: return "[E] "; case Severity::kWARNING: return "[W] "; case Severity::kINFO: return "[I] "; case Severity::kVERBOSE: return "[V] "; default: assert(0); return ""; } } bool mShouldLog; Severity mSeverity; }; //! \class Logger //! //! \brief Class which manages logging of TensorRT tools and samples //! //! \details This class provides a common interface for TensorRT tools and samples to log information to the console, //! and supports logging two types of messages: //! //! - Debugging messages with an associated severity (info, warning, error, or internal error/fatal) //! - Test pass/fail messages //! //! The advantage of having all samples use this class for logging as opposed to emitting directly to stdout/stderr is //! that the logic for controlling the verbosity and formatting of sample output is centralized in one location. //! //! In the future, this class could be extended to support dumping test results to a file in some standard format //! 
(for example, JUnit XML), and providing additional metadata (e.g. timing the duration of a test run). //! //! TODO: For backwards compatibility with existing samples, this class inherits directly from the nvinfer1::ILogger //! interface, which is problematic since there isn't a clean separation between messages coming from the TensorRT //! library and messages coming from the sample. //! //! In the future (once all samples are updated to use Logger::getTRTLogger() to access the ILogger) we can refactor the //! class to eliminate the inheritance and instead make the nvinfer1::ILogger implementation a member of the Logger //! object. class Logger : public nvinfer1::ILogger { public: Logger(Severity severity = Severity::kWARNING) : mReportableSeverity(severity) { } //! //! \enum TestResult //! \brief Represents the state of a given test //! enum class TestResult { kRUNNING, //!< The test is running kPASSED, //!< The test passed kFAILED, //!< The test failed kWAIVED //!< The test was waived }; //! //! \brief Forward-compatible method for retrieving the nvinfer::ILogger associated with this Logger //! \return The nvinfer1::ILogger associated with this Logger //! //! TODO Once all samples are updated to use this method to register the logger with TensorRT, //! we can eliminate the inheritance of Logger from ILogger //! nvinfer1::ILogger& getTRTLogger() { return *this; } //! //! \brief Implementation of the nvinfer1::ILogger::log() virtual method //! //! Note samples should not be calling this function directly; it will eventually go away once we eliminate the //! inheritance from nvinfer1::ILogger //! void log(Severity severity, const char* msg) TRT_NOEXCEPT override { LogStreamConsumer(mReportableSeverity, severity) << "[TRT] " << std::string(msg) << std::endl; } //! //! \brief Method for controlling the verbosity of logging output //! //! \param severity The logger will only emit messages that have severity of this level or higher. //! 
void setReportableSeverity(Severity severity) { mReportableSeverity = severity; } //! //! \brief Opaque handle that holds logging information for a particular test //! //! This object is an opaque handle to information used by the Logger to print test results. //! The sample must call Logger::defineTest() in order to obtain a TestAtom that can be used //! with Logger::reportTest{Start,End}(). //! class TestAtom { public: TestAtom(TestAtom&&) = default; private: friend class Logger; TestAtom(bool started, const std::string& name, const std::string& cmdline) : mStarted(started) , mName(name) , mCmdline(cmdline) { } bool mStarted; std::string mName; std::string mCmdline; }; //! //! \brief Define a test for logging //! //! \param[in] name The name of the test. This should be a string starting with //! "TensorRT" and containing dot-separated strings containing //! the characters [A-Za-z0-9_]. //! For example, "TensorRT.sample_googlenet" //! \param[in] cmdline The command line used to reproduce the test // //! \return a TestAtom that can be used in Logger::reportTest{Start,End}(). //! static TestAtom defineTest(const std::string& name, const std::string& cmdline) { return TestAtom(false, name, cmdline); } //! //! \brief A convenience overloaded version of defineTest() that accepts an array of command-line arguments //! as input //! //! \param[in] name The name of the test //! \param[in] argc The number of command-line arguments //! \param[in] argv The array of command-line arguments (given as C strings) //! //! \return a TestAtom that can be used in Logger::reportTest{Start,End}(). static TestAtom defineTest(const std::string& name, int argc, char const* const* argv) { auto cmdline = genCmdlineString(argc, argv); return defineTest(name, cmdline); } //! //! \brief Report that a test has started. //! //! \pre reportTestStart() has not been called yet for the given testAtom //! //! \param[in] testAtom The handle to the test that has started //! 
static void reportTestStart(TestAtom& testAtom) { reportTestResult(testAtom, TestResult::kRUNNING); assert(!testAtom.mStarted); testAtom.mStarted = true; } //! //! \brief Report that a test has ended. //! //! \pre reportTestStart() has been called for the given testAtom //! //! \param[in] testAtom The handle to the test that has ended //! \param[in] result The result of the test. Should be one of TestResult::kPASSED, //! TestResult::kFAILED, TestResult::kWAIVED //! static void reportTestEnd(const TestAtom& testAtom, TestResult result) { assert(result != TestResult::kRUNNING); assert(testAtom.mStarted); reportTestResult(testAtom, result); } static int reportPass(const TestAtom& testAtom) { reportTestEnd(testAtom, TestResult::kPASSED); return EXIT_SUCCESS; } static int reportFail(const TestAtom& testAtom) { reportTestEnd(testAtom, TestResult::kFAILED); return EXIT_FAILURE; } static int reportWaive(const TestAtom& testAtom) { reportTestEnd(testAtom, TestResult::kWAIVED); return EXIT_SUCCESS; } static int reportTest(const TestAtom& testAtom, bool pass) { return pass ? reportPass(testAtom) : reportFail(testAtom); } Severity getReportableSeverity() const { return mReportableSeverity; } private: //! //! \brief returns an appropriate string for prefixing a log message with the given severity //! static const char* severityPrefix(Severity severity) { switch (severity) { case Severity::kINTERNAL_ERROR: return "[F] "; case Severity::kERROR: return "[E] "; case Severity::kWARNING: return "[W] "; case Severity::kINFO: return "[I] "; case Severity::kVERBOSE: return "[V] "; default: assert(0); return ""; } } //! //! \brief returns an appropriate string for prefixing a test result message with the given result //! 
static const char* testResultString(TestResult result) { switch (result) { case TestResult::kRUNNING: return "RUNNING"; case TestResult::kPASSED: return "PASSED"; case TestResult::kFAILED: return "FAILED"; case TestResult::kWAIVED: return "WAIVED"; default: assert(0); return ""; } } //! //! \brief returns an appropriate output stream (cout or cerr) to use with the given severity //! static std::ostream& severityOstream(Severity severity) { return severity >= Severity::kINFO ? std::cout : std::cerr; } //! //! \brief method that implements logging test results //! static void reportTestResult(const TestAtom& testAtom, TestResult result) { severityOstream(Severity::kINFO) << "&&&& " << testResultString(result) << " " << testAtom.mName << " # " << testAtom.mCmdline << std::endl; } //! //! \brief generate a command line string from the given (argc, argv) values //! static std::string genCmdlineString(int argc, char const* const* argv) { std::stringstream ss; for (int i = 0; i < argc; i++) { if (i > 0) ss << " "; ss << argv[i]; } return ss.str(); } Severity mReportableSeverity; }; namespace { //! //! \brief produces a LogStreamConsumer object that can be used to log messages of severity kVERBOSE //! //! Example usage: //! //! LOG_VERBOSE(logger) << "hello world" << std::endl; //! inline LogStreamConsumer LOG_VERBOSE(const Logger& logger) { return LogStreamConsumer(logger.getReportableSeverity(), Severity::kVERBOSE); } //! //! \brief produces a LogStreamConsumer object that can be used to log messages of severity kINFO //! //! Example usage: //! //! LOG_INFO(logger) << "hello world" << std::endl; //! inline LogStreamConsumer LOG_INFO(const Logger& logger) { return LogStreamConsumer(logger.getReportableSeverity(), Severity::kINFO); } //! //! \brief produces a LogStreamConsumer object that can be used to log messages of severity kWARNING //! //! Example usage: //! //! LOG_WARN(logger) << "hello world" << std::endl; //! 
inline LogStreamConsumer LOG_WARN(const Logger& logger) { return LogStreamConsumer(logger.getReportableSeverity(), Severity::kWARNING); } //! //! \brief produces a LogStreamConsumer object that can be used to log messages of severity kERROR //! //! Example usage: //! //! LOG_ERROR(logger) << "hello world" << std::endl; //! inline LogStreamConsumer LOG_ERROR(const Logger& logger) { return LogStreamConsumer(logger.getReportableSeverity(), Severity::kERROR); } //! //! \brief produces a LogStreamConsumer object that can be used to log messages of severity kINTERNAL_ERROR // ("fatal" severity) //! //! Example usage: //! //! LOG_FATAL(logger) << "hello world" << std::endl; //! inline LogStreamConsumer LOG_FATAL(const Logger& logger) { return LogStreamConsumer(logger.getReportableSeverity(), Severity::kINTERNAL_ERROR); } } // anonymous namespace #endif // TENSORRT_LOGGING_H ================================================ FILE: yolov5/src/macros.h ================================================ #ifndef __MACROS_H #define __MACROS_H #include #ifdef API_EXPORTS #if defined(_MSC_VER) #define API __declspec(dllexport) #else #define API __attribute__((visibility("default"))) #endif #else #if defined(_MSC_VER) #define API __declspec(dllimport) #else #define API #endif #endif // API_EXPORTS #if NV_TENSORRT_MAJOR >= 8 #define TRT_NOEXCEPT noexcept #define TRT_CONST_ENQUEUE const #else #define TRT_NOEXCEPT #define TRT_CONST_ENQUEUE #endif #endif // __MACROS_H ================================================ FILE: yolov5/src/model.cpp ================================================ #include "model.h" #include "calibrator.h" #include "config.h" #include "yololayer.h" #include #include #include #include #include #include using namespace nvinfer1; // TensorRT weight files have a simple space delimited format: // [type] [size] static std::map loadWeights(const std::string file) { std::cout << "Loading weights: " << file << std::endl; std::map weightMap; // Open weights file 
std::ifstream input(file); assert(input.is_open() && "Unable to load weight file. please check if the .wts file path is right!!!!!!"); // Read number of weight blobs int32_t count; input >> count; assert(count > 0 && "Invalid weight map file."); while (count--) { Weights wt{ DataType::kFLOAT, nullptr, 0 }; uint32_t size; // Read name and type of blob std::string name; input >> name >> std::dec >> size; wt.type = DataType::kFLOAT; // Load blob uint32_t* val = reinterpret_cast(malloc(sizeof(val) * size)); for (uint32_t x = 0, y = size; x < y; ++x) { input >> std::hex >> val[x]; } wt.values = val; wt.count = size; weightMap[name] = wt; } return weightMap; } static int get_width(int x, float gw, int divisor = 8) { return int(ceil((x * gw) / divisor)) * divisor; } static int get_depth(int x, float gd) { if (x == 1) return 1; int r = round(x * gd); if (x * gd - int(x * gd) == 0.5 && (int(x * gd) % 2) == 0) { --r; } return std::max(r, 1); } static IScaleLayer* addBatchNorm2d(INetworkDefinition *network, std::map& weightMap, ITensor& input, std::string lname, float eps) { float* gamma = (float*)weightMap[lname + ".weight"].values; float* beta = (float*)weightMap[lname + ".bias"].values; float* mean = (float*)weightMap[lname + ".running_mean"].values; float* var = (float*)weightMap[lname + ".running_var"].values; int len = weightMap[lname + ".running_var"].count; float* scval = reinterpret_cast(malloc(sizeof(float) * len)); for (int i = 0; i < len; i++) { scval[i] = gamma[i] / sqrt(var[i] + eps); } Weights scale{ DataType::kFLOAT, scval, len }; float* shval = reinterpret_cast(malloc(sizeof(float) * len)); for (int i = 0; i < len; i++) { shval[i] = beta[i] - mean[i] * gamma[i] / sqrt(var[i] + eps); } Weights shift{ DataType::kFLOAT, shval, len }; float* pval = reinterpret_cast(malloc(sizeof(float) * len)); for (int i = 0; i < len; i++) { pval[i] = 1.0; } Weights power{ DataType::kFLOAT, pval, len }; weightMap[lname + ".scale"] = scale; weightMap[lname + ".shift"] = shift; 
weightMap[lname + ".power"] = power; IScaleLayer* scale_1 = network->addScale(input, ScaleMode::kCHANNEL, shift, scale, power); assert(scale_1); return scale_1; } static ILayer* convBlock(INetworkDefinition *network, std::map& weightMap, ITensor& input, int outch, int ksize, int s, int g, std::string lname) { Weights emptywts{ DataType::kFLOAT, nullptr, 0 }; int p = ksize / 3; IConvolutionLayer* conv1 = network->addConvolutionNd(input, outch, DimsHW{ ksize, ksize }, weightMap[lname + ".conv.weight"], emptywts); assert(conv1); conv1->setStrideNd(DimsHW{ s, s }); conv1->setPaddingNd(DimsHW{ p, p }); conv1->setNbGroups(g); conv1->setName((lname + ".conv").c_str()); IScaleLayer* bn1 = addBatchNorm2d(network, weightMap, *conv1->getOutput(0), lname + ".bn", 1e-3); // silu = x * sigmoid auto sig = network->addActivation(*bn1->getOutput(0), ActivationType::kSIGMOID); assert(sig); auto ew = network->addElementWise(*bn1->getOutput(0), *sig->getOutput(0), ElementWiseOperation::kPROD); assert(ew); return ew; } static ILayer* focus(INetworkDefinition *network, std::map& weightMap, ITensor& input, int inch, int outch, int ksize, std::string lname) { ISliceLayer* s1 = network->addSlice(input, Dims3{ 0, 0, 0 }, Dims3{ inch, kInputH / 2, kInputW / 2 }, Dims3{ 1, 2, 2 }); ISliceLayer* s2 = network->addSlice(input, Dims3{ 0, 1, 0 }, Dims3{ inch, kInputH / 2, kInputW / 2 }, Dims3{ 1, 2, 2 }); ISliceLayer* s3 = network->addSlice(input, Dims3{ 0, 0, 1 }, Dims3{ inch, kInputH / 2, kInputW / 2 }, Dims3{ 1, 2, 2 }); ISliceLayer* s4 = network->addSlice(input, Dims3{ 0, 1, 1 }, Dims3{ inch, kInputH / 2, kInputW / 2 }, Dims3{ 1, 2, 2 }); ITensor* inputTensors[] = { s1->getOutput(0), s2->getOutput(0), s3->getOutput(0), s4->getOutput(0) }; auto cat = network->addConcatenation(inputTensors, 4); auto conv = convBlock(network, weightMap, *cat->getOutput(0), outch, ksize, 1, 1, lname + ".conv"); return conv; } static ILayer* bottleneck(INetworkDefinition *network, std::map& weightMap, ITensor& 
input, int c1, int c2, bool shortcut, int g, float e, std::string lname) { auto cv1 = convBlock(network, weightMap, input, (int)((float)c2 * e), 1, 1, 1, lname + ".cv1"); auto cv2 = convBlock(network, weightMap, *cv1->getOutput(0), c2, 3, 1, g, lname + ".cv2"); if (shortcut && c1 == c2) { auto ew = network->addElementWise(input, *cv2->getOutput(0), ElementWiseOperation::kSUM); return ew; } return cv2; } static ILayer* bottleneckCSP(INetworkDefinition *network, std::map& weightMap, ITensor& input, int c1, int c2, int n, bool shortcut, int g, float e, std::string lname) { Weights emptywts{ DataType::kFLOAT, nullptr, 0 }; int c_ = (int)((float)c2 * e); auto cv1 = convBlock(network, weightMap, input, c_, 1, 1, 1, lname + ".cv1"); auto cv2 = network->addConvolutionNd(input, c_, DimsHW{ 1, 1 }, weightMap[lname + ".cv2.weight"], emptywts); ITensor* y1 = cv1->getOutput(0); for (int i = 0; i < n; i++) { auto b = bottleneck(network, weightMap, *y1, c_, c_, shortcut, g, 1.0, lname + ".m." + std::to_string(i)); y1 = b->getOutput(0); } auto cv3 = network->addConvolutionNd(*y1, c_, DimsHW{ 1, 1 }, weightMap[lname + ".cv3.weight"], emptywts); ITensor* inputTensors[] = { cv3->getOutput(0), cv2->getOutput(0) }; auto cat = network->addConcatenation(inputTensors, 2); IScaleLayer* bn = addBatchNorm2d(network, weightMap, *cat->getOutput(0), lname + ".bn", 1e-4); auto lr = network->addActivation(*bn->getOutput(0), ActivationType::kLEAKY_RELU); lr->setAlpha(0.1); auto cv4 = convBlock(network, weightMap, *lr->getOutput(0), c2, 1, 1, 1, lname + ".cv4"); return cv4; } static ILayer* C3(INetworkDefinition *network, std::map& weightMap, ITensor& input, int c1, int c2, int n, bool shortcut, int g, float e, std::string lname) { int c_ = (int)((float)c2 * e); auto cv1 = convBlock(network, weightMap, input, c_, 1, 1, 1, lname + ".cv1"); auto cv2 = convBlock(network, weightMap, input, c_, 1, 1, 1, lname + ".cv2"); ITensor *y1 = cv1->getOutput(0); for (int i = 0; i < n; i++) { auto b = 
bottleneck(network, weightMap, *y1, c_, c_, shortcut, g, 1.0, lname + ".m." + std::to_string(i)); y1 = b->getOutput(0); } ITensor* inputTensors[] = { y1, cv2->getOutput(0) }; auto cat = network->addConcatenation(inputTensors, 2); auto cv3 = convBlock(network, weightMap, *cat->getOutput(0), c2, 1, 1, 1, lname + ".cv3"); return cv3; } static ILayer* SPP(INetworkDefinition *network, std::map& weightMap, ITensor& input, int c1, int c2, int k1, int k2, int k3, std::string lname) { int c_ = c1 / 2; auto cv1 = convBlock(network, weightMap, input, c_, 1, 1, 1, lname + ".cv1"); auto pool1 = network->addPoolingNd(*cv1->getOutput(0), PoolingType::kMAX, DimsHW{ k1, k1 }); pool1->setPaddingNd(DimsHW{ k1 / 2, k1 / 2 }); pool1->setStrideNd(DimsHW{ 1, 1 }); auto pool2 = network->addPoolingNd(*cv1->getOutput(0), PoolingType::kMAX, DimsHW{ k2, k2 }); pool2->setPaddingNd(DimsHW{ k2 / 2, k2 / 2 }); pool2->setStrideNd(DimsHW{ 1, 1 }); auto pool3 = network->addPoolingNd(*cv1->getOutput(0), PoolingType::kMAX, DimsHW{ k3, k3 }); pool3->setPaddingNd(DimsHW{ k3 / 2, k3 / 2 }); pool3->setStrideNd(DimsHW{ 1, 1 }); ITensor* inputTensors[] = { cv1->getOutput(0), pool1->getOutput(0), pool2->getOutput(0), pool3->getOutput(0) }; auto cat = network->addConcatenation(inputTensors, 4); auto cv2 = convBlock(network, weightMap, *cat->getOutput(0), c2, 1, 1, 1, lname + ".cv2"); return cv2; } static ILayer* SPPF(INetworkDefinition *network, std::map& weightMap, ITensor& input, int c1, int c2, int k, std::string lname) { int c_ = c1 / 2; auto cv1 = convBlock(network, weightMap, input, c_, 1, 1, 1, lname + ".cv1"); auto pool1 = network->addPoolingNd(*cv1->getOutput(0), PoolingType::kMAX, DimsHW{ k, k }); pool1->setPaddingNd(DimsHW{ k / 2, k / 2 }); pool1->setStrideNd(DimsHW{ 1, 1 }); auto pool2 = network->addPoolingNd(*pool1->getOutput(0), PoolingType::kMAX, DimsHW{ k, k }); pool2->setPaddingNd(DimsHW{ k / 2, k / 2 }); pool2->setStrideNd(DimsHW{ 1, 1 }); auto pool3 = 
network->addPoolingNd(*pool2->getOutput(0), PoolingType::kMAX, DimsHW{ k, k }); pool3->setPaddingNd(DimsHW{ k / 2, k / 2 }); pool3->setStrideNd(DimsHW{ 1, 1 }); ITensor* inputTensors[] = { cv1->getOutput(0), pool1->getOutput(0), pool2->getOutput(0), pool3->getOutput(0) }; auto cat = network->addConcatenation(inputTensors, 4); auto cv2 = convBlock(network, weightMap, *cat->getOutput(0), c2, 1, 1, 1, lname + ".cv2"); return cv2; } static ILayer* Proto(INetworkDefinition* network, std::map& weightMap, ITensor& input, int c_, int c2, std::string lname) { auto cv1 = convBlock(network, weightMap, input, c_, 3, 1, 1, lname + ".cv1"); auto upsample = network->addResize(*cv1->getOutput(0)); assert(upsample); upsample->setResizeMode(ResizeMode::kNEAREST); const float scales[] = {1, 2, 2}; upsample->setScales(scales, 3); auto cv2 = convBlock(network, weightMap, *upsample->getOutput(0), c_, 3, 1, 1, lname + ".cv2"); auto cv3 = convBlock(network, weightMap, *cv2->getOutput(0), c2, 1, 1, 1, lname + ".cv3"); assert(cv3); return cv3; } static std::vector> getAnchors(std::map& weightMap, std::string lname) { std::vector> anchors; Weights wts = weightMap[lname + ".anchor_grid"]; int anchor_len = kNumAnchor * 2; for (int i = 0; i < wts.count / anchor_len; i++) { auto *p = (const float*)wts.values + i * anchor_len; std::vector anchor(p, p + anchor_len); anchors.push_back(anchor); } return anchors; } static IPluginV2Layer* addYoLoLayer(INetworkDefinition *network, std::map& weightMap, std::string lname, std::vector dets, bool is_segmentation = false) { auto creator = getPluginRegistry()->getPluginCreator("YoloLayer_TRT", "1"); auto anchors = getAnchors(weightMap, lname); PluginField plugin_fields[2]; int netinfo[5] = {kNumClass, kInputW, kInputH, kMaxNumOutputBbox, (int)is_segmentation}; plugin_fields[0].data = netinfo; plugin_fields[0].length = 5; plugin_fields[0].name = "netinfo"; plugin_fields[0].type = PluginFieldType::kFLOAT32; //load strides from Detect layer 
assert(weightMap.find(lname + ".strides") != weightMap.end() && "Not found `strides`, please check gen_wts.py!!!"); Weights strides = weightMap[lname + ".strides"]; auto *p = (const float*)(strides.values); std::vector scales(p, p + strides.count); std::vector kernels; for (size_t i = 0; i < anchors.size(); i++) { YoloKernel kernel; kernel.width = kInputW / scales[i]; kernel.height = kInputH / scales[i]; memcpy(kernel.anchors, &anchors[i][0], anchors[i].size() * sizeof(float)); kernels.push_back(kernel); } plugin_fields[1].data = &kernels[0]; plugin_fields[1].length = kernels.size(); plugin_fields[1].name = "kernels"; plugin_fields[1].type = PluginFieldType::kFLOAT32; PluginFieldCollection plugin_data; plugin_data.nbFields = 2; plugin_data.fields = plugin_fields; IPluginV2 *plugin_obj = creator->createPlugin("yololayer", &plugin_data); std::vector input_tensors; for (auto det: dets) { input_tensors.push_back(det->getOutput(0)); } auto yolo = network->addPluginV2(&input_tensors[0], input_tensors.size(), *plugin_obj); return yolo; } ICudaEngine* build_det_engine(unsigned int maxBatchSize, IBuilder* builder, IBuilderConfig* config, DataType dt, float& gd, float& gw, std::string& wts_name) { INetworkDefinition* network = builder->createNetworkV2(0U); // Create input tensor of shape {3, kInputH, kInputW} ITensor* data = network->addInput(kInputTensorName, dt, Dims3{ 3, kInputH, kInputW }); assert(data); std::map weightMap = loadWeights(wts_name); // Backbone auto conv0 = convBlock(network, weightMap, *data, get_width(64, gw), 6, 2, 1, "model.0"); assert(conv0); auto conv1 = convBlock(network, weightMap, *conv0->getOutput(0), get_width(128, gw), 3, 2, 1, "model.1"); auto bottleneck_CSP2 = C3(network, weightMap, *conv1->getOutput(0), get_width(128, gw), get_width(128, gw), get_depth(3, gd), true, 1, 0.5, "model.2"); auto conv3 = convBlock(network, weightMap, *bottleneck_CSP2->getOutput(0), get_width(256, gw), 3, 2, 1, "model.3"); auto bottleneck_csp4 = C3(network, 
weightMap, *conv3->getOutput(0), get_width(256, gw), get_width(256, gw), get_depth(6, gd), true, 1, 0.5, "model.4"); auto conv5 = convBlock(network, weightMap, *bottleneck_csp4->getOutput(0), get_width(512, gw), 3, 2, 1, "model.5"); auto bottleneck_csp6 = C3(network, weightMap, *conv5->getOutput(0), get_width(512, gw), get_width(512, gw), get_depth(9, gd), true, 1, 0.5, "model.6"); auto conv7 = convBlock(network, weightMap, *bottleneck_csp6->getOutput(0), get_width(1024, gw), 3, 2, 1, "model.7"); auto bottleneck_csp8 = C3(network, weightMap, *conv7->getOutput(0), get_width(1024, gw), get_width(1024, gw), get_depth(3, gd), true, 1, 0.5, "model.8"); auto spp9 = SPPF(network, weightMap, *bottleneck_csp8->getOutput(0), get_width(1024, gw), get_width(1024, gw), 5, "model.9"); // Head auto conv10 = convBlock(network, weightMap, *spp9->getOutput(0), get_width(512, gw), 1, 1, 1, "model.10"); auto upsample11 = network->addResize(*conv10->getOutput(0)); assert(upsample11); upsample11->setResizeMode(ResizeMode::kNEAREST); upsample11->setOutputDimensions(bottleneck_csp6->getOutput(0)->getDimensions()); ITensor* inputTensors12[] = { upsample11->getOutput(0), bottleneck_csp6->getOutput(0) }; auto cat12 = network->addConcatenation(inputTensors12, 2); auto bottleneck_csp13 = C3(network, weightMap, *cat12->getOutput(0), get_width(1024, gw), get_width(512, gw), get_depth(3, gd), false, 1, 0.5, "model.13"); auto conv14 = convBlock(network, weightMap, *bottleneck_csp13->getOutput(0), get_width(256, gw), 1, 1, 1, "model.14"); auto upsample15 = network->addResize(*conv14->getOutput(0)); assert(upsample15); upsample15->setResizeMode(ResizeMode::kNEAREST); upsample15->setOutputDimensions(bottleneck_csp4->getOutput(0)->getDimensions()); ITensor* inputTensors16[] = { upsample15->getOutput(0), bottleneck_csp4->getOutput(0) }; auto cat16 = network->addConcatenation(inputTensors16, 2); auto bottleneck_csp17 = C3(network, weightMap, *cat16->getOutput(0), get_width(512, gw), get_width(256, gw), 
get_depth(3, gd), false, 1, 0.5, "model.17"); // Detect IConvolutionLayer* det0 = network->addConvolutionNd(*bottleneck_csp17->getOutput(0), kNumAnchor * (kNumClass + 5), DimsHW{ 1, 1 }, weightMap["model.24.m.0.weight"], weightMap["model.24.m.0.bias"]); auto conv18 = convBlock(network, weightMap, *bottleneck_csp17->getOutput(0), get_width(256, gw), 3, 2, 1, "model.18"); ITensor* inputTensors19[] = { conv18->getOutput(0), conv14->getOutput(0) }; auto cat19 = network->addConcatenation(inputTensors19, 2); auto bottleneck_csp20 = C3(network, weightMap, *cat19->getOutput(0), get_width(512, gw), get_width(512, gw), get_depth(3, gd), false, 1, 0.5, "model.20"); IConvolutionLayer* det1 = network->addConvolutionNd(*bottleneck_csp20->getOutput(0), kNumAnchor * (kNumClass + 5), DimsHW{ 1, 1 }, weightMap["model.24.m.1.weight"], weightMap["model.24.m.1.bias"]); auto conv21 = convBlock(network, weightMap, *bottleneck_csp20->getOutput(0), get_width(512, gw), 3, 2, 1, "model.21"); ITensor* inputTensors22[] = { conv21->getOutput(0), conv10->getOutput(0) }; auto cat22 = network->addConcatenation(inputTensors22, 2); auto bottleneck_csp23 = C3(network, weightMap, *cat22->getOutput(0), get_width(1024, gw), get_width(1024, gw), get_depth(3, gd), false, 1, 0.5, "model.23"); IConvolutionLayer* det2 = network->addConvolutionNd(*bottleneck_csp23->getOutput(0), kNumAnchor * (kNumClass + 5), DimsHW{ 1, 1 }, weightMap["model.24.m.2.weight"], weightMap["model.24.m.2.bias"]); auto yolo = addYoLoLayer(network, weightMap, "model.24", std::vector{det0, det1, det2}); yolo->getOutput(0)->setName(kOutputTensorName); network->markOutput(*yolo->getOutput(0)); // Engine config builder->setMaxBatchSize(maxBatchSize); config->setMaxWorkspaceSize(16 * (1 << 20)); // 16MB #if defined(USE_FP16) config->setFlag(BuilderFlag::kFP16); #elif defined(USE_INT8) std::cout << "Your platform support int8: " << (builder->platformHasFastInt8() ? 
"true" : "false") << std::endl; assert(builder->platformHasFastInt8()); config->setFlag(BuilderFlag::kINT8); Int8EntropyCalibrator2* calibrator = new Int8EntropyCalibrator2(1, kInputW, kInputH, "./coco_calib/", "int8calib.table", kInputTensorName); config->setInt8Calibrator(calibrator); #endif std::cout << "Building engine, please wait for a while..." << std::endl; ICudaEngine* engine = builder->buildEngineWithConfig(*network, *config); std::cout << "Build engine successfully!" << std::endl; // Don't need the network any more network->destroy(); // Release host memory for (auto& mem : weightMap) { free((void*)(mem.second.values)); } return engine; } ICudaEngine* build_det_p6_engine(unsigned int maxBatchSize, IBuilder* builder, IBuilderConfig* config, DataType dt, float& gd, float& gw, std::string& wts_name) { INetworkDefinition* network = builder->createNetworkV2(0U); // Create input tensor of shape {3, kInputH, kInputW} ITensor* data = network->addInput(kInputTensorName, dt, Dims3{ 3, kInputH, kInputW }); assert(data); std::map weightMap = loadWeights(wts_name); // Backbone auto conv0 = convBlock(network, weightMap, *data, get_width(64, gw), 6, 2, 1, "model.0"); auto conv1 = convBlock(network, weightMap, *conv0->getOutput(0), get_width(128, gw), 3, 2, 1, "model.1"); auto c3_2 = C3(network, weightMap, *conv1->getOutput(0), get_width(128, gw), get_width(128, gw), get_depth(3, gd), true, 1, 0.5, "model.2"); auto conv3 = convBlock(network, weightMap, *c3_2->getOutput(0), get_width(256, gw), 3, 2, 1, "model.3"); auto c3_4 = C3(network, weightMap, *conv3->getOutput(0), get_width(256, gw), get_width(256, gw), get_depth(6, gd), true, 1, 0.5, "model.4"); auto conv5 = convBlock(network, weightMap, *c3_4->getOutput(0), get_width(512, gw), 3, 2, 1, "model.5"); auto c3_6 = C3(network, weightMap, *conv5->getOutput(0), get_width(512, gw), get_width(512, gw), get_depth(9, gd), true, 1, 0.5, "model.6"); auto conv7 = convBlock(network, weightMap, *c3_6->getOutput(0), get_width(768, 
gw), 3, 2, 1, "model.7"); auto c3_8 = C3(network, weightMap, *conv7->getOutput(0), get_width(768, gw), get_width(768, gw), get_depth(3, gd), true, 1, 0.5, "model.8"); auto conv9 = convBlock(network, weightMap, *c3_8->getOutput(0), get_width(1024, gw), 3, 2, 1, "model.9"); auto c3_10 = C3(network, weightMap, *conv9->getOutput(0), get_width(1024, gw), get_width(1024, gw), get_depth(3, gd), true, 1, 0.5, "model.10"); auto sppf11 = SPPF(network, weightMap, *c3_10->getOutput(0), get_width(1024, gw), get_width(1024, gw), 5, "model.11"); // Head auto conv12 = convBlock(network, weightMap, *sppf11->getOutput(0), get_width(768, gw), 1, 1, 1, "model.12"); auto upsample13 = network->addResize(*conv12->getOutput(0)); assert(upsample13); upsample13->setResizeMode(ResizeMode::kNEAREST); upsample13->setOutputDimensions(c3_8->getOutput(0)->getDimensions()); ITensor* inputTensors14[] = { upsample13->getOutput(0), c3_8->getOutput(0) }; auto cat14 = network->addConcatenation(inputTensors14, 2); auto c3_15 = C3(network, weightMap, *cat14->getOutput(0), get_width(1536, gw), get_width(768, gw), get_depth(3, gd), false, 1, 0.5, "model.15"); auto conv16 = convBlock(network, weightMap, *c3_15->getOutput(0), get_width(512, gw), 1, 1, 1, "model.16"); auto upsample17 = network->addResize(*conv16->getOutput(0)); assert(upsample17); upsample17->setResizeMode(ResizeMode::kNEAREST); upsample17->setOutputDimensions(c3_6->getOutput(0)->getDimensions()); ITensor* inputTensors18[] = { upsample17->getOutput(0), c3_6->getOutput(0) }; auto cat18 = network->addConcatenation(inputTensors18, 2); auto c3_19 = C3(network, weightMap, *cat18->getOutput(0), get_width(1024, gw), get_width(512, gw), get_depth(3, gd), false, 1, 0.5, "model.19"); auto conv20 = convBlock(network, weightMap, *c3_19->getOutput(0), get_width(256, gw), 1, 1, 1, "model.20"); auto upsample21 = network->addResize(*conv20->getOutput(0)); assert(upsample21); upsample21->setResizeMode(ResizeMode::kNEAREST); 
upsample21->setOutputDimensions(c3_4->getOutput(0)->getDimensions()); ITensor* inputTensors21[] = { upsample21->getOutput(0), c3_4->getOutput(0) }; auto cat22 = network->addConcatenation(inputTensors21, 2); auto c3_23 = C3(network, weightMap, *cat22->getOutput(0), get_width(512, gw), get_width(256, gw), get_depth(3, gd), false, 1, 0.5, "model.23"); auto conv24 = convBlock(network, weightMap, *c3_23->getOutput(0), get_width(256, gw), 3, 2, 1, "model.24"); ITensor* inputTensors25[] = { conv24->getOutput(0), conv20->getOutput(0) }; auto cat25 = network->addConcatenation(inputTensors25, 2); auto c3_26 = C3(network, weightMap, *cat25->getOutput(0), get_width(1024, gw), get_width(512, gw), get_depth(3, gd), false, 1, 0.5, "model.26"); auto conv27 = convBlock(network, weightMap, *c3_26->getOutput(0), get_width(512, gw), 3, 2, 1, "model.27"); ITensor* inputTensors28[] = { conv27->getOutput(0), conv16->getOutput(0) }; auto cat28 = network->addConcatenation(inputTensors28, 2); auto c3_29 = C3(network, weightMap, *cat28->getOutput(0), get_width(1536, gw), get_width(768, gw), get_depth(3, gd), false, 1, 0.5, "model.29"); auto conv30 = convBlock(network, weightMap, *c3_29->getOutput(0), get_width(768, gw), 3, 2, 1, "model.30"); ITensor* inputTensors31[] = { conv30->getOutput(0), conv12->getOutput(0) }; auto cat31 = network->addConcatenation(inputTensors31, 2); auto c3_32 = C3(network, weightMap, *cat31->getOutput(0), get_width(2048, gw), get_width(1024, gw), get_depth(3, gd), false, 1, 0.5, "model.32"); // Detect IConvolutionLayer* det0 = network->addConvolutionNd(*c3_23->getOutput(0), kNumAnchor * (kNumClass + 5), DimsHW{ 1, 1 }, weightMap["model.33.m.0.weight"], weightMap["model.33.m.0.bias"]); IConvolutionLayer* det1 = network->addConvolutionNd(*c3_26->getOutput(0), kNumAnchor * (kNumClass + 5), DimsHW{ 1, 1 }, weightMap["model.33.m.1.weight"], weightMap["model.33.m.1.bias"]); IConvolutionLayer* det2 = network->addConvolutionNd(*c3_29->getOutput(0), kNumAnchor * (kNumClass + 
5), DimsHW{ 1, 1 }, weightMap["model.33.m.2.weight"], weightMap["model.33.m.2.bias"]); IConvolutionLayer* det3 = network->addConvolutionNd(*c3_32->getOutput(0), kNumAnchor * (kNumClass + 5), DimsHW{ 1, 1 }, weightMap["model.33.m.3.weight"], weightMap["model.33.m.3.bias"]); auto yolo = addYoLoLayer(network, weightMap, "model.33", std::vector{det0, det1, det2, det3}); yolo->getOutput(0)->setName(kOutputTensorName); network->markOutput(*yolo->getOutput(0)); // Engine config builder->setMaxBatchSize(maxBatchSize); config->setMaxWorkspaceSize(16 * (1 << 20)); // 16MB #if defined(USE_FP16) config->setFlag(BuilderFlag::kFP16); #elif defined(USE_INT8) std::cout << "Your platform support int8: " << (builder->platformHasFastInt8() ? "true" : "false") << std::endl; assert(builder->platformHasFastInt8()); config->setFlag(BuilderFlag::kINT8); Int8EntropyCalibrator2* calibrator = new Int8EntropyCalibrator2(1, kInputW, kInputH, "./coco_calib/", "int8calib.table", kInputTensorName); config->setInt8Calibrator(calibrator); #endif std::cout << "Building engine, please wait for a while..." << std::endl; ICudaEngine* engine = builder->buildEngineWithConfig(*network, *config); std::cout << "Build engine successfully!" 
<< std::endl; // Don't need the network any more network->destroy(); // Release host memory for (auto& mem : weightMap) { free((void*)(mem.second.values)); } return engine; } ICudaEngine* build_cls_engine(unsigned int maxBatchSize, IBuilder* builder, IBuilderConfig* config, DataType dt, float& gd, float& gw, std::string& wts_name) { INetworkDefinition* network = builder->createNetworkV2(0U); // Create input tensor ITensor* data = network->addInput(kInputTensorName, dt, Dims3{ 3, kClsInputH, kClsInputW }); assert(data); std::map weightMap = loadWeights(wts_name); // Backbone auto conv0 = convBlock(network, weightMap, *data, get_width(64, gw), 6, 2, 1, "model.0"); assert(conv0); auto conv1 = convBlock(network, weightMap, *conv0->getOutput(0), get_width(128, gw), 3, 2, 1, "model.1"); auto bottleneck_CSP2 = C3(network, weightMap, *conv1->getOutput(0), get_width(128, gw), get_width(128, gw), get_depth(3, gd), true, 1, 0.5, "model.2"); auto conv3 = convBlock(network, weightMap, *bottleneck_CSP2->getOutput(0), get_width(256, gw), 3, 2, 1, "model.3"); auto bottleneck_csp4 = C3(network, weightMap, *conv3->getOutput(0), get_width(256, gw), get_width(256, gw), get_depth(6, gd), true, 1, 0.5, "model.4"); auto conv5 = convBlock(network, weightMap, *bottleneck_csp4->getOutput(0), get_width(512, gw), 3, 2, 1, "model.5"); auto bottleneck_csp6 = C3(network, weightMap, *conv5->getOutput(0), get_width(512, gw), get_width(512, gw), get_depth(9, gd), true, 1, 0.5, "model.6"); auto conv7 = convBlock(network, weightMap, *bottleneck_csp6->getOutput(0), get_width(1024, gw), 3, 2, 1, "model.7"); auto bottleneck_csp8 = C3(network, weightMap, *conv7->getOutput(0), get_width(1024, gw), get_width(1024, gw), get_depth(3, gd), true, 1, 0.5, "model.8"); // Head auto conv_class = convBlock(network, weightMap, *bottleneck_csp8->getOutput(0), 1280, 1, 1, 1, "model.9.conv"); int k = kClsInputH / 32; IPoolingLayer* pool2 = network->addPoolingNd(*conv_class->getOutput(0), PoolingType::kAVERAGE, DimsHW{ 
k, k }); assert(pool2); IFullyConnectedLayer* yolo = network->addFullyConnected(*pool2->getOutput(0), kClsNumClass, weightMap["model.9.linear.weight"], weightMap["model.9.linear.bias"]); assert(yolo); yolo->getOutput(0)->setName(kOutputTensorName); network->markOutput(*yolo->getOutput(0)); // Engine config builder->setMaxBatchSize(maxBatchSize); config->setMaxWorkspaceSize(16 * (1 << 20)); // 16MB #if defined(USE_FP16) config->setFlag(BuilderFlag::kFP16); #elif defined(USE_INT8) std::cout << "Your platform support int8: " << (builder->platformHasFastInt8() ? "true" : "false") << std::endl; assert(builder->platformHasFastInt8()); config->setFlag(BuilderFlag::kINT8); Int8EntropyCalibrator2* calibrator = new Int8EntropyCalibrator2(1, kClsInputW, kClsInputW, "./coco_calib/", "int8calib.table", kInputTensorName); config->setInt8Calibrator(calibrator); #endif std::cout << "Building engine, please wait for a while..." << std::endl; ICudaEngine* engine = builder->buildEngineWithConfig(*network, *config); std::cout << "Build engine successfully!" 
<< std::endl; // Don't need the network any more network->destroy(); // Release host memory for (auto& mem : weightMap) { free((void*)(mem.second.values)); } return engine; } ICudaEngine* build_seg_engine(unsigned int maxBatchSize, IBuilder* builder, IBuilderConfig* config, DataType dt, float& gd, float& gw, std::string& wts_name) { INetworkDefinition* network = builder->createNetworkV2(0U); ITensor* data = network->addInput(kInputTensorName, dt, Dims3{ 3, kInputH, kInputW }); assert(data); std::map weightMap = loadWeights(wts_name); // Backbone auto conv0 = convBlock(network, weightMap, *data, get_width(64, gw), 6, 2, 1, "model.0"); assert(conv0); auto conv1 = convBlock(network, weightMap, *conv0->getOutput(0), get_width(128, gw), 3, 2, 1, "model.1"); auto bottleneck_CSP2 = C3(network, weightMap, *conv1->getOutput(0), get_width(128, gw), get_width(128, gw), get_depth(3, gd), true, 1, 0.5, "model.2"); auto conv3 = convBlock(network, weightMap, *bottleneck_CSP2->getOutput(0), get_width(256, gw), 3, 2, 1, "model.3"); auto bottleneck_csp4 = C3(network, weightMap, *conv3->getOutput(0), get_width(256, gw), get_width(256, gw), get_depth(6, gd), true, 1, 0.5, "model.4"); auto conv5 = convBlock(network, weightMap, *bottleneck_csp4->getOutput(0), get_width(512, gw), 3, 2, 1, "model.5"); auto bottleneck_csp6 = C3(network, weightMap, *conv5->getOutput(0), get_width(512, gw), get_width(512, gw), get_depth(9, gd), true, 1, 0.5, "model.6"); auto conv7 = convBlock(network, weightMap, *bottleneck_csp6->getOutput(0), get_width(1024, gw), 3, 2, 1, "model.7"); auto bottleneck_csp8 = C3(network, weightMap, *conv7->getOutput(0), get_width(1024, gw), get_width(1024, gw), get_depth(3, gd), true, 1, 0.5, "model.8"); auto spp9 = SPPF(network, weightMap, *bottleneck_csp8->getOutput(0), get_width(1024, gw), get_width(1024, gw), 5, "model.9"); // Head auto conv10 = convBlock(network, weightMap, *spp9->getOutput(0), get_width(512, gw), 1, 1, 1, "model.10"); auto upsample11 = 
network->addResize(*conv10->getOutput(0)); assert(upsample11); upsample11->setResizeMode(ResizeMode::kNEAREST); upsample11->setOutputDimensions(bottleneck_csp6->getOutput(0)->getDimensions()); ITensor* inputTensors12[] = { upsample11->getOutput(0), bottleneck_csp6->getOutput(0) }; auto cat12 = network->addConcatenation(inputTensors12, 2); auto bottleneck_csp13 = C3(network, weightMap, *cat12->getOutput(0), get_width(1024, gw), get_width(512, gw), get_depth(3, gd), false, 1, 0.5, "model.13"); auto conv14 = convBlock(network, weightMap, *bottleneck_csp13->getOutput(0), get_width(256, gw), 1, 1, 1, "model.14"); auto upsample15 = network->addResize(*conv14->getOutput(0)); assert(upsample15); upsample15->setResizeMode(ResizeMode::kNEAREST); upsample15->setOutputDimensions(bottleneck_csp4->getOutput(0)->getDimensions()); ITensor* inputTensors16[] = { upsample15->getOutput(0), bottleneck_csp4->getOutput(0) }; auto cat16 = network->addConcatenation(inputTensors16, 2); auto bottleneck_csp17 = C3(network, weightMap, *cat16->getOutput(0), get_width(512, gw), get_width(256, gw), get_depth(3, gd), false, 1, 0.5, "model.17"); // Segmentation IConvolutionLayer* det0 = network->addConvolutionNd(*bottleneck_csp17->getOutput(0), kNumAnchor * (32 + kNumClass + 5), DimsHW{ 1, 1 }, weightMap["model.24.m.0.weight"], weightMap["model.24.m.0.bias"]); auto conv18 = convBlock(network, weightMap, *bottleneck_csp17->getOutput(0), get_width(256, gw), 3, 2, 1, "model.18"); ITensor* inputTensors19[] = { conv18->getOutput(0), conv14->getOutput(0) }; auto cat19 = network->addConcatenation(inputTensors19, 2); auto bottleneck_csp20 = C3(network, weightMap, *cat19->getOutput(0), get_width(512, gw), get_width(512, gw), get_depth(3, gd), false, 1, 0.5, "model.20"); IConvolutionLayer* det1 = network->addConvolutionNd(*bottleneck_csp20->getOutput(0), kNumAnchor * (32 + kNumClass + 5), DimsHW{ 1, 1 }, weightMap["model.24.m.1.weight"], weightMap["model.24.m.1.bias"]); auto conv21 = convBlock(network, 
weightMap, *bottleneck_csp20->getOutput(0), get_width(512, gw), 3, 2, 1, "model.21"); ITensor* inputTensors22[] = { conv21->getOutput(0), conv10->getOutput(0) }; auto cat22 = network->addConcatenation(inputTensors22, 2); auto bottleneck_csp23 = C3(network, weightMap, *cat22->getOutput(0), get_width(1024, gw), get_width(1024, gw), get_depth(3, gd), false, 1, 0.5, "model.23"); IConvolutionLayer* det2 = network->addConvolutionNd(*bottleneck_csp23->getOutput(0), kNumAnchor * (32 + kNumClass + 5), DimsHW{ 1, 1 }, weightMap["model.24.m.2.weight"], weightMap["model.24.m.2.bias"]); auto yolo = addYoLoLayer(network, weightMap, "model.24", std::vector{det0, det1, det2}, true); yolo->getOutput(0)->setName(kOutputTensorName); network->markOutput(*yolo->getOutput(0)); auto proto = Proto(network, weightMap, *bottleneck_csp17->getOutput(0), get_width(256, gw), 32, "model.24.proto"); proto->getOutput(0)->setName("proto"); network->markOutput(*proto->getOutput(0)); // Engine config builder->setMaxBatchSize(maxBatchSize); config->setMaxWorkspaceSize(16 * (1 << 20)); // 16MB #if defined(USE_FP16) config->setFlag(BuilderFlag::kFP16); #elif defined(USE_INT8) std::cout << "Your platform support int8: " << (builder->platformHasFastInt8() ? "true" : "false") << std::endl; assert(builder->platformHasFastInt8()); config->setFlag(BuilderFlag::kINT8); Int8EntropyCalibrator2* calibrator = new Int8EntropyCalibrator2(1, kInputW, kInputH, "./coco_calib/", "int8calib.table", kInputTensorName); config->setInt8Calibrator(calibrator); #endif std::cout << "Building engine, please wait for a while..." << std::endl; ICudaEngine* engine = builder->buildEngineWithConfig(*network, *config); std::cout << "Build engine successfully!" 
<< std::endl; // Don't need the network any more network->destroy(); // Release host memory for (auto& mem : weightMap) { free((void*)(mem.second.values)); } return engine; } ================================================ FILE: yolov5/src/model.h ================================================ #pragma once #include #include nvinfer1::ICudaEngine* build_det_engine(unsigned int maxBatchSize, nvinfer1::IBuilder* builder, nvinfer1::IBuilderConfig* config, nvinfer1::DataType dt, float& gd, float& gw, std::string& wts_name); nvinfer1::ICudaEngine* build_det_p6_engine(unsigned int maxBatchSize, nvinfer1::IBuilder* builder, nvinfer1::IBuilderConfig* config, nvinfer1::DataType dt, float& gd, float& gw, std::string& wts_name); nvinfer1::ICudaEngine* build_cls_engine(unsigned int maxBatchSize, nvinfer1::IBuilder* builder, nvinfer1::IBuilderConfig* config, nvinfer1::DataType dt, float& gd, float& gw, std::string& wts_name); nvinfer1::ICudaEngine* build_seg_engine(unsigned int maxBatchSize, nvinfer1::IBuilder* builder, nvinfer1::IBuilderConfig* config, nvinfer1::DataType dt, float& gd, float& gw, std::string& wts_name); ================================================ FILE: yolov5/src/postprocess.cpp ================================================ #include "postprocess.h" #include "utils.h" cv::Rect get_rect(cv::Mat& img, float bbox[4]) { float l, r, t, b; float r_w = kInputW / (img.cols * 1.0); float r_h = kInputH / (img.rows * 1.0); if (r_h > r_w) { l = bbox[0] - bbox[2] / 2.f; r = bbox[0] + bbox[2] / 2.f; t = bbox[1] - bbox[3] / 2.f - (kInputH - r_w * img.rows) / 2; b = bbox[1] + bbox[3] / 2.f - (kInputH - r_w * img.rows) / 2; l = l / r_w; r = r / r_w; t = t / r_w; b = b / r_w; } else { l = bbox[0] - bbox[2] / 2.f - (kInputW - r_h * img.cols) / 2; r = bbox[0] + bbox[2] / 2.f - (kInputW - r_h * img.cols) / 2; t = bbox[1] - bbox[3] / 2.f; b = bbox[1] + bbox[3] / 2.f; l = l / r_h; r = r / r_h; t = t / r_h; b = b / r_h; } return cv::Rect(round(l), round(t), round(r - l), 
round(b - t)); } static float iou(float lbox[4], float rbox[4]) { float interBox[] = { (std::max)(lbox[0] - lbox[2] / 2.f , rbox[0] - rbox[2] / 2.f), //left (std::min)(lbox[0] + lbox[2] / 2.f , rbox[0] + rbox[2] / 2.f), //right (std::max)(lbox[1] - lbox[3] / 2.f , rbox[1] - rbox[3] / 2.f), //top (std::min)(lbox[1] + lbox[3] / 2.f , rbox[1] + rbox[3] / 2.f), //bottom }; if (interBox[2] > interBox[3] || interBox[0] > interBox[1]) return 0.0f; float interBoxS = (interBox[1] - interBox[0])*(interBox[3] - interBox[2]); return interBoxS / (lbox[2] * lbox[3] + rbox[2] * rbox[3] - interBoxS); } static bool cmp(const Detection& a, const Detection& b) { return a.conf > b.conf; } void nms(std::vector& res, float* output, float conf_thresh, float nms_thresh) { int det_size = sizeof(Detection) / sizeof(float); std::map> m; for (int i = 0; i < output[0] && i < kMaxNumOutputBbox; i++) { if (output[1 + det_size * i + 4] <= conf_thresh) continue; Detection det; memcpy(&det, &output[1 + det_size * i], det_size * sizeof(float)); if (m.count(det.class_id) == 0) m.emplace(det.class_id, std::vector()); m[det.class_id].push_back(det); } for (auto it = m.begin(); it != m.end(); it++) { auto& dets = it->second; std::sort(dets.begin(), dets.end(), cmp); for (size_t m = 0; m < dets.size(); ++m) { auto& item = dets[m]; res.push_back(item); for (size_t n = m + 1; n < dets.size(); ++n) { if (iou(item.bbox, dets[n].bbox) > nms_thresh) { dets.erase(dets.begin() + n); --n; } } } } } void batch_nms(std::vector>& res_batch, float *output, int batch_size, int output_size, float conf_thresh, float nms_thresh) { res_batch.resize(batch_size); for (int i = 0; i < batch_size; i++) { nms(res_batch[i], &output[i * output_size], conf_thresh, nms_thresh); } } void draw_bbox(std::vector& img_batch, std::vector>& res_batch) { for (size_t i = 0; i < img_batch.size(); i++) { auto& res = res_batch[i]; cv::Mat img = img_batch[i]; for (size_t j = 0; j < res.size(); j++) { cv::Rect r = get_rect(img, res[j].bbox); 
cv::rectangle(img, r, cv::Scalar(0x27, 0xC1, 0x36), 2); cv::putText(img, std::to_string((int)res[j].class_id), cv::Point(r.x, r.y - 1), cv::FONT_HERSHEY_PLAIN, 1.2, cv::Scalar(0xFF, 0xFF, 0xFF), 2); } } } static cv::Rect get_downscale_rect(float bbox[4], float scale) { float left = bbox[0] - bbox[2] / 2; float top = bbox[1] - bbox[3] / 2; float right = bbox[0] + bbox[2] / 2; float bottom = bbox[1] + bbox[3] / 2; left /= scale; top /= scale; right /= scale; bottom /= scale; return cv::Rect(round(left), round(top), round(right - left), round(bottom - top)); } std::vector process_mask(const float* proto, int proto_size, std::vector& dets) { std::vector masks; for (size_t i = 0; i < dets.size(); i++) { cv::Mat mask_mat = cv::Mat::zeros(kInputH / 4, kInputW / 4, CV_32FC1); auto r = get_downscale_rect(dets[i].bbox, 4); for (int x = r.x; x < r.x + r.width; x++) { for (int y = r.y; y < r.y + r.height; y++) { float e = 0.0f; for (int j = 0; j < 32; j++) { e += dets[i].mask[j] * proto[j * proto_size / 32 + y * mask_mat.cols + x]; } e = 1.0f / (1.0f + expf(-e)); mask_mat.at(y, x) = e; } } cv::resize(mask_mat, mask_mat, cv::Size(kInputW, kInputH)); masks.push_back(mask_mat); } return masks; } cv::Mat scale_mask(cv::Mat mask, cv::Mat img) { int x, y, w, h; float r_w = kInputW / (img.cols * 1.0); float r_h = kInputH / (img.rows * 1.0); if (r_h > r_w) { w = kInputW; h = r_w * img.rows; x = 0; y = (kInputH - h) / 2; } else { w = r_h * img.cols; h = kInputH; x = (kInputW - w) / 2; y = 0; } cv::Rect r(x, y, w, h); cv::Mat res; cv::resize(mask(r), res, img.size()); return res; } void draw_mask_bbox(cv::Mat& img, std::vector& dets, std::vector& masks, std::unordered_map& labels_map) { static std::vector colors = {0xFF3838, 0xFF9D97, 0xFF701F, 0xFFB21D, 0xCFD231, 0x48F90A, 0x92CC17, 0x3DDB86, 0x1A9334, 0x00D4BB, 0x2C99A8, 0x00C2FF, 0x344593, 0x6473FF, 0x0018EC, 0x8438FF, 0x520085, 0xCB38FF, 0xFF95C8, 0xFF37C7}; for (size_t i = 0; i < dets.size(); i++) { cv::Mat img_mask = 
scale_mask(masks[i], img); auto color = colors[(int)dets[i].class_id % colors.size()]; auto bgr = cv::Scalar(color & 0xFF, color >> 8 & 0xFF, color >> 16 & 0xFF); cv::Rect r = get_rect(img, dets[i].bbox); for (int x = r.x; x < r.x + r.width; x++) { for (int y = r.y; y < r.y + r.height; y++) { float val = img_mask.at(y, x); if (val <= 0.5) continue; img.at(y, x)[0] = img.at(y, x)[0] / 2 + bgr[0] / 2; img.at(y, x)[1] = img.at(y, x)[1] / 2 + bgr[1] / 2; img.at(y, x)[2] = img.at(y, x)[2] / 2 + bgr[2] / 2; } } cv::rectangle(img, r, bgr, 2); // Get the size of the text cv::Size textSize = cv::getTextSize(labels_map[(int)dets[i].class_id] + " " + to_string_with_precision(dets[i].conf), cv::FONT_HERSHEY_PLAIN, 1.2, 2, NULL); // Set the top left corner of the rectangle cv::Point topLeft(r.x, r.y - textSize.height); // Set the bottom right corner of the rectangle cv::Point bottomRight(r.x + textSize.width, r.y + textSize.height); // Set the thickness of the rectangle lines int lineThickness = 2; // Draw the rectangle on the image cv::rectangle(img, topLeft, bottomRight, bgr, -1); cv::putText(img, labels_map[(int)dets[i].class_id] + " " + to_string_with_precision(dets[i].conf), cv::Point(r.x, r.y + 4), cv::FONT_HERSHEY_PLAIN, 1.2, cv::Scalar::all(0xFF), 2); } } ================================================ FILE: yolov5/src/postprocess.h ================================================ #pragma once #include "types.h" #include cv::Rect get_rect(cv::Mat& img, float bbox[4]); void nms(std::vector& res, float *output, float conf_thresh, float nms_thresh = 0.5); void batch_nms(std::vector>& batch_res, float *output, int batch_size, int output_size, float conf_thresh, float nms_thresh = 0.5); void draw_bbox(std::vector& img_batch, std::vector>& res_batch); std::vector process_mask(const float* proto, int proto_size, std::vector& dets); void draw_mask_bbox(cv::Mat& img, std::vector& dets, std::vector& masks, std::unordered_map& labels_map); 
================================================ FILE: yolov5/src/preprocess.cu ================================================ #include "preprocess.h" #include "cuda_utils.h" static uint8_t* img_buffer_host = nullptr; static uint8_t* img_buffer_device = nullptr; struct AffineMatrix { float value[6]; }; __global__ void warpaffine_kernel( uint8_t* src, int src_line_size, int src_width, int src_height, float* dst, int dst_width, int dst_height, uint8_t const_value_st, AffineMatrix d2s, int edge) { int position = blockDim.x * blockIdx.x + threadIdx.x; if (position >= edge) return; float m_x1 = d2s.value[0]; float m_y1 = d2s.value[1]; float m_z1 = d2s.value[2]; float m_x2 = d2s.value[3]; float m_y2 = d2s.value[4]; float m_z2 = d2s.value[5]; int dx = position % dst_width; int dy = position / dst_width; float src_x = m_x1 * dx + m_y1 * dy + m_z1 + 0.5f; float src_y = m_x2 * dx + m_y2 * dy + m_z2 + 0.5f; float c0, c1, c2; if (src_x <= -1 || src_x >= src_width || src_y <= -1 || src_y >= src_height) { // out of range c0 = const_value_st; c1 = const_value_st; c2 = const_value_st; } else { int y_low = floorf(src_y); int x_low = floorf(src_x); int y_high = y_low + 1; int x_high = x_low + 1; uint8_t const_value[] = {const_value_st, const_value_st, const_value_st}; float ly = src_y - y_low; float lx = src_x - x_low; float hy = 1 - ly; float hx = 1 - lx; float w1 = hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx; uint8_t* v1 = const_value; uint8_t* v2 = const_value; uint8_t* v3 = const_value; uint8_t* v4 = const_value; if (y_low >= 0) { if (x_low >= 0) v1 = src + y_low * src_line_size + x_low * 3; if (x_high < src_width) v2 = src + y_low * src_line_size + x_high * 3; } if (y_high < src_height) { if (x_low >= 0) v3 = src + y_high * src_line_size + x_low * 3; if (x_high < src_width) v4 = src + y_high * src_line_size + x_high * 3; } c0 = w1 * v1[0] + w2 * v2[0] + w3 * v3[0] + w4 * v4[0]; c1 = w1 * v1[1] + w2 * v2[1] + w3 * v3[1] + w4 * v4[1]; c2 = w1 * v1[2] + w2 * v2[2] + w3 * 
v3[2] + w4 * v4[2]; } // bgr to rgb float t = c2; c2 = c0; c0 = t; // normalization c0 = c0 / 255.0f; c1 = c1 / 255.0f; c2 = c2 / 255.0f; // rgbrgbrgb to rrrgggbbb int area = dst_width * dst_height; float* pdst_c0 = dst + dy * dst_width + dx; float* pdst_c1 = pdst_c0 + area; float* pdst_c2 = pdst_c1 + area; *pdst_c0 = c0; *pdst_c1 = c1; *pdst_c2 = c2; } void cuda_preprocess( uint8_t* src, int src_width, int src_height, float* dst, int dst_width, int dst_height, cudaStream_t stream) { int img_size = src_width * src_height * 3; // copy data to pinned memory memcpy(img_buffer_host, src, img_size); // copy data to device memory CUDA_CHECK(cudaMemcpyAsync(img_buffer_device, img_buffer_host, img_size, cudaMemcpyHostToDevice, stream)); AffineMatrix s2d, d2s; float scale = std::min(dst_height / (float)src_height, dst_width / (float)src_width); s2d.value[0] = scale; s2d.value[1] = 0; s2d.value[2] = -scale * src_width * 0.5 + dst_width * 0.5; s2d.value[3] = 0; s2d.value[4] = scale; s2d.value[5] = -scale * src_height * 0.5 + dst_height * 0.5; cv::Mat m2x3_s2d(2, 3, CV_32F, s2d.value); cv::Mat m2x3_d2s(2, 3, CV_32F, d2s.value); cv::invertAffineTransform(m2x3_s2d, m2x3_d2s); memcpy(d2s.value, m2x3_d2s.ptr(0), sizeof(d2s.value)); int jobs = dst_height * dst_width; int threads = 256; int blocks = ceil(jobs / (float)threads); warpaffine_kernel<<>>( img_buffer_device, src_width * 3, src_width, src_height, dst, dst_width, dst_height, 128, d2s, jobs); } void cuda_batch_preprocess(std::vector& img_batch, float* dst, int dst_width, int dst_height, cudaStream_t stream) { int dst_size = dst_width * dst_height * 3; for (size_t i = 0; i < img_batch.size(); i++) { cuda_preprocess(img_batch[i].ptr(), img_batch[i].cols, img_batch[i].rows, &dst[dst_size * i], dst_width, dst_height, stream); CUDA_CHECK(cudaStreamSynchronize(stream)); } } void cuda_preprocess_init(int max_image_size) { // prepare input data in pinned memory CUDA_CHECK(cudaMallocHost((void**)&img_buffer_host, max_image_size * 
3)); // prepare input data in device memory CUDA_CHECK(cudaMalloc((void**)&img_buffer_device, max_image_size * 3)); } void cuda_preprocess_destroy() { CUDA_CHECK(cudaFree(img_buffer_device)); CUDA_CHECK(cudaFreeHost(img_buffer_host)); } ================================================ FILE: yolov5/src/preprocess.h ================================================ #pragma once #include #include #include void cuda_preprocess_init(int max_image_size); void cuda_preprocess_destroy(); void cuda_preprocess(uint8_t* src, int src_width, int src_height, float* dst, int dst_width, int dst_height, cudaStream_t stream); void cuda_batch_preprocess(std::vector& img_batch, float* dst, int dst_width, int dst_height, cudaStream_t stream); ================================================ FILE: yolov5/src/types.h ================================================ #pragma once #include "config.h" struct YoloKernel { int width; int height; float anchors[kNumAnchor * 2]; }; struct alignas(float) Detection { float bbox[4]; // center_x center_y w h float conf; // bbox_conf * cls_conf float class_id; float mask[32]; }; ================================================ FILE: yolov5/src/utils.h ================================================ #pragma once #include #include #include #include #include #include #include static inline int read_files_in_dir(const char* p_dir_name, std::vector& file_names) { DIR *p_dir = opendir(p_dir_name); if (p_dir == nullptr) { return -1; } struct dirent* p_file = nullptr; while ((p_file = readdir(p_dir)) != nullptr) { if (strcmp(p_file->d_name, ".") != 0 && strcmp(p_file->d_name, "..") != 0) { //std::string cur_file_name(p_dir_name); //cur_file_name += "/"; //cur_file_name += p_file->d_name; std::string cur_file_name(p_file->d_name); file_names.push_back(cur_file_name); } } closedir(p_dir); return 0; } // Function to trim leading and trailing whitespace from a string static inline std::string trim_leading_whitespace(const std::string& str) { size_t first = 
str.find_first_not_of(' '); if (std::string::npos == first) { return str; } size_t last = str.find_last_not_of(' '); return str.substr(first, (last - first + 1)); } // Src: https://stackoverflow.com/questions/16605967 static inline std::string to_string_with_precision(const float a_value, const int n = 2) { std::ostringstream out; out.precision(n); out << std::fixed << a_value; return out.str(); } static inline int read_labels(const std::string labels_filename, std::unordered_map& labels_map) { std::ifstream file(labels_filename); // Read each line of the file std::string line; int index = 0; while (std::getline(file, line)) { // Strip the line of any leading or trailing whitespace line = trim_leading_whitespace(line); // Add the stripped line to the labels_map, using the loop index as the key labels_map[index] = line; index++; } // Close the file file.close(); return 0; } ================================================ FILE: yolov5/yolov5_cls.cpp ================================================ #include "cuda_utils.h" #include "logging.h" #include "utils.h" #include "model.h" #include "config.h" #include "calibrator.h" #include #include #include #include #include using namespace nvinfer1; static Logger gLogger; const static int kOutputSize = kClsNumClass; void batch_preprocess(std::vector& imgs, float* output) { for (size_t b = 0; b < imgs.size(); b++) { cv::Mat img; // cv::resize(imgs[b], img, cv::Size(kClsInputW, kClsInputH)); img = preprocess_img(imgs[b], kClsInputW, kClsInputH); int i = 0; for (int row = 0; row < img.rows; ++row) { uchar* uc_pixel = img.data + row * img.step; for (int col = 0; col < img.cols; ++col) { output[b * 3 * img.rows * img.cols + i] = ((float)uc_pixel[2] / 255.0 - 0.485) / 0.229; // R - 0.485 output[b * 3 * img.rows * img.cols + i + img.rows * img.cols] = ((float)uc_pixel[1] / 255.0 - 0.456) / 0.224; output[b * 3 * img.rows * img.cols + i + 2 * img.rows * img.cols] = ((float)uc_pixel[0] / 255.0 - 0.406) / 0.225; uc_pixel += 3; ++i; } 
// Converts the first `n` raw scores (logits) in `prob` into softmax probabilities.
//
// prob: pointer to at least `n` floats; it is only read.
// n:    number of scores; an empty vector is returned when n <= 0.
//
// Returns a vector of `n` values that sum to 1.
std::vector<float> softmax(float* prob, int n) {
    std::vector<float> res;
    if (n <= 0) {
        return res;
    }
    res.reserve(n);

    // Shift every logit by the maximum before exponentiating. Softmax is
    // invariant to a constant shift, and this keeps expf() from overflowing to
    // inf (which would turn the result into NaN) for large logits.
    float max_logit = prob[0];
    for (int i = 1; i < n; i++) {
        if (prob[i] > max_logit) {
            max_logit = prob[i];
        }
    }

    float sum = 0.0f;
    for (int i = 0; i < n; i++) {
        const float t = expf(prob[i] - max_logit);
        res.push_back(t);
        sum += t;
    }
    // sum >= 1 here because the maximum element contributes expf(0) == 1.
    for (int i = 0; i < n; i++) {
        res[i] /= sum;
    }
    return res;
}
In order to bind the buffers, we need to know the names of the input and output tensors. // Note that indices are guaranteed to be less than IEngine::getNbBindings() const int inputIndex = engine->getBindingIndex(kInputTensorName); const int outputIndex = engine->getBindingIndex(kOutputTensorName); assert(inputIndex == 0); assert(outputIndex == 1); // Create GPU buffers on device CUDA_CHECK(cudaMalloc((void**)gpu_input_buffer, kBatchSize * 3 * kClsInputH * kClsInputW * sizeof(float))); CUDA_CHECK(cudaMalloc((void**)gpu_output_buffer, kBatchSize * kOutputSize * sizeof(float))); *cpu_input_buffer = new float[kBatchSize * 3 * kClsInputH * kClsInputW]; *cpu_output_buffer = new float[kBatchSize * kOutputSize]; } void infer(IExecutionContext& context, cudaStream_t& stream, void **buffers, float* input, float* output, int batchSize) { CUDA_CHECK(cudaMemcpyAsync(buffers[0], input, batchSize * 3 * kClsInputH * kClsInputW * sizeof(float), cudaMemcpyHostToDevice, stream)); context.enqueue(batchSize, buffers, stream, nullptr); CUDA_CHECK(cudaMemcpyAsync(output, buffers[1], batchSize * kOutputSize * sizeof(float), cudaMemcpyDeviceToHost, stream)); cudaStreamSynchronize(stream); } void serialize_engine(unsigned int max_batchsize, float& gd, float& gw, std::string& wts_name, std::string& engine_name) { // Create builder IBuilder* builder = createInferBuilder(gLogger); IBuilderConfig* config = builder->createBuilderConfig(); // Create model to populate the network, then set the outputs and create an engine ICudaEngine *engine = nullptr; engine = build_cls_engine(max_batchsize, builder, config, DataType::kFLOAT, gd, gw, wts_name); assert(engine != nullptr); // Serialize the engine IHostMemory* serialized_engine = engine->serialize(); assert(serialized_engine != nullptr); // Save engine to file std::ofstream p(engine_name, std::ios::binary); if (!p) { std::cerr << "Could not open plan output file" << std::endl; assert(false); } p.write(reinterpret_cast(serialized_engine->data()), 
serialized_engine->size()); // Close everything down engine->destroy(); config->destroy(); serialized_engine->destroy(); builder->destroy(); } void deserialize_engine(std::string& engine_name, IRuntime** runtime, ICudaEngine** engine, IExecutionContext** context) { std::ifstream file(engine_name, std::ios::binary); if (!file.good()) { std::cerr << "read " << engine_name << " error!" << std::endl; assert(false); } size_t size = 0; file.seekg(0, file.end); size = file.tellg(); file.seekg(0, file.beg); char* serialized_engine = new char[size]; assert(serialized_engine); file.read(serialized_engine, size); file.close(); *runtime = createInferRuntime(gLogger); assert(*runtime); *engine = (*runtime)->deserializeCudaEngine(serialized_engine, size); assert(*engine); *context = (*engine)->createExecutionContext(); assert(*context); delete[] serialized_engine; } int main(int argc, char** argv) { cudaSetDevice(kGpuId); std::string wts_name = ""; std::string engine_name = ""; float gd = 0.0f, gw = 0.0f; std::string img_dir; if (!parse_args(argc, argv, wts_name, engine_name, gd, gw, img_dir)) { std::cerr << "arguments not right!" 
<< std::endl; std::cerr << "./yolov5_cls -s [.wts] [.engine] [n/s/m/l/x or c gd gw] // serialize model to plan file" << std::endl; std::cerr << "./yolov5_cls -d [.engine] ../images // deserialize plan file and run inference" << std::endl; return -1; } // Create a model using the API directly and serialize it to a file if (!wts_name.empty()) { serialize_engine(kBatchSize, gd, gw, wts_name, engine_name); return 0; } // Deserialize the engine from file IRuntime* runtime = nullptr; ICudaEngine* engine = nullptr; IExecutionContext* context = nullptr; deserialize_engine(engine_name, &runtime, &engine, &context); cudaStream_t stream; CUDA_CHECK(cudaStreamCreate(&stream)); // Prepare cpu and gpu buffers float* gpu_buffers[2]; float* cpu_input_buffer = nullptr; float* cpu_output_buffer = nullptr; prepare_buffers(engine, &gpu_buffers[0], &gpu_buffers[1], &cpu_input_buffer, &cpu_output_buffer); // Read images from directory std::vector file_names; if (read_files_in_dir(img_dir.c_str(), file_names) < 0) { std::cerr << "read_files_in_dir failed." 
<< std::endl; return -1; } // Read imagenet labels auto classes = read_classes("imagenet_classes.txt"); // batch predict for (size_t i = 0; i < file_names.size(); i += kBatchSize) { // Get a batch of images std::vector img_batch; std::vector img_name_batch; for (size_t j = i; j < i + kBatchSize && j < file_names.size(); j++) { cv::Mat img = cv::imread(img_dir + "/" + file_names[j]); img_batch.push_back(img); img_name_batch.push_back(file_names[j]); } // Preprocess batch_preprocess(img_batch, cpu_input_buffer); // Run inference auto start = std::chrono::system_clock::now(); infer(*context, stream, (void**)gpu_buffers, cpu_input_buffer, cpu_output_buffer, kBatchSize); auto end = std::chrono::system_clock::now(); std::cout << "inference time: " << std::chrono::duration_cast(end - start).count() << "ms" << std::endl; // Postprocess and get top-k result for (size_t b = 0; b < img_name_batch.size(); b++) { float* p = &cpu_output_buffer[b * kOutputSize]; auto res = softmax(p, kOutputSize); auto topk_idx = topk(res, 3); std::cout << img_name_batch[b] << std::endl; for (auto idx: topk_idx) { std::cout << " " << classes[idx] << " " << res[idx] << std::endl; } } } // Release stream and buffers cudaStreamDestroy(stream); CUDA_CHECK(cudaFree(gpu_buffers[0])); CUDA_CHECK(cudaFree(gpu_buffers[1])); delete[] cpu_input_buffer; delete[] cpu_output_buffer; // Destroy the engine context->destroy(); engine->destroy(); runtime->destroy(); return 0; } ================================================ FILE: yolov5/yolov5_cls_trt.py ================================================ """ An example that uses TensorRT's Python api to make inferences. 
def get_img_path_batches(batch_size, img_dir):
    """
    description: Collect every file below img_dir (recursively) and group the paths into batches.
    param:
        batch_size: number of paths per batch
        img_dir:    root directory that is walked with os.walk
    return:
        a list of batches, each a list of file paths; only the last batch may be shorter
    """
    all_paths = (
        os.path.join(folder, file_name)
        for folder, _sub_dirs, file_names in os.walk(img_dir)
        for file_name in file_names
    )

    batches = []
    current = []
    for path in all_paths:
        # Start a new batch only when the next path would overflow the current one.
        if len(current) == batch_size:
            batches.append(current)
            current = []
        current.append(path)
    if current:
        batches.append(current)
    return batches
def infer(self, raw_image_generator):
    """
    description: Classify one batch of images and draw the top-1 class name on every image.
    param:
        raw_image_generator: iterable yielding the raw images of the batch; it may yield fewer
                             than self.batch_size images (e.g. the last batch of a directory)
    return:
        batch_image_raw: list of the input images, each annotated in place with its own class
        use_time:        seconds spent on the host->device copy, execution and device->host copy
    """
    # Make self the active context, pushing it on top of the context stack.
    self.ctx.push()
    try:
        # Restore
        stream = self.stream
        context = self.context
        host_inputs = self.host_inputs
        cuda_inputs = self.cuda_inputs
        host_outputs = self.host_outputs
        cuda_outputs = self.cuda_outputs
        bindings = self.bindings

        # Do image preprocess. The batch tensor is zero-filled so the unused slots of a
        # partially filled batch hold defined values instead of uninitialized memory.
        batch_image_raw = []
        batch_input_image = np.zeros(
            shape=[self.batch_size, 3, self.input_h, self.input_w])
        for i, image_raw in enumerate(raw_image_generator):
            batch_image_raw.append(image_raw)
            input_image = self.preprocess_cls_image(image_raw)
            np.copyto(batch_input_image[i], input_image)
        batch_input_image = np.ascontiguousarray(batch_input_image)

        # Copy input image to host buffer
        np.copyto(host_inputs[0], batch_input_image.ravel())
        start = time.time()
        # Transfer input data to the GPU.
        cuda.memcpy_htod_async(cuda_inputs[0], host_inputs[0], stream)
        # Run inference.
        context.execute_async(batch_size=self.batch_size,
                              bindings=bindings,
                              stream_handle=stream.handle)
        # Transfer predictions back from the GPU.
        cuda.memcpy_dtoh_async(host_outputs[0], cuda_outputs[0], stream)
        # Synchronize the stream
        stream.synchronize()
        end = time.time()
    finally:
        # Remove any context from the top of the context stack, deactivating it.
        # Doing this in `finally` keeps the context stack balanced when a step above raises.
        self.ctx.pop()

    output = host_outputs[0]
    # Do postprocess once for the whole batch: postprocess_cls already returns one
    # entry per batch row.
    classes_ls, predicted_conf_ls, category_id_ls = self.postprocess_cls(output)
    # Only annotate the images that were actually supplied; iterating over
    # self.batch_size raised IndexError for a partially filled batch.
    for i, image_raw in enumerate(batch_image_raw):
        cv2.putText(image_raw, str(classes_ls[i]), (10, 50),
                    cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 1, cv2.LINE_AA)
        print(classes_ls[i], predicted_conf_ls[i])
    return batch_image_raw, end - start
class warmUpThread(threading.Thread):
    """
    description: Thread that feeds one batch of all-zero images through the wrapper to warm it up
                 and prints the shape of the first returned image together with the elapsed time.
    """

    def __init__(self, yolov5_wrapper):
        super().__init__()
        # Object providing get_raw_image_zeros() and infer().
        self.yolov5_wrapper = yolov5_wrapper

    def run(self):
        wrapper = self.yolov5_wrapper
        dummy_images = wrapper.get_raw_image_zeros()
        images, elapsed_s = wrapper.infer(dummy_images)
        first_shape = images[0].shape
        elapsed_ms = elapsed_s * 1000
        print(
            'warm_up->{}, time->{:.2f}ms'.format(first_shape, elapsed_ms))
// Allocates the buffers needed to run one detection batch.
//
// engine:            engine whose bindings are sanity-checked (assert only): exactly two
//                    bindings, kInputTensorName at index 0 and kOutputTensorName at index 1.
// gpu_input_buffer:  receives a cudaMalloc'ed buffer of kBatchSize * 3 * kInputH * kInputW floats.
// gpu_output_buffer: receives a cudaMalloc'ed buffer of kBatchSize * kOutputSize floats.
// cpu_output_buffer: receives a new[]-allocated array of kBatchSize * kOutputSize floats.
//
// Nothing allocated here is released by this function.
void prepare_buffers(ICudaEngine* engine, float** gpu_input_buffer, float** gpu_output_buffer,
                     float** cpu_output_buffer) {
    assert(engine->getNbBindings() == 2);

    // The buffers are bound by position, so confirm that the named tensors sit at the
    // expected binding indices. Indices are guaranteed to be less than getNbBindings().
    const int input_binding = engine->getBindingIndex(kInputTensorName);
    const int output_binding = engine->getBindingIndex(kOutputTensorName);
    assert(input_binding == 0);
    assert(output_binding == 1);

    const auto input_bytes = kBatchSize * 3 * kInputH * kInputW * sizeof(float);
    const auto output_count = kBatchSize * kOutputSize;

    // Device-side buffers
    CUDA_CHECK(cudaMalloc((void**)gpu_input_buffer, input_bytes));
    CUDA_CHECK(cudaMalloc((void**)gpu_output_buffer, output_count * sizeof(float)));

    // Host-side buffer that receives the raw detections
    *cpu_output_buffer = new float[output_count];
}
deserialize_engine(std::string& engine_name, IRuntime** runtime, ICudaEngine** engine, IExecutionContext** context) { std::ifstream file(engine_name, std::ios::binary); if (!file.good()) { std::cerr << "read " << engine_name << " error!" << std::endl; assert(false); } size_t size = 0; file.seekg(0, file.end); size = file.tellg(); file.seekg(0, file.beg); char* serialized_engine = new char[size]; assert(serialized_engine); file.read(serialized_engine, size); file.close(); *runtime = createInferRuntime(gLogger); assert(*runtime); *engine = (*runtime)->deserializeCudaEngine(serialized_engine, size); assert(*engine); *context = (*engine)->createExecutionContext(); assert(*context); delete[] serialized_engine; } int main(int argc, char** argv) { cudaSetDevice(kGpuId); std::string wts_name = ""; std::string engine_name = ""; bool is_p6 = false; float gd = 0.0f, gw = 0.0f; std::string img_dir; if (!parse_args(argc, argv, wts_name, engine_name, is_p6, gd, gw, img_dir)) { std::cerr << "arguments not right!" 
<< std::endl; std::cerr << "./yolov5_det -s [.wts] [.engine] [n/s/m/l/x/n6/s6/m6/l6/x6 or c/c6 gd gw] // serialize model to plan file" << std::endl; std::cerr << "./yolov5_det -d [.engine] ../images // deserialize plan file and run inference" << std::endl; return -1; } // Create a model using the API directly and serialize it to a file if (!wts_name.empty()) { serialize_engine(kBatchSize, is_p6, gd, gw, wts_name, engine_name); return 0; } // Deserialize the engine from file IRuntime* runtime = nullptr; ICudaEngine* engine = nullptr; IExecutionContext* context = nullptr; deserialize_engine(engine_name, &runtime, &engine, &context); cudaStream_t stream; CUDA_CHECK(cudaStreamCreate(&stream)); // Init CUDA preprocessing cuda_preprocess_init(kMaxInputImageSize); // Prepare cpu and gpu buffers float* gpu_buffers[2]; float* cpu_output_buffer = nullptr; prepare_buffers(engine, &gpu_buffers[0], &gpu_buffers[1], &cpu_output_buffer); // Read images from directory std::vector file_names; if (read_files_in_dir(img_dir.c_str(), file_names) < 0) { std::cerr << "read_files_in_dir failed." 
<< std::endl; return -1; } // batch predict for (size_t i = 0; i < file_names.size(); i += kBatchSize) { // Get a batch of images std::vector img_batch; std::vector img_name_batch; for (size_t j = i; j < i + kBatchSize && j < file_names.size(); j++) { cv::Mat img = cv::imread(img_dir + "/" + file_names[j]); img_batch.push_back(img); img_name_batch.push_back(file_names[j]); } // Preprocess cuda_batch_preprocess(img_batch, gpu_buffers[0], kInputW, kInputH, stream); // Run inference auto start = std::chrono::system_clock::now(); infer(*context, stream, (void**)gpu_buffers, cpu_output_buffer, kBatchSize); auto end = std::chrono::system_clock::now(); std::cout << "inference time: " << std::chrono::duration_cast(end - start).count() << "ms" << std::endl; // NMS std::vector> res_batch; batch_nms(res_batch, cpu_output_buffer, img_batch.size(), kOutputSize, kConfThresh, kNmsThresh); // Draw bounding boxes draw_bbox(img_batch, res_batch); // Save images for (size_t j = 0; j < img_batch.size(); j++) { cv::imwrite("_" + img_name_batch[j], img_batch[j]); } } // Release stream and buffers cudaStreamDestroy(stream); CUDA_CHECK(cudaFree(gpu_buffers[0])); CUDA_CHECK(cudaFree(gpu_buffers[1])); delete[] cpu_output_buffer; cuda_preprocess_destroy(); // Destroy the engine context->destroy(); engine->destroy(); runtime->destroy(); // Print histogram of the output distribution // std::cout << "\nOutput:\n\n"; // for (unsigned int i = 0; i < kOutputSize; i++) { // std::cout << prob[i] << ", "; // if (i % 10 == 0) std::cout << std::endl; // } // std::cout << std::endl; return 0; } ================================================ FILE: yolov5/yolov5_det_cuda_python.py ================================================ """ An example that uses TensorRT's Python api to make inferences. 
def plot_one_box(x, img, color=None, label=None, line_thickness=None):
    """
    description: Draw one bounding box, and optionally a filled label tag, onto img in place.
                 This function comes from the YoLov5 project.
    param:
        x:              a box like [x1, y1, x2, y2]
        img:            an OpenCV image object, modified in place
        color:          color of the rectangle, such as (0, 255, 0); random when falsy
        label:          str written above the top-left corner; skipped when falsy
        line_thickness: int line thickness; derived from the image size when falsy
    return:
        no return
    """
    # line/font thickness
    if line_thickness:
        thickness = line_thickness
    else:
        thickness = round(0.002 * (img.shape[0] + img.shape[1]) / 2) + 1
    if not color:
        color = [random.randint(0, 255) for _ in range(3)]

    top_left = (int(x[0]), int(x[1]))
    bottom_right = (int(x[2]), int(x[3]))
    cv2.rectangle(img, top_left, bottom_right, color, thickness=thickness, lineType=cv2.LINE_AA)
    if not label:
        return

    font_scale = thickness / 3
    font_thickness = max(thickness - 1, 1)
    text_size = cv2.getTextSize(label, 0, fontScale=font_scale, thickness=font_thickness)[0]
    # The label background extends upwards from the top-left corner of the box.
    tag_corner = top_left[0] + text_size[0], top_left[1] - text_size[1] - 3
    cv2.rectangle(img, top_left, tag_corner, color, -1, cv2.LINE_AA)  # filled
    cv2.putText(
        img,
        label,
        (top_left[0], top_left[1] - 2),
        0,
        font_scale,
        [225, 255, 255],
        thickness=font_thickness,
        lineType=cv2.LINE_AA,
    )
def infer(self, raw_image_generator):
    """
    description: Run detection on one batch of images and draw the resulting boxes on them.
    param:
        raw_image_generator: iterable yielding the raw images of the batch; it may yield fewer
                             than self.batch_size images (e.g. the last batch of a directory)
    return:
        batch_image_raw: list of the original images with boxes and labels drawn in place
        use_time:        seconds spent on the host->device copy, execution and device->host copy
    """
    # Restore
    stream = self.stream
    context = self.context
    host_inputs = self.host_inputs
    cuda_inputs = self.cuda_inputs
    host_outputs = self.host_outputs
    cuda_outputs = self.cuda_outputs
    bindings = self.bindings

    # Do image preprocess. The batch tensor is zero-filled so the unused slots of a
    # partially filled batch hold defined values instead of uninitialized memory.
    batch_image_raw = []
    batch_origin_h = []
    batch_origin_w = []
    batch_input_image = np.zeros(shape=[self.batch_size, 3, self.input_h, self.input_w])
    for i, image_raw in enumerate(raw_image_generator):
        input_image, image_raw, origin_h, origin_w = self.preprocess_image(image_raw)
        batch_image_raw.append(image_raw)
        batch_origin_h.append(origin_h)
        batch_origin_w.append(origin_w)
        np.copyto(batch_input_image[i], input_image)
    batch_input_image = np.ascontiguousarray(batch_input_image)

    # Copy input image to host buffer
    np.copyto(host_inputs[0], batch_input_image.ravel())
    start = time.time()
    # Transfer input data to the GPU.
    cudart.cudaMemcpyAsync(cuda_inputs[0], host_inputs[0].ctypes.data, host_inputs[0].nbytes,
                           cudart.cudaMemcpyKind.cudaMemcpyHostToDevice, stream)
    # Run inference.
    context.execute_async(batch_size=self.batch_size, bindings=bindings, stream_handle=stream)
    # Transfer predictions back from the GPU.
    cudart.cudaMemcpyAsync(host_outputs[0].ctypes.data, cuda_outputs[0], host_outputs[0].nbytes,
                           cudart.cudaMemcpyKind.cudaMemcpyDeviceToHost, stream)
    # Synchronize the stream
    cudart.cudaStreamSynchronize(stream)
    end = time.time()

    output = host_outputs[0]
    # The host output buffer holds batch_size equally sized per-image results, so derive
    # the per-image length from the buffer instead of hard-coding it (it was 6001).
    result_len = output.size // self.batch_size
    # Do postprocess, only for the images that were actually supplied; iterating over
    # self.batch_size raised IndexError for a partially filled batch.
    for i, image_raw in enumerate(batch_image_raw):
        result_boxes, result_scores, result_classid = self.post_process(
            output[i * result_len: (i + 1) * result_len], batch_origin_h[i], batch_origin_w[i]
        )
        # Draw rectangles and labels on the original image
        for j in range(len(result_boxes)):
            box = result_boxes[j]
            plot_one_box(
                box,
                image_raw,
                label="{}:{:.2f}".format(
                    categories[int(result_classid[j])], result_scores[j]
                ),
            )
    return batch_image_raw, end - start
raw_bgr_image): """ description: Convert BGR image to RGB, resize and pad it to target size, normalize to [0,1], transform to NCHW format. param: input_image_path: str, image path return: image: the processed image image_raw: the original image h: original height w: original width """ image_raw = raw_bgr_image h, w, c = image_raw.shape image = cv2.cvtColor(image_raw, cv2.COLOR_BGR2RGB) # Calculate widht and height and paddings r_w = self.input_w / w r_h = self.input_h / h if r_h > r_w: tw = self.input_w th = int(r_w * h) tx1 = tx2 = 0 ty1 = int((self.input_h - th) / 2) ty2 = self.input_h - th - ty1 else: tw = int(r_h * w) th = self.input_h tx1 = int((self.input_w - tw) / 2) tx2 = self.input_w - tw - tx1 ty1 = ty2 = 0 # Resize the image with long side while maintaining ratio image = cv2.resize(image, (tw, th)) # Pad the short side with (128,128,128) image = cv2.copyMakeBorder( image, ty1, ty2, tx1, tx2, cv2.BORDER_CONSTANT, None, (128, 128, 128) ) image = image.astype(np.float32) # Normalize to [0,1] image /= 255.0 # HWC to CHW format: image = np.transpose(image, [2, 0, 1]) # CHW to NCHW format image = np.expand_dims(image, axis=0) # Convert the image to row-major order, also known as "C order": image = np.ascontiguousarray(image) return image, image_raw, h, w def xywh2xyxy(self, origin_h, origin_w, x): """ description: Convert nx4 boxes from [x, y, w, h] to [x1, y1, x2, y2] where xy1=top-left, xy2=bottom-right param: origin_h: height of original image origin_w: width of original image x: A boxes numpy, each row is a box [center_x, center_y, w, h] return: y: A boxes numpy, each row is a box [x1, y1, x2, y2] """ y = np.zeros_like(x) r_w = self.input_w / origin_w r_h = self.input_h / origin_h if r_h > r_w: y[:, 0] = x[:, 0] - x[:, 2] / 2 y[:, 2] = x[:, 0] + x[:, 2] / 2 y[:, 1] = x[:, 1] - x[:, 3] / 2 - (self.input_h - r_w * origin_h) / 2 y[:, 3] = x[:, 1] + x[:, 3] / 2 - (self.input_h - r_w * origin_h) / 2 y /= r_w else: y[:, 0] = x[:, 0] - x[:, 2] / 2 - (self.input_w 
def bbox_iou(self, box1, box2, x1y1x2y2=True):
    """
    description: compute the IoU between box1 and box2 (numpy broadcasting applies),
                 using the inclusive-pixel convention (width = x2 - x1 + 1)
    param:
        box1: 2-D array of boxes, each row (x1, y1, x2, y2) or (center_x, center_y, w, h)
        box2: 2-D array of boxes in the same format as box1
        x1y1x2y2: True when the boxes are corner coordinates, False for center/size format
    return:
        iou: computed iou
    """
    def corners(box):
        # Return (x1, y1, x2, y2) columns for a box array in the selected format.
        if x1y1x2y2:
            return box[:, 0], box[:, 1], box[:, 2], box[:, 3]
        half_w = box[:, 2] / 2
        half_h = box[:, 3] / 2
        return box[:, 0] - half_w, box[:, 1] - half_h, box[:, 0] + half_w, box[:, 1] + half_h

    a_x1, a_y1, a_x2, a_y2 = corners(box1)
    b_x1, b_y1, b_x2, b_y2 = corners(box2)

    # Intersection rectangle; negative extents are clipped to zero (no overlap).
    overlap_w = np.clip(np.minimum(a_x2, b_x2) - np.maximum(a_x1, b_x1) + 1, 0, None)
    overlap_h = np.clip(np.minimum(a_y2, b_y2) - np.maximum(a_y1, b_y1) + 1, 0, None)
    inter_area = overlap_w * overlap_h

    # Union = sum of both areas minus the shared part; the epsilon avoids division by zero.
    area_a = (a_x2 - a_x1 + 1) * (a_y2 - a_y1 + 1)
    area_b = (b_x2 - b_x1 + 1) * (b_y2 - b_y1 + 1)
    return inter_area / (area_a + area_b - inter_area + 1e-16)
class inferThread(threading.Thread):
    """
    description: Worker thread that runs inference on one batch of image paths
                 and writes each annotated image into the 'output' directory.
    """

    def __init__(self, yolov5_wrapper, image_path_batch):
        super().__init__()
        self.yolov5_wrapper = yolov5_wrapper
        self.image_path_batch = image_path_batch

    def run(self):
        wrapper = self.yolov5_wrapper
        frames = wrapper.get_raw_image(self.image_path_batch)
        annotated, elapsed = wrapper.infer(frames)
        for index, path in enumerate(self.image_path_batch):
            # Keep only the file name; every result lands directly in 'output'.
            target = os.path.join('output', os.path.basename(path))
            # Save image
            cv2.imwrite(target, annotated[index])
        print('input->{}, time->{:.2f}ms, saving into output/'.format(self.image_path_batch, elapsed * 1000))
"sandwich", "orange", "broccoli", "carrot", "hot dog", "pizza", "donut", "cake", "chair", "couch", "potted plant", "bed", "dining table", "toilet", "tv", "laptop", "mouse", "remote", "keyboard", "cell phone", "microwave", "oven", "toaster", "sink", "refrigerator", "book", "clock", "vase", "scissors", "teddy bear", "hair drier", "toothbrush"] if os.path.exists('output/'): shutil.rmtree('output/') os.makedirs('output/') # a YoLov5TRT instance yolov5_wrapper = YoLov5TRT(engine_file_path) try: print('batch size is', yolov5_wrapper.batch_size) image_dir = "images/" image_path_batches = get_img_path_batches(yolov5_wrapper.batch_size, image_dir) for i in range(10): # create a new thread to do warm_up thread1 = warmUpThread(yolov5_wrapper) thread1.start() thread1.join() for batch in image_path_batches: # create a new thread to do inference thread1 = inferThread(yolov5_wrapper, batch) thread1.start() thread1.join() finally: # destroy the instance yolov5_wrapper.destroy() ================================================ FILE: yolov5/yolov5_det_trt.py ================================================ """ An example that uses TensorRT's Python api to make inferences. """ import ctypes import os import shutil import random import sys import threading import time import cv2 import numpy as np import pycuda.autoinit import pycuda.driver as cuda import tensorrt as trt CONF_THRESH = 0.5 IOU_THRESHOLD = 0.4 LEN_ALL_RESULT = 38001 LEN_ONE_RESULT = 38 def get_img_path_batches(batch_size, img_dir): ret = [] batch = [] for root, dirs, files in os.walk(img_dir): for name in files: if len(batch) == batch_size: ret.append(batch) batch = [] batch.append(os.path.join(root, name)) if len(batch) > 0: ret.append(batch) return ret def plot_one_box(x, img, color=None, label=None, line_thickness=None): """ description: Plots one bounding box on image img, this function comes from YoLov5 project. 
param: x: a box likes [x1,y1,x2,y2] img: a opencv image object color: color to draw rectangle, such as (0,255,0) label: str line_thickness: int return: no return """ tl = ( line_thickness or round(0.002 * (img.shape[0] + img.shape[1]) / 2) + 1 ) # line/font thickness color = color or [random.randint(0, 255) for _ in range(3)] c1, c2 = (int(x[0]), int(x[1])), (int(x[2]), int(x[3])) cv2.rectangle(img, c1, c2, color, thickness=tl, lineType=cv2.LINE_AA) if label: tf = max(tl - 1, 1) # font thickness t_size = cv2.getTextSize(label, 0, fontScale=tl / 3, thickness=tf)[0] c2 = c1[0] + t_size[0], c1[1] - t_size[1] - 3 cv2.rectangle(img, c1, c2, color, -1, cv2.LINE_AA) # filled cv2.putText( img, label, (c1[0], c1[1] - 2), 0, tl / 3, [225, 255, 255], thickness=tf, lineType=cv2.LINE_AA, ) class YoLov5TRT(object): """ description: A YOLOv5 class that warps TensorRT ops, preprocess and postprocess ops. """ def __init__(self, engine_file_path): # Create a Context on this device, self.ctx = cuda.Device(0).make_context() stream = cuda.Stream() TRT_LOGGER = trt.Logger(trt.Logger.INFO) runtime = trt.Runtime(TRT_LOGGER) # Deserialize the engine from file with open(engine_file_path, "rb") as f: engine = runtime.deserialize_cuda_engine(f.read()) context = engine.create_execution_context() host_inputs = [] cuda_inputs = [] host_outputs = [] cuda_outputs = [] bindings = [] for binding in engine: print('bingding:', binding, engine.get_binding_shape(binding)) size = trt.volume(engine.get_binding_shape(binding)) * engine.max_batch_size dtype = trt.nptype(engine.get_binding_dtype(binding)) # Allocate host and device buffers host_mem = cuda.pagelocked_empty(size, dtype) cuda_mem = cuda.mem_alloc(host_mem.nbytes) # Append the device buffer to device bindings. bindings.append(int(cuda_mem)) # Append to the appropriate list. 
def infer(self, raw_image_generator):
    """
    description: Preprocess a batch of raw images, run the TensorRT engine on
                 them, then post-process the detections and draw them onto the
                 original images.
    param:
        raw_image_generator: iterable yielding raw BGR images; it may yield
                             fewer than self.batch_size images (e.g. the last
                             batch of a directory)
    return:
        batch_image_raw: list of the original images with boxes/labels drawn
        use_time:        seconds spent on host->device copy, execution,
                         device->host copy and stream synchronization
    """
    # Make self the active context, pushing it on top of the context stack.
    self.ctx.push()
    try:
        # Do image preprocess
        batch_image_raw = []
        batch_origin_h = []
        batch_origin_w = []
        batch_input_image = np.empty(shape=[self.batch_size, 3, self.input_h, self.input_w])
        for i, image_raw in enumerate(raw_image_generator):
            input_image, image_raw, origin_h, origin_w = self.preprocess_image(image_raw)
            batch_image_raw.append(image_raw)
            batch_origin_h.append(origin_h)
            batch_origin_w.append(origin_w)
            np.copyto(batch_input_image[i], input_image)
        batch_input_image = np.ascontiguousarray(batch_input_image)

        # Copy input image to host buffer
        np.copyto(self.host_inputs[0], batch_input_image.ravel())
        start = time.time()
        # Transfer input data to the GPU.
        cuda.memcpy_htod_async(self.cuda_inputs[0], self.host_inputs[0], self.stream)
        # Run inference.
        self.context.execute_async(
            batch_size=self.batch_size,
            bindings=self.bindings,
            stream_handle=self.stream.handle,
        )
        # Transfer predictions back from the GPU.
        cuda.memcpy_dtoh_async(self.host_outputs[0], self.cuda_outputs[0], self.stream)
        # Synchronize the stream
        self.stream.synchronize()
        end = time.time()
    finally:
        # Always deactivate the context, even when preprocessing or inference
        # raises, so the context stack stays balanced for destroy().
        self.ctx.pop()

    output = self.host_outputs[0]
    # Post-process only the images that were actually supplied: a partial
    # batch has fewer entries than self.batch_size in the per-image lists.
    for i in range(len(batch_image_raw)):
        # Each image owns LEN_ALL_RESULT consecutive values of the flat output.
        result_boxes, result_scores, result_classid = self.post_process(
            output[i * LEN_ALL_RESULT: (i + 1) * LEN_ALL_RESULT],
            batch_origin_h[i],
            batch_origin_w[i],
        )
        # Draw rectangles and labels on the original image
        for j in range(len(result_boxes)):
            plot_one_box(
                result_boxes[j],
                batch_image_raw[i],
                label="{}:{:.2f}".format(
                    categories[int(result_classid[j])], result_scores[j]
                ),
            )
    return batch_image_raw, end - start
def xywh2xyxy(self, origin_h, origin_w, x):
    """
    description: Convert nx4 boxes from [center_x, center_y, w, h] in network
                 input coordinates to [x1, y1, x2, y2] in original image
                 coordinates, undoing the letterbox padding and scaling.
    param:
        origin_h: height of original image
        origin_w: width of original image
        x:        A boxes numpy, each row is a box [center_x, center_y, w, h]
    return:
        y: A boxes numpy, each row is a box [x1, y1, x2, y2]
    """
    ratio_w = self.input_w / origin_w
    ratio_h = self.input_h / origin_h
    half_w = x[:, 2] / 2
    half_h = x[:, 3] / 2

    converted = np.zeros_like(x)
    if ratio_h > ratio_w:
        # Width is the limiting side: padding was added above and below.
        scale = ratio_w
        pad_y = (self.input_h - ratio_w * origin_h) / 2
        converted[:, 0] = x[:, 0] - half_w
        converted[:, 2] = x[:, 0] + half_w
        converted[:, 1] = x[:, 1] - half_h - pad_y
        converted[:, 3] = x[:, 1] + half_h - pad_y
    else:
        # Height is the limiting side: padding was added left and right.
        scale = ratio_h
        pad_x = (self.input_w - ratio_h * origin_w) / 2
        converted[:, 0] = x[:, 0] - half_w - pad_x
        converted[:, 2] = x[:, 0] + half_w - pad_x
        converted[:, 1] = x[:, 1] - half_h
        converted[:, 3] = x[:, 1] + half_h
    # Map from network-input pixels back to original-image pixels.
    converted /= scale
    return converted
def non_max_suppression(self, prediction, origin_h, origin_w, conf_thres=0.5, nms_thres=0.4):
    """
    description: Drop detections whose confidence is below 'conf_thres', convert
                 the survivors to corner coordinates in the original image, and
                 run per-class Non-Maximum Suppression on them.
    param:
        prediction: detections, each row (center_x, center_y, w, h, conf, cls_id)
        origin_h:   original image height
        origin_w:   original image width
        conf_thres: a confidence threshold to filter detections
        nms_thres:  a iou threshold to filter detections
    return:
        boxes: output after nms, each row (x1, y1, x2, y2, conf, cls_id);
               an empty array when nothing is kept
    """
    # Boolean indexing yields a copy, so the in-place edits below do not touch
    # the caller's array.
    candidates = prediction[prediction[:, 4] >= conf_thres]
    # Center format -> corner format in original-image coordinates.
    candidates[:, :4] = self.xywh2xyxy(origin_h, origin_w, candidates[:, :4])

    # Keep every corner inside the image.
    x_limit = origin_w - 1
    y_limit = origin_h - 1
    for column, limit in ((0, x_limit), (2, x_limit), (1, y_limit), (3, y_limit)):
        candidates[:, column] = np.clip(candidates[:, column], 0, limit)

    # Highest confidence first.
    order = np.argsort(-candidates[:, 4])
    candidates = candidates[order]

    survivors = []
    while candidates.shape[0]:
        best = candidates[0]
        overlaps = self.bbox_iou(np.expand_dims(best[:4], 0), candidates[:, :4])
        # Suppress same-class boxes overlapping the current best one; the best
        # box matches itself, so it is removed from the pool as well.
        suppressed = (overlaps > nms_thres) & (best[-1] == candidates[:, -1])
        survivors.append(best)
        candidates = candidates[~suppressed]

    if survivors:
        return np.stack(survivors, 0)
    return np.array([])
"tv", "laptop", "mouse", "remote", "keyboard", "cell phone", "microwave", "oven", "toaster", "sink", "refrigerator", "book", "clock", "vase", "scissors", "teddy bear", "hair drier", "toothbrush"] if os.path.exists('output/'): shutil.rmtree('output/') os.makedirs('output/') # a YoLov5TRT instance yolov5_wrapper = YoLov5TRT(engine_file_path) try: print('batch size is', yolov5_wrapper.batch_size) image_dir = "images/" image_path_batches = get_img_path_batches(yolov5_wrapper.batch_size, image_dir) for i in range(10): # create a new thread to do warm_up thread1 = warmUpThread(yolov5_wrapper) thread1.start() thread1.join() for batch in image_path_batches: # create a new thread to do inference thread1 = inferThread(yolov5_wrapper, batch) thread1.start() thread1.join() finally: # destroy the instance yolov5_wrapper.destroy() ================================================ FILE: yolov5/yolov5_seg.cpp ================================================ #include "config.h" #include "cuda_utils.h" #include "logging.h" #include "utils.h" #include "preprocess.h" #include "postprocess.h" #include "model.h" #include #include #include using namespace nvinfer1; static Logger gLogger; const static int kOutputSize1 = kMaxNumOutputBbox * sizeof(Detection) / sizeof(float) + 1; const static int kOutputSize2 = 32 * (kInputH / 4) * (kInputW / 4); bool parse_args(int argc, char** argv, std::string& wts, std::string& engine, float& gd, float& gw, std::string& img_dir, std::string& labels_filename) { if (argc < 4) return false; if (std::string(argv[1]) == "-s" && (argc == 5 || argc == 7)) { wts = std::string(argv[2]); engine = std::string(argv[3]); auto net = std::string(argv[4]); if (net[0] == 'n') { gd = 0.33; gw = 0.25; } else if (net[0] == 's') { gd = 0.33; gw = 0.50; } else if (net[0] == 'm') { gd = 0.67; gw = 0.75; } else if (net[0] == 'l') { gd = 1.0; gw = 1.0; } else if (net[0] == 'x') { gd = 1.33; gw = 1.25; } else if (net[0] == 'c' && argc == 7) { gd = atof(argv[5]); gw = 
atof(argv[6]); } else { return false; } } else if (std::string(argv[1]) == "-d" && argc == 5) { engine = std::string(argv[2]); img_dir = std::string(argv[3]); labels_filename = std::string(argv[4]); } else { return false; } return true; } void prepare_buffers(ICudaEngine* engine, float** gpu_input_buffer, float** gpu_output_buffer1, float** gpu_output_buffer2, float** cpu_output_buffer1, float** cpu_output_buffer2) { assert(engine->getNbBindings() == 3); // In order to bind the buffers, we need to know the names of the input and output tensors. // Note that indices are guaranteed to be less than IEngine::getNbBindings() const int inputIndex = engine->getBindingIndex(kInputTensorName); const int outputIndex1 = engine->getBindingIndex(kOutputTensorName); const int outputIndex2 = engine->getBindingIndex("proto"); assert(inputIndex == 0); assert(outputIndex1 == 1); assert(outputIndex2 == 2); // Create GPU buffers on device CUDA_CHECK(cudaMalloc((void**)gpu_input_buffer, kBatchSize * 3 * kInputH * kInputW * sizeof(float))); CUDA_CHECK(cudaMalloc((void**)gpu_output_buffer1, kBatchSize * kOutputSize1 * sizeof(float))); CUDA_CHECK(cudaMalloc((void**)gpu_output_buffer2, kBatchSize * kOutputSize2 * sizeof(float))); // Alloc CPU buffers *cpu_output_buffer1 = new float[kBatchSize * kOutputSize1]; *cpu_output_buffer2 = new float[kBatchSize * kOutputSize2]; } void infer(IExecutionContext& context, cudaStream_t& stream, void **buffers, float* output1, float* output2, int batchSize) { context.enqueue(batchSize, buffers, stream, nullptr); CUDA_CHECK(cudaMemcpyAsync(output1, buffers[1], batchSize * kOutputSize1 * sizeof(float), cudaMemcpyDeviceToHost, stream)); CUDA_CHECK(cudaMemcpyAsync(output2, buffers[2], batchSize * kOutputSize2 * sizeof(float), cudaMemcpyDeviceToHost, stream)); cudaStreamSynchronize(stream); } void serialize_engine(unsigned int max_batchsize, float& gd, float& gw, std::string& wts_name, std::string& engine_name) { // Create builder IBuilder* builder = 
createInferBuilder(gLogger); IBuilderConfig* config = builder->createBuilderConfig(); // Create model to populate the network, then set the outputs and create an engine ICudaEngine *engine = nullptr; engine = build_seg_engine(max_batchsize, builder, config, DataType::kFLOAT, gd, gw, wts_name); assert(engine != nullptr); // Serialize the engine IHostMemory* serialized_engine = engine->serialize(); assert(serialized_engine != nullptr); // Save engine to file std::ofstream p(engine_name, std::ios::binary); if (!p) { std::cerr << "Could not open plan output file" << std::endl; assert(false); } p.write(reinterpret_cast(serialized_engine->data()), serialized_engine->size()); // Close everything down engine->destroy(); config->destroy(); serialized_engine->destroy(); builder->destroy(); } void deserialize_engine(std::string& engine_name, IRuntime** runtime, ICudaEngine** engine, IExecutionContext** context) { std::ifstream file(engine_name, std::ios::binary); if (!file.good()) { std::cerr << "read " << engine_name << " error!" << std::endl; assert(false); } size_t size = 0; file.seekg(0, file.end); size = file.tellg(); file.seekg(0, file.beg); char* serialized_engine = new char[size]; assert(serialized_engine); file.read(serialized_engine, size); file.close(); *runtime = createInferRuntime(gLogger); assert(*runtime); *engine = (*runtime)->deserializeCudaEngine(serialized_engine, size); assert(*engine); *context = (*engine)->createExecutionContext(); assert(*context); delete[] serialized_engine; } int main(int argc, char** argv) { cudaSetDevice(kGpuId); std::string wts_name = ""; std::string engine_name = ""; std::string labels_filename = ""; float gd = 0.0f, gw = 0.0f; std::string img_dir; if (!parse_args(argc, argv, wts_name, engine_name, gd, gw, img_dir, labels_filename)) { std::cerr << "arguments not right!" 
<< std::endl; std::cerr << "./yolov5_seg -s [.wts] [.engine] [n/s/m/l/x or c gd gw] // serialize model to plan file" << std::endl; std::cerr << "./yolov5_seg -d [.engine] ../images coco.txt // deserialize plan file, read the labels file and run inference" << std::endl; return -1; } // Create a model using the API directly and serialize it to a file if (!wts_name.empty()) { serialize_engine(kBatchSize, gd, gw, wts_name, engine_name); return 0; } // Deserialize the engine from file IRuntime* runtime = nullptr; ICudaEngine* engine = nullptr; IExecutionContext* context = nullptr; deserialize_engine(engine_name, &runtime, &engine, &context); cudaStream_t stream; CUDA_CHECK(cudaStreamCreate(&stream)); // Init CUDA preprocessing cuda_preprocess_init(kMaxInputImageSize); // Prepare cpu and gpu buffers float* gpu_buffers[3]; float* cpu_output_buffer1 = nullptr; float* cpu_output_buffer2 = nullptr; prepare_buffers(engine, &gpu_buffers[0], &gpu_buffers[1], &gpu_buffers[2], &cpu_output_buffer1, &cpu_output_buffer2); // Read images from directory std::vector file_names; if (read_files_in_dir(img_dir.c_str(), file_names) < 0) { std::cerr << "read_files_in_dir failed." << std::endl; return -1; } // Read the txt file for classnames std::ifstream labels_file(labels_filename, std::ios::binary); if (!labels_file.good()) { std::cerr << "read " << labels_filename << " error!" 
<< std::endl; return -1; } std::unordered_map labels_map; read_labels(labels_filename, labels_map); assert(kNumClass == labels_map.size()); // batch predict for (size_t i = 0; i < file_names.size(); i += kBatchSize) { // Get a batch of images std::vector img_batch; std::vector img_name_batch; for (size_t j = i; j < i + kBatchSize && j < file_names.size(); j++) { cv::Mat img = cv::imread(img_dir + "/" + file_names[j]); img_batch.push_back(img); img_name_batch.push_back(file_names[j]); } // Preprocess cuda_batch_preprocess(img_batch, gpu_buffers[0], kInputW, kInputH, stream); // Run inference auto start = std::chrono::system_clock::now(); infer(*context, stream, (void**)gpu_buffers, cpu_output_buffer1, cpu_output_buffer2, kBatchSize); auto end = std::chrono::system_clock::now(); std::cout << "inference time: " << std::chrono::duration_cast(end - start).count() << "ms" << std::endl; // NMS std::vector> res_batch; batch_nms(res_batch, cpu_output_buffer1, img_batch.size(), kOutputSize1, kConfThresh, kNmsThresh); // Draw result and save image for (size_t b = 0; b < img_name_batch.size(); b++) { auto& res = res_batch[b]; cv::Mat img = img_batch[b]; auto masks = process_mask(&cpu_output_buffer2[b * kOutputSize2], kOutputSize2, res); draw_mask_bbox(img, res, masks, labels_map); cv::imwrite("_" + img_name_batch[b], img); } } // Release stream and buffers cudaStreamDestroy(stream); CUDA_CHECK(cudaFree(gpu_buffers[0])); CUDA_CHECK(cudaFree(gpu_buffers[1])); CUDA_CHECK(cudaFree(gpu_buffers[2])); delete[] cpu_output_buffer1; delete[] cpu_output_buffer2; cuda_preprocess_destroy(); // Destroy the engine context->destroy(); engine->destroy(); runtime->destroy(); return 0; } ================================================ FILE: yolov5/yolov5_seg_trt.py ================================================ """ An example that uses TensorRT's Python api to make inferences. 
""" import ctypes import os import shutil import random import sys import threading import time import cv2 import numpy as np import pycuda.autoinit import pycuda.driver as cuda import tensorrt as trt CONF_THRESH = 0.5 IOU_THRESHOLD = 0.4 def get_img_path_batches(batch_size, img_dir): ret = [] batch = [] for root, dirs, files in os.walk(img_dir): for name in files: if len(batch) == batch_size: ret.append(batch) batch = [] batch.append(os.path.join(root, name)) if len(batch) > 0: ret.append(batch) return ret def plot_one_box(x, img, color=None, label=None, line_thickness=None): """ description: Plots one bounding box on image img, this function comes from YoLov5 project. param: x: a box likes [x1,y1,x2,y2] img: a opencv image object color: color to draw rectangle, such as (0,255,0) label: str line_thickness: int return: no return """ tl = ( line_thickness or round(0.002 * (img.shape[0] + img.shape[1]) / 2) + 1 ) # line/font thickness color = color or [random.randint(0, 255) for _ in range(3)] c1, c2 = (int(x[0]), int(x[1])), (int(x[2]), int(x[3])) cv2.rectangle(img, c1, c2, color, thickness=tl, lineType=cv2.LINE_AA) if label: tf = max(tl - 1, 1) # font thickness t_size = cv2.getTextSize(label, 0, fontScale=tl / 3, thickness=tf)[0] c2 = c1[0] + t_size[0], c1[1] - t_size[1] - 3 cv2.rectangle(img, c1, c2, color, -1, cv2.LINE_AA) # filled cv2.putText( img, label, (c1[0], c1[1] - 2), 0, tl / 3, [225, 255, 255], thickness=tf, lineType=cv2.LINE_AA, ) class YoLov5TRT(object): """ description: A YOLOv5 class that warps TensorRT ops, preprocess and postprocess ops. 
""" def __init__(self, engine_file_path): # Create a Context on this device, self.ctx = cuda.Device(0).make_context() stream = cuda.Stream() TRT_LOGGER = trt.Logger(trt.Logger.INFO) runtime = trt.Runtime(TRT_LOGGER) # Deserialize the engine from file with open(engine_file_path, "rb") as f: engine = runtime.deserialize_cuda_engine(f.read()) context = engine.create_execution_context() host_inputs = [] cuda_inputs = [] host_outputs = [] cuda_outputs = [] bindings = [] for binding in engine: print('bingding:', binding, engine.get_binding_shape(binding)) size = trt.volume(engine.get_binding_shape(binding)) * engine.max_batch_size dtype = trt.nptype(engine.get_binding_dtype(binding)) # Allocate host and device buffers host_mem = cuda.pagelocked_empty(size, dtype) cuda_mem = cuda.mem_alloc(host_mem.nbytes) # Append the device buffer to device bindings. bindings.append(int(cuda_mem)) # Append to the appropriate list. if engine.binding_is_input(binding): self.input_w = engine.get_binding_shape(binding)[-1] self.input_h = engine.get_binding_shape(binding)[-2] host_inputs.append(host_mem) cuda_inputs.append(cuda_mem) else: host_outputs.append(host_mem) cuda_outputs.append(cuda_mem) # Store self.stream = stream self.context = context self.engine = engine self.host_inputs = host_inputs self.cuda_inputs = cuda_inputs self.host_outputs = host_outputs self.cuda_outputs = cuda_outputs self.bindings = bindings self.batch_size = engine.max_batch_size # Data length self.det_output_length = host_outputs[0].shape[0] self.mask_output_length = host_outputs[1].shape[0] self.seg_w = int(self.input_w / 4) self.seg_h = int(self.input_h / 4) self.seg_c = int(self.mask_output_length / (self.seg_w * self.seg_w)) self.det_row_output_length = self.seg_c + 6 # Draw mask self.colors_obj = Colors() def infer(self, raw_image_generator): threading.Thread.__init__(self) # Make self the active context, pushing it on top of the context stack. 
self.ctx.push() # Restore stream = self.stream context = self.context engine = self.engine host_inputs = self.host_inputs cuda_inputs = self.cuda_inputs host_outputs = self.host_outputs cuda_outputs = self.cuda_outputs bindings = self.bindings # Do image preprocess batch_image_raw = [] batch_origin_h = [] batch_origin_w = [] batch_input_image = np.empty(shape=[self.batch_size, 3, self.input_h, self.input_w]) for i, image_raw in enumerate(raw_image_generator): input_image, image_raw, origin_h, origin_w = self.preprocess_image(image_raw) batch_image_raw.append(image_raw) batch_origin_h.append(origin_h) batch_origin_w.append(origin_w) np.copyto(batch_input_image[i], input_image) batch_input_image = np.ascontiguousarray(batch_input_image) # Copy input image to host buffer np.copyto(host_inputs[0], batch_input_image.ravel()) start = time.time() # Transfer input data to the GPU. cuda.memcpy_htod_async(cuda_inputs[0], host_inputs[0], stream) # Run inference. context.execute_async(batch_size=self.batch_size, bindings=bindings, stream_handle=stream.handle) # Transfer predictions back from the GPU. cuda.memcpy_dtoh_async(host_outputs[0], cuda_outputs[0], stream) cuda.memcpy_dtoh_async(host_outputs[1], cuda_outputs[1], stream) # Synchronize the stream stream.synchronize() end = time.time() # Remove any context from the top of the context stack, deactivating it. 
self.ctx.pop() # Here we use the first row of output in that batch_size = 1 output_bbox = host_outputs[0] output_proto_mask = host_outputs[1] # Do postprocess for i in range(self.batch_size): result_boxes, result_scores, result_classid, result_proto_coef = self.post_process( output_bbox[i * self.det_output_length: (i + 1) * self.det_output_length], batch_origin_h[i], batch_origin_w[i] ) if result_proto_coef.shape[0] == 0: continue result_masks = self.process_mask(output_proto_mask, result_proto_coef, result_boxes, batch_origin_h[i], batch_origin_w[i]) # Draw masks on the original image self.draw_mask(result_masks, colors_=[self.colors_obj(x, True) for x in result_classid],im_src=batch_image_raw[i]) # Draw rectangles and labels on the original image for j in range(len(result_boxes)): box = result_boxes[j] plot_one_box( box, batch_image_raw[i], label="{}:{:.2f}".format( categories[int(result_classid[j])], result_scores[j] ), ) return batch_image_raw, end - start def destroy(self): # Remove any context from the top of the context stack, deactivating it. self.ctx.pop() def get_raw_image(self, image_path_batch): """ description: Read an image from image path """ for img_path in image_path_batch: yield cv2.imread(img_path) def get_raw_image_zeros(self, image_path_batch=None): """ description: Ready data for warmup """ for _ in range(self.batch_size): yield np.zeros([self.input_h, self.input_w, 3], dtype=np.uint8) def preprocess_image(self, raw_bgr_image): """ description: Convert BGR image to RGB, resize and pad it to target size, normalize to [0,1], transform to NCHW format. 
def xywh2xyxy(self, origin_h, origin_w, x):
    """
    description: Turn centre-format boxes expressed in network-input pixels
                 into corner-format boxes expressed in original-image pixels.
                 The padding added on the shorter side is removed and the
                 resize factor is undone.
    param:
        origin_h:   height of original image
        origin_w:   width of original image
        x:          A boxes numpy, each row is a box [center_x, center_y, w, h]
    return:
        y:          A boxes numpy, each row is a box [x1, y1, x2, y2]
    """
    scale_w = self.input_w / origin_w
    scale_h = self.input_h / origin_h

    if scale_h > scale_w:
        # Width is the limiting side: padding was added above and below.
        scale = scale_w
        pad_x = 0
        pad_y = (self.input_h - scale_w * origin_h) / 2
    else:
        # Height is the limiting side: padding was added left and right.
        scale = scale_h
        pad_x = (self.input_w - scale_h * origin_w) / 2
        pad_y = 0

    half_w = x[:, 2] / 2
    half_h = x[:, 3] / 2

    corners = np.zeros_like(x)
    corners[:, 0] = x[:, 0] - half_w - pad_x
    corners[:, 2] = x[:, 0] + half_w - pad_x
    corners[:, 1] = x[:, 1] - half_h - pad_y
    corners[:, 3] = x[:, 1] + half_h - pad_y
    corners /= scale
    return corners
def non_max_suppression(self, prediction, origin_h, origin_w, conf_thres=0.5, nms_thres=0.4):
    """
    description: Drop detections whose confidence is below 'conf_thres', map the
                 remaining boxes to original-image corner coordinates, then run
                 per-class Non-Maximum Suppression.
    param:
        prediction: detections, one row per box: columns 0-3 are the box as
                    [center_x, center_y, w, h] (converted with self.xywh2xyxy),
                    column 4 is the confidence, column 5 is the class id; any
                    further columns are carried through unchanged
        origin_h: original image height
        origin_w: original image width
        conf_thres: a confidence threshold to filter detections
        nms_thres: a iou threshold to filter detections
    return:
        boxes: surviving rows stacked in descending confidence order, with
               columns 0-3 replaced by clipped [x1, y1, x2, y2]; an empty
               1-D array when nothing survives
    """
    # Boolean indexing yields a copy, so the caller's array is left untouched.
    candidates = prediction[prediction[:, 4] >= conf_thres]
    candidates[:, :4] = self.xywh2xyxy(origin_h, origin_w, candidates[:, :4])

    # Keep every corner inside the original image.
    max_x = origin_w - 1
    max_y = origin_h - 1
    for column, upper in ((0, max_x), (2, max_x), (1, max_y), (3, max_y)):
        candidates[:, column] = np.clip(candidates[:, column], 0, upper)

    # Highest confidence first.
    candidates = candidates[np.argsort(-candidates[:, 4])]

    survivors = []
    while candidates.shape[0]:
        best = candidates[0]
        overlaps = self.bbox_iou(np.expand_dims(best[:4], 0), candidates[:, :4]) > nms_thres
        same_class = candidates[:, 5] == best[5]
        survivors.append(best)
        # Discard the winner itself plus every same-class box it overlaps.
        candidates = candidates[~(overlaps & same_class)]

    if not survivors:
        return np.array([])
    return np.stack(survivors, 0)
class Colors:
    """
    description: Fixed 20-entry colour palette; calling an instance with an
                 index returns that entry (the index wraps around the palette).
    """

    def __init__(self):
        hex_codes = ('FF3838', 'FF9D97', 'FF701F', 'FFB21D', 'CFD231', '48F90A', '92CC17', '3DDB86', '1A9334', '00D4BB',
                     '2C99A8', '00C2FF', '344593', '6473FF', '0018EC', '8438FF', '520085', 'CB38FF', 'FF95C8', 'FF37C7')
        palette = []
        for code in hex_codes:
            palette.append(self.hex2rgb('#' + code))
        self.palette = palette
        self.n = len(palette)

    def __call__(self, i, bgr=False):
        """
        description: Look up a palette colour.
        param:
            i:   index into the palette, converted with int() and wrapped modulo the palette size
            bgr: when True the channels are returned in reverse (b, g, r) order
        return:
            a 3-tuple of ints
        """
        color = self.palette[int(i) % self.n]
        if bgr:
            return (color[2], color[1], color[0])
        return color

    @staticmethod
    def hex2rgb(h):  # rgb order (PIL)
        """
        description: Parse a '#RRGGBB' string into an (r, g, b) tuple of ints.
        """
        digits = h[1:]
        return (int(digits[0:2], 16), int(digits[2:4], 16), int(digits[4:6], 16))
# Build script for the yolov5-lite sample. It produces two targets:
#   myplugins - shared library built from yololayer.cu
#   v5lite    - executable built from calibrator.cpp and v5lite.cpp
cmake_minimum_required(VERSION 3.10)

project(yolov5-lite)

add_definitions(-std=c++11)
# API_EXPORTS is presumably consumed by an export macro in the project headers -- TODO confirm.
add_definitions(-DAPI_EXPORTS)
# NOTE(review): option() takes (variable, help text, value); with a single extra
# argument "OFF" looks like it is parsed as the help text -- confirm intent.
option(CUDA_USE_STATIC_CUDA_RUNTIME OFF)
set(CMAKE_CXX_STANDARD 11)
# The build type is forced to Debug here regardless of what the user passes.
set(CMAKE_BUILD_TYPE Debug)

find_package(CUDA REQUIRED)

# CUDA is enabled as a first-class language only on Windows.
if(WIN32)
enable_language(CUDA)
endif(WIN32)

include_directories(${PROJECT_SOURCE_DIR}/include)
# include and link dirs of cuda and tensorrt, you need adapt them if yours are different
# cuda
include_directories(/usr/local/cuda/include)
link_directories(/usr/local/cuda/lib64)
# tensorrt
# include_directories(/usr/include/x86_64-linux-gnu/)
# link_directories(/usr/lib/x86_64-linux-gnu/)

# Hard-coded TensorRT install location; edit these two lines for a different install.
include_directories(/opt/TensorRT-8.6.1.6/include)
link_directories(/opt/TensorRT-8.6.1.6/lib)

# NOTE(review): -Ofast is requested here while the build type above is Debug -- confirm which is intended.
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11 -Wall -Ofast -Wfatal-errors -D_MWAITXINTRIN_H_INCLUDED")

# Plugin library containing the custom YOLO layer.
cuda_add_library(myplugins SHARED ${PROJECT_SOURCE_DIR}/yololayer.cu)
target_link_libraries(myplugins nvinfer cudart)

# NOTE(review): OpenCV is looked up without REQUIRED, so a missing OpenCV is not reported here.
find_package(OpenCV)
include_directories(${OpenCV_INCLUDE_DIRS})

#add_executable(yolov5 ${PROJECT_SOURCE_DIR}/calibrator.cpp ${PROJECT_SOURCE_DIR}/main.cpp)
add_executable(v5lite ${PROJECT_SOURCE_DIR}/calibrator.cpp ${PROJECT_SOURCE_DIR}/v5lite.cpp)
target_link_libraries(v5lite nvinfer)
target_link_libraries(v5lite cudart)
target_link_libraries(v5lite myplugins)
target_link_libraries(v5lite ${OpenCV_LIBS})

# Extra compiler flags on UNIX platforms.
if(UNIX)
add_definitions(-O2 -pthread)
endif(UNIX)
NMS_THRESH = 0.4; static constexpr float CONF_THRESH = 0.45; static constexpr int BATCH_SIZE = 1; const char* INPUT_BLOB_NAME = "data"; const char* OUTPUT_BLOB_NAME = "prob"; ``` ## 1. Generate .wts from .pt This step must be performed inside the `yolov5-lite` folder: ```bash cd yolov5-lite git clone https://gitcode.com/open-source-toolkit/ac70a.git unzip your zip file python gen_wts.py -w v5lite-s.pt -o v5lite-s.wts python gen_wts.py -w v5lite-e.pt -o v5lite-e.wts python gen_wts.py -w v5lite-g.pt -o v5lite-g.wts ``` ## 2. Build the engine and run inference ### Build steps a. First, set `CLASS_NUM` in `include/yololayer.h` to match your dataset class count — this is important, otherwise you will get errors. b. Run the following commands: ```bash mkdir build cd build cmake .. make ``` ### Generate engine files ```bash ./v5lite -s ../v5lite-s.wts v5lite-s.engine s ./v5lite -s ../v5lite-g.wts v5lite-g.engine g ./v5lite -s ../v5lite-e.wts v5lite-e.engine e ./v5lite -s ../v5lite-c.wts v5lite-c.engine c ``` ### Using the engine for inference (`samples` is the folder containing your images): ```bash ./v5lite -d v5lite-s.engine ../samples ``` You can also use `yolov5-lite-trt.py` (in the repository root) for inference. ## 3. INT8 Quantization ### Preparation 1. Collect calibration images (recommended ~1000 images) 2. Put the images in a calibration folder (for example: `tensorrtx-int8calib-data/coco_calib`) 3. Modify the macro in [v5lite.cpp](yolov5-lite/v5lite.cpp): Change: ```cpp // #define USE_FP16 // set USE_INT8 or USE_FP16 or USE_FP32 // #define USE_INT8 // set USE_INT8 or USE_FP16 or USE_FP32 ``` To: ```cpp // #define USE_FP16 // set USE_INT8 or USE_FP16 or USE_FP32 #define USE_INT8 // set USE_INT8 or USE_FP16 or USE_FP32 ``` 4. Update the data path in the code to point to your calibration images 5. 
Rebuild and generate the engine, then run inference (repeat step 2) ## Notes - In practice, calling the engine from Python may produce better inference behavior in some cases. ================================================ FILE: yolov5-lite/calibrator.cpp ================================================ #include #include #include #include #include "calibrator.h" #include "cuda_utils.h" #include "utils.h" Int8EntropyCalibrator2::Int8EntropyCalibrator2(int batchsize, int input_w, int input_h, const char* img_dir, const char* calib_table_name, const char* input_blob_name, bool read_cache) : batchsize_(batchsize) , input_w_(input_w) , input_h_(input_h) , img_idx_(0) , img_dir_(img_dir) , calib_table_name_(calib_table_name) , input_blob_name_(input_blob_name) , read_cache_(read_cache) { input_count_ = 3 * input_w * input_h * batchsize; CUDA_CHECK(cudaMalloc(&device_input_, input_count_ * sizeof(float))); read_files_in_dir(img_dir, img_files_); } Int8EntropyCalibrator2::~Int8EntropyCalibrator2() { CUDA_CHECK(cudaFree(device_input_)); } int Int8EntropyCalibrator2::getBatchSize() const TRT_NOEXCEPT { return batchsize_; } bool Int8EntropyCalibrator2::getBatch(void* bindings[], const char* names[], int nbBindings) TRT_NOEXCEPT { if (img_idx_ + batchsize_ > (int)img_files_.size()) { return false; } std::vector input_imgs_; for (int i = img_idx_; i < img_idx_ + batchsize_; i++) { std::cout << img_files_[i] << " " << i << std::endl; cv::Mat temp = cv::imread(img_dir_ + img_files_[i]); if (temp.empty()){ std::cerr << "Fatal error: image cannot open!" 
<< std::endl; return false; } cv::Mat pr_img = preprocess_img(temp, input_w_, input_h_); input_imgs_.push_back(pr_img); } img_idx_ += batchsize_; cv::Mat blob = cv::dnn::blobFromImages(input_imgs_, 1.0 / 255.0, cv::Size(input_w_, input_h_), cv::Scalar(0, 0, 0), true, false); CUDA_CHECK(cudaMemcpy(device_input_, blob.ptr(0), input_count_ * sizeof(float), cudaMemcpyHostToDevice)); assert(!strcmp(names[0], input_blob_name_)); bindings[0] = device_input_; return true; } const void* Int8EntropyCalibrator2::readCalibrationCache(size_t& length) TRT_NOEXCEPT { std::cout << "reading calib cache: " << calib_table_name_ << std::endl; calib_cache_.clear(); std::ifstream input(calib_table_name_, std::ios::binary); input >> std::noskipws; if (read_cache_ && input.good()) { std::copy(std::istream_iterator(input), std::istream_iterator(), std::back_inserter(calib_cache_)); } length = calib_cache_.size(); return length ? calib_cache_.data() : nullptr; } void Int8EntropyCalibrator2::writeCalibrationCache(const void* cache, size_t length) TRT_NOEXCEPT { std::cout << "writing calib cache: " << calib_table_name_ << " size: " << length << std::endl; std::ofstream output(calib_table_name_, std::ios::binary); output.write(reinterpret_cast(cache), length); } ================================================ FILE: yolov5-lite/common.hpp ================================================ #ifndef YOLOV5_COMMON_H_ #define YOLOV5_COMMON_H_ #include #include #include #include #include #include "NvInfer.h" #include "yololayer.h" using namespace nvinfer1; cv::Rect get_rect(cv::Mat& img, float bbox[4]) { int l, r, t, b; float r_w = Yolo::INPUT_W / (img.cols * 1.0); float r_h = Yolo::INPUT_H / (img.rows * 1.0); if (r_h > r_w) { l = bbox[0] - bbox[2] / 2.f; r = bbox[0] + bbox[2] / 2.f; t = bbox[1] - bbox[3] / 2.f - (Yolo::INPUT_H - r_w * img.rows) / 2; b = bbox[1] + bbox[3] / 2.f - (Yolo::INPUT_H - r_w * img.rows) / 2; l = l / r_w; r = r / r_w; t = t / r_w; b = b / r_w; } else { l = bbox[0] - bbox[2] / 
2.f - (Yolo::INPUT_W - r_h * img.cols) / 2; r = bbox[0] + bbox[2] / 2.f - (Yolo::INPUT_W - r_h * img.cols) / 2; t = bbox[1] - bbox[3] / 2.f; b = bbox[1] + bbox[3] / 2.f; l = l / r_h; r = r / r_h; t = t / r_h; b = b / r_h; } return cv::Rect(l, t, r - l, b - t); } float iou(float lbox[4], float rbox[4]) { float interBox[] = { (std::max)(lbox[0] - lbox[2] / 2.f , rbox[0] - rbox[2] / 2.f), //left (std::min)(lbox[0] + lbox[2] / 2.f , rbox[0] + rbox[2] / 2.f), //right (std::max)(lbox[1] - lbox[3] / 2.f , rbox[1] - rbox[3] / 2.f), //top (std::min)(lbox[1] + lbox[3] / 2.f , rbox[1] + rbox[3] / 2.f), //bottom }; if (interBox[2] > interBox[3] || interBox[0] > interBox[1]) { std::cout << "The data is questionable!" << std::endl; return 0.0f; } float interBoxS = (interBox[1] - interBox[0])*(interBox[3] - interBox[2]); return interBoxS / (lbox[2] * lbox[3] + rbox[2] * rbox[3] - interBoxS); } bool cmp(const Yolo::Detection& a, const Yolo::Detection& b) { return a.conf > b.conf; } void nms(std::vector& res, float *output, float conf_thresh, float nms_thresh = 0.5) { int det_size = sizeof(Yolo::Detection) / sizeof(float); std::map> m; for (int i = 0; i < output[0] && i < Yolo::MAX_OUTPUT_BBOX_COUNT; i++) { if (output[1 + det_size * i + 4] <= conf_thresh) continue; Yolo::Detection det; memcpy(&det, &output[1 + det_size * i], det_size * sizeof(float)); if (m.count(det.class_id) == 0) m.emplace(det.class_id, std::vector()); m[det.class_id].push_back(det); } for (auto it = m.begin(); it != m.end(); it++) { //std::cout << it->second[0].class_id << " --- " << std::endl; auto& dets = it->second; std::sort(dets.begin(), dets.end(), cmp); for (size_t m = 0; m < dets.size(); ++m) { auto& item = dets[m]; res.push_back(item); for (size_t n = m + 1; n < dets.size(); ++n) { if (iou(item.bbox, dets[n].bbox) > nms_thresh) { dets.erase(dets.begin() + n); --n; } } } } } // TensorRT weight files have a simple space delimited format: // [type] [size] std::map loadWeights(const std::string file) { 
std::cout << "Loading weights: " << file << std::endl; std::map weightMap; // Open weights file std::ifstream input(file); assert(input.is_open() && "Unable to load weight file. please check if the .wts file path is right!!!!!!"); int32_t count; input >> count; assert(count > 0 && "Invalid weight map file."); while (count--) { /* class Weights { public: DataType type; //!< The type of the weights. void const* values; //!< The weight values, in a contiguous array. int64_t count; //!< The number of weights in the array. }; */ Weights wt{ DataType::kFLOAT, nullptr, 0 }; uint32_t size; std::string name; input >> name >> std::dec >> size; wt.type = DataType::kFLOAT; uint32_t* val = reinterpret_cast(malloc(sizeof(val) * size)); for (uint32_t x = 0, y = size; x < y; ++x) { input >> std::hex >> val[x]; } wt.values = val; wt.count = size; weightMap[name] = wt; } // for (auto it = weightMap.begin(); it != weightMap.end(); it++) { // std::cout << "========= keys: " << it -> first << " =================" << std::endl; // } return weightMap; } nvinfer1::IScaleLayer* addBatchNorm2d(INetworkDefinition *network, std::map& weightMap, ITensor& input, std::string lname, float eps) { float *gamma = (float*)weightMap[lname + ".weight"].values; float *beta = (float*)weightMap[lname + ".bias"].values; float *mean = (float*)weightMap[lname + ".running_mean"].values; float *var = (float*)weightMap[lname + ".running_var"].values; int len = weightMap[lname + ".running_var"].count; float *scval = reinterpret_cast(malloc(sizeof(float) * len)); // gamma / sqrt(running_var + eps) for (int i = 0; i < len; i++) { scval[i] = gamma[i] / sqrt(var[i] + eps); } Weights scale{ DataType::kFLOAT, scval, len }; float *shval = reinterpret_cast(malloc(sizeof(float) * len)); for (int i = 0; i < len; i++) { shval[i] = beta[i] - mean[i] * gamma[i] / sqrt(var[i] + eps); } Weights shift{ DataType::kFLOAT, shval, len }; float *pval = reinterpret_cast(malloc(sizeof(float) * len)); for (int i = 0; i < len; i++) { 
pval[i] = 1.0; } Weights power{ DataType::kFLOAT, pval, len }; weightMap[lname + ".scale"] = scale; weightMap[lname + ".shift"] = shift; weightMap[lname + ".power"] = power; IScaleLayer* scale_1 = network->addScale(input, ScaleMode::kCHANNEL, shift, scale, power); assert(scale_1); return scale_1; } nvinfer1::IPoolingLayer *conv_bn_relu_maxpool(nvinfer1::INetworkDefinition *network, std::map & weightMap, nvinfer1::ITensor &input, int outch, std::string lname){ nvinfer1::Weights emptywts{nvinfer1::DataType::kFLOAT, nullptr, 0}; nvinfer1::IConvolutionLayer *conv0 = network->addConvolutionNd(input, outch, nvinfer1::DimsHW{3, 3}, weightMap[lname + "conv.0.weight"], emptywts); conv0->setStrideNd(nvinfer1::DimsHW{2, 2}); conv0->setPaddingNd(nvinfer1::DimsHW{1, 1}); nvinfer1::IScaleLayer * bn1 = addBatchNorm2d(network, weightMap, *conv0->getOutput(0), lname + "conv.1", 1e-3); auto Relu = network->addActivation(*bn1->getOutput(0), nvinfer1::ActivationType::kRELU); assert(Relu); IPoolingLayer *pool = network->addPoolingNd(*Relu->getOutput(0), nvinfer1::PoolingType::kMAX, nvinfer1::DimsHW{3, 3}); pool->setStrideNd(nvinfer1::DimsHW{2, 2}); pool->setPaddingNd(nvinfer1::DimsHW{1, 1}); assert(pool); return pool; } nvinfer1::IElementWiseLayer *HardSwish(nvinfer1::INetworkDefinition *network, nvinfer1::ITensor &input){ auto hsig = network->addActivation(input, ActivationType::kHARD_SIGMOID); hsig->setAlpha(1.0 / 6.0); hsig->setBeta(0.5); auto ew = network->addElementWise(input, *hsig->getOutput(0), ElementWiseOperation::kPROD); return ew; } nvinfer1::IElementWiseLayer *CBH(nvinfer1::INetworkDefinition *network, std::map &weightMap, nvinfer1::ITensor &input, int num_filters, int filter_size, int stride, std::string lname, int num_groups=1){ int pad = (filter_size - 1) / 2; nvinfer1::Weights emptywts {nvinfer1::DataType::kFLOAT, nullptr, 0}; nvinfer1::IConvolutionLayer *conv = network->addConvolutionNd(input, num_filters, nvinfer1::DimsHW{filter_size, filter_size}, weightMap[lname + 
".conv.weight"], emptywts); conv->setStrideNd(nvinfer1::DimsHW{stride, stride}); conv->setPaddingNd(nvinfer1::DimsHW{pad, pad}); conv->setNbGroups(num_groups); nvinfer1::IScaleLayer *bn = addBatchNorm2d(network, weightMap, *conv->getOutput(0), lname + ".bn", 1e-3); nvinfer1::IElementWiseLayer *hash = HardSwish(network, *bn->getOutput(0)); nvinfer1::Dims dims = hash->getOutput(0)->getDimensions(); return hash; } nvinfer1::IElementWiseLayer *SiLU(nvinfer1::INetworkDefinition* network, nvinfer1::ITensor& input) { // Create Sigmoid activation layer nvinfer1::IActivationLayer *sig = network->addActivation(input, ActivationType::kSIGMOID); nvinfer1::IElementWiseLayer *mul = network->addElementWise(input, *sig->getOutput(0), nvinfer1::ElementWiseOperation::kPROD); return mul; } nvinfer1::IElementWiseLayer *LC_SEModule(nvinfer1::INetworkDefinition *network, std::map &weightMap, nvinfer1::ITensor &input, int in_channels, std::string lname, int reduction=4){ nvinfer1::IIdentityLayer *identity = network->addIdentity(input); nvinfer1::IReduceLayer *avg_pool = network->addReduce(input, nvinfer1::ReduceOperation::kAVG, (1 << 1) | (1 << 2), true); nvinfer1::IConvolutionLayer *conv1 = network->addConvolutionNd(*avg_pool->getOutput(0), in_channels / reduction, nvinfer1::DimsHW{1, 1}, weightMap[lname + ".conv1.weight"], weightMap[lname + ".conv1.bias"]); nvinfer1::IActivationLayer *relu = network->addActivation(*conv1->getOutput(0), nvinfer1::ActivationType::kRELU); nvinfer1::IConvolutionLayer *conv2 = network->addConvolutionNd(*relu->getOutput(0), in_channels, nvinfer1::DimsHW{1, 1}, weightMap[lname + ".conv2.weight"], weightMap[lname + ".conv2.bias"]); nvinfer1::IElementWiseLayer *silu = SiLU(network, *conv2->getOutput(0)); nvinfer1::IElementWiseLayer *out = network->addElementWise(*silu->getOutput(0), *identity->getOutput(0), nvinfer1::ElementWiseOperation::kPROD); return out; } nvinfer1::IElementWiseLayer *LC_Block(nvinfer1::INetworkDefinition *network, std::map &weightMap, 
// Bias-free convolution followed by HardSwish.
//
// network     : network definition the layers are added to
// weightMap   : weights keyed by name; reads lname + ".dense_conv.weight"
// input       : input tensor
// num_filters : number of output channels
// filter_size : square kernel size (stride and padding are left at the
//               layer defaults; nothing is set here)
// lname       : weight-name prefix of this block
//
// Returns the element-wise layer produced by HardSwish.
//
// Changes: the convolution layer is null-checked before use, and an unused
// local holding the output dimensions was removed.
nvinfer1::IElementWiseLayer *Dense(nvinfer1::INetworkDefinition *network, std::map<std::string, nvinfer1::Weights> &weightMap, nvinfer1::ITensor &input, int num_filters, int filter_size, std::string lname){
    nvinfer1::Weights emptywts{nvinfer1::DataType::kFLOAT, nullptr, 0};
    nvinfer1::IConvolutionLayer *dense_conv = network->addConvolutionNd(input, num_filters, nvinfer1::DimsHW{filter_size, filter_size} , weightMap[lname + ".dense_conv.weight"], emptywts);
    assert(dense_conv);

    nvinfer1::IElementWiseLayer *hash = HardSwish(network, *dense_conv->getOutput(0));
    assert(hash);
    return hash;
}
network->addActivation(*bn1->getOutput(0), ActivationType::kSIGMOID); assert(sig); auto ew = network->addElementWise(*bn1->getOutput(0), *sig->getOutput(0), ElementWiseOperation::kPROD); assert(ew); return ew; } nvinfer1::IShuffleLayer* shuffle_block(INetworkDefinition *network, std::map& weightMap, ITensor& input, std::string lname, int inch, int outch, int s) { Weights emptywts{DataType::kFLOAT, nullptr, 0}; int branch_features = outch / 2; ITensor *x1, *x2i, *x2o; if (s > 1) { IConvolutionLayer* conv1 = network->addConvolutionNd(input, inch, DimsHW{3, 3}, weightMap[lname + "branch1.0.weight"], emptywts); assert(conv1); conv1->setStrideNd(DimsHW{s, s}); conv1->setPaddingNd(DimsHW{1, 1}); conv1->setNbGroups(inch); IScaleLayer *bn1 = addBatchNorm2d(network, weightMap, *conv1->getOutput(0), lname + "branch1.1", 1e-5); IConvolutionLayer* conv2 = network->addConvolutionNd(*bn1->getOutput(0), branch_features, DimsHW{1, 1}, weightMap[lname + "branch1.2.weight"], emptywts); assert(conv2); IScaleLayer *bn2 = addBatchNorm2d(network, weightMap, *conv2->getOutput(0), lname + "branch1.3", 1e-5); IActivationLayer* relu1 = network->addActivation(*bn2->getOutput(0), ActivationType::kRELU); assert(relu1); x1 = relu1->getOutput(0); x2i = &input; } else { Dims d = input.getDimensions(); ISliceLayer *s1 = network->addSlice(input, Dims3{ 0, 0, 0 }, Dims3{ d.d[0] / 2, d.d[1], d.d[2] }, Dims3{ 1, 1, 1 }); ISliceLayer *s2 = network->addSlice(input, Dims3{ d.d[0] / 2, 0, 0 }, Dims3{ d.d[0] / 2, d.d[1], d.d[2] }, Dims3{ 1, 1, 1 }); x1 = s1->getOutput(0); x2i = s2->getOutput(0); } IConvolutionLayer* conv3 = network->addConvolutionNd(*x2i, branch_features, DimsHW{1, 1}, weightMap[lname + "branch2.0.weight"], emptywts); assert(conv3); IScaleLayer *bn3 = addBatchNorm2d(network, weightMap, *conv3->getOutput(0), lname + "branch2.1", 1e-5); IActivationLayer* relu2 = network->addActivation(*bn3->getOutput(0), ActivationType::kRELU); assert(relu2); IConvolutionLayer* conv4 = 
network->addConvolutionNd(*relu2->getOutput(0), branch_features, DimsHW{3, 3}, weightMap[lname + "branch2.3.weight"], emptywts); assert(conv4); conv4->setStrideNd(DimsHW{s, s}); conv4->setPaddingNd(DimsHW{1, 1}); conv4->setNbGroups(branch_features); IScaleLayer *bn4 = addBatchNorm2d(network, weightMap, *conv4->getOutput(0), lname + "branch2.4", 1e-5); IConvolutionLayer* conv5 = network->addConvolutionNd(*bn4->getOutput(0), branch_features, DimsHW{1, 1}, weightMap[lname + "branch2.5.weight"], emptywts); assert(conv5); IScaleLayer *bn5 = addBatchNorm2d(network, weightMap, *conv5->getOutput(0), lname + "branch2.6", 1e-5); IActivationLayer* relu3 = network->addActivation(*bn5->getOutput(0), ActivationType::kRELU); assert(relu3); ITensor* inputTensors1[] = {x1, relu3->getOutput(0)}; IConcatenationLayer* cat1 = network->addConcatenation(inputTensors1, 2); assert(cat1); Dims dims = cat1->getOutput(0)->getDimensions(); std::cout << cat1->getOutput(0)->getName() << " dims: "; for (int i = 0; i < dims.nbDims; i++) { std::cout << dims.d[i] << ", "; } std::cout << std::endl; IShuffleLayer *sf1 = network->addShuffle(*cat1->getOutput(0)); assert(sf1); sf1->setReshapeDimensions(Dims4(2, dims.d[0] / 2, dims.d[1], dims.d[2])); sf1->setSecondTranspose(Permutation{1, 0, 2, 3}); Dims dims1 = sf1->getOutput(0)->getDimensions(); std::cout << sf1->getOutput(0)->getName() << " dims: "; for (int i = 0; i < dims1.nbDims; i++) { std::cout << dims1.d[i] << ", "; } std::cout << std::endl; IShuffleLayer *sf2 = network->addShuffle(*sf1->getOutput(0)); assert(sf2); sf2->setReshapeDimensions(Dims3(dims.d[0], dims.d[1], dims.d[2])); Dims dims2 = sf2->getOutput(0)->getDimensions(); std::cout << sf2->getOutput(0)->getName() << " dims: "; for (int i = 0; i < dims2.nbDims; i++) { std::cout << dims2.d[i] << ", "; } std::cout << std::endl; return sf2; } nvinfer1::IElementWiseLayer* SPP(INetworkDefinition *network, std::map& weightMap, ITensor& input, int c1, int c2, int k1, int k2, int k3, std::string 
lname) {
    // SPP: a convBlock producing c1 / 2 channels, three stride-1 max-pools over its output
    // (kernel sizes k1, k2, k3, padding k / 2 each), concatenation of the four tensors, and a
    // final convBlock producing c2 channels. Sub-layers use weights lname + ".cv1" / ".cv2".
    int c_ = c1 / 2;
    auto cv1 = convBlock(network, weightMap, input, c_, 1, 1, 1, lname + ".cv1");
    auto pool1 = network->addPoolingNd(*cv1->getOutput(0), PoolingType::kMAX, DimsHW{ k1, k1 });
    pool1->setPaddingNd(DimsHW{ k1 / 2, k1 / 2 });
    pool1->setStrideNd(DimsHW{ 1, 1 });
    auto pool2 = network->addPoolingNd(*cv1->getOutput(0), PoolingType::kMAX, DimsHW{ k2, k2 });
    pool2->setPaddingNd(DimsHW{ k2 / 2, k2 / 2 });
    pool2->setStrideNd(DimsHW{ 1, 1 });
    auto pool3 = network->addPoolingNd(*cv1->getOutput(0), PoolingType::kMAX, DimsHW{ k3, k3 });
    pool3->setPaddingNd(DimsHW{ k3 / 2, k3 / 2 });
    pool3->setStrideNd(DimsHW{ 1, 1 });
    ITensor* inputTensors[] = { cv1->getOutput(0), pool1->getOutput(0), pool2->getOutput(0), pool3->getOutput(0) };
    auto cat = network->addConcatenation(inputTensors, 4);
    auto cv2 = convBlock(network, weightMap, *cat->getOutput(0), c2, 1, 1, 1, lname + ".cv2");
    return cv2;
}

// Two chained convBlocks (".cv1" producing int(c2 * e) channels, ".cv2" producing c2 channels).
// When `shortcut` is set and c1 == c2, the block input is added element-wise to the ".cv2"
// output and that sum layer is returned; otherwise the ".cv2" layer is returned.
nvinfer1::IElementWiseLayer* bottleneck(INetworkDefinition *network, std::map& weightMap, ITensor& input, int c1, int c2, bool shortcut, int g, float e, std::string lname) {
    auto cv1 = convBlock(network, weightMap, input, (int)((float)c2 * e), 1, 1, 1, lname + ".cv1");
    auto cv2 = convBlock(network, weightMap, *cv1->getOutput(0), c2, 3, 1, g, lname + ".cv2");
    if (shortcut && c1 == c2) {
        auto ew = network->addElementWise(input, *cv2->getOutput(0), ElementWiseOperation::kSUM);
        return ew;
    }
    return cv2;
}

// Two-path block with hidden width c_ = int(c2 * e):
//   path A: convBlock ".cv1" -> n bottlenecks ".m.<i>" -> bias-free 1x1 conv ".cv3"
//   path B: bias-free 1x1 conv ".cv2" applied directly to the input
// The paths are concatenated, passed through batch norm ".bn" (eps 1e-4) and LeakyReLU
// (alpha 0.1), then through convBlock ".cv4" producing c2 channels.
nvinfer1::IElementWiseLayer* bottleneckCSP(INetworkDefinition *network, std::map& weightMap, ITensor& input, int c1, int c2, int n, bool shortcut, int g, float e, std::string lname) {
    Weights emptywts{ DataType::kFLOAT, nullptr, 0 };
    int c_ = (int)((float)c2 * e);
    auto cv1 = convBlock(network, weightMap, input, c_, 1, 1, 1, lname + ".cv1");
    auto cv2 = network->addConvolutionNd(input, c_, DimsHW{ 1, 1 }, weightMap[lname + ".cv2.weight"], emptywts);
    ITensor* y1 = cv1->getOutput(0);
    for (int i = 0; i < n; i++) {
        // Each bottleneck keeps c_ channels and uses expansion 1.0.
        auto b = bottleneck(network, weightMap, *y1, c_, c_, shortcut, g, 1.0, lname + ".m." + std::to_string(i));
        y1 = b->getOutput(0);
    }
    auto cv3 = network->addConvolutionNd(*y1, c_, DimsHW{ 1, 1 }, weightMap[lname + ".cv3.weight"], emptywts);
    ITensor* inputTensors[] = { cv3->getOutput(0), cv2->getOutput(0) };
    auto cat = network->addConcatenation(inputTensors, 2);
    IScaleLayer* bn = addBatchNorm2d(network, weightMap, *cat->getOutput(0), lname + ".bn", 1e-4);
    auto lr = network->addActivation(*bn->getOutput(0), ActivationType::kLEAKY_RELU);
    lr->setAlpha(0.1);
    auto cv4 = convBlock(network, weightMap, *lr->getOutput(0), c2, 1, 1, 1, lname + ".cv4");
    return cv4;
}

// Two-path block with hidden width c_ = int(c2 * e):
//   path A: convBlock ".cv1" -> n bottlenecks ".m.<i>"
//   path B: convBlock ".cv2" applied directly to the input
// The paths are concatenated and passed through convBlock ".cv3" producing c2 channels.
// c1 is not read by this function.
nvinfer1::IElementWiseLayer* C3(INetworkDefinition *network, std::map& weightMap, ITensor& input, int c1, int c2, int n, bool shortcut, int g, float e, std::string lname) {
    int c_ = (int)((float)c2 * e);
    auto cv1 = convBlock(network, weightMap, input, c_, 1, 1, 1, lname + ".cv1");
    auto cv2 = convBlock(network, weightMap, input, c_, 1, 1, 1, lname + ".cv2");
    ITensor *y1 = cv1->getOutput(0);
    for (int i = 0; i < n; i++) {
        auto b = bottleneck(network, weightMap, *y1, c_, c_, shortcut, g, 1.0, lname + ".m." + std::to_string(i));
        y1 = b->getOutput(0);
    }
    ITensor* inputTensors[] = { y1, cv2->getOutput(0) };
    auto cat = network->addConcatenation(inputTensors, 2);
    auto cv3 = convBlock(network, weightMap, *cat->getOutput(0), c2, 1, 1, 1, lname + ".cv3");
    return cv3;
}

// Bias-free square convolution (weights lname + ".conv.weight") with the given stride, padding
// and group count, followed by batch norm lname + ".bn" (eps 1e-5). Returns the batch-norm layer.
nvinfer1::IScaleLayer *conv_bn(nvinfer1::INetworkDefinition *network, std::map& weightMap, nvinfer1::ITensor& input, std::string lname, int out_channels, int kernel_size, int stride, int padding, int groups=1){
    nvinfer1::Weights emptywts{nvinfer1::DataType::kFLOAT, nullptr, 0};
    nvinfer1::IConvolutionLayer *conv = network->addConvolutionNd(input, out_channels, nvinfer1::DimsHW{kernel_size, kernel_size}, weightMap[lname + ".conv.weight"], emptywts);
    conv->setStrideNd(nvinfer1::DimsHW{stride, stride});
    conv->setPaddingNd(nvinfer1::DimsHW{padding, padding});
    conv->setNbGroups(groups);
    nvinfer1::IScaleLayer *bn = addBatchNorm2d(network, weightMap, *conv->getOutput(0), lname + ".bn", 1e-5);
    return bn;
}

// Sums a kernel_size x kernel_size conv_bn branch (".rbr_dense") and a 1x1 conv_bn branch
// (".rbr_1x1", padding reduced by kernel_size / 2 so both outputs line up), then applies ReLU.
// NOTE(review): the result variable is called `silu` but the activation added is kRELU.
nvinfer1::IActivationLayer *RepVGGBlock(nvinfer1::INetworkDefinition *network, std::map& weightMap, nvinfer1::ITensor& input, std::string lname, int out_channels, int kernel_size = 3, int stride = 1, int padding = 1, int groups=1){
    nvinfer1::IScaleLayer *rbr_dense = conv_bn(network, weightMap, input, lname + ".rbr_dense", out_channels, kernel_size, stride, padding, groups);
    int padding_11 = padding - kernel_size / 2;
    nvinfer1::IScaleLayer *rbr_1x1 = conv_bn(network, weightMap, input, lname + ".rbr_1x1", out_channels, 1, stride, padding_11, groups);
    nvinfer1::IElementWiseLayer *add = network->addElementWise(*rbr_dense->getOutput(0), *rbr_1x1->getOutput(0), nvinfer1::ElementWiseOperation::kSUM);
    nvinfer1::IActivationLayer *silu = network->addActivation(*add->getOutput(0), nvinfer1::ActivationType::kRELU);
    return silu;
}

nvinfer1::IActivationLayer *DWConvblock(nvinfer1::INetworkDefinition *network, std::map& weightMap, nvinfer1::ITensor& input, std::string lname, int in_channels, int out_channels, int kernel_size, int
stride){
    // DWConvblock: grouped kernel_size x kernel_size convolution (groups = in_channels, padding
    // kernel_size / 2, the given stride) -> BN ".bn1" -> ReLU -> 1x1 convolution to out_channels
    // -> BN ".bn2" -> ReLU. Both convolutions are bias-free. Returns the final ReLU layer.
    nvinfer1::Weights emptywts {nvinfer1::DataType::kFLOAT, nullptr, 0};
    nvinfer1::IConvolutionLayer *conv1 = network->addConvolutionNd(input, in_channels, nvinfer1::DimsHW{kernel_size, kernel_size}, weightMap[lname + ".conv1.weight"], emptywts);
    conv1->setStrideNd(nvinfer1::DimsHW{stride, stride});
    // NOTE(review): prints the bare padding value to stdout on every call; looks like leftover
    // debug output.
    std::cout << (kernel_size / 2) << std::endl;
    conv1->setPaddingNd(nvinfer1::DimsHW{kernel_size / 2, kernel_size / 2});
    conv1->setNbGroups(in_channels);
    nvinfer1::IScaleLayer *bn1 = addBatchNorm2d(network, weightMap, *conv1->getOutput(0), lname + ".bn1", 1e-5);
    nvinfer1::IActivationLayer *relu1 = network->addActivation(*bn1->getOutput(0), nvinfer1::ActivationType::kRELU);
    nvinfer1::IConvolutionLayer *conv2 = network->addConvolutionNd(*relu1->getOutput(0), out_channels, nvinfer1::DimsHW{1, 1}, weightMap[lname + ".conv2.weight"], emptywts);
    conv2->setStrideNd(nvinfer1::DimsHW{1, 1});
    conv2->setPaddingNd(nvinfer1::DimsHW{0, 0});
    nvinfer1::IScaleLayer *bn2 = addBatchNorm2d(network, weightMap, *conv2->getOutput(0), lname + ".bn2", 1e-5);
    nvinfer1::IActivationLayer *relu2 = network->addActivation(*bn2->getOutput(0), nvinfer1::ActivationType::kRELU);
    return relu2;
}

// Splits the flat weight blob stored under lname + ".anchor_grid" into consecutive groups of
// Yolo::CHECK_COUNT * 2 floats and returns one vector per group. Trailing values that do not
// fill a whole group are ignored (integer division in the loop bound).
std::vector> getAnchors(std::map& weightMap, std::string lname) {
    std::vector> anchors;
    Weights wts = weightMap[lname + ".anchor_grid"];
    int anchor_len = Yolo::CHECK_COUNT * 2;  // 6 according to the original comment
    for (int i = 0; i < wts.count / anchor_len; i++) {
        auto *p = (const float*)wts.values + i * anchor_len;
        std::vector anchor(p, p + anchor_len);
        anchors.push_back(anchor);
    }
    return anchors;
}

// Takes four stride-2 slices of the input, starting at offsets (0,0), (1,0), (0,1) and (1,1) in
// the last two dimensions and each sized inch x INPUT_H/2 x INPUT_W/2, concatenates them and
// applies convBlock lname + ".conv" with `outch` outputs and kernel `ksize`.
nvinfer1::IElementWiseLayer* focus(INetworkDefinition *network, std::map& weightMap, ITensor& input, int inch, int outch, int ksize, std::string lname) {
    ISliceLayer* s1 = network->addSlice(input, Dims3{ 0, 0, 0 }, Dims3{ inch, Yolo::INPUT_H / 2, Yolo::INPUT_W / 2 }, Dims3{ 1, 2, 2 });
    ISliceLayer* s2 = network->addSlice(input, Dims3{ 0, 1, 0 }, Dims3{ inch, Yolo::INPUT_H / 2, Yolo::INPUT_W / 2 }, Dims3{ 1, 2, 2 });
    ISliceLayer* s3 =
network->addSlice(input, Dims3{ 0, 0, 1 }, Dims3{ inch, Yolo::INPUT_H / 2, Yolo::INPUT_W / 2 }, Dims3{ 1, 2, 2 }); ISliceLayer* s4 = network->addSlice(input, Dims3{ 0, 1, 1 }, Dims3{ inch, Yolo::INPUT_H / 2, Yolo::INPUT_W / 2 }, Dims3{ 1, 2, 2 }); ITensor* inputTensors[] = { s1->getOutput(0), s2->getOutput(0), s3->getOutput(0), s4->getOutput(0) }; auto cat = network->addConcatenation(inputTensors, 4); auto conv = convBlock(network, weightMap, *cat->getOutput(0), outch, ksize, 1, 1, lname + ".conv"); return conv; } nvinfer1::IElementWiseLayer *ADD(nvinfer1::INetworkDefinition *network,nvinfer1::ITensor& x1,nvinfer1::ITensor& x2, float alpha) { nvinfer1::Weights shift{nvinfer1::DataType::kFLOAT, nullptr, 0}; nvinfer1::Weights scale{nvinfer1::DataType::kFLOAT, &alpha, 1}; nvinfer1::Weights power{nvinfer1::DataType::kFLOAT, nullptr, 0}; nvinfer1::IScaleLayer* scaleLayer = network->addScale(x2, nvinfer1::ScaleMode::kUNIFORM, shift, scale, power); nvinfer1::IElementWiseLayer* addLayer = network->addElementWise(x1, *scaleLayer->getOutput(0), nvinfer1::ElementWiseOperation::kSUM); return addLayer; } IPluginV2Layer* addYoLoLayer(INetworkDefinition *network, std::map& weightMap, std::string lname, std::vector dets) { auto creator = getPluginRegistry()->getPluginCreator("YoloLayer_TRT", "1"); auto anchors = getAnchors(weightMap, lname); PluginField plugin_fields[2]; int netinfo[4] = {Yolo::CLASS_NUM, Yolo::INPUT_W, Yolo::INPUT_H, Yolo::MAX_OUTPUT_BBOX_COUNT}; plugin_fields[0].data = netinfo; plugin_fields[0].length = 4; plugin_fields[0].name = "netinfo"; plugin_fields[0].type = PluginFieldType::kFLOAT32; int scale = 8; std::vector kernels; for (size_t i = 0; i < anchors.size(); i++) { Yolo::YoloKernel kernel; kernel.width = Yolo::INPUT_W / scale; kernel.height = Yolo::INPUT_H / scale; memcpy(kernel.anchors, &anchors[i][0], anchors[i].size() * sizeof(float)); kernels.push_back(kernel); scale *= 2; } plugin_fields[1].data = &kernels[0]; plugin_fields[1].length = kernels.size(); 
plugin_fields[1].name = "kernels"; plugin_fields[1].type = PluginFieldType::kFLOAT32; PluginFieldCollection plugin_data; plugin_data.nbFields = 2; plugin_data.fields = plugin_fields; IPluginV2 *plugin_obj = creator->createPlugin("yololayer", &plugin_data); std::vector input_tensors; for (auto det: dets) { input_tensors.push_back(det->getOutput(0)); } auto yolo = network->addPluginV2(&input_tensors[0], input_tensors.size(), *plugin_obj); return yolo; } #endif ================================================ FILE: yolov5-lite/gen_wts.py ================================================ import argparse import os import struct import torch from utils.torch_utils import select_device def parse_args(): parser = argparse.ArgumentParser(description='Convert .pt file to .wts') parser.add_argument('-w', '--weights', required=True, help='Input weights (.pt) file path (required)') parser.add_argument('-o', '--output', help='Output (.wts) file path (optional)') args = parser.parse_args() if not os.path.isfile(args.weights): raise SystemExit('Invalid input file') if not args.output: args.output = os.path.splitext(args.weights)[0] + '.wts' elif os.path.isdir(args.output): args.output = os.path.join( args.output, os.path.splitext(os.path.basename(args.weights))[0] + '.wts') return args.weights, args.output pt_file, wts_file = parse_args() # Initialize device = select_device('cpu') # Load model model = torch.load(pt_file, map_location=device, weights_only=False)['model'].float() # load to FP32 model.to(device).eval() with open(wts_file, 'w') as f: # Write the number of keys in the parameter dictionary first f.write('{}\n'.format(len(model.state_dict().keys()))) for k, v in model.state_dict().items(): # Flatten matrix parameters into a 1D array vr = v.reshape(-1).cpu().numpy() # Key, number of elements in the 1D array f.write('{} {} '.format(k, len(vr))) # Values, each separated by a space for vv in vr: f.write(' ') f.write(struct.pack('>f', float(vv)).hex()) f.write('\n') 
================================================ FILE: yolov5-lite/v5lite.cpp ================================================ #include #include #include #include #include #include "cuda_utils.h" #include "logging.h" #include "common.hpp" #include "utils.h" #include "calibrator.h" // #define USE_FP16 // set USE_INT8 or USE_FP16 or USE_FP32 #define USE_INT8 // set USE_INT8 or USE_FP16 or USE_FP32 static const int OUTPUT_SIZE = Yolo::MAX_OUTPUT_BBOX_COUNT * sizeof(Yolo::Detection) / sizeof(float) + 1; // we assume the yololayer outputs no more than MAX_OUTPUT_BBOX_COUNT boxes that conf >= 0.1 static Logger gLogger; static int get_depth(int x, float gd) { if (x == 1) return 1; int r = round(x * gd); if (x * gd - int(x * gd) == 0.5 && (int(x * gd) % 2) == 0) { --r; } return std::max(r, 1); } inline int Get_channel(int x, int gw = 1, float divisor = 8.0){ // std::cout << "=======" << (x*gw) / divisor << "===============" << std::endl; auto ch_out = int(ceil((x * gw) / divisor)) * divisor; return ch_out; } nvinfer1::ICudaEngine *build_det_v5_lite_c(unsigned int maxBatchSize, nvinfer1::IBuilder *builder, nvinfer1::IBuilderConfig *config, nvinfer1::DataType dt, std::string wts_name) { nvinfer1::INetworkDefinition *network = builder->createNetworkV2(0U); nvinfer1::ITensor *data = network->addInput(Yolo::INPUT_BLOB_NAME, dt, nvinfer1::Dims3{3, Yolo::INPUT_W, Yolo::INPUT_H}); std::map weightMap = loadWeights(wts_name); // backbone nvinfer1::IElementWiseLayer *conv0 = CBH(network, weightMap, *data, Get_channel(32), 3, 2, "model.0"); nvinfer1::IElementWiseLayer *conv1 = LC_Block(network, weightMap, *conv0->getOutput(0), Get_channel(32), Get_channel(64), 2, 3, "model.1", false); nvinfer1::IElementWiseLayer *conv2 = LC_Block(network, weightMap, *conv1->getOutput(0), Get_channel(64), Get_channel(64), 1, 3, "model.2", false); nvinfer1::IElementWiseLayer *conv3 = LC_Block(network, weightMap, *conv2->getOutput(0), Get_channel(64), Get_channel(128), 2, 3, "model.3", false); 
nvinfer1::IElementWiseLayer *conv4 = LC_Block(network, weightMap, *conv3->getOutput(0), Get_channel(128), Get_channel(128), 1, 3, "model.4", false);
    nvinfer1::IElementWiseLayer *conv5 = LC_Block(network, weightMap, *conv4->getOutput(0), Get_channel(128), Get_channel(128), 1, 3, "model.5", false);
    nvinfer1::IElementWiseLayer *conv6 = LC_Block(network, weightMap, *conv5->getOutput(0), Get_channel(128), Get_channel(128), 1, 3, "model.6", false);
    nvinfer1::IElementWiseLayer *conv7 = LC_Block(network, weightMap, *conv6->getOutput(0), Get_channel(128), Get_channel(256), 2, 3, "model.7", false);
    nvinfer1::IElementWiseLayer *conv8 = LC_Block(network, weightMap, *conv7->getOutput(0), Get_channel(256), Get_channel(256), 1, 5, "model.8", false);
    nvinfer1::IElementWiseLayer *conv9 = LC_Block(network, weightMap, *conv8->getOutput(0), Get_channel(256), Get_channel(256), 1, 5, "model.9", false);
    nvinfer1::IElementWiseLayer *conv10 = LC_Block(network, weightMap, *conv9->getOutput(0), Get_channel(256), Get_channel(256), 1, 5, "model.10", false);
    nvinfer1::IElementWiseLayer *conv11 = LC_Block(network, weightMap, *conv10->getOutput(0), Get_channel(256), Get_channel(256), 1, 5, "model.11", false);
    nvinfer1::IElementWiseLayer *conv12 = LC_Block(network, weightMap, *conv11->getOutput(0), Get_channel(256), Get_channel(256), 1, 5, "model.12", false);
    nvinfer1::IElementWiseLayer *conv13 = LC_Block(network, weightMap, *conv12->getOutput(0), Get_channel(256), Get_channel(512), 2, 5, "model.13", true);
    nvinfer1::IElementWiseLayer *conv14 = LC_Block(network, weightMap, *conv13->getOutput(0), Get_channel(512), Get_channel(512), 1, 5, "model.14", true);
    nvinfer1::IElementWiseLayer *conv15 = LC_Block(network, weightMap, *conv14->getOutput(0), Get_channel(512), Get_channel(512), 1, 5, "model.15", true);
    nvinfer1::IElementWiseLayer *conv16 = LC_Block(network, weightMap, *conv15->getOutput(0), Get_channel(512), Get_channel(512), 1, 5, "model.16", true);
    nvinfer1::IElementWiseLayer *conv17 = Dense(network, weightMap, *conv16->getOutput(0), Get_channel(512), 1, "model.17");

    // neck
    // Resize factors per dimension: first dimension unchanged, last two doubled.
    float scale[] = {1.0, 2.0, 2.0};
    nvinfer1::IElementWiseLayer *conv18 = convBlock(network, weightMap, *conv17->getOutput(0), Get_channel(256), 1, 1, 1, "model.18");
    // No resize mode is set on the upsample layers of this builder, so the layer default applies.
    nvinfer1::IResizeLayer *upsample19 = network->addResize(*conv18->getOutput(0));
    upsample19->setScales(scale, 3);
    nvinfer1::ITensor *inputTensors20[] = {upsample19->getOutput(0), conv12->getOutput(0)};  // 256 + 256 = 512
    nvinfer1::IConcatenationLayer *cat20 = network->addConcatenation(inputTensors20, 2);
    nvinfer1::IElementWiseLayer *conv21 = C3(network, weightMap, *cat20->getOutput(0), 512, Get_channel(256), get_depth(1, 1), false, 1, 0.5, "model.21");
    nvinfer1::IElementWiseLayer *conv22 = convBlock(network, weightMap, *conv21->getOutput(0), Get_channel(128), 1, 1, 1, "model.22");
    nvinfer1::IResizeLayer *upsample23 = network->addResize(*conv22->getOutput(0));
    upsample23->setScales(scale, 3);
    nvinfer1::ITensor *inputTensors24[] = {upsample23->getOutput(0), conv6->getOutput(0)};  // 128 + 128 = 256
    nvinfer1::IConcatenationLayer *cat24 = network->addConcatenation(inputTensors24, 2);
    nvinfer1::IElementWiseLayer *conv25 = C3(network, weightMap, *cat24->getOutput(0), 256, Get_channel(128), get_depth(1, 1), false, 1, 0.5, "model.25");
    nvinfer1::IElementWiseLayer *conv26 = LC_Block(network, weightMap, *conv25->getOutput(0), Get_channel(128), Get_channel(128), 2, 5, "model.26", true);
    nvinfer1::ITensor *inputTensor27[] = {conv26->getOutput(0), conv22->getOutput(0)};  // 128 + 128 = 256
    nvinfer1::IConcatenationLayer *cat27 = network->addConcatenation(inputTensor27, 2);
    nvinfer1::IElementWiseLayer *conv28 = C3(network, weightMap, *cat27->getOutput(0), 256, Get_channel(256), get_depth(1, 1), false, 1, 0.5, "model.28");
    nvinfer1::IElementWiseLayer *conv29 = LC_Block(network, weightMap, *conv28->getOutput(0), Get_channel(256), Get_channel(256), 2, 5, "model.29", true);
    nvinfer1::ITensor *inputTensor30[] = {conv29->getOutput(0), conv18->getOutput(0)};  // 256 + 256 = 512
    nvinfer1::IConcatenationLayer *cat30 = network->addConcatenation(inputTensor30, 2);
    nvinfer1::IElementWiseLayer *conv31 = C3(network, weightMap, *cat30->getOutput(0), 512, Get_channel(512), get_depth(1, 1), false, 1, 0.5, "model.31");

    // detect
    // One 1x1 convolution per scale with 3 * (CLASS_NUM + 5) outputs; the three results feed
    // the YOLO plugin layer.
    nvinfer1::IConvolutionLayer *det0 = network->addConvolutionNd(*conv25->getOutput(0), 3 * (Yolo::CLASS_NUM + 5), nvinfer1::DimsHW{1, 1}, weightMap["model.32.m.0.weight"], weightMap["model.32.m.0.bias"]);
    nvinfer1::IConvolutionLayer *det1 = network->addConvolutionNd(*conv28->getOutput(0), 3 * (Yolo::CLASS_NUM + 5), nvinfer1::DimsHW{1, 1}, weightMap["model.32.m.1.weight"], weightMap["model.32.m.1.bias"]);
    nvinfer1::IConvolutionLayer *det2 = network->addConvolutionNd(*conv31->getOutput(0), 3 * (Yolo::CLASS_NUM + 5), nvinfer1::DimsHW{1, 1}, weightMap["model.32.m.2.weight"], weightMap["model.32.m.2.bias"]);
    auto yolo = addYoLoLayer(network, weightMap, "model.32", std::vector{det0, det1, det2});
    yolo->getOutput(0)->setName(Yolo::OUTPUT_BLOB_NAME);
    network->markOutput(*yolo->getOutput(0));

    // Engine config
    builder->setMaxBatchSize(maxBatchSize);
    config->setMaxWorkspaceSize(16 * (1 << 20));  // 16MB
#if defined(USE_FP16)
    config->setFlag(BuilderFlag::kFP16);
#elif defined(USE_INT8)
    // INT8 needs platform support and a calibrator; calibration images are read from data_path.
    std::cout << "Your platform support int8: " << (builder->platformHasFastInt8() ? "true" : "false") << std::endl;
    assert(builder->platformHasFastInt8());
    config->setFlag(BuilderFlag::kINT8);
    std::string data_path = "tensorrtx-int8calib-data/coco_calib/";
    //Int8EntropyCalibrator2* calibrator = new Int8EntropyCalibrator2(1, kInputW, kInputH, "./coco_calib/", "int8calib.table", kInputTensorName);
    // NOTE(review): the calibrator is allocated with new and not deleted in this function.
    Int8EntropyCalibrator2* calibrator = new Int8EntropyCalibrator2(1, Yolo::INPUT_W, Yolo::INPUT_H, data_path.c_str(), "int8calib.table", Yolo::INPUT_BLOB_NAME);
    config->setInt8Calibrator(calibrator);
#endif

    std::cout << "Building engine, please wait for a while..."
<< std::endl; ICudaEngine* engine = builder->buildEngineWithConfig(*network, *config); std::cout << "Build engine successfully!" << std::endl; // Don't need the network any more network->destroy(); // Release host memory for (auto& mem : weightMap) { free((void*)(mem.second.values)); } return engine; } nvinfer1::ICudaEngine *build_det_v5_lite_e(unsigned int maxBatchSize, nvinfer1::IBuilder *builder, nvinfer1::IBuilderConfig *config, nvinfer1::DataType dt, std::string wts_name){ nvinfer1::INetworkDefinition *network = builder->createNetworkV2(0U); nvinfer1::ITensor *data = network->addInput(Yolo::INPUT_BLOB_NAME, dt, nvinfer1::Dims3{3, Yolo::INPUT_W, Yolo::INPUT_H}); std::map weightMap = loadWeights(wts_name); // backbone nvinfer1::IPoolingLayer *conv0 = conv_bn_relu_maxpool(network, weightMap, *data, 32, "model.0."); //32 // std::cout << "Get_channel: " << Get_channel(116) << std::endl; nvinfer1::IShuffleLayer *conv1 = shuffle_block(network, weightMap, *conv0->getOutput(0), "model.1.", 32, Get_channel(116), 2); //120 nvinfer1::IShuffleLayer *conv2_0 = shuffle_block(network, weightMap, *conv1->getOutput(0), "model.2.0.", Get_channel(116), Get_channel(116), 1); //120 nvinfer1::IShuffleLayer *conv2_1 = shuffle_block(network, weightMap, *conv2_0->getOutput(0), "model.2.1.", Get_channel(116), Get_channel(116), 1); // 120 nvinfer1::IShuffleLayer *conv2_2 = shuffle_block(network, weightMap, *conv2_1->getOutput(0), "model.2.2.", Get_channel(116), Get_channel(116), 1); // 120 nvinfer1::IShuffleLayer *conv3 = shuffle_block(network, weightMap, *conv2_2->getOutput(0), "model.3.", Get_channel(116), Get_channel(232), 2); // 232 nvinfer1::IShuffleLayer *conv4_0 = shuffle_block(network, weightMap, *conv3->getOutput(0), "model.4.0.", Get_channel(232), Get_channel(232), 1); // 232 nvinfer1::IShuffleLayer *conv4_1 = shuffle_block(network, weightMap, *conv4_0->getOutput(0), "model.4.1.", Get_channel(232), Get_channel(232), 1); // 232 nvinfer1::IShuffleLayer *conv4_2 = 
shuffle_block(network, weightMap, *conv4_1->getOutput(0), "model.4.2.", Get_channel(232), Get_channel(232), 1);  // 232
    nvinfer1::IShuffleLayer *conv4_3 = shuffle_block(network, weightMap, *conv4_2->getOutput(0), "model.4.3.", Get_channel(232), Get_channel(232), 1);  // 232
    nvinfer1::IShuffleLayer *conv4_4 = shuffle_block(network, weightMap, *conv4_3->getOutput(0), "model.4.4.", Get_channel(232), Get_channel(232), 1);  //232
    nvinfer1::IShuffleLayer *conv4_5 = shuffle_block(network, weightMap, *conv4_4->getOutput(0), "model.4.5.", Get_channel(232), Get_channel(232), 1);
    nvinfer1::IShuffleLayer *conv4_6 = shuffle_block(network, weightMap, *conv4_5->getOutput(0), "model.4.6.", Get_channel(232), Get_channel(232), 1);  // 232
    nvinfer1::IShuffleLayer *conv5 = shuffle_block(network, weightMap, *conv4_6->getOutput(0), "model.5.", Get_channel(232), Get_channel(464), 2);  //464
    nvinfer1::IShuffleLayer *conv6 = shuffle_block(network, weightMap, *conv5->getOutput(0), "model.6.", Get_channel(464), Get_channel(464), 1);  // 464

    // neck
    // Resize factors per dimension: first dimension unchanged, last two doubled.
    float scale[] = {1.0, 2.0, 2.0};
    nvinfer1::IElementWiseLayer *conv7 = convBlock(network, weightMap, *conv6->getOutput(0), Get_channel(96), 1, 1, 1, "model.7");  // 96
    nvinfer1::IResizeLayer *upsample8 = network->addResize(*conv7->getOutput(0));
    upsample8->setResizeMode(nvinfer1::ResizeMode::kNEAREST);
    upsample8->setScales(scale, 3);
    nvinfer1::ITensor *inputTensors9[] = {upsample8->getOutput(0), conv4_6->getOutput(0)};
    nvinfer1::IConcatenationLayer *cat9 = network->addConcatenation(inputTensors9, 2);  // 96 + 232 = 328
    nvinfer1::IActivationLayer *conv10 = DWConvblock(network, weightMap, *cat9->getOutput(0), "model.10", 328, Get_channel(96), 3, 1);
    nvinfer1::IElementWiseLayer *conv11 = convBlock(network, weightMap, *conv10->getOutput(0), Get_channel(96), 1, 1, 1, "model.11");  // 96
    nvinfer1::IResizeLayer *upsample12 = network->addResize(*conv11->getOutput(0));
    upsample12->setResizeMode(nvinfer1::ResizeMode::kNEAREST);
    upsample12->setScales(scale, 3);
    nvinfer1::ITensor *inputTensors13[] = {upsample12->getOutput(0), conv2_2->getOutput(0)};  // 96 + 120
    nvinfer1::IConcatenationLayer *cat13 = network->addConcatenation(inputTensors13, 2);
    nvinfer1::IActivationLayer *conv14 = DWConvblock(network, weightMap, *cat13->getOutput(0), "model.14", 216, Get_channel(96), 3, 1);
    nvinfer1::IActivationLayer *conv15 = DWConvblock(network, weightMap, *conv14->getOutput(0), "model.15", Get_channel(96), Get_channel(96), 3, 2);
    // Downsampled path is summed (factor 1.0) with the earlier same-resolution feature.
    nvinfer1::IElementWiseLayer *add16 = ADD(network, *conv15->getOutput(0), *conv11->getOutput(0), 1.0);
    nvinfer1::IActivationLayer *conv17 = DWConvblock(network, weightMap, *add16->getOutput(0), "model.17", Get_channel(96), Get_channel(96), 3, 1);
    nvinfer1::IActivationLayer *conv18 = DWConvblock(network, weightMap, *conv17->getOutput(0), "model.18", Get_channel(96), Get_channel(96), 3, 2);
    nvinfer1::IElementWiseLayer *add19 = ADD(network, *conv18->getOutput(0), *conv7->getOutput(0), 1.0);
    nvinfer1::IActivationLayer *conv20 = DWConvblock(network, weightMap, *add19->getOutput(0), "model.20", Get_channel(96), Get_channel(96), 3, 1);

    // detect
    // One 1x1 convolution per scale with 3 * (CLASS_NUM + 5) outputs; the three results feed
    // the YOLO plugin layer.
    nvinfer1::IConvolutionLayer *det0 = network->addConvolutionNd(*conv14->getOutput(0), 3 * (Yolo::CLASS_NUM + 5), nvinfer1::DimsHW{1, 1}, weightMap["model.21.m.0.weight"], weightMap["model.21.m.0.bias"]);
    nvinfer1::IConvolutionLayer *det1 = network->addConvolutionNd(*conv17->getOutput(0), 3 * (Yolo::CLASS_NUM + 5), nvinfer1::DimsHW{1, 1}, weightMap["model.21.m.1.weight"], weightMap["model.21.m.1.bias"]);
    nvinfer1::IConvolutionLayer *det2 = network->addConvolutionNd(*conv20->getOutput(0), 3 * (Yolo::CLASS_NUM + 5), nvinfer1::DimsHW{1, 1}, weightMap["model.21.m.2.weight"], weightMap["model.21.m.2.bias"]);
    auto yolo = addYoLoLayer(network, weightMap, "model.21", std::vector{det0, det1, det2});
    yolo->getOutput(0)->setName(Yolo::OUTPUT_BLOB_NAME);
    network->markOutput(*yolo->getOutput(0));

    // Engine config
    builder->setMaxBatchSize(maxBatchSize);
    config->setMaxWorkspaceSize(16 * (1 << 20));  // 16MB
#if defined(USE_FP16)
    config->setFlag(BuilderFlag::kFP16);
#elif defined(USE_INT8)
    // INT8 needs platform support and a calibrator; calibration images are read from data_path.
    std::cout << "Your platform support int8: " << (builder->platformHasFastInt8() ? "true" : "false") << std::endl;
    assert(builder->platformHasFastInt8());
    config->setFlag(BuilderFlag::kINT8);
    std::string data_path = "tensorrtx-int8calib-data/coco_calib/";
    //Int8EntropyCalibrator2* calibrator = new Int8EntropyCalibrator2(1, kInputW, kInputH, "./coco_calib/", "int8calib.table", kInputTensorName);
    // NOTE(review): the calibrator is allocated with new and not deleted in this function.
    Int8EntropyCalibrator2* calibrator = new Int8EntropyCalibrator2(1, Yolo::INPUT_W, Yolo::INPUT_H, data_path.c_str(), "int8calib.table", Yolo::INPUT_BLOB_NAME);
    config->setInt8Calibrator(calibrator);
#endif

    std::cout << "Building engine, please wait for a while..." << std::endl;
    // NOTE(review): the success message below is printed without checking `engine` for null.
    ICudaEngine* engine = builder->buildEngineWithConfig(*network, *config);
    std::cout << "Build engine successfully!" << std::endl;

    // Don't need the network any more
    network->destroy();

    // Release host memory
    for (auto& mem : weightMap) {
        free((void*)(mem.second.values));
    }
    return engine;
}

// Builds a TensorRT engine for the network assembled below from the weights in `wts_name`.
// Parameters have the same meaning as in the other build_det_v5_lite_* functions:
// builder/config drive the build, `dt` is the input tensor type, `maxBatchSize` is set on the
// builder.
nvinfer1::ICudaEngine *build_det_v5_lite_g(unsigned int maxBatchSize, nvinfer1::IBuilder *builder, nvinfer1::IBuilderConfig *config, nvinfer1::DataType dt, std::string wts_name){
    nvinfer1::INetworkDefinition* network = builder->createNetworkV2(0U);
    // backbone
    // Input is declared as 3 x INPUT_H x INPUT_W.
    nvinfer1::ITensor *data = network->addInput(Yolo::INPUT_BLOB_NAME, dt, nvinfer1::Dims3{3, Yolo::INPUT_H, Yolo::INPUT_W});
    assert(data);
    std::map weightMap = loadWeights(wts_name);
    nvinfer1::IElementWiseLayer *conv0 = focus(network, weightMap, *data, 3, Get_channel(32), 3, "model.0");  // 32
    nvinfer1::IActivationLayer *conv1 = RepVGGBlock(network, weightMap, *conv0->getOutput(0), "model.1", Get_channel(64), 3, 2, 1);  //64
    nvinfer1::IElementWiseLayer *conv2 = C3(network, weightMap, *conv1->getOutput(0), Get_channel(64), Get_channel(64), get_depth(1, 1), true, 1, 0.5, "model.2");  // 64
    nvinfer1::IActivationLayer *conv3 = RepVGGBlock(network, weightMap,
*conv2->getOutput(0), "model.3", Get_channel(128), 3, 2, 1);  // 128
    nvinfer1::IElementWiseLayer *conv4 = C3(network, weightMap, *conv3->getOutput(0), Get_channel(128), Get_channel(128), get_depth(3, 1), true, 1, 0.5, "model.4");  // 128
    nvinfer1::IActivationLayer *conv5 = RepVGGBlock(network, weightMap, *conv4->getOutput(0), "model.5", Get_channel(256), 3, 2, 1);  // 256
    nvinfer1::IElementWiseLayer *conv6 = C3(network, weightMap, *conv5->getOutput(0), Get_channel(256), Get_channel(256), get_depth(3, 1), true, 1, 0.5, "model.6");  // 256
    nvinfer1::IActivationLayer *conv7 = RepVGGBlock(network, weightMap, *conv6->getOutput(0), "model.7", Get_channel(512), 3, 2, 1);  // 512
    nvinfer1::IElementWiseLayer *conv8 = SPP(network, weightMap, *conv7->getOutput(0), Get_channel(512), Get_channel(512), 5, 9, 13, "model.8");  // 512
    nvinfer1::IElementWiseLayer *conv9 = C3(network, weightMap, *conv8->getOutput(0), Get_channel(512), Get_channel(512), get_depth(1, 1), false, 1, 0.5, "model.9");  // 512

    // Resize factors per dimension: first dimension unchanged, last two doubled.
    float scale[] = {1.0, 2.0, 2.0};
    nvinfer1::IElementWiseLayer *conv10 = convBlock(network, weightMap, *conv9->getOutput(0), Get_channel(128), 1, 1, 1, "model.10");  // 128
    nvinfer1::IResizeLayer *upsample11 = network->addResize(*conv10->getOutput(0));
    upsample11->setResizeMode(nvinfer1::ResizeMode::kNEAREST);
    upsample11->setScales(scale, 3);
    nvinfer1::ITensor *inputTensors12[] = {upsample11->getOutput(0), conv6->getOutput(0)};
    nvinfer1::IConcatenationLayer *cat12 = network->addConcatenation(inputTensors12, 2);  // 384
    nvinfer1::IElementWiseLayer *conv13 = C3(network, weightMap, *cat12->getOutput(0), 384, Get_channel(128), get_depth(3, 1), false, 1, 0.5, "model.13");
    nvinfer1::IElementWiseLayer *conv14 = convBlock(network, weightMap, *conv13->getOutput(0), Get_channel(128), 1, 1, 1, "model.14");  // 128
    nvinfer1::IResizeLayer *upsample15 = network->addResize(*conv14->getOutput(0));
    upsample15->setResizeMode(nvinfer1::ResizeMode::kNEAREST);
    upsample15->setScales(scale, 3);
    nvinfer1::ITensor *inputTensors16[] = {upsample15->getOutput(0), conv4->getOutput(0)};  // 128+128
    nvinfer1::IConcatenationLayer *cat16 = network->addConcatenation(inputTensors16, 2);
    nvinfer1::IElementWiseLayer *conv17 = C3(network, weightMap, *cat16->getOutput(0), 256, Get_channel(128), get_depth(3, 1), false, 1, 0.5, "model.17");
    nvinfer1::IElementWiseLayer *conv18 = convBlock(network, weightMap, *conv17->getOutput(0), Get_channel(128), 3, 2, 1, "model.18");  // 128
    nvinfer1::ITensor *inputTensors19[] = {conv18->getOutput(0), conv14->getOutput(0)};
    nvinfer1::IConcatenationLayer *cat19 = network->addConcatenation(inputTensors19, 2);  // 128 + 128
    nvinfer1::IElementWiseLayer *conv20 = C3(network, weightMap, *cat19->getOutput(0), 256, Get_channel(128), get_depth(3, 1), false, 1, 0.5, "model.20");
    nvinfer1::IElementWiseLayer *conv21 = convBlock(network, weightMap, *conv20->getOutput(0), Get_channel(128), 3, 2, 1, "model.21");  // 128
    nvinfer1::ITensor *inputTensors22[] = {conv21->getOutput(0), conv10->getOutput(0)};
    nvinfer1::IConcatenationLayer *cat22 = network->addConcatenation(inputTensors22, 2);  // 128 + 128
    nvinfer1::IElementWiseLayer *conv23 = C3(network, weightMap, *cat22->getOutput(0), 256, Get_channel(128), get_depth(3, 1), false, 1, 0.5, "model.23");

    // detect
    // One 1x1 convolution per scale with 3 * (CLASS_NUM + 5) outputs; the three results feed
    // the YOLO plugin layer.
    nvinfer1::IConvolutionLayer *det0 = network->addConvolutionNd(*conv17->getOutput(0), 3 * (Yolo::CLASS_NUM + 5), nvinfer1::DimsHW{1, 1}, weightMap["model.24.m.0.weight"], weightMap["model.24.m.0.bias"]);
    nvinfer1::IConvolutionLayer *det1 = network->addConvolutionNd(*conv20->getOutput(0), 3 * (Yolo::CLASS_NUM + 5), nvinfer1::DimsHW{1, 1}, weightMap["model.24.m.1.weight"], weightMap["model.24.m.1.bias"]);
    nvinfer1::IConvolutionLayer *det2 = network->addConvolutionNd(*conv23->getOutput(0), 3 * (Yolo::CLASS_NUM + 5), nvinfer1::DimsHW{1, 1}, weightMap["model.24.m.2.weight"], weightMap["model.24.m.2.bias"]);
    auto yolo = addYoLoLayer(network, weightMap, "model.24", std::vector{det0, det1, det2});
    yolo->getOutput(0)->setName(Yolo::OUTPUT_BLOB_NAME);
    network->markOutput(*yolo->getOutput(0));

    // Engine config
    builder->setMaxBatchSize(maxBatchSize);
    config->setMaxWorkspaceSize(16 * (1 << 20));  // 16MB
#if defined(USE_FP16)
    config->setFlag(BuilderFlag::kFP16);
#elif defined(USE_INT8)
    // INT8 needs platform support and a calibrator; calibration images are read from data_path.
    std::cout << "Your platform support int8: " << (builder->platformHasFastInt8() ? "true" : "false") << std::endl;
    assert(builder->platformHasFastInt8());
    config->setFlag(BuilderFlag::kINT8);
    std::string data_path = "tensorrtx-int8calib-data/coco_calib/";
    //Int8EntropyCalibrator2* calibrator = new Int8EntropyCalibrator2(1, kInputW, kInputH, "./coco_calib/", "int8calib.table", kInputTensorName);
    // NOTE(review): the calibrator is allocated with new and not deleted in this function.
    Int8EntropyCalibrator2* calibrator = new Int8EntropyCalibrator2(1, Yolo::INPUT_W, Yolo::INPUT_H, data_path.c_str(), "int8calib.table", Yolo::INPUT_BLOB_NAME);
    config->setInt8Calibrator(calibrator);
#endif

    std::cout << "Building engine, please wait for a while..." << std::endl;
    // NOTE(review): the success message below is printed without checking `engine` for null.
    ICudaEngine* engine = builder->buildEngineWithConfig(*network, *config);
    std::cout << "Build engine successfully!" << std::endl;

    // Don't need the network any more
    network->destroy();

    // Release host memory
    for (auto& mem : weightMap) {
        free((void*)(mem.second.values));
    }
    return engine;
}

nvinfer1::ICudaEngine *build_det_v5_lite_s(unsigned int maxBatchSize, nvinfer1::IBuilder *builder, nvinfer1::IBuilderConfig *config, nvinfer1::DataType dt,std::string & wts_name){
    // backbone
    nvinfer1::INetworkDefinition* network = builder->createNetworkV2(0U);
    nvinfer1::ITensor *data = network->addInput(Yolo::INPUT_BLOB_NAME, dt, nvinfer1::Dims3{3, Yolo::INPUT_H, Yolo::INPUT_W});
    assert(data);
    std::map weightMap = loadWeights(wts_name);
    nvinfer1::IPoolingLayer *conv0 = conv_bn_relu_maxpool(network, weightMap, *data, 32, "model.0.");
    std::cout << "Get_channel: " << Get_channel(116) << std::endl;
    nvinfer1::IShuffleLayer *conv1 = shuffle_block(network, weightMap, *conv0->getOutput(0), "model.1.", 32, Get_channel(116), 2);
    nvinfer1::IShuffleLayer *conv2_0 = shuffle_block(network, weightMap, *conv1->getOutput(0), "model.2.0.", Get_channel(116), Get_channel(116), 1);
    nvinfer1::IShuffleLayer *conv2_1 = shuffle_block(network, weightMap, *conv2_0->getOutput(0), "model.2.1.", Get_channel(116), Get_channel(116), 1);
    nvinfer1::IShuffleLayer *conv2_2 = shuffle_block(network, weightMap, *conv2_1->getOutput(0), "model.2.2.", Get_channel(116), Get_channel(116), 1);
    nvinfer1::IShuffleLayer *conv3 = shuffle_block(network, weightMap, *conv2_2->getOutput(0), "model.3.", Get_channel(116), Get_channel(232), 2);
    nvinfer1::IShuffleLayer *conv4_0 = shuffle_block(network, weightMap, *conv3->getOutput(0), "model.4.0.", Get_channel(232), Get_channel(232), 1);
    nvinfer1::IShuffleLayer *conv4_1 = shuffle_block(network, weightMap, *conv4_0->getOutput(0), "model.4.1.", Get_channel(232), Get_channel(232), 1);
    nvinfer1::IShuffleLayer *conv4_2 = shuffle_block(network, weightMap, *conv4_1->getOutput(0), "model.4.2.", Get_channel(232), Get_channel(232), 1);
    nvinfer1::IShuffleLayer *conv4_3 = shuffle_block(network, weightMap,
*conv4_2->getOutput(0), "model.4.3.", Get_channel(232), Get_channel(232), 1); nvinfer1::IShuffleLayer *conv4_4 = shuffle_block(network, weightMap, *conv4_3->getOutput(0), "model.4.4.", Get_channel(232), Get_channel(232), 1); nvinfer1::IShuffleLayer *conv4_5 = shuffle_block(network, weightMap, *conv4_4->getOutput(0), "model.4.5.", Get_channel(232), Get_channel(232), 1); nvinfer1::IShuffleLayer *conv4_6 = shuffle_block(network, weightMap, *conv4_5->getOutput(0), "model.4.6.", Get_channel(232), Get_channel(232), 1); nvinfer1::IShuffleLayer *conv5 = shuffle_block(network, weightMap, *conv4_6->getOutput(0), "model.5.", Get_channel(232), Get_channel(464), 2); nvinfer1::IShuffleLayer *conv6_0 = shuffle_block(network, weightMap, *conv5->getOutput(0), "model.6.0.", Get_channel(464), Get_channel(464), 1); nvinfer1::IShuffleLayer *conv6_1 = shuffle_block(network, weightMap, *conv6_0->getOutput(0), "model.6.1.", Get_channel(464), Get_channel(464), 1); nvinfer1::IShuffleLayer *conv6_2 = shuffle_block(network, weightMap, *conv6_1->getOutput(0), "model.6.2.", Get_channel(464), Get_channel(464), 1); // head float scale[] = {1.0, 2.0, 2.0}; nvinfer1::IElementWiseLayer *conv7 = convBlock(network, weightMap, *conv6_2->getOutput(0), Get_channel(128), 1, 1, 1, "model.7"); nvinfer1::IResizeLayer *upsample8 = network->addResize(*conv7->getOutput(0)); upsample8->setResizeMode(nvinfer1::ResizeMode::kNEAREST); upsample8->setScales(scale, 3); assert(upsample8); nvinfer1::ITensor *inputTensors9[] = {upsample8->getOutput(0), conv4_6->getOutput(0)}; // channels = 128 + 232 = 360 nvinfer1::IConcatenationLayer *cat9 = network->addConcatenation(inputTensors9, 2); // std::cout << "The c3 's n is " << get_depth(3, 1) << std::endl; nvinfer1::IElementWiseLayer *conv10 = C3(network, weightMap, *cat9->getOutput(0), 360, Get_channel(128), get_depth(1, 1), false, 1, 0.5, "model.10"); nvinfer1::IElementWiseLayer *conv11 = convBlock(network, weightMap, *conv10->getOutput(0), Get_channel(64), 1, 1, 1, 
"model.11"); nvinfer1::IResizeLayer *upsample12 = network->addResize(*conv11->getOutput(0)); upsample12->setResizeMode(nvinfer1::ResizeMode::kNEAREST); upsample12->setScales(scale, 3); assert(upsample12); nvinfer1::ITensor *inputTensors13[] = {upsample12->getOutput(0), conv2_2->getOutput(0)}; // 64 + 120 = 184 nvinfer1::IConcatenationLayer *cat13 = network->addConcatenation(inputTensors13, 2); nvinfer1::IElementWiseLayer *conv14 = C3(network, weightMap, *cat13->getOutput(0), 184, Get_channel(64), get_depth(1, 1), false, 1, 0.5, "model.14"); nvinfer1::IElementWiseLayer *conv15 = convBlock(network, weightMap, *conv14->getOutput(0), Get_channel(64), 3, 2, 1, "model.15"); nvinfer1::ITensor *inputTensors16[] = {conv15->getOutput(0), conv11->getOutput(0)}; // 64 + 64 = 128 nvinfer1::IConcatenationLayer *cat16 = network->addConcatenation(inputTensors16, 2); nvinfer1::IElementWiseLayer *conv17 = C3(network, weightMap, *cat16->getOutput(0), 128, Get_channel(128), get_depth(1, 1), false, 1, 0.5, "model.17"); nvinfer1::IElementWiseLayer *conv18 = convBlock(network, weightMap, *conv17->getOutput(0), Get_channel(128), 3, 2, 1, "model.18"); nvinfer1::ITensor *inputTensors19[] = {conv18->getOutput(0), conv7->getOutput(0)}; // 128 + 128 = 256 nvinfer1::IConcatenationLayer *cat19 = network->addConcatenation(inputTensors19, 2); nvinfer1::IElementWiseLayer *conv20 = C3(network, weightMap, *cat19->getOutput(0), 256, Get_channel(256), get_depth(1, 1), false, 1, 0.5, "model.20"); // detect nvinfer1::IConvolutionLayer *det0 = network->addConvolutionNd(*conv14->getOutput(0), 3 * (Yolo::CLASS_NUM + 5), nvinfer1::DimsHW{1, 1}, weightMap["model.21.m.0.weight"], weightMap["model.21.m.0.bias"]); nvinfer1::IConvolutionLayer *det1 = network->addConvolutionNd(*conv17->getOutput(0), 3 * (Yolo::CLASS_NUM + 5), nvinfer1::DimsHW{1, 1}, weightMap["model.21.m.1.weight"], weightMap["model.21.m.1.bias"]); nvinfer1::IConvolutionLayer *det2 = network->addConvolutionNd(*conv20->getOutput(0), 3 * 
(Yolo::CLASS_NUM + 5), nvinfer1::DimsHW{1, 1}, weightMap["model.21.m.2.weight"], weightMap["model.21.m.2.bias"]); auto yolo = addYoLoLayer(network, weightMap, "model.21", std::vector{det0, det1, det2}); yolo->getOutput(0)->setName(Yolo::OUTPUT_BLOB_NAME); network->markOutput(*yolo->getOutput(0)); // Engine config builder->setMaxBatchSize(maxBatchSize); config->setMaxWorkspaceSize(16 * (1 << 20)); // 16MB #if defined(USE_FP16) config->setFlag(BuilderFlag::kFP16); #elif defined(USE_INT8) std::cout << "Your platform support int8: " << (builder->platformHasFastInt8() ? "true" : "false") << std::endl; assert(builder->platformHasFastInt8()); config->setFlag(BuilderFlag::kINT8); std::string data_path = "tensorrtx-int8calib-data/coco_calib/"; //Int8EntropyCalibrator2* calibrator = new Int8EntropyCalibrator2(1, kInputW, kInputH, "./coco_calib/", "int8calib.table", kInputTensorName); Int8EntropyCalibrator2* calibrator = new Int8EntropyCalibrator2(1, Yolo::INPUT_W, Yolo::INPUT_H, data_path.c_str(), "int8calib.table", Yolo::INPUT_BLOB_NAME); config->setInt8Calibrator(calibrator); #endif std::cout << "Building engine, please wait for a while..." << std::endl; ICudaEngine* engine = builder->buildEngineWithConfig(*network, *config); std::cout << "Build engine successfully!" 
<< std::endl; // Don't need the network any more network->destroy(); // Release host memory for (auto& mem : weightMap) { free((void*)(mem.second.values)); } return engine; } void serialize_engine(unsigned int max_batchsize, std::string& wts_name, std::string& engine_name, std::string & used_model){ IBuilder* builder = createInferBuilder(gLogger); IBuilderConfig* config = builder->createBuilderConfig(); ICudaEngine *engine = nullptr; if(used_model == "g"){ engine = build_det_v5_lite_g(max_batchsize, builder, config, nvinfer1::DataType::kFLOAT, wts_name); }else if(used_model == "s"){ engine = build_det_v5_lite_s(max_batchsize, builder, config, nvinfer1::DataType::kFLOAT, wts_name); }else if(used_model == "c"){ engine = build_det_v5_lite_c(max_batchsize, builder, config, nvinfer1::DataType::kFLOAT, wts_name); } else{ engine = build_det_v5_lite_e(max_batchsize, builder, config, nvinfer1::DataType::kFLOAT, wts_name); } // Serialize the engine IHostMemory* serialized_engine = engine->serialize(); assert(serialized_engine != nullptr); // Save engine to file std::ofstream p(engine_name, std::ios::binary); if (!p) { std::cerr << "Could not open plan output file" << std::endl; // assert(false); } p.write(reinterpret_cast(serialized_engine->data()), serialized_engine->size()); // Close everything down engine->destroy(); config->destroy(); serialized_engine->destroy(); builder->destroy(); } void doInference(IExecutionContext& context, cudaStream_t& stream, void **buffers, float* input, float* output, int batchSize) { // DMA input batch data to device, infer on the batch asynchronously, and DMA output back to host CUDA_CHECK(cudaMemcpyAsync(buffers[0], input, batchSize * 3 * Yolo::INPUT_H * Yolo::INPUT_W * sizeof(float), cudaMemcpyHostToDevice, stream)); context.enqueue(batchSize, buffers, stream, nullptr); CUDA_CHECK(cudaMemcpyAsync(output, buffers[1], batchSize * OUTPUT_SIZE * sizeof(float), cudaMemcpyDeviceToHost, stream)); cudaStreamSynchronize(stream); } bool 
parse_args(int argc, char **argv, std::string & wts_name, std::string & engine_name, std::string & used_model, std::string & img_dir){ if(argc < 4 || argc > 6) return false; if(std::string(argv[1]) == "-s" && (argc == 5)){ wts_name = argv[2]; engine_name = argv[3]; used_model = argv[4]; }else if(std::string(argv[1]) == "-d" && argc == 4){ engine_name = std::string(argv[2]); img_dir = std::string(argv[3]); }else{ return false; } return true; } int main(int argc, char** argv) { cudaSetDevice(Yolo::DEVICE); std::string wts_name = ""; std::string engine_name = ""; std::string img_dir, used_model; if(!parse_args(argc, argv, wts_name, engine_name, used_model, img_dir)){ std::cerr << "arguments not right!" << std::endl; std::cerr << "./v5lite -s [.wts] [.engine] [s/e/g/c] // serialize modeo to the plan" << std::endl; std::cerr << "./v5lite -d [.engine] ../images // deserialize plan file and run inference" << std::endl; return -1; } if (!wts_name.empty()) { serialize_engine(Yolo::BATCH_SIZE, wts_name, engine_name, used_model); return 0; } // deserialize the .engine and run inference std::ifstream file(engine_name, std::ios::binary); if (!file.good()) { std::cerr << "read " << engine_name << " error!" << std::endl; return -1; } char *trtModelStream = nullptr; size_t size = 0; file.seekg(0, file.end); size = file.tellg(); file.seekg(0, file.beg); trtModelStream = new char[size]; assert(trtModelStream); file.read(trtModelStream, size); file.close(); std::vector file_names; if (read_files_in_dir(img_dir.c_str(), file_names) < 0) { std::cerr << "read_files_in_dir failed." 
<< std::endl; return -1; } // prepare input data --------------------------- static float data[Yolo::BATCH_SIZE * 3 * Yolo::INPUT_H * Yolo::INPUT_W]; //for (int i = 0; i < 3 * INPUT_H * INPUT_W; i++) // data[i] = 1.0; static float prob[Yolo::BATCH_SIZE * OUTPUT_SIZE]; IRuntime* runtime = createInferRuntime(gLogger); assert(runtime != nullptr); ICudaEngine* engine = runtime->deserializeCudaEngine(trtModelStream, size); assert(engine != nullptr); IExecutionContext* context = engine->createExecutionContext(); assert(context != nullptr); delete[] trtModelStream; assert(engine->getNbBindings() == 2); void* buffers[2]; // In order to bind the buffers, we need to know the names of the input and output tensors. // Note that indices are guaranteed to be less than IEngine::getNbBindings() const int inputIndex = engine->getBindingIndex(Yolo::INPUT_BLOB_NAME); const int outputIndex = engine->getBindingIndex(Yolo::OUTPUT_BLOB_NAME); assert(inputIndex == 0); assert(outputIndex == 1); // Create GPU buffers on device CUDA_CHECK(cudaMalloc(&buffers[inputIndex], Yolo::BATCH_SIZE * 3 * Yolo::INPUT_H * Yolo::INPUT_W * sizeof(float))); CUDA_CHECK(cudaMalloc(&buffers[outputIndex], Yolo::BATCH_SIZE * OUTPUT_SIZE * sizeof(float))); // Create stream cudaStream_t stream; CUDA_CHECK(cudaStreamCreate(&stream)); int fcount = 0; for (int f = 0; f < (int)file_names.size(); f++) { fcount++; if (fcount < Yolo::BATCH_SIZE && f + 1 != (int)file_names.size()) continue; for (int b = 0; b < fcount; b++) { cv::Mat img = cv::imread(img_dir + "/" + file_names[f - fcount + 1 + b]); if (img.empty()) continue; cv::Mat pr_img = preprocess_img(img, Yolo::INPUT_W, Yolo::INPUT_H); // letterbox BGR to RGB int i = 0; for (int row = 0; row < Yolo::INPUT_H; ++row) { uchar* uc_pixel = pr_img.data + row * pr_img.step; for (int col = 0; col < Yolo::INPUT_W; ++col) { data[b * 3 * Yolo::INPUT_H * Yolo::INPUT_W + i] = (float)uc_pixel[2] / 255.0; data[b * 3 * Yolo::INPUT_H * Yolo::INPUT_W + i + Yolo::INPUT_H * 
Yolo::INPUT_W] = (float)uc_pixel[1] / 255.0; data[b * 3 * Yolo::INPUT_H * Yolo::INPUT_W + i + 2 * Yolo::INPUT_H * Yolo::INPUT_W] = (float)uc_pixel[0] / 255.0; uc_pixel += 3; ++i; } } } // Run inference auto start = std::chrono::system_clock::now(); doInference(*context, stream, buffers, data, prob, Yolo::BATCH_SIZE); auto end = std::chrono::system_clock::now(); std::cout << std::chrono::duration_cast(end - start).count() << "ms" << std::endl; std::vector> batch_res(fcount); for (int b = 0; b < fcount; b++) { auto& res = batch_res[b]; nms(res, &prob[b * OUTPUT_SIZE], Yolo::CONF_THRESH, Yolo::NMS_THRESH); } for (int b = 0; b < fcount; b++) { auto& res = batch_res[b]; //std::cout << res.size() << std::endl; cv::Mat img = cv::imread(img_dir + "/" + file_names[f - fcount + 1 + b]); for (size_t j = 0; j < res.size(); j++) { cv::Rect r = get_rect(img, res[j].bbox); cv::rectangle(img, r, cv::Scalar(0x27, 0xC1, 0x36), 2); cv::putText(img, std::to_string((int)res[j].class_id), cv::Point(r.x, r.y - 1), cv::FONT_HERSHEY_PLAIN, 1.2, cv::Scalar(0xFF, 0xFF, 0xFF), 2); } cv::imwrite(file_names[f - fcount + 1 + b], img); } fcount = 0; } // Release stream and buffers cudaStreamDestroy(stream); CUDA_CHECK(cudaFree(buffers[inputIndex])); CUDA_CHECK(cudaFree(buffers[outputIndex])); // Destroy the engine context->destroy(); engine->destroy(); runtime->destroy(); // Print histogram of the output distribution // std::cout << "\nOutput:\n\n"; // for (unsigned int i = 0; i < OUTPUT_SIZE; i++) // { // std::cout << prob[i] << ", "; // if (i % 10 == 0) std::cout << std::endl; // } // std::cout << std::endl; return 0; } ================================================ FILE: yolov5-lite/yololayer.cu ================================================ #include #include #include #include "yololayer.h" #include "cuda_utils.h" namespace Tn { template void write(char*& buffer, const T& val) { *reinterpret_cast(buffer) = val; buffer += sizeof(T); } template void read(const char*& buffer, T& val) { val = 
*reinterpret_cast(buffer); buffer += sizeof(T); } } using namespace Yolo; namespace nvinfer1 { YoloLayerPlugin::YoloLayerPlugin(int classCount, int netWidth, int netHeight, int maxOut, const std::vector& vYoloKernel) { mClassCount = classCount; mYoloV5NetWidth = netWidth; mYoloV5NetHeight = netHeight; mMaxOutObject = maxOut; mYoloKernel = vYoloKernel; mKernelCount = vYoloKernel.size(); CUDA_CHECK(cudaMallocHost(&mAnchor, mKernelCount * sizeof(void*))); size_t AnchorLen = sizeof(float)* CHECK_COUNT * 2; for (int ii = 0; ii < mKernelCount; ii++) { CUDA_CHECK(cudaMalloc(&mAnchor[ii], AnchorLen)); const auto& yolo = mYoloKernel[ii]; CUDA_CHECK(cudaMemcpy(mAnchor[ii], yolo.anchors, AnchorLen, cudaMemcpyHostToDevice)); } } YoloLayerPlugin::~YoloLayerPlugin() { for (int ii = 0; ii < mKernelCount; ii++) { CUDA_CHECK(cudaFree(mAnchor[ii])); } CUDA_CHECK(cudaFreeHost(mAnchor)); } // create the plugin at runtime from a byte stream YoloLayerPlugin::YoloLayerPlugin(const void* data, size_t length) { using namespace Tn; const char *d = reinterpret_cast(data), *a = d; read(d, mClassCount); read(d, mThreadCount); read(d, mKernelCount); read(d, mYoloV5NetWidth); read(d, mYoloV5NetHeight); read(d, mMaxOutObject); mYoloKernel.resize(mKernelCount); auto kernelSize = mKernelCount * sizeof(YoloKernel); memcpy(mYoloKernel.data(), d, kernelSize); d += kernelSize; CUDA_CHECK(cudaMallocHost(&mAnchor, mKernelCount * sizeof(void*))); size_t AnchorLen = sizeof(float)* CHECK_COUNT * 2; for (int ii = 0; ii < mKernelCount; ii++) { CUDA_CHECK(cudaMalloc(&mAnchor[ii], AnchorLen)); const auto& yolo = mYoloKernel[ii]; CUDA_CHECK(cudaMemcpy(mAnchor[ii], yolo.anchors, AnchorLen, cudaMemcpyHostToDevice)); } assert(d == a + length); } void YoloLayerPlugin::serialize(void* buffer) const TRT_NOEXCEPT { using namespace Tn; char* d = static_cast(buffer), *a = d; write(d, mClassCount); write(d, mThreadCount); write(d, mKernelCount); write(d, mYoloV5NetWidth); write(d, mYoloV5NetHeight); write(d, 
mMaxOutObject); auto kernelSize = mKernelCount * sizeof(YoloKernel); memcpy(d, mYoloKernel.data(), kernelSize); d += kernelSize; assert(d == a + getSerializationSize()); } size_t YoloLayerPlugin::getSerializationSize() const TRT_NOEXCEPT { return sizeof(mClassCount) + sizeof(mThreadCount) + sizeof(mKernelCount) + sizeof(Yolo::YoloKernel) * mYoloKernel.size() + sizeof(mYoloV5NetWidth) + sizeof(mYoloV5NetHeight) + sizeof(mMaxOutObject); } int YoloLayerPlugin::initialize() TRT_NOEXCEPT { return 0; } Dims YoloLayerPlugin::getOutputDimensions(int index, const Dims* inputs, int nbInputDims) TRT_NOEXCEPT { //output the result to channel int totalsize = mMaxOutObject * sizeof(Detection) / sizeof(float); return Dims3(totalsize + 1, 1, 1); } // Set plugin namespace void YoloLayerPlugin::setPluginNamespace(const char* pluginNamespace) TRT_NOEXCEPT { mPluginNamespace = pluginNamespace; } const char* YoloLayerPlugin::getPluginNamespace() const TRT_NOEXCEPT { return mPluginNamespace; } // Return the DataType of the plugin output at the requested index DataType YoloLayerPlugin::getOutputDataType(int index, const nvinfer1::DataType* inputTypes, int nbInputs) const TRT_NOEXCEPT { return DataType::kFLOAT; } // Return true if output tensor is broadcast across a batch. bool YoloLayerPlugin::isOutputBroadcastAcrossBatch(int outputIndex, const bool* inputIsBroadcasted, int nbInputs) const TRT_NOEXCEPT { return false; } // Return true if plugin can use input that is broadcast across batch without replication. bool YoloLayerPlugin::canBroadcastInputAcrossBatch(int inputIndex) const TRT_NOEXCEPT { return false; } void YoloLayerPlugin::configurePlugin(const PluginTensorDesc* in, int nbInput, const PluginTensorDesc* out, int nbOutput) TRT_NOEXCEPT { } // Attach the plugin object to an execution context and grant the plugin the access to some context resource. 
void YoloLayerPlugin::attachToContext(cudnnContext* cudnnContext, cublasContext* cublasContext, IGpuAllocator* gpuAllocator) TRT_NOEXCEPT { } // Detach the plugin object from its execution context. void YoloLayerPlugin::detachFromContext() TRT_NOEXCEPT {} const char* YoloLayerPlugin::getPluginType() const TRT_NOEXCEPT { return "YoloLayer_TRT"; } const char* YoloLayerPlugin::getPluginVersion() const TRT_NOEXCEPT { return "1"; } void YoloLayerPlugin::destroy() TRT_NOEXCEPT { delete this; } // Clone the plugin IPluginV2IOExt* YoloLayerPlugin::clone() const TRT_NOEXCEPT { YoloLayerPlugin* p = new YoloLayerPlugin(mClassCount, mYoloV5NetWidth, mYoloV5NetHeight, mMaxOutObject, mYoloKernel); p->setPluginNamespace(mPluginNamespace); return p; } __device__ float Logist(float data) { return 1.0f / (1.0f + expf(-data)); }; __global__ void CalDetection(const float *input, float *output, int noElements, const int netwidth, const int netheight, int maxoutobject, int yoloWidth, int yoloHeight, const float anchors[CHECK_COUNT * 2], int classes, int outputElem) { int idx = threadIdx.x + blockDim.x * blockIdx.x; if (idx >= noElements) return; int total_grid = yoloWidth * yoloHeight; int bnIdx = idx / total_grid; idx = idx - total_grid * bnIdx; int info_len_i = 5 + classes; const float* curInput = input + bnIdx * (info_len_i * total_grid * CHECK_COUNT); for (int k = 0; k < CHECK_COUNT; ++k) { float box_prob = Logist(curInput[idx + k * info_len_i * total_grid + 4 * total_grid]); if (box_prob < IGNORE_THRESH) continue; int class_id = 0; float max_cls_prob = 0.0; for (int i = 5; i < info_len_i; ++i) { float p = Logist(curInput[idx + k * info_len_i * total_grid + i * total_grid]); if (p > max_cls_prob) { max_cls_prob = p; class_id = i - 5; } } float *res_count = output + bnIdx * outputElem; int count = (int)atomicAdd(res_count, 1); if (count >= maxoutobject) return; char *data = (char*)res_count + sizeof(float) + count * sizeof(Detection); Detection *det = (Detection*)(data); int row = 
// Launch the CalDetection kernel once per YOLO output scale.
//
// inputs    : one device pointer per entry of mYoloKernel (inputs[i] is passed
//             to the kernel together with mYoloKernel[i])
// output    : device buffer; each image owns `outputElem` floats, a detection
//             counter followed by room for mMaxOutObject Detection records
// stream    : CUDA stream all work is queued on
// batchSize : number of images in this call
void YoloLayerPlugin::forwardGpu(const float* const* inputs, float* output, cudaStream_t stream, int batchSize) {
    int outputElem = 1 + mMaxOutObject * sizeof(Detection) / sizeof(float);
    // Only the leading counter of every image is cleared; the kernel reserves
    // record slots by atomically incrementing it.
    for (int idx = 0; idx < batchSize; ++idx) {
        CUDA_CHECK(cudaMemsetAsync(output + idx * outputElem, 0, sizeof(float), stream));
    }

    for (unsigned int i = 0; i < mYoloKernel.size(); ++i) {
        const auto& yolo = mYoloKernel[i];
        const int numElem = yolo.width * yolo.height * batchSize;
        // Block size for this launch only. mThreadCount is part of the
        // serialized plugin state, so it must not be overwritten here: doing
        // so would shrink every later launch and leak into the saved plan.
        const int threads = numElem < mThreadCount ? numElem : mThreadCount;
        if (threads <= 0) continue;  // nothing to launch; also avoids dividing by zero below
        const int blocks = (numElem + threads - 1) / threads;
        CalDetection<<<blocks, threads, 0, stream>>>(inputs[i], output, numElem, mYoloV5NetWidth, mYoloV5NetHeight,
                                                     mMaxOutObject, yolo.width, yolo.height, (float*)mAnchor[i],
                                                     mClassCount, outputElem);
    }
}
// Create a YoloLayerPlugin from a plugin field collection.
//
// `fc` must hold exactly two fields, in this order:
//   "netinfo" : four ints - class count, input width, input height and the
//               maximum number of output objects
//   "kernels" : an array of Yolo::YoloKernel; the field's `length` is the
//               number of array elements
//
// Returns the new plugin with this creator's namespace applied, or nullptr
// when `fc` does not have that layout. The checks are real run-time checks
// because assert() disappears in NDEBUG builds.
IPluginV2IOExt* YoloPluginCreator::createPlugin(const char* name, const PluginFieldCollection* fc) TRT_NOEXCEPT {
    if (fc == nullptr || fc->nbFields != 2 || fc->fields == nullptr) {
        return nullptr;
    }
    const PluginField& netinfo = fc->fields[0];
    const PluginField& kernel_field = fc->fields[1];
    if (netinfo.name == nullptr || strcmp(netinfo.name, "netinfo") != 0 || netinfo.data == nullptr) {
        return nullptr;
    }
    if (kernel_field.name == nullptr || strcmp(kernel_field.name, "kernels") != 0 || kernel_field.length < 0) {
        return nullptr;
    }
    if (kernel_field.length > 0 && kernel_field.data == nullptr) {
        return nullptr;
    }

    const int* p_netinfo = static_cast<const int*>(netinfo.data);
    int class_count = p_netinfo[0];
    int input_w = p_netinfo[1];
    int input_h = p_netinfo[2];
    int max_output_object_count = p_netinfo[3];

    std::vector<Yolo::YoloKernel> kernels(kernel_field.length);
    if (!kernels.empty()) {
        // data() instead of &kernels[0]: indexing an empty vector is undefined.
        memcpy(kernels.data(), kernel_field.data, kernels.size() * sizeof(Yolo::YoloKernel));
    }

    YoloLayerPlugin* obj = new YoloLayerPlugin(class_count, input_w, input_h, max_output_object_count, kernels);
    obj->setPluginNamespace(mNamespace.c_str());
    return obj;
}
""" import ctypes import os import shutil import random import sys import threading import time import cv2 import numpy as np import pycuda.autoinit import pycuda.driver as cuda import tensorrt as trt CONF_THRESH = 0.5 IOU_THRESHOLD = 0.4 # categories = ['faster'] categories = ["person", "bicycle", "car", "motorcycle", "airplane", "bus", "train", "truck", "boat", "traffic light", "fire hydrant", "stop sign", "parking meter", "bench", "bird", "cat", "dog", "horse", "sheep", "cow", "elephant", "bear", "zebra", "giraffe", "backpack", "umbrella", "handbag", "tie", "suitcase", "frisbee", "skis", "snowboard", "sports ball", "kite", "baseball bat", "baseball glove", "skateboard", "surfboard", "tennis racket", "bottle", "wine glass", "cup", "fork", "knife", "spoon", "bowl", "banana", "apple", "sandwich", "orange", "broccoli", "carrot", "hot dog", "pizza", "donut", "cake", "chair", "couch", "potted plant", "bed", "dining table", "toilet", "tv", "laptop", "mouse", "remote", "keyboard", "cell phone", "microwave", "oven", "toaster", "sink", "refrigerator", "book", "clock", "vase", "scissors", "teddy bear", "hair drier", "toothbrush"] def get_img_path_batches(batch_size, img_dir): ret = [] batch = [] for root, dirs, files in os.walk(img_dir): for name in files: if len(batch) == batch_size: ret.append(batch) batch = [] batch.append(os.path.join(root, name)) if len(batch) > 0: ret.append(batch) return ret def plot_one_box(x, img, color=None, label=None, line_thickness=None): """ description: Plots one bounding box on image img, this function comes from YoLov5 project. 
param: x: a box likes [x1,y1,x2,y2] img: a opencv image object color: color to draw rectangle, such as (0,255,0) label: str line_thickness: int return: no return """ tl = ( line_thickness or round(0.002 * (img.shape[0] + img.shape[1]) / 2) + 1 ) # line/font thickness color = color or [random.randint(0, 255) for _ in range(3)] c1, c2 = (int(x[0]), int(x[1])), (int(x[2]), int(x[3])) cv2.rectangle(img, c1, c2, color, thickness=tl, lineType=cv2.LINE_AA) if label: tf = max(tl - 1, 1) # font thickness t_size = cv2.getTextSize(label, 0, fontScale=tl / 3, thickness=tf)[0] c2 = c1[0] + t_size[0], c1[1] - t_size[1] - 3 cv2.rectangle(img, c1, c2, color, -1, cv2.LINE_AA) # filled cv2.putText( img, label, (c1[0], c1[1] - 2), 0, tl / 3, [225, 255, 255], thickness=tf, lineType=cv2.LINE_AA, ) class YoLov5TRT(object): """ description: A YOLOv5 class that warps TensorRT ops, preprocess and postprocess ops. """ def __init__(self, engine_file_path): # Create a Context on this device, self.ctx = cuda.Device(0).make_context() stream = cuda.Stream() TRT_LOGGER = trt.Logger(trt.Logger.INFO) runtime = trt.Runtime(TRT_LOGGER) # Deserialize the engine from file with open(engine_file_path, "rb") as f: engine = runtime.deserialize_cuda_engine(f.read()) context = engine.create_execution_context() host_inputs = [] cuda_inputs = [] host_outputs = [] cuda_outputs = [] bindings = [] for binding in engine: print('bingding:', binding, engine.get_binding_shape(binding)) size = trt.volume(engine.get_binding_shape(binding)) * engine.max_batch_size dtype = trt.nptype(engine.get_binding_dtype(binding)) # Allocate host and device buffers host_mem = cuda.pagelocked_empty(size, dtype) cuda_mem = cuda.mem_alloc(host_mem.nbytes) # Append the device buffer to device bindings. bindings.append(int(cuda_mem)) # Append to the appropriate list. 
def infer(self, raw_image_generator):
    """
    description: Run one batch of images through the engine and draw the
                 detections on the original images.
    param:
        raw_image_generator: iterable of BGR images (numpy arrays); it may
                             yield fewer than self.batch_size images, e.g.
                             for the last batch of a folder
    return:
        batch_image_raw: list of the input images with boxes drawn on them
        use_time: seconds spent on the host->device copy, the inference
                  and the device->host copy
    """
    # Make self the active context, pushing it on top of the context stack.
    self.ctx.push()
    try:
        # Do image preprocess
        batch_image_raw = []
        batch_origin_h = []
        batch_origin_w = []
        # Zero-filled so that unused slots of a partial batch hold defined data.
        batch_input_image = np.zeros(shape=[self.batch_size, 3, self.input_h, self.input_w])
        for i, image_raw in enumerate(raw_image_generator):
            input_image, image_raw, origin_h, origin_w = self.preprocess_image(image_raw)
            batch_image_raw.append(image_raw)
            batch_origin_h.append(origin_h)
            batch_origin_w.append(origin_w)
            np.copyto(batch_input_image[i], input_image)
        batch_input_image = np.ascontiguousarray(batch_input_image)

        # Copy input image to host buffer
        np.copyto(self.host_inputs[0], batch_input_image.ravel())
        start = time.time()
        # Transfer input data to the GPU.
        cuda.memcpy_htod_async(self.cuda_inputs[0], self.host_inputs[0], self.stream)
        # Run inference.
        self.context.execute_async(
            batch_size=self.batch_size, bindings=self.bindings, stream_handle=self.stream.handle
        )
        # Transfer predictions back from the GPU.
        cuda.memcpy_dtoh_async(self.host_outputs[0], self.cuda_outputs[0], self.stream)
        # Synchronize the stream
        self.stream.synchronize()
        end = time.time()
    finally:
        # Remove the context from the top of the context stack even when
        # preprocessing or inference raised, so the stack stays balanced.
        self.ctx.pop()

    output = self.host_outputs[0]
    # The host output buffer holds batch_size equally sized per-image rows
    # (it is allocated as binding volume * max batch size in __init__).
    output_len = output.size // self.batch_size
    # Post-process only the images that were actually supplied.
    for i, image in enumerate(batch_image_raw):
        result_boxes, result_scores, result_classid = self.post_process(
            output[i * output_len: (i + 1) * output_len], batch_origin_h[i], batch_origin_w[i]
        )
        # Draw rectangles and labels on the original image
        for box, score, class_id in zip(result_boxes, result_scores, result_classid):
            plot_one_box(
                box,
                image,
                label="{}:{:.2f}".format(categories[int(class_id)], score),
            )
    return batch_image_raw, end - start
param: input_image_path: str, image path return: image: the processed image image_raw: the original image h: original height w: original width """ image_raw = raw_bgr_image h, w, c = image_raw.shape image = cv2.cvtColor(image_raw, cv2.COLOR_BGR2RGB) # Calculate widht and height and paddings r_w = self.input_w / w r_h = self.input_h / h if r_h > r_w: tw = self.input_w th = int(r_w * h) tx1 = tx2 = 0 ty1 = int((self.input_h - th) / 2) ty2 = self.input_h - th - ty1 else: tw = int(r_h * w) th = self.input_h tx1 = int((self.input_w - tw) / 2) tx2 = self.input_w - tw - tx1 ty1 = ty2 = 0 # Resize the image with long side while maintaining ratio image = cv2.resize(image, (tw, th)) # Pad the short side with (128,128,128) image = cv2.copyMakeBorder( image, ty1, ty2, tx1, tx2, cv2.BORDER_CONSTANT, (128, 128, 128) ) image = image.astype(np.float32) # Normalize to [0,1] image /= 255.0 # HWC to CHW format: image = np.transpose(image, [2, 0, 1]) # CHW to NCHW format image = np.expand_dims(image, axis=0) # Convert the image to row-major order, also known as "C order": image = np.ascontiguousarray(image) return image, image_raw, h, w def xywh2xyxy(self, origin_h, origin_w, x): """ description: Convert nx4 boxes from [x, y, w, h] to [x1, y1, x2, y2] where xy1=top-left, xy2=bottom-right param: origin_h: height of original image origin_w: width of original image x: A boxes numpy, each row is a box [center_x, center_y, w, h] return: y: A boxes numpy, each row is a box [x1, y1, x2, y2] """ y = np.zeros_like(x) r_w = self.input_w / origin_w r_h = self.input_h / origin_h if r_h > r_w: y[:, 0] = x[:, 0] - x[:, 2] / 2 y[:, 2] = x[:, 0] + x[:, 2] / 2 y[:, 1] = x[:, 1] - x[:, 3] / 2 - (self.input_h - r_w * origin_h) / 2 y[:, 3] = x[:, 1] + x[:, 3] / 2 - (self.input_h - r_w * origin_h) / 2 y /= r_w else: y[:, 0] = x[:, 0] - x[:, 2] / 2 - (self.input_w - r_h * origin_w) / 2 y[:, 2] = x[:, 0] + x[:, 2] / 2 - (self.input_w - r_h * origin_w) / 2 y[:, 1] = x[:, 1] - x[:, 3] / 2 y[:, 3] = x[:, 1] + 
x[:, 3] / 2 y /= r_h return y def post_process(self, output, origin_h, origin_w): """ description: postprocess the prediction param: output: A numpy likes [num_boxes,cx,cy,w,h,conf,cls_id, cx,cy,w,h,conf,cls_id, ...] origin_h: height of original image origin_w: width of original image return: result_boxes: finally boxes, a boxes numpy, each row is a box [x1, y1, x2, y2] result_scores: finally scores, a numpy, each element is the score correspoing to box result_classid: finally classid, a numpy, each element is the classid correspoing to box """ # Get the num of boxes detected num = int(output[0]) # Reshape to a two dimentional ndarray pred = np.reshape(output[1:], (-1, 6))[:num, :] # Do nms boxes = self.non_max_suppression(pred, origin_h, origin_w, conf_thres=CONF_THRESH, nms_thres=IOU_THRESHOLD) result_boxes = boxes[:, :4] if len(boxes) else np.array([]) result_scores = boxes[:, 4] if len(boxes) else np.array([]) result_classid = boxes[:, 5] if len(boxes) else np.array([]) print("The lengh of result_boxes is ", len(result_boxes)) return result_boxes, result_scores, result_classid def bbox_iou(self, box1, box2, x1y1x2y2=True): """ description: compute the IoU of two bounding boxes param: box1: A box coordinate (can be (x1, y1, x2, y2) or (x, y, w, h)) box2: A box coordinate (can be (x1, y1, x2, y2) or (x, y, w, h)) x1y1x2y2: select the coordinate format return: iou: computed iou """ if not x1y1x2y2: # Transform from center and width to exact coordinates b1_x1, b1_x2 = box1[:, 0] - box1[:, 2] / 2, box1[:, 0] + box1[:, 2] / 2 b1_y1, b1_y2 = box1[:, 1] - box1[:, 3] / 2, box1[:, 1] + box1[:, 3] / 2 b2_x1, b2_x2 = box2[:, 0] - box2[:, 2] / 2, box2[:, 0] + box2[:, 2] / 2 b2_y1, b2_y2 = box2[:, 1] - box2[:, 3] / 2, box2[:, 1] + box2[:, 3] / 2 else: # Get the coordinates of bounding boxes b1_x1, b1_y1, b1_x2, b1_y2 = box1[:, 0], box1[:, 1], box1[:, 2], box1[:, 3] b2_x1, b2_y1, b2_x2, b2_y2 = box2[:, 0], box2[:, 1], box2[:, 2], box2[:, 3] # Get the coordinates of the 
def non_max_suppression(self, prediction, origin_h, origin_w, conf_thres=0.5, nms_thres=0.4):
    """
    description:    Drop detections whose confidence is below 'conf_thres', convert the
                    survivors to corner boxes in original-image coordinates, clip them
                    to the image, and run per-class greedy Non-Maximum Suppression.
    param:
        prediction: detections, one row per box; the first four columns are passed
                    to self.xywh2xyxy (center_x, center_y, w, h), column 4 is the
                    confidence and the last column is the class id
        origin_h:   original image height
        origin_w:   original image width
        conf_thres: a confidence threshold to filter detections
        nms_thres:  a iou threshold to filter detections
    return:
        boxes:      output after nms, rows of (x1, y1, x2, y2, conf, cls_id) ordered by
                    descending confidence; an empty 1-D array when nothing is kept
    """
    # Boolean-mask indexing yields a copy, so the caller's array is not modified below.
    candidates = prediction[prediction[:, 4] >= conf_thres]
    candidates[:, :4] = self.xywh2xyxy(origin_h, origin_w, candidates[:, :4])

    # Clamp every corner to the valid pixel range of the original image.
    max_x = origin_w - 1
    max_y = origin_h - 1
    for column, upper in ((0, max_x), (2, max_x), (1, max_y), (3, max_y)):
        candidates[:, column] = np.clip(candidates[:, column], 0, upper)

    # Highest confidence first.
    order = np.argsort(-candidates[:, 4])
    candidates = candidates[order]

    survivors = []
    while candidates.shape[0]:
        best = candidates[0]
        overlaps = self.bbox_iou(np.expand_dims(best[:4], 0), candidates[:, :4])
        # A box is suppressed when it overlaps the current best box too much AND
        # carries the same class id (the best box always matches itself).
        suppressed = (overlaps > nms_thres) & (best[-1] == candidates[:, -1])
        survivors.append(best)
        candidates = candidates[~suppressed]

    if not survivors:
        return np.array([])
    return np.stack(survivors, 0)
cmake_minimum_required(VERSION 3.10)

project(yolov7)

add_definitions(-std=c++11)
# API_EXPORTS switches the API macro in include/macros.h to "export" mode.
add_definitions(-DAPI_EXPORTS)
set(CMAKE_CXX_STANDARD 11)

# Default to a Debug build, but respect a build type chosen on the command line
# (-DCMAKE_BUILD_TYPE=Release) or a multi-config generator.
if(NOT CMAKE_BUILD_TYPE AND NOT CMAKE_CONFIGURATION_TYPES)
  set(CMAKE_BUILD_TYPE Debug)
endif()

# Use the conventional nvcc location unless the user already picked a compiler.
if(NOT DEFINED CMAKE_CUDA_COMPILER)
  set(CMAKE_CUDA_COMPILER /usr/local/cuda/bin/nvcc)
endif()
enable_language(CUDA)

include_directories(${PROJECT_SOURCE_DIR}/include)
include_directories(${PROJECT_SOURCE_DIR}/plugin)

# include and link dirs of cuda and tensorrt, you need adapt them if yours are different
if (CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64")
  message("embed_platform on")
  include_directories(/usr/local/cuda/targets/aarch64-linux/include)
  link_directories(/usr/local/cuda/targets/aarch64-linux/lib)
else()
  message("embed_platform off")
  # cuda
  include_directories(/usr/local/cuda/include)
  link_directories(/usr/local/cuda/lib64)

  # tensorrt
  include_directories(/home/nvidia/TensorRT-8.2.5.1/include)
  link_directories(/home/nvidia/TensorRT-8.2.5.1/lib)
endif()

# The YOLO layer plugin is built as a shared library so it can also be loaded
# from Python (libmyplugins.so).
add_library(myplugins SHARED ${PROJECT_SOURCE_DIR}/plugin/yololayer.cu)
target_link_libraries(myplugins nvinfer cudart)

# Fail at configure time, with a clear message, when OpenCV is missing.
find_package(OpenCV REQUIRED)
include_directories(${OpenCV_INCLUDE_DIRS})

file(GLOB_RECURSE SRCS ${PROJECT_SOURCE_DIR}/src/*.cpp ${PROJECT_SOURCE_DIR}/src/*.cu)
add_executable(yolov7 main.cpp ${SRCS})

target_link_libraries(yolov7 nvinfer)
target_link_libraries(yolov7 cudart)
target_link_libraries(yolov7 myplugins)
target_link_libraries(yolov7 ${OpenCV_LIBS})
// For example, yolov7
sudo ./yolov7 -s yolov7.wts yolov7.engine v7
sudo ./yolov7 -d yolov7.engine ../images
```

3. Check the generated images, e.g. _zidane.jpg and _bus.jpg

4. Optional: load and run the TensorRT model in Python

```
// install python-tensorrt, pycuda, etc.
// ensure the yolov7.engine and libmyplugins.so have been built
python yolov7_trt.py
```

# INT8 Quantization

1. Prepare calibration images. You can randomly select about 1000 images from your training set. For COCO, you can also download my calibration images `coco_calib` from [GoogleDrive](https://drive.google.com/drive/folders/1s7jE9DtOngZMzJC1uL307J2MiaGwdRSI?usp=sharing) or [BaiduPan](https://pan.baidu.com/s/1GOm_-JobpyLMAqZWCDUhKg) (pwd: a9wh)

2. Unzip it in yolov7/build

3. Set the macro `USE_INT8` in config.h and run make

4. Serialize the model and test

import sys  # noqa: F401
import argparse
import os
import struct
import torch
from utils.torch_utils import select_device


def parse_args():
    """Parse the command line and return ``(weights_path, output_path)``.

    ``-w/--weights`` is required and must be an existing file. When
    ``-o/--output`` is omitted the output path is the weights path with a
    ``.wts`` extension; when it names a directory, the ``.wts`` file is placed
    inside that directory under the weights file's base name.
    """
    parser = argparse.ArgumentParser(description='Convert .pt file to .wts')
    parser.add_argument('-w', '--weights', required=True,
                        help='Input weights (.pt) file path (required)')
    parser.add_argument(
        '-o', '--output', help='Output (.wts) file path (optional)')
    args = parser.parse_args()
    if not os.path.isfile(args.weights):
        raise SystemExit('Invalid input file')
    if not args.output:
        args.output = os.path.splitext(args.weights)[0] + '.wts'
    elif os.path.isdir(args.output):
        args.output = os.path.join(
            args.output,
            os.path.splitext(os.path.basename(args.weights))[0] + '.wts')
    return args.weights, args.output


pt_file, wts_file = parse_args()

# Initialize
device = select_device('cpu')
# Load model
# SECURITY NOTE: weights_only=False unpickles arbitrary Python objects from the
# checkpoint, which can execute code. Only convert .pt files from a trusted source.
model = torch.load(pt_file, map_location=device, weights_only=False)
# Load FP32 weights (prefer the EMA copy when the checkpoint has one)
model = model['ema' if model.get('ema') else 'model'].float()

# update anchor_grid info
anchor_grid = model.model[-1].anchors * model.model[-1].stride[..., None, None]
# model.model[-1].anchor_grid = anchor_grid
delattr(model.model[-1], 'anchor_grid')  # model.model[-1] is detect layer
# Re-register anchor_grid through "register_buffer" so that it becomes part of
# the state dict (an OrderedDict) and is therefore written to the weight file.
model.model[-1].register_buffer("anchor_grid", anchor_grid)

model.to(device).eval()

with open(wts_file, 'w') as f:
    state = model.state_dict()
    # First line: number of tensors that follow.
    f.write('{}\n'.format(len(state)))
    for k, v in state.items():
        vr = v.reshape(-1).cpu().numpy()
        # One line per tensor: "<name> <count> " followed by " <hex>" for each
        # value, where <hex> is the big-endian IEEE-754 float32 encoding.
        f.write('{} {} '.format(k, len(vr)))
        # Emit the whole tensor with a single write instead of two per value.
        f.write(''.join(' ' + struct.pack('>f', float(vv)).hex() for vv in vr))
        f.write('\n')
#pragma once

/* --------------------------------------------------------
 * These configs are related to tensorrt model, if these are changed,
 * please re-compile and re-serialize the tensorrt model.
 * --------------------------------------------------------*/

// Precision selection. For INT8 you need to prepare a calibration dataset first,
// please refer to
// https://github.com/wang-xinyu/tensorrtx/tree/master/yolov7#int8-quantization
#define USE_FP16  // set USE_INT8 or USE_FP16 or USE_FP32

// These are used to define input/output tensor names,
// you can set them to whatever you want.
const static char* kInputTensorName = "data";
const static char* kOutputTensorName = "prob";

// Number of classes; update it when the model is trained on a custom dataset.
const static int kNumClass = 80;
// NOTE(review): presumably the batch size the engine is built for -- confirm in model.cpp.
const static int kBatchSize = 1;

// Yolo's input width and height must be divisible by 32
const static int kInputH = 640;
const static int kInputW = 640;

// Maximum number of output bounding boxes from yololayer plugin.
// That is maximum number of output bounding boxes before NMS.
const static int kMaxNumOutputBbox = 1000;

// NOTE(review): presumably the number of anchors per detection scale -- confirm in the plugin.
const static int kNumAnchor = 3;

// The bboxes whose confidence is lower than kIgnoreThresh will be ignored in yololayer plugin.
const static float kIgnoreThresh = 0.1f;

/* --------------------------------------------------------
 * These configs are not related to tensorrt model, if these are changed,
 * please re-compile, but no need to re-serialize the tensorrt model.
 * --------------------------------------------------------*/

// NMS overlapping thresh and final detection confidence thresh
const static float kNmsThresh = 0.45f;
const static float kConfThresh = 0.5f;

// NOTE(review): presumably the CUDA device index used at runtime -- confirm in main.cpp.
const static int kGpuId = 0;

// Upper bound on input image size, in pixels (width * height).
// If your image size is larger than 4096 * 3112, please increase this value
const static int kMaxInputImageSize = 4096 * 3112;
//!
//! \class LogStreamConsumerBuffer
//! \brief String buffer that, when synchronized, writes its contents to a target
//!        stream as one message prefixed with a timestamp and a severity prefix.
//!
class LogStreamConsumerBuffer : public std::stringbuf
{
public:
    //! \param stream    Target stream that receives the formatted messages.
    //! \param prefix    Text inserted between the timestamp and the message.
    //! \param shouldLog When false, putOutput() emits nothing.
    LogStreamConsumerBuffer(std::ostream& stream, const std::string& prefix, bool shouldLog)
        : mOutput(stream)
        , mPrefix(prefix)
        , mShouldLog(shouldLog)
    {
    }

    //! Carries over the target stream, prefix and logging flag of \p other.
    //! (Previously only the stream was taken, leaving the prefix empty and the
    //! flag uninitialized.) Text already buffered in \p other is not transferred.
    LogStreamConsumerBuffer(LogStreamConsumerBuffer&& other)
        : mOutput(other.mOutput)
        , mPrefix(other.mPrefix)
        , mShouldLog(other.mShouldLog)
    {
    }

    ~LogStreamConsumerBuffer()
    {
        // std::streambuf::pbase() gives a pointer to the beginning of the buffered part of the output sequence
        // std::streambuf::pptr() gives a pointer to the current position of the output sequence
        // if the pointer to the beginning is not equal to the pointer to the current position,
        // call putOutput() to log the output to the stream
        if (pbase() != pptr())
        {
            putOutput();
        }
    }

    // synchronizes the stream buffer and returns 0 on success
    // synchronizing the stream buffer consists of inserting the buffer contents into the stream,
    // resetting the buffer and flushing the stream
    virtual int sync()
    {
        putOutput();
        return 0;
    }

    //! Writes "[MM/DD/YYYY-HH:MM:SS] <prefix><buffer>" to the target stream,
    //! then empties the buffer and flushes the stream. No-op when logging is off.
    void putOutput()
    {
        if (mShouldLog)
        {
            // Format the timestamp in a scratch stream so that the fill/width
            // manipulators do not leak onto the target stream.
            std::time_t timestamp = std::time(nullptr);
            std::tm* tm_local = std::localtime(&timestamp);
            std::ostringstream stamp;
            stamp << "[" << std::setfill('0');
            stamp << std::setw(2) << (1 + tm_local->tm_mon) << "/";
            stamp << std::setw(2) << tm_local->tm_mday << "/";
            stamp << std::setw(4) << (1900 + tm_local->tm_year) << "-";
            stamp << std::setw(2) << tm_local->tm_hour << ":";
            stamp << std::setw(2) << tm_local->tm_min << ":";
            stamp << std::setw(2) << tm_local->tm_sec << "] ";
            // std::stringbuf::str() gets the string contents of the buffer.
            // The timestamp goes to the same stream as the message, so it is
            // not split between std::cout and another target.
            mOutput << stamp.str() << mPrefix << str();
            // set the buffer to empty
            str("");
            // flush the stream
            mOutput.flush();
        }
    }

    void setShouldLog(bool shouldLog)
    {
        mShouldLog = shouldLog;
    }

private:
    std::ostream& mOutput;  // target stream (not owned)
    std::string mPrefix;    // severity prefix, e.g. "[W] "
    bool mShouldLog;        // whether putOutput() emits anything
};
LogStreamConsumer(Severity reportableSeverity, Severity severity) : LogStreamConsumerBase(severityOstream(severity), severityPrefix(severity), severity <= reportableSeverity) , std::ostream(&mBuffer) // links the stream buffer with the stream , mShouldLog(severity <= reportableSeverity) , mSeverity(severity) { } LogStreamConsumer(LogStreamConsumer&& other) : LogStreamConsumerBase(severityOstream(other.mSeverity), severityPrefix(other.mSeverity), other.mShouldLog) , std::ostream(&mBuffer) // links the stream buffer with the stream , mShouldLog(other.mShouldLog) , mSeverity(other.mSeverity) { } void setReportableSeverity(Severity reportableSeverity) { mShouldLog = mSeverity <= reportableSeverity; mBuffer.setShouldLog(mShouldLog); } private: static std::ostream& severityOstream(Severity severity) { return severity >= Severity::kINFO ? std::cout : std::cerr; } static std::string severityPrefix(Severity severity) { switch (severity) { case Severity::kINTERNAL_ERROR: return "[F] "; case Severity::kERROR: return "[E] "; case Severity::kWARNING: return "[W] "; case Severity::kINFO: return "[I] "; case Severity::kVERBOSE: return "[V] "; default: assert(0); return ""; } } bool mShouldLog; Severity mSeverity; }; //! \class Logger //! //! \brief Class which manages logging of TensorRT tools and samples //! //! \details This class provides a common interface for TensorRT tools and samples to log information to the console, //! and supports logging two types of messages: //! //! - Debugging messages with an associated severity (info, warning, error, or internal error/fatal) //! - Test pass/fail messages //! //! The advantage of having all samples use this class for logging as opposed to emitting directly to stdout/stderr is //! that the logic for controlling the verbosity and formatting of sample output is centralized in one location. //! //! In the future, this class could be extended to support dumping test results to a file in some standard format //! 
//! Console logger: implements nvinfer1::ILogger::log() by forwarding to a
//! LogStreamConsumer, and offers static helpers that print test status lines.
class Logger : public nvinfer1::ILogger
{
public:
    //! \param severity Initial reportable severity (defaults to kWARNING).
    Logger(Severity severity = Severity::kWARNING)
        : mReportableSeverity(severity)
    {
    }

    //!
    //! \enum TestResult
    //! \brief Represents the state of a given test
    //!
    enum class TestResult
    {
        kRUNNING, //!< The test is running
        kPASSED,  //!< The test passed
        kFAILED,  //!< The test failed
        kWAIVED   //!< The test was waived
    };

    //!
    //! \brief Forward-compatible method for retrieving the nvinfer::ILogger associated with this Logger
    //! \return The nvinfer1::ILogger associated with this Logger
    //!
    //! TODO Once all samples are updated to use this method to register the logger with TensorRT,
    //! we can eliminate the inheritance of Logger from ILogger
    //!
    nvinfer1::ILogger& getTRTLogger()
    {
        return *this;
    }

    //!
    //! \brief Implementation of the nvinfer1::ILogger::log() virtual method
    //!
    //! Note samples should not be calling this function directly; it will eventually go away once we eliminate the
    //! inheritance from nvinfer1::ILogger
    //!
    void log(Severity severity, const char* msg) TRT_NOEXCEPT override
    {
        // The temporary consumer decides from (mReportableSeverity, severity)
        // whether the message is emitted; every message is tagged "[TRT] ".
        LogStreamConsumer(mReportableSeverity, severity) << "[TRT] " << std::string(msg) << std::endl;
    }

    //!
    //! \brief Method for controlling the verbosity of logging output
    //!
    //! \param severity The logger will only emit messages that have severity of this level or higher.
    //!
    void setReportableSeverity(Severity severity)
    {
        mReportableSeverity = severity;
    }

    //!
    //! \brief Opaque handle that holds logging information for a particular test
    //!
    //! This object is an opaque handle to information used by the Logger to print test results.
    //! The sample must call Logger::defineTest() in order to obtain a TestAtom that can be used
    //! with Logger::reportTest{Start,End}().
    //!
    class TestAtom
    {
    public:
        TestAtom(TestAtom&&) = default;

    private:
        friend class Logger;

        // Private: instances are created only by Logger (see defineTest()).
        TestAtom(bool started, const std::string& name, const std::string& cmdline)
            : mStarted(started)
            , mName(name)
            , mCmdline(cmdline)
        {
        }

        bool mStarted;        // set to true by reportTestStart()
        std::string mName;    // test name printed in the status line
        std::string mCmdline; // command line printed in the status line
    };

    //!
    //! \brief Define a test for logging
    //!
    //! \param[in] name The name of the test.  This should be a string starting with
    //!                  "TensorRT" and containing dot-separated strings containing
    //!                  the characters [A-Za-z0-9_].
    //!                  For example, "TensorRT.sample_googlenet"
    //! \param[in] cmdline The command line used to reproduce the test
    //
    //! \return a TestAtom that can be used in Logger::reportTest{Start,End}().
    //!
    static TestAtom defineTest(const std::string& name, const std::string& cmdline)
    {
        return TestAtom(false, name, cmdline);
    }

    //!
    //! \brief A convenience overloaded version of defineTest() that accepts an array of command-line arguments
    //!        as input
    //!
    //! \param[in] name The name of the test
    //! \param[in] argc The number of command-line arguments
    //! \param[in] argv The array of command-line arguments (given as C strings)
    //!
    //! \return a TestAtom that can be used in Logger::reportTest{Start,End}().
    static TestAtom defineTest(const std::string& name, int argc, char const* const* argv)
    {
        auto cmdline = genCmdlineString(argc, argv);
        return defineTest(name, cmdline);
    }

    //!
    //! \brief Report that a test has started.
    //!
    //! \pre reportTestStart() has not been called yet for the given testAtom
    //!
    //! \param[in] testAtom The handle to the test that has started
    //!
    static void reportTestStart(TestAtom& testAtom)
    {
        // Note: the RUNNING line is printed before the precondition is asserted.
        reportTestResult(testAtom, TestResult::kRUNNING);
        assert(!testAtom.mStarted);
        testAtom.mStarted = true;
    }

    //!
    //! \brief Report that a test has ended.
    //!
    //! \pre reportTestStart() has been called for the given testAtom
    //!
    //! \param[in] testAtom The handle to the test that has ended
    //! \param[in] result The result of the test. Should be one of TestResult::kPASSED,
    //!                   TestResult::kFAILED, TestResult::kWAIVED
    //!
    static void reportTestEnd(const TestAtom& testAtom, TestResult result)
    {
        assert(result != TestResult::kRUNNING);
        assert(testAtom.mStarted);
        reportTestResult(testAtom, result);
    }

    //! Reports kPASSED and returns EXIT_SUCCESS.
    static int reportPass(const TestAtom& testAtom)
    {
        reportTestEnd(testAtom, TestResult::kPASSED);
        return EXIT_SUCCESS;
    }

    //! Reports kFAILED and returns EXIT_FAILURE.
    static int reportFail(const TestAtom& testAtom)
    {
        reportTestEnd(testAtom, TestResult::kFAILED);
        return EXIT_FAILURE;
    }

    //! Reports kWAIVED and returns EXIT_SUCCESS.
    static int reportWaive(const TestAtom& testAtom)
    {
        reportTestEnd(testAtom, TestResult::kWAIVED);
        return EXIT_SUCCESS;
    }

    //! Reports pass or fail depending on \p pass and returns the matching exit code.
    static int reportTest(const TestAtom& testAtom, bool pass)
    {
        return pass ? reportPass(testAtom) : reportFail(testAtom);
    }

    Severity getReportableSeverity() const
    {
        return mReportableSeverity;
    }

private:
    //!
    //! \brief returns an appropriate string for prefixing a log message with the given severity
    //!
    static const char* severityPrefix(Severity severity)
    {
        switch (severity)
        {
        case Severity::kINTERNAL_ERROR: return "[F] ";
        case Severity::kERROR: return "[E] ";
        case Severity::kWARNING: return "[W] ";
        case Severity::kINFO: return "[I] ";
        case Severity::kVERBOSE: return "[V] ";
        default: assert(0); return "";
        }
    }

    //!
    //! \brief returns an appropriate string for prefixing a test result message with the given result
    //!
    static const char* testResultString(TestResult result)
    {
        switch (result)
        {
        case TestResult::kRUNNING: return "RUNNING";
        case TestResult::kPASSED: return "PASSED";
        case TestResult::kFAILED: return "FAILED";
        case TestResult::kWAIVED: return "WAIVED";
        default: assert(0); return "";
        }
    }

    //!
    //! \brief returns an appropriate output stream (cout or cerr) to use with the given severity
    //!
    static std::ostream& severityOstream(Severity severity)
    {
        // NOTE(review): relies on the numeric order of nvinfer1 Severity values
        // (kINFO and anything comparing >= it go to std::cout) -- confirm against NvInfer headers.
        return severity >= Severity::kINFO ? std::cout : std::cerr;
    }

    //!
    //! \brief method that implements logging test results
    //!
    static void reportTestResult(const TestAtom& testAtom, TestResult result)
    {
        // Status line format: "&&&& <RESULT> <test name> # <command line>"
        severityOstream(Severity::kINFO) << "&&&& " << testResultString(result) << " " << testAtom.mName << " # "
                                         << testAtom.mCmdline << std::endl;
    }

    //!
    //! \brief generate a command line string from the given (argc, argv) values
    //!
    static std::string genCmdlineString(int argc, char const* const* argv)
    {
        std::stringstream ss;
        for (int i = 0; i < argc; i++)
        {
            // Single space between arguments; no quoting or escaping is applied.
            if (i > 0)
                ss << " ";
            ss << argv[i];
        }
        return ss.str();
    }

    Severity mReportableSeverity;
};
inline LogStreamConsumer LOG_WARN(const Logger& logger) { return LogStreamConsumer(logger.getReportableSeverity(), Severity::kWARNING); } //! //! \brief produces a LogStreamConsumer object that can be used to log messages of severity kERROR //! //! Example usage: //! //! LOG_ERROR(logger) << "hello world" << std::endl; //! inline LogStreamConsumer LOG_ERROR(const Logger& logger) { return LogStreamConsumer(logger.getReportableSeverity(), Severity::kERROR); } //! //! \brief produces a LogStreamConsumer object that can be used to log messages of severity kINTERNAL_ERROR // ("fatal" severity) //! //! Example usage: //! //! LOG_FATAL(logger) << "hello world" << std::endl; //! inline LogStreamConsumer LOG_FATAL(const Logger& logger) { return LogStreamConsumer(logger.getReportableSeverity(), Severity::kINTERNAL_ERROR); } } // anonymous namespace #endif // TENSORRT_LOGGING_H ================================================ FILE: yolov7/include/macros.h ================================================ #ifndef __MACROS_H #define __MACROS_H #include "NvInfer.h" #ifdef API_EXPORTS #if defined(_MSC_VER) #define API __declspec(dllexport) #else #define API __attribute__((visibility("default"))) #endif #else #if defined(_MSC_VER) #define API __declspec(dllimport) #else #define API #endif #endif // API_EXPORTS #if NV_TENSORRT_MAJOR >= 8 #define TRT_NOEXCEPT noexcept #define TRT_CONST_ENQUEUE const #else #define TRT_NOEXCEPT #define TRT_CONST_ENQUEUE #endif #endif // __MACROS_H ================================================ FILE: yolov7/include/model.h ================================================ #pragma once #include "NvInfer.h" #include nvinfer1::IHostMemory* build_engine_yolov7e6e(unsigned int maxBatchSize, nvinfer1::IBuilder* builder, nvinfer1::IBuilderConfig* config, nvinfer1::DataType dt, const std::string& wts_path); nvinfer1::IHostMemory* build_engine_yolov7d6(unsigned int maxBatchSize, nvinfer1::IBuilder* builder, nvinfer1::IBuilderConfig* config, nvinfer1::DataType 
dt, const std::string& wts_path); nvinfer1::IHostMemory* build_engine_yolov7e6(unsigned int maxBatchSize, nvinfer1::IBuilder* builder, nvinfer1::IBuilderConfig* config, nvinfer1::DataType dt, const std::string& wts_path); nvinfer1::IHostMemory* build_engine_yolov7w6(unsigned int maxBatchSize, nvinfer1::IBuilder* builder, nvinfer1::IBuilderConfig* config, nvinfer1::DataType dt, const std::string& wts_path); nvinfer1::IHostMemory* build_engine_yolov7x(unsigned int maxBatchSize, nvinfer1::IBuilder* builder, nvinfer1::IBuilderConfig* config, nvinfer1::DataType dt, const std::string& wts_path); nvinfer1::IHostMemory* build_engine_yolov7(unsigned int maxBatchSize, nvinfer1::IBuilder* builder, nvinfer1::IBuilderConfig* config, nvinfer1::DataType dt, const std::string& wts_path); nvinfer1::IHostMemory* build_engine_yolov7_tiny(unsigned int maxBatchSize, nvinfer1::IBuilder* builder, nvinfer1::IBuilderConfig* config, nvinfer1::DataType dt, std::string& wts_name); ================================================ FILE: yolov7/include/postprocess.h ================================================ #pragma once #include "types.h" #include cv::Rect get_rect(cv::Mat& img, float bbox[4]); void nms(std::vector& res, float *output, float conf_thresh, float nms_thresh = 0.5); void batch_nms(std::vector>& batch_res, float *output, int batch_size, int output_size, float conf_thresh, float nms_thresh = 0.5); void draw_bbox(std::vector& img_batch, std::vector>& res_batch); ================================================ FILE: yolov7/include/preprocess.h ================================================ #pragma once #include #include #include #include void cuda_preprocess_init(int max_image_size); void cuda_preprocess_destroy(); void cuda_preprocess(uint8_t* src, int src_width, int src_height, float* dst, int dst_width, int dst_height, cudaStream_t stream); void cuda_batch_preprocess(std::vector& img_batch, float* dst, int dst_width, int dst_height, cudaStream_t stream); 
================================================ FILE: yolov7/include/types.h ================================================ #pragma once #include "config.h" struct YoloKernel { int width; int height; float anchors[kNumAnchor * 2]; }; struct alignas(float) Detection { //center_x center_y w h float bbox[4]; float conf; // bbox_conf * cls_conf float class_id; }; ================================================ FILE: yolov7/include/utils.h ================================================ #ifndef TRTX_YOLOV7_UTILS_H_ #define TRTX_YOLOV7_UTILS_H_ #include #include static inline cv::Mat preprocess_img(cv::Mat& img, int input_w, int input_h) { int w, h, x, y; float r_w = input_w / (img.cols*1.0); float r_h = input_h / (img.rows*1.0); if (r_h > r_w) { w = input_w; h = r_w * img.rows; x = 0; y = (input_h - h) / 2; } else { w = r_h * img.cols; h = input_h; x = (input_w - w) / 2; y = 0; } cv::Mat re(h, w, CV_8UC3); cv::resize(img, re, re.size(), 0, 0, cv::INTER_LINEAR); cv::Mat out(input_h, input_w, CV_8UC3, cv::Scalar(128, 128, 128)); re.copyTo(out(cv::Rect(x, y, re.cols, re.rows))); return out; } static inline int read_files_in_dir(const char *p_dir_name, std::vector &file_names) { DIR *p_dir = opendir(p_dir_name); if (p_dir == nullptr) { return -1; } struct dirent* p_file = nullptr; while ((p_file = readdir(p_dir)) != nullptr) { if (strcmp(p_file->d_name, ".") != 0 && strcmp(p_file->d_name, "..") != 0) { //std::string cur_file_name(p_dir_name); //cur_file_name += "/"; //cur_file_name += p_file->d_name; std::string cur_file_name(p_file->d_name); file_names.push_back(cur_file_name); } } closedir(p_dir); return 0; } #endif // TRTX_YOLOV7_UTILS_H_ ================================================ FILE: yolov7/main.cpp ================================================ #include "config.h" #include "model.h" #include "cuda_utils.h" #include "logging.h" #include "utils.h" #include "preprocess.h" #include "postprocess.h" #include #include using namespace nvinfer1; const static int 
kOutputSize = kMaxNumOutputBbox * sizeof(Detection) / sizeof(float) + 1; static Logger gLogger; void serialize_engine(unsigned int maxBatchSize, std::string& wts_name, std::string& sub_type, std::string& engine_name) { // Create builder IBuilder* builder = createInferBuilder(gLogger); IBuilderConfig* config = builder->createBuilderConfig(); // Create model to populate the network, then set the outputs and create an engine IHostMemory* serialized_engine = nullptr; if (sub_type == "t") { serialized_engine = build_engine_yolov7_tiny(maxBatchSize, builder, config, DataType::kFLOAT, wts_name); } else if (sub_type == "v7") { serialized_engine = build_engine_yolov7(maxBatchSize, builder, config, DataType::kFLOAT, wts_name); } else if (sub_type == "x") { serialized_engine = build_engine_yolov7x(maxBatchSize, builder, config, DataType::kFLOAT, wts_name); } else if (sub_type == "w6") { serialized_engine = build_engine_yolov7w6(maxBatchSize, builder, config, DataType::kFLOAT, wts_name); } else if (sub_type == "e6") { serialized_engine = build_engine_yolov7e6(maxBatchSize, builder, config, DataType::kFLOAT, wts_name); } else if (sub_type == "d6") { serialized_engine = build_engine_yolov7d6(maxBatchSize, builder, config, DataType::kFLOAT, wts_name); } else if (sub_type == "e6e") { serialized_engine = build_engine_yolov7e6e(maxBatchSize, builder, config, DataType::kFLOAT, wts_name); } assert(serialized_engine != nullptr); std::ofstream p(engine_name, std::ios::binary); if (!p) { std::cerr << "could not open plan output file" << std::endl; assert(false); } p.write(reinterpret_cast(serialized_engine->data()), serialized_engine->size()); delete config; delete serialized_engine; delete builder; } void deserialize_engine(std::string& engine_name, IRuntime** runtime, ICudaEngine** engine, IExecutionContext** context) { std::ifstream file(engine_name, std::ios::binary); if (!file.good()) { std::cerr << "read " << engine_name << " error!" 
<< std::endl; assert(false); } size_t size = 0; file.seekg(0, file.end); size = file.tellg(); file.seekg(0, file.beg); char* serialized_engine = new char[size]; assert(serialized_engine); file.read(serialized_engine, size); file.close(); *runtime = createInferRuntime(gLogger); assert(*runtime); *engine = (*runtime)->deserializeCudaEngine(serialized_engine, size); assert(*engine); *context = (*engine)->createExecutionContext(); assert(*context); delete[] serialized_engine; } void prepare_buffer(ICudaEngine* engine, float** input_buffer_device, float** output_buffer_device, float** output_buffer_host) { assert(engine->getNbBindings() == 2); // In order to bind the buffers, we need to know the names of the input and output tensors. // Note that indices are guaranteed to be less than IEngine::getNbBindings() const int inputIndex = engine->getBindingIndex(kInputTensorName); const int outputIndex = engine->getBindingIndex(kOutputTensorName); assert(inputIndex == 0); assert(outputIndex == 1); // Create GPU buffers on device CUDA_CHECK(cudaMalloc((void**)input_buffer_device, kBatchSize * 3 * kInputH * kInputW * sizeof(float))); CUDA_CHECK(cudaMalloc((void**)output_buffer_device, kBatchSize * kOutputSize * sizeof(float))); *output_buffer_host = new float[kBatchSize * kOutputSize]; } void infer(IExecutionContext& context, cudaStream_t& stream, void** buffers, float* output, int batchSize) { // infer on the batch asynchronously, and DMA output back to host context.enqueue(batchSize, buffers, stream, nullptr); CUDA_CHECK(cudaMemcpyAsync(output, buffers[1], batchSize * kOutputSize * sizeof(float), cudaMemcpyDeviceToHost, stream)); CUDA_CHECK(cudaStreamSynchronize(stream)); } bool parse_args(int argc, char** argv, std::string& wts, std::string& engine, std::string& img_dir, std::string& sub_type) { if (argc < 4) return false; if (std::string(argv[1]) == "-s" && argc == 5) { wts = std::string(argv[2]); engine = std::string(argv[3]); sub_type = std::string(argv[4]); } else if 
(std::string(argv[1]) == "-d" && argc == 4) { engine = std::string(argv[2]); img_dir = std::string(argv[3]); } else { return false; } return true; } int main(int argc, char** argv) { cudaSetDevice(kGpuId); std::string wts_name = ""; std::string engine_name = ""; std::string img_dir; std::string sub_type = ""; if (!parse_args(argc, argv, wts_name, engine_name, img_dir, sub_type)) { std::cerr << "Arguments not right!" << std::endl; std::cerr << "./yolov7 -s [.wts] [.engine] [t/v7/x/w6/e6/d6/e6e] // serialize model to plan file" << std::endl; std::cerr << "./yolov7 -d [.engine] ../samples // deserialize plan file and run inference" << std::endl; return -1; } // Create a model using the API directly and serialize it to a file if (!wts_name.empty()) { serialize_engine(kBatchSize, wts_name, sub_type, engine_name); return 0; } // Deserialize the engine from file IRuntime* runtime = nullptr; ICudaEngine* engine = nullptr; IExecutionContext* context = nullptr; deserialize_engine(engine_name, &runtime, &engine, &context); cudaStream_t stream; CUDA_CHECK(cudaStreamCreate(&stream)); cuda_preprocess_init(kMaxInputImageSize); // Prepare cpu and gpu buffers float* device_buffers[2]; float* output_buffer_host = nullptr; prepare_buffer(engine, &device_buffers[0], &device_buffers[1], &output_buffer_host); // Read images from directory std::vector file_names; if (read_files_in_dir(img_dir.c_str(), file_names) < 0) { std::cerr << "read_files_in_dir failed." 
<< std::endl; return -1; } // batch predict for (size_t i = 0; i < file_names.size(); i += kBatchSize) { // Get a batch of images std::vector img_batch; std::vector img_name_batch; for (size_t j = i; j < i + kBatchSize && j < file_names.size(); j++) { cv::Mat img = cv::imread(img_dir + "/" + file_names[j]); img_batch.push_back(img); img_name_batch.push_back(file_names[j]); } // Preprocess cuda_batch_preprocess(img_batch, device_buffers[0], kInputW, kInputH, stream); // Run inference auto start = std::chrono::system_clock::now(); infer(*context, stream, (void**)device_buffers, output_buffer_host, kBatchSize); auto end = std::chrono::system_clock::now(); std::cout << "inference time: " << std::chrono::duration_cast(end - start).count() << "ms" << std::endl; // NMS std::vector> res_batch; batch_nms(res_batch, output_buffer_host, img_batch.size(), kOutputSize, kConfThresh, kNmsThresh); // Draw bounding boxes draw_bbox(img_batch, res_batch); // Save images for (size_t j = 0; j < img_batch.size(); j++) { cv::imwrite("_" + img_name_batch[j], img_batch[j]); } } // Release stream and buffers cudaStreamDestroy(stream); CUDA_CHECK(cudaFree(device_buffers[0])); CUDA_CHECK(cudaFree(device_buffers[1])); delete[] output_buffer_host; cuda_preprocess_destroy(); // Destroy the engine delete context; delete engine; delete runtime; // Print histogram of the output distribution //std::cout << "\nOutput:\n\n"; //for (unsigned int i = 0; i < kOutputSize; i++) //{ // std::cout << prob[i] << ", "; // if (i % 10 == 0) std::cout << std::endl; //} //std::cout << std::endl; return 0; } ================================================ FILE: yolov7/plugin/yololayer.cu ================================================ #include "yololayer.h" #include "cuda_utils.h" #include #include #include namespace Tn { template void write(char*& buffer, const T& val) { *reinterpret_cast(buffer) = val; buffer += sizeof(T); } template void read(const char*& buffer, T& val) { val = *reinterpret_cast(buffer); 
buffer += sizeof(T); } } // namespace Tn namespace nvinfer1 { YoloLayerPlugin::YoloLayerPlugin(int classCount, int netWidth, int netHeight, int maxOut, const std::vector& vYoloKernel) { mClassCount = classCount; mYoloV7NetWidth = netWidth; mYoloV7NetHeight = netHeight; mMaxOutObject = maxOut; mYoloKernel = vYoloKernel; mKernelCount = vYoloKernel.size(); CUDA_CHECK(cudaMallocHost(&mAnchor, mKernelCount * sizeof(void*))); size_t AnchorLen = sizeof(float) * kNumAnchor * 2; for (int ii = 0; ii < mKernelCount; ii++) { CUDA_CHECK(cudaMalloc(&mAnchor[ii], AnchorLen)); const auto& yolo = mYoloKernel[ii]; CUDA_CHECK(cudaMemcpy(mAnchor[ii], yolo.anchors, AnchorLen, cudaMemcpyHostToDevice)); } } YoloLayerPlugin::~YoloLayerPlugin() { for (int ii = 0; ii < mKernelCount; ii++) { CUDA_CHECK(cudaFree(mAnchor[ii])); } CUDA_CHECK(cudaFreeHost(mAnchor)); } // create the plugin at runtime from a byte stream YoloLayerPlugin::YoloLayerPlugin(const void* data, size_t length) { using namespace Tn; const char *d = reinterpret_cast(data), *a = d; read(d, mClassCount); read(d, mThreadCount); read(d, mKernelCount); read(d, mYoloV7NetWidth); read(d, mYoloV7NetHeight); read(d, mMaxOutObject); mYoloKernel.resize(mKernelCount); auto kernelSize = mKernelCount * sizeof(YoloKernel); memcpy(mYoloKernel.data(), d, kernelSize); d += kernelSize; CUDA_CHECK(cudaMallocHost(&mAnchor, mKernelCount * sizeof(void*))); size_t AnchorLen = sizeof(float) * kNumAnchor * 2; for (int ii = 0; ii < mKernelCount; ii++) { CUDA_CHECK(cudaMalloc(&mAnchor[ii], AnchorLen)); const auto& yolo = mYoloKernel[ii]; CUDA_CHECK(cudaMemcpy(mAnchor[ii], yolo.anchors, AnchorLen, cudaMemcpyHostToDevice)); } assert(d == a + length); } void YoloLayerPlugin::serialize(void* buffer) const TRT_NOEXCEPT { using namespace Tn; char* d = static_cast(buffer), *a = d; write(d, mClassCount); write(d, mThreadCount); write(d, mKernelCount); write(d, mYoloV7NetWidth); write(d, mYoloV7NetHeight); write(d, mMaxOutObject); auto kernelSize = mKernelCount 
* sizeof(YoloKernel); memcpy(d, mYoloKernel.data(), kernelSize); d += kernelSize; assert(d == a + getSerializationSize()); } size_t YoloLayerPlugin::getSerializationSize() const TRT_NOEXCEPT { return sizeof(mClassCount) + sizeof(mThreadCount) + sizeof(mKernelCount) + sizeof(YoloKernel) * mYoloKernel.size() + sizeof(mYoloV7NetWidth) + sizeof(mYoloV7NetHeight) + sizeof(mMaxOutObject); } int YoloLayerPlugin::initialize() TRT_NOEXCEPT { return 0; } Dims YoloLayerPlugin::getOutputDimensions(int index, const Dims* inputs, int nbInputDims) TRT_NOEXCEPT { //output the result to channel int totalsize = mMaxOutObject * sizeof(Detection) / sizeof(float); return Dims3(totalsize + 1, 1, 1); } // Set plugin namespace void YoloLayerPlugin::setPluginNamespace(const char* pluginNamespace) TRT_NOEXCEPT { mPluginNamespace = pluginNamespace; } const char* YoloLayerPlugin::getPluginNamespace() const TRT_NOEXCEPT { return mPluginNamespace; } // Return the DataType of the plugin output at the requested index DataType YoloLayerPlugin::getOutputDataType(int index, const nvinfer1::DataType* inputTypes, int nbInputs) const TRT_NOEXCEPT { return DataType::kFLOAT; } // Return true if output tensor is broadcast across a batch. bool YoloLayerPlugin::isOutputBroadcastAcrossBatch(int outputIndex, const bool* inputIsBroadcasted, int nbInputs) const TRT_NOEXCEPT { return false; } // Return true if plugin can use input that is broadcast across batch without replication. bool YoloLayerPlugin::canBroadcastInputAcrossBatch(int inputIndex) const TRT_NOEXCEPT { return false; } void YoloLayerPlugin::configurePlugin(PluginTensorDesc const* in, int nbInput, PluginTensorDesc const* out, int nbOutput) TRT_NOEXCEPT {} // Attach the plugin object to an execution context and grant the plugin the access to some context resource. 
void YoloLayerPlugin::attachToContext(cudnnContext* cudnnContext, cublasContext* cublasContext, IGpuAllocator* gpuAllocator) TRT_NOEXCEPT {} // Detach the plugin object from its execution context. void YoloLayerPlugin::detachFromContext() TRT_NOEXCEPT {} const char* YoloLayerPlugin::getPluginType() const TRT_NOEXCEPT { return "YoloLayer_TRT"; } const char* YoloLayerPlugin::getPluginVersion() const TRT_NOEXCEPT { return "1"; } void YoloLayerPlugin::destroy() TRT_NOEXCEPT { delete this; } // Clone the plugin IPluginV2IOExt* YoloLayerPlugin::clone() const TRT_NOEXCEPT { YoloLayerPlugin* p = new YoloLayerPlugin(mClassCount, mYoloV7NetWidth, mYoloV7NetHeight, mMaxOutObject, mYoloKernel); p->setPluginNamespace(mPluginNamespace); return p; } __device__ float Logist(float data) { return 1.0f / (1.0f + expf(-data)); }; __global__ void CalDetection(const float *input, float *output, int noElements, const int netwidth, const int netheight, int maxoutobject, int yoloWidth, int yoloHeight, const float anchors[kNumAnchor * 2], int classes, int outputElem) { int idx = threadIdx.x + blockDim.x * blockIdx.x; if (idx >= noElements) return; int total_grid = yoloWidth * yoloHeight; // 80*80 40*40 20*20 int bnIdx = idx / total_grid; idx = idx - total_grid * bnIdx; int info_len_i = 5 + classes; const float* curInput = input + bnIdx * (info_len_i * total_grid * kNumAnchor); for (int k = 0; k < 3; k++) { float box_prob = Logist(curInput[idx + k * info_len_i * total_grid + 4 * total_grid]); if (box_prob < kIgnoreThresh) continue; int class_id = 0; float max_cls_prob = 0.0; for (int i = 5; i < info_len_i; ++i) { float p = Logist(curInput[idx + k * info_len_i * total_grid + i * total_grid]); if (p > max_cls_prob) { max_cls_prob = p; class_id = i - 5; } } float *res_count = output + bnIdx * outputElem; int count = (int)atomicAdd(res_count, 1); if (count >= maxoutobject) return; char *data = (char*)res_count + sizeof(float) + count * sizeof(Detection); Detection *det = (Detection*)(data); int 
row = idx / yoloWidth; int col = idx % yoloWidth; // Location // pytorch: // y = x[i].sigmoid() // y[..., 0:2] = (y[..., 0:2] * 2. - 0.5 + self.grid[i].to(x[i].device)) * self.stride[i] # xy // y[..., 2:4] = (y[..., 2:4] * 2) ** 2 * self.anchor_grid[i] # wh // X: (sigmoid(tx) + cx)/FeaturemapW * netwidth det->bbox[0] = (col - 0.5f + 2.0f * Logist(curInput[idx + k * info_len_i * total_grid + 0 * total_grid])) * netwidth / yoloWidth; det->bbox[1] = (row - 0.5f + 2.0f * Logist(curInput[idx + k * info_len_i * total_grid + 1 * total_grid])) * netheight / yoloHeight; // W: (Pw * e^tw) / FeaturemapW * netwidth // v5: https://github.com/ultralytics/yolov7/issues/471 //float box_w = ((row[2] * 2)*(row[2] * 2)) * float(anchors[a][c][0]) * scale; //float box_h = ((row[3] * 2) * (row[3] * 2)) * float(anchors[a][c][1]) * scale; det->bbox[2] = 2.0f * Logist(curInput[idx + k * info_len_i * total_grid + 2 * total_grid]); det->bbox[2] = det->bbox[2] * det->bbox[2] * anchors[2 * k]; det->bbox[3] = 2.0f * Logist(curInput[idx + k * info_len_i * total_grid + 3 * total_grid]); det->bbox[3] = det->bbox[3] * det->bbox[3] * anchors[2 * k + 1]; det->conf = box_prob * max_cls_prob; det->class_id = class_id; } } void YoloLayerPlugin::forwardGpu(const float* const* inputs, float *output, cudaStream_t stream, int batchSize) { int outputElem = 1 + mMaxOutObject * sizeof(Detection) / sizeof(float); for (int idx = 0; idx < batchSize; ++idx) { CUDA_CHECK(cudaMemsetAsync(output + idx * outputElem, 0, sizeof(float), stream)); } int numElem = 0; for (unsigned int i = 0; i < mYoloKernel.size(); ++i) { const auto& yolo = mYoloKernel[i]; numElem = yolo.width * yolo.height * batchSize; if (numElem < mThreadCount) mThreadCount = numElem; CalDetection<<<(numElem + mThreadCount - 1) / mThreadCount, mThreadCount, 0, stream>>> (inputs[i], output, numElem, mYoloV7NetWidth, mYoloV7NetHeight, mMaxOutObject, yolo.width, yolo.height, (float*)mAnchor[i], mClassCount, outputElem); } } int YoloLayerPlugin::enqueue(int 
batchSize, const void* const* inputs, void* TRT_CONST_ENQUEUE* outputs, void* workspace, cudaStream_t stream) TRT_NOEXCEPT { forwardGpu((const float* const*)inputs, (float*)outputs[0], stream, batchSize); return 0; } PluginFieldCollection YoloPluginCreator::mFC{}; std::vector YoloPluginCreator::mPluginAttributes; YoloPluginCreator::YoloPluginCreator() { mPluginAttributes.clear(); mFC.nbFields = mPluginAttributes.size(); mFC.fields = mPluginAttributes.data(); } const char* YoloPluginCreator::getPluginName() const TRT_NOEXCEPT { return "YoloLayer_TRT"; } const char* YoloPluginCreator::getPluginVersion() const TRT_NOEXCEPT { return "1"; } const PluginFieldCollection* YoloPluginCreator::getFieldNames() TRT_NOEXCEPT { return &mFC; } IPluginV2IOExt* YoloPluginCreator::createPlugin(const char* name, const PluginFieldCollection* fc) TRT_NOEXCEPT { assert(fc->nbFields == 2); assert(strcmp(fc->fields[0].name, "netinfo") == 0); assert(strcmp(fc->fields[1].name, "kernels") == 0); int *p_netinfo = (int*)(fc->fields[0].data); int class_count = p_netinfo[0]; int input_w = p_netinfo[1]; int input_h = p_netinfo[2]; int max_output_object_count = p_netinfo[3]; std::vector kernels(fc->fields[1].length); memcpy(&kernels[0], fc->fields[1].data, kernels.size() * sizeof(YoloKernel)); YoloLayerPlugin* obj = new YoloLayerPlugin(class_count, input_w, input_h, max_output_object_count, kernels); obj->setPluginNamespace(mNamespace.c_str()); return obj; } IPluginV2IOExt* YoloPluginCreator::deserializePlugin(const char* name, const void* serialData, size_t serialLength) TRT_NOEXCEPT { // This object will be deleted when the network is destroyed, which will // call YoloLayerPlugin::destroy() YoloLayerPlugin* obj = new YoloLayerPlugin(serialData, serialLength); obj->setPluginNamespace(mNamespace.c_str()); return obj; } } // namespace nvinfer1 ================================================ FILE: yolov7/plugin/yololayer.h ================================================ #pragma once #include 
"macros.h" #include "types.h" #include #include namespace nvinfer1 { class API YoloLayerPlugin : public IPluginV2IOExt { public: YoloLayerPlugin(int classCount, int netWidth, int netHeight, int maxOut, const std::vector& vYoloKernel); YoloLayerPlugin(const void* data, size_t length); ~YoloLayerPlugin(); int getNbOutputs() const TRT_NOEXCEPT override { return 1; } Dims getOutputDimensions(int index, const Dims* inputs, int nbInputDims) TRT_NOEXCEPT override; int initialize() TRT_NOEXCEPT override; virtual void terminate() TRT_NOEXCEPT override {} virtual size_t getWorkspaceSize(int maxBatchSize) const TRT_NOEXCEPT override { return 0; } virtual int enqueue(int batchSize, const void* const* inputs, void*TRT_CONST_ENQUEUE* outputs, void* workspace, cudaStream_t stream) TRT_NOEXCEPT override; virtual size_t getSerializationSize() const TRT_NOEXCEPT override; virtual void serialize(void* buffer) const TRT_NOEXCEPT override; bool supportsFormatCombination(int pos, const PluginTensorDesc* inOut, int nbInputs, int nbOutputs) const TRT_NOEXCEPT override { return inOut[pos].format == TensorFormat::kLINEAR && inOut[pos].type == DataType::kFLOAT; } const char* getPluginType() const TRT_NOEXCEPT override; const char* getPluginVersion() const TRT_NOEXCEPT override; void destroy() TRT_NOEXCEPT override; IPluginV2IOExt* clone() const TRT_NOEXCEPT override; void setPluginNamespace(const char* pluginNamespace) TRT_NOEXCEPT override; const char* getPluginNamespace() const TRT_NOEXCEPT override; DataType getOutputDataType(int index, const nvinfer1::DataType* inputTypes, int nbInputs) const TRT_NOEXCEPT override; bool isOutputBroadcastAcrossBatch(int outputIndex, const bool* inputIsBroadcasted, int nbInputs) const TRT_NOEXCEPT override; bool canBroadcastInputAcrossBatch(int inputIndex) const TRT_NOEXCEPT override; void attachToContext( cudnnContext* cudnnContext, cublasContext* cublasContext, IGpuAllocator* gpuAllocator) TRT_NOEXCEPT override; void configurePlugin(PluginTensorDesc 
const* in, int nbInput, PluginTensorDesc const* out, int nbOutput) TRT_NOEXCEPT override; void detachFromContext() TRT_NOEXCEPT override; private: void forwardGpu(const float* const* inputs, float *output, cudaStream_t stream, int batchSize = 1); int mThreadCount = 256; const char* mPluginNamespace; int mKernelCount; int mClassCount; int mYoloV7NetWidth; int mYoloV7NetHeight; int mMaxOutObject; std::vector mYoloKernel; void** mAnchor; }; class API YoloPluginCreator : public IPluginCreator { public: YoloPluginCreator(); ~YoloPluginCreator() override = default; const char* getPluginName() const TRT_NOEXCEPT override; const char* getPluginVersion() const TRT_NOEXCEPT override; const PluginFieldCollection* getFieldNames() TRT_NOEXCEPT override; IPluginV2IOExt* createPlugin(const char* name, const PluginFieldCollection* fc) TRT_NOEXCEPT override; IPluginV2IOExt* deserializePlugin(const char* name, const void* serialData, size_t serialLength) TRT_NOEXCEPT override; void setPluginNamespace(const char* libNamespace) TRT_NOEXCEPT override { mNamespace = libNamespace; } const char* getPluginNamespace() const TRT_NOEXCEPT override { return mNamespace.c_str(); } private: std::string mNamespace; static PluginFieldCollection mFC; static std::vector mPluginAttributes; }; REGISTER_TENSORRT_PLUGIN(YoloPluginCreator); } // namespace nvinfer1 ================================================ FILE: yolov7/src/block.cpp ================================================ #include "block.h" #include "yololayer.h" #include "NvInfer.h" #include #include #include #include #include using namespace nvinfer1; // TensorRT weight files have a simple space delimited format: // [type] [size] std::map loadWeights(const std::string file) { std::cout << "Loading weights: " << file << std::endl; std::map weightMap; // Open weights file std::ifstream input(file); assert(input.is_open() && "Unable to load weight file. 
please check if the .wts file path is right!!!!!!");

    // Read number of weight blobs
    int32_t count;
    input >> count;
    assert(count > 0 && "Invalid weight map file.");

    while (count--) {
        Weights wt{ DataType::kFLOAT, nullptr, 0 };
        uint32_t size;

        // Read name and type of blob
        std::string name;
        input >> name >> std::dec >> size;
        wt.type = DataType::kFLOAT;

        // Load blob
        // NOTE(review): `val` is a pointer, so sizeof(val) is the pointer size rather than
        // sizeof(uint32_t); the loop below only writes `size` 32-bit values. Confirm whether
        // the element size was intended here.
        uint32_t* val = reinterpret_cast(malloc(sizeof(val) * size));
        for (uint32_t x = 0, y = size; x < y; ++x) {
            // Each value is parsed as a hexadecimal integer.
            input >> std::hex >> val[x];
        }
        wt.values = val;
        wt.count = size;
        weightMap[name] = wt;
    }
    return weightMap;
}

// Adds a per-channel scale layer computed from four weightMap entries:
// "<lname>.weight" (gamma), "<lname>.bias" (beta), "<lname>.running_mean"
// and "<lname>.running_var".
//
//   scale[i] = gamma[i] / sqrt(var[i] + eps)
//   shift[i] = beta[i] - mean[i] * gamma[i] / sqrt(var[i] + eps)
//   power[i] = 1.0
//
// The three arrays are malloc'd here and stored back into weightMap under
// "<lname>.scale", "<lname>.shift" and "<lname>.power"; this function never
// frees them.
//
// Parameters:
//   network   - network the scale layer is added to.
//   weightMap - name -> Weights table; read for the four inputs, written for
//               the three derived entries.
//   input     - tensor fed to the scale layer.
//   lname     - key prefix used for all weightMap lookups.
//   eps       - value added to the variance before the square root.
// Returns the layer produced by network->addScale (asserted non-null).
//
// NOTE(review): entries are fetched with operator[], so a missing key is not
// reported here — presumably it would yield a default entry; confirm against
// the container type.
static IScaleLayer* addBatchNorm2d(INetworkDefinition* network, std::map& weightMap, ITensor& input, std::string lname, float eps) {
    float* gamma = (float*)weightMap[lname + ".weight"].values;
    float* beta = (float*)weightMap[lname + ".bias"].values;
    float* mean = (float*)weightMap[lname + ".running_mean"].values;
    float* var = (float*)weightMap[lname + ".running_var"].values;
    // The element count of running_var is used as the length of every array below.
    int len = weightMap[lname + ".running_var"].count;

    float* scval = reinterpret_cast(malloc(sizeof(float) * len));
    for (int i = 0; i < len; i++) {
        scval[i] = gamma[i] / sqrt(var[i] + eps);
    }
    Weights scale{ DataType::kFLOAT, scval, len };

    float* shval = reinterpret_cast(malloc(sizeof(float) * len));
    for (int i = 0; i < len; i++) {
        shval[i] = beta[i] - mean[i] * gamma[i] / sqrt(var[i] + eps);
    }
    Weights shift{ DataType::kFLOAT, shval, len };

    float* pval = reinterpret_cast(malloc(sizeof(float) * len));
    for (int i = 0; i < len; i++) {
        pval[i] = 1.0;
    }
    Weights power{ DataType::kFLOAT, pval, len };

    // Keep the derived arrays reachable through weightMap.
    weightMap[lname + ".scale"] = scale;
    weightMap[lname + ".shift"] = shift;
    weightMap[lname + ".power"] = power;
    // Argument order is shift, scale, power.
    IScaleLayer* scale_1 = network->addScale(input, ScaleMode::kCHANNEL, shift, scale, power);
    assert(scale_1);
    return scale_1;
}

// Builds convolution -> batch norm -> SiLU on `input`.
//
// Parameters:
//   c2    - number of convolution output maps.
//   k     - square kernel size (k x k).
//   s     - stride applied in both spatial dimensions.
//   p     - padding applied in both spatial dimensions.
//   lname - key prefix: the kernel is read from "<lname>.conv.weight" and the
//           batch-norm entries from "<lname>.bn.*".
// Returns the element-wise product layer bn * sigmoid(bn).
IElementWiseLayer* convBnSilu(INetworkDefinition* network, std::map& weightMap, ITensor& input, int c2, int k, int s, int p, std::string lname) {
    // Empty weights are passed as the last addConvolutionNd argument, after the kernel weights.
    Weights emptywts{ DataType::kFLOAT, nullptr, 0 };
    IConvolutionLayer* conv1 = network->addConvolutionNd(input, c2, DimsHW{ k, k }, weightMap[lname + ".conv.weight"], emptywts);
    assert(conv1);
    conv1->setName((lname + ".conv").c_str());
    conv1->setStrideNd(DimsHW{ s, s });
    conv1->setPaddingNd(DimsHW{ p, p });

    // Batch norm uses eps = 1e-3 here.
    IScaleLayer* bn1 = addBatchNorm2d(network, weightMap, *conv1->getOutput(0), lname + ".bn", 1e-3);

    // silu = x * sigmoid(x)
    IActivationLayer* sig1 = network->addActivation(*bn1->getOutput(0), ActivationType::kSIGMOID);
    assert(sig1);
    IElementWiseLayer* ew1 = network->addElementWise(*bn1->getOutput(0), *sig1->getOutput(0), ElementWiseOperation::kPROD);
    assert(ew1);
    return ew1;
}

// Takes four stride-2 slices of `input`, one per (row, column) offset in
// {0,1} x {0,1}, and concatenates them.
//
// Every slice has size { inch, kInputH / 2, kInputW / 2 } and stride
// { 1, 2, 2 }. The size comes from the global kInputH / kInputW constants,
// not from the dimensions of `input`.
//
// Parameters:
//   weightMap - not used by this function.
//   inch      - first component of each slice size (presumably the channel
//               count of `input`; confirm against the caller).
// Returns the concatenation layer. None of the created layers are
// null-checked here.
ILayer* ReOrg(INetworkDefinition* network, std::map& weightMap, ITensor& input, int inch) {
    ISliceLayer* s1 = network->addSlice(input, Dims3{ 0, 0, 0 }, Dims3{ inch, kInputH / 2, kInputW / 2 }, Dims3{ 1, 2, 2 });
    ISliceLayer* s2 = network->addSlice(input, Dims3{ 0, 1, 0 }, Dims3{ inch, kInputH / 2, kInputW / 2 }, Dims3{ 1, 2, 2 });
    ISliceLayer* s3 = network->addSlice(input, Dims3{ 0, 0, 1 }, Dims3{ inch, kInputH / 2, kInputW / 2 }, Dims3{ 1, 2, 2 });
    ISliceLayer* s4 = network->addSlice(input, Dims3{ 0, 1, 1 }, Dims3{ inch, kInputH / 2, kInputW / 2 }, Dims3{ 1, 2, 2 });
    ITensor* inputTensors[] = { s1->getOutput(0), s2->getOutput(0), s3->getOutput(0), s4->getOutput(0) };
    auto cat = network->addConcatenation(inputTensors, 4);
    return cat;
}

// Builds a two-branch downsampling block and concatenates the branches.
//
//   branch A: 1x1 conv (c1 outputs) -> 3x3 conv, stride 2, padding 1 (c2/2 outputs)
//   branch B: 2x2 max pool, stride 2 -> 1x1 conv (c2/2 outputs)
//
// Parameters:
//   c1    - output count of the first 1x1 convolution ("<lname>.cv1").
//   c2    - halved (truncated to int) to give the output count of
//           "<lname>.cv2" and "<lname>.cv3".
//   lname - key prefix for the three convBnSilu sub-blocks.
// Returns the concatenation of branch A and branch B, in that order.
ILayer* DownC(INetworkDefinition* network, std::map& weightMap, ITensor& input, int c1, int c2, const std::string& lname) {
    int c_ = int(c2 * 0.5);
    IElementWiseLayer* cv1 = convBnSilu(network, weightMap, input, c1, 1, 1, 0, lname + ".cv1");
    IElementWiseLayer* cv2 = convBnSilu(network, weightMap, *cv1->getOutput(0), c_, 3, 2, 1, lname + ".cv2");

    // The pooling branch starts from the block input, not from cv1.
    IPoolingLayer* m1 = network->addPoolingNd(input, PoolingType::kMAX, DimsHW{ 2, 2 });
    m1->setStrideNd(DimsHW{ 2, 2 });
    IElementWiseLayer* cv3 = convBnSilu(network, weightMap, *m1->getOutput(0), c_, 1, 1, 0, lname + ".cv3");

    ITensor* input_tensors[] = { cv2->getOutput(0), cv3->getOutput(0) };
    IConcatenationLayer* concat = network->addConcatenation(input_tensors, 2);
    return concat;
}

// Builds the block keyed "<lname>.cv1" .. "<lname>.cv7":
//
//   cv1 (1x1) -> cv3 (3x3) -> cv4 (1x1) -> max pools of 5x5, 9x9 and 13x13
//   (stride 1, padding 2 / 4 / 6), each applied to cv4
//   concat(cv4, pool5, pool9, pool13) -> cv5 (1x1) -> cv6 (3x3)
//   cv2 (1x1) applied directly to `input`
//   concat(cv6, cv2) -> cv7 (1x1, c2 outputs)
//
// Parameters:
//   c2    - output count of cv7. The intermediate width c_ is computed as
//           int(2 * c2 * 0.5).
//   lname - key prefix for the seven convBnSilu sub-blocks.
// Returns the cv7 layer.
IElementWiseLayer* SPPCSPC(INetworkDefinition* network, std::map& weightMap, ITensor& input, int c2, const std::string& lname) {
    int c_ = int(2 * c2 * 0.5);
    IElementWiseLayer* cv1 = convBnSilu(network, weightMap, input, c_, 1, 1, 0, lname + ".cv1");
    IElementWiseLayer* cv2 = convBnSilu(network, weightMap, input, c_, 1, 1, 0, lname + ".cv2");
    IElementWiseLayer* cv3 = convBnSilu(network, weightMap, *cv1->getOutput(0), c_, 3, 1, 1, lname + ".cv3");
    IElementWiseLayer* cv4 = convBnSilu(network, weightMap, *cv3->getOutput(0), c_, 1, 1, 0, lname + ".cv4");

    // Three max pools over the same cv4 output; stride 1 with padding 2 / 4 / 6.
    IPoolingLayer* m1 = network->addPoolingNd(*cv4->getOutput(0), PoolingType::kMAX, DimsHW{ 5, 5 });
    m1->setStrideNd(DimsHW{ 1, 1 });
    m1->setPaddingNd(DimsHW{ 2, 2 });
    IPoolingLayer* m2 = network->addPoolingNd(*cv4->getOutput(0), PoolingType::kMAX, DimsHW{ 9, 9 });
    m2->setStrideNd(DimsHW{ 1, 1 });
    m2->setPaddingNd(DimsHW{ 4, 4 });
    IPoolingLayer* m3 = network->addPoolingNd(*cv4->getOutput(0), PoolingType::kMAX, DimsHW{ 13, 13 });
    m3->setStrideNd(DimsHW{ 1, 1 });
    m3->setPaddingNd(DimsHW{ 6, 6 });

    ITensor* input_tensors[] = { cv4->getOutput(0), m1->getOutput(0), m2->getOutput(0), m3->getOutput(0) };
    IConcatenationLayer* concat = network->addConcatenation(input_tensors, 4);
    // 0U
    concat->setAxis(0);

    IElementWiseLayer* cv5 = convBnSilu(network, weightMap, *concat->getOutput(0), c_, 1, 1, 0, lname + ".cv5");
    IElementWiseLayer* cv6 = convBnSilu(network, weightMap, *cv5->getOutput(0), c_, 3, 1, 1, lname + ".cv6");

    // Join the pooled path (cv6) with the cv2 path taken from the block input.
    ITensor* input_tensors2[] = { cv6->getOutput(0), cv2->getOutput(0) };
    IConcatenationLayer* concat1 = network->addConcatenation(input_tensors2, 2);
    // 0U
    concat1->setAxis(0);

    IElementWiseLayer* cv7 = convBnSilu(network, weightMap, *concat1->getOutput(0), c2, 1, 1, 0, lname + ".cv7");
    return cv7;
}

// Builds two parallel conv + batch-norm branches on `input`, sums them and
// applies SiLU to the sum.
//
//   dense branch: k x k conv ("<lname>.rbr_dense.0.weight"), padding k / 2,
//                 stride s, then batch norm "<lname>.rbr_dense.1"
//   1x1 branch:   1 x 1 conv ("<lname>.rbr_1x1.0.weight"), stride s,
//                 then batch norm "<lname>.rbr_1x1.1"
//
// Parameters:
//   c2 - output count of both convolutions.
//   k  - kernel size of the dense branch.
//   s  - stride of both convolutions.
// Returns the element-wise product layer sum * sigmoid(sum). The sigmoid and
// final product layers are not null-checked here.
IElementWiseLayer* RepConv(INetworkDefinition* network, std::map& weightMap, ITensor& input, int c2, int k, int s, const std::string& lname) {
    Weights emptywts{ DataType::kFLOAT, nullptr, 0 };
    // 256 * 128 * 3 *3
    IConvolutionLayer* rbr_dense_conv = network->addConvolutionNd(input, c2, DimsHW{ k, k }, weightMap[lname + ".rbr_dense.0.weight"], emptywts);
    assert(rbr_dense_conv);
    // Padding is k / 2 using integer division.
    rbr_dense_conv->setPaddingNd(DimsHW{ k / 2, k / 2 });
    rbr_dense_conv->setStrideNd(DimsHW{ s, s });
    rbr_dense_conv->setName((lname + ".rbr_dense.0").c_str());
    IScaleLayer* rbr_dense_bn = addBatchNorm2d(network, weightMap, *rbr_dense_conv->getOutput(0), lname + ".rbr_dense.1", 1e-3);

    // The 1x1 branch sets a stride but no padding.
    IConvolutionLayer* rbr_1x1_conv = network->addConvolutionNd(input, c2, DimsHW{ 1, 1 }, weightMap[lname + ".rbr_1x1.0.weight"], emptywts);
    assert(rbr_1x1_conv);
    rbr_1x1_conv->setStrideNd(DimsHW{ s, s });
    rbr_1x1_conv->setName((lname + ".rbr_1x1.0").c_str());
    IScaleLayer* rbr_1x1_bn = addBatchNorm2d(network, weightMap, *rbr_1x1_conv->getOutput(0), lname + ".rbr_1x1.1", 1e-3);

    IElementWiseLayer* ew1 = network->addElementWise(*rbr_dense_bn->getOutput(0), *rbr_1x1_bn->getOutput(0), ElementWiseOperation::kSUM);
    assert(ew1);
    // silu
    IActivationLayer* sigmoid = network->addActivation(*ew1->getOutput(0), ActivationType::kSIGMOID);
    IElementWiseLayer* ew2 = network->addElementWise(*ew1->getOutput(0), *sigmoid->getOutput(0), ElementWiseOperation::kPROD);
    return ew2;
}

// Builds convolution -> batch norm -> leaky ReLU on `input`.
//
// Parameters:
//   outch - number of convolution output maps.
//   ksize - square kernel size.
//   s, p  - stride and padding applied in both spatial dimensions.
//   lname - key prefix: the kernel is read from "<lname>.conv.weight" and the
//           batch-norm entries from "<lname>.bn.*".
// Returns the activation layer (alpha = 0.1). The activation layer is not
// null-checked before setAlpha is called.
IActivationLayer* convBlockLeakRelu(INetworkDefinition* network, std::map& weightMap, ITensor& input, int outch, int ksize, int s, int p, std::string lname) {
    Weights emptywts{ DataType::kFLOAT, nullptr, 0 };
    IConvolutionLayer* conv1 = network->addConvolutionNd(input, outch, DimsHW{ ksize, ksize }, weightMap[lname + ".conv.weight"], emptywts);
    assert(conv1);
    conv1->setName((lname + ".conv").c_str());
    conv1->setStrideNd(DimsHW{ s, s });
    conv1->setPaddingNd(DimsHW{ p, p });
    //conv1->setNbGroups(g);
    // Batch norm uses eps = 1e-5 here (convBnSilu uses 1e-3).
    IScaleLayer* bn1 = addBatchNorm2d(network, weightMap, *conv1->getOutput(0), lname + ".bn", 1e-5);

    auto ew1 = network->addActivation(*bn1->getOutput(0), ActivationType::kLEAKY_RELU);
    ew1->setAlpha(0.1);
    return ew1;
}

// Splits the "<lname>.anchor_grid" weights into rows of kNumAnchor * 2 floats.
//
// The number of rows is wts.count / anchor_len using integer division, so any
// trailing values that do not fill a whole row are ignored.
// Returns the rows in order; the result is empty when the entry holds fewer
// than kNumAnchor * 2 values.
static std::vector> getAnchors(std::map& weightMap, std::string lname) {
    std::vector> anchors;
    Weights wts = weightMap[lname + ".anchor_grid"];
    int anchor_len = kNumAnchor * 2;
    for (int i = 0; i < wts.count / anchor_len; i++) {
        // The weight values are read as floats.
        auto *p = (const float*)wts.values + i * anchor_len;
        std::vector anchor(p, p + anchor_len);
        anchors.push_back(anchor);
    }
    return anchors;
}

// Creates the "YoloLayer_TRT" (version "1") plugin and adds it to the network
// with the first output of every layer in `dets` as its inputs.
//
// Two plugin fields are passed to the creator:
//   "netinfo" - four ints: kNumClass, kInputW, kInputH, kMaxNumOutputBbox.
//   "kernels" - one YoloKernel per anchor row returned by getAnchors. Width
//               and height are kInputW / scale and kInputH / scale, with
//               scale starting at 8 and doubling for each row.
//
// Parameters:
//   lname - key prefix handed to getAnchors.
//   dets  - layers whose output 0 becomes a plugin input, in order.
// Returns the plugin layer.
//
// NOTE(review): both fields are tagged PluginFieldType::kFLOAT32 although
// `netinfo` is an int array and `kernels` holds YoloKernel structs — presumably
// the plugin creator ignores the tag; confirm against the plugin source.
// NOTE(review): `creator` and `plugin_obj` are dereferenced without a null
// check, and &kernels[0] / &input_tensors[0] assume both vectors are non-empty.
IPluginV2Layer* addYoLoLayer(INetworkDefinition *network, std::map& weightMap, std::string lname, std::vector dets) {
    auto creator = getPluginRegistry()->getPluginCreator("YoloLayer_TRT", "1");
    auto anchors = getAnchors(weightMap, lname);

    PluginField plugin_fields[2];
    int netinfo[4] = {kNumClass, kInputW, kInputH, kMaxNumOutputBbox};
    plugin_fields[0].data = netinfo;
    plugin_fields[0].length = 4;
    plugin_fields[0].name = "netinfo";
    plugin_fields[0].type = PluginFieldType::kFLOAT32;

    int scale = 8;
    std::vector kernels;
    for (size_t i = 0; i < anchors.size(); i++) {
        YoloKernel kernel;
        kernel.width = kInputW / scale;
        kernel.height = kInputH / scale;
        // Copies the whole anchor row; kernel.anchors must be large enough to hold it.
        memcpy(kernel.anchors, &anchors[i][0], anchors[i].size() * sizeof(float));
        kernels.push_back(kernel);
        scale *= 2;
    }
    plugin_fields[1].data = &kernels[0];
    plugin_fields[1].length = kernels.size();
    plugin_fields[1].name = "kernels";
    plugin_fields[1].type = PluginFieldType::kFLOAT32;

    PluginFieldCollection plugin_data;
    plugin_data.nbFields = 2;
    plugin_data.fields = plugin_fields;
    IPluginV2 *plugin_obj = creator->createPlugin("yololayer", &plugin_data);

    std::vector input_tensors;
    for (auto det: dets) {
        input_tensors.push_back(det->getOutput(0));
    }
    auto yolo = network->addPluginV2(&input_tensors[0], input_tensors.size(), *plugin_obj);
    return yolo;
}

================================================
FILE: yolov7/src/calibrator.cpp
================================================
#include
#include
#include
#include
#include "calibrator.h"
#include "cuda_utils.h"
#include "utils.h"
// Stores the calibration settings, allocates the device input buffer and
// collects the file names found in `img_dir`.
//
// Parameters:
//   batchsize        - number of images per calibration batch.
//   input_w, input_h - width and height each image is preprocessed to.
//   img_dir          - directory passed to read_files_in_dir; also prepended
//                      to each file name when images are loaded.
//   calib_table_name - path of the calibration cache file.
//   input_blob_name  - expected name of the network input binding.
//   read_cache       - when false, readCalibrationCache returns no data.
//
// The device buffer holds 3 * input_w * input_h * batchsize floats.
Int8EntropyCalibrator2::Int8EntropyCalibrator2(int batchsize, int input_w, int input_h, const char* img_dir, const char* calib_table_name, const char* input_blob_name, bool read_cache)
    : batchsize_(batchsize)
    , input_w_(input_w)
    , input_h_(input_h)
    , img_idx_(0)
    , img_dir_(img_dir)
    , calib_table_name_(calib_table_name)
    , input_blob_name_(input_blob_name)
    , read_cache_(read_cache)
{
    input_count_ = 3 * input_w * input_h * batchsize;
    CUDA_CHECK(cudaMalloc(&device_input_, input_count_ * sizeof(float)));
    read_files_in_dir(img_dir, img_files_);
}

// Releases the device input buffer allocated by the constructor.
Int8EntropyCalibrator2::~Int8EntropyCalibrator2()
{
    CUDA_CHECK(cudaFree(device_input_));
}

// Returns the batch size given to the constructor.
int Int8EntropyCalibrator2::getBatchSize() const TRT_NOEXCEPT
{
    return batchsize_;
}

// Loads the next `batchsize_` images, converts them to a single blob, copies
// the blob to the device buffer and publishes that buffer as bindings[0].
//
// Returns false when fewer than `batchsize_` images remain or when an image
// cannot be read; returns true otherwise. `nbBindings` is not used.
//
// NOTE(review): the image path is built as img_dir_ + file name with no
// separator inserted, so img_dir presumably has to end with a path separator —
// confirm against the callers.
bool Int8EntropyCalibrator2::getBatch(void* bindings[], const char* names[], int nbBindings) TRT_NOEXCEPT
{
    // Stop once a full batch can no longer be formed.
    if (img_idx_ + batchsize_ > (int)img_files_.size()) {
        return false;
    }

    std::vector input_imgs_;
    for (int i = img_idx_; i < img_idx_ + batchsize_; i++) {
        std::cout << img_files_[i] << " " << i << std::endl;
        cv::Mat temp = cv::imread(img_dir_ + img_files_[i]);
        if (temp.empty()){
            // An unreadable image ends calibration; img_idx_ is left unchanged.
            std::cerr << "Fatal error: image cannot open!" << std::endl;
            return false;
        }
        cv::Mat pr_img = preprocess_img(temp, input_w_, input_h_);
        input_imgs_.push_back(pr_img);
    }
    img_idx_ += batchsize_;
    // Scale by 1/255 with a zero mean; the trailing `true, false` are
    // blobFromImages' swapRB and crop arguments.
    cv::Mat blob = cv::dnn::blobFromImages(input_imgs_, 1.0 / 255.0, cv::Size(input_w_, input_h_), cv::Scalar(0, 0, 0), true, false);

    CUDA_CHECK(cudaMemcpy(device_input_, blob.ptr(0), input_count_ * sizeof(float), cudaMemcpyHostToDevice));
    // Only the first binding is checked and filled.
    assert(!strcmp(names[0], input_blob_name_));
    bindings[0] = device_input_;
    return true;
}

// Reads the calibration cache file into calib_cache_ and reports its size
// through `length`.
//
// The file contents are only loaded when read_cache_ is true and the stream
// opened successfully. Returns a pointer to the cached bytes, or nullptr when
// nothing was loaded (`length` is then 0).
const void* Int8EntropyCalibrator2::readCalibrationCache(size_t& length) TRT_NOEXCEPT
{
    std::cout << "reading calib cache: " << calib_table_name_ << std::endl;
    calib_cache_.clear();
    std::ifstream input(calib_table_name_, std::ios::binary);
    // Keep whitespace characters so the file is copied unchanged.
    input >> std::noskipws;
    if (read_cache_ && input.good()) {
        std::copy(std::istream_iterator(input), std::istream_iterator(), std::back_inserter(calib_cache_));
    }
    length = calib_cache_.size();
    return length ? calib_cache_.data() : nullptr;
}

// Writes `length` bytes from `cache` to the calibration cache file, opened in
// binary mode. The result of the write is not checked.
void Int8EntropyCalibrator2::writeCalibrationCache(const void* cache, size_t length) TRT_NOEXCEPT
{
    std::cout << "writing calib cache: " << calib_table_name_ << " size: " << length << std::endl;
    std::ofstream output(calib_table_name_, std::ios::binary);
    output.write(reinterpret_cast(cache), length);
}

================================================
FILE: yolov7/src/model.cpp
================================================
#include "model.h"
#include "block.h"
// #include "yololayer.h"
#include "config.h"
#include "calibrator.h"
#include
#include

using namespace nvinfer1;

IHostMemory* build_engine_yolov7e6e(unsigned int maxBatchSize, IBuilder* builder, IBuilderConfig* config, DataType dt, const std::string& wts_path) {
    std::map weightMap = loadWeights(wts_path);

    INetworkDefinition* network = builder->createNetworkV2(0U);
    ITensor* data = network->addInput(kInputTensorName, dt, Dims3{ 3, kInputH, kInputW });
    assert(data);

    auto* conv0 = ReOrg(network, weightMap, *data, 3);

    IElementWiseLayer* conv1 = convBnSilu(network, weightMap,
*conv0->getOutput(0), 80, 3, 1, 1, "model.1"); auto conv2 = DownC(network, weightMap, *conv1->getOutput(0), 80, 160, "model.2"); IElementWiseLayer* conv3 = convBnSilu(network, weightMap, *conv2->getOutput(0), 64, 1, 1, 0, "model.3"); IElementWiseLayer* conv4 = convBnSilu(network, weightMap, *conv2->getOutput(0), 64, 1, 1, 0, "model.4"); IElementWiseLayer* conv5 = convBnSilu(network, weightMap, *conv4->getOutput(0), 64, 3, 1, 1, "model.5"); IElementWiseLayer* conv6 = convBnSilu(network, weightMap, *conv5->getOutput(0), 64, 3, 1, 1, "model.6"); IElementWiseLayer* conv7 = convBnSilu(network, weightMap, *conv6->getOutput(0), 64, 3, 1, 1, "model.7"); IElementWiseLayer* conv8 = convBnSilu(network, weightMap, *conv7->getOutput(0), 64, 3, 1, 1, "model.8"); IElementWiseLayer* conv9 = convBnSilu(network, weightMap, *conv8->getOutput(0), 64, 3, 1, 1, "model.9"); IElementWiseLayer* conv10 = convBnSilu(network, weightMap, *conv9->getOutput(0), 64, 3, 1, 1, "model.10"); ITensor* input_tensor_11[] = { conv10->getOutput(0), conv8->getOutput(0),conv6->getOutput(0), conv4->getOutput(0), conv3->getOutput(0) }; IConcatenationLayer* concat11 = network->addConcatenation(input_tensor_11, 5); IElementWiseLayer* conv12 = convBnSilu(network, weightMap, *concat11->getOutput(0), 160, 1, 1, 0, "model.12"); IElementWiseLayer* conv13 = convBnSilu(network, weightMap, *conv2->getOutput(0), 64, 1, 1, 0, "model.13"); IElementWiseLayer* conv14 = convBnSilu(network, weightMap, *conv2->getOutput(0), 64, 1, 1, 0, "model.14"); IElementWiseLayer* conv15 = convBnSilu(network, weightMap, *conv14->getOutput(0), 64, 3, 1, 1, "model.15"); IElementWiseLayer* conv16 = convBnSilu(network, weightMap, *conv15->getOutput(0), 64, 3, 1, 1, "model.16"); IElementWiseLayer* conv17 = convBnSilu(network, weightMap, *conv16->getOutput(0), 64, 3, 1, 1, "model.17"); IElementWiseLayer* conv18 = convBnSilu(network, weightMap, *conv17->getOutput(0), 64, 3, 1, 1, "model.18"); IElementWiseLayer* conv19 = convBnSilu(network, 
weightMap, *conv18->getOutput(0), 64, 3, 1, 1, "model.19"); IElementWiseLayer* conv20 = convBnSilu(network, weightMap, *conv19->getOutput(0), 64, 3, 1, 1, "model.20"); ITensor* input_tensor_21[] = { conv20->getOutput(0), conv18->getOutput(0),conv16->getOutput(0), conv14->getOutput(0), conv13->getOutput(0) }; IConcatenationLayer* concat21 = network->addConcatenation(input_tensor_21, 5); IElementWiseLayer* conv22 = convBnSilu(network, weightMap, *concat21->getOutput(0), 160, 1, 1, 0, "model.22"); auto conv23 = network->addElementWise(*conv22->getOutput(0), *conv12->getOutput(0), ElementWiseOperation::kSUM); auto conv24 = DownC(network, weightMap, *conv23->getOutput(0), 160, 320, "model.24"); IElementWiseLayer* conv25 = convBnSilu(network, weightMap, *conv24->getOutput(0), 128, 1, 1, 0, "model.25"); IElementWiseLayer* conv26 = convBnSilu(network, weightMap, *conv24->getOutput(0), 128, 1, 1, 0, "model.26"); IElementWiseLayer* conv27 = convBnSilu(network, weightMap, *conv26->getOutput(0), 128, 3, 1, 1, "model.27"); IElementWiseLayer* conv28 = convBnSilu(network, weightMap, *conv27->getOutput(0), 128, 3, 1, 1, "model.28"); IElementWiseLayer* conv29 = convBnSilu(network, weightMap, *conv28->getOutput(0), 128, 3, 1, 1, "model.29"); IElementWiseLayer* conv30 = convBnSilu(network, weightMap, *conv29->getOutput(0), 128, 3, 1, 1, "model.30"); IElementWiseLayer* conv31 = convBnSilu(network, weightMap, *conv30->getOutput(0), 128, 3, 1, 1, "model.31"); IElementWiseLayer* conv32 = convBnSilu(network, weightMap, *conv31->getOutput(0), 128, 3, 1, 1, "model.32"); ITensor* input_tensor_33[] = { conv32->getOutput(0), conv30->getOutput(0),conv28->getOutput(0), conv26->getOutput(0), conv25->getOutput(0)}; IConcatenationLayer* concat33 = network->addConcatenation(input_tensor_33, 5); IElementWiseLayer* conv34 = convBnSilu(network, weightMap, *concat33->getOutput(0), 320, 1, 1, 0, "model.34"); IElementWiseLayer* conv35 = convBnSilu(network, weightMap, *conv24->getOutput(0), 128, 1, 1, 0, 
"model.35"); IElementWiseLayer* conv36 = convBnSilu(network, weightMap, *conv24->getOutput(0), 128, 1, 1, 0, "model.36"); IElementWiseLayer* conv37 = convBnSilu(network, weightMap, *conv36->getOutput(0), 128, 3, 1, 1, "model.37"); IElementWiseLayer* conv38 = convBnSilu(network, weightMap, *conv37->getOutput(0), 128, 3, 1, 1, "model.38"); IElementWiseLayer* conv39 = convBnSilu(network, weightMap, *conv38->getOutput(0), 128, 3, 1, 1, "model.39"); IElementWiseLayer* conv40 = convBnSilu(network, weightMap, *conv39->getOutput(0), 128, 3, 1, 1, "model.40"); IElementWiseLayer* conv41 = convBnSilu(network, weightMap, *conv40->getOutput(0), 128, 3, 1, 1, "model.41"); IElementWiseLayer* conv42 = convBnSilu(network, weightMap, *conv41->getOutput(0), 128, 3, 1, 1, "model.42"); ITensor* input_tensor_43[] = { conv42->getOutput(0), conv40->getOutput(0),conv38->getOutput(0), conv36->getOutput(0), conv35->getOutput(0)}; IConcatenationLayer* concat43 = network->addConcatenation(input_tensor_43, 5); IElementWiseLayer* conv44 = convBnSilu(network, weightMap, *concat43->getOutput(0), 320, 1, 1, 0, "model.44"); auto conv45 = network->addElementWise(*conv44->getOutput(0), *conv34->getOutput(0), ElementWiseOperation::kSUM); auto conv46 = DownC(network, weightMap, *conv45->getOutput(0), 320, 640, "model.46");//===== IElementWiseLayer* conv47 = convBnSilu(network, weightMap, *conv46->getOutput(0), 256, 1, 1, 0, "model.47"); IElementWiseLayer* conv48 = convBnSilu(network, weightMap, *conv46->getOutput(0), 256, 1, 1, 0, "model.48"); IElementWiseLayer* conv49 = convBnSilu(network, weightMap, *conv48->getOutput(0), 256, 3, 1, 1, "model.49"); IElementWiseLayer* conv50 = convBnSilu(network, weightMap, *conv49->getOutput(0), 256, 3, 1, 1, "model.50"); IElementWiseLayer* conv51 = convBnSilu(network, weightMap, *conv50->getOutput(0), 256, 3, 1, 1, "model.51"); IElementWiseLayer* conv52 = convBnSilu(network, weightMap, *conv51->getOutput(0), 256, 3, 1, 1, "model.52"); IElementWiseLayer* conv53 = 
convBnSilu(network, weightMap, *conv52->getOutput(0), 256, 3, 1, 1, "model.53"); IElementWiseLayer* conv54 = convBnSilu(network, weightMap, *conv53->getOutput(0), 256, 3, 1, 1, "model.54"); ITensor* input_tensor_55[] = { conv54->getOutput(0), conv52->getOutput(0),conv50->getOutput(0), conv48->getOutput(0), conv47->getOutput(0) }; IConcatenationLayer* concat55 = network->addConcatenation(input_tensor_55, 5); IElementWiseLayer* conv56 = convBnSilu(network, weightMap, *concat55->getOutput(0), 640, 1, 1, 0, "model.56"); IElementWiseLayer* conv57 = convBnSilu(network, weightMap, *conv46->getOutput(0), 256, 1, 1, 0, "model.57"); IElementWiseLayer* conv58 = convBnSilu(network, weightMap, *conv46->getOutput(0), 256, 1, 1, 0, "model.58"); IElementWiseLayer* conv59 = convBnSilu(network, weightMap, *conv58->getOutput(0), 256, 3, 1, 1, "model.59"); IElementWiseLayer* conv60 = convBnSilu(network, weightMap, *conv59->getOutput(0), 256, 3, 1, 1, "model.60"); IElementWiseLayer* conv61 = convBnSilu(network, weightMap, *conv60->getOutput(0), 256, 3, 1, 1, "model.61"); IElementWiseLayer* conv62 = convBnSilu(network, weightMap, *conv61->getOutput(0), 256, 3, 1, 1, "model.62"); IElementWiseLayer* conv63 = convBnSilu(network, weightMap, *conv62->getOutput(0), 256, 3, 1, 1, "model.63"); IElementWiseLayer* conv64 = convBnSilu(network, weightMap, *conv63->getOutput(0), 256, 3, 1, 1, "model.64"); ITensor* input_tensor_65[] = { conv64->getOutput(0), conv62->getOutput(0),conv60->getOutput(0), conv58->getOutput(0), conv57->getOutput(0) }; IConcatenationLayer* concat65 = network->addConcatenation(input_tensor_65, 5); IElementWiseLayer* conv66 = convBnSilu(network, weightMap, *concat65->getOutput(0), 640, 1, 1, 0, "model.66"); auto conv67 = network->addElementWise(*conv66->getOutput(0), *conv56->getOutput(0), ElementWiseOperation::kSUM); auto conv68 = DownC(network, weightMap, *conv67->getOutput(0), 640, 960, "model.68");//===== IElementWiseLayer* conv69 = convBnSilu(network, weightMap, 
*conv68->getOutput(0), 384, 1, 1, 0, "model.69"); IElementWiseLayer* conv70 = convBnSilu(network, weightMap, *conv68->getOutput(0), 384, 1, 1, 0, "model.70"); IElementWiseLayer* conv71 = convBnSilu(network, weightMap, *conv70->getOutput(0), 384, 3, 1, 1, "model.71"); IElementWiseLayer* conv72 = convBnSilu(network, weightMap, *conv71->getOutput(0), 384, 3, 1, 1, "model.72"); IElementWiseLayer* conv73 = convBnSilu(network, weightMap, *conv72->getOutput(0), 384, 3, 1, 1, "model.73"); IElementWiseLayer* conv74 = convBnSilu(network, weightMap, *conv73->getOutput(0), 384, 3, 1, 1, "model.74"); IElementWiseLayer* conv75 = convBnSilu(network, weightMap, *conv74->getOutput(0), 384, 3, 1, 1, "model.75"); IElementWiseLayer* conv76 = convBnSilu(network, weightMap, *conv75->getOutput(0), 384, 3, 1, 1, "model.76"); ITensor* input_tensor_77[] = { conv76->getOutput(0), conv74->getOutput(0),conv72->getOutput(0), conv70->getOutput(0), conv69->getOutput(0) }; IConcatenationLayer* concat77 = network->addConcatenation(input_tensor_77, 5); IElementWiseLayer* conv78 = convBnSilu(network, weightMap, *concat77->getOutput(0), 960, 1, 1, 0, "model.78"); IElementWiseLayer* conv79 = convBnSilu(network, weightMap, *conv68->getOutput(0), 384, 1, 1, 0, "model.79"); IElementWiseLayer* conv80 = convBnSilu(network, weightMap, *conv68->getOutput(0), 384, 1, 1, 0, "model.80"); IElementWiseLayer* conv81 = convBnSilu(network, weightMap, *conv80->getOutput(0), 384, 3, 1, 1, "model.81"); IElementWiseLayer* conv82 = convBnSilu(network, weightMap, *conv81->getOutput(0), 384, 3, 1, 1, "model.82"); IElementWiseLayer* conv83 = convBnSilu(network, weightMap, *conv82->getOutput(0), 384, 3, 1, 1, "model.83"); IElementWiseLayer* conv84 = convBnSilu(network, weightMap, *conv83->getOutput(0), 384, 3, 1, 1, "model.84"); IElementWiseLayer* conv85 = convBnSilu(network, weightMap, *conv84->getOutput(0), 384, 3, 1, 1, "model.85"); IElementWiseLayer* conv86 = convBnSilu(network, weightMap, *conv85->getOutput(0), 384, 3, 
1, 1, "model.86"); ITensor* input_tensor_87[] = { conv86->getOutput(0), conv84->getOutput(0),conv82->getOutput(0), conv80->getOutput(0), conv79->getOutput(0) }; IConcatenationLayer* concat87 = network->addConcatenation(input_tensor_87, 5); IElementWiseLayer* conv88 = convBnSilu(network, weightMap, *concat87->getOutput(0), 960, 1, 1, 0, "model.88"); auto conv89 = network->addElementWise(*conv88->getOutput(0), *conv78->getOutput(0), ElementWiseOperation::kSUM); auto conv90 = DownC(network, weightMap, *conv89->getOutput(0), 960, 1280, "model.90"); IElementWiseLayer* conv91 = convBnSilu(network, weightMap, *conv90->getOutput(0), 512, 1, 1, 0, "model.91"); IElementWiseLayer* conv92 = convBnSilu(network, weightMap, *conv90->getOutput(0), 512, 1, 1, 0, "model.92"); IElementWiseLayer* conv93 = convBnSilu(network, weightMap, *conv92->getOutput(0), 512, 3, 1, 1, "model.93"); IElementWiseLayer* conv94 = convBnSilu(network, weightMap, *conv93->getOutput(0), 512, 3, 1, 1, "model.94"); IElementWiseLayer* conv95 = convBnSilu(network, weightMap, *conv94->getOutput(0), 512, 3, 1, 1, "model.95"); IElementWiseLayer* conv96 = convBnSilu(network, weightMap, *conv95->getOutput(0), 512, 3, 1, 1, "model.96"); IElementWiseLayer* conv97 = convBnSilu(network, weightMap, *conv96->getOutput(0), 512, 3, 1, 1, "model.97"); IElementWiseLayer* conv98 = convBnSilu(network, weightMap, *conv97->getOutput(0), 512, 3, 1, 1, "model.98"); ITensor* input_tensor_99[] = { conv98->getOutput(0), conv96->getOutput(0),conv94->getOutput(0), conv92->getOutput(0), conv91->getOutput(0) }; IConcatenationLayer* concat99 = network->addConcatenation(input_tensor_99, 5); IElementWiseLayer* conv100 = convBnSilu(network, weightMap, *concat99->getOutput(0), 1280, 1, 1, 0, "model.100"); IElementWiseLayer* conv101 = convBnSilu(network, weightMap, *conv90->getOutput(0), 512, 1, 1, 0, "model.101"); IElementWiseLayer* conv102 = convBnSilu(network, weightMap, *conv90->getOutput(0), 512, 1, 1, 0, "model.102"); IElementWiseLayer* 
conv103 = convBnSilu(network, weightMap, *conv102->getOutput(0), 512, 3, 1, 1, "model.103"); IElementWiseLayer* conv104 = convBnSilu(network, weightMap, *conv103->getOutput(0), 512, 3, 1, 1, "model.104"); IElementWiseLayer* conv105 = convBnSilu(network, weightMap, *conv104->getOutput(0), 512, 3, 1, 1, "model.105"); IElementWiseLayer* conv106 = convBnSilu(network, weightMap, *conv105->getOutput(0), 512, 3, 1, 1, "model.106"); IElementWiseLayer* conv107 = convBnSilu(network, weightMap, *conv106->getOutput(0), 512, 3, 1, 1, "model.107"); IElementWiseLayer* conv108 = convBnSilu(network, weightMap, *conv107->getOutput(0), 512, 3, 1, 1, "model.108"); ITensor* input_tensor_109[] = { conv108->getOutput(0), conv106->getOutput(0),conv104->getOutput(0), conv102->getOutput(0), conv101->getOutput(0) }; IConcatenationLayer* concat109 = network->addConcatenation(input_tensor_109, 5); IElementWiseLayer* conv110 = convBnSilu(network, weightMap, *concat109->getOutput(0), 1280, 1, 1, 0, "model.110"); auto conv111 = network->addElementWise(*conv110->getOutput(0), *conv100->getOutput(0), ElementWiseOperation::kSUM); //---------------------------yolov7e6e head--------------------------------- auto conv112 = SPPCSPC(network, weightMap, *conv111->getOutput(0), 640, "model.112"); IElementWiseLayer* conv113 = convBnSilu(network, weightMap, *conv112->getOutput(0), 480, 1, 1, 0, "model.113"); float scale[] = { 1.0, 2.0, 2.0 }; IResizeLayer* re114 = network->addResize(*conv113->getOutput(0)); re114->setResizeMode(ResizeMode::kNEAREST); re114->setScales(scale, 3); IElementWiseLayer* conv115 = convBnSilu(network, weightMap, *conv89->getOutput(0), 480, 1, 1, 0, "model.115"); ITensor* input_tensor_116[] = { conv115->getOutput(0), re114->getOutput(0) }; IConcatenationLayer* concat116 = network->addConcatenation(input_tensor_116, 2); IElementWiseLayer* conv117 = convBnSilu(network, weightMap, *concat116->getOutput(0), 384, 1, 1, 0, "model.117"); IElementWiseLayer* conv118 = convBnSilu(network, 
weightMap, *concat116->getOutput(0), 384, 1, 1, 0, "model.118"); IElementWiseLayer* conv119 = convBnSilu(network, weightMap, *conv118->getOutput(0), 192, 3, 1, 1, "model.119"); IElementWiseLayer* conv120 = convBnSilu(network, weightMap, *conv119->getOutput(0), 192, 3, 1, 1, "model.120"); IElementWiseLayer* conv121 = convBnSilu(network, weightMap, *conv120->getOutput(0), 192, 3, 1, 1, "model.121"); IElementWiseLayer* conv122 = convBnSilu(network, weightMap, *conv121->getOutput(0), 192, 3, 1, 1, "model.122"); IElementWiseLayer* conv123 = convBnSilu(network, weightMap, *conv122->getOutput(0), 192, 3, 1, 1, "model.123"); IElementWiseLayer* conv124 = convBnSilu(network, weightMap, *conv123->getOutput(0), 192, 3, 1, 1, "model.124"); ITensor* input_tensor_125[] = { conv124->getOutput(0), conv123->getOutput(0),conv122->getOutput(0), conv121->getOutput(0), conv120->getOutput(0), conv119->getOutput(0), conv118->getOutput(0), conv117->getOutput(0) }; IConcatenationLayer* concat125 = network->addConcatenation(input_tensor_125, 8); IElementWiseLayer* conv126 = convBnSilu(network, weightMap, *concat125->getOutput(0), 480, 1, 1, 0, "model.126"); IElementWiseLayer* conv127 = convBnSilu(network, weightMap, *concat116->getOutput(0), 384, 1, 1, 0, "model.127"); IElementWiseLayer* conv128 = convBnSilu(network, weightMap, *concat116->getOutput(0), 384, 1, 1, 0, "model.128"); IElementWiseLayer* conv129 = convBnSilu(network, weightMap, *conv128->getOutput(0), 192, 3, 1, 1, "model.129"); IElementWiseLayer* conv130 = convBnSilu(network, weightMap, *conv129->getOutput(0), 192, 3, 1, 1, "model.130"); IElementWiseLayer* conv131 = convBnSilu(network, weightMap, *conv130->getOutput(0), 192, 3, 1, 1, "model.131"); IElementWiseLayer* conv132 = convBnSilu(network, weightMap, *conv131->getOutput(0), 192, 3, 1, 1, "model.132"); IElementWiseLayer* conv133 = convBnSilu(network, weightMap, *conv132->getOutput(0), 192, 3, 1, 1, "model.133"); IElementWiseLayer* conv134 = convBnSilu(network, weightMap, 
*conv133->getOutput(0), 192, 3, 1, 1, "model.134"); ITensor* input_tensor_135[] = { conv134->getOutput(0), conv133->getOutput(0),conv132->getOutput(0), conv131->getOutput(0), conv130->getOutput(0), conv129->getOutput(0), conv128->getOutput(0), conv127->getOutput(0) }; IConcatenationLayer* concat135 = network->addConcatenation(input_tensor_135, 8); IElementWiseLayer* conv136 = convBnSilu(network, weightMap, *concat135->getOutput(0), 480, 1, 1, 0, "model.136"); auto conv137 = network->addElementWise(*conv136->getOutput(0), *conv126->getOutput(0), ElementWiseOperation::kSUM); IElementWiseLayer* conv138 = convBnSilu(network, weightMap, *conv137->getOutput(0), 320, 1, 1, 0, "model.138"); IResizeLayer* re139 = network->addResize(*conv138->getOutput(0)); re139->setResizeMode(ResizeMode::kNEAREST); re139->setScales(scale, 3); IElementWiseLayer* conv140 = convBnSilu(network, weightMap, *conv67->getOutput(0), 320, 1, 1, 0, "model.140"); ITensor* input_tensor_141[] = { conv140->getOutput(0), re139->getOutput(0) }; IConcatenationLayer* concat141 = network->addConcatenation(input_tensor_141, 2); IElementWiseLayer* conv142 = convBnSilu(network, weightMap, *concat141->getOutput(0), 256, 1, 1, 0, "model.142"); IElementWiseLayer* conv143 = convBnSilu(network, weightMap, *concat141->getOutput(0), 256, 1, 1, 0, "model.143"); IElementWiseLayer* conv144 = convBnSilu(network, weightMap, *conv143->getOutput(0), 128, 3, 1, 1, "model.144"); IElementWiseLayer* conv145 = convBnSilu(network, weightMap, *conv144->getOutput(0), 128, 3, 1, 1, "model.145"); IElementWiseLayer* conv146 = convBnSilu(network, weightMap, *conv145->getOutput(0), 128, 3, 1, 1, "model.146"); IElementWiseLayer* conv147 = convBnSilu(network, weightMap, *conv146->getOutput(0), 128, 3, 1, 1, "model.147"); IElementWiseLayer* conv148 = convBnSilu(network, weightMap, *conv147->getOutput(0), 128, 3, 1, 1, "model.148"); IElementWiseLayer* conv149 = convBnSilu(network, weightMap, *conv148->getOutput(0), 128, 3, 1, 1, "model.149"); 
ITensor* input_tensor_150[] = { conv149->getOutput(0), conv148->getOutput(0),conv147->getOutput(0), conv146->getOutput(0), conv145->getOutput(0), conv144->getOutput(0), conv143->getOutput(0), conv142->getOutput(0) }; IConcatenationLayer* concat150 = network->addConcatenation(input_tensor_150, 8); IElementWiseLayer* conv151 = convBnSilu(network, weightMap, *concat150->getOutput(0), 320, 1, 1, 0, "model.151"); IElementWiseLayer* conv152 = convBnSilu(network, weightMap, *concat141->getOutput(0), 256, 1, 1, 0, "model.152"); IElementWiseLayer* conv153 = convBnSilu(network, weightMap, *concat141->getOutput(0), 256, 1, 1, 0, "model.153"); IElementWiseLayer* conv154 = convBnSilu(network, weightMap, *conv153->getOutput(0), 128, 3, 1, 1, "model.154"); IElementWiseLayer* conv155 = convBnSilu(network, weightMap, *conv154->getOutput(0), 128, 3, 1, 1, "model.155"); IElementWiseLayer* conv156 = convBnSilu(network, weightMap, *conv155->getOutput(0), 128, 3, 1, 1, "model.156"); IElementWiseLayer* conv157 = convBnSilu(network, weightMap, *conv156->getOutput(0), 128, 3, 1, 1, "model.157"); IElementWiseLayer* conv158 = convBnSilu(network, weightMap, *conv157->getOutput(0), 128, 3, 1, 1, "model.158"); IElementWiseLayer* conv159 = convBnSilu(network, weightMap, *conv158->getOutput(0), 128, 3, 1, 1, "model.159"); ITensor* input_tensor_160[] = { conv159->getOutput(0), conv158->getOutput(0),conv157->getOutput(0), conv156->getOutput(0), conv155->getOutput(0), conv154->getOutput(0), conv153->getOutput(0), conv152->getOutput(0) }; IConcatenationLayer* concat160 = network->addConcatenation(input_tensor_160, 8); IElementWiseLayer* conv161 = convBnSilu(network, weightMap, *concat160->getOutput(0), 320, 1, 1, 0, "model.161"); auto conv162 = network->addElementWise(*conv161->getOutput(0), *conv151->getOutput(0), ElementWiseOperation::kSUM); IElementWiseLayer* conv163 = convBnSilu(network, weightMap, *conv162->getOutput(0), 160, 1, 1, 0, "model.163"); IResizeLayer* re164 = 
network->addResize(*conv163->getOutput(0)); re164->setResizeMode(ResizeMode::kNEAREST); re164->setScales(scale, 3); IElementWiseLayer* conv165 = convBnSilu(network, weightMap, *conv45->getOutput(0), 160, 1, 1, 0, "model.165"); ITensor* input_tensor_166[] = { conv165->getOutput(0), re164->getOutput(0) }; IConcatenationLayer* concat166 = network->addConcatenation(input_tensor_166, 2); IElementWiseLayer* conv167 = convBnSilu(network, weightMap, *concat166->getOutput(0), 128, 1, 1, 0, "model.167"); IElementWiseLayer* conv168 = convBnSilu(network, weightMap, *concat166->getOutput(0), 128, 1, 1, 0, "model.168"); IElementWiseLayer* conv169 = convBnSilu(network, weightMap, *conv168->getOutput(0), 64, 3, 1, 1, "model.169"); IElementWiseLayer* conv170 = convBnSilu(network, weightMap, *conv169->getOutput(0), 64, 3, 1, 1, "model.170"); IElementWiseLayer* conv171 = convBnSilu(network, weightMap, *conv170->getOutput(0), 64, 3, 1, 1, "model.171"); IElementWiseLayer* conv172 = convBnSilu(network, weightMap, *conv171->getOutput(0), 64, 3, 1, 1, "model.172"); IElementWiseLayer* conv173 = convBnSilu(network, weightMap, *conv172->getOutput(0), 64, 3, 1, 1, "model.173"); IElementWiseLayer* conv174 = convBnSilu(network, weightMap, *conv173->getOutput(0), 64, 3, 1, 1, "model.174"); ITensor* input_tensor_175[] = { conv174->getOutput(0), conv173->getOutput(0),conv172->getOutput(0), conv171->getOutput(0), conv170->getOutput(0), conv169->getOutput(0), conv168->getOutput(0), conv167->getOutput(0) }; IConcatenationLayer* concat175 = network->addConcatenation(input_tensor_175, 8); IElementWiseLayer* conv176 = convBnSilu(network, weightMap, *concat175->getOutput(0), 160, 1, 1, 0, "model.176"); IElementWiseLayer* conv177 = convBnSilu(network, weightMap, *concat166->getOutput(0), 128, 1, 1, 0, "model.177"); IElementWiseLayer* conv178 = convBnSilu(network, weightMap, *concat166->getOutput(0), 128, 1, 1, 0, "model.178"); IElementWiseLayer* conv179 = convBnSilu(network, weightMap, 
*conv178->getOutput(0), 64, 3, 1, 1, "model.179"); IElementWiseLayer* conv180 = convBnSilu(network, weightMap, *conv179->getOutput(0), 64, 3, 1, 1, "model.180"); IElementWiseLayer* conv181 = convBnSilu(network, weightMap, *conv180->getOutput(0), 64, 3, 1, 1, "model.181"); IElementWiseLayer* conv182 = convBnSilu(network, weightMap, *conv181->getOutput(0), 64, 3, 1, 1, "model.182"); IElementWiseLayer* conv183 = convBnSilu(network, weightMap, *conv182->getOutput(0), 64, 3, 1, 1, "model.183"); IElementWiseLayer* conv184 = convBnSilu(network, weightMap, *conv183->getOutput(0), 64, 3, 1, 1, "model.184"); ITensor* input_tensor_185[] = { conv184->getOutput(0), conv183->getOutput(0),conv182->getOutput(0), conv181->getOutput(0), conv180->getOutput(0), conv179->getOutput(0), conv178->getOutput(0), conv177->getOutput(0) }; IConcatenationLayer* concat185 = network->addConcatenation(input_tensor_185, 8); IElementWiseLayer* conv186 = convBnSilu(network, weightMap, *concat185->getOutput(0), 160, 1, 1, 0, "model.186"); auto conv187 = network->addElementWise(*conv186->getOutput(0), *conv176->getOutput(0), ElementWiseOperation::kSUM); auto conv188 = DownC(network, weightMap, *conv187->getOutput(0), 160, 320, "model.188"); ITensor* input_tensor_189[] = { conv188->getOutput(0), conv162->getOutput(0) }; IConcatenationLayer* concat189 = network->addConcatenation(input_tensor_189, 2); IElementWiseLayer* conv190 = convBnSilu(network, weightMap, *concat189->getOutput(0), 256, 1, 1, 0, "model.190"); IElementWiseLayer* conv191 = convBnSilu(network, weightMap, *concat189->getOutput(0), 256, 1, 1, 0, "model.191"); IElementWiseLayer* conv192 = convBnSilu(network, weightMap, *conv191->getOutput(0), 128, 3, 1, 1, "model.192"); IElementWiseLayer* conv193 = convBnSilu(network, weightMap, *conv192->getOutput(0), 128, 3, 1, 1, "model.193"); IElementWiseLayer* conv194 = convBnSilu(network, weightMap, *conv193->getOutput(0), 128, 3, 1, 1, "model.194"); IElementWiseLayer* conv195 = convBnSilu(network, 
weightMap, *conv194->getOutput(0), 128, 3, 1, 1, "model.195"); IElementWiseLayer* conv196 = convBnSilu(network, weightMap, *conv195->getOutput(0), 128, 3, 1, 1, "model.196"); IElementWiseLayer* conv197 = convBnSilu(network, weightMap, *conv196->getOutput(0), 128, 3, 1, 1, "model.197"); ITensor* input_tensor_198[] = { conv197->getOutput(0), conv196->getOutput(0),conv195->getOutput(0), conv194->getOutput(0), conv193->getOutput(0), conv192->getOutput(0), conv191->getOutput(0), conv190->getOutput(0) }; IConcatenationLayer* concat198 = network->addConcatenation(input_tensor_198, 8); IElementWiseLayer* conv199 = convBnSilu(network, weightMap, *concat198->getOutput(0), 320, 1, 1, 0, "model.199"); IElementWiseLayer* conv200 = convBnSilu(network, weightMap, *concat189->getOutput(0), 256, 1, 1, 0, "model.200"); IElementWiseLayer* conv201 = convBnSilu(network, weightMap, *concat189->getOutput(0), 256, 1, 1, 0, "model.201"); IElementWiseLayer* conv202 = convBnSilu(network, weightMap, *conv201->getOutput(0), 128, 3, 1, 1, "model.202"); IElementWiseLayer* conv203 = convBnSilu(network, weightMap, *conv202->getOutput(0), 128, 3, 1, 1, "model.203"); IElementWiseLayer* conv204 = convBnSilu(network, weightMap, *conv203->getOutput(0), 128, 3, 1, 1, "model.204"); IElementWiseLayer* conv205 = convBnSilu(network, weightMap, *conv204->getOutput(0), 128, 3, 1, 1, "model.205"); IElementWiseLayer* conv206 = convBnSilu(network, weightMap, *conv205->getOutput(0), 128, 3, 1, 1, "model.206"); IElementWiseLayer* conv207 = convBnSilu(network, weightMap, *conv206->getOutput(0), 128, 3, 1, 1, "model.207"); ITensor* input_tensor_208[] = { conv207->getOutput(0), conv206->getOutput(0),conv205->getOutput(0), conv204->getOutput(0), conv203->getOutput(0), conv202->getOutput(0), conv201->getOutput(0), conv200->getOutput(0) }; IConcatenationLayer* concat208 = network->addConcatenation(input_tensor_208, 8); IElementWiseLayer* conv209 = convBnSilu(network, weightMap, *concat208->getOutput(0), 320, 1, 1, 0, 
"model.209"); auto conv210 = network->addElementWise(*conv209->getOutput(0), *conv199->getOutput(0), ElementWiseOperation::kSUM); auto conv211 = DownC(network, weightMap, *conv210->getOutput(0), 320, 480, "model.211"); ITensor* input_tensor_212[] = { conv211->getOutput(0), conv137->getOutput(0) }; IConcatenationLayer* concat212 = network->addConcatenation(input_tensor_212, 2); IElementWiseLayer* conv213 = convBnSilu(network, weightMap, *concat212->getOutput(0), 384, 1, 1, 0, "model.213"); IElementWiseLayer* conv214 = convBnSilu(network, weightMap, *concat212->getOutput(0), 384, 1, 1, 0, "model.214"); IElementWiseLayer* conv215 = convBnSilu(network, weightMap, *conv214->getOutput(0), 192, 3, 1, 1, "model.215"); IElementWiseLayer* conv216 = convBnSilu(network, weightMap, *conv215->getOutput(0), 192, 3, 1, 1, "model.216"); IElementWiseLayer* conv217 = convBnSilu(network, weightMap, *conv216->getOutput(0), 192, 3, 1, 1, "model.217"); IElementWiseLayer* conv218 = convBnSilu(network, weightMap, *conv217->getOutput(0), 192, 3, 1, 1, "model.218"); IElementWiseLayer* conv219 = convBnSilu(network, weightMap, *conv218->getOutput(0), 192, 3, 1, 1, "model.219"); IElementWiseLayer* conv220 = convBnSilu(network, weightMap, *conv219->getOutput(0), 192, 3, 1, 1, "model.220"); ITensor* input_tensor_221[] = { conv220->getOutput(0), conv219->getOutput(0),conv218->getOutput(0), conv217->getOutput(0), conv216->getOutput(0), conv215->getOutput(0), conv214->getOutput(0), conv213->getOutput(0) }; IConcatenationLayer* concat221 = network->addConcatenation(input_tensor_221, 8); IElementWiseLayer* conv222 = convBnSilu(network, weightMap, *concat221->getOutput(0), 480, 1, 1, 0, "model.222"); IElementWiseLayer* conv223 = convBnSilu(network, weightMap, *concat212->getOutput(0), 384, 1, 1, 0, "model.223"); IElementWiseLayer* conv224 = convBnSilu(network, weightMap, *concat212->getOutput(0), 384, 1, 1, 0, "model.224"); IElementWiseLayer* conv225 = convBnSilu(network, weightMap, 
*conv224->getOutput(0), 192, 3, 1, 1, "model.225"); IElementWiseLayer* conv226 = convBnSilu(network, weightMap, *conv225->getOutput(0), 192, 3, 1, 1, "model.226"); IElementWiseLayer* conv227 = convBnSilu(network, weightMap, *conv226->getOutput(0), 192, 3, 1, 1, "model.227"); IElementWiseLayer* conv228 = convBnSilu(network, weightMap, *conv227->getOutput(0), 192, 3, 1, 1, "model.228"); IElementWiseLayer* conv229 = convBnSilu(network, weightMap, *conv228->getOutput(0), 192, 3, 1, 1, "model.229"); IElementWiseLayer* conv230 = convBnSilu(network, weightMap, *conv229->getOutput(0), 192, 3, 1, 1, "model.230"); ITensor* input_tensor_231[] = { conv230->getOutput(0), conv229->getOutput(0),conv228->getOutput(0), conv227->getOutput(0), conv226->getOutput(0), conv225->getOutput(0), conv224->getOutput(0), conv223->getOutput(0) }; IConcatenationLayer* concat231 = network->addConcatenation(input_tensor_231, 8); IElementWiseLayer* conv232 = convBnSilu(network, weightMap, *concat231->getOutput(0), 480, 1, 1, 0, "model.232"); auto conv233 = network->addElementWise(*conv232->getOutput(0), *conv222->getOutput(0), ElementWiseOperation::kSUM); auto conv234 = DownC(network, weightMap, *conv233->getOutput(0), 480, 640, "model.234"); ITensor* input_tensor_235[] = { conv234->getOutput(0), conv112->getOutput(0) }; IConcatenationLayer* concat235 = network->addConcatenation(input_tensor_235, 2); IElementWiseLayer* conv236 = convBnSilu(network, weightMap, *concat235->getOutput(0), 512, 1, 1, 0, "model.236"); IElementWiseLayer* conv237 = convBnSilu(network, weightMap, *concat235->getOutput(0), 512, 1, 1, 0, "model.237"); IElementWiseLayer* conv238 = convBnSilu(network, weightMap, *conv237->getOutput(0), 256, 3, 1, 1, "model.238"); IElementWiseLayer* conv239 = convBnSilu(network, weightMap, *conv238->getOutput(0), 256, 3, 1, 1, "model.239"); IElementWiseLayer* conv240 = convBnSilu(network, weightMap, *conv239->getOutput(0), 256, 3, 1, 1, "model.240"); IElementWiseLayer* conv241 = 
convBnSilu(network, weightMap, *conv240->getOutput(0), 256, 3, 1, 1, "model.241"); IElementWiseLayer* conv242 = convBnSilu(network, weightMap, *conv241->getOutput(0), 256, 3, 1, 1, "model.242"); IElementWiseLayer* conv243 = convBnSilu(network, weightMap, *conv242->getOutput(0), 256, 3, 1, 1, "model.243"); ITensor* input_tensor_244[] = { conv243->getOutput(0), conv242->getOutput(0),conv241->getOutput(0), conv240->getOutput(0), conv239->getOutput(0), conv238->getOutput(0), conv237->getOutput(0), conv236->getOutput(0) }; IConcatenationLayer* concat244 = network->addConcatenation(input_tensor_244, 8); IElementWiseLayer* conv245 = convBnSilu(network, weightMap, *concat244->getOutput(0), 640, 1, 1, 0, "model.245"); IElementWiseLayer* conv246 = convBnSilu(network, weightMap, *concat235->getOutput(0), 512, 1, 1, 0, "model.246"); IElementWiseLayer* conv247 = convBnSilu(network, weightMap, *concat235->getOutput(0), 512, 1, 1, 0, "model.247"); IElementWiseLayer* conv248 = convBnSilu(network, weightMap, *conv247->getOutput(0), 256, 3, 1, 1, "model.248"); IElementWiseLayer* conv249 = convBnSilu(network, weightMap, *conv248->getOutput(0), 256, 3, 1, 1, "model.249"); IElementWiseLayer* conv250 = convBnSilu(network, weightMap, *conv249->getOutput(0), 256, 3, 1, 1, "model.250"); IElementWiseLayer* conv251 = convBnSilu(network, weightMap, *conv250->getOutput(0), 256, 3, 1, 1, "model.251"); IElementWiseLayer* conv252 = convBnSilu(network, weightMap, *conv251->getOutput(0), 256, 3, 1, 1, "model.252"); IElementWiseLayer* conv253 = convBnSilu(network, weightMap, *conv252->getOutput(0), 256, 3, 1, 1, "model.253"); ITensor* input_tensor_254[] = { conv253->getOutput(0), conv252->getOutput(0),conv251->getOutput(0), conv250->getOutput(0), conv249->getOutput(0), conv248->getOutput(0), conv247->getOutput(0), conv246->getOutput(0) }; IConcatenationLayer* concat254 = network->addConcatenation(input_tensor_254, 8); IElementWiseLayer* conv255= convBnSilu(network, weightMap, 
*concat254->getOutput(0), 640, 1, 1, 0, "model.255"); auto conv256 = network->addElementWise(*conv255->getOutput(0), *conv245->getOutput(0), ElementWiseOperation::kSUM); IElementWiseLayer* conv257 = convBnSilu(network, weightMap, *conv187->getOutput(0), 320, 3, 1, 1, "model.257"); IElementWiseLayer* conv258 = convBnSilu(network, weightMap, *conv210->getOutput(0), 640, 3, 1, 1, "model.258"); IElementWiseLayer* conv259 = convBnSilu(network, weightMap, *conv233->getOutput(0), 960, 3, 1, 1, "model.259"); IElementWiseLayer* conv260 = convBnSilu(network, weightMap, *conv256->getOutput(0), 1280, 3, 1, 1, "model.260"); // out IConvolutionLayer* cv105_0 = network->addConvolutionNd(*conv257->getOutput(0), kNumAnchor * (kNumClass + 5), DimsHW{ 1, 1 }, weightMap["model.261.m.0.weight"], weightMap["model.261.m.0.bias"]); assert(cv105_0); cv105_0->setName("cv105.0"); IConvolutionLayer* cv105_1 = network->addConvolutionNd(*conv258->getOutput(0), kNumAnchor * (kNumClass + 5), DimsHW{ 1, 1 }, weightMap["model.261.m.1.weight"], weightMap["model.261.m.1.bias"]); assert(cv105_1); cv105_1->setName("cv105.1"); IConvolutionLayer* cv105_2 = network->addConvolutionNd(*conv259->getOutput(0), kNumAnchor * (kNumClass + 5), DimsHW{ 1, 1 }, weightMap["model.261.m.2.weight"], weightMap["model.261.m.2.bias"]); assert(cv105_2); cv105_2->setName("cv105.2"); IConvolutionLayer* cv105_3 = network->addConvolutionNd(*conv260->getOutput(0), kNumAnchor * (kNumClass + 5), DimsHW{ 1, 1 }, weightMap["model.261.m.3.weight"], weightMap["model.261.m.3.bias"]); assert(cv105_3); cv105_3->setName("cv105.3"); /*------------detect-----------*/ auto yolo = addYoLoLayer(network, weightMap, "model.261", std::vector{cv105_0, cv105_1, cv105_2, cv105_3}); yolo->getOutput(0)->setName(kOutputTensorName); network->markOutput(*yolo->getOutput(0)); // Build engine builder->setMaxBatchSize(maxBatchSize); config->setMaxWorkspaceSize(16 * (1 << 20)); // 16MB #if defined(USE_FP16) config->setFlag(BuilderFlag::kFP16); #elif 
defined(USE_INT8) std::cout << "Your platform support int8: " << (builder->platformHasFastInt8() ? "true" : "false") << std::endl; assert(builder->platformHasFastInt8()); config->setFlag(BuilderFlag::kINT8); Int8EntropyCalibrator2* calibrator = new Int8EntropyCalibrator2(1, kInputW, kInputH, "./coco_calib/", "int8calib.table", kInputTensorName); config->setInt8Calibrator(calibrator); #endif std::cout << "Building engine, please wait for a while..." << std::endl; IHostMemory* serialized_model = builder->buildSerializedNetwork(*network, *config); std::cout << "Build engine successfully!" << std::endl; delete network; // Release host memory for (auto& mem : weightMap) { free((void*)(mem.second.values)); } return serialized_model; } /* build_engine_yolov7d6: loads the weight map from wts_path, creates a network with builder->createNetworkV2(0U), adds one input named kInputTensorName of type dt and dims {3, kInputH, kInputW}, wires the yolov7d6 backbone, head and detect layer, then applies maxBatchSize, a 16MB workspace limit and the USE_FP16 / USE_INT8 build flags and returns the result of builder->buildSerializedNetwork. The network object and every weight buffer in weightMap are released before returning; the returned IHostMemory* is not released here (presumably the caller owns it - confirm at the call site). */ IHostMemory* build_engine_yolov7d6(unsigned int maxBatchSize, IBuilder* builder, IBuilderConfig* config, DataType dt, const std::string& wts_path) { std::map weightMap = loadWeights(wts_path); INetworkDefinition* network = builder->createNetworkV2(0U); ITensor* data = network->addInput(kInputTensorName, dt, Dims3{ 3, kInputH, kInputW }); assert(data); /*----------------------------------yolov7d6 backbone-----------------------------------------*/ /* NOTE(review): the numeric convBnSilu arguments are read below as (output channels, kernel size, stride, padding) and the DownC pair as (input channels, output channels) - confirm against the helper definitions, which are outside this block */ auto* conv0 = ReOrg(network, weightMap, *data, 3); IElementWiseLayer* conv1 = convBnSilu(network, weightMap, *conv0->getOutput(0), 96, 3, 1, 1, "model.1"); /* backbone stage 1 (model.2 - model.14): DownC, two parallel kernel-1 branches (conv3, conv4) on its output, eight chained kernel-3 convs starting from conv4, then a 6-input concat (every second conv of the chain plus both branches) fused to 192 by conv14 */ auto conv2 = DownC(network, weightMap, *conv1->getOutput(0), 96, 192, "model.2"); IElementWiseLayer* conv3 = convBnSilu(network, weightMap, *conv2->getOutput(0), 64, 1, 1, 0, "model.3"); IElementWiseLayer* conv4 = convBnSilu(network, weightMap, *conv2->getOutput(0), 64, 1, 1, 0, "model.4"); IElementWiseLayer* conv5 = convBnSilu(network, weightMap, *conv4->getOutput(0), 64, 3, 1, 1, "model.5"); IElementWiseLayer* conv6 = convBnSilu(network, weightMap, *conv5->getOutput(0), 64, 3, 1, 1, "model.6"); IElementWiseLayer* conv7 = convBnSilu(network, weightMap, *conv6->getOutput(0), 64, 3, 1, 1, "model.7"); IElementWiseLayer* conv8 = convBnSilu(network, 
weightMap, *conv7->getOutput(0), 64, 3, 1, 1, "model.8"); IElementWiseLayer* conv9 = convBnSilu(network, weightMap, *conv8->getOutput(0), 64, 3, 1, 1, "model.9"); IElementWiseLayer* conv10 = convBnSilu(network, weightMap, *conv9->getOutput(0), 64, 3, 1, 1, "model.10"); IElementWiseLayer* conv11 = convBnSilu(network, weightMap, *conv10->getOutput(0), 64, 3, 1, 1, "model.11"); IElementWiseLayer* conv12 = convBnSilu(network, weightMap, *conv11->getOutput(0), 64, 3, 1, 1, "model.12"); ITensor* input_tensor_13[] = { conv12->getOutput(0), conv10->getOutput(0),conv8->getOutput(0), conv6->getOutput(0), conv4->getOutput(0),conv3->getOutput(0) }; IConcatenationLayer* concat13 = network->addConcatenation(input_tensor_13, 6); IElementWiseLayer* conv14 = convBnSilu(network, weightMap, *concat13->getOutput(0), 192, 1, 1, 0, "model.14"); /* backbone stage 2 (model.15 - model.27): same layout as stage 1, 128-channel branches, fused to 384 by conv27 */ auto conv15 = DownC(network, weightMap, *conv14->getOutput(0), 192, 384, "model.15"); IElementWiseLayer* conv16 = convBnSilu(network, weightMap, *conv15->getOutput(0), 128, 1, 1, 0, "model.16"); IElementWiseLayer* conv17 = convBnSilu(network, weightMap, *conv15->getOutput(0), 128, 1, 1, 0, "model.17"); IElementWiseLayer* conv18 = convBnSilu(network, weightMap, *conv17->getOutput(0), 128, 3, 1, 1, "model.18"); IElementWiseLayer* conv19 = convBnSilu(network, weightMap, *conv18->getOutput(0), 128, 3, 1, 1, "model.19"); IElementWiseLayer* conv20 = convBnSilu(network, weightMap, *conv19->getOutput(0), 128, 3, 1, 1, "model.20"); IElementWiseLayer* conv21 = convBnSilu(network, weightMap, *conv20->getOutput(0), 128, 3, 1, 1, "model.21"); IElementWiseLayer* conv22 = convBnSilu(network, weightMap, *conv21->getOutput(0), 128, 3, 1, 1, "model.22"); IElementWiseLayer* conv23 = convBnSilu(network, weightMap, *conv22->getOutput(0), 128, 3, 1, 1, "model.23"); IElementWiseLayer* conv24 = convBnSilu(network, weightMap, *conv23->getOutput(0), 128, 3, 1, 1, "model.24"); IElementWiseLayer* conv25 = convBnSilu(network, weightMap, *conv24->getOutput(0), 128, 3, 1, 1, 
"model.25"); ITensor* input_tensor_26[] = { conv25->getOutput(0), conv23->getOutput(0),conv21->getOutput(0), conv19->getOutput(0), conv17->getOutput(0),conv16->getOutput(0) }; IConcatenationLayer* concat26 = network->addConcatenation(input_tensor_26, 6); IElementWiseLayer* conv27 = convBnSilu(network, weightMap, *concat26->getOutput(0), 384, 1, 1, 0, "model.27"); /* backbone stage 3 (model.28 - model.40): same layout, 256-channel branches, fused to 768 by conv40 */ auto conv28 = DownC(network, weightMap, *conv27->getOutput(0), 384, 768, "model.28"); IElementWiseLayer* conv29 = convBnSilu(network, weightMap, *conv28->getOutput(0), 256, 1, 1, 0, "model.29"); IElementWiseLayer* conv30 = convBnSilu(network, weightMap, *conv28->getOutput(0), 256, 1, 1, 0, "model.30"); IElementWiseLayer* conv31 = convBnSilu(network, weightMap, *conv30->getOutput(0), 256, 3, 1, 1, "model.31"); IElementWiseLayer* conv32 = convBnSilu(network, weightMap, *conv31->getOutput(0), 256, 3, 1, 1, "model.32"); IElementWiseLayer* conv33 = convBnSilu(network, weightMap, *conv32->getOutput(0), 256, 3, 1, 1, "model.33"); IElementWiseLayer* conv34 = convBnSilu(network, weightMap, *conv33->getOutput(0), 256, 3, 1, 1, "model.34"); IElementWiseLayer* conv35 = convBnSilu(network, weightMap, *conv34->getOutput(0), 256, 3, 1, 1, "model.35"); IElementWiseLayer* conv36 = convBnSilu(network, weightMap, *conv35->getOutput(0), 256, 3, 1, 1, "model.36"); IElementWiseLayer* conv37 = convBnSilu(network, weightMap, *conv36->getOutput(0), 256, 3, 1, 1, "model.37"); IElementWiseLayer* conv38 = convBnSilu(network, weightMap, *conv37->getOutput(0), 256, 3, 1, 1, "model.38"); ITensor* input_tensor_39[] = { conv38->getOutput(0), conv36->getOutput(0),conv34->getOutput(0), conv32->getOutput(0), conv30->getOutput(0), conv29 ->getOutput(0)}; IConcatenationLayer* concat39 = network->addConcatenation(input_tensor_39, 6); IElementWiseLayer* conv40 = convBnSilu(network, weightMap, *concat39->getOutput(0), 768, 1, 1, 0, "model.40"); /* backbone stage 4 (model.41 - model.53): same layout, 384-channel branches, fused to 1152 by conv53 */ auto conv41 = DownC(network, weightMap, *conv40->getOutput(0), 768, 1152, "model.41"); 
IElementWiseLayer* conv42 = convBnSilu(network, weightMap, *conv41->getOutput(0), 384, 1, 1, 0, "model.42"); IElementWiseLayer* conv43 = convBnSilu(network, weightMap, *conv41->getOutput(0), 384, 1, 1, 0, "model.43"); IElementWiseLayer* conv44 = convBnSilu(network, weightMap, *conv43->getOutput(0), 384, 3, 1, 1, "model.44"); IElementWiseLayer* conv45 = convBnSilu(network, weightMap, *conv44->getOutput(0), 384, 3, 1, 1, "model.45"); IElementWiseLayer* conv46 = convBnSilu(network, weightMap, *conv45->getOutput(0), 384, 3, 1, 1, "model.46"); IElementWiseLayer* conv47 = convBnSilu(network, weightMap, *conv46->getOutput(0), 384, 3, 1, 1, "model.47"); IElementWiseLayer* conv48 = convBnSilu(network, weightMap, *conv47->getOutput(0), 384, 3, 1, 1, "model.48"); IElementWiseLayer* conv49 = convBnSilu(network, weightMap, *conv48->getOutput(0), 384, 3, 1, 1, "model.49"); IElementWiseLayer* conv50 = convBnSilu(network, weightMap, *conv49->getOutput(0), 384, 3, 1, 1, "model.50"); IElementWiseLayer* conv51 = convBnSilu(network, weightMap, *conv50->getOutput(0), 384, 3, 1, 1, "model.51"); ITensor* input_tensor_52[] = { conv51->getOutput(0), conv49->getOutput(0),conv47->getOutput(0), conv45->getOutput(0), conv43->getOutput(0),conv42->getOutput(0) }; IConcatenationLayer* concat52 = network->addConcatenation(input_tensor_52, 6); IElementWiseLayer* conv53 = convBnSilu(network, weightMap, *concat52->getOutput(0), 1152, 1, 1, 0, "model.53"); /* backbone stage 5 (model.54 - model.66): same layout, 512-channel branches, fused to 1536 by conv66 */ auto conv54 = DownC(network, weightMap, *conv53->getOutput(0), 1152, 1536, "model.54");//===== IElementWiseLayer* conv55 = convBnSilu(network, weightMap, *conv54->getOutput(0), 512, 1, 1, 0, "model.55"); IElementWiseLayer* conv56 = convBnSilu(network, weightMap, *conv54->getOutput(0), 512, 1, 1, 0, "model.56"); IElementWiseLayer* conv57 = convBnSilu(network, weightMap, *conv56->getOutput(0), 512, 3, 1, 1, "model.57"); IElementWiseLayer* conv58 = convBnSilu(network, weightMap, *conv57->getOutput(0), 512, 3, 1, 1, "model.58"); IElementWiseLayer* 
conv59 = convBnSilu(network, weightMap, *conv58->getOutput(0), 512, 3, 1, 1, "model.59"); IElementWiseLayer* conv60 = convBnSilu(network, weightMap, *conv59->getOutput(0), 512, 3, 1, 1, "model.60"); IElementWiseLayer* conv61 = convBnSilu(network, weightMap, *conv60->getOutput(0), 512, 3, 1, 1, "model.61"); IElementWiseLayer* conv62 = convBnSilu(network, weightMap, *conv61->getOutput(0), 512, 3, 1, 1, "model.62"); IElementWiseLayer* conv63 = convBnSilu(network, weightMap, *conv62->getOutput(0), 512, 3, 1, 1, "model.63"); IElementWiseLayer* conv64 = convBnSilu(network, weightMap, *conv63->getOutput(0), 512, 3, 1, 1, "model.64"); ITensor* input_tensor_65[] = { conv64->getOutput(0), conv62->getOutput(0),conv60->getOutput(0), conv58->getOutput(0), conv56->getOutput(0),conv55->getOutput(0) }; IConcatenationLayer* concat65 = network->addConcatenation(input_tensor_65, 6); IElementWiseLayer* conv66 = convBnSilu(network, weightMap, *concat65->getOutput(0), 1536, 1, 1, 0, "model.66"); //------------------------yolov7d6 head------------------------------- auto conv67 = SPPCSPC(network, weightMap, *conv66->getOutput(0), 768, "model.67"); IElementWiseLayer* conv68 = convBnSilu(network, weightMap, *conv67->getOutput(0), 576, 1, 1, 0, "model.68"); /* scales shared by all three resize layers (re69, re85, re101): 1.0 on the first dimension, 2.0 on the other two, nearest-neighbour mode */ float scale[] = { 1.0, 2.0, 2.0 }; IResizeLayer* re69 = network->addResize(*conv68->getOutput(0)); re69->setResizeMode(ResizeMode::kNEAREST); re69->setScales(scale, 3); /* merge 1: conv53 (backbone stage 4 output) goes through a kernel-1 conv and is concatenated with the resized conv68 */ IElementWiseLayer* conv70 = convBnSilu(network, weightMap, *conv53->getOutput(0), 576, 1, 1, 0, "model.70"); ITensor* input_tensor_71[] = { conv70->getOutput(0), re69->getOutput(0) }; IConcatenationLayer* concat71 = network->addConcatenation(input_tensor_71, 2); /* head block (model.72 - model.83): two kernel-1 branches, eight chained kernel-3 convs, and a 10-input concat of all of them fused by conv83; the same layout repeats for model.88-99, 104-115, 118-129, 132-143 and 146-157 */ IElementWiseLayer* conv72 = convBnSilu(network, weightMap, *concat71->getOutput(0), 384, 1, 1, 0, "model.72"); IElementWiseLayer* conv73 = convBnSilu(network, weightMap, *concat71->getOutput(0), 384, 1, 1, 0, "model.73"); IElementWiseLayer* conv74 = convBnSilu(network, weightMap, *conv73->getOutput(0), 192, 3, 1, 
1, "model.74"); IElementWiseLayer* conv75 = convBnSilu(network, weightMap, *conv74->getOutput(0), 192, 3, 1, 1, "model.75"); IElementWiseLayer* conv76 = convBnSilu(network, weightMap, *conv75->getOutput(0), 192, 3, 1, 1, "model.76"); IElementWiseLayer* conv77 = convBnSilu(network, weightMap, *conv76->getOutput(0), 192, 3, 1, 1, "model.77"); IElementWiseLayer* conv78 = convBnSilu(network, weightMap, *conv77->getOutput(0), 192, 3, 1, 1, "model.78"); IElementWiseLayer* conv79 = convBnSilu(network, weightMap, *conv78->getOutput(0), 192, 3, 1, 1, "model.79"); IElementWiseLayer* conv80 = convBnSilu(network, weightMap, *conv79->getOutput(0), 192, 3, 1, 1, "model.80"); IElementWiseLayer* conv81 = convBnSilu(network, weightMap, *conv80->getOutput(0), 192, 3, 1, 1, "model.81"); ITensor* input_tensor_82[] = { conv81->getOutput(0), conv80->getOutput(0),conv79->getOutput(0), conv78->getOutput(0), conv77->getOutput(0), conv76->getOutput(0), conv75->getOutput(0), conv74->getOutput(0), conv73->getOutput(0), conv72->getOutput(0) }; IConcatenationLayer* concat82 = network->addConcatenation(input_tensor_82, 10); IElementWiseLayer* conv83 = convBnSilu(network, weightMap, *concat82->getOutput(0), 576, 1, 1, 0, "model.83"); /* merge 2: conv83 is reduced by conv84 and resized, conv40 (backbone stage 3 output) goes through conv86, and both are concatenated */ IElementWiseLayer* conv84 = convBnSilu(network, weightMap, *conv83->getOutput(0), 384, 1, 1, 0, "model.84"); IResizeLayer* re85 = network->addResize(*conv84->getOutput(0)); re85->setResizeMode(ResizeMode::kNEAREST); re85->setScales(scale, 3); IElementWiseLayer* conv86 = convBnSilu(network, weightMap, *conv40->getOutput(0), 384, 1, 1, 0, "model.86"); ITensor* input_tensor_87[] = { conv86->getOutput(0), re85->getOutput(0) }; IConcatenationLayer* concat87 = network->addConcatenation(input_tensor_87, 2); IElementWiseLayer* conv88 = convBnSilu(network, weightMap, *concat87->getOutput(0), 256, 1, 1, 0, "model.88"); IElementWiseLayer* conv89 = convBnSilu(network, weightMap, *concat87->getOutput(0), 256, 1, 1, 0, "model.89"); IElementWiseLayer* conv90 = convBnSilu(network, 
weightMap, *conv89->getOutput(0), 128, 3, 1, 1, "model.90"); IElementWiseLayer* conv91 = convBnSilu(network, weightMap, *conv90->getOutput(0), 128, 3, 1, 1, "model.91"); IElementWiseLayer* conv92 = convBnSilu(network, weightMap, *conv91->getOutput(0), 128, 3, 1, 1, "model.92"); IElementWiseLayer* conv93 = convBnSilu(network, weightMap, *conv92->getOutput(0), 128, 3, 1, 1, "model.93"); IElementWiseLayer* conv94 = convBnSilu(network, weightMap, *conv93->getOutput(0), 128, 3, 1, 1, "model.94"); IElementWiseLayer* conv95 = convBnSilu(network, weightMap, *conv94->getOutput(0), 128, 3, 1, 1, "model.95"); IElementWiseLayer* conv96 = convBnSilu(network, weightMap, *conv95->getOutput(0), 128, 3, 1, 1, "model.96"); IElementWiseLayer* conv97 = convBnSilu(network, weightMap, *conv96->getOutput(0), 128, 3, 1, 1, "model.97"); ITensor* input_tensor_98[] = { conv97->getOutput(0), conv96->getOutput(0),conv95->getOutput(0), conv94->getOutput(0), conv93->getOutput(0), conv92->getOutput(0), conv91->getOutput(0), conv90->getOutput(0),conv89->getOutput(0), conv88->getOutput(0) }; IConcatenationLayer* concat98 = network->addConcatenation(input_tensor_98, 10); IElementWiseLayer* conv99 = convBnSilu(network, weightMap, *concat98->getOutput(0), 384, 1, 1, 0, "model.99"); /* merge 3: conv99 is reduced by conv100 and resized, conv27 (backbone stage 2 output) goes through conv102, and both are concatenated */ IElementWiseLayer* conv100 = convBnSilu(network, weightMap, *conv99->getOutput(0), 192, 1, 1, 0, "model.100"); IResizeLayer* re101 = network->addResize(*conv100->getOutput(0)); re101->setResizeMode(ResizeMode::kNEAREST); re101->setScales(scale, 3); IElementWiseLayer* conv102 = convBnSilu(network, weightMap, *conv27->getOutput(0), 192, 1, 1, 0, "model.102"); ITensor* input_tensor_103[] = { conv102->getOutput(0), re101->getOutput(0) }; IConcatenationLayer* concat103 = network->addConcatenation(input_tensor_103, 2); IElementWiseLayer* conv104 = convBnSilu(network, weightMap, *concat103->getOutput(0), 128, 1, 1, 0, "model.104"); IElementWiseLayer* conv105 = convBnSilu(network, weightMap, *concat103->getOutput(0), 128, 1, 1, 0, 
"model.105"); IElementWiseLayer* conv106 = convBnSilu(network, weightMap, *conv105->getOutput(0), 64, 3, 1, 1, "model.106"); IElementWiseLayer* conv107 = convBnSilu(network, weightMap, *conv106->getOutput(0), 64, 3, 1, 1, "model.107"); IElementWiseLayer* conv108 = convBnSilu(network, weightMap, *conv107->getOutput(0), 64, 3, 1, 1, "model.108"); IElementWiseLayer* conv109 = convBnSilu(network, weightMap, *conv108->getOutput(0), 64, 3, 1, 1, "model.109"); IElementWiseLayer* conv110 = convBnSilu(network, weightMap, *conv109->getOutput(0), 64, 3, 1, 1, "model.110"); IElementWiseLayer* conv111 = convBnSilu(network, weightMap, *conv110->getOutput(0), 64, 3, 1, 1, "model.111"); IElementWiseLayer* conv112 = convBnSilu(network, weightMap, *conv111->getOutput(0), 64, 3, 1, 1, "model.112"); IElementWiseLayer* conv113 = convBnSilu(network, weightMap, *conv112->getOutput(0), 64, 3, 1, 1, "model.113"); ITensor* input_tensor_114[] = { conv113->getOutput(0), conv112->getOutput(0),conv111->getOutput(0), conv110->getOutput(0), conv109->getOutput(0), conv108->getOutput(0), conv107->getOutput(0), conv106->getOutput(0), conv105->getOutput(0), conv104->getOutput(0) }; IConcatenationLayer* concat114 = network->addConcatenation(input_tensor_114, 10); IElementWiseLayer* conv115 = convBnSilu(network, weightMap, *concat114->getOutput(0), 192, 1, 1, 0, "model.115"); /* merge 4: DownC of conv115 is concatenated with conv99 */ auto conv116 = DownC(network, weightMap, *conv115->getOutput(0), 192, 384, "model.116"); ITensor* input_tensor_117[] = { conv116->getOutput(0), conv99->getOutput(0) }; IConcatenationLayer* concat117 = network->addConcatenation(input_tensor_117, 2); IElementWiseLayer* conv118 = convBnSilu(network, weightMap, *concat117->getOutput(0), 256, 1, 1, 0, "model.118"); IElementWiseLayer* conv119 = convBnSilu(network, weightMap, *concat117->getOutput(0), 256, 1, 1, 0, "model.119"); IElementWiseLayer* conv120 = convBnSilu(network, weightMap, *conv119->getOutput(0), 128, 3, 1, 1, "model.120"); IElementWiseLayer* conv121 = convBnSilu(network, 
weightMap, *conv120->getOutput(0), 128, 3, 1, 1, "model.121"); IElementWiseLayer* conv122 = convBnSilu(network, weightMap, *conv121->getOutput(0), 128, 3, 1, 1, "model.122"); IElementWiseLayer* conv123 = convBnSilu(network, weightMap, *conv122->getOutput(0), 128, 3, 1, 1, "model.123"); IElementWiseLayer* conv124 = convBnSilu(network, weightMap, *conv123->getOutput(0), 128, 3, 1, 1, "model.124"); IElementWiseLayer* conv125 = convBnSilu(network, weightMap, *conv124->getOutput(0), 128, 3, 1, 1, "model.125"); IElementWiseLayer* conv126 = convBnSilu(network, weightMap, *conv125->getOutput(0), 128, 3, 1, 1, "model.126"); IElementWiseLayer* conv127 = convBnSilu(network, weightMap, *conv126->getOutput(0), 128, 3, 1, 1, "model.127"); ITensor* input_tensor_128[] = { conv127->getOutput(0), conv126->getOutput(0),conv125->getOutput(0), conv124->getOutput(0), conv123->getOutput(0), conv122->getOutput(0), conv121->getOutput(0), conv120->getOutput(0), conv119->getOutput(0), conv118->getOutput(0) }; IConcatenationLayer* concat128 = network->addConcatenation(input_tensor_128, 10); IElementWiseLayer* conv129 = convBnSilu(network, weightMap, *concat128->getOutput(0), 384, 1, 1, 0, "model.129"); /* merge 5: DownC of conv129 is concatenated with conv83 */ auto conv130 = DownC(network, weightMap, *conv129->getOutput(0), 384, 576, "model.130"); ITensor* input_tensor_131[] = { conv130->getOutput(0), conv83->getOutput(0) }; IConcatenationLayer* concat131 = network->addConcatenation(input_tensor_131, 2); IElementWiseLayer* conv132 = convBnSilu(network, weightMap, *concat131->getOutput(0), 384, 1, 1, 0, "model.132"); IElementWiseLayer* conv133 = convBnSilu(network, weightMap, *concat131->getOutput(0), 384, 1, 1, 0, "model.133"); IElementWiseLayer* conv134 = convBnSilu(network, weightMap, *conv133->getOutput(0), 192, 3, 1, 1, "model.134"); IElementWiseLayer* conv135 = convBnSilu(network, weightMap, *conv134->getOutput(0), 192, 3, 1, 1, "model.135"); IElementWiseLayer* conv136 = convBnSilu(network, weightMap, *conv135->getOutput(0), 192, 3, 1, 1, 
"model.136"); IElementWiseLayer* conv137 = convBnSilu(network, weightMap, *conv136->getOutput(0), 192, 3, 1, 1, "model.137"); IElementWiseLayer* conv138 = convBnSilu(network, weightMap, *conv137->getOutput(0), 192, 3, 1, 1, "model.138"); IElementWiseLayer* conv139 = convBnSilu(network, weightMap, *conv138->getOutput(0), 192, 3, 1, 1, "model.139"); IElementWiseLayer* conv140 = convBnSilu(network, weightMap, *conv139->getOutput(0), 192, 3, 1, 1, "model.140"); IElementWiseLayer* conv141 = convBnSilu(network, weightMap, *conv140->getOutput(0), 192, 3, 1, 1, "model.141"); ITensor* input_tensor_142[] = { conv141->getOutput(0), conv140->getOutput(0),conv139->getOutput(0), conv138->getOutput(0), conv137->getOutput(0), conv136->getOutput(0), conv135->getOutput(0), conv134->getOutput(0), conv133->getOutput(0), conv132->getOutput(0) }; IConcatenationLayer* concat142 = network->addConcatenation(input_tensor_142, 10); IElementWiseLayer* conv143 = convBnSilu(network, weightMap, *concat142->getOutput(0), 576, 1, 1, 0, "model.143"); /* merge 6: DownC of conv143 is concatenated with conv67 (the SPPCSPC output) */ auto conv144 = DownC(network, weightMap, *conv143->getOutput(0), 576, 768, "model.144"); ITensor* input_tensor_145[] = { conv144->getOutput(0), conv67->getOutput(0) }; IConcatenationLayer* concat145 = network->addConcatenation(input_tensor_145, 2); IElementWiseLayer* conv146 = convBnSilu(network, weightMap, *concat145->getOutput(0), 512, 1, 1, 0, "model.146"); IElementWiseLayer* conv147 = convBnSilu(network, weightMap, *concat145->getOutput(0), 512, 1, 1, 0, "model.147"); IElementWiseLayer* conv148 = convBnSilu(network, weightMap, *conv147->getOutput(0), 256, 3, 1, 1, "model.148"); IElementWiseLayer* conv149 = convBnSilu(network, weightMap, *conv148->getOutput(0), 256, 3, 1, 1, "model.149"); IElementWiseLayer* conv150 = convBnSilu(network, weightMap, *conv149->getOutput(0), 256, 3, 1, 1, "model.150"); IElementWiseLayer* conv151 = convBnSilu(network, weightMap, *conv150->getOutput(0), 256, 3, 1, 1, "model.151"); IElementWiseLayer* conv152 = 
convBnSilu(network, weightMap, *conv151->getOutput(0), 256, 3, 1, 1, "model.152"); IElementWiseLayer* conv153 = convBnSilu(network, weightMap, *conv152->getOutput(0), 256, 3, 1, 1, "model.153"); IElementWiseLayer* conv154 = convBnSilu(network, weightMap, *conv153->getOutput(0), 256, 3, 1, 1, "model.154"); IElementWiseLayer* conv155 = convBnSilu(network, weightMap, *conv154->getOutput(0), 256, 3, 1, 1, "model.155"); ITensor* input_tensor_156[] = { conv155->getOutput(0), conv154->getOutput(0),conv153->getOutput(0), conv152->getOutput(0), conv151->getOutput(0), conv150->getOutput(0), conv149->getOutput(0), conv148->getOutput(0),conv147->getOutput(0), conv146->getOutput(0) }; IConcatenationLayer* concat156 = network->addConcatenation(input_tensor_156, 10); IElementWiseLayer* conv157 = convBnSilu(network, weightMap, *concat156->getOutput(0), 768, 1, 1, 0, "model.157"); /* one kernel-3 conv per detection scale, fed from conv115, conv129, conv143 and conv157 */ IElementWiseLayer* conv158= convBnSilu(network, weightMap, *conv115->getOutput(0), 384, 3, 1, 1, "model.158"); IElementWiseLayer* conv159 = convBnSilu(network, weightMap, *conv129->getOutput(0), 768, 3, 1, 1, "model.159"); IElementWiseLayer* conv160 = convBnSilu(network, weightMap, *conv143->getOutput(0), 1152, 3, 1, 1, "model.160"); IElementWiseLayer* conv161 = convBnSilu(network, weightMap, *conv157->getOutput(0), 1536, 3, 1, 1, "model.161"); // out /* four prediction convolutions with kNumAnchor * (kNumClass + 5) output maps each; weights and biases come from model.162.m.0 .. model.162.m.3 */ IConvolutionLayer* cv105_0 = network->addConvolutionNd(*conv158->getOutput(0), kNumAnchor * (kNumClass + 5), DimsHW{ 1, 1 }, weightMap["model.162.m.0.weight"], weightMap["model.162.m.0.bias"]); assert(cv105_0); cv105_0->setName("cv105.0"); IConvolutionLayer* cv105_1 = network->addConvolutionNd(*conv159->getOutput(0), kNumAnchor * (kNumClass + 5), DimsHW{ 1, 1 }, weightMap["model.162.m.1.weight"], weightMap["model.162.m.1.bias"]); assert(cv105_1); cv105_1->setName("cv105.1"); IConvolutionLayer* cv105_2 = network->addConvolutionNd(*conv160->getOutput(0), kNumAnchor * (kNumClass + 5), DimsHW{ 1, 1 }, weightMap["model.162.m.2.weight"], 
weightMap["model.162.m.2.bias"]); assert(cv105_2); cv105_2->setName("cv105.2"); IConvolutionLayer* cv105_3 = network->addConvolutionNd(*conv161->getOutput(0), kNumAnchor * (kNumClass + 5), DimsHW{ 1, 1 }, weightMap["model.162.m.3.weight"], weightMap["model.162.m.3.bias"]); assert(cv105_3); cv105_3->setName("cv105.3"); /*------------detect-----------*/ /* addYoLoLayer receives the four prediction convs; its first output is named kOutputTensorName and marked as the network output */ auto yolo = addYoLoLayer(network, weightMap, "model.162", std::vector{cv105_0, cv105_1, cv105_2, cv105_3}); yolo->getOutput(0)->setName(kOutputTensorName); network->markOutput(*yolo->getOutput(0)); // Build engine builder->setMaxBatchSize(maxBatchSize); config->setMaxWorkspaceSize(16 * (1 << 20)); // 16MB #if defined(USE_FP16) config->setFlag(BuilderFlag::kFP16); #elif defined(USE_INT8) std::cout << "Your platform support int8: " << (builder->platformHasFastInt8() ? "true" : "false") << std::endl; assert(builder->platformHasFastInt8()); config->setFlag(BuilderFlag::kINT8); /* NOTE(review): calibrator is allocated with new and is not deleted anywhere in this function */ Int8EntropyCalibrator2* calibrator = new Int8EntropyCalibrator2(1, kInputW, kInputH, "./coco_calib/", "int8calib.table", kInputTensorName); config->setInt8Calibrator(calibrator); #endif std::cout << "Building engine, please wait for a while..." << std::endl; IHostMemory* serialized_model = builder->buildSerializedNetwork(*network, *config); /* NOTE(review): the success message below is printed without checking serialized_model for null */ std::cout << "Build engine successfully!" 
<< std::endl; delete network; // Release host memory for (auto& mem : weightMap) { free((void*)(mem.second.values)); } return serialized_model; } IHostMemory* build_engine_yolov7e6(unsigned int maxBatchSize, IBuilder* builder, IBuilderConfig* config, DataType dt, const std::string& wts_path) { std::map weightMap = loadWeights(wts_path); INetworkDefinition* network = builder->createNetworkV2(0U); ITensor* data = network->addInput(kInputTensorName, dt, Dims3{ 3, kInputH, kInputW }); assert(data); /*----------------------------------yolov7e6 backbone-----------------------------------------*/ auto* conv0 = ReOrg(network, weightMap, *data, 3); IElementWiseLayer* conv1 = convBnSilu(network, weightMap, *conv0->getOutput(0), 80, 3, 1, 1, "model.1"); auto conv2 = DownC(network, weightMap, *conv1->getOutput(0), 80, 160, "model.2"); IElementWiseLayer* conv3 = convBnSilu(network, weightMap, *conv2->getOutput(0), 64, 1, 1, 0, "model.3"); IElementWiseLayer* conv4 = convBnSilu(network, weightMap, *conv2->getOutput(0), 64, 1, 1, 0, "model.4"); IElementWiseLayer* conv5 = convBnSilu(network, weightMap, *conv4->getOutput(0), 64, 3, 1, 1, "model.5"); IElementWiseLayer* conv6 = convBnSilu(network, weightMap, *conv5->getOutput(0), 64, 3, 1, 1, "model.6"); IElementWiseLayer* conv7 = convBnSilu(network, weightMap, *conv6->getOutput(0), 64, 3, 1, 1, "model.7"); IElementWiseLayer* conv8 = convBnSilu(network, weightMap, *conv7->getOutput(0), 64, 3, 1, 1, "model.8"); IElementWiseLayer* conv9 = convBnSilu(network, weightMap, *conv8->getOutput(0), 64, 3, 1, 1, "model.9"); IElementWiseLayer* conv10 = convBnSilu(network, weightMap, *conv9->getOutput(0), 64, 3, 1, 1, "model.10"); ITensor* input_tensor_11[] = { conv10->getOutput(0), conv8->getOutput(0),conv6->getOutput(0), conv4->getOutput(0),conv3->getOutput(0) }; IConcatenationLayer* concat11 = network->addConcatenation(input_tensor_11, 5); IElementWiseLayer* conv12 = convBnSilu(network, weightMap, *concat11->getOutput(0), 160, 1, 1, 0, 
"model.12"); auto conv13 = DownC(network, weightMap, *conv12->getOutput(0), 160, 320, "model.13"); IElementWiseLayer* conv14 = convBnSilu(network, weightMap, *conv13->getOutput(0), 128, 1, 1, 0, "model.14"); IElementWiseLayer* conv15 = convBnSilu(network, weightMap, *conv13->getOutput(0), 128, 1, 1, 0, "model.15"); IElementWiseLayer* conv16 = convBnSilu(network, weightMap, *conv15->getOutput(0), 128, 3, 1, 1, "model.16"); IElementWiseLayer* conv17 = convBnSilu(network, weightMap, *conv16->getOutput(0), 128, 3, 1, 1, "model.17"); IElementWiseLayer* conv18 = convBnSilu(network, weightMap, *conv17->getOutput(0), 128, 3, 1, 1, "model.18"); IElementWiseLayer* conv19 = convBnSilu(network, weightMap, *conv18->getOutput(0), 128, 3, 1, 1, "model.19"); IElementWiseLayer* conv20 = convBnSilu(network, weightMap, *conv19->getOutput(0), 128, 3, 1, 1, "model.20"); IElementWiseLayer* conv21 = convBnSilu(network, weightMap, *conv20->getOutput(0), 128, 3, 1, 1, "model.21"); ITensor* input_tensor_22[] = { conv21->getOutput(0), conv19->getOutput(0),conv17->getOutput(0), conv15->getOutput(0),conv14->getOutput(0) }; IConcatenationLayer* concat22 = network->addConcatenation(input_tensor_22, 5); IElementWiseLayer* conv23 = convBnSilu(network, weightMap, *concat22->getOutput(0), 320, 1, 1, 0, "model.23"); auto conv24 = DownC(network, weightMap, *conv23->getOutput(0), 320, 640, "model.24"); IElementWiseLayer* conv25 = convBnSilu(network, weightMap, *conv24->getOutput(0), 256, 1, 1, 0, "model.25"); IElementWiseLayer* conv26 = convBnSilu(network, weightMap, *conv24->getOutput(0), 256, 1, 1, 0, "model.26"); IElementWiseLayer* conv27 = convBnSilu(network, weightMap, *conv26->getOutput(0), 256, 3, 1, 1, "model.27"); IElementWiseLayer* conv28 = convBnSilu(network, weightMap, *conv27->getOutput(0), 256, 3, 1, 1, "model.28"); IElementWiseLayer* conv29 = convBnSilu(network, weightMap, *conv28->getOutput(0), 256, 3, 1, 1, "model.29"); IElementWiseLayer* conv30 = convBnSilu(network, weightMap, 
*conv29->getOutput(0), 256, 3, 1, 1, "model.30"); IElementWiseLayer* conv31 = convBnSilu(network, weightMap, *conv30->getOutput(0), 256, 3, 1, 1, "model.31"); IElementWiseLayer* conv32 = convBnSilu(network, weightMap, *conv31->getOutput(0), 256, 3, 1, 1, "model.32"); ITensor* input_tensor_33[] = { conv32->getOutput(0), conv30->getOutput(0),conv28->getOutput(0), conv26->getOutput(0),conv25->getOutput(0) }; IConcatenationLayer* concat33 = network->addConcatenation(input_tensor_33, 5); IElementWiseLayer* conv34 = convBnSilu(network, weightMap, *concat33->getOutput(0), 640, 1, 1, 0, "model.34"); auto conv35 = DownC(network, weightMap, *conv34->getOutput(0), 640, 960, "model.35"); IElementWiseLayer* conv36 = convBnSilu(network, weightMap, *conv35->getOutput(0), 384, 1, 1, 0, "model.36"); IElementWiseLayer* conv37 = convBnSilu(network, weightMap, *conv35->getOutput(0), 384, 1, 1, 0, "model.37"); IElementWiseLayer* conv38 = convBnSilu(network, weightMap, *conv37->getOutput(0), 384, 3, 1, 1, "model.38"); IElementWiseLayer* conv39 = convBnSilu(network, weightMap, *conv38->getOutput(0), 384, 3, 1, 1, "model.39"); IElementWiseLayer* conv40 = convBnSilu(network, weightMap, *conv39->getOutput(0), 384, 3, 1, 1, "model.40"); IElementWiseLayer* conv41 = convBnSilu(network, weightMap, *conv40->getOutput(0), 384, 3, 1, 1, "model.41"); IElementWiseLayer* conv42 = convBnSilu(network, weightMap, *conv41->getOutput(0), 384, 3, 1, 1, "model.42"); IElementWiseLayer* conv43 = convBnSilu(network, weightMap, *conv42->getOutput(0), 384, 3, 1, 1, "model.43"); ITensor* input_tensor_44[] = { conv43->getOutput(0), conv41->getOutput(0),conv39->getOutput(0), conv37->getOutput(0),conv36->getOutput(0) }; IConcatenationLayer* concat44 = network->addConcatenation(input_tensor_44, 5); IElementWiseLayer* conv45 = convBnSilu(network, weightMap, *concat44->getOutput(0), 960, 1, 1, 0, "model.45"); auto conv46 = DownC(network, weightMap, *conv45->getOutput(0), 960, 1280, "model.46"); IElementWiseLayer* 
conv47 = convBnSilu(network, weightMap, *conv46->getOutput(0), 512, 1, 1, 0, "model.47"); IElementWiseLayer* conv48 = convBnSilu(network, weightMap, *conv46->getOutput(0), 512, 1, 1, 0, "model.48"); IElementWiseLayer* conv49 = convBnSilu(network, weightMap, *conv48->getOutput(0), 512, 3, 1, 1, "model.49"); IElementWiseLayer* conv50 = convBnSilu(network, weightMap, *conv49->getOutput(0), 512, 3, 1, 1, "model.50"); IElementWiseLayer* conv51 = convBnSilu(network, weightMap, *conv50->getOutput(0), 512, 3, 1, 1, "model.51"); IElementWiseLayer* conv52 = convBnSilu(network, weightMap, *conv51->getOutput(0), 512, 3, 1, 1, "model.52"); IElementWiseLayer* conv53 = convBnSilu(network, weightMap, *conv52->getOutput(0), 512, 3, 1, 1, "model.53"); IElementWiseLayer* conv54 = convBnSilu(network, weightMap, *conv53->getOutput(0), 512, 3, 1, 1, "model.54"); ITensor* input_tensor_55[] = { conv54->getOutput(0), conv52->getOutput(0),conv50->getOutput(0), conv48->getOutput(0),conv47->getOutput(0) }; IConcatenationLayer* concat55 = network->addConcatenation(input_tensor_55, 5); IElementWiseLayer* conv56 = convBnSilu(network, weightMap, *concat55->getOutput(0), 1280, 1, 1, 0, "model.56"); //------------------------yolov7e6 head------------------------------- auto conv57 = SPPCSPC(network, weightMap, *conv56->getOutput(0), 640, "model.57"); IElementWiseLayer* conv58 = convBnSilu(network, weightMap, *conv57->getOutput(0), 480, 1, 1, 0, "model.58"); float scale[] = { 1.0, 2.0, 2.0 }; IResizeLayer* re59 = network->addResize(*conv58->getOutput(0)); re59->setResizeMode(ResizeMode::kNEAREST); re59->setScales(scale, 3); IElementWiseLayer* conv60 = convBnSilu(network, weightMap, *conv45->getOutput(0), 480, 1, 1, 0, "model.60"); ITensor* input_tensor_61[] = { conv60->getOutput(0), re59->getOutput(0) }; IConcatenationLayer* concat61 = network->addConcatenation(input_tensor_61, 2); IElementWiseLayer* conv62 = convBnSilu(network, weightMap, *concat61->getOutput(0), 384, 1, 1, 0, "model.62"); 
IElementWiseLayer* conv63 = convBnSilu(network, weightMap, *concat61->getOutput(0), 384, 1, 1, 0, "model.63"); IElementWiseLayer* conv64 = convBnSilu(network, weightMap, *conv63->getOutput(0), 192, 3, 1, 1, "model.64"); IElementWiseLayer* conv65 = convBnSilu(network, weightMap, *conv64->getOutput(0), 192, 3, 1, 1, "model.65"); IElementWiseLayer* conv66 = convBnSilu(network, weightMap, *conv65->getOutput(0), 192, 3, 1, 1, "model.66"); IElementWiseLayer* conv67 = convBnSilu(network, weightMap, *conv66->getOutput(0), 192, 3, 1, 1, "model.67"); IElementWiseLayer* conv68 = convBnSilu(network, weightMap, *conv67->getOutput(0), 192, 3, 1, 1, "model.68"); IElementWiseLayer* conv69 = convBnSilu(network, weightMap, *conv68->getOutput(0), 192, 3, 1, 1, "model.69"); ITensor* input_tensor_70[] = { conv69->getOutput(0), conv68->getOutput(0),conv67->getOutput(0), conv66->getOutput(0), conv65->getOutput(0), conv64->getOutput(0), conv63->getOutput(0), conv62->getOutput(0) }; IConcatenationLayer* concat70 = network->addConcatenation(input_tensor_70, 8); IElementWiseLayer* conv71 = convBnSilu(network, weightMap, *concat70->getOutput(0), 480, 1, 1, 0, "model.71"); IElementWiseLayer* conv72 = convBnSilu(network, weightMap, *conv71->getOutput(0), 320, 1, 1, 0, "model.72"); IResizeLayer* re73 = network->addResize(*conv72->getOutput(0)); re73->setResizeMode(ResizeMode::kNEAREST); re73->setScales(scale, 3); IElementWiseLayer* conv74 = convBnSilu(network, weightMap, *conv34->getOutput(0), 320, 1, 1, 0, "model.74"); ITensor* input_tensor_75[] = { conv74->getOutput(0), re73->getOutput(0) }; IConcatenationLayer* concat75 = network->addConcatenation(input_tensor_75, 2); IElementWiseLayer* conv76 = convBnSilu(network, weightMap, *concat75->getOutput(0), 256, 1, 1, 0, "model.76"); IElementWiseLayer* conv77 = convBnSilu(network, weightMap, *concat75->getOutput(0), 256, 1, 1, 0, "model.77"); IElementWiseLayer* conv78 = convBnSilu(network, weightMap, *conv77->getOutput(0), 128, 3, 1, 1, "model.78"); 
IElementWiseLayer* conv79 = convBnSilu(network, weightMap, *conv78->getOutput(0), 128, 3, 1, 1, "model.79"); IElementWiseLayer* conv80 = convBnSilu(network, weightMap, *conv79->getOutput(0), 128, 3, 1, 1, "model.80"); IElementWiseLayer* conv81 = convBnSilu(network, weightMap, *conv80->getOutput(0), 128, 3, 1, 1, "model.81"); IElementWiseLayer* conv82 = convBnSilu(network, weightMap, *conv81->getOutput(0), 128, 3, 1, 1, "model.82"); IElementWiseLayer* conv83 = convBnSilu(network, weightMap, *conv82->getOutput(0), 128, 3, 1, 1, "model.83"); ITensor* input_tensor_84[] = { conv83->getOutput(0), conv82->getOutput(0),conv81->getOutput(0), conv80->getOutput(0), conv79->getOutput(0), conv78->getOutput(0), conv77->getOutput(0), conv76->getOutput(0) }; IConcatenationLayer* concat84 = network->addConcatenation(input_tensor_84, 8); IElementWiseLayer* conv85 = convBnSilu(network, weightMap, *concat84->getOutput(0), 320, 1, 1, 0, "model.85"); IElementWiseLayer* conv86 = convBnSilu(network, weightMap, *conv85->getOutput(0), 160, 1, 1, 0, "model.86"); IResizeLayer* re87 = network->addResize(*conv86->getOutput(0)); re87->setResizeMode(ResizeMode::kNEAREST); re87->setScales(scale, 3); IElementWiseLayer* conv88 = convBnSilu(network, weightMap, *conv23->getOutput(0), 160, 1, 1, 0, "model.88"); ITensor* input_tensor_89[] = { conv88->getOutput(0), re87->getOutput(0) }; IConcatenationLayer* concat89 = network->addConcatenation(input_tensor_89, 2); IElementWiseLayer* conv90 = convBnSilu(network, weightMap, *concat89->getOutput(0), 128, 1, 1, 0, "model.90"); IElementWiseLayer* conv91 = convBnSilu(network, weightMap, *concat89->getOutput(0), 128, 1, 1, 0, "model.91"); IElementWiseLayer* conv92 = convBnSilu(network, weightMap, *conv91->getOutput(0), 64, 3, 1, 1, "model.92"); IElementWiseLayer* conv93 = convBnSilu(network, weightMap, *conv92->getOutput(0), 64, 3, 1, 1, "model.93"); IElementWiseLayer* conv94 = convBnSilu(network, weightMap, *conv93->getOutput(0), 64, 3, 1, 1, "model.94"); 
IElementWiseLayer* conv95 = convBnSilu(network, weightMap, *conv94->getOutput(0), 64, 3, 1, 1, "model.95"); IElementWiseLayer* conv96 = convBnSilu(network, weightMap, *conv95->getOutput(0), 64, 3, 1, 1, "model.96"); IElementWiseLayer* conv97 = convBnSilu(network, weightMap, *conv96->getOutput(0), 64, 3, 1, 1, "model.97"); ITensor* input_tensor_98[] = { conv97->getOutput(0), conv96->getOutput(0),conv95->getOutput(0), conv94->getOutput(0), conv93->getOutput(0), conv92->getOutput(0), conv91->getOutput(0), conv90->getOutput(0) }; IConcatenationLayer* concat98 = network->addConcatenation(input_tensor_98, 8); IElementWiseLayer* conv99 = convBnSilu(network, weightMap, *concat98->getOutput(0), 160, 1, 1, 0, "model.99"); auto conv100 = DownC(network, weightMap, *conv99->getOutput(0), 160, 320, "model.100"); ITensor* input_tensor_101[] = { conv100->getOutput(0), conv85->getOutput(0) }; IConcatenationLayer* concat101 = network->addConcatenation(input_tensor_101, 2); IElementWiseLayer* conv102 = convBnSilu(network, weightMap, *concat101->getOutput(0), 256, 1, 1, 0, "model.102"); IElementWiseLayer* conv103 = convBnSilu(network, weightMap, *concat101->getOutput(0), 256, 1, 1, 0, "model.103"); IElementWiseLayer* conv104 = convBnSilu(network, weightMap, *conv103->getOutput(0), 128, 3, 1, 1, "model.104"); IElementWiseLayer* conv105 = convBnSilu(network, weightMap, *conv104->getOutput(0), 128, 3, 1, 1, "model.105"); IElementWiseLayer* conv106 = convBnSilu(network, weightMap, *conv105->getOutput(0), 128, 3, 1, 1, "model.106"); IElementWiseLayer* conv107 = convBnSilu(network, weightMap, *conv106->getOutput(0), 128, 3, 1, 1, "model.107"); IElementWiseLayer* conv108 = convBnSilu(network, weightMap, *conv107->getOutput(0), 128, 3, 1, 1, "model.108"); IElementWiseLayer* conv109 = convBnSilu(network, weightMap, *conv108->getOutput(0), 128, 3, 1, 1, "model.109"); ITensor* input_tensor_110[] = { conv109->getOutput(0), conv108->getOutput(0),conv107->getOutput(0), conv106->getOutput(0), 
conv105->getOutput(0), conv104->getOutput(0), conv103->getOutput(0), conv102->getOutput(0) }; IConcatenationLayer* concat110 = network->addConcatenation(input_tensor_110, 8); IElementWiseLayer* conv111 = convBnSilu(network, weightMap, *concat110->getOutput(0), 320, 1, 1, 0, "model.111"); auto conv112 = DownC(network, weightMap, *conv111->getOutput(0), 320, 480, "model.112"); ITensor* input_tensor_113[] = { conv112->getOutput(0), conv71->getOutput(0) }; IConcatenationLayer* concat113 = network->addConcatenation(input_tensor_113, 2); IElementWiseLayer* conv114 = convBnSilu(network, weightMap, *concat113->getOutput(0), 384, 1, 1, 0, "model.114"); IElementWiseLayer* conv115 = convBnSilu(network, weightMap, *concat113->getOutput(0), 384, 1, 1, 0, "model.115"); IElementWiseLayer* conv116 = convBnSilu(network, weightMap, *conv115->getOutput(0), 192, 3, 1, 1, "model.116"); IElementWiseLayer* conv117 = convBnSilu(network, weightMap, *conv116->getOutput(0), 192, 3, 1, 1, "model.117"); IElementWiseLayer* conv118 = convBnSilu(network, weightMap, *conv117->getOutput(0), 192, 3, 1, 1, "model.118"); IElementWiseLayer* conv119 = convBnSilu(network, weightMap, *conv118->getOutput(0), 192, 3, 1, 1, "model.119"); IElementWiseLayer* conv120 = convBnSilu(network, weightMap, *conv119->getOutput(0), 192, 3, 1, 1, "model.120"); IElementWiseLayer* conv121 = convBnSilu(network, weightMap, *conv120->getOutput(0), 192, 3, 1, 1, "model.121"); ITensor* input_tensor_122[] = { conv121->getOutput(0), conv120->getOutput(0),conv119->getOutput(0), conv118->getOutput(0), conv117->getOutput(0), conv116->getOutput(0), conv115->getOutput(0), conv114->getOutput(0) }; IConcatenationLayer* concat122 = network->addConcatenation(input_tensor_122, 8); IElementWiseLayer* conv123 = convBnSilu(network, weightMap, *concat122->getOutput(0), 480, 1, 1, 0, "model.123"); auto conv124 = DownC(network, weightMap, *conv123->getOutput(0), 480, 640, "model.124"); ITensor* input_tensor_125[] = { conv124->getOutput(0), 
conv57->getOutput(0) }; IConcatenationLayer* concat125 = network->addConcatenation(input_tensor_125, 2); IElementWiseLayer* conv126 = convBnSilu(network, weightMap, *concat125->getOutput(0), 512, 1, 1, 0, "model.126"); IElementWiseLayer* conv127 = convBnSilu(network, weightMap, *concat125->getOutput(0), 512, 1, 1, 0, "model.127"); IElementWiseLayer* conv128 = convBnSilu(network, weightMap, *conv127->getOutput(0), 256, 3, 1, 1, "model.128"); IElementWiseLayer* conv129 = convBnSilu(network, weightMap, *conv128->getOutput(0), 256, 3, 1, 1, "model.129"); IElementWiseLayer* conv130 = convBnSilu(network, weightMap, *conv129->getOutput(0), 256, 3, 1, 1, "model.130"); IElementWiseLayer* conv131 = convBnSilu(network, weightMap, *conv130->getOutput(0), 256, 3, 1, 1, "model.131"); IElementWiseLayer* conv132 = convBnSilu(network, weightMap, *conv131->getOutput(0), 256, 3, 1, 1, "model.132"); IElementWiseLayer* conv133 = convBnSilu(network, weightMap, *conv132->getOutput(0), 256, 3, 1, 1, "model.133"); ITensor* input_tensor_134[] = { conv133->getOutput(0), conv132->getOutput(0),conv131->getOutput(0), conv130->getOutput(0), conv129->getOutput(0), conv128->getOutput(0), conv127->getOutput(0), conv126->getOutput(0) }; IConcatenationLayer* concat134 = network->addConcatenation(input_tensor_134, 8); IElementWiseLayer* conv135 = convBnSilu(network, weightMap, *concat134->getOutput(0), 640, 1, 1, 0, "model.135"); IElementWiseLayer* conv136 = convBnSilu(network, weightMap, *conv99->getOutput(0), 320, 3, 1, 1, "model.136"); IElementWiseLayer* conv137 = convBnSilu(network, weightMap, *conv111->getOutput(0), 640, 3, 1, 1, "model.137"); IElementWiseLayer* conv138 = convBnSilu(network, weightMap, *conv123->getOutput(0), 960, 3, 1, 1, "model.138"); IElementWiseLayer* conv139 = convBnSilu(network, weightMap, *conv135->getOutput(0), 1280, 3, 1, 1, "model.139"); // out IConvolutionLayer* cv105_0 = network->addConvolutionNd(*conv136->getOutput(0), kNumAnchor * (kNumClass + 5), DimsHW{ 1, 1 }, 
weightMap["model.140.m.0.weight"], weightMap["model.140.m.0.bias"]); assert(cv105_0); cv105_0->setName("cv105.0"); IConvolutionLayer* cv105_1 = network->addConvolutionNd(*conv137->getOutput(0), kNumAnchor * (kNumClass + 5), DimsHW{ 1, 1 }, weightMap["model.140.m.1.weight"], weightMap["model.140.m.1.bias"]); assert(cv105_1); cv105_1->setName("cv105.1"); IConvolutionLayer* cv105_2 = network->addConvolutionNd(*conv138->getOutput(0), kNumAnchor * (kNumClass + 5), DimsHW{ 1, 1 }, weightMap["model.140.m.2.weight"], weightMap["model.140.m.2.bias"]); assert(cv105_2); cv105_2->setName("cv105.2"); IConvolutionLayer* cv105_3 = network->addConvolutionNd(*conv139->getOutput(0), kNumAnchor * (kNumClass + 5), DimsHW{ 1, 1 }, weightMap["model.140.m.3.weight"], weightMap["model.140.m.3.bias"]); assert(cv105_3); cv105_3->setName("cv105.3"); /*------------detect-----------*/ auto yolo = addYoLoLayer(network, weightMap, "model.140", std::vector{cv105_0, cv105_1, cv105_2, cv105_3}); yolo->getOutput(0)->setName(kOutputTensorName); network->markOutput(*yolo->getOutput(0)); // Build engine builder->setMaxBatchSize(maxBatchSize); config->setMaxWorkspaceSize(16 * (1 << 20)); // 16MB #if defined(USE_FP16) config->setFlag(BuilderFlag::kFP16); #elif defined(USE_INT8) std::cout << "Your platform support int8: " << (builder->platformHasFastInt8() ? "true" : "false") << std::endl; assert(builder->platformHasFastInt8()); config->setFlag(BuilderFlag::kINT8); Int8EntropyCalibrator2* calibrator = new Int8EntropyCalibrator2(1, kInputW, kInputH, "./coco_calib/", "int8calib.table", kInputTensorName); config->setInt8Calibrator(calibrator); #endif std::cout << "Building engine, please wait for a while..." << std::endl; IHostMemory* serialized_model = builder->buildSerializedNetwork(*network, *config); std::cout << "Build engine successfully!" 
<< std::endl; delete network; // Release host memory for (auto& mem : weightMap) { free((void*)(mem.second.values)); } return serialized_model; } IHostMemory* build_engine_yolov7w6(unsigned int maxBatchSize, IBuilder* builder, IBuilderConfig* config, DataType dt, const std::string& wts_path) { std::map weightMap = loadWeights(wts_path); INetworkDefinition* network = builder->createNetworkV2(0U); ITensor* data = network->addInput(kInputTensorName, dt, Dims3{ 3, kInputH, kInputW }); assert(data); /*----------------------------------yolov7w6 backbone-----------------------------------------*/ auto* conv0 = ReOrg(network, weightMap, *data, 3); IElementWiseLayer* conv1 = convBnSilu(network, weightMap, *conv0->getOutput(0), 64, 3, 1, 1, "model.1"); IElementWiseLayer* conv2 = convBnSilu(network, weightMap, *conv1->getOutput(0), 128, 3, 2, 1, "model.2"); IElementWiseLayer* conv3 = convBnSilu(network, weightMap, *conv2->getOutput(0), 64, 1, 1, 0, "model.3"); IElementWiseLayer* conv4 = convBnSilu(network, weightMap, *conv2->getOutput(0), 64, 1, 1, 0, "model.4"); IElementWiseLayer* conv5 = convBnSilu(network, weightMap, *conv4->getOutput(0), 64, 3, 1, 1, "model.5"); IElementWiseLayer* conv6 = convBnSilu(network, weightMap, *conv5->getOutput(0), 64, 3, 1, 1, "model.6"); IElementWiseLayer* conv7 = convBnSilu(network, weightMap, *conv6->getOutput(0), 64, 3, 1, 1, "model.7"); IElementWiseLayer* conv8 = convBnSilu(network, weightMap, *conv7->getOutput(0), 64, 3, 1, 1, "model.8"); ITensor* input_tensor_9[] = { conv8->getOutput(0), conv6->getOutput(0), conv4->getOutput(0), conv3->getOutput(0) }; IConcatenationLayer* concat9 = network->addConcatenation(input_tensor_9, 4); concat9->setAxis(0); IElementWiseLayer* conv10 = convBnSilu(network, weightMap, *concat9->getOutput(0), 128, 1, 1, 0, "model.10"); IElementWiseLayer* conv11 = convBnSilu(network, weightMap, *conv10->getOutput(0), 256, 3, 2, 1, "model.11"); IElementWiseLayer* conv12 = convBnSilu(network, weightMap, 
*conv11->getOutput(0), 128, 1, 1, 0, "model.12"); IElementWiseLayer* conv13 = convBnSilu(network, weightMap, *conv11->getOutput(0), 128, 1, 1, 0, "model.13"); IElementWiseLayer* conv14 = convBnSilu(network, weightMap, *conv13->getOutput(0), 128, 3, 1, 1, "model.14"); IElementWiseLayer* conv15 = convBnSilu(network, weightMap, *conv14->getOutput(0), 128, 3, 1, 1, "model.15"); IElementWiseLayer* conv16 = convBnSilu(network, weightMap, *conv15->getOutput(0), 128, 3, 1, 1, "model.16"); IElementWiseLayer* conv17 = convBnSilu(network, weightMap, *conv16->getOutput(0), 128, 3, 1, 1, "model.17"); ITensor* input_tensor_18[] = { conv17->getOutput(0), conv15->getOutput(0), conv13->getOutput(0), conv12->getOutput(0) }; IConcatenationLayer* concat18 = network->addConcatenation(input_tensor_18, 4); concat18->setAxis(0); IElementWiseLayer* conv19 = convBnSilu(network, weightMap, *concat18->getOutput(0), 256, 1, 1, 0, "model.19"); IElementWiseLayer* conv20 = convBnSilu(network, weightMap, *conv19->getOutput(0), 512, 3, 2, 1, "model.20"); IElementWiseLayer* conv21 = convBnSilu(network, weightMap, *conv20->getOutput(0), 256, 1, 1, 0, "model.21"); IElementWiseLayer* conv22 = convBnSilu(network, weightMap, *conv20->getOutput(0), 256, 1, 1, 0, "model.22"); IElementWiseLayer* conv23 = convBnSilu(network, weightMap, *conv22->getOutput(0), 256, 3, 1, 1, "model.23"); IElementWiseLayer* conv24 = convBnSilu(network, weightMap, *conv23->getOutput(0), 256, 3, 1, 1, "model.24"); IElementWiseLayer* conv25 = convBnSilu(network, weightMap, *conv24->getOutput(0), 256, 3, 1, 1, "model.25"); IElementWiseLayer* conv26 = convBnSilu(network, weightMap, *conv25->getOutput(0), 256, 3, 1, 1, "model.26"); ITensor* input_tensor_27[] = { conv26->getOutput(0), conv24->getOutput(0), conv22->getOutput(0), conv21->getOutput(0) }; IConcatenationLayer* concat27 = network->addConcatenation(input_tensor_27, 4); concat27->setAxis(0); IElementWiseLayer* conv28 = convBnSilu(network, weightMap, *concat27->getOutput(0), 
512, 1, 1, 0, "model.28"); IElementWiseLayer* conv29 = convBnSilu(network, weightMap, *conv28->getOutput(0), 768, 3, 2, 1, "model.29"); IElementWiseLayer* conv30 = convBnSilu(network, weightMap, *conv29->getOutput(0), 384, 1, 1, 0, "model.30"); IElementWiseLayer* conv31 = convBnSilu(network, weightMap, *conv29->getOutput(0), 384, 1, 1, 0, "model.31"); IElementWiseLayer* conv32 = convBnSilu(network, weightMap, *conv31->getOutput(0), 384, 3, 1, 1, "model.32"); IElementWiseLayer* conv33 = convBnSilu(network, weightMap, *conv32->getOutput(0), 384, 3, 1, 1, "model.33"); IElementWiseLayer* conv34 = convBnSilu(network, weightMap, *conv33->getOutput(0), 384, 3, 1, 1, "model.34"); IElementWiseLayer* conv35 = convBnSilu(network, weightMap, *conv34->getOutput(0), 384, 3, 1, 1, "model.35"); ITensor* input_tensor_36[] = { conv35->getOutput(0), conv33->getOutput(0), conv31->getOutput(0), conv30->getOutput(0) }; IConcatenationLayer* concat36 = network->addConcatenation(input_tensor_36, 4); IElementWiseLayer* conv37 = convBnSilu(network, weightMap, *concat36->getOutput(0), 768, 1, 1, 0, "model.37"); IElementWiseLayer* conv38 = convBnSilu(network, weightMap, *conv37->getOutput(0), 1024, 3, 2, 1, "model.38"); IElementWiseLayer* conv39 = convBnSilu(network, weightMap, *conv38->getOutput(0), 512, 1, 1, 0, "model.39"); IElementWiseLayer* conv40 = convBnSilu(network, weightMap, *conv38->getOutput(0), 512, 1, 1, 0, "model.40"); IElementWiseLayer* conv41 = convBnSilu(network, weightMap, *conv40->getOutput(0), 512, 3, 1, 1, "model.41"); IElementWiseLayer* conv42 = convBnSilu(network, weightMap, *conv41->getOutput(0), 512, 3, 1, 1, "model.42"); IElementWiseLayer* conv43 = convBnSilu(network, weightMap, *conv42->getOutput(0), 512, 3, 1, 1, "model.43"); IElementWiseLayer* conv44 = convBnSilu(network, weightMap, *conv43->getOutput(0), 512, 3, 1, 1, "model.44"); ITensor* input_tensor_45[] = { conv44->getOutput(0), conv42->getOutput(0), conv40->getOutput(0), conv39->getOutput(0) }; 
IConcatenationLayer* concat45 = network->addConcatenation(input_tensor_45, 4); IElementWiseLayer* conv46 = convBnSilu(network, weightMap, *concat45->getOutput(0), 1024, 1, 1, 0, "model.46"); auto conv47 = SPPCSPC(network, weightMap, *conv46->getOutput(0), 512, "model.47"); IElementWiseLayer* conv48 = convBnSilu(network, weightMap, *conv47->getOutput(0), 384, 1, 1, 0, "model.48"); float scale[] = { 1.0, 2.0, 2.0 }; IResizeLayer* re49 = network->addResize(*conv48->getOutput(0)); re49->setResizeMode(ResizeMode::kNEAREST); re49->setScales(scale, 3); IElementWiseLayer* conv50 = convBnSilu(network, weightMap, *conv37->getOutput(0), 384, 1, 1, 0, "model.50"); ITensor* input_tensor_51[] = { conv50->getOutput(0), re49->getOutput(0) }; IConcatenationLayer* concat51 = network->addConcatenation(input_tensor_51, 2); IElementWiseLayer* conv52 = convBnSilu(network, weightMap, *concat51->getOutput(0), 384, 1, 1, 0, "model.52"); IElementWiseLayer* conv53 = convBnSilu(network, weightMap, *concat51->getOutput(0), 384, 1, 1, 0, "model.53"); IElementWiseLayer* conv54 = convBnSilu(network, weightMap, *conv53->getOutput(0), 192, 3, 1, 1, "model.54"); IElementWiseLayer* conv55 = convBnSilu(network, weightMap, *conv54->getOutput(0), 192, 3, 1, 1, "model.55"); IElementWiseLayer* conv56 = convBnSilu(network, weightMap, *conv55->getOutput(0), 192, 3, 1, 1, "model.56"); IElementWiseLayer* conv57 = convBnSilu(network, weightMap, *conv56->getOutput(0), 192, 3, 1, 1, "model.57"); ITensor* input_tensor_58[] = { conv57->getOutput(0), conv56->getOutput(0), conv55->getOutput(0), conv54->getOutput(0), conv53->getOutput(0), conv52->getOutput(0) }; IConcatenationLayer* concat58 = network->addConcatenation(input_tensor_58, 6); IElementWiseLayer* conv59 = convBnSilu(network, weightMap, *concat58->getOutput(0), 384, 1, 1, 0, "model.59"); IElementWiseLayer* conv60 = convBnSilu(network, weightMap, *conv59->getOutput(0), 256, 1, 1, 0, "model.60"); IResizeLayer* re61 = 
network->addResize(*conv60->getOutput(0)); re61->setResizeMode(ResizeMode::kNEAREST); re61->setScales(scale, 3); IElementWiseLayer* conv62 = convBnSilu(network, weightMap, *conv28->getOutput(0), 256, 1, 1, 0, "model.62"); ITensor* input_tensor_63[] = { conv62->getOutput(0), re61->getOutput(0) }; IConcatenationLayer* concat63 = network->addConcatenation(input_tensor_63, 2); IElementWiseLayer* conv64 = convBnSilu(network, weightMap, *concat63->getOutput(0), 256, 1, 1, 0, "model.64"); IElementWiseLayer* conv65 = convBnSilu(network, weightMap, *concat63->getOutput(0), 256, 1, 1, 0, "model.65"); IElementWiseLayer* conv66 = convBnSilu(network, weightMap, *conv65->getOutput(0), 128, 3, 1, 1, "model.66"); IElementWiseLayer* conv67 = convBnSilu(network, weightMap, *conv66->getOutput(0), 128, 3, 1, 1, "model.67"); IElementWiseLayer* conv68 = convBnSilu(network, weightMap, *conv67->getOutput(0), 128, 3, 1, 1, "model.68"); IElementWiseLayer* conv69 = convBnSilu(network, weightMap, *conv68->getOutput(0), 128, 3, 1, 1, "model.69"); ITensor* input_tensor_70[] = { conv69->getOutput(0), conv68->getOutput(0), conv67->getOutput(0), conv66->getOutput(0), conv65->getOutput(0), conv64->getOutput(0) }; IConcatenationLayer* concat70 = network->addConcatenation(input_tensor_70, 6); IElementWiseLayer* conv71 = convBnSilu(network, weightMap, *concat70->getOutput(0), 256, 1, 1, 0, "model.71"); IElementWiseLayer* conv72 = convBnSilu(network, weightMap, *conv71->getOutput(0), 128, 1, 1, 0, "model.72"); IResizeLayer* re73 = network->addResize(*conv72->getOutput(0)); re73->setResizeMode(ResizeMode::kNEAREST); re73->setScales(scale, 3); IElementWiseLayer* conv74 = convBnSilu(network, weightMap, *conv19->getOutput(0), 128, 1, 1, 0, "model.74"); ITensor* input_tensor_75[] = { conv74->getOutput(0), re73->getOutput(0) }; IConcatenationLayer* concat75 = network->addConcatenation(input_tensor_75, 2); IElementWiseLayer* conv76 = convBnSilu(network, weightMap, *concat75->getOutput(0), 128, 1, 1, 0, 
"model.76"); IElementWiseLayer* conv77 = convBnSilu(network, weightMap, *concat75->getOutput(0), 128, 1, 1, 0, "model.77"); IElementWiseLayer* conv78 = convBnSilu(network, weightMap, *conv77->getOutput(0), 64, 3, 1, 1, "model.78"); IElementWiseLayer* conv79 = convBnSilu(network, weightMap, *conv78->getOutput(0), 64, 3, 1, 1, "model.79"); IElementWiseLayer* conv80 = convBnSilu(network, weightMap, *conv79->getOutput(0), 64, 3, 1, 1, "model.80"); IElementWiseLayer* conv81 = convBnSilu(network, weightMap, *conv80->getOutput(0), 64, 3, 1, 1, "model.81"); ITensor* input_tensor_82[] = { conv81->getOutput(0), conv80->getOutput(0), conv79->getOutput(0), conv78->getOutput(0), conv77->getOutput(0), conv76->getOutput(0) }; IConcatenationLayer* concat82 = network->addConcatenation(input_tensor_82, 6); IElementWiseLayer* conv83 = convBnSilu(network, weightMap, *concat82->getOutput(0), 128, 1, 1, 0, "model.83"); IElementWiseLayer* conv84 = convBnSilu(network, weightMap, *conv83->getOutput(0), 256, 3, 2, 1, "model.84"); ITensor* input_tensor_85[] = { conv84->getOutput(0), conv71->getOutput(0) }; IConcatenationLayer* concat85 = network->addConcatenation(input_tensor_85, 2); IElementWiseLayer* conv86 = convBnSilu(network, weightMap, *concat85->getOutput(0), 256, 1, 1, 0, "model.86"); IElementWiseLayer* conv87 = convBnSilu(network, weightMap, *concat85->getOutput(0), 256, 1, 1, 0, "model.87"); IElementWiseLayer* conv88 = convBnSilu(network, weightMap, *conv87->getOutput(0), 128, 3, 1, 1, "model.88"); IElementWiseLayer* conv89 = convBnSilu(network, weightMap, *conv88->getOutput(0), 128, 3, 1, 1, "model.89"); IElementWiseLayer* conv90 = convBnSilu(network, weightMap, *conv89->getOutput(0), 128, 3, 1, 1, "model.90"); IElementWiseLayer* conv91 = convBnSilu(network, weightMap, *conv90->getOutput(0), 128, 3, 1, 1, "model.91"); ITensor* input_tensor_92[] = { conv91->getOutput(0), conv90->getOutput(0), conv89->getOutput(0), conv88->getOutput(0), conv87->getOutput(0), conv86->getOutput(0) }; 
IConcatenationLayer* concat92 = network->addConcatenation(input_tensor_92, 6); IElementWiseLayer* conv93 = convBnSilu(network, weightMap, *concat92->getOutput(0), 256, 1, 1, 0, "model.93"); IElementWiseLayer* conv94 = convBnSilu(network, weightMap, *conv93->getOutput(0), 384, 3, 2, 1, "model.94"); ITensor* input_tensor_95[] = { conv94->getOutput(0), conv59->getOutput(0) }; IConcatenationLayer* concat95 = network->addConcatenation(input_tensor_95, 2); IElementWiseLayer* conv96 = convBnSilu(network, weightMap, *concat95->getOutput(0), 384, 1, 1, 0, "model.96"); IElementWiseLayer* conv97 = convBnSilu(network, weightMap, *concat95->getOutput(0), 384, 1, 1, 0, "model.97"); IElementWiseLayer* conv98 = convBnSilu(network, weightMap, *conv97->getOutput(0), 192, 3, 1, 1, "model.98"); IElementWiseLayer* conv99 = convBnSilu(network, weightMap, *conv98->getOutput(0), 192, 3, 1, 1, "model.99"); IElementWiseLayer* conv100 = convBnSilu(network, weightMap, *conv99->getOutput(0), 192, 3, 1, 1, "model.100"); IElementWiseLayer* conv101 = convBnSilu(network, weightMap, *conv100->getOutput(0), 192, 3, 1, 1, "model.101"); ITensor* input_tensor_102[] = { conv101->getOutput(0), conv100->getOutput(0), conv99->getOutput(0), conv98->getOutput(0), conv97->getOutput(0), conv96->getOutput(0) }; IConcatenationLayer* concat102 = network->addConcatenation(input_tensor_102, 6); IElementWiseLayer* conv103 = convBnSilu(network, weightMap, *concat102->getOutput(0), 384, 1, 1, 0, "model.103"); IElementWiseLayer* conv104 = convBnSilu(network, weightMap, *conv103->getOutput(0), 512, 3, 2, 1, "model.104"); ITensor* input_tensor_105[] = { conv104->getOutput(0), conv47->getOutput(0) }; IConcatenationLayer* concat105 = network->addConcatenation(input_tensor_105, 2); IElementWiseLayer* conv106 = convBnSilu(network, weightMap, *concat105->getOutput(0), 512, 1, 1, 0, "model.106"); IElementWiseLayer* conv107 = convBnSilu(network, weightMap, *concat105->getOutput(0), 512, 1, 1, 0, "model.107"); IElementWiseLayer* 
conv108 = convBnSilu(network, weightMap, *conv107->getOutput(0), 256, 3, 1, 1, "model.108"); IElementWiseLayer* conv109 = convBnSilu(network, weightMap, *conv108->getOutput(0), 256, 3, 1, 1, "model.109"); IElementWiseLayer* conv110 = convBnSilu(network, weightMap, *conv109->getOutput(0), 256, 3, 1, 1, "model.110"); IElementWiseLayer* conv111 = convBnSilu(network, weightMap, *conv110->getOutput(0), 256, 3, 1, 1, "model.111"); ITensor* input_tensor_112[] = { conv111->getOutput(0), conv110->getOutput(0), conv109->getOutput(0), conv108->getOutput(0), conv107->getOutput(0), conv106->getOutput(0) }; IConcatenationLayer* concat112 = network->addConcatenation(input_tensor_112, 6); IElementWiseLayer* conv113 = convBnSilu(network, weightMap, *concat112->getOutput(0), 512, 1, 1, 0, "model.113"); IElementWiseLayer* conv114 = convBnSilu(network, weightMap, *conv83->getOutput(0), 256, 3, 1, 1, "model.114"); IElementWiseLayer* conv115 = convBnSilu(network, weightMap, *conv93->getOutput(0), 512, 3, 1, 1, "model.115"); IElementWiseLayer* conv116 = convBnSilu(network, weightMap, *conv103->getOutput(0), 768, 3, 1, 1, "model.116"); IElementWiseLayer* conv117 = convBnSilu(network, weightMap, *conv113->getOutput(0), 1024, 3, 1, 1, "model.117"); // out IConvolutionLayer* cv105_0 = network->addConvolutionNd(*conv114->getOutput(0), kNumAnchor * (kNumClass + 5), DimsHW{ 1, 1 }, weightMap["model.118.m.0.weight"], weightMap["model.118.m.0.bias"]); assert(cv105_0); cv105_0->setName("cv105.0"); IConvolutionLayer* cv105_1 = network->addConvolutionNd(*conv115->getOutput(0), kNumAnchor * (kNumClass + 5), DimsHW{ 1, 1 }, weightMap["model.118.m.1.weight"], weightMap["model.118.m.1.bias"]); assert(cv105_1); cv105_1->setName("cv105.1"); IConvolutionLayer* cv105_2 = network->addConvolutionNd(*conv116->getOutput(0), kNumAnchor * (kNumClass + 5), DimsHW{ 1, 1 }, weightMap["model.118.m.2.weight"], weightMap["model.118.m.2.bias"]); assert(cv105_2); cv105_2->setName("cv105.2"); IConvolutionLayer* cv105_3 = 
network->addConvolutionNd(*conv117->getOutput(0), kNumAnchor * (kNumClass + 5), DimsHW{ 1, 1 }, weightMap["model.118.m.3.weight"], weightMap["model.118.m.3.bias"]); assert(cv105_3); cv105_3->setName("cv105.3"); /*------------detect-----------*/ auto yolo = addYoLoLayer(network, weightMap, "model.118", std::vector{cv105_0, cv105_1, cv105_2, cv105_3}); yolo->getOutput(0)->setName(kOutputTensorName); network->markOutput(*yolo->getOutput(0)); // Build engine builder->setMaxBatchSize(maxBatchSize); config->setMaxWorkspaceSize(16 * (1 << 20)); // 16MB #if defined(USE_FP16) config->setFlag(BuilderFlag::kFP16); #elif defined(USE_INT8) std::cout << "Your platform support int8: " << (builder->platformHasFastInt8() ? "true" : "false") << std::endl; assert(builder->platformHasFastInt8()); config->setFlag(BuilderFlag::kINT8); Int8EntropyCalibrator2* calibrator = new Int8EntropyCalibrator2(1, kInputW, kInputH, "./coco_calib/", "int8calib.table", kInputTensorName); config->setInt8Calibrator(calibrator); #endif std::cout << "Building engine, please wait for a while..." << std::endl; IHostMemory* serialized_model = builder->buildSerializedNetwork(*network, *config); std::cout << "Build engine successfully!" 
<< std::endl; delete network; // Release host memory for (auto& mem : weightMap) { free((void*)(mem.second.values)); } return serialized_model; } IHostMemory* build_engine_yolov7x(unsigned int maxBatchSize,IBuilder* builder, IBuilderConfig* config, DataType dt, const std::string& wts_path) { std::map weightMap = loadWeights(wts_path); INetworkDefinition* network = builder->createNetworkV2(0U); ITensor* data = network->addInput(kInputTensorName, dt, Dims3{ 3, kInputH, kInputW }); assert(data); /*----------------------------------yolov7x backbone-----------------------------------------*/ IElementWiseLayer* conv0 = convBnSilu(network, weightMap, *data, 40, 3, 1, 1, "model.0"); IElementWiseLayer* conv1 = convBnSilu(network, weightMap, *conv0->getOutput(0), 80, 3, 2, 1, "model.1"); IElementWiseLayer* conv2 = convBnSilu(network, weightMap, *conv1->getOutput(0), 80, 3, 1, 1, "model.2"); IElementWiseLayer* conv3 = convBnSilu(network, weightMap, *conv2->getOutput(0), 160, 3, 2, 1, "model.3"); IElementWiseLayer* conv4 = convBnSilu(network, weightMap, *conv3->getOutput(0), 64, 1, 1, 0, "model.4"); IElementWiseLayer* conv5 = convBnSilu(network, weightMap, *conv3->getOutput(0), 64, 1, 1, 0, "model.5"); IElementWiseLayer* conv6 = convBnSilu(network, weightMap, *conv5->getOutput(0), 64, 3, 1, 1, "model.6"); IElementWiseLayer* conv7 = convBnSilu(network, weightMap, *conv6->getOutput(0), 64, 3, 1, 1, "model.7"); IElementWiseLayer* conv8 = convBnSilu(network, weightMap, *conv7->getOutput(0), 64, 3, 1, 1, "model.8"); IElementWiseLayer* conv9 = convBnSilu(network, weightMap, *conv8->getOutput(0), 64, 3, 1, 1, "model.9"); IElementWiseLayer* conv10 = convBnSilu(network, weightMap, *conv9->getOutput(0), 64, 3, 1, 1, "model.10"); IElementWiseLayer* conv11 = convBnSilu(network, weightMap, *conv10->getOutput(0), 64, 3, 1, 1, "model.11"); ITensor* input_tensor_12[] = { conv11->getOutput(0), conv9->getOutput(0), conv7->getOutput(0), conv5->getOutput(0), conv4->getOutput(0) }; 
IConcatenationLayer* concat12 = network->addConcatenation(input_tensor_12, 5); //concat9->setAxis(0); IElementWiseLayer* conv13 = convBnSilu(network, weightMap, *concat12->getOutput(0), 320, 1, 1, 0, "model.13"); IPoolingLayer* mp1 = network->addPoolingNd(*conv13->getOutput(0), PoolingType::kMAX, DimsHW{ 2, 2 }); mp1->setStrideNd(DimsHW{ 2, 2 }); IElementWiseLayer* conv15 = convBnSilu(network, weightMap, *mp1->getOutput(0), 160, 1, 1, 0, "model.15"); IElementWiseLayer* conv16 = convBnSilu(network, weightMap, *conv13->getOutput(0), 160, 1, 1, 0, "model.16"); IElementWiseLayer* conv17 = convBnSilu(network, weightMap, *conv16->getOutput(0), 160, 3, 2, 1, "model.17"); ITensor* input_tensor_18[] = { conv17->getOutput(0), conv15->getOutput(0) }; IConcatenationLayer* concat18 = network->addConcatenation(input_tensor_18, 2); //IConcatenationLayer* mp1 = MPC3(network, weightMap, *conv13->getOutput(0), 160, "model.15", "model.16", "model.17"); IElementWiseLayer* conv19 = convBnSilu(network, weightMap, *concat18->getOutput(0), 128, 1, 1, 0, "model.19"); IElementWiseLayer* conv20 = convBnSilu(network, weightMap, *concat18->getOutput(0), 128, 1, 1, 0, "model.20"); IElementWiseLayer* conv21 = convBnSilu(network, weightMap, *conv20->getOutput(0), 128, 3, 1, 1, "model.21"); IElementWiseLayer* conv22 = convBnSilu(network, weightMap, *conv21->getOutput(0), 128, 3, 1, 1, "model.22"); IElementWiseLayer* conv23 = convBnSilu(network, weightMap, *conv22->getOutput(0), 128, 3, 1, 1, "model.23"); IElementWiseLayer* conv24 = convBnSilu(network, weightMap, *conv23->getOutput(0), 128, 3, 1, 1, "model.24"); IElementWiseLayer* conv25 = convBnSilu(network, weightMap, *conv24->getOutput(0), 128, 3, 1, 1, "model.25"); IElementWiseLayer* conv26 = convBnSilu(network, weightMap, *conv25->getOutput(0), 128, 3, 1, 1, "model.26"); ITensor* input_tensor_27[] = { conv26->getOutput(0), conv24->getOutput(0), conv22->getOutput(0), conv20->getOutput(0),conv19->getOutput(0) }; IConcatenationLayer* concat27 = 
network->addConcatenation(input_tensor_27, 5); //concat9->setAxis(0); IElementWiseLayer* conv28 = convBnSilu(network, weightMap, *concat27->getOutput(0), 640, 1, 1, 0, "model.28"); IPoolingLayer* mp2 = network->addPoolingNd(*conv28->getOutput(0), PoolingType::kMAX, DimsHW{ 2, 2 }); mp2->setStrideNd(DimsHW{ 2, 2 }); IElementWiseLayer* conv30 = convBnSilu(network, weightMap, *mp2->getOutput(0), 320, 1, 1, 0, "model.30"); IElementWiseLayer* conv31 = convBnSilu(network, weightMap, *conv28->getOutput(0), 320, 1, 1, 0, "model.31"); IElementWiseLayer* conv32 = convBnSilu(network, weightMap, *conv31->getOutput(0), 320, 3, 2, 1, "model.32"); ITensor* input_tensor_33[] = { conv32->getOutput(0), conv30->getOutput(0) }; IConcatenationLayer* concat33 = network->addConcatenation(input_tensor_33, 2); //IConcatenationLayer* mp2 = MPC3(network, weightMap, *conv28->getOutput(0), 320, "model.30", "model.31", "model.32"); IElementWiseLayer* conv34 = convBnSilu(network, weightMap, *concat33->getOutput(0), 256, 1, 1, 0, "model.34"); IElementWiseLayer* conv35 = convBnSilu(network, weightMap, *concat33->getOutput(0), 256, 1, 1, 0, "model.35"); IElementWiseLayer* conv36 = convBnSilu(network, weightMap, *conv35->getOutput(0), 256, 3, 1, 1, "model.36"); IElementWiseLayer* conv37 = convBnSilu(network, weightMap, *conv36->getOutput(0), 256, 3, 1, 1, "model.37"); IElementWiseLayer* conv38 = convBnSilu(network, weightMap, *conv37->getOutput(0), 256, 3, 1, 1, "model.38"); IElementWiseLayer* conv39 = convBnSilu(network, weightMap, *conv38->getOutput(0), 256, 3, 1, 1, "model.39"); IElementWiseLayer* conv40 = convBnSilu(network, weightMap, *conv39->getOutput(0), 256, 3, 1, 1, "model.40"); IElementWiseLayer* conv41 = convBnSilu(network, weightMap, *conv40->getOutput(0), 256, 3, 1, 1, "model.41"); ITensor* input_tensor_42[] = { conv41->getOutput(0), conv39->getOutput(0), conv37->getOutput(0), conv35->getOutput(0),conv34->getOutput(0) }; IConcatenationLayer* concat42 = 
network->addConcatenation(input_tensor_42, 5); //concat9->setAxis(0); IElementWiseLayer* conv43 = convBnSilu(network, weightMap, *concat42->getOutput(0), 1280, 1, 1, 0, "model.43"); IPoolingLayer* mp3 = network->addPoolingNd(*conv43->getOutput(0), PoolingType::kMAX, DimsHW{ 2, 2 }); mp3->setStrideNd(DimsHW{ 2, 2 }); IElementWiseLayer* conv45 = convBnSilu(network, weightMap, *mp3->getOutput(0), 640, 1, 1, 0, "model.45"); IElementWiseLayer* conv46 = convBnSilu(network, weightMap, *conv43->getOutput(0), 640, 1, 1, 0, "model.46"); IElementWiseLayer* conv47 = convBnSilu(network, weightMap, *conv46->getOutput(0), 640, 3, 2, 1, "model.47"); ITensor* input_tensor_48[] = { conv47->getOutput(0), conv45->getOutput(0) }; IConcatenationLayer* concat48 = network->addConcatenation(input_tensor_48, 2); //IConcatenationLayer* mp3 = MPC3(network, weightMap, *conv43->getOutput(0), 640, "model.45", "model.46", "model.47"); IElementWiseLayer* conv49 = convBnSilu(network, weightMap, *concat48->getOutput(0), 256, 1, 1, 0, "model.49"); IElementWiseLayer* conv50 = convBnSilu(network, weightMap, *concat48->getOutput(0), 256, 1, 1, 0, "model.50"); IElementWiseLayer* conv51 = convBnSilu(network, weightMap, *conv50->getOutput(0), 256, 3, 1, 1, "model.51"); IElementWiseLayer* conv52 = convBnSilu(network, weightMap, *conv51->getOutput(0), 256, 3, 1, 1, "model.52"); IElementWiseLayer* conv53 = convBnSilu(network, weightMap, *conv52->getOutput(0), 256, 3, 1, 1, "model.53"); IElementWiseLayer* conv54 = convBnSilu(network, weightMap, *conv53->getOutput(0), 256, 3, 1, 1, "model.54"); IElementWiseLayer* conv55 = convBnSilu(network, weightMap, *conv54->getOutput(0), 256, 3, 1, 1, "model.55"); IElementWiseLayer* conv56 = convBnSilu(network, weightMap, *conv55->getOutput(0), 256, 3, 1, 1, "model.56"); ITensor* input_tensor_57[] = { conv56->getOutput(0), conv54->getOutput(0), conv52->getOutput(0), conv50->getOutput(0),conv49->getOutput(0) }; IConcatenationLayer* concat57 = 
network->addConcatenation(input_tensor_57, 5); //concat9->setAxis(0); IElementWiseLayer* conv58 = convBnSilu(network, weightMap, *concat57->getOutput(0), 1280, 1, 1, 0, "model.58"); //-----------------------yolov7 head--------------------------- //-----SPPCSPC----------- IElementWiseLayer* conv59 = SPPCSPC(network, weightMap, *conv58->getOutput(0), 640, "model.59"); IElementWiseLayer* conv60 = convBnSilu(network, weightMap, *conv59->getOutput(0), 320, 1, 1, 0, "model.60"); float scale[] = { 1.0, 2.0, 2.0 }; IResizeLayer* re61 = network->addResize(*conv60->getOutput(0)); re61->setResizeMode(ResizeMode::kNEAREST); re61->setScales(scale, 3); IElementWiseLayer* conv62 = convBnSilu(network, weightMap, *conv43->getOutput(0), 320, 1, 1, 0, "model.62"); ITensor* input_tensor_63[] = { conv62->getOutput(0), re61->getOutput(0) }; IConcatenationLayer* concat63 = network->addConcatenation(input_tensor_63, 2); //concat63->setAxis(0); IElementWiseLayer* conv64 = convBnSilu(network, weightMap, *concat63->getOutput(0), 256, 1, 1, 0, "model.64"); IElementWiseLayer* conv65 = convBnSilu(network, weightMap, *concat63->getOutput(0), 256, 1, 1, 0, "model.65"); IElementWiseLayer* conv66 = convBnSilu(network, weightMap, *conv65->getOutput(0), 256, 3, 1, 1, "model.66"); IElementWiseLayer* conv67 = convBnSilu(network, weightMap, *conv66->getOutput(0), 256, 3, 1, 1, "model.67"); IElementWiseLayer* conv68 = convBnSilu(network, weightMap, *conv67->getOutput(0), 256, 3, 1, 1, "model.68"); IElementWiseLayer* conv69 = convBnSilu(network, weightMap, *conv68->getOutput(0), 256, 3, 1, 1, "model.69"); IElementWiseLayer* conv70 = convBnSilu(network, weightMap, *conv69->getOutput(0), 256, 3, 1, 1, "model.70"); IElementWiseLayer* conv71 = convBnSilu(network, weightMap, *conv70->getOutput(0), 256, 3, 1, 1, "model.71"); ITensor* input_tensor_72[] = { conv71->getOutput(0), conv69->getOutput(0), conv67->getOutput(0), conv65->getOutput(0),conv64->getOutput(0) }; IConcatenationLayer* concat72 = 
network->addConcatenation(input_tensor_72, 5); //concat9->setAxis(0); IElementWiseLayer* conv73 = convBnSilu(network, weightMap, *concat72->getOutput(0), 320, 1, 1, 0, "model.73"); IElementWiseLayer* conv74 = convBnSilu(network, weightMap, *conv73->getOutput(0), 160, 1, 1, 0, "model.74"); IResizeLayer* re75 = network->addResize(*conv74->getOutput(0)); re75->setResizeMode(ResizeMode::kNEAREST); re75->setScales(scale, 3); IElementWiseLayer* conv76 = convBnSilu(network, weightMap, *conv28->getOutput(0), 160, 1, 1, 0, "model.76"); ITensor* input_tensor_77[] = { conv76->getOutput(0), re75->getOutput(0) }; IConcatenationLayer* concat77 = network->addConcatenation(input_tensor_77, 2); IElementWiseLayer* conv78 = convBnSilu(network, weightMap, *concat77->getOutput(0), 128, 1, 1, 0, "model.78"); IElementWiseLayer* conv79 = convBnSilu(network, weightMap, *concat77->getOutput(0), 128, 1, 1, 0, "model.79"); IElementWiseLayer* conv80 = convBnSilu(network, weightMap, *conv79->getOutput(0), 128, 3, 1, 1, "model.80"); IElementWiseLayer* conv81 = convBnSilu(network, weightMap, *conv80->getOutput(0), 128, 3, 1, 1, "model.81"); IElementWiseLayer* conv82 = convBnSilu(network, weightMap, *conv81->getOutput(0), 128, 3, 1, 1, "model.82"); IElementWiseLayer* conv83 = convBnSilu(network, weightMap, *conv82->getOutput(0), 128, 3, 1, 1, "model.83"); IElementWiseLayer* conv84 = convBnSilu(network, weightMap, *conv83->getOutput(0), 128, 3, 1, 1, "model.84"); IElementWiseLayer* conv85 = convBnSilu(network, weightMap, *conv84->getOutput(0), 128, 3, 1, 1, "model.85"); ITensor* input_tensor_86[] = { conv85->getOutput(0), conv83->getOutput(0), conv81->getOutput(0), conv79->getOutput(0),conv78->getOutput(0) }; IConcatenationLayer* concat86 = network->addConcatenation(input_tensor_86, 5); //concat9->setAxis(0); IElementWiseLayer* conv87 = convBnSilu(network, weightMap, *concat86->getOutput(0), 160, 1, 1, 0, "model.87"); IPoolingLayer* mp88 = network->addPoolingNd(*conv87->getOutput(0), 
PoolingType::kMAX, DimsHW{ 2, 2 }); mp88->setStrideNd(DimsHW{ 2, 2 }); IElementWiseLayer* conv89 = convBnSilu(network, weightMap, *mp88->getOutput(0), 160, 1, 1, 0, "model.89"); IElementWiseLayer* conv90 = convBnSilu(network, weightMap, *conv87->getOutput(0), 160, 1, 1, 0, "model.90"); IElementWiseLayer* conv91 = convBnSilu(network, weightMap, *conv90->getOutput(0), 160, 3, 2, 1, "model.91"); ITensor* input_tensor_92[] = { conv91->getOutput(0), conv89->getOutput(0),conv73->getOutput(0) }; IConcatenationLayer* concat92 = network->addConcatenation(input_tensor_92, 3); IElementWiseLayer* conv93 = convBnSilu(network, weightMap, *concat92->getOutput(0), 256, 1, 1, 0, "model.93"); IElementWiseLayer* conv94 = convBnSilu(network, weightMap, *concat92->getOutput(0), 256, 1, 1, 0, "model.94"); IElementWiseLayer* conv95 = convBnSilu(network, weightMap, *conv94->getOutput(0), 256, 3, 1, 1, "model.95"); IElementWiseLayer* conv96 = convBnSilu(network, weightMap, *conv95->getOutput(0), 256, 3, 1, 1, "model.96"); IElementWiseLayer* conv97 = convBnSilu(network, weightMap, *conv96->getOutput(0), 256, 3, 1, 1, "model.97"); IElementWiseLayer* conv98 = convBnSilu(network, weightMap, *conv97->getOutput(0), 256, 3, 1, 1, "model.98"); IElementWiseLayer* conv99 = convBnSilu(network, weightMap, *conv98->getOutput(0), 256, 3, 1, 1, "model.99"); IElementWiseLayer* conv100 = convBnSilu(network, weightMap, *conv99->getOutput(0), 256, 3, 1, 1, "model.100"); ITensor* input_tensor_101[] = { conv100->getOutput(0), conv98->getOutput(0), conv96->getOutput(0), conv94->getOutput(0),conv93->getOutput(0) }; IConcatenationLayer* concat101 = network->addConcatenation(input_tensor_101, 5); //concat9->setAxis(0); IElementWiseLayer* conv102 = convBnSilu(network, weightMap, *concat101->getOutput(0), 320, 1, 1, 0, "model.102"); IPoolingLayer* mp103 = network->addPoolingNd(*conv102->getOutput(0), PoolingType::kMAX, DimsHW{ 2, 2 }); mp103->setStrideNd(DimsHW{ 2, 2 }); IElementWiseLayer* conv104 = 
convBnSilu(network, weightMap, *mp103->getOutput(0), 320, 1, 1, 0, "model.104"); IElementWiseLayer* conv105 = convBnSilu(network, weightMap, *conv102->getOutput(0), 320, 1, 1, 0, "model.105"); IElementWiseLayer* conv106 = convBnSilu(network, weightMap, *conv105->getOutput(0), 320, 3, 2, 1, "model.106"); ITensor* input_tensor_107[] = { conv106->getOutput(0), conv104->getOutput(0),conv59->getOutput(0) }; IConcatenationLayer* concat107 = network->addConcatenation(input_tensor_107, 3); IElementWiseLayer* conv108 = convBnSilu(network, weightMap, *concat107->getOutput(0), 512, 1, 1, 0, "model.108"); IElementWiseLayer* conv109 = convBnSilu(network, weightMap, *concat107->getOutput(0), 512, 1, 1, 0, "model.109"); IElementWiseLayer* conv110 = convBnSilu(network, weightMap, *conv109->getOutput(0), 512, 3, 1, 1, "model.110"); IElementWiseLayer* conv111 = convBnSilu(network, weightMap, *conv110->getOutput(0), 512, 3, 1, 1, "model.111"); IElementWiseLayer* conv112 = convBnSilu(network, weightMap, *conv111->getOutput(0), 512, 3, 1, 1, "model.112"); IElementWiseLayer* conv113 = convBnSilu(network, weightMap, *conv112->getOutput(0), 512, 3, 1, 1, "model.113"); IElementWiseLayer* conv114 = convBnSilu(network, weightMap, *conv113->getOutput(0), 512, 3, 1, 1, "model.114"); IElementWiseLayer* conv115 = convBnSilu(network, weightMap, *conv114->getOutput(0), 512, 3, 1, 1, "model.115"); ITensor* input_tensor_116[] = { conv115->getOutput(0), conv113->getOutput(0), conv111->getOutput(0), conv109->getOutput(0),conv108->getOutput(0) }; IConcatenationLayer* concat116 = network->addConcatenation(input_tensor_116, 5); //concat9->setAxis(0); IElementWiseLayer* conv117 = convBnSilu(network, weightMap, *concat116->getOutput(0), 640, 1, 1, 0, "model.117"); IElementWiseLayer* con_0 = convBnSilu(network, weightMap, *conv87->getOutput(0), 320, 3, 1, 1, "model.118"); IElementWiseLayer* con_1 = convBnSilu(network, weightMap, *conv102->getOutput(0), 640, 3, 1, 1, "model.119"); IElementWiseLayer* con_2 = 
convBnSilu(network, weightMap, *conv117->getOutput(0), 1280, 3, 1, 1, "model.120"); /*----------------------------------yolov7 out-----------------------------------------*/ IConvolutionLayer* det0 = network->addConvolutionNd(*con_0->getOutput(0), kNumAnchor * (kNumClass + 5), DimsHW{ 1, 1 }, weightMap["model.121.m.0.weight"], weightMap["model.121.m.0.bias"]); assert(det0); det0->setName("det0"); IConvolutionLayer* det1 = network->addConvolutionNd(*con_1->getOutput(0), kNumAnchor * (kNumClass + 5), DimsHW{ 1, 1 }, weightMap["model.121.m.1.weight"], weightMap["model.121.m.1.bias"]); assert(det1); det1->setName("det1"); IConvolutionLayer* det2 = network->addConvolutionNd(*con_2->getOutput(0), kNumAnchor * (kNumClass + 5), DimsHW{ 1, 1 }, weightMap["model.121.m.2.weight"], weightMap["model.121.m.2.bias"]); assert(det2); det2->setName("det2"); auto yolo = addYoLoLayer(network, weightMap, "model.121", std::vector{det0, det1, det2}); yolo->getOutput(0)->setName(kOutputTensorName); network->markOutput(*yolo->getOutput(0)); builder->setMaxBatchSize(maxBatchSize); config->setMaxWorkspaceSize(16 * (1 << 20)); #if defined(USE_FP16) config->setFlag(BuilderFlag::kFP16); #elif defined(USE_INT8) std::cout << "Your platform support int8: " << (builder->platformHasFastInt8() ? "true" : "false") << std::endl; assert(builder->platformHasFastInt8()); config->setFlag(BuilderFlag::kINT8); Int8EntropyCalibrator2* calibrator = new Int8EntropyCalibrator2(1, kInputW, kInputH, "./coco_calib/", "int8calib.table", kInputTensorName); config->setInt8Calibrator(calibrator); #endif std::cout << "Building engine, please wait for a while..." << std::endl; IHostMemory* serialized_model = builder->buildSerializedNetwork(*network, *config); std::cout << "Build engine successfully!" 
<< std::endl; // Don't need the network any more delete network; // Release host memory for (auto& mem : weightMap) { free((void*)(mem.second.values)); } return serialized_model; } IHostMemory* build_engine_yolov7(unsigned int maxBatchSize,IBuilder* builder, IBuilderConfig* config, DataType dt, const std::string& wts_path) { std::map weightMap = loadWeights(wts_path); INetworkDefinition* network = builder->createNetworkV2(0U); ITensor* data = network->addInput(kInputTensorName, dt, Dims3{ 3, kInputH, kInputW }); assert(data); /*----------------------------------yolov7 backbone-----------------------------------------*/ IElementWiseLayer* conv0 = convBnSilu(network, weightMap, *data, 32, 3, 1, 1, "model.0"); IElementWiseLayer* conv1 = convBnSilu(network, weightMap, *conv0->getOutput(0), 64, 3, 2, 1, "model.1"); IElementWiseLayer* conv2 = convBnSilu(network, weightMap, *conv1->getOutput(0), 64, 3, 1, 1, "model.2"); IElementWiseLayer* conv3 = convBnSilu(network, weightMap, *conv2->getOutput(0), 128, 3, 2, 1, "model.3"); IElementWiseLayer* conv4 = convBnSilu(network, weightMap, *conv3->getOutput(0), 64, 1, 1, 0, "model.4"); IElementWiseLayer* conv5 = convBnSilu(network, weightMap, *conv3->getOutput(0), 64, 1, 1, 0, "model.5"); IElementWiseLayer* conv6 = convBnSilu(network, weightMap, *conv5->getOutput(0), 64, 3, 1, 1, "model.6"); IElementWiseLayer* conv7 = convBnSilu(network, weightMap, *conv6->getOutput(0), 64, 3, 1, 1, "model.7"); IElementWiseLayer* conv8 = convBnSilu(network, weightMap, *conv7->getOutput(0), 64, 3, 1, 1, "model.8"); IElementWiseLayer* conv9 = convBnSilu(network, weightMap, *conv8->getOutput(0), 64, 3, 1, 1, "model.9"); ITensor* input_tensor_10[] = { conv9->getOutput(0), conv7->getOutput(0), conv5->getOutput(0), conv4->getOutput(0) }; IConcatenationLayer* concat10 = network->addConcatenation(input_tensor_10, 4); concat10->setAxis(0); IElementWiseLayer* conv11 = convBnSilu(network, weightMap, *concat10->getOutput(0), 256, 1, 1, 0, "model.11"); 
IPoolingLayer* mp12 = network->addPoolingNd(*conv11->getOutput(0), PoolingType::kMAX, DimsHW{ 2, 2 }); mp12->setStrideNd(DimsHW{ 2, 2 }); IElementWiseLayer* conv13 = convBnSilu(network, weightMap, *mp12->getOutput(0), 128, 1, 1, 0, "model.13"); IElementWiseLayer* conv14 = convBnSilu(network, weightMap, *conv11->getOutput(0), 128, 1, 1, 0, "model.14"); IElementWiseLayer* conv15 = convBnSilu(network, weightMap, *conv14->getOutput(0), 128, 3, 2, 1, "model.15"); ITensor* input_tensor_16[] = { conv15->getOutput(0), conv13->getOutput(0) }; IConcatenationLayer* concat16 = network->addConcatenation(input_tensor_16, 2); IElementWiseLayer* conv17 = convBnSilu(network, weightMap, *concat16->getOutput(0), 128, 1, 1, 0, "model.17"); IElementWiseLayer* conv18 = convBnSilu(network, weightMap, *concat16->getOutput(0), 128, 1, 1, 0, "model.18"); IElementWiseLayer* conv19 = convBnSilu(network, weightMap, *conv18->getOutput(0), 128, 3, 1, 1, "model.19"); IElementWiseLayer* conv20 = convBnSilu(network, weightMap, *conv19->getOutput(0), 128, 3, 1, 1, "model.20"); IElementWiseLayer* conv21 = convBnSilu(network, weightMap, *conv20->getOutput(0), 128, 3, 1, 1, "model.21"); IElementWiseLayer* conv22 = convBnSilu(network, weightMap, *conv21->getOutput(0), 128, 3, 1, 1, "model.22"); ITensor* input_tensor_23[] = { conv22->getOutput(0), conv20->getOutput(0), conv18->getOutput(0), conv17->getOutput(0) }; IConcatenationLayer* concat23 = network->addConcatenation(input_tensor_23, 4); concat23->setAxis(0); IElementWiseLayer* conv24 = convBnSilu(network, weightMap, *concat23->getOutput(0), 512, 1, 1, 0, "model.24"); IPoolingLayer* mp25 = network->addPoolingNd(*conv24->getOutput(0), PoolingType::kMAX, DimsHW{ 2, 2 }); mp25->setStrideNd(DimsHW{ 2, 2 }); IElementWiseLayer* conv26 = convBnSilu(network, weightMap, *mp25->getOutput(0), 256, 1, 1, 0, "model.26"); IElementWiseLayer* conv27 = convBnSilu(network, weightMap, *conv24->getOutput(0), 256, 1, 1, 0, "model.27"); IElementWiseLayer* conv28 = 
convBnSilu(network, weightMap, *conv27->getOutput(0), 256, 3, 2, 1, "model.28"); ITensor* input_tensor_29[] = { conv28->getOutput(0), conv26->getOutput(0) }; IConcatenationLayer* concat29 = network->addConcatenation(input_tensor_29, 2); IElementWiseLayer* conv30 = convBnSilu(network, weightMap, *concat29->getOutput(0), 256, 1, 1, 0, "model.30"); IElementWiseLayer* conv31 = convBnSilu(network, weightMap, *concat29->getOutput(0), 256, 1, 1, 0, "model.31"); IElementWiseLayer* conv32 = convBnSilu(network, weightMap, *conv31->getOutput(0), 256, 3, 1, 1, "model.32"); IElementWiseLayer* conv33 = convBnSilu(network, weightMap, *conv32->getOutput(0), 256, 3, 1, 1, "model.33"); IElementWiseLayer* conv34 = convBnSilu(network, weightMap, *conv33->getOutput(0), 256, 3, 1, 1, "model.34"); IElementWiseLayer* conv35 = convBnSilu(network, weightMap, *conv34->getOutput(0), 256, 3, 1, 1, "model.35"); ITensor* input_tensor_36[] = { conv35->getOutput(0), conv33->getOutput(0), conv31->getOutput(0), conv30->getOutput(0) }; IConcatenationLayer* concat36 = network->addConcatenation(input_tensor_36, 4); concat36->setAxis(0); IElementWiseLayer* conv37 = convBnSilu(network, weightMap, *concat36->getOutput(0), 1024, 1, 1, 0, "model.37"); IPoolingLayer* mp38 = network->addPoolingNd(*conv37->getOutput(0), PoolingType::kMAX, DimsHW{ 2, 2 }); mp38->setStrideNd(DimsHW{ 2, 2 }); IElementWiseLayer* conv39 = convBnSilu(network, weightMap, *mp38->getOutput(0), 512, 1, 1, 0, "model.39"); IElementWiseLayer* conv40 = convBnSilu(network, weightMap, *conv37->getOutput(0), 512, 1, 1, 0, "model.40"); IElementWiseLayer* conv41 = convBnSilu(network, weightMap, *conv40->getOutput(0), 512, 3, 2, 1, "model.41"); ITensor* input_tensor_42[] = { conv41->getOutput(0), conv39->getOutput(0) }; IConcatenationLayer* concat42 = network->addConcatenation(input_tensor_42, 2); concat42->setAxis(0); IElementWiseLayer* conv43 = convBnSilu(network, weightMap, *concat42->getOutput(0), 256, 1, 1, 0, "model.43"); IElementWiseLayer* 
conv44 = convBnSilu(network, weightMap, *concat42->getOutput(0), 256, 1, 1, 0, "model.44"); IElementWiseLayer* conv45 = convBnSilu(network, weightMap, *conv44->getOutput(0), 256, 3, 1, 1, "model.45"); IElementWiseLayer* conv46 = convBnSilu(network, weightMap, *conv45->getOutput(0), 256, 3, 1, 1, "model.46"); IElementWiseLayer* conv47 = convBnSilu(network, weightMap, *conv46->getOutput(0), 256, 3, 1, 1, "model.47"); IElementWiseLayer* conv48 = convBnSilu(network, weightMap, *conv47->getOutput(0), 256, 3, 1, 1, "model.48"); ITensor* input_tensor_49[] = { conv48->getOutput(0), conv46->getOutput(0), conv44->getOutput(0), conv43->getOutput(0) }; IConcatenationLayer* concat49 = network->addConcatenation(input_tensor_49, 4); concat49->setAxis(0); IElementWiseLayer* conv50 = convBnSilu(network, weightMap, *concat49->getOutput(0), 1024, 1, 1, 0, "model.50"); /*----------------------------------yolov7 head-----------------------------------------*/ IElementWiseLayer* conv51 = SPPCSPC(network, weightMap, *conv50->getOutput(0), 512, "model.51"); IElementWiseLayer* conv52 = convBnSilu(network, weightMap, *conv51->getOutput(0), 256, 1, 1, 0, "model.52"); float scale[] = { 1.0, 2.0, 2.0 }; IResizeLayer* re53 = network->addResize(*conv52->getOutput(0)); re53->setResizeMode(ResizeMode::kNEAREST); re53->setScales(scale, 3); IElementWiseLayer* conv54 = convBnSilu(network, weightMap, *conv37->getOutput(0), 256, 1, 1, 0, "model.54"); ITensor* input_tensor_55[] = { conv54->getOutput(0), re53->getOutput(0) }; IConcatenationLayer* concat55 = network->addConcatenation(input_tensor_55, 2); concat55->setAxis(0); IElementWiseLayer* conv56 = convBnSilu(network, weightMap, *concat55->getOutput(0), 256, 1, 1, 0, "model.56"); IElementWiseLayer* conv57 = convBnSilu(network, weightMap, *concat55->getOutput(0), 256, 1, 1, 0, "model.57"); IElementWiseLayer* conv58 = convBnSilu(network, weightMap, *conv57->getOutput(0), 128, 3, 1, 1, "model.58"); IElementWiseLayer* conv59 = convBnSilu(network, 
weightMap, *conv58->getOutput(0), 128, 3, 1, 1, "model.59"); IElementWiseLayer* conv60 = convBnSilu(network, weightMap, *conv59->getOutput(0), 128, 3, 1, 1, "model.60"); IElementWiseLayer* conv61 = convBnSilu(network, weightMap, *conv60->getOutput(0), 128, 3, 1, 1, "model.61"); ITensor* input_tensor_62[] = { conv61->getOutput(0), conv60->getOutput(0), conv59->getOutput(0), conv58->getOutput(0), conv57->getOutput(0), conv56->getOutput(0) }; IConcatenationLayer* concat62 = network->addConcatenation(input_tensor_62, 6); concat62->setAxis(0); IElementWiseLayer* conv63 = convBnSilu(network, weightMap, *concat62->getOutput(0), 256, 1, 1, 0, "model.63"); IElementWiseLayer* conv64 = convBnSilu(network, weightMap, *conv63->getOutput(0), 128, 1, 1, 0, "model.64"); IResizeLayer* re65 = network->addResize(*conv64->getOutput(0)); re65->setResizeMode(ResizeMode::kNEAREST); re65->setScales(scale, 3); IElementWiseLayer* conv66 = convBnSilu(network, weightMap, *conv24->getOutput(0), 128, 1, 1, 0, "model.66"); ITensor* input_tensor_67[] = { conv66->getOutput(0), re65->getOutput(0) }; IConcatenationLayer* concat67 = network->addConcatenation(input_tensor_67, 2); concat67->setAxis(0); IElementWiseLayer* conv68 = convBnSilu(network, weightMap, *concat67->getOutput(0), 128, 1, 1, 0, "model.68"); IElementWiseLayer* conv69 = convBnSilu(network, weightMap, *concat67->getOutput(0), 128, 1, 1, 0, "model.69"); IElementWiseLayer* conv70 = convBnSilu(network, weightMap, *conv69->getOutput(0), 64, 3, 1, 1, "model.70"); IElementWiseLayer* conv71 = convBnSilu(network, weightMap, *conv70->getOutput(0), 64, 3, 1, 1, "model.71"); IElementWiseLayer* conv72 = convBnSilu(network, weightMap, *conv71->getOutput(0), 64, 3, 1, 1, "model.72"); IElementWiseLayer* conv73 = convBnSilu(network, weightMap, *conv72->getOutput(0), 64, 3, 1, 1, "model.73"); ITensor* input_tensor_74[] = { conv73->getOutput(0), conv72->getOutput(0), conv71->getOutput(0), conv70->getOutput(0), conv69->getOutput(0), conv68->getOutput(0) 
}; IConcatenationLayer* concat74 = network->addConcatenation(input_tensor_74, 6); concat74->setAxis(0); IElementWiseLayer* conv75 = convBnSilu(network, weightMap, *concat74->getOutput(0), 128, 1, 1, 0, "model.75"); IPoolingLayer* mp76 = network->addPoolingNd(*conv75->getOutput(0), PoolingType::kMAX, DimsHW{ 2, 2 }); mp76->setStrideNd(DimsHW{ 2, 2 }); IElementWiseLayer* conv77 = convBnSilu(network, weightMap, *mp76->getOutput(0), 128, 1, 1, 0, "model.77"); IElementWiseLayer* conv78 = convBnSilu(network, weightMap, *conv75->getOutput(0), 128, 1, 1, 0, "model.78"); IElementWiseLayer* conv79 = convBnSilu(network, weightMap, *conv78->getOutput(0), 128, 3, 2, 1, "model.79"); ITensor* input_tensor_80[] = { conv79->getOutput(0), conv77->getOutput(0), conv63->getOutput(0) }; IConcatenationLayer* concat80 = network->addConcatenation(input_tensor_80, 3); concat80->setAxis(0); IElementWiseLayer* conv81 = convBnSilu(network, weightMap, *concat80->getOutput(0), 256, 1, 1, 0, "model.81"); IElementWiseLayer* conv82 = convBnSilu(network, weightMap, *concat80->getOutput(0), 256, 1, 1, 0, "model.82"); IElementWiseLayer* conv83 = convBnSilu(network, weightMap, *conv82->getOutput(0), 128, 3, 1, 1, "model.83"); IElementWiseLayer* conv84 = convBnSilu(network, weightMap, *conv83->getOutput(0), 128, 3, 1, 1, "model.84"); IElementWiseLayer* conv85 = convBnSilu(network, weightMap, *conv84->getOutput(0), 128, 3, 1, 1, "model.85"); IElementWiseLayer* conv86 = convBnSilu(network, weightMap, *conv85->getOutput(0), 128, 3, 1, 1, "model.86"); ITensor* input_tensor_87[] = { conv86->getOutput(0), conv85->getOutput(0), conv84->getOutput(0), conv83->getOutput(0), conv82->getOutput(0), conv81->getOutput(0) }; IConcatenationLayer* concat87 = network->addConcatenation(input_tensor_87, 6); concat87->setAxis(0); IElementWiseLayer* conv88 = convBnSilu(network, weightMap, *concat87->getOutput(0), 256, 1, 1, 0, "model.88"); IPoolingLayer* mp89 = network->addPoolingNd(*conv88->getOutput(0), PoolingType::kMAX, 
DimsHW{ 2, 2 }); mp89->setStrideNd(DimsHW{ 2, 2 }); IElementWiseLayer* conv90 = convBnSilu(network, weightMap, *mp89->getOutput(0), 256, 1, 1, 0, "model.90"); IElementWiseLayer* conv91 = convBnSilu(network, weightMap, *conv88->getOutput(0), 256, 1, 1, 0, "model.91"); IElementWiseLayer* conv92 = convBnSilu(network, weightMap, *conv91->getOutput(0), 256, 3, 2, 1, "model.92"); ITensor* input_tensor_93[] = { conv92->getOutput(0), conv90->getOutput(0), conv51->getOutput(0) }; IConcatenationLayer* concat93 = network->addConcatenation(input_tensor_93, 3); concat93->setAxis(0); IElementWiseLayer* conv94 = convBnSilu(network, weightMap, *concat93->getOutput(0), 512, 1, 1, 0, "model.94"); IElementWiseLayer* conv95 = convBnSilu(network, weightMap, *concat93->getOutput(0), 512, 1, 1, 0, "model.95"); IElementWiseLayer* conv96 = convBnSilu(network, weightMap, *conv95->getOutput(0), 256, 3, 1, 1, "model.96"); IElementWiseLayer* conv97 = convBnSilu(network, weightMap, *conv96->getOutput(0), 256, 3, 1, 1, "model.97"); IElementWiseLayer* conv98 = convBnSilu(network, weightMap, *conv97->getOutput(0), 256, 3, 1, 1, "model.98"); IElementWiseLayer* conv99 = convBnSilu(network, weightMap, *conv98->getOutput(0), 256, 3, 1, 1, "model.99"); ITensor* input_tensor_100[] = { conv99->getOutput(0), conv98->getOutput(0), conv97->getOutput(0), conv96->getOutput(0), conv95->getOutput(0), conv94->getOutput(0) }; IConcatenationLayer* concat100 = network->addConcatenation(input_tensor_100, 6); concat100->setAxis(0); IElementWiseLayer* conv101 = convBnSilu(network, weightMap, *concat100->getOutput(0), 512, 1, 1, 0, "model.101"); IElementWiseLayer* conv102 = RepConv(network, weightMap, *conv75->getOutput(0), 256, 3, 1, "model.102"); IElementWiseLayer* conv103 = RepConv(network, weightMap, *conv88->getOutput(0), 512, 3, 1, "model.103"); IElementWiseLayer* conv104 = RepConv(network, weightMap, *conv101->getOutput(0), 1024, 3, 1, "model.104"); /*----------------------------------yolov7 
out-----------------------------------------*/ IConvolutionLayer* cv105_0 = network->addConvolutionNd(*conv102->getOutput(0), kNumAnchor * (kNumClass + 5), DimsHW{ 1, 1 }, weightMap["model.105.m.0.weight"], weightMap["model.105.m.0.bias"]); assert(cv105_0); cv105_0->setName("cv105.0"); IConvolutionLayer* cv105_1 = network->addConvolutionNd(*conv103->getOutput(0), kNumAnchor * (kNumClass + 5), DimsHW{ 1, 1 }, weightMap["model.105.m.1.weight"], weightMap["model.105.m.1.bias"]); assert(cv105_1); cv105_1->setName("cv105.1"); IConvolutionLayer* cv105_2 = network->addConvolutionNd(*conv104->getOutput(0), kNumAnchor * (kNumClass + 5), DimsHW{ 1, 1 }, weightMap["model.105.m.2.weight"], weightMap["model.105.m.2.bias"]); assert(cv105_2); cv105_2->setName("cv105.2"); auto yolo = addYoLoLayer(network, weightMap, "model.105", std::vector{cv105_0, cv105_1, cv105_2}); yolo->getOutput(0)->setName(kOutputTensorName); network->markOutput(*yolo->getOutput(0)); builder->setMaxBatchSize(maxBatchSize); config->setMaxWorkspaceSize(16 * (1 << 20)); #if defined(USE_FP16) config->setFlag(BuilderFlag::kFP16); #elif defined(USE_INT8) std::cout << "Your platform support int8: " << (builder->platformHasFastInt8() ? "true" : "false") << std::endl; assert(builder->platformHasFastInt8()); config->setFlag(BuilderFlag::kINT8); Int8EntropyCalibrator2* calibrator = new Int8EntropyCalibrator2(1, kInputW, kInputH, "./coco_calib/", "int8calib.table", kInputTensorName); config->setInt8Calibrator(calibrator); #endif std::cout << "Building engine, please wait for a while..." << std::endl; IHostMemory* serialized_model = builder->buildSerializedNetwork(*network, *config); std::cout << "Build engine successfully!" 
<< std::endl; // Don't need the network any more delete network; // Release host memory for (auto& mem : weightMap) { free((void*)(mem.second.values)); } return serialized_model; } IHostMemory* build_engine_yolov7_tiny(unsigned int maxBatchSize, IBuilder* builder, IBuilderConfig* config, DataType dt, std::string& wts_name) { INetworkDefinition* network = builder->createNetworkV2(0U); ITensor* data = network->addInput(kInputTensorName, dt, Dims3{ 3, kInputH, kInputW }); assert(data); std::map weightMap = loadWeights(wts_name); /* ------ yolov7-tiny backbone------ */ // [32, 3, 2, None, 1, nn.LeakyReLU(0.1)]]---> outch、ksize、stride、padding、groups------ auto conv0 = convBlockLeakRelu(network, weightMap, *data, 32, 3, 2, 1, "model.0"); assert(conv0); // [-1, 1, Conv, [64, 3, 2, None, 1, nn.LeakyReLU(0.1)]], # 1-P2/4 auto conv1 = convBlockLeakRelu(network, weightMap, *conv0->getOutput(0), 64, 3, 2, 1, "model.1"); assert(conv1); // [-1, 1, Conv, [32, 1, 1, None, 1, nn.LeakyReLU(0.1)]], auto conv2 = convBlockLeakRelu(network, weightMap, *conv1->getOutput(0), 32, 1, 1, 0, "model.2"); assert(conv2); // [-2, 1, Conv, [32, 1, 1, None, 1, nn.LeakyReLU(0.1)]], auto conv3 = convBlockLeakRelu(network, weightMap, *conv1->getOutput(0), 32, 1, 1, 0, "model.3"); assert(conv3); // [-1, 1, Conv, [32, 3, 1, None, 1, nn.LeakyReLU(0.1)]], auto conv4 = convBlockLeakRelu(network, weightMap, *conv3->getOutput(0), 32, 3, 1, 1, "model.4"); assert(conv4); // [-1, 1, Conv, [32, 3, 1, None, 1, nn.LeakyReLU(0.1)]], auto conv5 = convBlockLeakRelu(network, weightMap, *conv4->getOutput(0), 32, 3, 1, 1, "model.5"); assert(conv5); ITensor* input_tensor_6[] = { conv5->getOutput(0), conv4->getOutput(0), conv3->getOutput(0), conv2->getOutput(0) }; auto cat6 = network->addConcatenation(input_tensor_6, 4); //cat6->setAxis(0); // [-1, 1, Conv, [64, 1, 1, None, 1, nn.LeakyReLU(0.1)]], # 7 auto conv7 = convBlockLeakRelu(network, weightMap, *cat6->getOutput(0), 64, 1, 1, 0, "model.7"); assert(conv7); auto* 
pool8 = network->addPoolingNd(*conv7->getOutput(0), PoolingType::kMAX, DimsHW{ 2, 2 }); assert(pool8); pool8->setStrideNd(DimsHW{ 2, 2 }); //[-1, 1, Conv, [64, 1, 1, None, 1, nn.LeakyReLU(0.1)]] , auto conv9 = convBlockLeakRelu(network, weightMap, *pool8->getOutput(0), 64, 1, 1, 0, "model.9"); assert(conv9); // [-2, 1, Conv, [64, 1, 1, None, 1, nn.LeakyReLU(0.1)]], auto conv10 = convBlockLeakRelu(network, weightMap, *pool8->getOutput(0), 64, 1, 1, 0, "model.10"); assert(conv10); //[-1, 1, Conv, [64, 3, 1, None, 1, nn.LeakyReLU(0.1)]], auto conv11 = convBlockLeakRelu(network, weightMap, *conv10->getOutput(0), 64, 3, 1, 1, "model.11"); assert(conv11); //[-1, 1, Conv, [64, 3, 1, None, 1, nn.LeakyReLU(0.1)]], auto conv12 = convBlockLeakRelu(network, weightMap, *conv11->getOutput(0), 64, 3, 1, 1, "model.12"); assert(conv12); ITensor* input_tensor_13[] = { conv12->getOutput(0), conv11->getOutput(0), conv10->getOutput(0), conv9->getOutput(0) }; auto cat13 = network->addConcatenation(input_tensor_13, 4); //cat2->setAxis(0); // [-1, 1, Conv, [128, 1, 1, None, 1, nn.LeakyReLU(0.1)]], # 14 auto conv14 = convBlockLeakRelu(network, weightMap, *cat13->getOutput(0), 128, 1, 1, 0, "model.14"); assert(conv14); auto* pool15 = network->addPoolingNd(*conv14->getOutput(0), PoolingType::kMAX, DimsHW{ 2, 2 }); assert(pool15); pool15->setStrideNd(DimsHW{ 2, 2 }); // [-1, 1, Conv, [128, 1, 1, None, 1, nn.LeakyReLU(0.1)]], auto conv16 = convBlockLeakRelu(network, weightMap, *pool15->getOutput(0), 128, 1, 1, 0, "model.16"); assert(conv16); //[-2, 1, Conv, [128, 1, 1, None, 1, nn.LeakyReLU(0.1)]], auto conv17 = convBlockLeakRelu(network, weightMap, *pool15->getOutput(0), 128, 1, 1, 0, "model.17"); assert(conv17); //[-1, 1, Conv, [128, 3, 1, None, 1, nn.LeakyReLU(0.1)]], auto conv18 = convBlockLeakRelu(network, weightMap, *conv17->getOutput(0), 128, 3, 1, 1, "model.18"); assert(conv18); // [-1, 1, Conv, [128, 3, 1, None, 1, nn.LeakyReLU(0.1)]], auto conv19 = convBlockLeakRelu(network, 
weightMap, *conv18->getOutput(0), 128, 3, 1, 1, "model.19"); assert(conv19); ITensor* input_tensor_20[] = { conv19->getOutput(0), conv18->getOutput(0), conv17->getOutput(0), conv16->getOutput(0) }; auto cat20 = network->addConcatenation(input_tensor_20, 4); //cat20->setAxis(0); //[-1, 1, Conv, [256, 1, 1, None, 1, nn.LeakyReLU(0.1)]], # 21 auto conv21 = convBlockLeakRelu(network, weightMap, *cat20->getOutput(0), 256, 1, 1, 0, "model.21"); assert(conv21); auto* pool22 = network->addPoolingNd(*conv21->getOutput(0), PoolingType::kMAX, DimsHW{ 2, 2 }); assert(pool22); pool22->setStrideNd(DimsHW{ 2, 2 }); // [-1, 1, Conv, [256, 1, 1, None, 1, nn.LeakyReLU(0.1)]], auto conv23 = convBlockLeakRelu(network, weightMap, *pool22->getOutput(0), 256, 1, 1, 0, "model.23"); assert(conv23); // [-2, 1, Conv, [256, 1, 1, None, 1, nn.LeakyReLU(0.1)]], auto conv24 = convBlockLeakRelu(network, weightMap, *pool22->getOutput(0), 256, 1, 1, 0, "model.24"); assert(conv24); // [-1, 1, Conv, [256, 3, 1, None, 1, nn.LeakyReLU(0.1)]], auto conv25 = convBlockLeakRelu(network, weightMap, *conv24->getOutput(0), 256, 3, 1, 1, "model.25"); assert(conv25); // [-1, 1, Conv, [256, 3, 1, None, 1, nn.LeakyReLU(0.1)]], auto conv26 = convBlockLeakRelu(network, weightMap, *conv25->getOutput(0), 256, 3, 1, 1, "model.26"); assert(conv26); ITensor* input_tensor_27[] = { conv26->getOutput(0), conv25->getOutput(0), conv24->getOutput(0), conv23->getOutput(0) }; auto cat27 = network->addConcatenation(input_tensor_27, 4); //cat27->setAxis(0); // [-1, 1, Conv, [512, 1, 1, None, 1, nn.LeakyReLU(0.1)]], # 28 auto conv28 = convBlockLeakRelu(network, weightMap, *cat27->getOutput(0), 512, 1, 1, 0, "model.28"); assert(conv28); /*===============================yolov7-tiny head======================================*/ // [-1, 1, Conv, [256, 1, 1, None, 1, nn.LeakyReLU(0.1)]] auto conv29 = convBlockLeakRelu(network, weightMap, *conv28->getOutput(0), 256, 1, 1, 0, "model.29"); assert(conv29); // [-2, 1, Conv, [256, 1, 1, None, 
1, nn.LeakyReLU(0.1)]], auto conv30 = convBlockLeakRelu(network, weightMap, *conv28->getOutput(0), 256, 1, 1, 0, "model.30"); assert(conv30); //[-1, 1, SP, [5]], auto* pool31 = network->addPoolingNd(*conv30->getOutput(0), PoolingType::kMAX, DimsHW{ 5, 5 }); assert(pool31); pool31->setStrideNd(DimsHW{ 1, 1 }); pool31->setPaddingNd(DimsHW{ 2, 2 }); // [-2, 1, SP, [9]], auto* pool32 = network->addPoolingNd(*conv30->getOutput(0), PoolingType::kMAX, DimsHW{ 9, 9 }); assert(pool32); pool32->setStrideNd(DimsHW{ 1, 1 }); pool32->setPaddingNd(DimsHW{ 4, 4 }); // [-3, 1, SP, [13]], auto* pool33 = network->addPoolingNd(*conv30->getOutput(0), PoolingType::kMAX, DimsHW{ 13, 13 }); assert(pool33); pool33->setStrideNd(DimsHW{ 1, 1 }); pool33->setPaddingNd(DimsHW{ 6, 6 }); ITensor* input_tensor_34[] = { pool33->getOutput(0), pool32->getOutput(0), pool31->getOutput(0), conv30->getOutput(0) }; auto cat34 = network->addConcatenation(input_tensor_34, 4); //cat34->setAxis(0); // [-1, 1, Conv, [256, 1, 1, None, 1, nn.LeakyReLU(0.1)]], auto conv35 = convBlockLeakRelu(network, weightMap, *cat34->getOutput(0), 256, 1, 1, 0, "model.35"); assert(conv35); ITensor* input_tensor_36[] = { conv35->getOutput(0), conv29->getOutput(0) }; auto cat36 = network->addConcatenation(input_tensor_36, 2); //cat36->setAxis(0); // [-1, 1, Conv, [256, 1, 1, None, 1, nn.LeakyReLU(0.1)]], # 37 auto conv37 = convBlockLeakRelu(network, weightMap, *cat36->getOutput(0), 256, 1, 1, 0, "model.37"); assert(conv37); // [-1, 1, Conv, [128, 1, 1, None, 1, nn.LeakyReLU(0.1)]], auto conv38 = convBlockLeakRelu(network, weightMap, *conv37->getOutput(0), 128, 1, 1, 0, "model.38"); assert(conv38); float scale[] = { 1.0, 2.0, 2.0 }; IResizeLayer* resize39 = network->addResize(*conv38->getOutput(0)); resize39->setResizeMode(ResizeMode::kNEAREST); resize39->setScales(scale, 3); // [21, 1, Conv, [128, 1, 1, None, 1, nn.LeakyReLU(0.1)]], # route backbone P4 ---->conv16 auto conv40 = convBlockLeakRelu(network, weightMap, 
*conv21->getOutput(0), 128, 1, 1, 0, "model.40"); assert(conv40); ITensor* input_tensor_41[] = { conv40->getOutput(0), resize39->getOutput(0) }; auto cat41 = network->addConcatenation(input_tensor_41, 2); //cat41->setAxis(0); // [-1, 1, Conv, [64, 1, 1, None, 1, nn.LeakyReLU(0.1)]], auto conv42 = convBlockLeakRelu(network, weightMap, *cat41->getOutput(0), 64, 1, 1, 0, "model.42"); assert(conv42); //[-2, 1, Conv, [64, 1, 1, None, 1, nn.LeakyReLU(0.1)]], auto conv43 = convBlockLeakRelu(network, weightMap, *cat41->getOutput(0), 64, 1, 1, 0, "model.43"); assert(conv43); // [-1, 1, Conv, [64, 3, 1, None, 1, nn.LeakyReLU(0.1)]], auto conv44 = convBlockLeakRelu(network, weightMap, *conv43->getOutput(0), 64, 3, 1, 1, "model.44"); assert(conv44); // [-1, 1, Conv, [64, 3, 1, None, 1, nn.LeakyReLU(0.1)]], auto conv45 = convBlockLeakRelu(network, weightMap, *conv44->getOutput(0), 64, 3, 1, 1, "model.45"); assert(conv45); ITensor* input_tensor_46[] = { conv45->getOutput(0), conv44->getOutput(0), conv43->getOutput(0), conv42->getOutput(0) }; auto cat46 = network->addConcatenation(input_tensor_46, 4); //cat46->setAxis(0); // [-1, 1, Conv, [128, 1, 1, None, 1, nn.LeakyReLU(0.1)]], # 47 auto conv47 = convBlockLeakRelu(network, weightMap, *cat46->getOutput(0), 128, 1, 1, 0, "model.47"); assert(conv47); // [-1, 1, Conv, [64, 1, 1, None, 1, nn.LeakyReLU(0.1)]], auto conv48 = convBlockLeakRelu(network, weightMap, *conv47->getOutput(0), 64, 1, 1, 0, "model.48"); assert(conv48); IResizeLayer* resize49 = network->addResize(*conv48->getOutput(0)); resize49->setResizeMode(ResizeMode::kNEAREST); resize49->setScales(scale, 3); // [14, 1, Conv, [64, 1, 1, None, 1, nn.LeakyReLU(0.1)]], # route backbone P3 conv11 auto conv50 = convBlockLeakRelu(network, weightMap, *conv14->getOutput(0), 64, 1, 1, 0, "model.50"); assert(conv50); ITensor* input_tensor_51[] = { conv50->getOutput(0), resize49->getOutput(0) }; IConcatenationLayer* cat51 = network->addConcatenation(input_tensor_51, 2); 
//cat51->setAxis(0); // [-1, 1, Conv, [32, 1, 1, None, 1, nn.LeakyReLU(0.1)]], auto conv52 = convBlockLeakRelu(network, weightMap, *cat51->getOutput(0), 32, 1, 1, 0, "model.52"); assert(conv52); // [-2, 1, Conv, [32, 1, 1, None, 1, nn.LeakyReLU(0.1)]], auto conv53 = convBlockLeakRelu(network, weightMap, *cat51->getOutput(0), 32, 1, 1, 0, "model.53"); assert(conv53); // [-1, 1, Conv, [32, 3, 1, None, 1, nn.LeakyReLU(0.1)]], auto conv54 = convBlockLeakRelu(network, weightMap, *conv53->getOutput(0), 32, 3, 1, 1, "model.54"); assert(conv54); // [-1, 1, Conv, [32, 3, 1, None, 1, nn.LeakyReLU(0.1)]], auto conv55 = convBlockLeakRelu(network, weightMap, *conv54->getOutput(0), 32, 3, 1, 1, "model.55"); assert(conv55); ITensor* input_tensor_56[] = { conv55->getOutput(0), conv54->getOutput(0), conv53->getOutput(0),conv52->getOutput(0) }; IConcatenationLayer* cat56 = network->addConcatenation(input_tensor_56, 4); //cat56->setAxis(0); // [-1, 1, Conv, [64, 1, 1, None, 1, nn.LeakyReLU(0.1)]], # 57 auto conv57 = convBlockLeakRelu(network, weightMap, *cat56->getOutput(0), 64, 1, 1, 0, "model.57"); assert(conv57); // [-1, 1, Conv, [128, 3, 2, None, 1, nn.LeakyReLU(0.1)]], auto conv58 = convBlockLeakRelu(network, weightMap, *conv57->getOutput(0), 128, 3, 2, 1, "model.58"); assert(conv58); // conv32 [[-1, 47], 1, Concat, [1]], ITensor* input_tensor_59[] = { conv58->getOutput(0), conv47->getOutput(0) }; IConcatenationLayer* cat59 = network->addConcatenation(input_tensor_59, 2); //cat59->setAxis(0); // [-1, 1, Conv, [64, 1, 1, None, 1, nn.LeakyReLU(0.1)]], auto conv60 = convBlockLeakRelu(network, weightMap, *cat59->getOutput(0), 64, 1, 1, 0, "model.60"); assert(conv60); // [-2, 1, Conv, [64, 1, 1, None, 1, nn.LeakyReLU(0.1)]], auto conv61 = convBlockLeakRelu(network, weightMap, *cat59->getOutput(0), 64, 1, 1, 0, "model.61"); assert(conv61); // [-1, 1, Conv, [64, 3, 1, None, 1, nn.LeakyReLU(0.1)]], auto conv62 = convBlockLeakRelu(network, weightMap, *conv61->getOutput(0), 64, 3, 1, 1, 
"model.62"); assert(conv62); // [-1, 1, Conv, [64, 3, 1, None, 1, nn.LeakyReLU(0.1)]], auto conv63 = convBlockLeakRelu(network, weightMap, *conv62->getOutput(0), 64, 3, 1, 1, "model.63"); assert(conv63); ITensor* input_tensor_64[] = { conv63->getOutput(0), conv62->getOutput(0), conv61->getOutput(0), conv60->getOutput(0) }; IConcatenationLayer* cat64 = network->addConcatenation(input_tensor_64, 4); //cat64->setAxis(0); // [-1, 1, Conv, [128, 1, 1, None, 1, nn.LeakyReLU(0.1)]] , # 65 auto conv65 = convBlockLeakRelu(network, weightMap, *cat64->getOutput(0), 128, 1, 1, 0, "model.65"); assert(conv65); // [-1, 1, Conv, [256, 3, 2, None, 1, nn.LeakyReLU(0.1)]] , auto conv66 = convBlockLeakRelu(network, weightMap, *conv65->getOutput(0), 256, 3, 2, 1, "model.66"); assert(conv66); ITensor* input_tensor_67[] = { conv66->getOutput(0), conv37->getOutput(0) }; IConcatenationLayer* cat67 = network->addConcatenation(input_tensor_67, 2); //cat67->setAxis(0); // [-1, 1, Conv, [128, 1, 1, None, 1, nn.LeakyReLU(0.1)]], auto conv68 = convBlockLeakRelu(network, weightMap, *cat67->getOutput(0), 128, 1, 1, 0, "model.68"); assert(conv68); // [-2, 1, Conv, [128, 1, 1, None, 1, nn.LeakyReLU(0.1)]], auto conv69 = convBlockLeakRelu(network, weightMap, *cat67->getOutput(0), 128, 1, 1, 0, "model.69"); assert(conv69); // [-1, 1, Conv, [128, 3, 1, None, 1, nn.LeakyReLU(0.1)]], auto conv70 = convBlockLeakRelu(network, weightMap, *conv69->getOutput(0), 128, 3, 1, 1, "model.70"); assert(conv70); // [-1, 1, Conv, [128, 3, 1, None, 1, nn.LeakyReLU(0.1)]], auto conv71 = convBlockLeakRelu(network, weightMap, *conv70->getOutput(0), 128, 3, 1, 1, "model.71"); assert(conv71); ITensor* input_tensor_72[] = { conv71->getOutput(0), conv70->getOutput(0), conv69->getOutput(0), conv68->getOutput(0) }; IConcatenationLayer* cat72 = network->addConcatenation(input_tensor_72, 4); //cat72->setAxis(0); // [-1, 1, Conv, [256, 1, 1, None, 1, nn.LeakyReLU(0.1)]], # 73 auto conv73 = convBlockLeakRelu(network, weightMap, 
*cat72->getOutput(0), 256, 1, 1, 0, "model.73"); assert(conv73); // [57, 1, Conv, [128, 3, 1, None, 1, nn.LeakyReLU(0.1)]], auto conv74 = convBlockLeakRelu(network, weightMap, *conv57->getOutput(0), 128, 3, 1, 1, "model.74"); assert(conv74); // [65, 1, Conv, [256, 3, 1, None, 1, nn.LeakyReLU(0.1)]], auto conv75 = convBlockLeakRelu(network, weightMap, *conv65->getOutput(0), 256, 3, 1, 1, "model.75"); assert(conv75); // [73, 1, Conv, [512, 3, 1, None, 1, nn.LeakyReLU(0.1)]], auto conv76 = convBlockLeakRelu(network, weightMap, *conv73->getOutput(0), 512, 3, 1, 1, "model.76"); assert(conv76); /* ------ detect ------ */ IConvolutionLayer* det0 = network->addConvolutionNd(*conv74->getOutput(0), kNumAnchor * (kNumClass + 5), DimsHW{ 1, 1 }, weightMap["model.77.m.0.weight"], weightMap["model.77.m.0.bias"]); IConvolutionLayer* det1 = network->addConvolutionNd(*conv75->getOutput(0), kNumAnchor * (kNumClass + 5), DimsHW{ 1, 1 }, weightMap["model.77.m.1.weight"], weightMap["model.77.m.1.bias"]); IConvolutionLayer* det2 = network->addConvolutionNd(*conv76->getOutput(0), kNumAnchor * (kNumClass + 5), DimsHW{ 1, 1 }, weightMap["model.77.m.2.weight"], weightMap["model.77.m.2.bias"]); auto yolo = addYoLoLayer(network, weightMap, "model.77", std::vector{det0, det1, det2}); yolo->getOutput(0)->setName(kOutputTensorName); network->markOutput(*yolo->getOutput(0)); // Build engine builder->setMaxBatchSize(maxBatchSize); config->setMaxWorkspaceSize(16 * (1 << 20)); // 16MB #if defined(USE_FP16) config->setFlag(BuilderFlag::kFP16); #elif defined(USE_INT8) std::cout << "Your platform support int8: " << (builder->platformHasFastInt8() ? "true" : "false") << std::endl; assert(builder->platformHasFastInt8()); config->setFlag(BuilderFlag::kINT8); Int8EntropyCalibrator2* calibrator = new Int8EntropyCalibrator2(1, kInputW, kInputH, "./coco_calib/", "int8calib.table", kInputTensorName); config->setInt8Calibrator(calibrator); #endif std::cout << "Building engine, please wait for a while..." 
<< std::endl; IHostMemory* serialized_model = builder->buildSerializedNetwork(*network, *config); std::cout << "Build engine successfully!" << std::endl; // Don't need the network any more delete network; // Release host memory for (auto& mem : weightMap) { free((void*)(mem.second.values)); } return serialized_model; } ================================================ FILE: yolov7/src/postprocess.cpp ================================================ #include "postprocess.h" cv::Rect get_rect(cv::Mat& img, float bbox[4]) { float l, r, t, b; float r_w = kInputW / (img.cols * 1.0); float r_h = kInputH / (img.rows * 1.0); if (r_h > r_w) { l = bbox[0] - bbox[2] / 2.f; r = bbox[0] + bbox[2] / 2.f; t = bbox[1] - bbox[3] / 2.f - (kInputH - r_w * img.rows) / 2; b = bbox[1] + bbox[3] / 2.f - (kInputH - r_w * img.rows) / 2; l = l / r_w; r = r / r_w; t = t / r_w; b = b / r_w; } else { l = bbox[0] - bbox[2] / 2.f - (kInputW - r_h * img.cols) / 2; r = bbox[0] + bbox[2] / 2.f - (kInputW - r_h * img.cols) / 2; t = bbox[1] - bbox[3] / 2.f; b = bbox[1] + bbox[3] / 2.f; l = l / r_h; r = r / r_h; t = t / r_h; b = b / r_h; } return cv::Rect(round(l), round(t), round(r - l), round(b - t)); } static float iou(float lbox[4], float rbox[4]) { float interBox[] = { (std::max)(lbox[0] - lbox[2] / 2.f , rbox[0] - rbox[2] / 2.f), //left (std::min)(lbox[0] + lbox[2] / 2.f , rbox[0] + rbox[2] / 2.f), //right (std::max)(lbox[1] - lbox[3] / 2.f , rbox[1] - rbox[3] / 2.f), //top (std::min)(lbox[1] + lbox[3] / 2.f , rbox[1] + rbox[3] / 2.f), //bottom }; if (interBox[2] > interBox[3] || interBox[0] > interBox[1]) return 0.0f; float interBoxS = (interBox[1] - interBox[0])*(interBox[3] - interBox[2]); return interBoxS / (lbox[2] * lbox[3] + rbox[2] * rbox[3] - interBoxS); } static bool cmp(const Detection& a, const Detection& b) { return a.conf > b.conf; } void nms(std::vector& res, float *output, float conf_thresh, float nms_thresh) { int det_size = sizeof(Detection) / sizeof(float); std::map> m; for 
(int i = 0; i < output[0] && i < kMaxNumOutputBbox; i++) { if (output[1 + det_size * i + 4] <= conf_thresh) continue; Detection det; memcpy(&det, &output[1 + det_size * i], det_size * sizeof(float)); if (m.count(det.class_id) == 0) m.emplace(det.class_id, std::vector()); m[det.class_id].push_back(det); } for (auto it = m.begin(); it != m.end(); it++) { auto& dets = it->second; std::sort(dets.begin(), dets.end(), cmp); for (size_t m = 0; m < dets.size(); ++m) { auto& item = dets[m]; res.push_back(item); for (size_t n = m + 1; n < dets.size(); ++n) { if (iou(item.bbox, dets[n].bbox) > nms_thresh) { dets.erase(dets.begin() + n); --n; } } } } } void batch_nms(std::vector>& res_batch, float *output, int batch_size, int output_size, float conf_thresh, float nms_thresh) { res_batch.resize(batch_size); for (int i = 0; i < batch_size; i++) { nms(res_batch[i], &output[i * output_size], conf_thresh, nms_thresh); } } void draw_bbox(std::vector& img_batch, std::vector>& res_batch) { for (size_t i = 0; i < img_batch.size(); i++) { auto& res = res_batch[i]; cv::Mat img = img_batch[i]; for (size_t j = 0; j < res.size(); j++) { cv::Rect r = get_rect(img, res[j].bbox); cv::rectangle(img, r, cv::Scalar(0x27, 0xC1, 0x36), 2); cv::putText(img, std::to_string((int)res[j].class_id), cv::Point(r.x, r.y - 1), cv::FONT_HERSHEY_PLAIN, 1.2, cv::Scalar(0xFF, 0xFF, 0xFF), 2); } } } ================================================ FILE: yolov7/src/preprocess.cu ================================================ #include "preprocess.h" #include "cuda_utils.h" static uint8_t* img_buffer_host = nullptr; static uint8_t* img_buffer_device = nullptr; struct AffineMatrix{ float value[6]; }; __global__ void warpaffine_kernel( uint8_t* src, int src_line_size, int src_width, int src_height, float* dst, int dst_width, int dst_height, uint8_t const_value_st, AffineMatrix d2s, int edge) { int position = blockDim.x * blockIdx.x + threadIdx.x; if (position >= edge) return; float m_x1 = d2s.value[0]; float m_y1 = 
// Upload one interleaved 3-channel 8-bit image and launch warpaffine_kernel
// to letterbox it into `dst`.
//
// src                    - host pointer to src_width * src_height * 3 bytes.
// src_width, src_height  - source image size in pixels.
// dst                    - output buffer handed to the kernel; the kernel
//                          writes three planes of dst_width * dst_height
//                          floats.
// dst_width, dst_height  - output size in pixels.
// stream                 - stream used for the upload and the kernel launch.
//
// The image is scaled by min(dst_h / src_h, dst_w / src_w) and centred; the
// kernel fills pixels that fall outside the source with the value 128.
//
// NOTE(review): the image is staged through the file-level buffers
// img_buffer_host / img_buffer_device. Their capacity is not visible here, so
// this function cannot check that the image fits - the caller must have
// passed a large enough max_image_size to cuda_preprocess_init().
// NOTE(review): the host staging buffer is overwritten on every call;
// cuda_batch_preprocess() synchronizes the stream after each call, other
// callers presumably need to do the same.
void cuda_preprocess(
    uint8_t* src, int src_width, int src_height,
    float* dst, int dst_width, int dst_height,
    cudaStream_t stream) {
    // Byte count in size_t so large images cannot overflow int arithmetic.
    const size_t img_size = static_cast<size_t>(src_width) * static_cast<size_t>(src_height) * 3;
    // copy data to pinned memory
    memcpy(img_buffer_host, src, img_size);
    // copy data to device memory
    CUDA_CHECK(cudaMemcpyAsync(img_buffer_device, img_buffer_host, img_size, cudaMemcpyHostToDevice, stream));

    // Source-to-destination transform: uniform scale plus centring offset.
    AffineMatrix s2d, d2s;
    float scale = std::min(dst_height / (float)src_height, dst_width / (float)src_width);

    s2d.value[0] = scale;
    s2d.value[1] = 0;
    s2d.value[2] = -scale * src_width * 0.5 + dst_width * 0.5;
    s2d.value[3] = 0;
    s2d.value[4] = scale;
    s2d.value[5] = -scale * src_height * 0.5 + dst_height * 0.5;

    // The kernel maps destination pixels back to the source, so it needs the
    // inverse. Let OpenCV allocate the result instead of wrapping d2s.value:
    // the old code wrapped it and then memcpy'd the buffer onto itself.
    cv::Mat m2x3_s2d(2, 3, CV_32F, s2d.value);
    cv::Mat m2x3_d2s;
    cv::invertAffineTransform(m2x3_s2d, m2x3_d2s);
    memcpy(d2s.value, m2x3_d2s.ptr<float>(0), sizeof(d2s.value));

    // One thread per destination pixel.
    int jobs = dst_height * dst_width;
    if (jobs <= 0) {
        return;  // nothing to write; a zero-block launch would be invalid
    }
    int threads = 256;
    int blocks = (jobs + threads - 1) / threads;  // exact integer ceil-division

    warpaffine_kernel<<<blocks, threads, 0, stream>>>(
        img_buffer_device, src_width * 3, src_width,
        src_height, dst, dst_width,
        dst_height, 128, d2s, jobs);
    // Surface launch-configuration errors right away.
    CUDA_CHECK(cudaGetLastError());
}
""" import ctypes import os import shutil import random import sys import threading import time import cv2 import numpy as np import pycuda.autoinit import pycuda.driver as cuda import tensorrt as trt CONF_THRESH = 0.5 IOU_THRESHOLD = 0.4 def get_img_path_batches(batch_size, img_dir): ret = [] batch = [] for root, dirs, files in os.walk(img_dir): for name in files: if len(batch) == batch_size: ret.append(batch) batch = [] batch.append(os.path.join(root, name)) if len(batch) > 0: ret.append(batch) return ret def plot_one_box(x, img, color=None, label=None, line_thickness=None): """ description: Plots one bounding box on image img, this function comes from YoLov7 project. param: x: a box likes [x1,y1,x2,y2] img: a opencv image object color: color to draw rectangle, such as (0,255,0) label: str line_thickness: int return: no return """ tl = ( line_thickness or round(0.002 * (img.shape[0] + img.shape[1]) / 2) + 1 ) # line/font thickness color = color or [random.randint(0, 255) for _ in range(3)] c1, c2 = (int(x[0]), int(x[1])), (int(x[2]), int(x[3])) cv2.rectangle(img, c1, c2, color, thickness=tl, lineType=cv2.LINE_AA) if label: tf = max(tl - 1, 1) # font thickness t_size = cv2.getTextSize(label, 0, fontScale=tl / 3, thickness=tf)[0] c2 = c1[0] + t_size[0], c1[1] - t_size[1] - 3 cv2.rectangle(img, c1, c2, color, -1, cv2.LINE_AA) # filled cv2.putText( img, label, (c1[0], c1[1] - 2), 0, tl / 3, [225, 255, 255], thickness=tf, lineType=cv2.LINE_AA, ) class YoLov7TRT(object): """ description: A YOLOv7 class that warps TensorRT ops, preprocess and postprocess ops. 
""" def __init__(self, engine_file_path): # Create a Context on this device, self.ctx = cuda.Device(0).make_context() stream = cuda.Stream() TRT_LOGGER = trt.Logger(trt.Logger.INFO) runtime = trt.Runtime(TRT_LOGGER) # Deserialize the engine from file with open(engine_file_path, "rb") as f: engine = runtime.deserialize_cuda_engine(f.read()) context = engine.create_execution_context() host_inputs = [] cuda_inputs = [] host_outputs = [] cuda_outputs = [] bindings = [] for binding in engine: print('bingding:', binding, engine.get_binding_shape(binding)) size = trt.volume(engine.get_binding_shape(binding)) * engine.max_batch_size dtype = trt.nptype(engine.get_binding_dtype(binding)) # Allocate host and device buffers host_mem = cuda.pagelocked_empty(size, dtype) cuda_mem = cuda.mem_alloc(host_mem.nbytes) # Append the device buffer to device bindings. bindings.append(int(cuda_mem)) # Append to the appropriate list. if engine.binding_is_input(binding): self.input_w = engine.get_binding_shape(binding)[-1] self.input_h = engine.get_binding_shape(binding)[-2] host_inputs.append(host_mem) cuda_inputs.append(cuda_mem) else: host_outputs.append(host_mem) cuda_outputs.append(cuda_mem) # Store self.stream = stream self.context = context self.engine = engine self.host_inputs = host_inputs self.cuda_inputs = cuda_inputs self.host_outputs = host_outputs self.cuda_outputs = cuda_outputs self.bindings = bindings self.batch_size = engine.max_batch_size def infer(self, raw_image_generator): threading.Thread.__init__(self) # Make self the active context, pushing it on top of the context stack. 
self.ctx.push() # Restore stream = self.stream context = self.context engine = self.engine host_inputs = self.host_inputs cuda_inputs = self.cuda_inputs host_outputs = self.host_outputs cuda_outputs = self.cuda_outputs bindings = self.bindings # Do image preprocess batch_image_raw = [] batch_origin_h = [] batch_origin_w = [] batch_input_image = np.empty(shape=[self.batch_size, 3, self.input_h, self.input_w]) for i, image_raw in enumerate(raw_image_generator): input_image, image_raw, origin_h, origin_w = self.preprocess_image(image_raw) batch_image_raw.append(image_raw) batch_origin_h.append(origin_h) batch_origin_w.append(origin_w) np.copyto(batch_input_image[i], input_image) batch_input_image = np.ascontiguousarray(batch_input_image) # Copy input image to host buffer np.copyto(host_inputs[0], batch_input_image.ravel()) start = time.time() # Transfer input data to the GPU. cuda.memcpy_htod_async(cuda_inputs[0], host_inputs[0], stream) # Run inference. context.execute_async(batch_size=self.batch_size, bindings=bindings, stream_handle=stream.handle) # Transfer predictions back from the GPU. cuda.memcpy_dtoh_async(host_outputs[0], cuda_outputs[0], stream) # Synchronize the stream stream.synchronize() end = time.time() # Remove any context from the top of the context stack, deactivating it. self.ctx.pop() # Here we use the first row of output in that batch_size = 1 output = host_outputs[0] # Do postprocess for i in range(self.batch_size): result_boxes, result_scores, result_classid = self.post_process( output[i * 6001: (i + 1) * 6001], batch_origin_h[i], batch_origin_w[i] ) # Draw rectangles and labels on the original image for j in range(len(result_boxes)): box = result_boxes[j] plot_one_box( box, batch_image_raw[i], label="{}:{:.2f}".format( categories[int(result_classid[j])], result_scores[j] ), ) return batch_image_raw, end - start def destroy(self): # Remove any context from the top of the context stack, deactivating it. 
def xywh2xyxy(self, origin_h, origin_w, x):
    """
    description: Map centre-format boxes given in network-input coordinates to
                 corner-format boxes in original-image coordinates, undoing the
                 letterbox padding and the resize.
    param:
        origin_h: height of original image
        origin_w: width of original image
        x:        A boxes numpy, each row is a box [center_x, center_y, w, h]
    return:
        y:        A boxes numpy, each row is a box [x1, y1, x2, y2]
    """
    ratio_w = self.input_w / origin_w
    ratio_h = self.input_h / origin_h

    # The smaller ratio is the scale that was applied; the other axis was padded.
    if ratio_h > ratio_w:
        scale = ratio_w
        pad_x = 0
        pad_y = (self.input_h - ratio_w * origin_h) / 2
    else:
        scale = ratio_h
        pad_x = (self.input_w - ratio_h * origin_w) / 2
        pad_y = 0

    half_w = x[:, 2] / 2
    half_h = x[:, 3] / 2

    corners = np.zeros_like(x)
    corners[:, 0] = x[:, 0] - half_w - pad_x
    corners[:, 2] = x[:, 0] + half_w - pad_x
    corners[:, 1] = x[:, 1] - half_h - pad_y
    corners[:, 3] = x[:, 1] + half_h - pad_y
    # Back from network-input pixels to original-image pixels.
    corners /= scale
    return corners
box1[:, 0] + box1[:, 2] / 2 b1_y1, b1_y2 = box1[:, 1] - box1[:, 3] / 2, box1[:, 1] + box1[:, 3] / 2 b2_x1, b2_x2 = box2[:, 0] - box2[:, 2] / 2, box2[:, 0] + box2[:, 2] / 2 b2_y1, b2_y2 = box2[:, 1] - box2[:, 3] / 2, box2[:, 1] + box2[:, 3] / 2 else: # Get the coordinates of bounding boxes b1_x1, b1_y1, b1_x2, b1_y2 = box1[:, 0], box1[:, 1], box1[:, 2], box1[:, 3] b2_x1, b2_y1, b2_x2, b2_y2 = box2[:, 0], box2[:, 1], box2[:, 2], box2[:, 3] # Get the coordinates of the intersection rectangle inter_rect_x1 = np.maximum(b1_x1, b2_x1) inter_rect_y1 = np.maximum(b1_y1, b2_y1) inter_rect_x2 = np.minimum(b1_x2, b2_x2) inter_rect_y2 = np.minimum(b1_y2, b2_y2) # Intersection area inter_area = np.clip(inter_rect_x2 - inter_rect_x1 + 1, 0, None) * \ np.clip(inter_rect_y2 - inter_rect_y1 + 1, 0, None) # Union Area b1_area = (b1_x2 - b1_x1 + 1) * (b1_y2 - b1_y1 + 1) b2_area = (b2_x2 - b2_x1 + 1) * (b2_y2 - b2_y1 + 1) iou = inter_area / (b1_area + b2_area - inter_area + 1e-16) return iou def non_max_suppression(self, prediction, origin_h, origin_w, conf_thres=0.5, nms_thres=0.4): """ description: Removes detections with lower object confidence score than 'conf_thres' and performs Non-Maximum Suppression to further filter detections. 
param: prediction: detections, (x1, y1, x2, y2, conf, cls_id) origin_h: original image height origin_w: original image width conf_thres: a confidence threshold to filter detections nms_thres: a iou threshold to filter detections return: boxes: output after nms with the shape (x1, y1, x2, y2, conf, cls_id) """ # Get the boxes that score > CONF_THRESH boxes = prediction[prediction[:, 4] >= conf_thres] # Trandform bbox from [center_x, center_y, w, h] to [x1, y1, x2, y2] boxes[:, :4] = self.xywh2xyxy(origin_h, origin_w, boxes[:, :4]) # clip the coordinates boxes[:, 0] = np.clip(boxes[:, 0], 0, origin_w -1) boxes[:, 2] = np.clip(boxes[:, 2], 0, origin_w -1) boxes[:, 1] = np.clip(boxes[:, 1], 0, origin_h -1) boxes[:, 3] = np.clip(boxes[:, 3], 0, origin_h -1) # Object confidence confs = boxes[:, 4] # Sort by the confs boxes = boxes[np.argsort(-confs)] # Perform non-maximum suppression keep_boxes = [] while boxes.shape[0]: large_overlap = self.bbox_iou(np.expand_dims(boxes[0, :4], 0), boxes[:, :4]) > nms_thres label_match = boxes[0, -1] == boxes[:, -1] # Indices of boxes with lower confidence scores, large IOUs and matching labels invalid = large_overlap & label_match keep_boxes += [boxes[0]] boxes = boxes[~invalid] boxes = np.stack(keep_boxes, 0) if len(keep_boxes) else np.array([]) return boxes class inferThread(threading.Thread): def __init__(self, yolov7_wrapper, image_path_batch): threading.Thread.__init__(self) self.yolov7_wrapper = yolov7_wrapper self.image_path_batch = image_path_batch def run(self): batch_image_raw, use_time = self.yolov7_wrapper.infer(self.yolov7_wrapper.get_raw_image(self.image_path_batch)) for i, img_path in enumerate(self.image_path_batch): parent, filename = os.path.split(img_path) save_name = os.path.join('output', filename) # Save image cv2.imwrite(save_name, batch_image_raw[i]) print('input->{}, time->{:.2f}ms, saving into output/'.format(self.image_path_batch, use_time * 1000)) class warmUpThread(threading.Thread): def __init__(self, 
if __name__ == "__main__":
    # Optional positional overrides: argv[1] = engine file, argv[2] = plugin library.
    cli_args = sys.argv[1:]
    engine_file_path = cli_args[0] if len(cli_args) > 0 else "yolov7.engine"
    PLUGIN_LIBRARY = cli_args[1] if len(cli_args) > 1 else "build/libmyplugins.so"

    # Load the custom plugin library before the engine is created below.
    ctypes.CDLL(PLUGIN_LIBRARY)

    # COCO labels; read as a module-level name when detections are drawn.
    categories = ["person", "bicycle", "car", "motorcycle", "airplane", "bus", "train", "truck", "boat",
                  "traffic light", "fire hydrant", "stop sign", "parking meter", "bench", "bird", "cat",
                  "dog", "horse", "sheep", "cow", "elephant", "bear", "zebra", "giraffe", "backpack",
                  "umbrella", "handbag", "tie", "suitcase", "frisbee", "skis", "snowboard", "sports ball",
                  "kite", "baseball bat", "baseball glove", "skateboard", "surfboard", "tennis racket",
                  "bottle", "wine glass", "cup", "fork", "knife", "spoon", "bowl", "banana", "apple",
                  "sandwich", "orange", "broccoli", "carrot", "hot dog", "pizza", "donut", "cake", "chair",
                  "couch", "potted plant", "bed", "dining table", "toilet", "tv", "laptop", "mouse",
                  "remote", "keyboard", "cell phone", "microwave", "oven", "toaster", "sink",
                  "refrigerator", "book", "clock", "vase", "scissors", "teddy bear", "hair drier",
                  "toothbrush"]

    # Start every run from an empty output directory.
    if os.path.exists('output/'):
        shutil.rmtree('output/')
    os.makedirs('output/')

    yolov7_wrapper = YoLov7TRT(engine_file_path)
    try:
        print('batch size is', yolov7_wrapper.batch_size)

        image_dir = "samples/"
        image_path_batches = get_img_path_batches(yolov7_wrapper.batch_size, image_dir)

        # Ten warm-up passes, each on its own thread and run to completion.
        for _ in range(10):
            warm_up_thread = warmUpThread(yolov7_wrapper)
            warm_up_thread.start()
            warm_up_thread.join()

        # One inference thread per batch, run sequentially.
        for batch in image_path_batches:
            infer_thread = inferThread(yolov7_wrapper, batch)
            infer_thread.start()
            infer_thread.join()
    finally:
        # Release the wrapper's CUDA context whatever happened above.
        yolov7_wrapper.destroy()
nvinfer cudart myplugins ${OpenCV_LIBS}) add_executable(yolov8_cls ${PROJECT_SOURCE_DIR}/yolov8_cls.cpp ${SRCS}) target_link_libraries(yolov8_cls nvinfer cudart myplugins ${OpenCV_LIBS}) add_executable(yolov8_5u_det ${PROJECT_SOURCE_DIR}/yolov8_5u_det.cpp ${SRCS}) target_link_libraries(yolov8_5u_det nvinfer cudart myplugins ${OpenCV_LIBS}) add_executable(yolov8_obb ${PROJECT_SOURCE_DIR}/yolov8_obb.cpp ${SRCS}) target_link_libraries(yolov8_obb nvinfer cudart myplugins ${OpenCV_LIBS}) ================================================ FILE: yolov8/README.md ================================================ # YOLOv8 The Pytorch implementation is [ultralytics/yolov8](https://github.com/ultralytics/ultralytics/tree/main/ultralytics). The tensorrt code is derived from [xiaocao-tian/yolov8_tensorrt](https://github.com/xiaocao-tian/yolov8_tensorrt) ## Contributors ## Requirements - TensorRT 8.0+ - OpenCV 3.4.0+ - ultralytics<=8.2.103 ## Different versions of yolov8 Currently, we support yolov8 - For yolov8 , download .pt from [https://github.com/ultralytics/assets/releases](https://github.com/ultralytics/assets/releases), then follow how-to-run in current page. ## Config - Choose the model n/s/m/l/x/n2/s2/m2/l2/x2/n6/s6/m6/l6/x6 from command line arguments. - Check more configs in [include/config.h](./include/config.h) ## How to Run, yolov8n as example 1. generate .wts from pytorch with .pt, or download .wts from model zoo ``` // download https://github.com/ultralytics/assets/releases/yolov8n.pt // download https://github.com/lindsayshuo/yolov8-p2/releases/download/VisDrone_train_yolov8x_p2_bs1_epochs_100_imgsz_1280_last.pt (only for 10 cls p2 model) cp {tensorrtx}/yolov8/gen_wts.py {ultralytics}/ultralytics cd {ultralytics}/ultralytics python gen_wts.py -w yolov8n.pt -o yolov8n.wts -t detect // a file 'yolov8n.wts' will be generated. 
// For p2 model // download https://github.com/lindsayshuo/yolov8_p2_tensorrtx/releases/download/VisDrone_train_yolov8x_p2_bs1_epochs_100_imgsz_1280_last/VisDrone_train_yolov8x_p2_bs1_epochs_100_imgsz_1280_last.pt (only for 10 cls p2 model) cd {ultralytics}/ultralytics python gen_wts.py -w VisDrone_train_yolov8x_p2_bs1_epochs_100_imgsz_1280_last.pt -o VisDrone_train_yolov8x_p2_bs1_epochs_100_imgsz_1280_last.wts -t detect (only for 10 cls p2 model) // a file 'VisDrone_train_yolov8x_p2_bs1_epochs_100_imgsz_1280_last.wts' will be generated. // For yolov8_5u_det model // download https://github.com/ultralytics/assets/releases/yolov5nu.pt cd {ultralytics}/ultralytics python gen_wts.py -w yolov5nu.pt -o yolov5nu.wts -t detect // a file 'yolov5nu.wts' will be generated. ``` 2. build tensorrtx/yolov8 and run ### Detection ``` cd {tensorrtx}/yolov8/ mkdir build cd build cp {ultralytics}/ultralytics/yolov8.wts {tensorrtx}/yolov8/build cmake .. make sudo ./yolov8_det -s [.wts] [.engine] [n/s/m/l/x/n2/s2/m2/l2/x2/n6/s6/m6/l6/x6] // serialize model to plan file sudo ./yolov8_det -d [.engine] [image folder] [c/g] // deserialize and run inference, the images in [image folder] will be processed. 
// For example yolov8n sudo ./yolov8_det -s yolov8n.wts yolov8n.engine n sudo ./yolov8_det -d yolov8n.engine ../images c //cpu postprocess sudo ./yolov8_det -d yolov8n.engine ../images g //gpu postprocess // For p2 model: // change the "const static int kNumClass" in config.h to 10; sudo ./yolov8_det -s VisDrone_train_yolov8x_p2_bs1_epochs_100_imgsz_1280_last.wts VisDrone_train_yolov8x_p2_bs1_epochs_100_imgsz_1280_last.engine x2 wget https://github.com/lindsayshuo/yolov8-p2/releases/download/VisDrone_train_yolov8x_p2_bs1_epochs_100_imgsz_1280_last/0000008_01999_d_0000040.jpg cp -r 0000008_01999_d_0000040.jpg ../images sudo ./yolov8_det -d VisDrone_train_yolov8x_p2_bs1_epochs_100_imgsz_1280_last.engine ../images c //cpu postprocess sudo ./yolov8_det -d VisDrone_train_yolov8x_p2_bs1_epochs_100_imgsz_1280_last.engine ../images g //gpu postprocess // For yolov8_5u_det(YOLOv5u with the anchor-free, objectness-free split head structure based on YOLOv8 features) model: sudo ./yolov8_5u_det -s [.wts] [.engine] [n/s/m/l/x/n6/s6/m6/l6/x6] sudo ./yolov8_5u_det -d yolov5xu.engine ../images c //cpu postprocess sudo ./yolov8_5u_det -d yolov5xu.engine ../images g //gpu postprocess ``` ### Instance Segmentation ``` # Build and serialize TensorRT engine ./yolov8_seg -s yolov8s-seg.wts yolov8s-seg.engine s # Download the labels file wget -O coco.txt https://raw.githubusercontent.com/amikelive/coco-labels/master/coco-labels-2014_2017.txt # Run inference with labels file ./yolov8_seg -d yolov8s-seg.engine ../images c coco.txt ``` ### Classification ``` cd {tensorrtx}/yolov8/ // Download inference images wget https://github.com/lindsayshuo/infer_pic/releases/download/pics/1709970363.6990473rescls.jpg mkdir samples cp -r 1709970363.6990473rescls.jpg samples // Download ImageNet labels wget https://github.com/joannzhang00/ImageNet-dataset-classes-labels/blob/main/imagenet_classes.txt // update kClsNumClass in config.h if your model is trained on custom dataset mkdir build cd build cp 
{ultralytics}/ultralytics/yolov8n-cls.wts {tensorrtx}/yolov8/build cmake .. make sudo ./yolov8_cls -s [.wts] [.engine] [n/s/m/l/x] // serialize model to plan file sudo ./yolov8_cls -d [.engine] [image folder] // deserialize and run inference, the images in [image folder] will be processed. // For example yolov8n sudo ./yolov8_cls -s yolov8n-cls.wts yolov8-cls.engine n sudo ./yolov8_cls -d yolov8n-cls.engine ../samples ``` ### Pose Estimation ``` cd {tensorrtx}/yolov8/ // update "kPoseNumClass = 1" in config.h mkdir build cd build cp {ultralytics}/ultralytics/yolov8-pose.wts {tensorrtx}/yolov8/build cmake .. make sudo ./yolov8_pose -s [.wts] [.engine] [n/s/m/l/x/n2/s2/m2/l2/x2/n6/s6/m6/l6/x6] // serialize model to plan file sudo ./yolov8_pose -d [.engine] [image folder] [c/g] // deserialize and run inference, the images in [image folder] will be processed. // For example yolov8-pose sudo ./yolov8_pose -s yolov8n-pose.wts yolov8n-pose.engine n sudo ./yolov8_pose -d yolov8n-pose.engine ../images c //cpu postprocess sudo ./yolov8_pose -d yolov8n-pose.engine ../images g //gpu postprocess ``` ### Oriented Bounding Boxes (OBB) Estimation ``` cd {tensorrtx}/yolov8/ // update "kObbNumClass = 15" "kInputH = 1024" "kInputW = 1024" in config.h wget https://github.com/lindsayshuo/infer_pic/releases/download/pics/obb.png mkdir images mv obb.png ./images mkdir build cd build cp {ultralytics}/ultralytics/yolov8-obb.wts {tensorrtx}/yolov8/build cmake .. make sudo ./yolov8_obb -s [.wts] [.engine] [n/s/m/l/x/n2/s2/m2/l2/x2/n6/s6/m6/l6/x6] // serialize model to plan file sudo ./yolov8_obb -d [.engine] [image folder] [c/g] // deserialize and run inference, the images in [image folder] will be processed. // For example yolov8-obb sudo ./yolov8_obb -s yolov8n-obb.wts yolov8n-obb.engine n sudo ./yolov8_obb -d yolov8n-obb.engine ../images c //cpu postprocess sudo ./yolov8_obb -d yolov8n-obb.engine ../images g //gpu postprocess ``` 4. 
Optional: load and run the TensorRT model in Python ``` // install python-tensorrt, pycuda, etc. // ensure the yolov8n.engine and libmyplugins.so have been built python yolov8_det_trt.py # Detection python yolov8_seg_trt.py # Segmentation python yolov8_cls_trt.py # Classification python yolov8_pose_trt.py # Pose Estimation python yolov8_5u_det_trt.py # yolov8_5u_det(YOLOv5u with the anchor-free, objectness-free split head structure based on YOLOv8 features) model python yolov8_obb_trt.py # Oriented Bounding Boxes (OBB) Estimation ``` # INT8 Quantization 1. Prepare calibration images: you can randomly select about 1000 images from your training set. For COCO, you can also download my calibration images `coco_calib` from [GoogleDrive](https://drive.google.com/drive/folders/1s7jE9DtOngZMzJC1uL307J2MiaGwdRSI?usp=sharing) or [BaiduPan](https://pan.baidu.com/s/1GOm_-JobpyLMAqZWCDUhKg) pwd: a9wh 2. Unzip it in yolov8/build. 3. Set the macro `USE_INT8` in config.h, change `kInputQuantizationFolder` to your image folder path, and run make. 4. Serialize the model and test.

import sys  # noqa: F401
import argparse
import os
import struct
import torch


def parse_args():
    """Parse the command line and resolve the output path.

    Returns:
        (weights, output, type): the input .pt path, the .wts path to write and
        the model type string.

    Raises:
        SystemExit: if the input weights file does not exist.
    """
    parser = argparse.ArgumentParser(description='Convert .pt file to .wts')
    parser.add_argument('-w', '--weights', required=True,
                        help='Input weights (.pt) file path (required)')
    parser.add_argument(
        '-o', '--output', help='Output (.wts) file path (optional)')
    parser.add_argument(
        '-t', '--type', type=str, default='detect', choices=['detect', 'cls', 'seg', 'pose', 'obb'],
        help='determines the model is detection/classification')
    args = parser.parse_args()
    if not os.path.isfile(args.weights):
        raise SystemExit('Invalid input file')
    if not args.output:
        # Default: same location and base name as the weights file.
        args.output = os.path.splitext(args.weights)[0] + '.wts'
    elif os.path.isdir(args.output):
        # A directory was given: keep the weights' base name inside it.
        args.output = os.path.join(
            args.output,
            os.path.splitext(os.path.basename(args.weights))[0] + '.wts')
    return args.weights, args.output, args.type


def format_wts_entry(name, values):
    """Return one .wts line for a flattened tensor.

    The line is ``<name> <count> `` followed by `` <hex>`` per value, where
    ``<hex>`` is the big-endian IEEE-754 float32 encoding, and ends with a
    newline. The double space after the count is kept so the output stays
    byte-identical to what this script has always written.
    """
    encoded = ''.join(' ' + struct.pack('>f', float(val)).hex() for val in values)
    return '{} {} {}\n'.format(name, len(values), encoded)


def main():
    """Load the checkpoint and write its state dict as a .wts text file."""
    pt_file, wts_file, m_type = parse_args()

    print(f'Generating .wts for {m_type} model')
    print(f'Loading {pt_file}')

    device = 'cpu'

    # SECURITY: weights_only=False unpickles arbitrary objects, so only run this
    # on checkpoints from a trusted source.
    checkpoint = torch.load(pt_file, map_location=device, weights_only=False)
    # Prefer the EMA model when the checkpoint has one; convert to FP32.
    model = checkpoint['ema' if checkpoint.get('ema') else 'model'].float()

    if m_type in ['detect', 'seg', 'pose', 'obb']:
        # NOTE(review): the head's 'anchors' attribute is dropped before export;
        # the reason is not visible here - confirm against the C++ loader.
        delattr(model.model[-1], 'anchors')

    model.to(device).eval()

    state = model.state_dict()
    with open(wts_file, 'w') as f:
        # Header line: number of entries that follow.
        f.write('{}\n'.format(len(state)))
        for k, v in state.items():
            f.write(format_wts_entry(k, v.reshape(-1).cpu().numpy()))


if __name__ == '__main__':
    main()
// Declarations of the network building blocks used to assemble the YOLOv8
// TensorRT network. Each builder takes the network definition, the loaded
// weights, an input tensor and a layer-name string, and returns the last
// layer it added.
// NOTE(review): in this copy the three bare #include lines and the std::map /
// std::vector template arguments appear to have lost their <...> parts;
// restore them from the upstream header before compiling.
#pragma once

#include
#include
#include
#include "NvInfer.h"

// Presumably derives a padding value from a kernel size - confirm in block.cpp.
int calculateP(int ksize);

// Loads a weight map from the given file (presumably a .wts file - confirm).
std::map loadWeights(const std::string file);

nvinfer1::IElementWiseLayer* convBnSiLU(nvinfer1::INetworkDefinition* network, std::map weightMap, nvinfer1::ITensor& input, int ch, int k, int s, int p, std::string lname);

nvinfer1::IElementWiseLayer* C2F(nvinfer1::INetworkDefinition* network, std::map weightMap, nvinfer1::ITensor& input, int c1, int c2, int n, bool shortcut, float e, std::string lname);

// Unlike its siblings, C2 takes the weight map by reference.
nvinfer1::IElementWiseLayer* C2(nvinfer1::INetworkDefinition* network, std::map& weightMap, nvinfer1::ITensor& input, int c1, int c2, int n, bool shortcut, float e, std::string lname);

nvinfer1::IElementWiseLayer* C3(nvinfer1::INetworkDefinition* network, std::map weightMap, nvinfer1::ITensor& input, int c1, int c2, int n, bool shortcut, float e, std::string lname);

nvinfer1::IElementWiseLayer* SPPF(nvinfer1::INetworkDefinition* network, std::map weightMap, nvinfer1::ITensor& input, int c1, int c2, int k, std::string lname);

nvinfer1::IShuffleLayer* DFL(nvinfer1::INetworkDefinition* network, std::map weightMap, nvinfer1::ITensor& input, int ch, int grid, int k, int s, int p, std::string lname);

// Adds the YOLO plugin layer on top of the given detection tensors; the three
// flags select the segmentation / pose / obb variants.
nvinfer1::IPluginV2Layer* addYoLoLayer(nvinfer1::INetworkDefinition* network, std::vector dets, const int* px_arry, int px_arry_num, int num_class, bool is_segmentation, bool is_pose, bool is_obb);
// Build-time configuration for the YOLOv8 TensorRT samples.
// The guard keeps the constants below from being redefined when this header is
// reached more than once in a translation unit (block.h uses the same guard).
#pragma once

// Inference precision: enable exactly one of the three macros.
#define USE_FP16
//#define USE_FP32
//#define USE_INT8

// Names of the engine's input and output tensors.
const static char* kInputTensorName = "images";
const static char* kOutputTensorName = "output";

// Detection model: number of classes (set to 10 for the VisDrone p2 model).
const static int kNumClass = 80;
const static int kBatchSize = 1;
const static int kGpuId = 0;

// Network input size (the OBB example in the README uses 1024 x 1024).
const static int kInputH = 640;
const static int kInputW = 640;

const static float kNmsThresh = 0.45f;
const static float kConfThresh = 0.5f;
const static float kConfThreshKeypoints = 0.5f;  // keypoints confidence

const static int kMaxInputImageSize = 3000 * 3000;
const static int kMaxNumOutputBbox = 1000;

// Quantization input image folder path (used when USE_INT8 is set)
const static char* kInputQuantizationFolder = "./coco_calib";

// Classification model's number of classes
constexpr static int kClsNumClass = 1000;
// Classification model's input shape
constexpr static int kClsInputH = 224;
constexpr static int kClsInputW = 224;

// pose model's number of classes
constexpr static int kPoseNumClass = 1;
const static int kNumberOfPoints = 17;  // number of keypoints total

// obb model's number of classes
constexpr static int kObbNumClass = 15;
*/ #ifndef TENSORRT_LOGGING_H #define TENSORRT_LOGGING_H #include #include #include #include #include #include #include #include "NvInferRuntimeCommon.h" #include "macros.h" using Severity = nvinfer1::ILogger::Severity; class LogStreamConsumerBuffer : public std::stringbuf { public: LogStreamConsumerBuffer(std::ostream& stream, const std::string& prefix, bool shouldLog) : mOutput(stream), mPrefix(prefix), mShouldLog(shouldLog) {} LogStreamConsumerBuffer(LogStreamConsumerBuffer&& other) : mOutput(other.mOutput) {} ~LogStreamConsumerBuffer() { // std::streambuf::pbase() gives a pointer to the beginning of the buffered part of the output sequence // std::streambuf::pptr() gives a pointer to the current position of the output sequence // if the pointer to the beginning is not equal to the pointer to the current position, // call putOutput() to log the output to the stream if (pbase() != pptr()) { putOutput(); } } // synchronizes the stream buffer and returns 0 on success // synchronizing the stream buffer consists of inserting the buffer contents into the stream, // resetting the buffer and flushing the stream virtual int sync() { putOutput(); return 0; } void putOutput() { if (mShouldLog) { // prepend timestamp std::time_t timestamp = std::time(nullptr); tm* tm_local = std::localtime(×tamp); std::cout << "["; std::cout << std::setw(2) << std::setfill('0') << 1 + tm_local->tm_mon << "/"; std::cout << std::setw(2) << std::setfill('0') << tm_local->tm_mday << "/"; std::cout << std::setw(4) << std::setfill('0') << 1900 + tm_local->tm_year << "-"; std::cout << std::setw(2) << std::setfill('0') << tm_local->tm_hour << ":"; std::cout << std::setw(2) << std::setfill('0') << tm_local->tm_min << ":"; std::cout << std::setw(2) << std::setfill('0') << tm_local->tm_sec << "] "; // std::stringbuf::str() gets the string contents of the buffer // insert the buffer contents pre-appended by the appropriate prefix into the stream mOutput << mPrefix << str(); // set the buffer to empty 
str(""); // flush the stream mOutput.flush(); } } void setShouldLog(bool shouldLog) { mShouldLog = shouldLog; } private: std::ostream& mOutput; std::string mPrefix; bool mShouldLog; }; //! //! \class LogStreamConsumerBase //! \brief Convenience object used to initialize LogStreamConsumerBuffer before std::ostream in LogStreamConsumer //! class LogStreamConsumerBase { public: LogStreamConsumerBase(std::ostream& stream, const std::string& prefix, bool shouldLog) : mBuffer(stream, prefix, shouldLog) {} protected: LogStreamConsumerBuffer mBuffer; }; //! //! \class LogStreamConsumer //! \brief Convenience object used to facilitate use of C++ stream syntax when logging messages. //! Order of base classes is LogStreamConsumerBase and then std::ostream. //! This is because the LogStreamConsumerBase class is used to initialize the LogStreamConsumerBuffer member field //! in LogStreamConsumer and then the address of the buffer is passed to std::ostream. //! This is necessary to prevent the address of an uninitialized buffer from being passed to std::ostream. //! Please do not change the order of the parent classes. //! class LogStreamConsumer : protected LogStreamConsumerBase, public std::ostream { public: //! \brief Creates a LogStreamConsumer which logs messages with level severity. //! Reportable severity determines if the messages are severe enough to be logged. 
LogStreamConsumer(Severity reportableSeverity, Severity severity) : LogStreamConsumerBase(severityOstream(severity), severityPrefix(severity), severity <= reportableSeverity), std::ostream(&mBuffer) // links the stream buffer with the stream , mShouldLog(severity <= reportableSeverity), mSeverity(severity) {} LogStreamConsumer(LogStreamConsumer&& other) : LogStreamConsumerBase(severityOstream(other.mSeverity), severityPrefix(other.mSeverity), other.mShouldLog), std::ostream(&mBuffer) // links the stream buffer with the stream , mShouldLog(other.mShouldLog), mSeverity(other.mSeverity) {} void setReportableSeverity(Severity reportableSeverity) { mShouldLog = mSeverity <= reportableSeverity; mBuffer.setShouldLog(mShouldLog); } private: static std::ostream& severityOstream(Severity severity) { return severity >= Severity::kINFO ? std::cout : std::cerr; } static std::string severityPrefix(Severity severity) { switch (severity) { case Severity::kINTERNAL_ERROR: return "[F] "; case Severity::kERROR: return "[E] "; case Severity::kWARNING: return "[W] "; case Severity::kINFO: return "[I] "; case Severity::kVERBOSE: return "[V] "; default: assert(0); return ""; } } bool mShouldLog; Severity mSeverity; }; //! \class Logger //! //! \brief Class which manages logging of TensorRT tools and samples //! //! \details This class provides a common interface for TensorRT tools and samples to log information to the console, //! and supports logging two types of messages: //! //! - Debugging messages with an associated severity (info, warning, error, or internal error/fatal) //! - Test pass/fail messages //! //! The advantage of having all samples use this class for logging as opposed to emitting directly to stdout/stderr is //! that the logic for controlling the verbosity and formatting of sample output is centralized in one location. //! //! In the future, this class could be extended to support dumping test results to a file in some standard format //! 
(for example, JUnit XML), and providing additional metadata (e.g. timing the duration of a test run). //! //! TODO: For backwards compatibility with existing samples, this class inherits directly from the nvinfer1::ILogger //! interface, which is problematic since there isn't a clean separation between messages coming from the TensorRT //! library and messages coming from the sample. //! //! In the future (once all samples are updated to use Logger::getTRTLogger() to access the ILogger) we can refactor the //! class to eliminate the inheritance and instead make the nvinfer1::ILogger implementation a member of the Logger //! object. class Logger : public nvinfer1::ILogger { public: Logger(Severity severity = Severity::kWARNING) : mReportableSeverity(severity) {} //! //! \enum TestResult //! \brief Represents the state of a given test //! enum class TestResult { kRUNNING, //!< The test is running kPASSED, //!< The test passed kFAILED, //!< The test failed kWAIVED //!< The test was waived }; //! //! \brief Forward-compatible method for retrieving the nvinfer::ILogger associated with this Logger //! \return The nvinfer1::ILogger associated with this Logger //! //! TODO Once all samples are updated to use this method to register the logger with TensorRT, //! we can eliminate the inheritance of Logger from ILogger //! nvinfer1::ILogger& getTRTLogger() { return *this; } //! //! \brief Implementation of the nvinfer1::ILogger::log() virtual method //! //! Note samples should not be calling this function directly; it will eventually go away once we eliminate the //! inheritance from nvinfer1::ILogger //! void log(Severity severity, const char* msg) TRT_NOEXCEPT override { LogStreamConsumer(mReportableSeverity, severity) << "[TRT] " << std::string(msg) << std::endl; } //! //! \brief Method for controlling the verbosity of logging output //! //! \param severity The logger will only emit messages that have severity of this level or higher. //! 
void setReportableSeverity(Severity severity) { mReportableSeverity = severity; } //! //! \brief Opaque handle that holds logging information for a particular test //! //! This object is an opaque handle to information used by the Logger to print test results. //! The sample must call Logger::defineTest() in order to obtain a TestAtom that can be used //! with Logger::reportTest{Start,End}(). //! class TestAtom { public: TestAtom(TestAtom&&) = default; private: friend class Logger; TestAtom(bool started, const std::string& name, const std::string& cmdline) : mStarted(started), mName(name), mCmdline(cmdline) {} bool mStarted; std::string mName; std::string mCmdline; }; //! //! \brief Define a test for logging //! //! \param[in] name The name of the test. This should be a string starting with //! "TensorRT" and containing dot-separated strings containing //! the characters [A-Za-z0-9_]. //! For example, "TensorRT.sample_googlenet" //! \param[in] cmdline The command line used to reproduce the test // //! \return a TestAtom that can be used in Logger::reportTest{Start,End}(). //! static TestAtom defineTest(const std::string& name, const std::string& cmdline) { return TestAtom(false, name, cmdline); } //! //! \brief A convenience overloaded version of defineTest() that accepts an array of command-line arguments //! as input //! //! \param[in] name The name of the test //! \param[in] argc The number of command-line arguments //! \param[in] argv The array of command-line arguments (given as C strings) //! //! \return a TestAtom that can be used in Logger::reportTest{Start,End}(). static TestAtom defineTest(const std::string& name, int argc, char const* const* argv) { auto cmdline = genCmdlineString(argc, argv); return defineTest(name, cmdline); } //! //! \brief Report that a test has started. //! //! \pre reportTestStart() has not been called yet for the given testAtom //! //! \param[in] testAtom The handle to the test that has started //! 
static void reportTestStart(TestAtom& testAtom) { reportTestResult(testAtom, TestResult::kRUNNING); assert(!testAtom.mStarted); testAtom.mStarted = true; } //! //! \brief Report that a test has ended. //! //! \pre reportTestStart() has been called for the given testAtom //! //! \param[in] testAtom The handle to the test that has ended //! \param[in] result The result of the test. Should be one of TestResult::kPASSED, //! TestResult::kFAILED, TestResult::kWAIVED //! static void reportTestEnd(const TestAtom& testAtom, TestResult result) { assert(result != TestResult::kRUNNING); assert(testAtom.mStarted); reportTestResult(testAtom, result); } static int reportPass(const TestAtom& testAtom) { reportTestEnd(testAtom, TestResult::kPASSED); return EXIT_SUCCESS; } static int reportFail(const TestAtom& testAtom) { reportTestEnd(testAtom, TestResult::kFAILED); return EXIT_FAILURE; } static int reportWaive(const TestAtom& testAtom) { reportTestEnd(testAtom, TestResult::kWAIVED); return EXIT_SUCCESS; } static int reportTest(const TestAtom& testAtom, bool pass) { return pass ? reportPass(testAtom) : reportFail(testAtom); } Severity getReportableSeverity() const { return mReportableSeverity; } private: //! //! \brief returns an appropriate string for prefixing a log message with the given severity //! static const char* severityPrefix(Severity severity) { switch (severity) { case Severity::kINTERNAL_ERROR: return "[F] "; case Severity::kERROR: return "[E] "; case Severity::kWARNING: return "[W] "; case Severity::kINFO: return "[I] "; case Severity::kVERBOSE: return "[V] "; default: assert(0); return ""; } } //! //! \brief returns an appropriate string for prefixing a test result message with the given result //! 
static const char* testResultString(TestResult result) { switch (result) { case TestResult::kRUNNING: return "RUNNING"; case TestResult::kPASSED: return "PASSED"; case TestResult::kFAILED: return "FAILED"; case TestResult::kWAIVED: return "WAIVED"; default: assert(0); return ""; } } //! //! \brief returns an appropriate output stream (cout or cerr) to use with the given severity //! static std::ostream& severityOstream(Severity severity) { return severity >= Severity::kINFO ? std::cout : std::cerr; } //! //! \brief method that implements logging test results //! static void reportTestResult(const TestAtom& testAtom, TestResult result) { severityOstream(Severity::kINFO) << "&&&& " << testResultString(result) << " " << testAtom.mName << " # " << testAtom.mCmdline << std::endl; } //! //! \brief generate a command line string from the given (argc, argv) values //! static std::string genCmdlineString(int argc, char const* const* argv) { std::stringstream ss; for (int i = 0; i < argc; i++) { if (i > 0) ss << " "; ss << argv[i]; } return ss.str(); } Severity mReportableSeverity; }; namespace { //! //! \brief produces a LogStreamConsumer object that can be used to log messages of severity kVERBOSE //! //! Example usage: //! //! LOG_VERBOSE(logger) << "hello world" << std::endl; //! inline LogStreamConsumer LOG_VERBOSE(const Logger& logger) { return LogStreamConsumer(logger.getReportableSeverity(), Severity::kVERBOSE); } //! //! \brief produces a LogStreamConsumer object that can be used to log messages of severity kINFO //! //! Example usage: //! //! LOG_INFO(logger) << "hello world" << std::endl; //! inline LogStreamConsumer LOG_INFO(const Logger& logger) { return LogStreamConsumer(logger.getReportableSeverity(), Severity::kINFO); } //! //! \brief produces a LogStreamConsumer object that can be used to log messages of severity kWARNING //! //! Example usage: //! //! LOG_WARN(logger) << "hello world" << std::endl; //! 
inline LogStreamConsumer LOG_WARN(const Logger& logger) { return LogStreamConsumer(logger.getReportableSeverity(), Severity::kWARNING); } //! //! \brief produces a LogStreamConsumer object that can be used to log messages of severity kERROR //! //! Example usage: //! //! LOG_ERROR(logger) << "hello world" << std::endl; //! inline LogStreamConsumer LOG_ERROR(const Logger& logger) { return LogStreamConsumer(logger.getReportableSeverity(), Severity::kERROR); } //! //! \brief produces a LogStreamConsumer object that can be used to log messages of severity kINTERNAL_ERROR // ("fatal" severity) //! //! Example usage: //! //! LOG_FATAL(logger) << "hello world" << std::endl; //! inline LogStreamConsumer LOG_FATAL(const Logger& logger) { return LogStreamConsumer(logger.getReportableSeverity(), Severity::kINTERNAL_ERROR); } } // anonymous namespace #endif // TENSORRT_LOGGING_H ================================================ FILE: yolov8/include/macros.h ================================================ #ifndef __MACROS_H #define __MACROS_H #include "NvInfer.h" #ifdef API_EXPORTS #if defined(_MSC_VER) #define API __declspec(dllexport) #else #define API __attribute__((visibility("default"))) #endif #else #if defined(_MSC_VER) #define API __declspec(dllimport) #else #define API #endif #endif // API_EXPORTS #if NV_TENSORRT_MAJOR >= 8 #define TRT_NOEXCEPT noexcept #define TRT_CONST_ENQUEUE const #else #define TRT_NOEXCEPT #define TRT_CONST_ENQUEUE #endif #endif // __MACROS_H ================================================ FILE: yolov8/include/model.h ================================================ #pragma once #include #include #include "NvInfer.h" nvinfer1::IHostMemory* buildEngineYolov8Det(nvinfer1::IBuilder* builder, nvinfer1::IBuilderConfig* config, nvinfer1::DataType dt, const std::string& wts_path, float& gd, float& gw, int& max_channels); nvinfer1::IHostMemory* buildEngineYolov8DetP6(nvinfer1::IBuilder* builder, nvinfer1::IBuilderConfig* config, nvinfer1::DataType dt, 
const std::string& wts_path, float& gd, float& gw, int& max_channels); nvinfer1::IHostMemory* buildEngineYolov8DetP2(nvinfer1::IBuilder* builder, nvinfer1::IBuilderConfig* config, nvinfer1::DataType dt, const std::string& wts_path, float& gd, float& gw, int& max_channels); nvinfer1::IHostMemory* buildEngineYolov8Cls(nvinfer1::IBuilder* builder, nvinfer1::IBuilderConfig* config, nvinfer1::DataType dt, const std::string& wts_path, float& gd, float& gw); nvinfer1::IHostMemory* buildEngineYolov8Seg(nvinfer1::IBuilder* builder, nvinfer1::IBuilderConfig* config, nvinfer1::DataType dt, const std::string& wts_path, float& gd, float& gw, int& max_channels); nvinfer1::IHostMemory* buildEngineYolov8Pose(nvinfer1::IBuilder* builder, nvinfer1::IBuilderConfig* config, nvinfer1::DataType dt, const std::string& wts_path, float& gd, float& gw, int& max_channels); nvinfer1::IHostMemory* buildEngineYolov8PoseP6(nvinfer1::IBuilder* builder, nvinfer1::IBuilderConfig* config, nvinfer1::DataType dt, const std::string& wts_path, float& gd, float& gw, int& max_channels); nvinfer1::IHostMemory* buildEngineYolov8_5uDet(nvinfer1::IBuilder* builder, nvinfer1::IBuilderConfig* config, nvinfer1::DataType dt, const std::string& wts_path, float& gd, float& gw, int& max_channels); nvinfer1::IHostMemory* buildEngineYolov8_5uDetP6(nvinfer1::IBuilder* builder, nvinfer1::IBuilderConfig* config, nvinfer1::DataType dt, const std::string& wts_path, float& gd, float& gw, int& max_channels); nvinfer1::IHostMemory* buildEngineYolov8Obb(nvinfer1::IBuilder* builder, nvinfer1::IBuilderConfig* config, nvinfer1::DataType dt, const std::string& wts_path, float& gd, float& gw, int& max_channels); ================================================ FILE: yolov8/include/postprocess.h ================================================ #pragma once #include #include "NvInfer.h" #include "types.h" // Preprocessing functions cv::Rect get_rect(cv::Mat& img, float bbox[4]); // Processing functions void 
batch_process(std::vector>& res_batch, const float* decode_ptr_host, int batch_size, int bbox_element, const std::vector& img_batch); void batch_process_obb(std::vector>& res_batch, const float* decode_ptr_host, int batch_size, int bbox_element, const std::vector& img_batch); void process_decode_ptr_host(std::vector& res, const float* decode_ptr_host, int bbox_element, cv::Mat& img, int count); void process_decode_ptr_host_obb(std::vector& res, const float* decode_ptr_host, int bbox_element, cv::Mat& img, int count); // NMS functions void nms(std::vector& res, float* output, float conf_thresh, float nms_thresh = 0.5); void batch_nms(std::vector>& batch_res, float* output, int batch_size, int output_size, float conf_thresh, float nms_thresh = 0.5); void nms_obb(std::vector& res, float* output, float conf_thresh, float nms_thresh = 0.5); void batch_nms_obb(std::vector>& batch_res, float* output, int batch_size, int output_size, float conf_thresh, float nms_thresh = 0.5); // CUDA-related functions void cuda_decode(float* predict, int num_bboxes, float confidence_threshold, float* parray, int max_objects, cudaStream_t stream); void cuda_nms(float* parray, float nms_threshold, int max_objects, cudaStream_t stream); void cuda_decode_obb(float* predict, int num_bboxes, float confidence_threshold, float* parray, int max_objects, cudaStream_t stream); void cuda_nms_obb(float* parray, float nms_threshold, int max_objects, cudaStream_t stream); // Drawing functions void draw_bbox(std::vector& img_batch, std::vector>& res_batch); void draw_bbox_obb(std::vector& img_batch, std::vector>& res_batch); void draw_bbox_keypoints_line(std::vector& img_batch, std::vector>& res_batch); void draw_mask_bbox(cv::Mat& img, std::vector& dets, std::vector& masks, std::unordered_map& labels_map); ================================================ FILE: yolov8/include/preprocess.h ================================================ #pragma once #include #include #include "NvInfer.h" #include 
"types.h" void cuda_preprocess_init(int max_image_size); void cuda_preprocess_destroy(); void cuda_preprocess(uint8_t* src, int src_width, int src_height, float* dst, int dst_width, int dst_height, cudaStream_t stream); void cuda_batch_preprocess(std::vector& img_batch, float* dst, int dst_width, int dst_height, cudaStream_t stream); ================================================ FILE: yolov8/include/types.h ================================================ #pragma once #include "config.h" struct alignas(float) Detection { //center_x center_y w h float bbox[4]; float conf; // bbox_conf * cls_conf float class_id; float mask[32]; float keypoints[kNumberOfPoints * 3]; // keypoints array with dynamic size based on kNumberOfPoints float angle; // obb angle }; struct AffineMatrix { float value[6]; }; const int bbox_element = sizeof(AffineMatrix) / sizeof(float) + 1; // left, top, right, bottom, confidence, class, keepflag ================================================ FILE: yolov8/include/utils.h ================================================ #pragma once #include #include #include static inline cv::Mat preprocess_img(cv::Mat& img, int input_w, int input_h) { int w, h, x, y; float r_w = input_w / (img.cols * 1.0); float r_h = input_h / (img.rows * 1.0); if (r_h > r_w) { w = input_w; h = r_w * img.rows; x = 0; y = (input_h - h) / 2; } else { w = r_h * img.cols; h = input_h; x = (input_w - w) / 2; y = 0; } cv::Mat re(h, w, CV_8UC3); cv::resize(img, re, re.size(), 0, 0, cv::INTER_LINEAR); cv::Mat out(input_h, input_w, CV_8UC3, cv::Scalar(128, 128, 128)); re.copyTo(out(cv::Rect(x, y, re.cols, re.rows))); return out; } static inline int read_files_in_dir(const char* p_dir_name, std::vector& file_names) { DIR* p_dir = opendir(p_dir_name); if (p_dir == nullptr) { return -1; } struct dirent* p_file = nullptr; while ((p_file = readdir(p_dir)) != nullptr) { if (strcmp(p_file->d_name, ".") != 0 && strcmp(p_file->d_name, "..") != 0) { //std::string cur_file_name(p_dir_name); 
//cur_file_name += "/"; //cur_file_name += p_file->d_name; std::string cur_file_name(p_file->d_name); file_names.push_back(cur_file_name); } } closedir(p_dir); return 0; } // Function to trim leading and trailing whitespace from a string static inline std::string trim_leading_whitespace(const std::string& str) { size_t first = str.find_first_not_of(' '); if (std::string::npos == first) { return str; } size_t last = str.find_last_not_of(' '); return str.substr(first, (last - first + 1)); } // Src: https://stackoverflow.com/questions/16605967 static inline std::string to_string_with_precision(const float a_value, const int n = 2) { std::ostringstream out; out.precision(n); out << std::fixed << a_value; return out.str(); } static inline int read_labels(const std::string labels_filename, std::unordered_map& labels_map) { std::ifstream file(labels_filename); // Read each line of the file std::string line; int index = 0; while (std::getline(file, line)) { // Strip the line of any leading or trailing whitespace line = trim_leading_whitespace(line); // Add the stripped line to the labels_map, using the loop index as the key labels_map[index] = line; index++; } // Close the file file.close(); return 0; } ================================================ FILE: yolov8/plugin/yololayer.cu ================================================ #include #include #include #include #include "cuda_utils.h" #include "types.h" #include "yololayer.h" namespace Tn { template void write(char*& buffer, const T& val) { *reinterpret_cast(buffer) = val; buffer += sizeof(T); } template void read(const char*& buffer, T& val) { val = *reinterpret_cast(buffer); buffer += sizeof(T); } } // namespace Tn __device__ float sigmoid(float x) { return 1.0f / (1.0f + exp(-x)); } namespace nvinfer1 { YoloLayerPlugin::YoloLayerPlugin(int classCount, int numberofpoints, float confthreshkeypoints, int netWidth, int netHeight, int maxOut, bool is_segmentation, bool is_pose, bool is_obb, const int* strides, int 
stridesLength) { mClassCount = classCount; mNumberofpoints = numberofpoints; mConfthreshkeypoints = confthreshkeypoints; mYoloV8NetWidth = netWidth; mYoloV8netHeight = netHeight; mMaxOutObject = maxOut; mStridesLength = stridesLength; mStrides = new int[stridesLength]; memcpy(mStrides, strides, stridesLength * sizeof(int)); is_segmentation_ = is_segmentation; is_pose_ = is_pose; is_obb_ = is_obb; } YoloLayerPlugin::~YoloLayerPlugin() { if (mStrides != nullptr) { delete[] mStrides; mStrides = nullptr; } } YoloLayerPlugin::YoloLayerPlugin(const void* data, size_t length) { using namespace Tn; const char *d = reinterpret_cast(data), *a = d; read(d, mClassCount); read(d, mNumberofpoints); read(d, mConfthreshkeypoints); read(d, mThreadCount); read(d, mYoloV8NetWidth); read(d, mYoloV8netHeight); read(d, mMaxOutObject); read(d, mStridesLength); mStrides = new int[mStridesLength]; for (int i = 0; i < mStridesLength; ++i) { read(d, mStrides[i]); } read(d, is_segmentation_); read(d, is_pose_); read(d, is_obb_); assert(d == a + length); } void YoloLayerPlugin::serialize(void* buffer) const TRT_NOEXCEPT { using namespace Tn; char *d = static_cast(buffer), *a = d; write(d, mClassCount); write(d, mNumberofpoints); write(d, mConfthreshkeypoints); write(d, mThreadCount); write(d, mYoloV8NetWidth); write(d, mYoloV8netHeight); write(d, mMaxOutObject); write(d, mStridesLength); for (int i = 0; i < mStridesLength; ++i) { write(d, mStrides[i]); } write(d, is_segmentation_); write(d, is_pose_); write(d, is_obb_); assert(d == a + getSerializationSize()); } size_t YoloLayerPlugin::getSerializationSize() const TRT_NOEXCEPT { return sizeof(mClassCount) + sizeof(mNumberofpoints) + sizeof(mConfthreshkeypoints) + sizeof(mThreadCount) + sizeof(mYoloV8netHeight) + sizeof(mYoloV8NetWidth) + sizeof(mMaxOutObject) + sizeof(mStridesLength) + sizeof(int) * mStridesLength + sizeof(is_segmentation_) + sizeof(is_pose_) + sizeof(is_obb_); } int YoloLayerPlugin::initialize() TRT_NOEXCEPT { return 0; } 
nvinfer1::Dims YoloLayerPlugin::getOutputDimensions(int index, const nvinfer1::Dims* inputs, int nbInputDims) TRT_NOEXCEPT { int total_size = mMaxOutObject * sizeof(Detection) / sizeof(float); return nvinfer1::Dims3(total_size + 1, 1, 1); } void YoloLayerPlugin::setPluginNamespace(const char* pluginNamespace) TRT_NOEXCEPT { mPluginNamespace = pluginNamespace; } const char* YoloLayerPlugin::getPluginNamespace() const TRT_NOEXCEPT { return mPluginNamespace; } nvinfer1::DataType YoloLayerPlugin::getOutputDataType(int index, const nvinfer1::DataType* inputTypes, int nbInputs) const TRT_NOEXCEPT { return nvinfer1::DataType::kFLOAT; } bool YoloLayerPlugin::isOutputBroadcastAcrossBatch(int outputIndex, const bool* inputIsBroadcasted, int nbInputs) const TRT_NOEXCEPT { return false; } bool YoloLayerPlugin::canBroadcastInputAcrossBatch(int inputIndex) const TRT_NOEXCEPT { return false; } void YoloLayerPlugin::configurePlugin(nvinfer1::PluginTensorDesc const* in, int nbInput, nvinfer1::PluginTensorDesc const* out, int nbOutput) TRT_NOEXCEPT{}; void YoloLayerPlugin::attachToContext(cudnnContext* cudnnContext, cublasContext* cublasContext, IGpuAllocator* gpuAllocator) TRT_NOEXCEPT{}; void YoloLayerPlugin::detachFromContext() TRT_NOEXCEPT {} const char* YoloLayerPlugin::getPluginType() const TRT_NOEXCEPT { return "YoloLayer_TRT"; } const char* YoloLayerPlugin::getPluginVersion() const TRT_NOEXCEPT { return "1"; } void YoloLayerPlugin::destroy() TRT_NOEXCEPT { delete this; } nvinfer1::IPluginV2IOExt* YoloLayerPlugin::clone() const TRT_NOEXCEPT { YoloLayerPlugin* p = new YoloLayerPlugin(mClassCount, mNumberofpoints, mConfthreshkeypoints, mYoloV8NetWidth, mYoloV8netHeight, mMaxOutObject, is_segmentation_, is_pose_, is_obb_, mStrides, mStridesLength); p->setPluginNamespace(mPluginNamespace); return p; } int YoloLayerPlugin::enqueue(int batchSize, const void* TRT_CONST_ENQUEUE* inputs, void* const* outputs, void* workspace, cudaStream_t stream) TRT_NOEXCEPT { forwardGpu((const 
float* const*)inputs, (float*)outputs[0], stream, mYoloV8netHeight, mYoloV8NetWidth, batchSize);
    return 0;
}

// Logistic function 1 / (1 + e^-x), single precision.
__device__ float Logist(float data) {
    return 1.0f / (1.0f + expf(-data));
}

// Decode one detection head. One thread handles one grid cell of one image.
//
// Input layout (per image): channel-major, i.e. channel c of cell e lives at
// curInput[e + c * total_grid]. Channels 0..3 are box distances, followed by
// `classes` class logits and then the optional mask / keypoint / angle channels.
//
// Output layout (per image, `outputElem` floats): output[0] is a float counter
// incremented atomically, followed by packed Detection records.
//
// NOTE(review): the extra-channel offsets below are only mutually consistent
// when at most one of is_segmentation / is_pose / is_obb is set - confirm
// against the callers.
// NOTE(review): the counter keeps growing past maxoutobject (the record is
// simply not written), so consumers presumably have to clamp it - confirm.
__global__ void CalDetection(const float* input, float* output, int numElements, int maxoutobject, const int grid_h,
                             int grid_w, const int stride, int classes, int nk, float confkeypoints, int outputElem,
                             bool is_segmentation, bool is_pose, bool is_obb) {
    int idx = threadIdx.x + blockDim.x * blockIdx.x;
    if (idx >= numElements)
        return;

    const int N_kpts = nk;
    int total_grid = grid_h * grid_w;
    int info_len = 4 + classes + (is_segmentation ? 32 : 0) + (is_pose ? N_kpts * 3 : 0) + (is_obb ? 1 : 0);
    int batchIdx = idx / total_grid;
    int elemIdx = idx % total_grid;
    const float* curInput = input + batchIdx * total_grid * info_len;
    int outputIdx = batchIdx * outputElem;

    // Best class = arg-max over the sigmoid of the class logits.
    int class_id = 0;
    float max_cls_prob = 0.0;
    for (int i = 4; i < 4 + classes; i++) {
        float p = Logist(curInput[elemIdx + i * total_grid]);
        if (p > max_cls_prob) {
            max_cls_prob = p;
            class_id = i - 4;
        }
    }

    // Coarse pre-filter with a fixed 0.1 score cut-off.
    if (max_cls_prob < 0.1)
        return;

    // Reserve a slot in this image's output by bumping its counter.
    int count = (int)atomicAdd(output + outputIdx, 1);
    if (count >= maxoutobject)
        return;
    char* data = (char*)(output + outputIdx) + sizeof(float) + count * sizeof(Detection);
    Detection* det = (Detection*)(data);

    int row = elemIdx / grid_w;
    int col = elemIdx % grid_w;

    det->conf = max_cls_prob;
    det->class_id = class_id;
    // Cell centre minus/plus the four predicted distances, scaled to input pixels.
    det->bbox[0] = (col + 0.5f - curInput[elemIdx + 0 * total_grid]) * stride;
    det->bbox[1] = (row + 0.5f - curInput[elemIdx + 1 * total_grid]) * stride;
    det->bbox[2] = (col + 0.5f + curInput[elemIdx + 2 * total_grid]) * stride;
    det->bbox[3] = (row + 0.5f + curInput[elemIdx + 3 * total_grid]) * stride;

    if (is_segmentation) {
        // Copy the 32 mask coefficients verbatim.
        for (int k = 0; k < 32; ++k) {
            det->mask[k] =
                    curInput[elemIdx +
                             (4 + classes + (is_pose ? N_kpts * 3 : 0) + (is_obb ? 1 : 0) + k) * total_grid];
        }
    }

    if (is_pose) {
        for (int kpt = 0; kpt < N_kpts; kpt++) {
            int kpt_x_idx = (4 + classes + (is_segmentation ? 32 : 0) + (is_obb ? 1 : 0) + kpt * 3) * total_grid;
            int kpt_y_idx = (4 + classes + (is_segmentation ? 32 : 0) + (is_obb ? 1 : 0) + kpt * 3 + 1) * total_grid;
            int kpt_conf_idx =
                    (4 + classes + (is_segmentation ? 32 : 0) + (is_obb ? 1 : 0) + kpt * 3 + 2) * total_grid;

            float kpt_confidence = sigmoid(curInput[elemIdx + kpt_conf_idx]);

            float kpt_x = (curInput[elemIdx + kpt_x_idx] * 2.0 + col) * stride;
            float kpt_y = (curInput[elemIdx + kpt_y_idx] * 2.0 + row) * stride;

            bool is_within_bbox =
                    kpt_x >= det->bbox[0] && kpt_x <= det->bbox[2] && kpt_y >= det->bbox[1] && kpt_y <= det->bbox[3];

            // Low-confidence keypoints and keypoints outside the box are marked with -1.
            if (kpt_confidence < confkeypoints || !is_within_bbox) {
                det->keypoints[kpt * 3] = -1;
                det->keypoints[kpt * 3 + 1] = -1;
                det->keypoints[kpt * 3 + 2] = -1;
            } else {
                det->keypoints[kpt * 3] = kpt_x;
                det->keypoints[kpt * 3 + 1] = kpt_y;
                det->keypoints[kpt * 3 + 2] = kpt_confidence;
            }
        }
    }

    if (is_obb) {
        // Oriented boxes overwrite bbox with (cx, cy, w, h) and store the angle.
        double pi = M_PI;
        auto angle_inx = curInput[elemIdx + (4 + classes + (is_segmentation ? 32 : 0) + (is_pose ? N_kpts * 3 : 0) +
                                             0) * total_grid];
        auto angle = (sigmoid(angle_inx) - 0.25f) * pi;

        auto cos1 = cos(angle);
        auto sin1 = sin(angle);
        // Offset of the box centre from the cell centre, rotated by the angle.
        auto xf = (curInput[elemIdx + 2 * total_grid] - curInput[elemIdx + 0 * total_grid]) / 2;
        auto yf = (curInput[elemIdx + 3 * total_grid] - curInput[elemIdx + 1 * total_grid]) / 2;

        auto x = xf * cos1 - yf * sin1;
        auto y = xf * sin1 + yf * cos1;

        float cx = (col + 0.5f + x) * stride;
        float cy = (row + 0.5f + y) * stride;

        float w1 = (curInput[elemIdx + 0 * total_grid] + curInput[elemIdx + 2 * total_grid]) * stride;
        float h1 = (curInput[elemIdx + 1 * total_grid] + curInput[elemIdx + 3 * total_grid]) * stride;
        det->bbox[0] = cx;
        det->bbox[1] = cy;
        det->bbox[2] = w1;
        det->bbox[3] = h1;
        det->angle = angle;
    }
}

// Launch CalDetection once per stride (detection head) on `stream`.
//
// Each image's counter float is reset first; inputs[i] is the head for
// mStrides[i], whose grid is (netHeight / stride) x (netWidth / stride).
void YoloLayerPlugin::forwardGpu(const float* const* inputs, float* output, cudaStream_t stream, int mYoloV8netHeight,
                                 int mYoloV8NetWidth, int batchSize) {
    int outputElem = 1 + mMaxOutObject * sizeof(Detection) / sizeof(float);

    // Zero the detection counter at the start of every image's output slice.
    for (int idx = 0; idx < batchSize; ++idx) {
        CUDA_CHECK(cudaMemsetAsync(output + idx * outputElem, 0, sizeof(float), stream));
    }

    for (int i = 0; i < mStridesLength; ++i) {
        int stride = mStrides[i];
        int grid_h = mYoloV8netHeight / stride;
        int grid_w = mYoloV8NetWidth / stride;
        int numElem = grid_h * grid_w * batchSize;
        if (numElem <= 0) {
            // Nothing to decode; a zero-sized launch would also divide by zero below.
            continue;
        }

        // Use a local block size: shrinking the mThreadCount member would stick
        // for all later launches and would be written out by serialize().
        int threads = numElem < mThreadCount ? numElem : mThreadCount;
        int blocks = (numElem + threads - 1) / threads;

        CalDetection<<<blocks, threads, 0, stream>>>(inputs[i], output, numElem, mMaxOutObject, grid_h, grid_w, stride,
                                                     mClassCount, mNumberofpoints, mConfthreshkeypoints, outputElem,
                                                     is_segmentation_, is_pose_, is_obb_);
    }
}

PluginFieldCollection YoloPluginCreator::mFC{};
std::vector<PluginField> YoloPluginCreator::mPluginAttributes;

// The creator advertises no plugin fields through getFieldNames().
YoloPluginCreator::YoloPluginCreator() {
    mPluginAttributes.clear();
    mFC.nbFields = mPluginAttributes.size();
    mFC.fields = mPluginAttributes.data();
}

const char* YoloPluginCreator::getPluginName() const TRT_NOEXCEPT {
    return "YoloLayer_TRT";
}

const char* YoloPluginCreator::getPluginVersion() const TRT_NOEXCEPT {
    return "1";
}

const PluginFieldCollection* YoloPluginCreator::getFieldNames() TRT_NOEXCEPT {
    return &mFC;
}

// Build a plugin from a single "combinedInfo" field: nine leading ints of
// network/plugin settings followed by the stride table.
IPluginV2IOExt* YoloPluginCreator::createPlugin(const char* name, const PluginFieldCollection* fc) TRT_NOEXCEPT {
    assert(fc->nbFields == 1);
    assert(strcmp(fc->fields[0].name, "combinedInfo") == 0);
    const int* combinedInfo = static_cast<const int*>(fc->fields[0].data);
    int
netinfo_count = 9; int class_count = combinedInfo[0]; int numberofpoints = combinedInfo[1]; float confthreshkeypoints = combinedInfo[2]; int input_w = combinedInfo[3]; int input_h = combinedInfo[4]; int max_output_object_count = combinedInfo[5]; bool is_segmentation = combinedInfo[6]; bool is_pose = combinedInfo[7]; bool is_obb = combinedInfo[8]; const int* px_arry = combinedInfo + netinfo_count; int px_arry_length = fc->fields[0].length - netinfo_count; YoloLayerPlugin* obj = new YoloLayerPlugin(class_count, numberofpoints, confthreshkeypoints, input_w, input_h, max_output_object_count, is_segmentation, is_pose, is_obb, px_arry, px_arry_length); obj->setPluginNamespace(mNamespace.c_str()); return obj; } IPluginV2IOExt* YoloPluginCreator::deserializePlugin(const char* name, const void* serialData, size_t serialLength) TRT_NOEXCEPT { // This object will be deleted when the network is destroyed, which will // call YoloLayerPlugin::destroy() YoloLayerPlugin* obj = new YoloLayerPlugin(serialData, serialLength); obj->setPluginNamespace(mNamespace.c_str()); return obj; } } // namespace nvinfer1 ================================================ FILE: yolov8/plugin/yololayer.h ================================================ #pragma once #include #include #include "NvInfer.h" #include "macros.h" namespace nvinfer1 { class API YoloLayerPlugin : public IPluginV2IOExt { public: YoloLayerPlugin(int classCount, int numberofpoints, float confthreshkeypoints, int netWidth, int netHeight, int maxOut, bool is_segmentation, bool is_pose, bool is_obb, const int* strides, int stridesLength); YoloLayerPlugin(const void* data, size_t length); ~YoloLayerPlugin(); int getNbOutputs() const TRT_NOEXCEPT override { return 1; } nvinfer1::Dims getOutputDimensions(int index, const nvinfer1::Dims* inputs, int nbInputDims) TRT_NOEXCEPT override; int initialize() TRT_NOEXCEPT override; virtual void terminate() TRT_NOEXCEPT override {} virtual size_t getWorkspaceSize(int maxBatchSize) const 
TRT_NOEXCEPT override { return 0; } virtual int enqueue(int batchSize, const void* const* inputs, void* TRT_CONST_ENQUEUE* outputs, void* workspace, cudaStream_t stream) TRT_NOEXCEPT override; virtual size_t getSerializationSize() const TRT_NOEXCEPT override; virtual void serialize(void* buffer) const TRT_NOEXCEPT override; bool supportsFormatCombination(int pos, const PluginTensorDesc* inOut, int nbInputs, int nbOutputs) const TRT_NOEXCEPT override { return inOut[pos].format == TensorFormat::kLINEAR && inOut[pos].type == DataType::kFLOAT; } const char* getPluginType() const TRT_NOEXCEPT override; const char* getPluginVersion() const TRT_NOEXCEPT override; void destroy() TRT_NOEXCEPT override; IPluginV2IOExt* clone() const TRT_NOEXCEPT override; void setPluginNamespace(const char* pluginNamespace) TRT_NOEXCEPT override; const char* getPluginNamespace() const TRT_NOEXCEPT override; nvinfer1::DataType getOutputDataType(int32_t index, nvinfer1::DataType const* inputTypes, int32_t nbInputs) const TRT_NOEXCEPT; bool isOutputBroadcastAcrossBatch(int outputIndex, const bool* inputIsBroadcasted, int nbInputs) const TRT_NOEXCEPT override; bool canBroadcastInputAcrossBatch(int inputIndex) const TRT_NOEXCEPT override; void attachToContext(cudnnContext* cudnnContext, cublasContext* cublasContext, IGpuAllocator* gpuAllocator) TRT_NOEXCEPT override; void configurePlugin(PluginTensorDesc const* in, int32_t nbInput, PluginTensorDesc const* out, int32_t nbOutput) TRT_NOEXCEPT override; void detachFromContext() TRT_NOEXCEPT override; private: void forwardGpu(const float* const* inputs, float* output, cudaStream_t stream, int mYoloV8netHeight, int mYoloV8NetWidth, int batchSize); int mThreadCount = 256; const char* mPluginNamespace; int mClassCount; int mNumberofpoints; float mConfthreshkeypoints; int mYoloV8NetWidth; int mYoloV8netHeight; int mMaxOutObject; bool is_segmentation_; bool is_pose_; bool is_obb_; int* mStrides; int mStridesLength; }; class API YoloPluginCreator : public 
IPluginCreator { public: YoloPluginCreator(); ~YoloPluginCreator() override = default; const char* getPluginName() const TRT_NOEXCEPT override; const char* getPluginVersion() const TRT_NOEXCEPT override; const nvinfer1::PluginFieldCollection* getFieldNames() TRT_NOEXCEPT override; nvinfer1::IPluginV2IOExt* createPlugin(const char* name, const nvinfer1::PluginFieldCollection* fc) TRT_NOEXCEPT override; nvinfer1::IPluginV2IOExt* deserializePlugin(const char* name, const void* serialData, size_t serialLength) TRT_NOEXCEPT override; void setPluginNamespace(const char* libNamespace) TRT_NOEXCEPT override { mNamespace = libNamespace; } const char* getPluginNamespace() const TRT_NOEXCEPT override { return mNamespace.c_str(); } private: std::string mNamespace; static PluginFieldCollection mFC; static std::vector mPluginAttributes; }; REGISTER_TENSORRT_PLUGIN(YoloPluginCreator); } // namespace nvinfer1 ================================================ FILE: yolov8/src/block.cpp ================================================ #include "block.h" #include #include #include #include #include "config.h" #include "yololayer.h" int calculateP(int ksize) { return ksize / 3; } std::map loadWeights(const std::string file) { std::cout << "Loading weights: " << file << std::endl; std::map WeightMap; std::ifstream input(file); assert(input.is_open() && "Unable to load weight file. 
please check if the " ".wts file path is right!!!!!!"); int32_t count; input >> count; assert(count > 0 && "Invalid weight map file."); while (count--) { nvinfer1::Weights wt{nvinfer1::DataType::kFLOAT, nullptr, 0}; uint32_t size; std::string name; input >> name >> std::dec >> size; wt.type = nvinfer1::DataType::kFLOAT; uint32_t* val = reinterpret_cast(malloc(sizeof(val) * size)); for (uint32_t x = 0, y = size; x < y; x++) { input >> std::hex >> val[x]; } wt.values = val; wt.count = size; WeightMap[name] = wt; } return WeightMap; } static nvinfer1::IScaleLayer* addBatchNorm2d(nvinfer1::INetworkDefinition* network, std::map weightMap, nvinfer1::ITensor& input, std::string lname, float eps) { float* gamma = (float*)weightMap[lname + ".weight"].values; float* beta = (float*)weightMap[lname + ".bias"].values; float* mean = (float*)weightMap[lname + ".running_mean"].values; float* var = (float*)weightMap[lname + ".running_var"].values; int len = weightMap[lname + ".running_var"].count; float* scval = reinterpret_cast(malloc(sizeof(float) * len)); for (int i = 0; i < len; i++) { scval[i] = gamma[i] / sqrt(var[i] + eps); } nvinfer1::Weights scale{nvinfer1::DataType::kFLOAT, scval, len}; float* shval = reinterpret_cast(malloc(sizeof(float) * len)); for (int i = 0; i < len; i++) { shval[i] = beta[i] - mean[i] * gamma[i] / sqrt(var[i] + eps); } nvinfer1::Weights shift{nvinfer1::DataType::kFLOAT, shval, len}; float* pval = reinterpret_cast(malloc(sizeof(float) * len)); for (int i = 0; i < len; i++) { pval[i] = 1.0; } nvinfer1::Weights power{nvinfer1::DataType::kFLOAT, pval, len}; weightMap[lname + ".scale"] = scale; weightMap[lname + ".shift"] = shift; weightMap[lname + ".power"] = power; nvinfer1::IScaleLayer* output = network->addScale(input, nvinfer1::ScaleMode::kCHANNEL, shift, scale, power); assert(output); return output; } nvinfer1::IElementWiseLayer* convBnSiLU(nvinfer1::INetworkDefinition* network, std::map weightMap, nvinfer1::ITensor& input, int ch, int k, int s, 
int p, std::string lname) { nvinfer1::Weights bias_empty{nvinfer1::DataType::kFLOAT, nullptr, 0}; nvinfer1::IConvolutionLayer* conv = network->addConvolutionNd(input, ch, nvinfer1::DimsHW{k, k}, weightMap[lname + ".conv.weight"], bias_empty); assert(conv); conv->setStrideNd(nvinfer1::DimsHW{s, s}); conv->setPaddingNd(nvinfer1::DimsHW{p, p}); nvinfer1::IScaleLayer* bn = addBatchNorm2d(network, weightMap, *conv->getOutput(0), lname + ".bn", 1e-3); nvinfer1::IActivationLayer* sigmoid = network->addActivation(*bn->getOutput(0), nvinfer1::ActivationType::kSIGMOID); nvinfer1::IElementWiseLayer* ew = network->addElementWise(*bn->getOutput(0), *sigmoid->getOutput(0), nvinfer1::ElementWiseOperation::kPROD); assert(ew); return ew; } nvinfer1::ILayer* bottleneck(nvinfer1::INetworkDefinition* network, std::map weightMap, nvinfer1::ITensor& input, int c1, int c2, bool shortcut, float e, std::string lname) { nvinfer1::IElementWiseLayer* conv1 = convBnSiLU(network, weightMap, input, c2, 3, 1, 1, lname + ".cv1"); nvinfer1::IElementWiseLayer* conv2 = convBnSiLU(network, weightMap, *conv1->getOutput(0), c2, 3, 1, 1, lname + ".cv2"); if (shortcut && c1 == c2) { nvinfer1::IElementWiseLayer* ew = network->addElementWise(input, *conv2->getOutput(0), nvinfer1::ElementWiseOperation::kSUM); return ew; } return conv2; } static nvinfer1::ILayer* bottleneck_c3(nvinfer1::INetworkDefinition* network, std::map& weightMap, nvinfer1::ITensor& input, int c1, int c2, bool shortcut, float e, std::string lname) { nvinfer1::IElementWiseLayer* cv1 = convBnSiLU(network, weightMap, input, (int)((float)c2 * e), 1, 1, calculateP(1), lname + ".cv1"); nvinfer1::IElementWiseLayer* cv2 = convBnSiLU(network, weightMap, *cv1->getOutput(0), c2, 3, 1, calculateP(3), lname + ".cv2"); if (shortcut && c1 == c2) { auto ew = network->addElementWise(input, *cv2->getOutput(0), nvinfer1::ElementWiseOperation::kSUM); return ew; } return cv2; } nvinfer1::IElementWiseLayer* C2F(nvinfer1::INetworkDefinition* network, std::map 
weightMap, nvinfer1::ITensor& input, int c1, int c2, int n, bool shortcut, float e, std::string lname) {
    int c_ = (float)c2 * e;

    nvinfer1::IElementWiseLayer* conv1 = convBnSiLU(network, weightMap, input, 2 * c_, 1, 1, 0, lname + ".cv1");
    nvinfer1::Dims d = conv1->getOutput(0)->getDimensions();

    nvinfer1::ISliceLayer* split1 =
            network->addSlice(*conv1->getOutput(0), nvinfer1::Dims3{0, 0, 0},
                              nvinfer1::Dims3{d.d[0] / 2, d.d[1], d.d[2]}, nvinfer1::Dims3{1, 1, 1});
    nvinfer1::ISliceLayer* split2 =
            network->addSlice(*conv1->getOutput(0), nvinfer1::Dims3{d.d[0] / 2, 0, 0},
                              nvinfer1::Dims3{d.d[0] / 2, d.d[1], d.d[2]}, nvinfer1::Dims3{1, 1, 1});
    nvinfer1::ITensor* inputTensor0[] = {split1->getOutput(0), split2->getOutput(0)};
    nvinfer1::IConcatenationLayer* cat = network->addConcatenation(inputTensor0, 2);
    nvinfer1::ITensor* y1 = split2->getOutput(0);
    for (int i = 0; i < n; i++) {
        auto* b = bottleneck(network, weightMap, *y1, c_, c_, shortcut, 1.0, lname + ".m." + std::to_string(i));
        y1 = b->getOutput(0);

        nvinfer1::ITensor* inputTensors[] = {cat->getOutput(0), b->getOutput(0)};
        cat = network->addConcatenation(inputTensors, 2);
    }

    nvinfer1::IElementWiseLayer* conv2 =
            convBnSiLU(network, weightMap, *cat->getOutput(0), c2, 1, 1, 0, lname + ".cv2");

    return conv2;
}

// C2 block: a 1x1 convBnSiLU (`.cv1`) producing 2 * (int)(c2 * e) channels, split in half along
// dimension 0. The first half runs through `n` chained bottlenecks (`.m.<i>`); the result is
// concatenated with the untouched second half and passed to a 1x1 convBnSiLU (`.cv2`) with c2
// output channels. `c1` is accepted for signature compatibility and is not used.
nvinfer1::IElementWiseLayer* C2(nvinfer1::INetworkDefinition* network,
                                std::map<std::string, nvinfer1::Weights>& weightMap, nvinfer1::ITensor& input, int c1,
                                int c2, int n, bool shortcut, float e, std::string lname) {
    assert(network != nullptr);
    int hidden_channels = static_cast<int>(c2 * e);

    // cv1 branch
    nvinfer1::IElementWiseLayer* conv1 =
            convBnSiLU(network, weightMap, input, 2 * hidden_channels, 1, 1, 0, lname + ".cv1");
    nvinfer1::ITensor* cv1_out = conv1->getOutput(0);

    // Split the output of cv1 into two halves along dimension 0.
    nvinfer1::Dims dims = cv1_out->getDimensions();
    const nvinfer1::Dims3 half_size{dims.d[0] / 2, dims.d[1], dims.d[2]};
    const nvinfer1::Dims3 unit_stride{1, 1, 1};
    nvinfer1::ISliceLayer* split1 = network->addSlice(*cv1_out, nvinfer1::Dims3{0, 0, 0}, half_size, unit_stride);
    nvinfer1::ISliceLayer* split2 =
            network->addSlice(*cv1_out, nvinfer1::Dims3{dims.d[0] / 2, 0, 0}, half_size, unit_stride);

    // Chain the bottlenecks on the first half; y1 always holds the latest output.
    nvinfer1::ITensor* y1 = split1->getOutput(0);
    for (int i = 0; i < n; ++i) {
        auto* bottleneck_layer = bottleneck(network, weightMap, *y1, hidden_channels, hidden_channels, shortcut, 1.0,
                                            lname + ".m." + std::to_string(i));
        y1 = bottleneck_layer->getOutput(0);
    }

    // Concatenate the processed half with the second split of cv1.
    nvinfer1::ITensor* concatInputs[2] = {y1, split2->getOutput(0)};
    nvinfer1::IConcatenationLayer* cat = network->addConcatenation(concatInputs, 2);

    // cv2 produces the final output.
    nvinfer1::IElementWiseLayer* conv2 =
            convBnSiLU(network, weightMap, *cat->getOutput(0), c2, 1, 1, 0, lname + ".cv2");
    return conv2;
}

// C3 block: two parallel 1x1 convBnSiLU branches (`.cv1`, `.cv2`) with (int)(c2 * e) channels.
// The `.cv1` branch runs through `n` chained bottleneck_c3 blocks (`.m.<i>`), is concatenated
// with the `.cv2` branch and fused by a 1x1 convBnSiLU (`.cv3`) with c2 output channels.
nvinfer1::IElementWiseLayer* C3(nvinfer1::INetworkDefinition* network, std::map<std::string, nvinfer1::Weights> weightMap,
                                nvinfer1::ITensor& input, int c1, int c2, int n, bool shortcut, float e,
                                std::string lname) {
    int c_ = (float)c2 * e;

    nvinfer1::IElementWiseLayer* cv1 = convBnSiLU(network, weightMap, input, c_, 1, 1, calculateP(1), lname + ".cv1");
    nvinfer1::IElementWiseLayer* cv2 = convBnSiLU(network, weightMap, input, c_, 1, 1, calculateP(1), lname + ".cv2");

    nvinfer1::ITensor* y1 = cv1->getOutput(0);
    for (int i = 0; i < n; i++) {
        auto b = bottleneck_c3(network, weightMap, *y1, c_, c_, shortcut, 1.0, lname + ".m." + std::to_string(i));
        y1 = b->getOutput(0);
    }

    nvinfer1::ITensor* inputTensors[] = {y1, cv2->getOutput(0)};
    nvinfer1::IConcatenationLayer* cat = network->addConcatenation(inputTensors, 2);

    nvinfer1::IElementWiseLayer* conv3 =
            convBnSiLU(network, weightMap, *cat->getOutput(0), c2, 1, 1, calculateP(1), lname + ".cv3");
    return conv3;
}

// SPPF block: a 1x1 convBnSiLU (`.cv1`) to c1 / 2 channels, then three chained k x k max-pools
// (stride 1, padding k / 2). The conv output and the three pool outputs are concatenated and
// fused by a 1x1 convBnSiLU (`.cv2`) with c2 output channels.
nvinfer1::IElementWiseLayer* SPPF(nvinfer1::INetworkDefinition* network,
                                  std::map<std::string, nvinfer1::Weights> weightMap, nvinfer1::ITensor& input, int c1,
                                  int c2, int k, std::string lname) {
    int c_ = c1 / 2;
    nvinfer1::IElementWiseLayer* conv1 = convBnSiLU(network, weightMap, input, c_, 1, 1, 0, lname + ".cv1");

    nvinfer1::IPoolingLayer* pool1 =
            network->addPoolingNd(*conv1->getOutput(0), nvinfer1::PoolingType::kMAX, nvinfer1::DimsHW{k, k});
    pool1->setStrideNd(nvinfer1::DimsHW{1, 1});
    pool1->setPaddingNd(nvinfer1::DimsHW{k / 2, k / 2});

    nvinfer1::IPoolingLayer* pool2 =
            network->addPoolingNd(*pool1->getOutput(0), nvinfer1::PoolingType::kMAX, nvinfer1::DimsHW{k, k});
    pool2->setStrideNd(nvinfer1::DimsHW{1, 1});
    pool2->setPaddingNd(nvinfer1::DimsHW{k / 2, k / 2});

    nvinfer1::IPoolingLayer* pool3 =
            network->addPoolingNd(*pool2->getOutput(0), nvinfer1::PoolingType::kMAX, nvinfer1::DimsHW{k, k});
    pool3->setStrideNd(nvinfer1::DimsHW{1, 1});
    pool3->setPaddingNd(nvinfer1::DimsHW{k / 2, k / 2});

    nvinfer1::ITensor* inputTensors[] = {conv1->getOutput(0), pool1->getOutput(0), pool2->getOutput(0),
                                         pool3->getOutput(0)};
    nvinfer1::IConcatenationLayer* cat = network->addConcatenation(inputTensors, 4);

    nvinfer1::IElementWiseLayer* conv2 =
            convBnSiLU(network, weightMap, *cat->getOutput(0), c2, 1, 1, 0, lname + ".cv2");
    return conv2;
}

// DFL stage: reshapes `input` to {4, 16, grid}, swaps the first two dimensions, applies a
// soft-max, runs a bias-free 1x1 convolution with one output channel using weightMap[lname]
// (stride s, padding p) and reshapes the result to {4, grid}.
// `ch` and `k` are accepted for signature compatibility and are not used.
nvinfer1::IShuffleLayer* DFL(nvinfer1::INetworkDefinition* network, std::map<std::string, nvinfer1::Weights> weightMap,
                             nvinfer1::ITensor& input, int ch, int grid, int k, int s, int p, std::string lname) {
    nvinfer1::IShuffleLayer* shuffle1 = network->addShuffle(input);
    shuffle1->setReshapeDimensions(nvinfer1::Dims3{4, 16, grid});
    shuffle1->setSecondTranspose(nvinfer1::Permutation{1, 0, 2});

    nvinfer1::ISoftMaxLayer* softmax = network->addSoftMax(*shuffle1->getOutput(0));

    nvinfer1::Weights bias_empty{nvinfer1::DataType::kFLOAT, nullptr, 0};
    nvinfer1::IConvolutionLayer* conv =
            network->addConvolutionNd(*softmax->getOutput(0), 1, nvinfer1::DimsHW{1, 1}, weightMap[lname], bias_empty);
    conv->setStrideNd(nvinfer1::DimsHW{s, s});
    conv->setPaddingNd(nvinfer1::DimsHW{p, p});

    nvinfer1::IShuffleLayer* shuffle2 = network->addShuffle(*conv->getOutput(0));
    shuffle2->setReshapeDimensions(nvinfer1::Dims2{4, grid});
    return shuffle2;
}

nvinfer1::IPluginV2Layer* addYoLoLayer(nvinfer1::INetworkDefinition* network,
                                       std::vector<nvinfer1::IConcatenationLayer*> dets, const int* px_arry,
                                       int px_arry_num, int num_class, bool is_segmentation, bool is_pose,
                                       bool is_obb) {
    auto creator = getPluginRegistry()->getPluginCreator("YoloLayer_TRT", "1");
    const int netinfo_count = 9;  // Number of fixed netinfo entries written below (indices 0..8).
    const int total_count =
            netinfo_count + px_arry_num;  // Total number of elements for netinfo and px_arry combined.

    std::vector<int> combinedInfo(total_count);
    // Fill in the netinfo_count fixed entries.
    combinedInfo[0] = num_class;
    combinedInfo[1] = kNumberOfPoints;
    combinedInfo[2] = kConfThreshKeypoints;
    combinedInfo[3] = kInputW;
    combinedInfo[4] = kInputH;
    combinedInfo[5] = kMaxNumOutputBbox;
    combinedInfo[6] = is_segmentation;
    combinedInfo[7] = is_pose;
    combinedInfo[8] = is_obb;

    // Copy the contents of px_arry into the combinedInfo vector after the fixed
    // netinfo entries.
    std::copy(px_arry, px_arry + px_arry_num, combinedInfo.begin() + netinfo_count);

    // Now let's create the PluginField object to hold this combined information.
    nvinfer1::PluginField pluginField;
    pluginField.name = "combinedInfo";  // This can be any name that the plugin will recognize
    pluginField.data = combinedInfo.data();
    pluginField.type = nvinfer1::PluginFieldType::kINT32;
    pluginField.length = combinedInfo.size();

    // Create the PluginFieldCollection to hold the PluginField object.
nvinfer1::PluginFieldCollection pluginFieldCollection; pluginFieldCollection.nbFields = 1; // We have just one field, but it's a combined array pluginFieldCollection.fields = &pluginField; // Create the plugin object using the PluginFieldCollection. nvinfer1::IPluginV2* pluginObject = creator->createPlugin("yololayer", &pluginFieldCollection); // We assume that the plugin is to be added onto the network. // Prepare input tensors for the YOLO Layer. std::vector inputTensors; for (auto det : dets) { inputTensors.push_back(det->getOutput(0)); // Assuming each IConcatenationLayer has one output tensor. } // Add the plugin to the network using the prepared input tensors. nvinfer1::IPluginV2Layer* yoloLayer = network->addPluginV2(inputTensors.data(), inputTensors.size(), *pluginObject); return yoloLayer; // Return the added YOLO layer. } ================================================ FILE: yolov8/src/calibrator.cpp ================================================ #include "calibrator.h" #include #include #include #include #include "cuda_utils.h" #include "utils.h" Int8EntropyCalibrator2::Int8EntropyCalibrator2(int batchsize, int input_w, int input_h, const char* img_dir, const char* calib_table_name, const char* input_blob_name, bool read_cache) : batchsize_(batchsize), input_w_(input_w), input_h_(input_h), img_idx_(0), img_dir_(img_dir), calib_table_name_(calib_table_name), input_blob_name_(input_blob_name), read_cache_(read_cache) { input_count_ = 3 * input_w * input_h * batchsize; CUDA_CHECK(cudaMalloc(&device_input_, input_count_ * sizeof(float))); read_files_in_dir(img_dir, img_files_); } Int8EntropyCalibrator2::~Int8EntropyCalibrator2() { CUDA_CHECK(cudaFree(device_input_)); } int Int8EntropyCalibrator2::getBatchSize() const TRT_NOEXCEPT { return batchsize_; } bool Int8EntropyCalibrator2::getBatch(void* bindings[], const char* names[], int nbBindings) TRT_NOEXCEPT { if (img_idx_ + batchsize_ > (int)img_files_.size()) { return false; } std::vector input_imgs_; for 
(int i = img_idx_; i < img_idx_ + batchsize_; i++) { std::cout << img_files_[i] << " " << i << std::endl; cv::Mat temp = cv::imread(img_dir_ + img_files_[i]); if (temp.empty()) { std::cerr << "Fatal error: image cannot open!" << std::endl; return false; } cv::Mat pr_img = preprocess_img(temp, input_w_, input_h_); input_imgs_.push_back(pr_img); } img_idx_ += batchsize_; cv::Mat blob = cv::dnn::blobFromImages(input_imgs_, 1.0 / 255.0, cv::Size(input_w_, input_h_), cv::Scalar(0, 0, 0), true, false); CUDA_CHECK(cudaMemcpy(device_input_, blob.ptr(0), input_count_ * sizeof(float), cudaMemcpyHostToDevice)); assert(!strcmp(names[0], input_blob_name_)); bindings[0] = device_input_; return true; } const void* Int8EntropyCalibrator2::readCalibrationCache(size_t& length) TRT_NOEXCEPT { std::cout << "reading calib cache: " << calib_table_name_ << std::endl; calib_cache_.clear(); std::ifstream input(calib_table_name_, std::ios::binary); input >> std::noskipws; if (read_cache_ && input.good()) { std::copy(std::istream_iterator(input), std::istream_iterator(), std::back_inserter(calib_cache_)); } length = calib_cache_.size(); return length ? calib_cache_.data() : nullptr; } void Int8EntropyCalibrator2::writeCalibrationCache(const void* cache, size_t length) TRT_NOEXCEPT { std::cout << "writing calib cache: " << calib_table_name_ << " size: " << length << std::endl; std::ofstream output(calib_table_name_, std::ios::binary); output.write(reinterpret_cast(cache), length); } ================================================ FILE: yolov8/src/model.cpp ================================================ #include #include #include "block.h" #include "calibrator.h" #include "config.h" #include "model.h" static int get_width_5u(int x, float gw, int divisor = 8) { return int(ceil((x * gw) / divisor)) * divisor; } static int get_width(int x, float gw, int max_channels, int divisor = 8) { auto channel = int(ceil((x * gw) / divisor)) * divisor; return channel >= max_channels ? 
max_channels : channel; } static int get_depth(int x, float gd) { if (x == 1) return 1; int r = round(x * gd); if (x * gd - int(x * gd) == 0.5 && (int(x * gd) % 2) == 0) --r; return std::max(r, 1); } void calculateStrides(nvinfer1::IElementWiseLayer* conv_layers[], int size, int reference_size, int strides[]) { for (int i = 0; i < size; ++i) { nvinfer1::ILayer* layer = conv_layers[i]; nvinfer1::Dims dims = layer->getOutput(0)->getDimensions(); int feature_map_size = dims.d[1]; strides[i] = reference_size / feature_map_size; } } static nvinfer1::IElementWiseLayer* Proto(nvinfer1::INetworkDefinition* network, std::map& weightMap, nvinfer1::ITensor& input, std::string lname, float gw, int max_channels) { int mid_channel = get_width(256, gw, max_channels); auto cv1 = convBnSiLU(network, weightMap, input, mid_channel, 3, 1, 1, "model.22.proto.cv1"); float* convTranpsose_bais = (float*)weightMap["model.22.proto.upsample.bias"].values; int convTranpsose_bais_len = weightMap["model.22.proto.upsample.bias"].count; nvinfer1::Weights bias{nvinfer1::DataType::kFLOAT, convTranpsose_bais, convTranpsose_bais_len}; auto convTranpsose = network->addDeconvolutionNd(*cv1->getOutput(0), mid_channel, nvinfer1::DimsHW{2, 2}, weightMap["model.22.proto.upsample.weight"], bias); assert(convTranpsose); convTranpsose->setStrideNd(nvinfer1::DimsHW{2, 2}); auto cv2 = convBnSiLU(network, weightMap, *convTranpsose->getOutput(0), mid_channel, 3, 1, 1, "model.22.proto.cv2"); auto cv3 = convBnSiLU(network, weightMap, *cv2->getOutput(0), 32, 1, 1, 0, "model.22.proto.cv3"); assert(cv3); return cv3; } static nvinfer1::IShuffleLayer* cv4_conv_combined(nvinfer1::INetworkDefinition* network, std::map& weightMap, nvinfer1::ITensor& input, std::string lname, int grid_shape, float gw, std::string algo_type) { int mid_channle = 0; int output_channel = 0; if (algo_type == "seg") { if (gw == 0.25 || gw == 0.5) { mid_channle = 32; } else if (gw == 0.75) { mid_channle = 48; } else if (gw == 1.00) { mid_channle = 
64; } else if (gw == 1.25) { mid_channle = 80; } output_channel = 32; } else if (algo_type == "pose") { std::string bn_weight_key = lname + ".0.bn.weight"; mid_channle = weightMap[bn_weight_key].count; output_channel = kNumberOfPoints * 3; } else if (algo_type == "obb") { std::string bn_weight_key = lname + ".0.bn.weight"; mid_channle = weightMap[bn_weight_key].count; output_channel = 1; } auto cv0 = convBnSiLU(network, weightMap, input, mid_channle, 3, 1, 1, lname + ".0"); auto cv1 = convBnSiLU(network, weightMap, *cv0->getOutput(0), mid_channle, 3, 1, 1, lname + ".1"); float* cv2_bais_value = (float*)weightMap[lname + ".2" + ".bias"].values; int cv2_bais_len = weightMap[lname + ".2" + ".bias"].count; nvinfer1::Weights cv2_bais{nvinfer1::DataType::kFLOAT, cv2_bais_value, cv2_bais_len}; auto cv2 = network->addConvolutionNd(*cv1->getOutput(0), output_channel, nvinfer1::DimsHW{1, 1}, weightMap[lname + ".2" + ".weight"], cv2_bais); cv2->setStrideNd(nvinfer1::DimsHW{1, 1}); nvinfer1::IShuffleLayer* cv2_shuffle = network->addShuffle(*cv2->getOutput(0)); cv2_shuffle->setReshapeDimensions(nvinfer1::Dims2{output_channel, grid_shape}); return cv2_shuffle; } nvinfer1::IHostMemory* buildEngineYolov8Det(nvinfer1::IBuilder* builder, nvinfer1::IBuilderConfig* config, nvinfer1::DataType dt, const std::string& wts_path, float& gd, float& gw, int& max_channels) { std::map weightMap = loadWeights(wts_path); nvinfer1::INetworkDefinition* network = builder->createNetworkV2(0U); /******************************************************************************************************* ****************************************** YOLOV8 INPUT *********************************************** *******************************************************************************************************/ nvinfer1::ITensor* data = network->addInput(kInputTensorName, dt, nvinfer1::Dims3{3, kInputH, kInputW}); assert(data); 
/******************************************************************************************************* ***************************************** YOLOV8 BACKBONE ********************************************* *******************************************************************************************************/ nvinfer1::IElementWiseLayer* conv0 = convBnSiLU(network, weightMap, *data, get_width(64, gw, max_channels), 3, 2, 1, "model.0"); nvinfer1::IElementWiseLayer* conv1 = convBnSiLU(network, weightMap, *conv0->getOutput(0), get_width(128, gw, max_channels), 3, 2, 1, "model.1"); // 11233 nvinfer1::IElementWiseLayer* conv2 = C2F(network, weightMap, *conv1->getOutput(0), get_width(128, gw, max_channels), get_width(128, gw, max_channels), get_depth(3, gd), true, 0.5, "model.2"); nvinfer1::IElementWiseLayer* conv3 = convBnSiLU(network, weightMap, *conv2->getOutput(0), get_width(256, gw, max_channels), 3, 2, 1, "model.3"); // 22466 nvinfer1::IElementWiseLayer* conv4 = C2F(network, weightMap, *conv3->getOutput(0), get_width(256, gw, max_channels), get_width(256, gw, max_channels), get_depth(6, gd), true, 0.5, "model.4"); nvinfer1::IElementWiseLayer* conv5 = convBnSiLU(network, weightMap, *conv4->getOutput(0), get_width(512, gw, max_channels), 3, 2, 1, "model.5"); // 22466 nvinfer1::IElementWiseLayer* conv6 = C2F(network, weightMap, *conv5->getOutput(0), get_width(512, gw, max_channels), get_width(512, gw, max_channels), get_depth(6, gd), true, 0.5, "model.6"); nvinfer1::IElementWiseLayer* conv7 = convBnSiLU(network, weightMap, *conv6->getOutput(0), get_width(1024, gw, max_channels), 3, 2, 1, "model.7"); // 11233 nvinfer1::IElementWiseLayer* conv8 = C2F(network, weightMap, *conv7->getOutput(0), get_width(1024, gw, max_channels), get_width(1024, gw, max_channels), get_depth(3, gd), true, 0.5, "model.8"); nvinfer1::IElementWiseLayer* conv9 = SPPF(network, weightMap, *conv8->getOutput(0), get_width(1024, gw, max_channels), get_width(1024, gw, max_channels), 5, "model.9"); 
/******************************************************************************************************* ********************************************* YOLOV8 HEAD ********************************************* *******************************************************************************************************/ float scale[] = {1.0, 2.0, 2.0}; nvinfer1::IResizeLayer* upsample10 = network->addResize(*conv9->getOutput(0)); assert(upsample10); upsample10->setResizeMode(nvinfer1::ResizeMode::kNEAREST); upsample10->setScales(scale, 3); nvinfer1::ITensor* inputTensor11[] = {upsample10->getOutput(0), conv6->getOutput(0)}; nvinfer1::IConcatenationLayer* cat11 = network->addConcatenation(inputTensor11, 2); nvinfer1::IElementWiseLayer* conv12 = C2F(network, weightMap, *cat11->getOutput(0), get_width(512, gw, max_channels), get_width(512, gw, max_channels), get_depth(3, gd), false, 0.5, "model.12"); nvinfer1::IResizeLayer* upsample13 = network->addResize(*conv12->getOutput(0)); assert(upsample13); upsample13->setResizeMode(nvinfer1::ResizeMode::kNEAREST); upsample13->setScales(scale, 3); nvinfer1::ITensor* inputTensor14[] = {upsample13->getOutput(0), conv4->getOutput(0)}; nvinfer1::IConcatenationLayer* cat14 = network->addConcatenation(inputTensor14, 2); nvinfer1::IElementWiseLayer* conv15 = C2F(network, weightMap, *cat14->getOutput(0), get_width(256, gw, max_channels), get_width(256, gw, max_channels), get_depth(3, gd), false, 0.5, "model.15"); nvinfer1::IElementWiseLayer* conv16 = convBnSiLU(network, weightMap, *conv15->getOutput(0), get_width(256, gw, max_channels), 3, 2, 1, "model.16"); nvinfer1::ITensor* inputTensor17[] = {conv16->getOutput(0), conv12->getOutput(0)}; nvinfer1::IConcatenationLayer* cat17 = network->addConcatenation(inputTensor17, 2); nvinfer1::IElementWiseLayer* conv18 = C2F(network, weightMap, *cat17->getOutput(0), get_width(512, gw, max_channels), get_width(512, gw, max_channels), get_depth(3, gd), false, 0.5, "model.18"); nvinfer1::IElementWiseLayer* 
conv19 = convBnSiLU(network, weightMap, *conv18->getOutput(0), get_width(512, gw, max_channels), 3, 2, 1, "model.19"); nvinfer1::ITensor* inputTensor20[] = {conv19->getOutput(0), conv9->getOutput(0)}; nvinfer1::IConcatenationLayer* cat20 = network->addConcatenation(inputTensor20, 2); nvinfer1::IElementWiseLayer* conv21 = C2F(network, weightMap, *cat20->getOutput(0), get_width(1024, gw, max_channels), get_width(1024, gw, max_channels), get_depth(3, gd), false, 0.5, "model.21"); /******************************************************************************************************* ********************************************* YOLOV8 OUTPUT ******************************************* *******************************************************************************************************/ int base_in_channel = (gw == 1.25) ? 80 : 64; int base_out_channel = (gw == 0.25) ? std::max(64, std::min(kNumClass, 100)) : get_width(256, gw, max_channels); // output0 nvinfer1::IElementWiseLayer* conv22_cv2_0_0 = convBnSiLU(network, weightMap, *conv15->getOutput(0), base_in_channel, 3, 1, 1, "model.22.cv2.0.0"); nvinfer1::IElementWiseLayer* conv22_cv2_0_1 = convBnSiLU(network, weightMap, *conv22_cv2_0_0->getOutput(0), base_in_channel, 3, 1, 1, "model.22.cv2.0.1"); nvinfer1::IConvolutionLayer* conv22_cv2_0_2 = network->addConvolutionNd(*conv22_cv2_0_1->getOutput(0), 64, nvinfer1::DimsHW{1, 1}, weightMap["model.22.cv2.0.2.weight"], weightMap["model.22.cv2.0.2.bias"]); conv22_cv2_0_2->setStrideNd(nvinfer1::DimsHW{1, 1}); conv22_cv2_0_2->setPaddingNd(nvinfer1::DimsHW{0, 0}); nvinfer1::IElementWiseLayer* conv22_cv3_0_0 = convBnSiLU(network, weightMap, *conv15->getOutput(0), base_out_channel, 3, 1, 1, "model.22.cv3.0.0"); nvinfer1::IElementWiseLayer* conv22_cv3_0_1 = convBnSiLU(network, weightMap, *conv22_cv3_0_0->getOutput(0), base_out_channel, 3, 1, 1, "model.22.cv3.0.1"); nvinfer1::IConvolutionLayer* conv22_cv3_0_2 = network->addConvolutionNd(*conv22_cv3_0_1->getOutput(0), kNumClass, 
nvinfer1::DimsHW{1, 1}, weightMap["model.22.cv3.0.2.weight"], weightMap["model.22.cv3.0.2.bias"]); conv22_cv3_0_2->setStride(nvinfer1::DimsHW{1, 1}); conv22_cv3_0_2->setPadding(nvinfer1::DimsHW{0, 0}); nvinfer1::ITensor* inputTensor22_0[] = {conv22_cv2_0_2->getOutput(0), conv22_cv3_0_2->getOutput(0)}; nvinfer1::IConcatenationLayer* cat22_0 = network->addConcatenation(inputTensor22_0, 2); // output1 nvinfer1::IElementWiseLayer* conv22_cv2_1_0 = convBnSiLU(network, weightMap, *conv18->getOutput(0), base_in_channel, 3, 1, 1, "model.22.cv2.1.0"); nvinfer1::IElementWiseLayer* conv22_cv2_1_1 = convBnSiLU(network, weightMap, *conv22_cv2_1_0->getOutput(0), base_in_channel, 3, 1, 1, "model.22.cv2.1.1"); nvinfer1::IConvolutionLayer* conv22_cv2_1_2 = network->addConvolutionNd(*conv22_cv2_1_1->getOutput(0), 64, nvinfer1::DimsHW{1, 1}, weightMap["model.22.cv2.1.2.weight"], weightMap["model.22.cv2.1.2.bias"]); conv22_cv2_1_2->setStrideNd(nvinfer1::DimsHW{1, 1}); conv22_cv2_1_2->setPaddingNd(nvinfer1::DimsHW{0, 0}); nvinfer1::IElementWiseLayer* conv22_cv3_1_0 = convBnSiLU(network, weightMap, *conv18->getOutput(0), base_out_channel, 3, 1, 1, "model.22.cv3.1.0"); nvinfer1::IElementWiseLayer* conv22_cv3_1_1 = convBnSiLU(network, weightMap, *conv22_cv3_1_0->getOutput(0), base_out_channel, 3, 1, 1, "model.22.cv3.1.1"); nvinfer1::IConvolutionLayer* conv22_cv3_1_2 = network->addConvolutionNd(*conv22_cv3_1_1->getOutput(0), kNumClass, nvinfer1::DimsHW{1, 1}, weightMap["model.22.cv3.1.2.weight"], weightMap["model.22.cv3.1.2.bias"]); conv22_cv3_1_2->setStrideNd(nvinfer1::DimsHW{1, 1}); conv22_cv3_1_2->setPaddingNd(nvinfer1::DimsHW{0, 0}); nvinfer1::ITensor* inputTensor22_1[] = {conv22_cv2_1_2->getOutput(0), conv22_cv3_1_2->getOutput(0)}; nvinfer1::IConcatenationLayer* cat22_1 = network->addConcatenation(inputTensor22_1, 2); // output2 nvinfer1::IElementWiseLayer* conv22_cv2_2_0 = convBnSiLU(network, weightMap, *conv21->getOutput(0), base_in_channel, 3, 1, 1, "model.22.cv2.2.0"); 
nvinfer1::IElementWiseLayer* conv22_cv2_2_1 = convBnSiLU(network, weightMap, *conv22_cv2_2_0->getOutput(0), base_in_channel, 3, 1, 1, "model.22.cv2.2.1"); nvinfer1::IConvolutionLayer* conv22_cv2_2_2 = network->addConvolution(*conv22_cv2_2_1->getOutput(0), 64, nvinfer1::DimsHW{1, 1}, weightMap["model.22.cv2.2.2.weight"], weightMap["model.22.cv2.2.2.bias"]); nvinfer1::IElementWiseLayer* conv22_cv3_2_0 = convBnSiLU(network, weightMap, *conv21->getOutput(0), base_out_channel, 3, 1, 1, "model.22.cv3.2.0"); nvinfer1::IElementWiseLayer* conv22_cv3_2_1 = convBnSiLU(network, weightMap, *conv22_cv3_2_0->getOutput(0), base_out_channel, 3, 1, 1, "model.22.cv3.2.1"); nvinfer1::IConvolutionLayer* conv22_cv3_2_2 = network->addConvolution(*conv22_cv3_2_1->getOutput(0), kNumClass, nvinfer1::DimsHW{1, 1}, weightMap["model.22.cv3.2.2.weight"], weightMap["model.22.cv3.2.2.bias"]); nvinfer1::ITensor* inputTensor22_2[] = {conv22_cv2_2_2->getOutput(0), conv22_cv3_2_2->getOutput(0)}; nvinfer1::IConcatenationLayer* cat22_2 = network->addConcatenation(inputTensor22_2, 2); /******************************************************************************************************* ********************************************* YOLOV8 DETECT ******************************************* *******************************************************************************************************/ nvinfer1::IElementWiseLayer* conv_layers[] = {conv3, conv5, conv7}; int strides[sizeof(conv_layers) / sizeof(conv_layers[0])]; calculateStrides(conv_layers, sizeof(conv_layers) / sizeof(conv_layers[0]), kInputH, strides); int stridesLength = sizeof(strides) / sizeof(int); nvinfer1::IShuffleLayer* shuffle22_0 = network->addShuffle(*cat22_0->getOutput(0)); shuffle22_0->setReshapeDimensions(nvinfer1::Dims2{64 + kNumClass, (kInputH / strides[0]) * (kInputW / strides[0])}); nvinfer1::ISliceLayer* split22_0_0 = network->addSlice( *shuffle22_0->getOutput(0), nvinfer1::Dims2{0, 0}, nvinfer1::Dims2{64, (kInputH / 
strides[0]) * (kInputW / strides[0])}, nvinfer1::Dims2{1, 1}); nvinfer1::ISliceLayer* split22_0_1 = network->addSlice( *shuffle22_0->getOutput(0), nvinfer1::Dims2{64, 0}, nvinfer1::Dims2{kNumClass, (kInputH / strides[0]) * (kInputW / strides[0])}, nvinfer1::Dims2{1, 1}); nvinfer1::IShuffleLayer* dfl22_0 = DFL(network, weightMap, *split22_0_0->getOutput(0), 4, (kInputH / strides[0]) * (kInputW / strides[0]), 1, 1, 0, "model.22.dfl.conv.weight"); nvinfer1::ITensor* inputTensor22_dfl_0[] = {dfl22_0->getOutput(0), split22_0_1->getOutput(0)}; nvinfer1::IConcatenationLayer* cat22_dfl_0 = network->addConcatenation(inputTensor22_dfl_0, 2); nvinfer1::IShuffleLayer* shuffle22_1 = network->addShuffle(*cat22_1->getOutput(0)); shuffle22_1->setReshapeDimensions(nvinfer1::Dims2{64 + kNumClass, (kInputH / strides[1]) * (kInputW / strides[1])}); nvinfer1::ISliceLayer* split22_1_0 = network->addSlice( *shuffle22_1->getOutput(0), nvinfer1::Dims2{0, 0}, nvinfer1::Dims2{64, (kInputH / strides[1]) * (kInputW / strides[1])}, nvinfer1::Dims2{1, 1}); nvinfer1::ISliceLayer* split22_1_1 = network->addSlice( *shuffle22_1->getOutput(0), nvinfer1::Dims2{64, 0}, nvinfer1::Dims2{kNumClass, (kInputH / strides[1]) * (kInputW / strides[1])}, nvinfer1::Dims2{1, 1}); nvinfer1::IShuffleLayer* dfl22_1 = DFL(network, weightMap, *split22_1_0->getOutput(0), 4, (kInputH / strides[1]) * (kInputW / strides[1]), 1, 1, 0, "model.22.dfl.conv.weight"); nvinfer1::ITensor* inputTensor22_dfl_1[] = {dfl22_1->getOutput(0), split22_1_1->getOutput(0)}; nvinfer1::IConcatenationLayer* cat22_dfl_1 = network->addConcatenation(inputTensor22_dfl_1, 2); nvinfer1::IShuffleLayer* shuffle22_2 = network->addShuffle(*cat22_2->getOutput(0)); shuffle22_2->setReshapeDimensions(nvinfer1::Dims2{64 + kNumClass, (kInputH / strides[2]) * (kInputW / strides[2])}); nvinfer1::ISliceLayer* split22_2_0 = network->addSlice( *shuffle22_2->getOutput(0), nvinfer1::Dims2{0, 0}, nvinfer1::Dims2{64, (kInputH / strides[2]) * (kInputW / strides[2])}, 
nvinfer1::Dims2{1, 1}); nvinfer1::ISliceLayer* split22_2_1 = network->addSlice( *shuffle22_2->getOutput(0), nvinfer1::Dims2{64, 0}, nvinfer1::Dims2{kNumClass, (kInputH / strides[2]) * (kInputW / strides[2])}, nvinfer1::Dims2{1, 1}); nvinfer1::IShuffleLayer* dfl22_2 = DFL(network, weightMap, *split22_2_0->getOutput(0), 4, (kInputH / strides[2]) * (kInputW / strides[2]), 1, 1, 0, "model.22.dfl.conv.weight"); nvinfer1::ITensor* inputTensor22_dfl_2[] = {dfl22_2->getOutput(0), split22_2_1->getOutput(0)}; nvinfer1::IConcatenationLayer* cat22_dfl_2 = network->addConcatenation(inputTensor22_dfl_2, 2); nvinfer1::IPluginV2Layer* yolo = addYoLoLayer(network, std::vector{cat22_dfl_0, cat22_dfl_1, cat22_dfl_2}, strides, stridesLength, kNumClass, false, false, false); yolo->getOutput(0)->setName(kOutputTensorName); network->markOutput(*yolo->getOutput(0)); builder->setMaxBatchSize(kBatchSize); config->setMaxWorkspaceSize(16 * (1 << 20)); #if defined(USE_FP16) config->setFlag(nvinfer1::BuilderFlag::kFP16); #elif defined(USE_INT8) std::cout << "Your platform support int8: " << (builder->platformHasFastInt8() ? "true" : "false") << std::endl; assert(builder->platformHasFastInt8()); config->setFlag(nvinfer1::BuilderFlag::kINT8); auto* calibrator = new Int8EntropyCalibrator2(1, kInputW, kInputH, kInputQuantizationFolder, "int8calib.table", kInputTensorName); config->setInt8Calibrator(calibrator); #endif std::cout << "Building engine, please wait for a while..." << std::endl; nvinfer1::IHostMemory* serialized_model = builder->buildSerializedNetwork(*network, *config); std::cout << "Build engine successfully!" 
<< std::endl; delete network; for (auto& mem : weightMap) { free((void*)(mem.second.values)); } return serialized_model; } nvinfer1::IHostMemory* buildEngineYolov8DetP6(nvinfer1::IBuilder* builder, nvinfer1::IBuilderConfig* config, nvinfer1::DataType dt, const std::string& wts_path, float& gd, float& gw, int& max_channels) { std::map weightMap = loadWeights(wts_path); nvinfer1::INetworkDefinition* network = builder->createNetworkV2(0U); /******************************************************************************************************* ****************************************** YOLOV8 INPUT *********************************************** *******************************************************************************************************/ nvinfer1::ITensor* data = network->addInput(kInputTensorName, dt, nvinfer1::Dims3{3, kInputH, kInputW}); assert(data); /******************************************************************************************************* ***************************************** YOLOV8 BACKBONE ********************************************* *******************************************************************************************************/ nvinfer1::IElementWiseLayer* conv0 = convBnSiLU(network, weightMap, *data, get_width(64, gw, max_channels), 3, 2, 1, "model.0"); nvinfer1::IElementWiseLayer* conv1 = convBnSiLU(network, weightMap, *conv0->getOutput(0), get_width(128, gw, max_channels), 3, 2, 1, "model.1"); // 11233 nvinfer1::IElementWiseLayer* conv2 = C2F(network, weightMap, *conv1->getOutput(0), get_width(128, gw, max_channels), get_width(128, gw, max_channels), get_depth(3, gd), true, 0.5, "model.2"); nvinfer1::IElementWiseLayer* conv3 = convBnSiLU(network, weightMap, *conv2->getOutput(0), get_width(256, gw, max_channels), 3, 2, 1, "model.3"); // 22466 nvinfer1::IElementWiseLayer* conv4 = C2F(network, weightMap, *conv3->getOutput(0), get_width(256, gw, max_channels), get_width(256, gw, max_channels), get_depth(6, gd), true, 0.5, 
"model.4"); nvinfer1::IElementWiseLayer* conv5 = convBnSiLU(network, weightMap, *conv4->getOutput(0), get_width(512, gw, max_channels), 3, 2, 1, "model.5"); // 22466 nvinfer1::IElementWiseLayer* conv6 = C2F(network, weightMap, *conv5->getOutput(0), get_width(512, gw, max_channels), get_width(512, gw, max_channels), get_depth(6, gd), true, 0.5, "model.6"); nvinfer1::IElementWiseLayer* conv7 = convBnSiLU(network, weightMap, *conv6->getOutput(0), get_width(768, gw, max_channels), 3, 2, 1, "model.7"); nvinfer1::IElementWiseLayer* conv8 = C2F(network, weightMap, *conv7->getOutput(0), get_width(768, gw, max_channels), get_width(768, gw, max_channels), get_depth(3, gd), true, 0.5, "model.8"); nvinfer1::IElementWiseLayer* conv9 = convBnSiLU(network, weightMap, *conv8->getOutput(0), get_width(1024, gw, max_channels), 3, 2, 1, "model.9"); nvinfer1::IElementWiseLayer* conv10 = C2F(network, weightMap, *conv9->getOutput(0), get_width(1024, gw, max_channels), get_width(1024, gw, max_channels), get_depth(3, gd), true, 0.5, "model.10"); nvinfer1::IElementWiseLayer* conv11 = SPPF(network, weightMap, *conv10->getOutput(0), get_width(1024, gw, max_channels), get_width(1024, gw, max_channels), 5, "model.11"); /******************************************************************************************************* ********************************************* YOLOV8 HEAD ********************************************* *******************************************************************************************************/ // Head float scale[] = {1.0, 2.0, 2.0}; // scale used for upsampling // P5 nvinfer1::IResizeLayer* upsample12 = network->addResize(*conv11->getOutput(0)); upsample12->setResizeMode(nvinfer1::ResizeMode::kNEAREST); upsample12->setScales(scale, 3); nvinfer1::ITensor* concat13_inputs[] = {upsample12->getOutput(0), conv8->getOutput(0)}; nvinfer1::IConcatenationLayer* concat13 = network->addConcatenation(concat13_inputs, 2); nvinfer1::IElementWiseLayer* conv14 = C2(network, 
weightMap, *concat13->getOutput(0), get_width(768, gw, max_channels), get_width(768, gw, max_channels), get_depth(3, gd), false, 0.5, "model.14"); // P4 nvinfer1::IResizeLayer* upsample15 = network->addResize(*conv14->getOutput(0)); upsample15->setResizeMode(nvinfer1::ResizeMode::kNEAREST); upsample15->setScales(scale, 3); nvinfer1::ITensor* concat16_inputs[] = {upsample15->getOutput(0), conv6->getOutput(0)}; nvinfer1::IConcatenationLayer* concat16 = network->addConcatenation(concat16_inputs, 2); nvinfer1::IElementWiseLayer* conv17 = C2(network, weightMap, *concat16->getOutput(0), get_width(512, gw, max_channels), get_width(512, gw, max_channels), get_depth(3, gd), false, 0.5, "model.17"); // P3 nvinfer1::IResizeLayer* upsample18 = network->addResize(*conv17->getOutput(0)); upsample18->setResizeMode(nvinfer1::ResizeMode::kNEAREST); upsample18->setScales(scale, 3); nvinfer1::ITensor* concat19_inputs[] = {upsample18->getOutput(0), conv4->getOutput(0)}; nvinfer1::IConcatenationLayer* concat19 = network->addConcatenation(concat19_inputs, 2); nvinfer1::IElementWiseLayer* conv20 = C2(network, weightMap, *concat19->getOutput(0), get_width(256, gw, max_channels), get_width(256, gw, max_channels), get_depth(3, gd), false, 0.5, "model.20"); // Additional layers for P4, P5, P6 // P4/16-medium nvinfer1::IElementWiseLayer* conv21 = convBnSiLU(network, weightMap, *conv20->getOutput(0), get_width(256, gw, max_channels), 3, 2, 1, "model.21"); nvinfer1::ITensor* concat22_inputs[] = {conv21->getOutput(0), conv17->getOutput(0)}; nvinfer1::IConcatenationLayer* concat22 = network->addConcatenation(concat22_inputs, 2); nvinfer1::IElementWiseLayer* conv23 = C2(network, weightMap, *concat22->getOutput(0), get_width(512, gw, max_channels), get_width(512, gw, max_channels), get_depth(3, gd), false, 0.5, "model.23"); // P5/32-large nvinfer1::IElementWiseLayer* conv24 = convBnSiLU(network, weightMap, *conv23->getOutput(0), get_width(512, gw, max_channels), 3, 2, 1, "model.24"); 
nvinfer1::ITensor* concat25_inputs[] = {conv24->getOutput(0), conv14->getOutput(0)}; nvinfer1::IConcatenationLayer* concat25 = network->addConcatenation(concat25_inputs, 2); nvinfer1::IElementWiseLayer* conv26 = C2(network, weightMap, *concat25->getOutput(0), get_width(768, gw, max_channels), get_width(768, gw, max_channels), get_depth(3, gd), false, 0.5, "model.26"); // P6/64-xlarge nvinfer1::IElementWiseLayer* conv27 = convBnSiLU(network, weightMap, *conv26->getOutput(0), get_width(768, gw, max_channels), 3, 2, 1, "model.27"); nvinfer1::ITensor* concat28_inputs[] = {conv27->getOutput(0), conv11->getOutput(0)}; nvinfer1::IConcatenationLayer* concat28 = network->addConcatenation(concat28_inputs, 2); nvinfer1::IElementWiseLayer* conv29 = C2(network, weightMap, *concat28->getOutput(0), get_width(1024, gw, max_channels), get_width(1024, gw, max_channels), get_depth(3, gd), false, 0.5, "model.29"); /******************************************************************************************************* ********************************************* YOLOV8 OUTPUT ******************************************* *******************************************************************************************************/ int base_in_channel = (gw == 1.25) ? 80 : 64; int base_out_channel = (gw == 0.25) ? 
std::max(64, std::min(kNumClass, 100)) : get_width(256, gw, max_channels); // output0 nvinfer1::IElementWiseLayer* conv30_cv2_0_0 = convBnSiLU(network, weightMap, *conv20->getOutput(0), base_in_channel, 3, 1, 1, "model.30.cv2.0.0"); nvinfer1::IElementWiseLayer* conv30_cv2_0_1 = convBnSiLU(network, weightMap, *conv30_cv2_0_0->getOutput(0), base_in_channel, 3, 1, 1, "model.30.cv2.0.1"); nvinfer1::IConvolutionLayer* conv30_cv2_0_2 = network->addConvolutionNd(*conv30_cv2_0_1->getOutput(0), 64, nvinfer1::DimsHW{1, 1}, weightMap["model.30.cv2.0.2.weight"], weightMap["model.30.cv2.0.2.bias"]); conv30_cv2_0_2->setStrideNd(nvinfer1::DimsHW{1, 1}); conv30_cv2_0_2->setPaddingNd(nvinfer1::DimsHW{0, 0}); nvinfer1::IElementWiseLayer* conv30_cv3_0_0 = convBnSiLU(network, weightMap, *conv20->getOutput(0), base_out_channel, 3, 1, 1, "model.30.cv3.0.0"); nvinfer1::IElementWiseLayer* conv30_cv3_0_1 = convBnSiLU(network, weightMap, *conv30_cv3_0_0->getOutput(0), base_out_channel, 3, 1, 1, "model.30.cv3.0.1"); nvinfer1::IConvolutionLayer* conv30_cv3_0_2 = network->addConvolutionNd(*conv30_cv3_0_1->getOutput(0), kNumClass, nvinfer1::DimsHW{1, 1}, weightMap["model.30.cv3.0.2.weight"], weightMap["model.30.cv3.0.2.bias"]); conv30_cv3_0_2->setStride(nvinfer1::DimsHW{1, 1}); conv30_cv3_0_2->setPadding(nvinfer1::DimsHW{0, 0}); nvinfer1::ITensor* inputTensor30_0[] = {conv30_cv2_0_2->getOutput(0), conv30_cv3_0_2->getOutput(0)}; nvinfer1::IConcatenationLayer* cat30_0 = network->addConcatenation(inputTensor30_0, 2); // output1 nvinfer1::IElementWiseLayer* conv30_cv2_1_0 = convBnSiLU(network, weightMap, *conv23->getOutput(0), base_in_channel, 3, 1, 1, "model.30.cv2.1.0"); nvinfer1::IElementWiseLayer* conv30_cv2_1_1 = convBnSiLU(network, weightMap, *conv30_cv2_1_0->getOutput(0), base_in_channel, 3, 1, 1, "model.30.cv2.1.1"); nvinfer1::IConvolutionLayer* conv30_cv2_1_2 = network->addConvolutionNd(*conv30_cv2_1_1->getOutput(0), 64, nvinfer1::DimsHW{1, 1}, weightMap["model.30.cv2.1.2.weight"], 
weightMap["model.30.cv2.1.2.bias"]); conv30_cv2_1_2->setStrideNd(nvinfer1::DimsHW{1, 1}); conv30_cv2_1_2->setPaddingNd(nvinfer1::DimsHW{0, 0}); nvinfer1::IElementWiseLayer* conv30_cv3_1_0 = convBnSiLU(network, weightMap, *conv23->getOutput(0), base_out_channel, 3, 1, 1, "model.30.cv3.1.0"); nvinfer1::IElementWiseLayer* conv30_cv3_1_1 = convBnSiLU(network, weightMap, *conv30_cv3_1_0->getOutput(0), base_out_channel, 3, 1, 1, "model.30.cv3.1.1"); nvinfer1::IConvolutionLayer* conv30_cv3_1_2 = network->addConvolutionNd(*conv30_cv3_1_1->getOutput(0), kNumClass, nvinfer1::DimsHW{1, 1}, weightMap["model.30.cv3.1.2.weight"], weightMap["model.30.cv3.1.2.bias"]); conv30_cv3_1_2->setStrideNd(nvinfer1::DimsHW{1, 1}); conv30_cv3_1_2->setPaddingNd(nvinfer1::DimsHW{0, 0}); nvinfer1::ITensor* inputTensor30_1[] = {conv30_cv2_1_2->getOutput(0), conv30_cv3_1_2->getOutput(0)}; nvinfer1::IConcatenationLayer* cat30_1 = network->addConcatenation(inputTensor30_1, 2); // output2 nvinfer1::IElementWiseLayer* conv30_cv2_2_0 = convBnSiLU(network, weightMap, *conv26->getOutput(0), base_in_channel, 3, 1, 1, "model.30.cv2.2.0"); nvinfer1::IElementWiseLayer* conv30_cv2_2_1 = convBnSiLU(network, weightMap, *conv30_cv2_2_0->getOutput(0), base_in_channel, 3, 1, 1, "model.30.cv2.2.1"); nvinfer1::IConvolutionLayer* conv30_cv2_2_2 = network->addConvolution(*conv30_cv2_2_1->getOutput(0), 64, nvinfer1::DimsHW{1, 1}, weightMap["model.30.cv2.2.2.weight"], weightMap["model.30.cv2.2.2.bias"]); conv30_cv2_2_2->setStrideNd(nvinfer1::DimsHW{1, 1}); conv30_cv2_2_2->setPaddingNd(nvinfer1::DimsHW{0, 0}); nvinfer1::IElementWiseLayer* conv30_cv3_2_0 = convBnSiLU(network, weightMap, *conv26->getOutput(0), base_out_channel, 3, 1, 1, "model.30.cv3.2.0"); nvinfer1::IElementWiseLayer* conv30_cv3_2_1 = convBnSiLU(network, weightMap, *conv30_cv3_2_0->getOutput(0), base_out_channel, 3, 1, 1, "model.30.cv3.2.1"); nvinfer1::IConvolutionLayer* conv30_cv3_2_2 = network->addConvolution(*conv30_cv3_2_1->getOutput(0), kNumClass, 
nvinfer1::DimsHW{1, 1}, weightMap["model.30.cv3.2.2.weight"], weightMap["model.30.cv3.2.2.bias"]); conv30_cv3_2_2->setStrideNd(nvinfer1::DimsHW{1, 1}); conv30_cv3_2_2->setPaddingNd(nvinfer1::DimsHW{0, 0}); nvinfer1::ITensor* inputTensor30_2[] = {conv30_cv2_2_2->getOutput(0), conv30_cv3_2_2->getOutput(0)}; nvinfer1::IConcatenationLayer* cat30_2 = network->addConcatenation(inputTensor30_2, 2); // output3 nvinfer1::IElementWiseLayer* conv30_cv2_3_0 = convBnSiLU(network, weightMap, *conv29->getOutput(0), base_in_channel, 3, 1, 1, "model.30.cv2.3.0"); nvinfer1::IElementWiseLayer* conv30_cv2_3_1 = convBnSiLU(network, weightMap, *conv30_cv2_3_0->getOutput(0), base_in_channel, 3, 1, 1, "model.30.cv2.3.1"); nvinfer1::IConvolutionLayer* conv30_cv2_3_2 = network->addConvolution(*conv30_cv2_3_1->getOutput(0), 64, nvinfer1::DimsHW{1, 1}, weightMap["model.30.cv2.3.2.weight"], weightMap["model.30.cv2.3.2.bias"]); conv30_cv2_3_2->setStrideNd(nvinfer1::DimsHW{1, 1}); conv30_cv2_3_2->setPaddingNd(nvinfer1::DimsHW{0, 0}); nvinfer1::IElementWiseLayer* conv30_cv3_3_0 = convBnSiLU(network, weightMap, *conv29->getOutput(0), base_out_channel, 3, 1, 1, "model.30.cv3.3.0"); nvinfer1::IElementWiseLayer* conv30_cv3_3_1 = convBnSiLU(network, weightMap, *conv30_cv3_3_0->getOutput(0), base_out_channel, 3, 1, 1, "model.30.cv3.3.1"); nvinfer1::IConvolutionLayer* conv30_cv3_3_2 = network->addConvolution(*conv30_cv3_3_1->getOutput(0), kNumClass, nvinfer1::DimsHW{1, 1}, weightMap["model.30.cv3.3.2.weight"], weightMap["model.30.cv3.3.2.bias"]); conv30_cv3_3_2->setStrideNd(nvinfer1::DimsHW{1, 1}); conv30_cv3_3_2->setPaddingNd(nvinfer1::DimsHW{0, 0}); nvinfer1::ITensor* inputTensor30_3[] = {conv30_cv2_3_2->getOutput(0), conv30_cv3_3_2->getOutput(0)}; nvinfer1::IConcatenationLayer* cat30_3 = network->addConcatenation(inputTensor30_3, 2); /******************************************************************************************************* ********************************************* YOLOV8 DETECT 
******************************************* *******************************************************************************************************/ nvinfer1::IElementWiseLayer* conv_layers[] = {conv3, conv5, conv7, conv9}; int strides[sizeof(conv_layers) / sizeof(conv_layers[0])]; calculateStrides(conv_layers, sizeof(conv_layers) / sizeof(conv_layers[0]), kInputH, strides); int stridesLength = sizeof(strides) / sizeof(int); // P3 processing steps (remains unchanged) nvinfer1::IShuffleLayer* shuffle30_0 = network->addShuffle(*cat30_0->getOutput(0)); // Reusing the previous cat30_0 as P3 concatenation layer shuffle30_0->setReshapeDimensions(nvinfer1::Dims2{64 + kNumClass, (kInputH / strides[0]) * (kInputW / strides[0])}); nvinfer1::ISliceLayer* split30_0_0 = network->addSlice( *shuffle30_0->getOutput(0), nvinfer1::Dims2{0, 0}, nvinfer1::Dims2{64, (kInputH / strides[0]) * (kInputW / strides[0])}, nvinfer1::Dims2{1, 1}); nvinfer1::ISliceLayer* split30_0_1 = network->addSlice( *shuffle30_0->getOutput(0), nvinfer1::Dims2{64, 0}, nvinfer1::Dims2{kNumClass, (kInputH / strides[0]) * (kInputW / strides[0])}, nvinfer1::Dims2{1, 1}); nvinfer1::IShuffleLayer* dfl30_0 = DFL(network, weightMap, *split30_0_0->getOutput(0), 4, (kInputH / strides[0]) * (kInputW / strides[0]), 1, 1, 0, "model.30.dfl.conv.weight"); nvinfer1::ITensor* inputTensor30_dfl_0[] = {dfl30_0->getOutput(0), split30_0_1->getOutput(0)}; nvinfer1::IConcatenationLayer* cat30_dfl_0 = network->addConcatenation(inputTensor30_dfl_0, 2); // P4 processing steps (remains unchanged) nvinfer1::IShuffleLayer* shuffle30_1 = network->addShuffle(*cat30_1->getOutput(0)); // Reusing the previous cat30_1 as P4 concatenation layer shuffle30_1->setReshapeDimensions(nvinfer1::Dims2{64 + kNumClass, (kInputH / strides[1]) * (kInputW / strides[1])}); nvinfer1::ISliceLayer* split30_1_0 = network->addSlice( *shuffle30_1->getOutput(0), nvinfer1::Dims2{0, 0}, nvinfer1::Dims2{64, (kInputH / strides[1]) * (kInputW / strides[1])}, 
nvinfer1::Dims2{1, 1}); nvinfer1::ISliceLayer* split30_1_1 = network->addSlice( *shuffle30_1->getOutput(0), nvinfer1::Dims2{64, 0}, nvinfer1::Dims2{kNumClass, (kInputH / strides[1]) * (kInputW / strides[1])}, nvinfer1::Dims2{1, 1}); nvinfer1::IShuffleLayer* dfl30_1 = DFL(network, weightMap, *split30_1_0->getOutput(0), 4, (kInputH / strides[1]) * (kInputW / strides[1]), 1, 1, 0, "model.30.dfl.conv.weight"); nvinfer1::ITensor* inputTensor30_dfl_1[] = {dfl30_1->getOutput(0), split30_1_1->getOutput(0)}; nvinfer1::IConcatenationLayer* cat30_dfl_1 = network->addConcatenation(inputTensor30_dfl_1, 2); // P5 processing steps (remains unchanged) nvinfer1::IShuffleLayer* shuffle30_2 = network->addShuffle(*cat30_2->getOutput(0)); // Reusing the previous cat30_2 as P5 concatenation layer shuffle30_2->setReshapeDimensions(nvinfer1::Dims2{64 + kNumClass, (kInputH / strides[2]) * (kInputW / strides[2])}); nvinfer1::ISliceLayer* split30_2_0 = network->addSlice( *shuffle30_2->getOutput(0), nvinfer1::Dims2{0, 0}, nvinfer1::Dims2{64, (kInputH / strides[2]) * (kInputW / strides[2])}, nvinfer1::Dims2{1, 1}); nvinfer1::ISliceLayer* split30_2_1 = network->addSlice( *shuffle30_2->getOutput(0), nvinfer1::Dims2{64, 0}, nvinfer1::Dims2{kNumClass, (kInputH / strides[2]) * (kInputW / strides[2])}, nvinfer1::Dims2{1, 1}); nvinfer1::IShuffleLayer* dfl30_2 = DFL(network, weightMap, *split30_2_0->getOutput(0), 4, (kInputH / strides[2]) * (kInputW / strides[2]), 1, 1, 0, "model.30.dfl.conv.weight"); nvinfer1::ITensor* inputTensor30_dfl_2[] = {dfl30_2->getOutput(0), split30_2_1->getOutput(0)}; nvinfer1::IConcatenationLayer* cat30_dfl_2 = network->addConcatenation(inputTensor30_dfl_2, 2); // P6 processing steps nvinfer1::IShuffleLayer* shuffle30_3 = network->addShuffle(*cat30_3->getOutput(0)); shuffle30_3->setReshapeDimensions(nvinfer1::Dims2{64 + kNumClass, (kInputH / strides[3]) * (kInputW / strides[3])}); nvinfer1::ISliceLayer* split30_3_0 = network->addSlice( *shuffle30_3->getOutput(0), 
nvinfer1::Dims2{0, 0}, nvinfer1::Dims2{64, (kInputH / strides[3]) * (kInputW / strides[3])}, nvinfer1::Dims2{1, 1}); nvinfer1::ISliceLayer* split30_3_1 = network->addSlice( *shuffle30_3->getOutput(0), nvinfer1::Dims2{64, 0}, nvinfer1::Dims2{kNumClass, (kInputH / strides[3]) * (kInputW / strides[3])}, nvinfer1::Dims2{1, 1}); nvinfer1::IShuffleLayer* dfl30_3 = DFL(network, weightMap, *split30_3_0->getOutput(0), 4, (kInputH / strides[3]) * (kInputW / strides[3]), 1, 1, 0, "model.30.dfl.conv.weight"); nvinfer1::ITensor* inputTensor30_dfl_3[] = {dfl30_3->getOutput(0), split30_3_1->getOutput(0)}; nvinfer1::IConcatenationLayer* cat30_dfl_3 = network->addConcatenation(inputTensor30_dfl_3, 2); nvinfer1::IPluginV2Layer* yolo = addYoLoLayer( network, std::vector{cat30_dfl_0, cat30_dfl_1, cat30_dfl_2, cat30_dfl_3}, strides, stridesLength, kNumClass, false, false, false); yolo->getOutput(0)->setName(kOutputTensorName); network->markOutput(*yolo->getOutput(0)); builder->setMaxBatchSize(kBatchSize); config->setMaxWorkspaceSize(16 * (1 << 20)); #if defined(USE_FP16) config->setFlag(nvinfer1::BuilderFlag::kFP16); #elif defined(USE_INT8) std::cout << "Your platform support int8: " << (builder->platformHasFastInt8() ? "true" : "false") << std::endl; assert(builder->platformHasFastInt8()); config->setFlag(nvinfer1::BuilderFlag::kINT8); auto* calibrator = new Int8EntropyCalibrator2(1, kInputW, kInputH, kInputQuantizationFolder, "int8calib.table", kInputTensorName); config->setInt8Calibrator(calibrator); #endif std::cout << "Building engine, please wait for a while..." << std::endl; nvinfer1::IHostMemory* serialized_model = builder->buildSerializedNetwork(*network, *config); std::cout << "Build engine successfully!" 
<< std::endl; delete network; for (auto& mem : weightMap) { free((void*)(mem.second.values)); } return serialized_model; } nvinfer1::IHostMemory* buildEngineYolov8DetP2(nvinfer1::IBuilder* builder, nvinfer1::IBuilderConfig* config, nvinfer1::DataType dt, const std::string& wts_path, float& gd, float& gw, int& max_channels) { std::map weightMap = loadWeights(wts_path); nvinfer1::INetworkDefinition* network = builder->createNetworkV2(0U); /******************************************************************************************************* ****************************************** YOLOV8 INPUT *********************************************** *******************************************************************************************************/ nvinfer1::ITensor* data = network->addInput(kInputTensorName, dt, nvinfer1::Dims3{3, kInputH, kInputW}); assert(data); /******************************************************************************************************* ***************************************** YOLOV8 BACKBONE ********************************************* *******************************************************************************************************/ nvinfer1::IElementWiseLayer* conv0 = convBnSiLU(network, weightMap, *data, get_width(64, gw, max_channels), 3, 2, 1, "model.0"); nvinfer1::IElementWiseLayer* conv1 = convBnSiLU(network, weightMap, *conv0->getOutput(0), get_width(128, gw, max_channels), 3, 2, 1, "model.1"); // 11233 nvinfer1::IElementWiseLayer* conv2 = C2F(network, weightMap, *conv1->getOutput(0), get_width(128, gw, max_channels), get_width(128, gw, max_channels), get_depth(3, gd), true, 0.5, "model.2"); nvinfer1::IElementWiseLayer* conv3 = convBnSiLU(network, weightMap, *conv2->getOutput(0), get_width(256, gw, max_channels), 3, 2, 1, "model.3"); // 22466 nvinfer1::IElementWiseLayer* conv4 = C2F(network, weightMap, *conv3->getOutput(0), get_width(256, gw, max_channels), get_width(256, gw, max_channels), get_depth(6, gd), true, 0.5, 
"model.4"); nvinfer1::IElementWiseLayer* conv5 = convBnSiLU(network, weightMap, *conv4->getOutput(0), get_width(512, gw, max_channels), 3, 2, 1, "model.5"); // 22466 nvinfer1::IElementWiseLayer* conv6 = C2F(network, weightMap, *conv5->getOutput(0), get_width(512, gw, max_channels), get_width(512, gw, max_channels), get_depth(6, gd), true, 0.5, "model.6"); nvinfer1::IElementWiseLayer* conv7 = convBnSiLU(network, weightMap, *conv6->getOutput(0), get_width(1024, gw, max_channels), 3, 2, 1, "model.7"); // 11233 nvinfer1::IElementWiseLayer* conv8 = C2F(network, weightMap, *conv7->getOutput(0), get_width(1024, gw, max_channels), get_width(1024, gw, max_channels), get_depth(3, gd), true, 0.5, "model.8"); nvinfer1::IElementWiseLayer* conv9 = SPPF(network, weightMap, *conv8->getOutput(0), get_width(1024, gw, max_channels), get_width(1024, gw, max_channels), 5, "model.9"); /******************************************************************************************************* ********************************************* YOLOV8 HEAD ********************************************* *******************************************************************************************************/ // Head float scale[] = {1.0, 2.0, 2.0}; // scale used for upsampling // P4 nvinfer1::IResizeLayer* upsample10 = network->addResize(*conv9->getOutput(0)); // Assuming conv9 is the last layer of the backbone // as per P5 in your first section. 
upsample10->setResizeMode(nvinfer1::ResizeMode::kNEAREST); upsample10->setScales(scale, 3); nvinfer1::ITensor* concat11_inputs[] = {upsample10->getOutput(0), conv6->getOutput(0)}; // Assuming conv6 corresponds to "backbone P4" as // per your pseudocode nvinfer1::IConcatenationLayer* concat11 = network->addConcatenation(concat11_inputs, 2); nvinfer1::IElementWiseLayer* conv12 = C2F(network, weightMap, *concat11->getOutput(0), get_width(512, gw, max_channels), get_width(512, gw, max_channels), get_depth(3, gd), false, 0.5, "model.12"); // P3 nvinfer1::IResizeLayer* upsample13 = network->addResize(*conv12->getOutput(0)); upsample13->setResizeMode(nvinfer1::ResizeMode::kNEAREST); upsample13->setScales(scale, 3); nvinfer1::ITensor* concat14_inputs[] = {upsample13->getOutput(0), conv4->getOutput(0)}; // Assuming conv4 corresponds to "backbone P3" nvinfer1::IConcatenationLayer* concat14 = network->addConcatenation(concat14_inputs, 2); nvinfer1::IElementWiseLayer* conv15 = C2F(network, weightMap, *concat14->getOutput(0), get_width(256, gw, max_channels), get_width(256, gw, max_channels), get_depth(3, gd), false, 0.5, "model.15"); // P2 nvinfer1::IResizeLayer* upsample16 = network->addResize(*conv15->getOutput(0)); upsample16->setResizeMode(nvinfer1::ResizeMode::kNEAREST); upsample16->setScales(scale, 3); nvinfer1::ITensor* concat17_inputs[] = {upsample16->getOutput(0), conv2->getOutput(0)}; // Assuming conv2 corresponds to "backbone P2" nvinfer1::IConcatenationLayer* concat17 = network->addConcatenation(concat17_inputs, 2); nvinfer1::IElementWiseLayer* conv18 = C2F(network, weightMap, *concat17->getOutput(0), get_width(128, gw, max_channels), get_width(128, gw, max_channels), get_depth(3, gd), false, 0.5, "model.18"); // Additional layers for P3, P4, P5 // Downsample and concatenate for P3 nvinfer1::IElementWiseLayer* conv19 = convBnSiLU(network, weightMap, *conv18->getOutput(0), get_width(128, gw, max_channels), 3, 2, 1, "model.19"); nvinfer1::ITensor* concat20_inputs[] = 
{ conv19->getOutput(0), conv15->getOutput(0)}; // concatenate with higher-resolution feature map from P3 nvinfer1::IConcatenationLayer* concat20 = network->addConcatenation(concat20_inputs, 2); nvinfer1::IElementWiseLayer* conv21 = C2F(network, weightMap, *concat20->getOutput(0), get_width(256, gw, max_channels), get_width(256, gw, max_channels), get_depth(3, gd), false, 0.5, "model.21"); // Downsample and concatenate for P4 nvinfer1::IElementWiseLayer* conv22 = convBnSiLU(network, weightMap, *conv21->getOutput(0), get_width(256, gw, max_channels), 3, 2, 1, "model.22"); nvinfer1::ITensor* concat23_inputs[] = { conv22->getOutput(0), conv12->getOutput(0)}; // concatenate with higher-resolution feature map from P4 nvinfer1::IConcatenationLayer* concat23 = network->addConcatenation(concat23_inputs, 2); nvinfer1::IElementWiseLayer* conv24 = C2F(network, weightMap, *concat23->getOutput(0), get_width(512, gw, max_channels), get_width(512, gw, max_channels), get_depth(3, gd), false, 0.5, "model.24"); // Downsample and concatenate for P5 nvinfer1::IElementWiseLayer* conv25 = convBnSiLU(network, weightMap, *conv24->getOutput(0), get_width(512, gw, max_channels), 3, 2, 1, "model.25"); nvinfer1::ITensor* concat26_inputs[] = { conv25->getOutput(0), conv9->getOutput(0)}; // concatenate with higher-resolution feature map from P5 nvinfer1::IConcatenationLayer* concat26 = network->addConcatenation(concat26_inputs, 2); nvinfer1::IElementWiseLayer* conv27 = C2F(network, weightMap, *concat26->getOutput(0), get_width(1024, gw, max_channels), get_width(1024, gw, max_channels), get_depth(3, gd), false, 0.5, "model.27"); /******************************************************************************************************* ********************************************* YOLOV8 OUTPUT ******************************************* *******************************************************************************************************/ int base_in_channel = 64; int base_out_channel = (gw == 
0.25) ? std::max(64, std::min(kNumClass, 100)) : get_width(128, gw, max_channels); // output0 nvinfer1::IElementWiseLayer* conv28_cv2_0_0 = convBnSiLU(network, weightMap, *conv18->getOutput(0), base_in_channel, 3, 1, 1, "model.28.cv2.0.0"); nvinfer1::IElementWiseLayer* conv28_cv2_0_1 = convBnSiLU(network, weightMap, *conv28_cv2_0_0->getOutput(0), base_in_channel, 3, 1, 1, "model.28.cv2.0.1"); nvinfer1::IConvolutionLayer* conv28_cv2_0_2 = network->addConvolutionNd(*conv28_cv2_0_1->getOutput(0), base_in_channel, nvinfer1::DimsHW{1, 1}, weightMap["model.28.cv2.0.2.weight"], weightMap["model.28.cv2.0.2.bias"]); conv28_cv2_0_2->setStrideNd(nvinfer1::DimsHW{1, 1}); conv28_cv2_0_2->setPaddingNd(nvinfer1::DimsHW{0, 0}); nvinfer1::IElementWiseLayer* conv28_cv3_0_0 = convBnSiLU(network, weightMap, *conv18->getOutput(0), base_out_channel, 3, 1, 1, "model.28.cv3.0.0"); nvinfer1::IElementWiseLayer* conv28_cv3_0_1 = convBnSiLU(network, weightMap, *conv28_cv3_0_0->getOutput(0), base_out_channel, 3, 1, 1, "model.28.cv3.0.1"); nvinfer1::IConvolutionLayer* conv28_cv3_0_2 = network->addConvolutionNd(*conv28_cv3_0_1->getOutput(0), kNumClass, nvinfer1::DimsHW{1, 1}, weightMap["model.28.cv3.0.2.weight"], weightMap["model.28.cv3.0.2.bias"]); conv28_cv3_0_2->setStride(nvinfer1::DimsHW{1, 1}); conv28_cv3_0_2->setPadding(nvinfer1::DimsHW{0, 0}); nvinfer1::ITensor* inputTensor28_0[] = {conv28_cv2_0_2->getOutput(0), conv28_cv3_0_2->getOutput(0)}; nvinfer1::IConcatenationLayer* cat28_0 = network->addConcatenation(inputTensor28_0, 2); // output1 nvinfer1::IElementWiseLayer* conv28_cv2_1_0 = convBnSiLU(network, weightMap, *conv21->getOutput(0), base_in_channel, 3, 1, 1, "model.28.cv2.1.0"); nvinfer1::IElementWiseLayer* conv28_cv2_1_1 = convBnSiLU(network, weightMap, *conv28_cv2_1_0->getOutput(0), base_in_channel, 3, 1, 1, "model.28.cv2.1.1"); nvinfer1::IConvolutionLayer* conv28_cv2_1_2 = network->addConvolutionNd(*conv28_cv2_1_1->getOutput(0), 64, nvinfer1::DimsHW{1, 1}, 
weightMap["model.28.cv2.1.2.weight"], weightMap["model.28.cv2.1.2.bias"]); conv28_cv2_1_2->setStrideNd(nvinfer1::DimsHW{1, 1}); conv28_cv2_1_2->setPaddingNd(nvinfer1::DimsHW{0, 0}); nvinfer1::IElementWiseLayer* conv28_cv3_1_0 = convBnSiLU(network, weightMap, *conv21->getOutput(0), base_out_channel, 3, 1, 1, "model.28.cv3.1.0"); nvinfer1::IElementWiseLayer* conv28_cv3_1_1 = convBnSiLU(network, weightMap, *conv28_cv3_1_0->getOutput(0), base_out_channel, 3, 1, 1, "model.28.cv3.1.1"); nvinfer1::IConvolutionLayer* conv28_cv3_1_2 = network->addConvolutionNd(*conv28_cv3_1_1->getOutput(0), kNumClass, nvinfer1::DimsHW{1, 1}, weightMap["model.28.cv3.1.2.weight"], weightMap["model.28.cv3.1.2.bias"]); conv28_cv3_1_2->setStrideNd(nvinfer1::DimsHW{1, 1}); conv28_cv3_1_2->setPaddingNd(nvinfer1::DimsHW{0, 0}); nvinfer1::ITensor* inputTensor28_1[] = {conv28_cv2_1_2->getOutput(0), conv28_cv3_1_2->getOutput(0)}; nvinfer1::IConcatenationLayer* cat28_1 = network->addConcatenation(inputTensor28_1, 2); // output2 nvinfer1::IElementWiseLayer* conv28_cv2_2_0 = convBnSiLU(network, weightMap, *conv24->getOutput(0), base_in_channel, 3, 1, 1, "model.28.cv2.2.0"); nvinfer1::IElementWiseLayer* conv28_cv2_2_1 = convBnSiLU(network, weightMap, *conv28_cv2_2_0->getOutput(0), base_in_channel, 3, 1, 1, "model.28.cv2.2.1"); nvinfer1::IConvolutionLayer* conv28_cv2_2_2 = network->addConvolution(*conv28_cv2_2_1->getOutput(0), 64, nvinfer1::DimsHW{1, 1}, weightMap["model.28.cv2.2.2.weight"], weightMap["model.28.cv2.2.2.bias"]); conv28_cv2_2_2->setStrideNd(nvinfer1::DimsHW{1, 1}); conv28_cv2_2_2->setPaddingNd(nvinfer1::DimsHW{0, 0}); nvinfer1::IElementWiseLayer* conv28_cv3_2_0 = convBnSiLU(network, weightMap, *conv24->getOutput(0), base_out_channel, 3, 1, 1, "model.28.cv3.2.0"); nvinfer1::IElementWiseLayer* conv28_cv3_2_1 = convBnSiLU(network, weightMap, *conv28_cv3_2_0->getOutput(0), base_out_channel, 3, 1, 1, "model.28.cv3.2.1"); nvinfer1::IConvolutionLayer* conv28_cv3_2_2 = 
network->addConvolution(*conv28_cv3_2_1->getOutput(0), kNumClass, nvinfer1::DimsHW{1, 1}, weightMap["model.28.cv3.2.2.weight"], weightMap["model.28.cv3.2.2.bias"]); conv28_cv3_2_2->setStrideNd(nvinfer1::DimsHW{1, 1}); conv28_cv3_2_2->setPaddingNd(nvinfer1::DimsHW{0, 0}); nvinfer1::ITensor* inputTensor28_2[] = {conv28_cv2_2_2->getOutput(0), conv28_cv3_2_2->getOutput(0)}; nvinfer1::IConcatenationLayer* cat28_2 = network->addConcatenation(inputTensor28_2, 2); // output3 nvinfer1::IElementWiseLayer* conv28_cv2_3_0 = convBnSiLU(network, weightMap, *conv27->getOutput(0), base_in_channel, 3, 1, 1, "model.28.cv2.3.0"); nvinfer1::IElementWiseLayer* conv28_cv2_3_1 = convBnSiLU(network, weightMap, *conv28_cv2_3_0->getOutput(0), base_in_channel, 3, 1, 1, "model.28.cv2.3.1"); nvinfer1::IConvolutionLayer* conv28_cv2_3_2 = network->addConvolution(*conv28_cv2_3_1->getOutput(0), 64, nvinfer1::DimsHW{1, 1}, weightMap["model.28.cv2.3.2.weight"], weightMap["model.28.cv2.3.2.bias"]); conv28_cv2_3_2->setStrideNd(nvinfer1::DimsHW{1, 1}); conv28_cv2_3_2->setPaddingNd(nvinfer1::DimsHW{0, 0}); nvinfer1::IElementWiseLayer* conv28_cv3_3_0 = convBnSiLU(network, weightMap, *conv27->getOutput(0), base_out_channel, 3, 1, 1, "model.28.cv3.3.0"); nvinfer1::IElementWiseLayer* conv28_cv3_3_1 = convBnSiLU(network, weightMap, *conv28_cv3_3_0->getOutput(0), base_out_channel, 3, 1, 1, "model.28.cv3.3.1"); nvinfer1::IConvolutionLayer* conv28_cv3_3_2 = network->addConvolution(*conv28_cv3_3_1->getOutput(0), kNumClass, nvinfer1::DimsHW{1, 1}, weightMap["model.28.cv3.3.2.weight"], weightMap["model.28.cv3.3.2.bias"]); conv28_cv3_3_2->setStrideNd(nvinfer1::DimsHW{1, 1}); conv28_cv3_3_2->setPaddingNd(nvinfer1::DimsHW{0, 0}); nvinfer1::ITensor* inputTensor28_3[] = {conv28_cv2_3_2->getOutput(0), conv28_cv3_3_2->getOutput(0)}; nvinfer1::IConcatenationLayer* cat28_3 = network->addConcatenation(inputTensor28_3, 2); 
/******************************************************************************************************* ********************************************* YOLOV8 DETECT ******************************************* *******************************************************************************************************/ nvinfer1::IElementWiseLayer* conv_layers[] = {conv1, conv3, conv5, conv7}; int strides[sizeof(conv_layers) / sizeof(conv_layers[0])]; calculateStrides(conv_layers, sizeof(conv_layers) / sizeof(conv_layers[0]), kInputH, strides); int stridesLength = sizeof(strides) / sizeof(int); // P2 processing steps (remains unchanged) nvinfer1::IShuffleLayer* shuffle28_0 = network->addShuffle(*cat28_0->getOutput(0)); shuffle28_0->setReshapeDimensions(nvinfer1::Dims2{64 + kNumClass, (kInputH / strides[0]) * (kInputW / strides[0])}); nvinfer1::ISliceLayer* split28_0_0 = network->addSlice( *shuffle28_0->getOutput(0), nvinfer1::Dims2{0, 0}, nvinfer1::Dims2{64, (kInputH / strides[0]) * (kInputW / strides[0])}, nvinfer1::Dims2{1, 1}); nvinfer1::ISliceLayer* split28_0_1 = network->addSlice( *shuffle28_0->getOutput(0), nvinfer1::Dims2{64, 0}, nvinfer1::Dims2{kNumClass, (kInputH / strides[0]) * (kInputW / strides[0])}, nvinfer1::Dims2{1, 1}); nvinfer1::IShuffleLayer* dfl28_0 = DFL(network, weightMap, *split28_0_0->getOutput(0), 4, (kInputH / strides[0]) * (kInputW / strides[0]), 1, 1, 0, "model.28.dfl.conv.weight"); nvinfer1::ITensor* inputTensor28_dfl_0[] = {dfl28_0->getOutput(0), split28_0_1->getOutput(0)}; nvinfer1::IConcatenationLayer* cat28_dfl_0 = network->addConcatenation(inputTensor28_dfl_0, 2); // P3 processing steps (remains unchanged) nvinfer1::IShuffleLayer* shuffle28_1 = network->addShuffle(*cat28_1->getOutput(0)); shuffle28_1->setReshapeDimensions(nvinfer1::Dims2{64 + kNumClass, (kInputH / strides[1]) * (kInputW / strides[1])}); nvinfer1::ISliceLayer* split28_1_0 = network->addSlice( *shuffle28_1->getOutput(0), nvinfer1::Dims2{0, 0}, nvinfer1::Dims2{64, (kInputH / 
strides[1]) * (kInputW / strides[1])}, nvinfer1::Dims2{1, 1}); nvinfer1::ISliceLayer* split28_1_1 = network->addSlice( *shuffle28_1->getOutput(0), nvinfer1::Dims2{64, 0}, nvinfer1::Dims2{kNumClass, (kInputH / strides[1]) * (kInputW / strides[1])}, nvinfer1::Dims2{1, 1}); nvinfer1::IShuffleLayer* dfl28_1 = DFL(network, weightMap, *split28_1_0->getOutput(0), 4, (kInputH / strides[1]) * (kInputW / strides[1]), 1, 1, 0, "model.28.dfl.conv.weight"); nvinfer1::ITensor* inputTensor28_dfl_1[] = {dfl28_1->getOutput(0), split28_1_1->getOutput(0)}; nvinfer1::IConcatenationLayer* cat28_dfl_1 = network->addConcatenation(inputTensor28_dfl_1, 2); // P4 processing steps (remains unchanged) nvinfer1::IShuffleLayer* shuffle28_2 = network->addShuffle(*cat28_2->getOutput(0)); shuffle28_2->setReshapeDimensions(nvinfer1::Dims2{64 + kNumClass, (kInputH / strides[2]) * (kInputW / strides[2])}); nvinfer1::ISliceLayer* split28_2_0 = network->addSlice( *shuffle28_2->getOutput(0), nvinfer1::Dims2{0, 0}, nvinfer1::Dims2{64, (kInputH / strides[2]) * (kInputW / strides[2])}, nvinfer1::Dims2{1, 1}); nvinfer1::ISliceLayer* split28_2_1 = network->addSlice( *shuffle28_2->getOutput(0), nvinfer1::Dims2{64, 0}, nvinfer1::Dims2{kNumClass, (kInputH / strides[2]) * (kInputW / strides[2])}, nvinfer1::Dims2{1, 1}); nvinfer1::IShuffleLayer* dfl28_2 = DFL(network, weightMap, *split28_2_0->getOutput(0), 4, (kInputH / strides[2]) * (kInputW / strides[2]), 1, 1, 0, "model.28.dfl.conv.weight"); nvinfer1::ITensor* inputTensor28_dfl_2[] = {dfl28_2->getOutput(0), split28_2_1->getOutput(0)}; nvinfer1::IConcatenationLayer* cat28_dfl_2 = network->addConcatenation(inputTensor28_dfl_2, 2); // P5 processing steps nvinfer1::IShuffleLayer* shuffle28_3 = network->addShuffle(*cat28_3->getOutput(0)); shuffle28_3->setReshapeDimensions(nvinfer1::Dims2{64 + kNumClass, (kInputH / strides[3]) * (kInputW / strides[3])}); nvinfer1::ISliceLayer* split28_3_0 = network->addSlice( *shuffle28_3->getOutput(0), nvinfer1::Dims2{0, 0}, 
nvinfer1::Dims2{64, (kInputH / strides[3]) * (kInputW / strides[3])}, nvinfer1::Dims2{1, 1}); nvinfer1::ISliceLayer* split28_3_1 = network->addSlice( *shuffle28_3->getOutput(0), nvinfer1::Dims2{64, 0}, nvinfer1::Dims2{kNumClass, (kInputH / strides[3]) * (kInputW / strides[3])}, nvinfer1::Dims2{1, 1}); nvinfer1::IShuffleLayer* dfl28_3 = DFL(network, weightMap, *split28_3_0->getOutput(0), 4, (kInputH / strides[3]) * (kInputW / strides[3]), 1, 1, 0, "model.28.dfl.conv.weight"); nvinfer1::ITensor* inputTensor28_dfl_3[] = {dfl28_3->getOutput(0), split28_3_1->getOutput(0)}; nvinfer1::IConcatenationLayer* cat28_dfl_3 = network->addConcatenation(inputTensor28_dfl_3, 2); nvinfer1::IPluginV2Layer* yolo = addYoLoLayer( network, std::vector{cat28_dfl_0, cat28_dfl_1, cat28_dfl_2, cat28_dfl_3}, strides, stridesLength, kNumClass, false, false, false); yolo->getOutput(0)->setName(kOutputTensorName); network->markOutput(*yolo->getOutput(0)); builder->setMaxBatchSize(kBatchSize); config->setMaxWorkspaceSize(16 * (1 << 20)); #if defined(USE_FP16) config->setFlag(nvinfer1::BuilderFlag::kFP16); #elif defined(USE_INT8) std::cout << "Your platform support int8: " << (builder->platformHasFastInt8() ? "true" : "false") << std::endl; assert(builder->platformHasFastInt8()); config->setFlag(nvinfer1::BuilderFlag::kINT8); auto* calibrator = new Int8EntropyCalibrator2(1, kInputW, kInputH, kInputQuantizationFolder, "int8calib.table", kInputTensorName); config->setInt8Calibrator(calibrator); #endif std::cout << "Building engine, please wait for a while..." << std::endl; nvinfer1::IHostMemory* serialized_model = builder->buildSerializedNetwork(*network, *config); std::cout << "Build engine successfully!" 
<< std::endl; delete network; for (auto& mem : weightMap) { free((void*)(mem.second.values)); } return serialized_model; } nvinfer1::IHostMemory* buildEngineYolov8Cls(nvinfer1::IBuilder* builder, nvinfer1::IBuilderConfig* config, nvinfer1::DataType dt, const std::string& wts_path, float& gd, float& gw) { std::map weightMap = loadWeights(wts_path); nvinfer1::INetworkDefinition* network = builder->createNetworkV2(0U); int max_channels = 1280; // ****************************************** YOLOV8 INPUT // ********************************************** nvinfer1::ITensor* data = network->addInput(kInputTensorName, dt, nvinfer1::Dims3{3, kClsInputH, kClsInputW}); assert(data); // ***************************************** YOLOV8 BACKBONE // ******************************************** nvinfer1::IElementWiseLayer* conv0 = convBnSiLU(network, weightMap, *data, get_width(64, gw, max_channels), 3, 2, 1, "model.0"); nvinfer1::IElementWiseLayer* conv1 = convBnSiLU(network, weightMap, *conv0->getOutput(0), get_width(128, gw, max_channels), 3, 2, 1, "model.1"); // C2 Block (11233) nvinfer1::IElementWiseLayer* conv2 = C2F(network, weightMap, *conv1->getOutput(0), get_width(128, gw, max_channels), get_width(128, gw, max_channels), get_depth(3, gd), true, 0.5, "model.2"); nvinfer1::IElementWiseLayer* conv3 = convBnSiLU(network, weightMap, *conv2->getOutput(0), get_width(256, gw, max_channels), 3, 2, 1, "model.3"); // C2 Block Sequence (22466) nvinfer1::IElementWiseLayer* conv4 = C2F(network, weightMap, *conv3->getOutput(0), get_width(256, gw, max_channels), get_width(256, gw, max_channels), get_depth(6, gd), true, 0.5, "model.4"); nvinfer1::IElementWiseLayer* conv5 = convBnSiLU(network, weightMap, *conv4->getOutput(0), get_width(512, gw, max_channels), 3, 2, 1, "model.5"); // C2 Block Sequence (22466) nvinfer1::IElementWiseLayer* conv6 = C2F(network, weightMap, *conv5->getOutput(0), get_width(512, gw, max_channels), get_width(512, gw, max_channels), get_depth(6, gd), true, 0.5, 
"model.6"); nvinfer1::IElementWiseLayer* conv7 = convBnSiLU(network, weightMap, *conv6->getOutput(0), get_width(1024, gw, max_channels), 3, 2, 1, "model.7"); // C2 Block (11233) nvinfer1::IElementWiseLayer* conv8 = C2F(network, weightMap, *conv7->getOutput(0), get_width(1024, gw, max_channels), get_width(1024, gw, max_channels), get_depth(3, gd), true, 0.5, "model.8"); // ********************************************* YOLOV8 HEAD // ********************************************* auto conv_class = convBnSiLU(network, weightMap, *conv8->getOutput(0), 1280, 1, 1, 1, "model.9.conv"); // Adjusted code nvinfer1::Dims dims = conv_class->getOutput(0)->getDimensions(); // Obtain the dimensions of the // output of conv_class assert(dims.nbDims == 3); // Make sure there are exactly 3 dimensions (channels, height, width) nvinfer1::IPoolingLayer* pool2 = network->addPoolingNd(*conv_class->getOutput(0), nvinfer1::PoolingType::kAVERAGE, nvinfer1::DimsHW{dims.d[1], dims.d[2]}); assert(pool2); // Fully connected layer declaration nvinfer1::IFullyConnectedLayer* yolo = network->addFullyConnected( *pool2->getOutput(0), kClsNumClass, weightMap["model.9.linear.weight"], weightMap["model.9.linear.bias"]); assert(yolo); // Set the name for the output tensor and mark it as network output yolo->getOutput(0)->setName(kOutputTensorName); network->markOutput(*yolo->getOutput(0)); // Set the maximum batch size and workspace size builder->setMaxBatchSize(kBatchSize); config->setMaxWorkspaceSize(16 * (1 << 20)); // Configuration according to the precision mode being used #if defined(USE_FP16) config->setFlag(nvinfer1::BuilderFlag::kFP16); #elif defined(USE_INT8) std::cout << "Your platform supports int8: " << (builder->platformHasFastInt8() ? 
"true" : "false") << std::endl; assert(builder->platformHasFastInt8()); config->setFlag(nvinfer1::BuilderFlag::kINT8); auto* calibrator = new Int8EntropyCalibrator2(1, kClsInputW, kClsInputH, kInputQuantizationFolder, "int8calib.table", kInputTensorName); config->setInt8Calibrator(calibrator); #endif // Begin building the engine; this may take a while std::cout << "Building engine, please wait for a while..." << std::endl; nvinfer1::IHostMemory* serialized_model = builder->buildSerializedNetwork(*network, *config); std::cout << "Build engine successfully!" << std::endl; // Cleanup the network definition and allocated weights delete network; for (auto& mem : weightMap) { free((void*)(mem.second.values)); } return serialized_model; } nvinfer1::IHostMemory* buildEngineYolov8Seg(nvinfer1::IBuilder* builder, nvinfer1::IBuilderConfig* config, nvinfer1::DataType dt, const std::string& wts_path, float& gd, float& gw, int& max_channels) { std::map weightMap = loadWeights(wts_path); nvinfer1::INetworkDefinition* network = builder->createNetworkV2(0U); /******************************************************************************************************* ****************************************** YOLOV8 INPUT *********************************************** *******************************************************************************************************/ nvinfer1::ITensor* data = network->addInput(kInputTensorName, dt, nvinfer1::Dims3{3, kInputH, kInputW}); assert(data); /******************************************************************************************************* ***************************************** YOLOV8 BACKBONE ********************************************* *******************************************************************************************************/ nvinfer1::IElementWiseLayer* conv0 = convBnSiLU(network, weightMap, *data, get_width(64, gw, max_channels), 3, 2, 1, "model.0"); nvinfer1::IElementWiseLayer* conv1 = convBnSiLU(network, 
weightMap, *conv0->getOutput(0), get_width(128, gw, max_channels), 3, 2, 1, "model.1"); nvinfer1::IElementWiseLayer* conv2 = C2F(network, weightMap, *conv1->getOutput(0), get_width(128, gw, max_channels), get_width(128, gw, max_channels), get_depth(3, gd), true, 0.5, "model.2"); nvinfer1::IElementWiseLayer* conv3 = convBnSiLU(network, weightMap, *conv2->getOutput(0), get_width(256, gw, max_channels), 3, 2, 1, "model.3"); nvinfer1::IElementWiseLayer* conv4 = C2F(network, weightMap, *conv3->getOutput(0), get_width(256, gw, max_channels), get_width(256, gw, max_channels), get_depth(6, gd), true, 0.5, "model.4"); nvinfer1::IElementWiseLayer* conv5 = convBnSiLU(network, weightMap, *conv4->getOutput(0), get_width(512, gw, max_channels), 3, 2, 1, "model.5"); nvinfer1::IElementWiseLayer* conv6 = C2F(network, weightMap, *conv5->getOutput(0), get_width(512, gw, max_channels), get_width(512, gw, max_channels), get_depth(6, gd), true, 0.5, "model.6"); nvinfer1::IElementWiseLayer* conv7 = convBnSiLU(network, weightMap, *conv6->getOutput(0), get_width(1024, gw, max_channels), 3, 2, 1, "model.7"); nvinfer1::IElementWiseLayer* conv8 = C2F(network, weightMap, *conv7->getOutput(0), get_width(1024, gw, max_channels), get_width(1024, gw, max_channels), get_depth(3, gd), true, 0.5, "model.8"); nvinfer1::IElementWiseLayer* conv9 = SPPF(network, weightMap, *conv8->getOutput(0), get_width(1024, gw, max_channels), get_width(1024, gw, max_channels), 5, "model.9"); /******************************************************************************************************* ********************************************* YOLOV8 HEAD ********************************************* *******************************************************************************************************/ float scale[] = {1.0, 2.0, 2.0}; nvinfer1::IResizeLayer* upsample10 = network->addResize(*conv9->getOutput(0)); assert(upsample10); upsample10->setResizeMode(nvinfer1::ResizeMode::kNEAREST); upsample10->setScales(scale, 
3); nvinfer1::ITensor* inputTensor11[] = {upsample10->getOutput(0), conv6->getOutput(0)}; nvinfer1::IConcatenationLayer* cat11 = network->addConcatenation(inputTensor11, 2); nvinfer1::IElementWiseLayer* conv12 = C2F(network, weightMap, *cat11->getOutput(0), get_width(512, gw, max_channels), get_width(512, gw, max_channels), get_depth(3, gd), false, 0.5, "model.12"); nvinfer1::IResizeLayer* upsample13 = network->addResize(*conv12->getOutput(0)); assert(upsample13); upsample13->setResizeMode(nvinfer1::ResizeMode::kNEAREST); upsample13->setScales(scale, 3); nvinfer1::ITensor* inputTensor14[] = {upsample13->getOutput(0), conv4->getOutput(0)}; nvinfer1::IConcatenationLayer* cat14 = network->addConcatenation(inputTensor14, 2); nvinfer1::IElementWiseLayer* conv15 = C2F(network, weightMap, *cat14->getOutput(0), get_width(256, gw, max_channels), get_width(256, gw, max_channels), get_depth(3, gd), false, 0.5, "model.15"); nvinfer1::IElementWiseLayer* conv16 = convBnSiLU(network, weightMap, *conv15->getOutput(0), get_width(256, gw, max_channels), 3, 2, 1, "model.16"); nvinfer1::ITensor* inputTensor17[] = {conv16->getOutput(0), conv12->getOutput(0)}; nvinfer1::IConcatenationLayer* cat17 = network->addConcatenation(inputTensor17, 2); nvinfer1::IElementWiseLayer* conv18 = C2F(network, weightMap, *cat17->getOutput(0), get_width(512, gw, max_channels), get_width(512, gw, max_channels), get_depth(3, gd), false, 0.5, "model.18"); nvinfer1::IElementWiseLayer* conv19 = convBnSiLU(network, weightMap, *conv18->getOutput(0), get_width(512, gw, max_channels), 3, 2, 1, "model.19"); nvinfer1::ITensor* inputTensor20[] = {conv19->getOutput(0), conv9->getOutput(0)}; nvinfer1::IConcatenationLayer* cat20 = network->addConcatenation(inputTensor20, 2); nvinfer1::IElementWiseLayer* conv21 = C2F(network, weightMap, *cat20->getOutput(0), get_width(1024, gw, max_channels), get_width(1024, gw, max_channels), get_depth(3, gd), false, 0.5, "model.21"); 
/******************************************************************************************************* ********************************************* YOLOV8 OUTPUT ******************************************* *******************************************************************************************************/ int base_in_channel = (gw == 1.25) ? 80 : 64; int base_out_channel = (gw == 0.25) ? std::max(64, std::min(kNumClass, 100)) : get_width(256, gw, max_channels); // output0 nvinfer1::IElementWiseLayer* conv22_cv2_0_0 = convBnSiLU(network, weightMap, *conv15->getOutput(0), base_in_channel, 3, 1, 1, "model.22.cv2.0.0"); nvinfer1::IElementWiseLayer* conv22_cv2_0_1 = convBnSiLU(network, weightMap, *conv22_cv2_0_0->getOutput(0), base_in_channel, 3, 1, 1, "model.22.cv2.0.1"); nvinfer1::IConvolutionLayer* conv22_cv2_0_2 = network->addConvolutionNd(*conv22_cv2_0_1->getOutput(0), 64, nvinfer1::DimsHW{1, 1}, weightMap["model.22.cv2.0.2.weight"], weightMap["model.22.cv2.0.2.bias"]); conv22_cv2_0_2->setStrideNd(nvinfer1::DimsHW{1, 1}); conv22_cv2_0_2->setPaddingNd(nvinfer1::DimsHW{0, 0}); nvinfer1::IElementWiseLayer* conv22_cv3_0_0 = convBnSiLU(network, weightMap, *conv15->getOutput(0), base_out_channel, 3, 1, 1, "model.22.cv3.0.0"); nvinfer1::IElementWiseLayer* conv22_cv3_0_1 = convBnSiLU(network, weightMap, *conv22_cv3_0_0->getOutput(0), base_out_channel, 3, 1, 1, "model.22.cv3.0.1"); nvinfer1::IConvolutionLayer* conv22_cv3_0_2 = network->addConvolutionNd(*conv22_cv3_0_1->getOutput(0), kNumClass, nvinfer1::DimsHW{1, 1}, weightMap["model.22.cv3.0.2.weight"], weightMap["model.22.cv3.0.2.bias"]); conv22_cv3_0_2->setStride(nvinfer1::DimsHW{1, 1}); conv22_cv3_0_2->setPadding(nvinfer1::DimsHW{0, 0}); nvinfer1::ITensor* inputTensor22_0[] = {conv22_cv2_0_2->getOutput(0), conv22_cv3_0_2->getOutput(0)}; nvinfer1::IConcatenationLayer* cat22_0 = network->addConcatenation(inputTensor22_0, 2); // output1 nvinfer1::IElementWiseLayer* conv22_cv2_1_0 = convBnSiLU(network, weightMap, 
*conv18->getOutput(0), base_in_channel, 3, 1, 1, "model.22.cv2.1.0"); nvinfer1::IElementWiseLayer* conv22_cv2_1_1 = convBnSiLU(network, weightMap, *conv22_cv2_1_0->getOutput(0), base_in_channel, 3, 1, 1, "model.22.cv2.1.1"); nvinfer1::IConvolutionLayer* conv22_cv2_1_2 = network->addConvolutionNd(*conv22_cv2_1_1->getOutput(0), 64, nvinfer1::DimsHW{1, 1}, weightMap["model.22.cv2.1.2.weight"], weightMap["model.22.cv2.1.2.bias"]); conv22_cv2_1_2->setStrideNd(nvinfer1::DimsHW{1, 1}); conv22_cv2_1_2->setPaddingNd(nvinfer1::DimsHW{0, 0}); nvinfer1::IElementWiseLayer* conv22_cv3_1_0 = convBnSiLU(network, weightMap, *conv18->getOutput(0), base_out_channel, 3, 1, 1, "model.22.cv3.1.0"); nvinfer1::IElementWiseLayer* conv22_cv3_1_1 = convBnSiLU(network, weightMap, *conv22_cv3_1_0->getOutput(0), base_out_channel, 3, 1, 1, "model.22.cv3.1.1"); nvinfer1::IConvolutionLayer* conv22_cv3_1_2 = network->addConvolutionNd(*conv22_cv3_1_1->getOutput(0), kNumClass, nvinfer1::DimsHW{1, 1}, weightMap["model.22.cv3.1.2.weight"], weightMap["model.22.cv3.1.2.bias"]); conv22_cv3_1_2->setStrideNd(nvinfer1::DimsHW{1, 1}); conv22_cv3_1_2->setPaddingNd(nvinfer1::DimsHW{0, 0}); nvinfer1::ITensor* inputTensor22_1[] = {conv22_cv2_1_2->getOutput(0), conv22_cv3_1_2->getOutput(0)}; nvinfer1::IConcatenationLayer* cat22_1 = network->addConcatenation(inputTensor22_1, 2); // output2 nvinfer1::IElementWiseLayer* conv22_cv2_2_0 = convBnSiLU(network, weightMap, *conv21->getOutput(0), base_in_channel, 3, 1, 1, "model.22.cv2.2.0"); nvinfer1::IElementWiseLayer* conv22_cv2_2_1 = convBnSiLU(network, weightMap, *conv22_cv2_2_0->getOutput(0), base_in_channel, 3, 1, 1, "model.22.cv2.2.1"); nvinfer1::IConvolutionLayer* conv22_cv2_2_2 = network->addConvolution(*conv22_cv2_2_1->getOutput(0), 64, nvinfer1::DimsHW{1, 1}, weightMap["model.22.cv2.2.2.weight"], weightMap["model.22.cv2.2.2.bias"]); nvinfer1::IElementWiseLayer* conv22_cv3_2_0 = convBnSiLU(network, weightMap, *conv21->getOutput(0), base_out_channel, 3, 1, 1, 
"model.22.cv3.2.0"); nvinfer1::IElementWiseLayer* conv22_cv3_2_1 = convBnSiLU(network, weightMap, *conv22_cv3_2_0->getOutput(0), base_out_channel, 3, 1, 1, "model.22.cv3.2.1"); nvinfer1::IConvolutionLayer* conv22_cv3_2_2 = network->addConvolution(*conv22_cv3_2_1->getOutput(0), kNumClass, nvinfer1::DimsHW{1, 1}, weightMap["model.22.cv3.2.2.weight"], weightMap["model.22.cv3.2.2.bias"]); nvinfer1::ITensor* inputTensor22_2[] = {conv22_cv2_2_2->getOutput(0), conv22_cv3_2_2->getOutput(0)}; nvinfer1::IConcatenationLayer* cat22_2 = network->addConcatenation(inputTensor22_2, 2); /******************************************************************************************************* ********************************************* YOLOV8 DETECT ******************************************* *******************************************************************************************************/ nvinfer1::IElementWiseLayer* conv_layers[] = {conv3, conv5, conv7}; int strides[sizeof(conv_layers) / sizeof(conv_layers[0])]; calculateStrides(conv_layers, sizeof(conv_layers) / sizeof(conv_layers[0]), kInputH, strides); int stridesLength = sizeof(strides) / sizeof(int); nvinfer1::IShuffleLayer* shuffle22_0 = network->addShuffle(*cat22_0->getOutput(0)); shuffle22_0->setReshapeDimensions(nvinfer1::Dims2{64 + kNumClass, (kInputH / strides[0]) * (kInputW / strides[0])}); nvinfer1::ISliceLayer* split22_0_0 = network->addSlice( *shuffle22_0->getOutput(0), nvinfer1::Dims2{0, 0}, nvinfer1::Dims2{64, (kInputH / strides[0]) * (kInputW / strides[0])}, nvinfer1::Dims2{1, 1}); nvinfer1::ISliceLayer* split22_0_1 = network->addSlice( *shuffle22_0->getOutput(0), nvinfer1::Dims2{64, 0}, nvinfer1::Dims2{kNumClass, (kInputH / strides[0]) * (kInputW / strides[0])}, nvinfer1::Dims2{1, 1}); nvinfer1::IShuffleLayer* dfl22_0 = DFL(network, weightMap, *split22_0_0->getOutput(0), 4, (kInputH / strides[0]) * (kInputW / strides[0]), 1, 1, 0, "model.22.dfl.conv.weight"); nvinfer1::IShuffleLayer* shuffle22_1 = 
network->addShuffle(*cat22_1->getOutput(0)); shuffle22_1->setReshapeDimensions(nvinfer1::Dims2{64 + kNumClass, (kInputH / strides[1]) * (kInputW / strides[1])}); nvinfer1::ISliceLayer* split22_1_0 = network->addSlice( *shuffle22_1->getOutput(0), nvinfer1::Dims2{0, 0}, nvinfer1::Dims2{64, (kInputH / strides[1]) * (kInputW / strides[1])}, nvinfer1::Dims2{1, 1}); nvinfer1::ISliceLayer* split22_1_1 = network->addSlice( *shuffle22_1->getOutput(0), nvinfer1::Dims2{64, 0}, nvinfer1::Dims2{kNumClass, (kInputH / strides[1]) * (kInputW / strides[1])}, nvinfer1::Dims2{1, 1}); nvinfer1::IShuffleLayer* dfl22_1 = DFL(network, weightMap, *split22_1_0->getOutput(0), 4, (kInputH / strides[1]) * (kInputW / strides[1]), 1, 1, 0, "model.22.dfl.conv.weight"); nvinfer1::IShuffleLayer* shuffle22_2 = network->addShuffle(*cat22_2->getOutput(0)); shuffle22_2->setReshapeDimensions(nvinfer1::Dims2{64 + kNumClass, (kInputH / strides[2]) * (kInputW / strides[2])}); nvinfer1::ISliceLayer* split22_2_0 = network->addSlice( *shuffle22_2->getOutput(0), nvinfer1::Dims2{0, 0}, nvinfer1::Dims2{64, (kInputH / strides[2]) * (kInputW / strides[2])}, nvinfer1::Dims2{1, 1}); nvinfer1::ISliceLayer* split22_2_1 = network->addSlice( *shuffle22_2->getOutput(0), nvinfer1::Dims2{64, 0}, nvinfer1::Dims2{kNumClass, (kInputH / strides[2]) * (kInputW / strides[2])}, nvinfer1::Dims2{1, 1}); nvinfer1::IShuffleLayer* dfl22_2 = DFL(network, weightMap, *split22_2_0->getOutput(0), 4, (kInputH / strides[2]) * (kInputW / strides[2]), 1, 1, 0, "model.22.dfl.conv.weight"); // det0 auto proto_coef_0 = cv4_conv_combined(network, weightMap, *conv15->getOutput(0), "model.22.cv4.0", (kInputH / strides[0]) * (kInputW / strides[0]), gw, "seg"); nvinfer1::ITensor* inputTensor22_dfl_0[] = {dfl22_0->getOutput(0), split22_0_1->getOutput(0), proto_coef_0->getOutput(0)}; nvinfer1::IConcatenationLayer* cat22_dfl_0 = network->addConcatenation(inputTensor22_dfl_0, 3); // det1 auto proto_coef_1 = cv4_conv_combined(network, weightMap, 
*conv18->getOutput(0), "model.22.cv4.1", (kInputH / strides[1]) * (kInputW / strides[1]), gw, "seg"); nvinfer1::ITensor* inputTensor22_dfl_1[] = {dfl22_1->getOutput(0), split22_1_1->getOutput(0), proto_coef_1->getOutput(0)}; nvinfer1::IConcatenationLayer* cat22_dfl_1 = network->addConcatenation(inputTensor22_dfl_1, 3); // det2 auto proto_coef_2 = cv4_conv_combined(network, weightMap, *conv21->getOutput(0), "model.22.cv4.2", (kInputH / strides[2]) * (kInputW / strides[2]), gw, "seg"); nvinfer1::ITensor* inputTensor22_dfl_2[] = {dfl22_2->getOutput(0), split22_2_1->getOutput(0), proto_coef_2->getOutput(0)}; nvinfer1::IConcatenationLayer* cat22_dfl_2 = network->addConcatenation(inputTensor22_dfl_2, 3); nvinfer1::IPluginV2Layer* yolo = addYoLoLayer(network, std::vector{cat22_dfl_0, cat22_dfl_1, cat22_dfl_2}, strides, stridesLength, kNumClass, true, false, false); yolo->getOutput(0)->setName(kOutputTensorName); network->markOutput(*yolo->getOutput(0)); auto proto = Proto(network, weightMap, *conv15->getOutput(0), "model.22.proto", gw, max_channels); proto->getOutput(0)->setName("proto"); network->markOutput(*proto->getOutput(0)); builder->setMaxBatchSize(kBatchSize); config->setMaxWorkspaceSize(16 * (1 << 20)); #if defined(USE_FP16) config->setFlag(nvinfer1::BuilderFlag::kFP16); #elif defined(USE_INT8) std::cout << "Your platform support int8: " << (builder->platformHasFastInt8() ? "true" : "false") << std::endl; assert(builder->platformHasFastInt8()); config->setFlag(nvinfer1::BuilderFlag::kINT8); auto* calibrator = new Int8EntropyCalibrator2(1, kInputW, kInputH, kInputQuantizationFolder, "int8calib.table", kInputTensorName); config->setInt8Calibrator(calibrator); #endif std::cout << "Building engine, please wait for a while..." << std::endl; nvinfer1::IHostMemory* serialized_model = builder->buildSerializedNetwork(*network, *config); std::cout << "Build engine successfully!" 
<< std::endl; delete network; for (auto& mem : weightMap) { free((void*)(mem.second.values)); } return serialized_model; } nvinfer1::IHostMemory* buildEngineYolov8Pose(nvinfer1::IBuilder* builder, nvinfer1::IBuilderConfig* config, nvinfer1::DataType dt, const std::string& wts_path, float& gd, float& gw, int& max_channels) { std::map weightMap = loadWeights(wts_path); nvinfer1::INetworkDefinition* network = builder->createNetworkV2(0U); /******************************************************************************************************* ****************************************** YOLOV8 INPUT *********************************************** *******************************************************************************************************/ nvinfer1::ITensor* data = network->addInput(kInputTensorName, dt, nvinfer1::Dims3{3, kInputH, kInputW}); assert(data); /******************************************************************************************************* ***************************************** YOLOV8 BACKBONE ********************************************* *******************************************************************************************************/ nvinfer1::IElementWiseLayer* conv0 = convBnSiLU(network, weightMap, *data, get_width(64, gw, max_channels), 3, 2, 1, "model.0"); nvinfer1::IElementWiseLayer* conv1 = convBnSiLU(network, weightMap, *conv0->getOutput(0), get_width(128, gw, max_channels), 3, 2, 1, "model.1"); nvinfer1::IElementWiseLayer* conv2 = C2F(network, weightMap, *conv1->getOutput(0), get_width(128, gw, max_channels), get_width(128, gw, max_channels), get_depth(3, gd), true, 0.5, "model.2"); nvinfer1::IElementWiseLayer* conv3 = convBnSiLU(network, weightMap, *conv2->getOutput(0), get_width(256, gw, max_channels), 3, 2, 1, "model.3"); nvinfer1::IElementWiseLayer* conv4 = C2F(network, weightMap, *conv3->getOutput(0), get_width(256, gw, max_channels), get_width(256, gw, max_channels), get_depth(6, gd), true, 0.5, "model.4"); 
nvinfer1::IElementWiseLayer* conv5 = convBnSiLU(network, weightMap, *conv4->getOutput(0), get_width(512, gw, max_channels), 3, 2, 1, "model.5"); nvinfer1::IElementWiseLayer* conv6 = C2F(network, weightMap, *conv5->getOutput(0), get_width(512, gw, max_channels), get_width(512, gw, max_channels), get_depth(6, gd), true, 0.5, "model.6"); nvinfer1::IElementWiseLayer* conv7 = convBnSiLU(network, weightMap, *conv6->getOutput(0), get_width(1024, gw, max_channels), 3, 2, 1, "model.7"); nvinfer1::IElementWiseLayer* conv8 = C2F(network, weightMap, *conv7->getOutput(0), get_width(1024, gw, max_channels), get_width(1024, gw, max_channels), get_depth(3, gd), true, 0.5, "model.8"); nvinfer1::IElementWiseLayer* conv9 = SPPF(network, weightMap, *conv8->getOutput(0), get_width(1024, gw, max_channels), get_width(1024, gw, max_channels), 5, "model.9"); /******************************************************************************************************* ********************************************* YOLOV8 HEAD ********************************************* *******************************************************************************************************/ float scale[] = {1.0, 2.0, 2.0}; nvinfer1::IResizeLayer* upsample10 = network->addResize(*conv9->getOutput(0)); assert(upsample10); upsample10->setResizeMode(nvinfer1::ResizeMode::kNEAREST); upsample10->setScales(scale, 3); nvinfer1::ITensor* inputTensor11[] = {upsample10->getOutput(0), conv6->getOutput(0)}; nvinfer1::IConcatenationLayer* cat11 = network->addConcatenation(inputTensor11, 2); nvinfer1::IElementWiseLayer* conv12 = C2F(network, weightMap, *cat11->getOutput(0), get_width(512, gw, max_channels), get_width(512, gw, max_channels), get_depth(3, gd), false, 0.5, "model.12"); nvinfer1::IResizeLayer* upsample13 = network->addResize(*conv12->getOutput(0)); assert(upsample13); upsample13->setResizeMode(nvinfer1::ResizeMode::kNEAREST); upsample13->setScales(scale, 3); nvinfer1::ITensor* inputTensor14[] = 
{upsample13->getOutput(0), conv4->getOutput(0)}; nvinfer1::IConcatenationLayer* cat14 = network->addConcatenation(inputTensor14, 2); nvinfer1::IElementWiseLayer* conv15 = C2F(network, weightMap, *cat14->getOutput(0), get_width(256, gw, max_channels), get_width(256, gw, max_channels), get_depth(3, gd), false, 0.5, "model.15"); nvinfer1::IElementWiseLayer* conv16 = convBnSiLU(network, weightMap, *conv15->getOutput(0), get_width(256, gw, max_channels), 3, 2, 1, "model.16"); nvinfer1::ITensor* inputTensor17[] = {conv16->getOutput(0), conv12->getOutput(0)}; nvinfer1::IConcatenationLayer* cat17 = network->addConcatenation(inputTensor17, 2); nvinfer1::IElementWiseLayer* conv18 = C2F(network, weightMap, *cat17->getOutput(0), get_width(512, gw, max_channels), get_width(512, gw, max_channels), get_depth(3, gd), false, 0.5, "model.18"); nvinfer1::IElementWiseLayer* conv19 = convBnSiLU(network, weightMap, *conv18->getOutput(0), get_width(512, gw, max_channels), 3, 2, 1, "model.19"); nvinfer1::ITensor* inputTensor20[] = {conv19->getOutput(0), conv9->getOutput(0)}; nvinfer1::IConcatenationLayer* cat20 = network->addConcatenation(inputTensor20, 2); nvinfer1::IElementWiseLayer* conv21 = C2F(network, weightMap, *cat20->getOutput(0), get_width(1024, gw, max_channels), get_width(1024, gw, max_channels), get_depth(3, gd), false, 0.5, "model.21"); /******************************************************************************************************* ********************************************* YOLOV8 OUTPUT ******************************************* *******************************************************************************************************/ int base_in_channel = (gw == 1.25) ? 80 : 64; int base_out_channel = (gw == 0.25) ? 
std::max(64, std::min(kPoseNumClass, 100)) : get_width(256, gw, max_channels); // output0 nvinfer1::IElementWiseLayer* conv22_cv2_0_0 = convBnSiLU(network, weightMap, *conv15->getOutput(0), base_in_channel, 3, 1, 1, "model.22.cv2.0.0"); nvinfer1::IElementWiseLayer* conv22_cv2_0_1 = convBnSiLU(network, weightMap, *conv22_cv2_0_0->getOutput(0), base_in_channel, 3, 1, 1, "model.22.cv2.0.1"); nvinfer1::IConvolutionLayer* conv22_cv2_0_2 = network->addConvolutionNd(*conv22_cv2_0_1->getOutput(0), 64, nvinfer1::DimsHW{1, 1}, weightMap["model.22.cv2.0.2.weight"], weightMap["model.22.cv2.0.2.bias"]); conv22_cv2_0_2->setStrideNd(nvinfer1::DimsHW{1, 1}); conv22_cv2_0_2->setPaddingNd(nvinfer1::DimsHW{0, 0}); nvinfer1::IElementWiseLayer* conv22_cv3_0_0 = convBnSiLU(network, weightMap, *conv15->getOutput(0), base_out_channel, 3, 1, 1, "model.22.cv3.0.0"); nvinfer1::IElementWiseLayer* conv22_cv3_0_1 = convBnSiLU(network, weightMap, *conv22_cv3_0_0->getOutput(0), base_out_channel, 3, 1, 1, "model.22.cv3.0.1"); nvinfer1::IConvolutionLayer* conv22_cv3_0_2 = network->addConvolutionNd(*conv22_cv3_0_1->getOutput(0), kPoseNumClass, nvinfer1::DimsHW{1, 1}, weightMap["model.22.cv3.0.2.weight"], weightMap["model.22.cv3.0.2.bias"]); conv22_cv3_0_2->setStride(nvinfer1::DimsHW{1, 1}); conv22_cv3_0_2->setPadding(nvinfer1::DimsHW{0, 0}); nvinfer1::ITensor* inputTensor22_0[] = {conv22_cv2_0_2->getOutput(0), conv22_cv3_0_2->getOutput(0)}; nvinfer1::IConcatenationLayer* cat22_0 = network->addConcatenation(inputTensor22_0, 2); // output1 nvinfer1::IElementWiseLayer* conv22_cv2_1_0 = convBnSiLU(network, weightMap, *conv18->getOutput(0), base_in_channel, 3, 1, 1, "model.22.cv2.1.0"); nvinfer1::IElementWiseLayer* conv22_cv2_1_1 = convBnSiLU(network, weightMap, *conv22_cv2_1_0->getOutput(0), base_in_channel, 3, 1, 1, "model.22.cv2.1.1"); nvinfer1::IConvolutionLayer* conv22_cv2_1_2 = network->addConvolutionNd(*conv22_cv2_1_1->getOutput(0), 64, nvinfer1::DimsHW{1, 1}, weightMap["model.22.cv2.1.2.weight"], 
weightMap["model.22.cv2.1.2.bias"]); conv22_cv2_1_2->setStrideNd(nvinfer1::DimsHW{1, 1}); conv22_cv2_1_2->setPaddingNd(nvinfer1::DimsHW{0, 0}); nvinfer1::IElementWiseLayer* conv22_cv3_1_0 = convBnSiLU(network, weightMap, *conv18->getOutput(0), base_out_channel, 3, 1, 1, "model.22.cv3.1.0"); nvinfer1::IElementWiseLayer* conv22_cv3_1_1 = convBnSiLU(network, weightMap, *conv22_cv3_1_0->getOutput(0), base_out_channel, 3, 1, 1, "model.22.cv3.1.1"); nvinfer1::IConvolutionLayer* conv22_cv3_1_2 = network->addConvolutionNd(*conv22_cv3_1_1->getOutput(0), kPoseNumClass, nvinfer1::DimsHW{1, 1}, weightMap["model.22.cv3.1.2.weight"], weightMap["model.22.cv3.1.2.bias"]); conv22_cv3_1_2->setStrideNd(nvinfer1::DimsHW{1, 1}); conv22_cv3_1_2->setPaddingNd(nvinfer1::DimsHW{0, 0}); nvinfer1::ITensor* inputTensor22_1[] = {conv22_cv2_1_2->getOutput(0), conv22_cv3_1_2->getOutput(0)}; nvinfer1::IConcatenationLayer* cat22_1 = network->addConcatenation(inputTensor22_1, 2); // output2 nvinfer1::IElementWiseLayer* conv22_cv2_2_0 = convBnSiLU(network, weightMap, *conv21->getOutput(0), base_in_channel, 3, 1, 1, "model.22.cv2.2.0"); nvinfer1::IElementWiseLayer* conv22_cv2_2_1 = convBnSiLU(network, weightMap, *conv22_cv2_2_0->getOutput(0), base_in_channel, 3, 1, 1, "model.22.cv2.2.1"); nvinfer1::IConvolutionLayer* conv22_cv2_2_2 = network->addConvolution(*conv22_cv2_2_1->getOutput(0), 64, nvinfer1::DimsHW{1, 1}, weightMap["model.22.cv2.2.2.weight"], weightMap["model.22.cv2.2.2.bias"]); nvinfer1::IElementWiseLayer* conv22_cv3_2_0 = convBnSiLU(network, weightMap, *conv21->getOutput(0), base_out_channel, 3, 1, 1, "model.22.cv3.2.0"); nvinfer1::IElementWiseLayer* conv22_cv3_2_1 = convBnSiLU(network, weightMap, *conv22_cv3_2_0->getOutput(0), base_out_channel, 3, 1, 1, "model.22.cv3.2.1"); nvinfer1::IConvolutionLayer* conv22_cv3_2_2 = network->addConvolution(*conv22_cv3_2_1->getOutput(0), kPoseNumClass, nvinfer1::DimsHW{1, 1}, weightMap["model.22.cv3.2.2.weight"], weightMap["model.22.cv3.2.2.bias"]); 
nvinfer1::ITensor* inputTensor22_2[] = {conv22_cv2_2_2->getOutput(0), conv22_cv3_2_2->getOutput(0)}; nvinfer1::IConcatenationLayer* cat22_2 = network->addConcatenation(inputTensor22_2, 2); /******************************************************************************************************* ********************************************* YOLOV8 DETECT ******************************************* *******************************************************************************************************/ nvinfer1::IElementWiseLayer* conv_layers[] = {conv3, conv5, conv7}; int strides[sizeof(conv_layers) / sizeof(conv_layers[0])]; calculateStrides(conv_layers, sizeof(conv_layers) / sizeof(conv_layers[0]), kInputH, strides); int stridesLength = sizeof(strides) / sizeof(int); /**************************************************************************************P3****************************************************************************************************************************************/ nvinfer1::IShuffleLayer* shuffle22_0 = network->addShuffle(*cat22_0->getOutput(0)); shuffle22_0->setReshapeDimensions( nvinfer1::Dims2{64 + kPoseNumClass, (kInputH / strides[0]) * (kInputW / strides[0])}); nvinfer1::ISliceLayer* split22_0_0 = network->addSlice( *shuffle22_0->getOutput(0), nvinfer1::Dims2{0, 0}, nvinfer1::Dims2{64, (kInputH / strides[0]) * (kInputW / strides[0])}, nvinfer1::Dims2{1, 1}); nvinfer1::ISliceLayer* split22_0_1 = network->addSlice( *shuffle22_0->getOutput(0), nvinfer1::Dims2{64, 0}, nvinfer1::Dims2{kPoseNumClass, (kInputH / strides[0]) * (kInputW / strides[0])}, nvinfer1::Dims2{1, 1}); nvinfer1::IShuffleLayer* dfl22_0 = DFL(network, weightMap, *split22_0_0->getOutput(0), 4, (kInputH / strides[0]) * (kInputW / strides[0]), 1, 1, 0, "model.22.dfl.conv.weight"); // det0 auto shuffle_conv15 = cv4_conv_combined(network, weightMap, *conv15->getOutput(0), "model.22.cv4.0", (kInputH / strides[0]) * (kInputW / strides[0]), gw, "pose"); nvinfer1::ITensor* 
inputTensor22_dfl_0[] = {dfl22_0->getOutput(0), split22_0_1->getOutput(0), shuffle_conv15->getOutput(0)}; nvinfer1::IConcatenationLayer* cat22_dfl_0 = network->addConcatenation(inputTensor22_dfl_0, 3); /********************************************************************************************P4**********************************************************************************************************************************/ nvinfer1::IShuffleLayer* shuffle22_1 = network->addShuffle(*cat22_1->getOutput(0)); shuffle22_1->setReshapeDimensions( nvinfer1::Dims2{64 + kPoseNumClass, (kInputH / strides[1]) * (kInputW / strides[1])}); nvinfer1::ISliceLayer* split22_1_0 = network->addSlice( *shuffle22_1->getOutput(0), nvinfer1::Dims2{0, 0}, nvinfer1::Dims2{64, (kInputH / strides[1]) * (kInputW / strides[1])}, nvinfer1::Dims2{1, 1}); nvinfer1::ISliceLayer* split22_1_1 = network->addSlice( *shuffle22_1->getOutput(0), nvinfer1::Dims2{64, 0}, nvinfer1::Dims2{kPoseNumClass, (kInputH / strides[1]) * (kInputW / strides[1])}, nvinfer1::Dims2{1, 1}); nvinfer1::IShuffleLayer* dfl22_1 = DFL(network, weightMap, *split22_1_0->getOutput(0), 4, (kInputH / strides[1]) * (kInputW / strides[1]), 1, 1, 0, "model.22.dfl.conv.weight"); // det1 auto shuffle_conv18 = cv4_conv_combined(network, weightMap, *conv18->getOutput(0), "model.22.cv4.1", (kInputH / strides[1]) * (kInputW / strides[1]), gw, "pose"); nvinfer1::ITensor* inputTensor22_dfl_1[] = {dfl22_1->getOutput(0), split22_1_1->getOutput(0), shuffle_conv18->getOutput(0)}; nvinfer1::IConcatenationLayer* cat22_dfl_1 = network->addConcatenation(inputTensor22_dfl_1, 3); /********************************************************************************************P5**********************************************************************************************************************************/ nvinfer1::IShuffleLayer* shuffle22_2 = network->addShuffle(*cat22_2->getOutput(0)); shuffle22_2->setReshapeDimensions( nvinfer1::Dims2{64 + kPoseNumClass, 
(kInputH / strides[2]) * (kInputW / strides[2])}); nvinfer1::ISliceLayer* split22_2_0 = network->addSlice( *shuffle22_2->getOutput(0), nvinfer1::Dims2{0, 0}, nvinfer1::Dims2{64, (kInputH / strides[2]) * (kInputW / strides[2])}, nvinfer1::Dims2{1, 1}); nvinfer1::ISliceLayer* split22_2_1 = network->addSlice( *shuffle22_2->getOutput(0), nvinfer1::Dims2{64, 0}, nvinfer1::Dims2{kPoseNumClass, (kInputH / strides[2]) * (kInputW / strides[2])}, nvinfer1::Dims2{1, 1}); nvinfer1::IShuffleLayer* dfl22_2 = DFL(network, weightMap, *split22_2_0->getOutput(0), 4, (kInputH / strides[2]) * (kInputW / strides[2]), 1, 1, 0, "model.22.dfl.conv.weight"); // det2 auto shuffle_conv21 = cv4_conv_combined(network, weightMap, *conv21->getOutput(0), "model.22.cv4.2", (kInputH / strides[2]) * (kInputW / strides[2]), gw, "pose"); nvinfer1::ITensor* inputTensor22_dfl_2[] = {dfl22_2->getOutput(0), split22_2_1->getOutput(0), shuffle_conv21->getOutput(0)}; nvinfer1::IConcatenationLayer* cat22_dfl_2 = network->addConcatenation(inputTensor22_dfl_2, 3); nvinfer1::IPluginV2Layer* yolo = addYoLoLayer(network, std::vector{cat22_dfl_0, cat22_dfl_1, cat22_dfl_2}, strides, stridesLength, kPoseNumClass, false, true, false); yolo->getOutput(0)->setName(kOutputTensorName); network->markOutput(*yolo->getOutput(0)); builder->setMaxBatchSize(kBatchSize); config->setMaxWorkspaceSize(16 * (1 << 20)); #if defined(USE_FP16) config->setFlag(nvinfer1::BuilderFlag::kFP16); #elif defined(USE_INT8) std::cout << "Your platform support int8: " << (builder->platformHasFastInt8() ? "true" : "false") << std::endl; assert(builder->platformHasFastInt8()); config->setFlag(nvinfer1::BuilderFlag::kINT8); auto* calibrator = new Int8EntropyCalibrator2(1, kInputW, kInputH, kInputQuantizationFolder, "int8calib.table", kInputTensorName); config->setInt8Calibrator(calibrator); #endif std::cout << "Building engine, please wait for a while..." 
<< std::endl; nvinfer1::IHostMemory* serialized_model = builder->buildSerializedNetwork(*network, *config); std::cout << "Build engine successfully!" << std::endl; delete network; for (auto& mem : weightMap) { free((void*)(mem.second.values)); } return serialized_model; } nvinfer1::IHostMemory* buildEngineYolov8PoseP6(nvinfer1::IBuilder* builder, nvinfer1::IBuilderConfig* config, nvinfer1::DataType dt, const std::string& wts_path, float& gd, float& gw, int& max_channels) { std::map weightMap = loadWeights(wts_path); nvinfer1::INetworkDefinition* network = builder->createNetworkV2(0U); /******************************************************************************************************* ****************************************** YOLOV8 INPUT *********************************************** *******************************************************************************************************/ nvinfer1::ITensor* data = network->addInput(kInputTensorName, dt, nvinfer1::Dims3{3, kInputH, kInputW}); assert(data); /******************************************************************************************************* ***************************************** YOLOV8 BACKBONE ********************************************* *******************************************************************************************************/ nvinfer1::IElementWiseLayer* conv0 = convBnSiLU(network, weightMap, *data, get_width(64, gw, max_channels), 3, 2, 1, "model.0"); nvinfer1::IElementWiseLayer* conv1 = convBnSiLU(network, weightMap, *conv0->getOutput(0), get_width(128, gw, max_channels), 3, 2, 1, "model.1"); // 11233 nvinfer1::IElementWiseLayer* conv2 = C2F(network, weightMap, *conv1->getOutput(0), get_width(128, gw, max_channels), get_width(128, gw, max_channels), get_depth(3, gd), true, 0.5, "model.2"); nvinfer1::IElementWiseLayer* conv3 = convBnSiLU(network, weightMap, *conv2->getOutput(0), get_width(256, gw, max_channels), 3, 2, 1, "model.3"); // 22466 nvinfer1::IElementWiseLayer* 
conv4 = C2F(network, weightMap, *conv3->getOutput(0), get_width(256, gw, max_channels), get_width(256, gw, max_channels), get_depth(6, gd), true, 0.5, "model.4"); nvinfer1::IElementWiseLayer* conv5 = convBnSiLU(network, weightMap, *conv4->getOutput(0), get_width(512, gw, max_channels), 3, 2, 1, "model.5"); // 22466 nvinfer1::IElementWiseLayer* conv6 = C2F(network, weightMap, *conv5->getOutput(0), get_width(512, gw, max_channels), get_width(512, gw, max_channels), get_depth(6, gd), true, 0.5, "model.6"); nvinfer1::IElementWiseLayer* conv7 = convBnSiLU(network, weightMap, *conv6->getOutput(0), get_width(768, gw, max_channels), 3, 2, 1, "model.7"); nvinfer1::IElementWiseLayer* conv8 = C2F(network, weightMap, *conv7->getOutput(0), get_width(768, gw, max_channels), get_width(768, gw, max_channels), get_depth(3, gd), true, 0.5, "model.8"); nvinfer1::IElementWiseLayer* conv9 = convBnSiLU(network, weightMap, *conv8->getOutput(0), get_width(1024, gw, max_channels), 3, 2, 1, "model.9"); nvinfer1::IElementWiseLayer* conv10 = C2F(network, weightMap, *conv9->getOutput(0), get_width(1024, gw, max_channels), get_width(1024, gw, max_channels), get_depth(3, gd), true, 0.5, "model.10"); nvinfer1::IElementWiseLayer* conv11 = SPPF(network, weightMap, *conv10->getOutput(0), get_width(1024, gw, max_channels), get_width(1024, gw, max_channels), 5, "model.11"); /******************************************************************************************************* ********************************************* YOLOV8 HEAD ********************************************* *******************************************************************************************************/ // Head float scale[] = {1.0, 2.0, 2.0}; // scale used for upsampling // P5 nvinfer1::IResizeLayer* upsample12 = network->addResize(*conv11->getOutput(0)); upsample12->setResizeMode(nvinfer1::ResizeMode::kNEAREST); upsample12->setScales(scale, 3); nvinfer1::ITensor* concat13_inputs[] = {upsample12->getOutput(0), 
conv8->getOutput(0)}; nvinfer1::IConcatenationLayer* concat13 = network->addConcatenation(concat13_inputs, 2); nvinfer1::IElementWiseLayer* conv14 = C2(network, weightMap, *concat13->getOutput(0), get_width(768, gw, max_channels), get_width(768, gw, max_channels), get_depth(3, gd), false, 0.5, "model.14"); // P4 nvinfer1::IResizeLayer* upsample15 = network->addResize(*conv14->getOutput(0)); upsample15->setResizeMode(nvinfer1::ResizeMode::kNEAREST); upsample15->setScales(scale, 3); nvinfer1::ITensor* concat16_inputs[] = {upsample15->getOutput(0), conv6->getOutput(0)}; nvinfer1::IConcatenationLayer* concat16 = network->addConcatenation(concat16_inputs, 2); nvinfer1::IElementWiseLayer* conv17 = C2(network, weightMap, *concat16->getOutput(0), get_width(512, gw, max_channels), get_width(512, gw, max_channels), get_depth(3, gd), false, 0.5, "model.17"); // P3 nvinfer1::IResizeLayer* upsample18 = network->addResize(*conv17->getOutput(0)); upsample18->setResizeMode(nvinfer1::ResizeMode::kNEAREST); upsample18->setScales(scale, 3); nvinfer1::ITensor* concat19_inputs[] = {upsample18->getOutput(0), conv4->getOutput(0)}; nvinfer1::IConcatenationLayer* concat19 = network->addConcatenation(concat19_inputs, 2); nvinfer1::IElementWiseLayer* conv20 = C2(network, weightMap, *concat19->getOutput(0), get_width(256, gw, max_channels), get_width(256, gw, max_channels), get_depth(3, gd), false, 0.5, "model.20"); // Additional layers for P4, P5, P6 // P4/16-medium nvinfer1::IElementWiseLayer* conv21 = convBnSiLU(network, weightMap, *conv20->getOutput(0), get_width(256, gw, max_channels), 3, 2, 1, "model.21"); nvinfer1::ITensor* concat22_inputs[] = {conv21->getOutput(0), conv17->getOutput(0)}; nvinfer1::IConcatenationLayer* concat22 = network->addConcatenation(concat22_inputs, 2); nvinfer1::IElementWiseLayer* conv23 = C2(network, weightMap, *concat22->getOutput(0), get_width(512, gw, max_channels), get_width(512, gw, max_channels), get_depth(3, gd), false, 0.5, "model.23"); // P5/32-large 
nvinfer1::IElementWiseLayer* conv24 = convBnSiLU(network, weightMap, *conv23->getOutput(0), get_width(512, gw, max_channels), 3, 2, 1, "model.24"); nvinfer1::ITensor* concat25_inputs[] = {conv24->getOutput(0), conv14->getOutput(0)}; nvinfer1::IConcatenationLayer* concat25 = network->addConcatenation(concat25_inputs, 2); nvinfer1::IElementWiseLayer* conv26 = C2(network, weightMap, *concat25->getOutput(0), get_width(768, gw, max_channels), get_width(768, gw, max_channels), get_depth(3, gd), false, 0.5, "model.26"); // P6/64-xlarge nvinfer1::IElementWiseLayer* conv27 = convBnSiLU(network, weightMap, *conv26->getOutput(0), get_width(768, gw, max_channels), 3, 2, 1, "model.27"); nvinfer1::ITensor* concat28_inputs[] = {conv27->getOutput(0), conv11->getOutput(0)}; nvinfer1::IConcatenationLayer* concat28 = network->addConcatenation(concat28_inputs, 2); nvinfer1::IElementWiseLayer* conv29 = C2(network, weightMap, *concat28->getOutput(0), get_width(1024, gw, max_channels), get_width(1024, gw, max_channels), get_depth(3, gd), false, 0.5, "model.29"); /******************************************************************************************************* ********************************************* YOLOV8 OUTPUT ******************************************* *******************************************************************************************************/ int base_in_channel = (gw == 1.25) ? 80 : 64; int base_out_channel = (gw == 0.25) ? 
std::max(64, std::min(kPoseNumClass, 100)) : get_width(256, gw, max_channels); // output0 nvinfer1::IElementWiseLayer* conv30_cv2_0_0 = convBnSiLU(network, weightMap, *conv20->getOutput(0), base_in_channel, 3, 1, 1, "model.30.cv2.0.0"); nvinfer1::IElementWiseLayer* conv30_cv2_0_1 = convBnSiLU(network, weightMap, *conv30_cv2_0_0->getOutput(0), base_in_channel, 3, 1, 1, "model.30.cv2.0.1"); nvinfer1::IConvolutionLayer* conv30_cv2_0_2 = network->addConvolutionNd(*conv30_cv2_0_1->getOutput(0), 64, nvinfer1::DimsHW{1, 1}, weightMap["model.30.cv2.0.2.weight"], weightMap["model.30.cv2.0.2.bias"]); conv30_cv2_0_2->setStrideNd(nvinfer1::DimsHW{1, 1}); conv30_cv2_0_2->setPaddingNd(nvinfer1::DimsHW{0, 0}); nvinfer1::IElementWiseLayer* conv30_cv3_0_0 = convBnSiLU(network, weightMap, *conv20->getOutput(0), base_out_channel, 3, 1, 1, "model.30.cv3.0.0"); nvinfer1::IElementWiseLayer* conv30_cv3_0_1 = convBnSiLU(network, weightMap, *conv30_cv3_0_0->getOutput(0), base_out_channel, 3, 1, 1, "model.30.cv3.0.1"); nvinfer1::IConvolutionLayer* conv30_cv3_0_2 = network->addConvolutionNd(*conv30_cv3_0_1->getOutput(0), kPoseNumClass, nvinfer1::DimsHW{1, 1}, weightMap["model.30.cv3.0.2.weight"], weightMap["model.30.cv3.0.2.bias"]); conv30_cv3_0_2->setStride(nvinfer1::DimsHW{1, 1}); conv30_cv3_0_2->setPadding(nvinfer1::DimsHW{0, 0}); nvinfer1::ITensor* inputTensor30_0[] = {conv30_cv2_0_2->getOutput(0), conv30_cv3_0_2->getOutput(0)}; nvinfer1::IConcatenationLayer* cat30_0 = network->addConcatenation(inputTensor30_0, 2); // output1 nvinfer1::IElementWiseLayer* conv30_cv2_1_0 = convBnSiLU(network, weightMap, *conv23->getOutput(0), base_in_channel, 3, 1, 1, "model.30.cv2.1.0"); nvinfer1::IElementWiseLayer* conv30_cv2_1_1 = convBnSiLU(network, weightMap, *conv30_cv2_1_0->getOutput(0), base_in_channel, 3, 1, 1, "model.30.cv2.1.1"); nvinfer1::IConvolutionLayer* conv30_cv2_1_2 = network->addConvolutionNd(*conv30_cv2_1_1->getOutput(0), 64, nvinfer1::DimsHW{1, 1}, weightMap["model.30.cv2.1.2.weight"], 
weightMap["model.30.cv2.1.2.bias"]); conv30_cv2_1_2->setStrideNd(nvinfer1::DimsHW{1, 1}); conv30_cv2_1_2->setPaddingNd(nvinfer1::DimsHW{0, 0}); nvinfer1::IElementWiseLayer* conv30_cv3_1_0 = convBnSiLU(network, weightMap, *conv23->getOutput(0), base_out_channel, 3, 1, 1, "model.30.cv3.1.0"); nvinfer1::IElementWiseLayer* conv30_cv3_1_1 = convBnSiLU(network, weightMap, *conv30_cv3_1_0->getOutput(0), base_out_channel, 3, 1, 1, "model.30.cv3.1.1"); nvinfer1::IConvolutionLayer* conv30_cv3_1_2 = network->addConvolutionNd(*conv30_cv3_1_1->getOutput(0), kPoseNumClass, nvinfer1::DimsHW{1, 1}, weightMap["model.30.cv3.1.2.weight"], weightMap["model.30.cv3.1.2.bias"]); conv30_cv3_1_2->setStrideNd(nvinfer1::DimsHW{1, 1}); conv30_cv3_1_2->setPaddingNd(nvinfer1::DimsHW{0, 0}); nvinfer1::ITensor* inputTensor30_1[] = {conv30_cv2_1_2->getOutput(0), conv30_cv3_1_2->getOutput(0)}; nvinfer1::IConcatenationLayer* cat30_1 = network->addConcatenation(inputTensor30_1, 2); // output2 nvinfer1::IElementWiseLayer* conv30_cv2_2_0 = convBnSiLU(network, weightMap, *conv26->getOutput(0), base_in_channel, 3, 1, 1, "model.30.cv2.2.0"); nvinfer1::IElementWiseLayer* conv30_cv2_2_1 = convBnSiLU(network, weightMap, *conv30_cv2_2_0->getOutput(0), base_in_channel, 3, 1, 1, "model.30.cv2.2.1"); nvinfer1::IConvolutionLayer* conv30_cv2_2_2 = network->addConvolution(*conv30_cv2_2_1->getOutput(0), 64, nvinfer1::DimsHW{1, 1}, weightMap["model.30.cv2.2.2.weight"], weightMap["model.30.cv2.2.2.bias"]); conv30_cv2_2_2->setStrideNd(nvinfer1::DimsHW{1, 1}); conv30_cv2_2_2->setPaddingNd(nvinfer1::DimsHW{0, 0}); nvinfer1::IElementWiseLayer* conv30_cv3_2_0 = convBnSiLU(network, weightMap, *conv26->getOutput(0), base_out_channel, 3, 1, 1, "model.30.cv3.2.0"); nvinfer1::IElementWiseLayer* conv30_cv3_2_1 = convBnSiLU(network, weightMap, *conv30_cv3_2_0->getOutput(0), base_out_channel, 3, 1, 1, "model.30.cv3.2.1"); nvinfer1::IConvolutionLayer* conv30_cv3_2_2 = network->addConvolution(*conv30_cv3_2_1->getOutput(0), 
kPoseNumClass, nvinfer1::DimsHW{1, 1}, weightMap["model.30.cv3.2.2.weight"], weightMap["model.30.cv3.2.2.bias"]); conv30_cv3_2_2->setStrideNd(nvinfer1::DimsHW{1, 1}); conv30_cv3_2_2->setPaddingNd(nvinfer1::DimsHW{0, 0}); nvinfer1::ITensor* inputTensor30_2[] = {conv30_cv2_2_2->getOutput(0), conv30_cv3_2_2->getOutput(0)}; nvinfer1::IConcatenationLayer* cat30_2 = network->addConcatenation(inputTensor30_2, 2); // output3 nvinfer1::IElementWiseLayer* conv30_cv2_3_0 = convBnSiLU(network, weightMap, *conv29->getOutput(0), base_in_channel, 3, 1, 1, "model.30.cv2.3.0"); nvinfer1::IElementWiseLayer* conv30_cv2_3_1 = convBnSiLU(network, weightMap, *conv30_cv2_3_0->getOutput(0), base_in_channel, 3, 1, 1, "model.30.cv2.3.1"); nvinfer1::IConvolutionLayer* conv30_cv2_3_2 = network->addConvolution(*conv30_cv2_3_1->getOutput(0), 64, nvinfer1::DimsHW{1, 1}, weightMap["model.30.cv2.3.2.weight"], weightMap["model.30.cv2.3.2.bias"]); conv30_cv2_3_2->setStrideNd(nvinfer1::DimsHW{1, 1}); conv30_cv2_3_2->setPaddingNd(nvinfer1::DimsHW{0, 0}); nvinfer1::IElementWiseLayer* conv30_cv3_3_0 = convBnSiLU(network, weightMap, *conv29->getOutput(0), base_out_channel, 3, 1, 1, "model.30.cv3.3.0"); nvinfer1::IElementWiseLayer* conv30_cv3_3_1 = convBnSiLU(network, weightMap, *conv30_cv3_3_0->getOutput(0), base_out_channel, 3, 1, 1, "model.30.cv3.3.1"); nvinfer1::IConvolutionLayer* conv30_cv3_3_2 = network->addConvolution(*conv30_cv3_3_1->getOutput(0), kPoseNumClass, nvinfer1::DimsHW{1, 1}, weightMap["model.30.cv3.3.2.weight"], weightMap["model.30.cv3.3.2.bias"]); conv30_cv3_3_2->setStrideNd(nvinfer1::DimsHW{1, 1}); conv30_cv3_3_2->setPaddingNd(nvinfer1::DimsHW{0, 0}); nvinfer1::ITensor* inputTensor30_3[] = {conv30_cv2_3_2->getOutput(0), conv30_cv3_3_2->getOutput(0)}; nvinfer1::IConcatenationLayer* cat30_3 = network->addConcatenation(inputTensor30_3, 2); /******************************************************************************************************* ********************************************* 
YOLOV8 DETECT ******************************************* *******************************************************************************************************/ nvinfer1::IElementWiseLayer* conv_layers[] = {conv3, conv5, conv7, conv9}; int strides[sizeof(conv_layers) / sizeof(conv_layers[0])]; calculateStrides(conv_layers, sizeof(conv_layers) / sizeof(conv_layers[0]), kInputH, strides); int stridesLength = sizeof(strides) / sizeof(int); // P3 processing steps (remains unchanged) nvinfer1::IShuffleLayer* shuffle30_0 = network->addShuffle(*cat30_0->getOutput(0)); // Reusing the previous cat30_0 as P3 concatenation layer shuffle30_0->setReshapeDimensions( nvinfer1::Dims2{64 + kPoseNumClass, (kInputH / strides[0]) * (kInputW / strides[0])}); nvinfer1::ISliceLayer* split30_0_0 = network->addSlice( *shuffle30_0->getOutput(0), nvinfer1::Dims2{0, 0}, nvinfer1::Dims2{64, (kInputH / strides[0]) * (kInputW / strides[0])}, nvinfer1::Dims2{1, 1}); nvinfer1::ISliceLayer* split30_0_1 = network->addSlice( *shuffle30_0->getOutput(0), nvinfer1::Dims2{64, 0}, nvinfer1::Dims2{kPoseNumClass, (kInputH / strides[0]) * (kInputW / strides[0])}, nvinfer1::Dims2{1, 1}); nvinfer1::IShuffleLayer* dfl30_0 = DFL(network, weightMap, *split30_0_0->getOutput(0), 4, (kInputH / strides[0]) * (kInputW / strides[0]), 1, 1, 0, "model.30.dfl.conv.weight"); // det0 auto shuffle_conv20 = cv4_conv_combined(network, weightMap, *conv20->getOutput(0), "model.30.cv4.0", (kInputH / strides[0]) * (kInputW / strides[0]), gw, "pose"); nvinfer1::ITensor* inputTensor30_dfl_0[] = {dfl30_0->getOutput(0), split30_0_1->getOutput(0), shuffle_conv20->getOutput(0)}; nvinfer1::IConcatenationLayer* cat30_dfl_0 = network->addConcatenation(inputTensor30_dfl_0, 3); // P4 processing steps (remains unchanged) nvinfer1::IShuffleLayer* shuffle30_1 = network->addShuffle(*cat30_1->getOutput(0)); // Reusing the previous cat30_1 as P4 concatenation layer shuffle30_1->setReshapeDimensions( nvinfer1::Dims2{64 + kPoseNumClass, (kInputH / 
strides[1]) * (kInputW / strides[1])}); nvinfer1::ISliceLayer* split30_1_0 = network->addSlice( *shuffle30_1->getOutput(0), nvinfer1::Dims2{0, 0}, nvinfer1::Dims2{64, (kInputH / strides[1]) * (kInputW / strides[1])}, nvinfer1::Dims2{1, 1}); nvinfer1::ISliceLayer* split30_1_1 = network->addSlice( *shuffle30_1->getOutput(0), nvinfer1::Dims2{64, 0}, nvinfer1::Dims2{kPoseNumClass, (kInputH / strides[1]) * (kInputW / strides[1])}, nvinfer1::Dims2{1, 1}); nvinfer1::IShuffleLayer* dfl30_1 = DFL(network, weightMap, *split30_1_0->getOutput(0), 4, (kInputH / strides[1]) * (kInputW / strides[1]), 1, 1, 0, "model.30.dfl.conv.weight"); // det1 auto shuffle_conv23 = cv4_conv_combined(network, weightMap, *conv23->getOutput(0), "model.30.cv4.1", (kInputH / strides[1]) * (kInputW / strides[1]), gw, "pose"); nvinfer1::ITensor* inputTensor30_dfl_1[] = {dfl30_1->getOutput(0), split30_1_1->getOutput(0), shuffle_conv23->getOutput(0)}; nvinfer1::IConcatenationLayer* cat30_dfl_1 = network->addConcatenation(inputTensor30_dfl_1, 3); // P5 processing steps (remains unchanged) nvinfer1::IShuffleLayer* shuffle30_2 = network->addShuffle(*cat30_2->getOutput(0)); // Reusing the previous cat30_2 as P5 concatenation layer shuffle30_2->setReshapeDimensions( nvinfer1::Dims2{64 + kPoseNumClass, (kInputH / strides[2]) * (kInputW / strides[2])}); nvinfer1::ISliceLayer* split30_2_0 = network->addSlice( *shuffle30_2->getOutput(0), nvinfer1::Dims2{0, 0}, nvinfer1::Dims2{64, (kInputH / strides[2]) * (kInputW / strides[2])}, nvinfer1::Dims2{1, 1}); nvinfer1::ISliceLayer* split30_2_1 = network->addSlice( *shuffle30_2->getOutput(0), nvinfer1::Dims2{64, 0}, nvinfer1::Dims2{kPoseNumClass, (kInputH / strides[2]) * (kInputW / strides[2])}, nvinfer1::Dims2{1, 1}); nvinfer1::IShuffleLayer* dfl30_2 = DFL(network, weightMap, *split30_2_0->getOutput(0), 4, (kInputH / strides[2]) * (kInputW / strides[2]), 1, 1, 0, "model.30.dfl.conv.weight"); // det2 auto shuffle_conv26 = cv4_conv_combined(network, weightMap, 
*conv26->getOutput(0), "model.30.cv4.2", (kInputH / strides[2]) * (kInputW / strides[2]), gw, "pose"); nvinfer1::ITensor* inputTensor30_dfl_2[] = {dfl30_2->getOutput(0), split30_2_1->getOutput(0), shuffle_conv26->getOutput(0)}; nvinfer1::IConcatenationLayer* cat30_dfl_2 = network->addConcatenation(inputTensor30_dfl_2, 3); // P6 processing steps nvinfer1::IShuffleLayer* shuffle30_3 = network->addShuffle(*cat30_3->getOutput(0)); shuffle30_3->setReshapeDimensions( nvinfer1::Dims2{64 + kPoseNumClass, (kInputH / strides[3]) * (kInputW / strides[3])}); nvinfer1::ISliceLayer* split30_3_0 = network->addSlice( *shuffle30_3->getOutput(0), nvinfer1::Dims2{0, 0}, nvinfer1::Dims2{64, (kInputH / strides[3]) * (kInputW / strides[3])}, nvinfer1::Dims2{1, 1}); nvinfer1::ISliceLayer* split30_3_1 = network->addSlice( *shuffle30_3->getOutput(0), nvinfer1::Dims2{64, 0}, nvinfer1::Dims2{kPoseNumClass, (kInputH / strides[3]) * (kInputW / strides[3])}, nvinfer1::Dims2{1, 1}); nvinfer1::IShuffleLayer* dfl30_3 = DFL(network, weightMap, *split30_3_0->getOutput(0), 4, (kInputH / strides[3]) * (kInputW / strides[3]), 1, 1, 0, "model.30.dfl.conv.weight"); // det3 auto shuffle_conv29 = cv4_conv_combined(network, weightMap, *conv29->getOutput(0), "model.30.cv4.3", (kInputH / strides[3]) * (kInputW / strides[3]), gw, "pose"); nvinfer1::ITensor* inputTensor30_dfl_3[] = {dfl30_3->getOutput(0), split30_3_1->getOutput(0), shuffle_conv29->getOutput(0)}; nvinfer1::IConcatenationLayer* cat30_dfl_3 = network->addConcatenation(inputTensor30_dfl_3, 3); nvinfer1::IPluginV2Layer* yolo = addYoLoLayer( network, std::vector{cat30_dfl_0, cat30_dfl_1, cat30_dfl_2, cat30_dfl_3}, strides, stridesLength, kPoseNumClass, false, true, false); yolo->getOutput(0)->setName(kOutputTensorName); network->markOutput(*yolo->getOutput(0)); builder->setMaxBatchSize(kBatchSize); config->setMaxWorkspaceSize(16 * (1 << 20)); #if defined(USE_FP16) config->setFlag(nvinfer1::BuilderFlag::kFP16); #elif defined(USE_INT8) std::cout << 
"Your platform support int8: " << (builder->platformHasFastInt8() ? "true" : "false") << std::endl; assert(builder->platformHasFastInt8()); config->setFlag(nvinfer1::BuilderFlag::kINT8); auto* calibrator = new Int8EntropyCalibrator2(1, kInputW, kInputH, kInputQuantizationFolder, "int8calib.table", kInputTensorName); config->setInt8Calibrator(calibrator); #endif std::cout << "Building engine, please wait for a while..." << std::endl; nvinfer1::IHostMemory* serialized_model = builder->buildSerializedNetwork(*network, *config); std::cout << "Build engine successfully!" << std::endl; delete network; for (auto& mem : weightMap) { free((void*)(mem.second.values)); } return serialized_model; } nvinfer1::IHostMemory* buildEngineYolov8_5uDet(nvinfer1::IBuilder* builder, nvinfer1::IBuilderConfig* config, nvinfer1::DataType dt, const std::string& wts_path, float& gd, float& gw, int& max_channels) { std::map weightMap = loadWeights(wts_path); nvinfer1::INetworkDefinition* network = builder->createNetworkV2(0U); /******************************************************************************************************* ****************************************** YOLOV5U INPUT *********************************************** *******************************************************************************************************/ nvinfer1::ITensor* data = network->addInput(kInputTensorName, dt, nvinfer1::Dims3{3, kInputH, kInputW}); assert(data); /******************************************************************************************************* ***************************************** YOLOV5U BACKBONE ********************************************* *******************************************************************************************************/ nvinfer1::IElementWiseLayer* conv0 = convBnSiLU(network, weightMap, *data, get_width_5u(64, gw), 6, 2, calculateP(6), "model.0"); nvinfer1::IElementWiseLayer* conv1 = convBnSiLU(network, weightMap, *conv0->getOutput(0), 
get_width_5u(128, gw), 3, 2, calculateP(3), "model.1"); // 11233 nvinfer1::IElementWiseLayer* conv2 = C3(network, weightMap, *conv1->getOutput(0), get_width_5u(128, gw), get_width_5u(128, gw), get_depth(3, gd), true, 0.5, "model.2"); nvinfer1::IElementWiseLayer* conv3 = convBnSiLU(network, weightMap, *conv2->getOutput(0), get_width_5u(256, gw), 3, 2, calculateP(3), "model.3"); // 22466 nvinfer1::IElementWiseLayer* conv4 = C3(network, weightMap, *conv3->getOutput(0), get_width_5u(256, gw), get_width_5u(256, gw), get_depth(6, gd), true, 0.5, "model.4"); nvinfer1::IElementWiseLayer* conv5 = convBnSiLU(network, weightMap, *conv4->getOutput(0), get_width_5u(512, gw), 3, 2, calculateP(3), "model.5"); // 22466 nvinfer1::IElementWiseLayer* conv6 = C3(network, weightMap, *conv5->getOutput(0), get_width_5u(512, gw), get_width_5u(512, gw), get_depth(6, gd), true, 0.5, "model.6"); nvinfer1::IElementWiseLayer* conv7 = convBnSiLU(network, weightMap, *conv6->getOutput(0), get_width_5u(1024, gw), 3, 2, calculateP(3), "model.7"); // 11233 nvinfer1::IElementWiseLayer* conv8 = C3(network, weightMap, *conv7->getOutput(0), get_width_5u(1024, gw), get_width_5u(1024, gw), get_depth(3, gd), true, 0.5, "model.8"); nvinfer1::IElementWiseLayer* conv9 = SPPF(network, weightMap, *conv8->getOutput(0), get_width_5u(1024, gw), get_width_5u(1024, gw), 5, "model.9"); /******************************************************************************************************* ********************************************* YOLOV5U HEAD ********************************************* *******************************************************************************************************/ // auto conv10 = convBlock(network, weightMap, *spp9->getOutput(0), // get_width_5u(512, gw), 1, 1, 1, "model.10"); //********************************************* cat backbone P4 //******************************************** nvinfer1::IElementWiseLayer* conv10 = convBnSiLU(network, weightMap, *conv9->getOutput(0), 
get_width_5u(512, gw), 1, 1, calculateP(1), "model.10"); nvinfer1::IResizeLayer* upsample11 = network->addResize(*conv10->getOutput(0)); assert(upsample11); upsample11->setResizeMode(nvinfer1::ResizeMode::kNEAREST); upsample11->setOutputDimensions(conv6->getOutput(0)->getDimensions()); nvinfer1::ITensor* inputTensor12[] = {upsample11->getOutput(0), conv6->getOutput(0)}; nvinfer1::IConcatenationLayer* cat12 = network->addConcatenation(inputTensor12, 2); nvinfer1::IElementWiseLayer* conv13 = C3(network, weightMap, *cat12->getOutput(0), get_width_5u(512, gw), get_width_5u(512, gw), get_depth(3, gd), false, 0.5, "model.13"); //********************************************* cat backbone P4 //******************************************** //********************************************* cat backbone P3 //******************************************** nvinfer1::IElementWiseLayer* conv14 = convBnSiLU(network, weightMap, *conv13->getOutput(0), get_width_5u(256, gw), 1, 1, calculateP(1), "model.14"); nvinfer1::IResizeLayer* upsample15 = network->addResize(*conv14->getOutput(0)); assert(upsample15); upsample15->setResizeMode(nvinfer1::ResizeMode::kNEAREST); upsample15->setOutputDimensions(conv4->getOutput(0)->getDimensions()); nvinfer1::ITensor* inputTensor16[] = {upsample15->getOutput(0), conv4->getOutput(0)}; nvinfer1::IConcatenationLayer* cat16 = network->addConcatenation(inputTensor16, 2); nvinfer1::IElementWiseLayer* conv17 = C3(network, weightMap, *cat16->getOutput(0), get_width_5u(256, gw), get_width_5u(256, gw), get_depth(3, gd), false, 0.5, "model.17"); //********************************************* cat backbone P3 //******************************************** //********************************************* cat head P4 //******************************************** nvinfer1::IElementWiseLayer* conv18 = convBnSiLU(network, weightMap, *conv17->getOutput(0), get_width_5u(256, gw), 3, 2, calculateP(3), "model.18"); nvinfer1::ITensor* inputTensor19[] = {conv18->getOutput(0), 
conv14->getOutput(0)}; nvinfer1::IConcatenationLayer* cat19 = network->addConcatenation(inputTensor19, 2); nvinfer1::IElementWiseLayer* conv20 = C3(network, weightMap, *cat19->getOutput(0), get_width_5u(512, gw), get_width_5u(512, gw), get_depth(3, gd), false, 0.5, "model.20"); //********************************************* cat head P4 //******************************************** //********************************************* cat head P3 //******************************************** nvinfer1::IElementWiseLayer* conv21 = convBnSiLU(network, weightMap, *conv20->getOutput(0), get_width_5u(512, gw), 3, 2, calculateP(3), "model.21"); nvinfer1::ITensor* inputTensor22[] = {conv21->getOutput(0), conv10->getOutput(0)}; nvinfer1::IConcatenationLayer* cat22 = network->addConcatenation(inputTensor22, 2); nvinfer1::IElementWiseLayer* conv23 = C3(network, weightMap, *cat22->getOutput(0), get_width_5u(1024, gw), get_width_5u(1024, gw), get_depth(3, gd), false, 0.5, "model.23"); //********************************************* cat head P3 //******************************************** /******************************************************************************************************* ********************************************* YOLOV5U OUTPUT ******************************************* *******************************************************************************************************/ int base_in_channel = (gw == 1.25) ? 80 : 64; int base_out_channel = (gw == 0.25) ? 
std::max(64, std::min(kNumClass, 100)) : get_width_5u(256, gw); // output0 nvinfer1::IElementWiseLayer* conv24_cv2_0_0 = convBnSiLU(network, weightMap, *conv17->getOutput(0), base_in_channel, 3, 1, calculateP(3), "model.24.cv2.0.0"); nvinfer1::IElementWiseLayer* conv24_cv2_0_1 = convBnSiLU(network, weightMap, *conv24_cv2_0_0->getOutput(0), base_in_channel, 3, 1, calculateP(3), "model.24.cv2.0.1"); nvinfer1::IConvolutionLayer* conv24_cv2_0_2 = network->addConvolutionNd(*conv24_cv2_0_1->getOutput(0), 64, nvinfer1::DimsHW{1, 1}, weightMap["model.24.cv2.0.2.weight"], weightMap["model.24.cv2.0.2.bias"]); conv24_cv2_0_2->setStrideNd(nvinfer1::DimsHW{1, 1}); conv24_cv2_0_2->setPaddingNd(nvinfer1::DimsHW{0, 0}); nvinfer1::IElementWiseLayer* conv24_cv3_0_0 = convBnSiLU(network, weightMap, *conv17->getOutput(0), base_out_channel, 3, 1, calculateP(3), "model.24.cv3.0.0"); nvinfer1::IElementWiseLayer* conv24_cv3_0_1 = convBnSiLU(network, weightMap, *conv24_cv3_0_0->getOutput(0), base_out_channel, 3, 1, calculateP(3), "model.24.cv3.0.1"); nvinfer1::IConvolutionLayer* conv24_cv3_0_2 = network->addConvolutionNd(*conv24_cv3_0_1->getOutput(0), kNumClass, nvinfer1::DimsHW{1, 1}, weightMap["model.24.cv3.0.2.weight"], weightMap["model.24.cv3.0.2.bias"]); conv24_cv3_0_2->setStride(nvinfer1::DimsHW{1, 1}); conv24_cv3_0_2->setPadding(nvinfer1::DimsHW{0, 0}); nvinfer1::ITensor* inputTensor24_0[] = {conv24_cv2_0_2->getOutput(0), conv24_cv3_0_2->getOutput(0)}; nvinfer1::IConcatenationLayer* cat24_0 = network->addConcatenation(inputTensor24_0, 2); // output1 nvinfer1::IElementWiseLayer* conv24_cv2_1_0 = convBnSiLU(network, weightMap, *conv20->getOutput(0), base_in_channel, 3, 1, calculateP(3), "model.24.cv2.1.0"); nvinfer1::IElementWiseLayer* conv24_cv2_1_1 = convBnSiLU(network, weightMap, *conv24_cv2_1_0->getOutput(0), base_in_channel, 3, 1, calculateP(3), "model.24.cv2.1.1"); nvinfer1::IConvolutionLayer* conv24_cv2_1_2 = network->addConvolutionNd(*conv24_cv2_1_1->getOutput(0), 64, 
nvinfer1::DimsHW{1, 1}, weightMap["model.24.cv2.1.2.weight"], weightMap["model.24.cv2.1.2.bias"]); conv24_cv2_1_2->setStrideNd(nvinfer1::DimsHW{1, 1}); conv24_cv2_1_2->setPaddingNd(nvinfer1::DimsHW{0, 0}); nvinfer1::IElementWiseLayer* conv24_cv3_1_0 = convBnSiLU(network, weightMap, *conv20->getOutput(0), base_out_channel, 3, 1, calculateP(3), "model.24.cv3.1.0"); nvinfer1::IElementWiseLayer* conv24_cv3_1_1 = convBnSiLU(network, weightMap, *conv24_cv3_1_0->getOutput(0), base_out_channel, 3, 1, calculateP(3), "model.24.cv3.1.1"); nvinfer1::IConvolutionLayer* conv24_cv3_1_2 = network->addConvolutionNd(*conv24_cv3_1_1->getOutput(0), kNumClass, nvinfer1::DimsHW{1, 1}, weightMap["model.24.cv3.1.2.weight"], weightMap["model.24.cv3.1.2.bias"]); conv24_cv3_1_2->setStride(nvinfer1::DimsHW{1, 1}); conv24_cv3_1_2->setPadding(nvinfer1::DimsHW{0, 0}); nvinfer1::ITensor* inputTensor24_1[] = {conv24_cv2_1_2->getOutput(0), conv24_cv3_1_2->getOutput(0)}; nvinfer1::IConcatenationLayer* cat24_1 = network->addConcatenation(inputTensor24_1, 2); // output2 nvinfer1::IElementWiseLayer* conv24_cv2_2_0 = convBnSiLU(network, weightMap, *conv23->getOutput(0), base_in_channel, 3, 1, calculateP(3), "model.24.cv2.2.0"); nvinfer1::IElementWiseLayer* conv24_cv2_2_1 = convBnSiLU(network, weightMap, *conv24_cv2_2_0->getOutput(0), base_in_channel, 3, 1, calculateP(3), "model.24.cv2.2.1"); nvinfer1::IConvolutionLayer* conv24_cv2_2_2 = network->addConvolutionNd(*conv24_cv2_2_1->getOutput(0), 64, nvinfer1::DimsHW{1, 1}, weightMap["model.24.cv2.2.2.weight"], weightMap["model.24.cv2.2.2.bias"]); conv24_cv2_2_2->setStrideNd(nvinfer1::DimsHW{1, 1}); conv24_cv2_2_2->setPaddingNd(nvinfer1::DimsHW{0, 0}); nvinfer1::IElementWiseLayer* conv24_cv3_2_0 = convBnSiLU(network, weightMap, *conv23->getOutput(0), base_out_channel, 3, 1, calculateP(3), "model.24.cv3.2.0"); nvinfer1::IElementWiseLayer* conv24_cv3_2_1 = convBnSiLU(network, weightMap, *conv24_cv3_2_0->getOutput(0), base_out_channel, 3, 1, calculateP(3), 
"model.24.cv3.2.1"); nvinfer1::IConvolutionLayer* conv24_cv3_2_2 = network->addConvolutionNd(*conv24_cv3_2_1->getOutput(0), kNumClass, nvinfer1::DimsHW{1, 1}, weightMap["model.24.cv3.2.2.weight"], weightMap["model.24.cv3.2.2.bias"]); conv24_cv3_2_2->setStride(nvinfer1::DimsHW{1, 1}); conv24_cv3_2_2->setPadding(nvinfer1::DimsHW{0, 0}); nvinfer1::ITensor* inputTensor24_2[] = {conv24_cv2_2_2->getOutput(0), conv24_cv3_2_2->getOutput(0)}; nvinfer1::IConcatenationLayer* cat24_2 = network->addConcatenation(inputTensor24_2, 2); /******************************************************************************************************* ********************************************* YOLOV5U DETECT ******************************************* *******************************************************************************************************/ nvinfer1::IElementWiseLayer* conv_layers[] = {conv3, conv5, conv7}; int strides[sizeof(conv_layers) / sizeof(conv_layers[0])]; calculateStrides(conv_layers, sizeof(conv_layers) / sizeof(conv_layers[0]), kInputH, strides); int stridesLength = sizeof(strides) / sizeof(int); // det0 nvinfer1::IShuffleLayer* shuffle24_0 = network->addShuffle(*cat24_0->getOutput(0)); shuffle24_0->setReshapeDimensions(nvinfer1::Dims2{64 + kNumClass, (kInputH / strides[0]) * (kInputW / strides[0])}); nvinfer1::ISliceLayer* split24_0_0 = network->addSlice( *shuffle24_0->getOutput(0), nvinfer1::Dims2{0, 0}, nvinfer1::Dims2{64, (kInputH / strides[0]) * (kInputW / strides[0])}, nvinfer1::Dims2{1, 1}); nvinfer1::ISliceLayer* split24_0_1 = network->addSlice( *shuffle24_0->getOutput(0), nvinfer1::Dims2{64, 0}, nvinfer1::Dims2{kNumClass, (kInputH / strides[0]) * (kInputW / strides[0])}, nvinfer1::Dims2{1, 1}); nvinfer1::IShuffleLayer* dfl24_0 = DFL(network, weightMap, *split24_0_0->getOutput(0), 4, (kInputH / strides[0]) * (kInputW / strides[0]), 1, 1, 0, "model.24.dfl.conv.weight"); nvinfer1::ITensor* inputTensor24_dfl_0[] = {dfl24_0->getOutput(0), 
split24_0_1->getOutput(0)}; nvinfer1::IConcatenationLayer* cat24_dfl_0 = network->addConcatenation(inputTensor24_dfl_0, 2); // det1 nvinfer1::IShuffleLayer* shuffle24_1 = network->addShuffle(*cat24_1->getOutput(0)); shuffle24_1->setReshapeDimensions(nvinfer1::Dims2{64 + kNumClass, (kInputH / strides[1]) * (kInputW / strides[1])}); nvinfer1::ISliceLayer* split24_1_0 = network->addSlice( *shuffle24_1->getOutput(0), nvinfer1::Dims2{0, 0}, nvinfer1::Dims2{64, (kInputH / strides[1]) * (kInputW / strides[1])}, nvinfer1::Dims2{1, 1}); nvinfer1::ISliceLayer* split24_1_1 = network->addSlice( *shuffle24_1->getOutput(0), nvinfer1::Dims2{64, 0}, nvinfer1::Dims2{kNumClass, (kInputH / strides[1]) * (kInputW / strides[1])}, nvinfer1::Dims2{1, 1}); nvinfer1::IShuffleLayer* dfl24_1 = DFL(network, weightMap, *split24_1_0->getOutput(0), 4, (kInputH / strides[1]) * (kInputW / strides[1]), 1, 1, 0, "model.24.dfl.conv.weight"); nvinfer1::ITensor* inputTensor24_dfl_1[] = {dfl24_1->getOutput(0), split24_1_1->getOutput(0)}; nvinfer1::IConcatenationLayer* cat24_dfl_1 = network->addConcatenation(inputTensor24_dfl_1, 2); // det2 nvinfer1::IShuffleLayer* shuffle24_2 = network->addShuffle(*cat24_2->getOutput(0)); shuffle24_2->setReshapeDimensions(nvinfer1::Dims2{64 + kNumClass, (kInputH / strides[2]) * (kInputW / strides[2])}); nvinfer1::ISliceLayer* split24_2_0 = network->addSlice( *shuffle24_2->getOutput(0), nvinfer1::Dims2{0, 0}, nvinfer1::Dims2{64, (kInputH / strides[2]) * (kInputW / strides[2])}, nvinfer1::Dims2{1, 1}); nvinfer1::ISliceLayer* split24_2_1 = network->addSlice( *shuffle24_2->getOutput(0), nvinfer1::Dims2{64, 0}, nvinfer1::Dims2{kNumClass, (kInputH / strides[2]) * (kInputW / strides[2])}, nvinfer1::Dims2{1, 1}); nvinfer1::IShuffleLayer* dfl24_2 = DFL(network, weightMap, *split24_2_0->getOutput(0), 4, (kInputH / strides[2]) * (kInputW / strides[2]), 1, 1, 0, "model.24.dfl.conv.weight"); nvinfer1::ITensor* inputTensor24_dfl_2[] = {dfl24_2->getOutput(0), 
split24_2_1->getOutput(0)}; nvinfer1::IConcatenationLayer* cat24_dfl_2 = network->addConcatenation(inputTensor24_dfl_2, 2); nvinfer1::IPluginV2Layer* yolo = addYoLoLayer(network, std::vector{cat24_dfl_0, cat24_dfl_1, cat24_dfl_2}, strides, stridesLength, kNumClass, false, false, false); yolo->getOutput(0)->setName(kOutputTensorName); network->markOutput(*yolo->getOutput(0)); builder->setMaxBatchSize(kBatchSize); config->setMaxWorkspaceSize(16 * (1 << 20)); #if defined(USE_FP16) config->setFlag(nvinfer1::BuilderFlag::kFP16); #elif defined(USE_INT8) std::cout << "Your platform support int8: " << (builder->platformHasFastInt8() ? "true" : "false") << std::endl; assert(builder->platformHasFastInt8()); config->setFlag(nvinfer1::BuilderFlag::kINT8); auto* calibrator = new Int8EntropyCalibrator2(1, kInputW, kInputH, kInputQuantizationFolder, "int8calib.table", kInputTensorName); config->setInt8Calibrator(calibrator); #endif std::cout << "Building engine, please wait for a while..." << std::endl; nvinfer1::IHostMemory* serialized_model = builder->buildSerializedNetwork(*network, *config); std::cout << "Build engine successfully!" 
<< std::endl; delete network; for (auto& mem : weightMap) { free((void*)(mem.second.values)); } return serialized_model; } nvinfer1::IHostMemory* buildEngineYolov8_5uDetP6(nvinfer1::IBuilder* builder, nvinfer1::IBuilderConfig* config, nvinfer1::DataType dt, const std::string& wts_path, float& gd, float& gw, int& max_channels) { std::map weightMap = loadWeights(wts_path); nvinfer1::INetworkDefinition* network = builder->createNetworkV2(0U); /******************************************************************************************************* ****************************************** YOLOV5U-P6 INPUT *********************************************** *******************************************************************************************************/ nvinfer1::ITensor* data = network->addInput(kInputTensorName, dt, nvinfer1::Dims3{3, kInputH, kInputW}); assert(data); /******************************************************************************************************* ***************************************** YOLOV5U-P6 BACKBONE ********************************************* *******************************************************************************************************/ nvinfer1::IElementWiseLayer* conv0 = convBnSiLU(network, weightMap, *data, get_width_5u(64, gw), 6, 2, calculateP(6), "model.0"); nvinfer1::IElementWiseLayer* conv1 = convBnSiLU(network, weightMap, *conv0->getOutput(0), get_width_5u(128, gw), 3, 2, calculateP(3), "model.1"); // 11233 nvinfer1::IElementWiseLayer* conv2 = C3(network, weightMap, *conv1->getOutput(0), get_width_5u(128, gw), get_width_5u(128, gw), get_depth(3, gd), true, 0.5, "model.2"); nvinfer1::IElementWiseLayer* conv3 = convBnSiLU(network, weightMap, *conv2->getOutput(0), get_width_5u(256, gw), 3, 2, calculateP(3), "model.3"); // 22466 nvinfer1::IElementWiseLayer* conv4 = C3(network, weightMap, *conv3->getOutput(0), get_width_5u(256, gw), get_width_5u(256, gw), get_depth(6, gd), true, 0.5, "model.4"); 
nvinfer1::IElementWiseLayer* conv5 = convBnSiLU(network, weightMap, *conv4->getOutput(0), get_width_5u(512, gw), 3, 2, calculateP(3), "model.5"); // 22466 nvinfer1::IElementWiseLayer* conv6 = C3(network, weightMap, *conv5->getOutput(0), get_width_5u(512, gw), get_width_5u(512, gw), get_depth(6, gd), true, 0.5, "model.6"); nvinfer1::IElementWiseLayer* conv7 = convBnSiLU(network, weightMap, *conv6->getOutput(0), get_width_5u(768, gw), 3, 2, calculateP(3), "model.7"); // 11233 nvinfer1::IElementWiseLayer* conv8 = C3(network, weightMap, *conv7->getOutput(0), get_width_5u(768, gw), get_width_5u(768, gw), get_depth(3, gd), true, 0.5, "model.8"); nvinfer1::IElementWiseLayer* conv9 = convBnSiLU(network, weightMap, *conv8->getOutput(0), get_width_5u(1024, gw), 3, 2, calculateP(3), "model.9"); // 11233 nvinfer1::IElementWiseLayer* conv10 = C3(network, weightMap, *conv9->getOutput(0), get_width_5u(1024, gw), get_width_5u(1024, gw), get_depth(3, gd), true, 0.5, "model.10"); nvinfer1::IElementWiseLayer* conv11 = SPPF(network, weightMap, *conv10->getOutput(0), get_width_5u(1024, gw), get_width_5u(1024, gw), 5, "model.11"); /******************************************************************************************************* ********************************************* YOLOV5U-P6 HEAD ********************************************* *******************************************************************************************************/ //********************************************* cat backbone P5 //******************************************** nvinfer1::IElementWiseLayer* conv12 = convBnSiLU(network, weightMap, *conv11->getOutput(0), get_width_5u(768, gw), 1, 1, calculateP(1), "model.12"); nvinfer1::IResizeLayer* upsample13 = network->addResize(*conv12->getOutput(0)); assert(upsample13); upsample13->setResizeMode(nvinfer1::ResizeMode::kNEAREST); upsample13->setOutputDimensions(conv8->getOutput(0)->getDimensions()); nvinfer1::ITensor* inputTensor14[] = {upsample13->getOutput(0), 
conv8->getOutput(0)}; nvinfer1::IConcatenationLayer* cat14 = network->addConcatenation(inputTensor14, 2); nvinfer1::IElementWiseLayer* conv15 = C3(network, weightMap, *cat14->getOutput(0), get_width_5u(768, gw), get_width_5u(768, gw), get_depth(3, gd), false, 0.5, "model.15"); //********************************************* cat backbone P5 //******************************************** //********************************************* cat backbone P4 //******************************************** nvinfer1::IElementWiseLayer* conv16 = convBnSiLU(network, weightMap, *conv15->getOutput(0), get_width_5u(512, gw), 1, 1, calculateP(1), "model.16"); nvinfer1::IResizeLayer* upsample17 = network->addResize(*conv16->getOutput(0)); assert(upsample17); upsample17->setResizeMode(nvinfer1::ResizeMode::kNEAREST); upsample17->setOutputDimensions(conv6->getOutput(0)->getDimensions()); nvinfer1::ITensor* inputTensor18[] = {upsample17->getOutput(0), conv6->getOutput(0)}; nvinfer1::IConcatenationLayer* cat18 = network->addConcatenation(inputTensor18, 2); nvinfer1::IElementWiseLayer* conv19 = C3(network, weightMap, *cat18->getOutput(0), get_width_5u(512, gw), get_width_5u(512, gw), get_depth(3, gd), false, 0.5, "model.19"); //********************************************* cat backbone P4 //******************************************** //********************************************* cat backbone P3 //******************************************** nvinfer1::IElementWiseLayer* conv20 = convBnSiLU(network, weightMap, *conv19->getOutput(0), get_width_5u(256, gw), 1, 1, calculateP(1), "model.20"); nvinfer1::IResizeLayer* upsample21 = network->addResize(*conv20->getOutput(0)); assert(upsample21); upsample21->setResizeMode(nvinfer1::ResizeMode::kNEAREST); upsample21->setOutputDimensions(conv4->getOutput(0)->getDimensions()); nvinfer1::ITensor* inputTensor22[] = {upsample21->getOutput(0), conv4->getOutput(0)}; nvinfer1::IConcatenationLayer* cat22 = network->addConcatenation(inputTensor22, 2); 
nvinfer1::IElementWiseLayer* conv23 = C3(network, weightMap, *cat22->getOutput(0), get_width_5u(256, gw), get_width_5u(256, gw), get_depth(3, gd), false, 0.5, "model.23"); //********************************************* cat backbone P3 //******************************************** //********************************************* cat head P4 //******************************************** nvinfer1::IElementWiseLayer* conv24 = convBnSiLU(network, weightMap, *conv23->getOutput(0), get_width_5u(256, gw), 3, 2, calculateP(3), "model.24"); nvinfer1::ITensor* inputTensor25[] = {conv24->getOutput(0), conv20->getOutput(0)}; nvinfer1::IConcatenationLayer* cat25 = network->addConcatenation(inputTensor25, 2); nvinfer1::IElementWiseLayer* conv26 = C3(network, weightMap, *cat25->getOutput(0), get_width_5u(512, gw), get_width_5u(512, gw), get_depth(3, gd), false, 0.5, "model.26"); //********************************************* cat head P4 //******************************************** //********************************************* cat head P5 //******************************************** nvinfer1::IElementWiseLayer* conv27 = convBnSiLU(network, weightMap, *conv26->getOutput(0), get_width_5u(512, gw), 3, 2, calculateP(3), "model.27"); nvinfer1::ITensor* inputTensor28[] = {conv27->getOutput(0), conv16->getOutput(0)}; nvinfer1::IConcatenationLayer* cat28 = network->addConcatenation(inputTensor28, 2); nvinfer1::IElementWiseLayer* conv29 = C3(network, weightMap, *cat28->getOutput(0), get_width_5u(768, gw), get_width_5u(768, gw), get_depth(3, gd), false, 0.5, "model.29"); //********************************************* cat head P5 //******************************************** //********************************************* cat head P6 //******************************************** nvinfer1::IElementWiseLayer* conv30 = convBnSiLU(network, weightMap, *conv29->getOutput(0), get_width_5u(768, gw), 3, 2, calculateP(3), "model.30"); nvinfer1::ITensor* inputTensor31[] = 
{conv30->getOutput(0), conv12->getOutput(0)}; nvinfer1::IConcatenationLayer* cat31 = network->addConcatenation(inputTensor31, 2); nvinfer1::IElementWiseLayer* conv32 = C3(network, weightMap, *cat31->getOutput(0), get_width_5u(768, gw), get_width_5u(1024, gw), get_depth(3, gd), false, 0.5, "model.32"); //********************************************* cat head P6 //******************************************** /******************************************************************************************************* ********************************************* YOLOV5U-P6 OUTPUT ******************************************* *******************************************************************************************************/ int base_in_channel = (gw == 1.25) ? 80 : 64; int base_out_channel = (gw == 0.25) ? std::max(64, std::min(kNumClass, 100)) : get_width_5u(256, gw); // output0 nvinfer1::IElementWiseLayer* conv33_cv2_0_0 = convBnSiLU(network, weightMap, *conv23->getOutput(0), base_in_channel, 3, 1, calculateP(3), "model.33.cv2.0.0"); nvinfer1::IElementWiseLayer* conv33_cv2_0_1 = convBnSiLU(network, weightMap, *conv33_cv2_0_0->getOutput(0), base_in_channel, 3, 1, calculateP(3), "model.33.cv2.0.1"); nvinfer1::IConvolutionLayer* conv33_cv2_0_2 = network->addConvolutionNd(*conv33_cv2_0_1->getOutput(0), 64, nvinfer1::DimsHW{1, 1}, weightMap["model.33.cv2.0.2.weight"], weightMap["model.33.cv2.0.2.bias"]); conv33_cv2_0_2->setStrideNd(nvinfer1::DimsHW{1, 1}); conv33_cv2_0_2->setPaddingNd(nvinfer1::DimsHW{0, 0}); nvinfer1::IElementWiseLayer* conv33_cv3_0_0 = convBnSiLU(network, weightMap, *conv23->getOutput(0), base_out_channel, 3, 1, calculateP(3), "model.33.cv3.0.0"); nvinfer1::IElementWiseLayer* conv33_cv3_0_1 = convBnSiLU(network, weightMap, *conv33_cv3_0_0->getOutput(0), base_out_channel, 3, 1, calculateP(3), "model.33.cv3.0.1"); nvinfer1::IConvolutionLayer* conv33_cv3_0_2 = network->addConvolutionNd(*conv33_cv3_0_1->getOutput(0), kNumClass, nvinfer1::DimsHW{1, 1}, 
weightMap["model.33.cv3.0.2.weight"], weightMap["model.33.cv3.0.2.bias"]); conv33_cv3_0_2->setStride(nvinfer1::DimsHW{1, 1}); conv33_cv3_0_2->setPadding(nvinfer1::DimsHW{0, 0}); nvinfer1::ITensor* inputTensor33_0[] = {conv33_cv2_0_2->getOutput(0), conv33_cv3_0_2->getOutput(0)}; nvinfer1::IConcatenationLayer* cat33_0 = network->addConcatenation(inputTensor33_0, 2); // output1 nvinfer1::IElementWiseLayer* conv33_cv2_1_0 = convBnSiLU(network, weightMap, *conv26->getOutput(0), base_in_channel, 3, 1, calculateP(3), "model.33.cv2.1.0"); nvinfer1::IElementWiseLayer* conv33_cv2_1_1 = convBnSiLU(network, weightMap, *conv33_cv2_1_0->getOutput(0), base_in_channel, 3, 1, calculateP(3), "model.33.cv2.1.1"); nvinfer1::IConvolutionLayer* conv33_cv2_1_2 = network->addConvolutionNd(*conv33_cv2_1_1->getOutput(0), 64, nvinfer1::DimsHW{1, 1}, weightMap["model.33.cv2.1.2.weight"], weightMap["model.33.cv2.1.2.bias"]); conv33_cv2_1_2->setStrideNd(nvinfer1::DimsHW{1, 1}); conv33_cv2_1_2->setPaddingNd(nvinfer1::DimsHW{0, 0}); nvinfer1::IElementWiseLayer* conv33_cv3_1_0 = convBnSiLU(network, weightMap, *conv26->getOutput(0), base_out_channel, 3, 1, calculateP(3), "model.33.cv3.1.0"); nvinfer1::IElementWiseLayer* conv33_cv3_1_1 = convBnSiLU(network, weightMap, *conv33_cv3_1_0->getOutput(0), base_out_channel, 3, 1, calculateP(3), "model.33.cv3.1.1"); nvinfer1::IConvolutionLayer* conv33_cv3_1_2 = network->addConvolutionNd(*conv33_cv3_1_1->getOutput(0), kNumClass, nvinfer1::DimsHW{1, 1}, weightMap["model.33.cv3.1.2.weight"], weightMap["model.33.cv3.1.2.bias"]); conv33_cv3_1_2->setStride(nvinfer1::DimsHW{1, 1}); conv33_cv3_1_2->setPadding(nvinfer1::DimsHW{0, 0}); nvinfer1::ITensor* inputTensor33_1[] = {conv33_cv2_1_2->getOutput(0), conv33_cv3_1_2->getOutput(0)}; nvinfer1::IConcatenationLayer* cat33_1 = network->addConcatenation(inputTensor33_1, 2); // output2 nvinfer1::IElementWiseLayer* conv33_cv2_2_0 = convBnSiLU(network, weightMap, *conv29->getOutput(0), base_in_channel, 3, 1, calculateP(3), 
"model.33.cv2.2.0"); nvinfer1::IElementWiseLayer* conv33_cv2_2_1 = convBnSiLU(network, weightMap, *conv33_cv2_2_0->getOutput(0), base_in_channel, 3, 1, calculateP(3), "model.33.cv2.2.1"); nvinfer1::IConvolutionLayer* conv33_cv2_2_2 = network->addConvolutionNd(*conv33_cv2_2_1->getOutput(0), 64, nvinfer1::DimsHW{1, 1}, weightMap["model.33.cv2.2.2.weight"], weightMap["model.33.cv2.2.2.bias"]); conv33_cv2_2_2->setStrideNd(nvinfer1::DimsHW{1, 1}); conv33_cv2_2_2->setPaddingNd(nvinfer1::DimsHW{0, 0}); nvinfer1::IElementWiseLayer* conv33_cv3_2_0 = convBnSiLU(network, weightMap, *conv29->getOutput(0), base_out_channel, 3, 1, calculateP(3), "model.33.cv3.2.0"); nvinfer1::IElementWiseLayer* conv33_cv3_2_1 = convBnSiLU(network, weightMap, *conv33_cv3_2_0->getOutput(0), base_out_channel, 3, 1, calculateP(3), "model.33.cv3.2.1"); nvinfer1::IConvolutionLayer* conv33_cv3_2_2 = network->addConvolutionNd(*conv33_cv3_2_1->getOutput(0), kNumClass, nvinfer1::DimsHW{1, 1}, weightMap["model.33.cv3.2.2.weight"], weightMap["model.33.cv3.2.2.bias"]); conv33_cv3_2_2->setStride(nvinfer1::DimsHW{1, 1}); conv33_cv3_2_2->setPadding(nvinfer1::DimsHW{0, 0}); nvinfer1::ITensor* inputTensor33_2[] = {conv33_cv2_2_2->getOutput(0), conv33_cv3_2_2->getOutput(0)}; nvinfer1::IConcatenationLayer* cat33_2 = network->addConcatenation(inputTensor33_2, 2); // output3 nvinfer1::IElementWiseLayer* conv33_cv2_3_0 = convBnSiLU(network, weightMap, *conv32->getOutput(0), base_in_channel, 3, 1, calculateP(3), "model.33.cv2.3.0"); nvinfer1::IElementWiseLayer* conv33_cv2_3_1 = convBnSiLU(network, weightMap, *conv33_cv2_3_0->getOutput(0), base_in_channel, 3, 1, calculateP(3), "model.33.cv2.3.1"); nvinfer1::IConvolutionLayer* conv33_cv2_3_2 = network->addConvolutionNd(*conv33_cv2_3_1->getOutput(0), 64, nvinfer1::DimsHW{1, 1}, weightMap["model.33.cv2.3.2.weight"], weightMap["model.33.cv2.3.2.bias"]); conv33_cv2_3_2->setStrideNd(nvinfer1::DimsHW{1, 1}); conv33_cv2_3_2->setPaddingNd(nvinfer1::DimsHW{0, 0}); 
nvinfer1::IElementWiseLayer* conv33_cv3_3_0 = convBnSiLU(network, weightMap, *conv32->getOutput(0), base_out_channel, 3, 1, calculateP(3), "model.33.cv3.3.0"); nvinfer1::IElementWiseLayer* conv33_cv3_3_1 = convBnSiLU(network, weightMap, *conv33_cv3_3_0->getOutput(0), base_out_channel, 3, 1, calculateP(3), "model.33.cv3.3.1"); nvinfer1::IConvolutionLayer* conv33_cv3_3_2 = network->addConvolutionNd(*conv33_cv3_3_1->getOutput(0), kNumClass, nvinfer1::DimsHW{1, 1}, weightMap["model.33.cv3.3.2.weight"], weightMap["model.33.cv3.3.2.bias"]); conv33_cv3_3_2->setStride(nvinfer1::DimsHW{1, 1}); conv33_cv3_3_2->setPadding(nvinfer1::DimsHW{0, 0}); nvinfer1::ITensor* inputTensor33_3[] = {conv33_cv2_3_2->getOutput(0), conv33_cv3_3_2->getOutput(0)}; nvinfer1::IConcatenationLayer* cat33_3 = network->addConcatenation(inputTensor33_3, 2); /******************************************************************************************************* ********************************************* YOLOV5U-P6 DETECT ******************************************* *******************************************************************************************************/ nvinfer1::IElementWiseLayer* conv_layers[] = {conv3, conv5, conv7, conv9}; int strides[sizeof(conv_layers) / sizeof(conv_layers[0])]; calculateStrides(conv_layers, sizeof(conv_layers) / sizeof(conv_layers[0]), kInputH, strides); int stridesLength = sizeof(strides) / sizeof(int); // det0 nvinfer1::IShuffleLayer* shuffle33_0 = network->addShuffle(*cat33_0->getOutput(0)); shuffle33_0->setReshapeDimensions(nvinfer1::Dims2{64 + kNumClass, (kInputH / strides[0]) * (kInputW / strides[0])}); nvinfer1::ISliceLayer* split33_0_0 = network->addSlice( *shuffle33_0->getOutput(0), nvinfer1::Dims2{0, 0}, nvinfer1::Dims2{64, (kInputH / strides[0]) * (kInputW / strides[0])}, nvinfer1::Dims2{1, 1}); nvinfer1::ISliceLayer* split33_0_1 = network->addSlice( *shuffle33_0->getOutput(0), nvinfer1::Dims2{64, 0}, nvinfer1::Dims2{kNumClass, (kInputH / strides[0]) 
* (kInputW / strides[0])}, nvinfer1::Dims2{1, 1}); nvinfer1::IShuffleLayer* dfl33_0 = DFL(network, weightMap, *split33_0_0->getOutput(0), 4, (kInputH / strides[0]) * (kInputW / strides[0]), 1, 1, 0, "model.33.dfl.conv.weight"); nvinfer1::ITensor* inputTensor33_dfl_0[] = {dfl33_0->getOutput(0), split33_0_1->getOutput(0)}; nvinfer1::IConcatenationLayer* cat33_dfl_0 = network->addConcatenation(inputTensor33_dfl_0, 2); // det1 nvinfer1::IShuffleLayer* shuffle33_1 = network->addShuffle(*cat33_1->getOutput(0)); shuffle33_1->setReshapeDimensions(nvinfer1::Dims2{64 + kNumClass, (kInputH / strides[1]) * (kInputW / strides[1])}); nvinfer1::ISliceLayer* split33_1_0 = network->addSlice( *shuffle33_1->getOutput(0), nvinfer1::Dims2{0, 0}, nvinfer1::Dims2{64, (kInputH / strides[1]) * (kInputW / strides[1])}, nvinfer1::Dims2{1, 1}); nvinfer1::ISliceLayer* split33_1_1 = network->addSlice( *shuffle33_1->getOutput(0), nvinfer1::Dims2{64, 0}, nvinfer1::Dims2{kNumClass, (kInputH / strides[1]) * (kInputW / strides[1])}, nvinfer1::Dims2{1, 1}); nvinfer1::IShuffleLayer* dfl33_1 = DFL(network, weightMap, *split33_1_0->getOutput(0), 4, (kInputH / strides[1]) * (kInputW / strides[1]), 1, 1, 0, "model.33.dfl.conv.weight"); nvinfer1::ITensor* inputTensor33_dfl_1[] = {dfl33_1->getOutput(0), split33_1_1->getOutput(0)}; nvinfer1::IConcatenationLayer* cat33_dfl_1 = network->addConcatenation(inputTensor33_dfl_1, 2); // det2 nvinfer1::IShuffleLayer* shuffle33_2 = network->addShuffle(*cat33_2->getOutput(0)); shuffle33_2->setReshapeDimensions(nvinfer1::Dims2{64 + kNumClass, (kInputH / strides[2]) * (kInputW / strides[2])}); nvinfer1::ISliceLayer* split33_2_0 = network->addSlice( *shuffle33_2->getOutput(0), nvinfer1::Dims2{0, 0}, nvinfer1::Dims2{64, (kInputH / strides[2]) * (kInputW / strides[2])}, nvinfer1::Dims2{1, 1}); nvinfer1::ISliceLayer* split33_2_1 = network->addSlice( *shuffle33_2->getOutput(0), nvinfer1::Dims2{64, 0}, nvinfer1::Dims2{kNumClass, (kInputH / strides[2]) * (kInputW / 
strides[2])}, nvinfer1::Dims2{1, 1}); nvinfer1::IShuffleLayer* dfl33_2 = DFL(network, weightMap, *split33_2_0->getOutput(0), 4, (kInputH / strides[2]) * (kInputW / strides[2]), 1, 1, 0, "model.33.dfl.conv.weight"); nvinfer1::ITensor* inputTensor33_dfl_2[] = {dfl33_2->getOutput(0), split33_2_1->getOutput(0)}; nvinfer1::IConcatenationLayer* cat33_dfl_2 = network->addConcatenation(inputTensor33_dfl_2, 2); // det3 nvinfer1::IShuffleLayer* shuffle33_3 = network->addShuffle(*cat33_3->getOutput(0)); shuffle33_3->setReshapeDimensions(nvinfer1::Dims2{64 + kNumClass, (kInputH / strides[3]) * (kInputW / strides[3])}); nvinfer1::ISliceLayer* split33_3_0 = network->addSlice( *shuffle33_3->getOutput(0), nvinfer1::Dims2{0, 0}, nvinfer1::Dims2{64, (kInputH / strides[3]) * (kInputW / strides[3])}, nvinfer1::Dims2{1, 1}); nvinfer1::ISliceLayer* split33_3_1 = network->addSlice( *shuffle33_3->getOutput(0), nvinfer1::Dims2{64, 0}, nvinfer1::Dims2{kNumClass, (kInputH / strides[3]) * (kInputW / strides[3])}, nvinfer1::Dims2{1, 1}); nvinfer1::IShuffleLayer* dfl33_3 = DFL(network, weightMap, *split33_3_0->getOutput(0), 4, (kInputH / strides[3]) * (kInputW / strides[3]), 1, 1, 0, "model.33.dfl.conv.weight"); nvinfer1::ITensor* inputTensor33_dfl_3[] = {dfl33_3->getOutput(0), split33_3_1->getOutput(0)}; nvinfer1::IConcatenationLayer* cat33_dfl_3 = network->addConcatenation(inputTensor33_dfl_3, 2); nvinfer1::IPluginV2Layer* yolo = addYoLoLayer( network, std::vector{cat33_dfl_0, cat33_dfl_1, cat33_dfl_2, cat33_dfl_3}, strides, stridesLength, kNumClass, false, false, false); yolo->getOutput(0)->setName(kOutputTensorName); network->markOutput(*yolo->getOutput(0)); builder->setMaxBatchSize(kBatchSize); config->setMaxWorkspaceSize(16 * (1 << 20)); #if defined(USE_FP16) config->setFlag(nvinfer1::BuilderFlag::kFP16); #elif defined(USE_INT8) std::cout << "Your platform support int8: " << (builder->platformHasFastInt8() ? 
"true" : "false") << std::endl; assert(builder->platformHasFastInt8()); config->setFlag(nvinfer1::BuilderFlag::kINT8); auto* calibrator = new Int8EntropyCalibrator2(1, kInputW, kInputH, kInputQuantizationFolder, "int8calib.table", kInputTensorName); config->setInt8Calibrator(calibrator); #endif std::cout << "Building engine, please wait for a while..." << std::endl; nvinfer1::IHostMemory* serialized_model = builder->buildSerializedNetwork(*network, *config); std::cout << "Build engine successfully!" << std::endl; delete network; for (auto& mem : weightMap) { free((void*)(mem.second.values)); } return serialized_model; } nvinfer1::IHostMemory* buildEngineYolov8Obb(nvinfer1::IBuilder* builder, nvinfer1::IBuilderConfig* config, nvinfer1::DataType dt, const std::string& wts_path, float& gd, float& gw, int& max_channels) { std::map weightMap = loadWeights(wts_path); nvinfer1::INetworkDefinition* network = builder->createNetworkV2(0U); /******************************************************************************************************* ****************************************** YOLOV8 INPUT ********************************************** *******************************************************************************************************/ nvinfer1::ITensor* data = network->addInput(kInputTensorName, dt, nvinfer1::Dims3{3, kInputH, kInputW}); assert(data); /******************************************************************************************************* ***************************************** YOLOV8 BACKBONE ******************************************** *******************************************************************************************************/ nvinfer1::IElementWiseLayer* conv0 = convBnSiLU(network, weightMap, *data, get_width(64, gw, max_channels), 3, 2, 1, "model.0"); nvinfer1::IElementWiseLayer* conv1 = convBnSiLU(network, weightMap, *conv0->getOutput(0), get_width(128, gw, max_channels), 3, 2, 1, "model.1"); nvinfer1::IElementWiseLayer* conv2 = 
C2F(network, weightMap, *conv1->getOutput(0), get_width(128, gw, max_channels), get_width(128, gw, max_channels), get_depth(3, gd), true, 0.5, "model.2"); nvinfer1::IElementWiseLayer* conv3 = convBnSiLU(network, weightMap, *conv2->getOutput(0), get_width(256, gw, max_channels), 3, 2, 1, "model.3"); nvinfer1::IElementWiseLayer* conv4 = C2F(network, weightMap, *conv3->getOutput(0), get_width(256, gw, max_channels), get_width(256, gw, max_channels), get_depth(6, gd), true, 0.5, "model.4"); nvinfer1::IElementWiseLayer* conv5 = convBnSiLU(network, weightMap, *conv4->getOutput(0), get_width(512, gw, max_channels), 3, 2, 1, "model.5"); nvinfer1::IElementWiseLayer* conv6 = C2F(network, weightMap, *conv5->getOutput(0), get_width(512, gw, max_channels), get_width(512, gw, max_channels), get_depth(6, gd), true, 0.5, "model.6"); nvinfer1::IElementWiseLayer* conv7 = convBnSiLU(network, weightMap, *conv6->getOutput(0), get_width(1024, gw, max_channels), 3, 2, 1, "model.7"); nvinfer1::IElementWiseLayer* conv8 = C2F(network, weightMap, *conv7->getOutput(0), get_width(1024, gw, max_channels), get_width(1024, gw, max_channels), get_depth(3, gd), true, 0.5, "model.8"); nvinfer1::IElementWiseLayer* conv9 = SPPF(network, weightMap, *conv8->getOutput(0), get_width(1024, gw, max_channels), get_width(1024, gw, max_channels), 5, "model.9"); /******************************************************************************************************* ********************************************* YOLOV8 HEAD ******************************************** *******************************************************************************************************/ float scale[] = {1.0, 2.0, 2.0}; nvinfer1::IResizeLayer* upsample10 = network->addResize(*conv9->getOutput(0)); assert(upsample10); upsample10->setResizeMode(nvinfer1::ResizeMode::kNEAREST); upsample10->setScales(scale, 3); nvinfer1::ITensor* inputTensor11[] = {upsample10->getOutput(0), conv6->getOutput(0)}; nvinfer1::IConcatenationLayer* cat11 = 
network->addConcatenation(inputTensor11, 2); nvinfer1::IElementWiseLayer* conv12 = C2F(network, weightMap, *cat11->getOutput(0), get_width(512, gw, max_channels), get_width(512, gw, max_channels), get_depth(3, gd), false, 0.5, "model.12"); nvinfer1::IResizeLayer* upsample13 = network->addResize(*conv12->getOutput(0)); assert(upsample13); upsample13->setResizeMode(nvinfer1::ResizeMode::kNEAREST); upsample13->setScales(scale, 3); nvinfer1::ITensor* inputTensor14[] = {upsample13->getOutput(0), conv4->getOutput(0)}; nvinfer1::IConcatenationLayer* cat14 = network->addConcatenation(inputTensor14, 2); nvinfer1::IElementWiseLayer* conv15 = C2F(network, weightMap, *cat14->getOutput(0), get_width(256, gw, max_channels), get_width(256, gw, max_channels), get_depth(3, gd), false, 0.5, "model.15"); nvinfer1::IElementWiseLayer* conv16 = convBnSiLU(network, weightMap, *conv15->getOutput(0), get_width(256, gw, max_channels), 3, 2, 1, "model.16"); nvinfer1::ITensor* inputTensor17[] = {conv16->getOutput(0), conv12->getOutput(0)}; nvinfer1::IConcatenationLayer* cat17 = network->addConcatenation(inputTensor17, 2); nvinfer1::IElementWiseLayer* conv18 = C2F(network, weightMap, *cat17->getOutput(0), get_width(512, gw, max_channels), get_width(512, gw, max_channels), get_depth(3, gd), false, 0.5, "model.18"); nvinfer1::IElementWiseLayer* conv19 = convBnSiLU(network, weightMap, *conv18->getOutput(0), get_width(512, gw, max_channels), 3, 2, 1, "model.19"); nvinfer1::ITensor* inputTensor20[] = {conv19->getOutput(0), conv9->getOutput(0)}; nvinfer1::IConcatenationLayer* cat20 = network->addConcatenation(inputTensor20, 2); nvinfer1::IElementWiseLayer* conv21 = C2F(network, weightMap, *cat20->getOutput(0), get_width(1024, gw, max_channels), get_width(1024, gw, max_channels), get_depth(3, gd), false, 0.5, "model.21"); /******************************************************************************************************* ********************************************* YOLOV8 OUTPUT 
****************************************** *******************************************************************************************************/ int base_in_channel = (gw == 1.25) ? 80 : 64; int base_out_channel = (gw == 0.25) ? std::max(64, std::min(kObbNumClass, 100)) : get_width(256, gw, max_channels); // output0 nvinfer1::IElementWiseLayer* conv22_cv2_0_0 = convBnSiLU(network, weightMap, *conv15->getOutput(0), base_in_channel, 3, 1, 1, "model.22.cv2.0.0"); nvinfer1::IElementWiseLayer* conv22_cv2_0_1 = convBnSiLU(network, weightMap, *conv22_cv2_0_0->getOutput(0), base_in_channel, 3, 1, 1, "model.22.cv2.0.1"); nvinfer1::IConvolutionLayer* conv22_cv2_0_2 = network->addConvolutionNd(*conv22_cv2_0_1->getOutput(0), 64, nvinfer1::DimsHW{1, 1}, weightMap["model.22.cv2.0.2.weight"], weightMap["model.22.cv2.0.2.bias"]); conv22_cv2_0_2->setStrideNd(nvinfer1::DimsHW{1, 1}); conv22_cv2_0_2->setPaddingNd(nvinfer1::DimsHW{0, 0}); nvinfer1::IElementWiseLayer* conv22_cv3_0_0 = convBnSiLU(network, weightMap, *conv15->getOutput(0), base_out_channel, 3, 1, 1, "model.22.cv3.0.0"); nvinfer1::IElementWiseLayer* conv22_cv3_0_1 = convBnSiLU(network, weightMap, *conv22_cv3_0_0->getOutput(0), base_out_channel, 3, 1, 1, "model.22.cv3.0.1"); nvinfer1::IConvolutionLayer* conv22_cv3_0_2 = network->addConvolutionNd(*conv22_cv3_0_1->getOutput(0), kObbNumClass, nvinfer1::DimsHW{1, 1}, weightMap["model.22.cv3.0.2.weight"], weightMap["model.22.cv3.0.2.bias"]); conv22_cv3_0_2->setStride(nvinfer1::DimsHW{1, 1}); conv22_cv3_0_2->setPadding(nvinfer1::DimsHW{0, 0}); nvinfer1::ITensor* inputTensor22_0[] = {conv22_cv2_0_2->getOutput(0), conv22_cv3_0_2->getOutput(0)}; nvinfer1::IConcatenationLayer* cat22_0 = network->addConcatenation(inputTensor22_0, 2); // output1 nvinfer1::IElementWiseLayer* conv22_cv2_1_0 = convBnSiLU(network, weightMap, *conv18->getOutput(0), base_in_channel, 3, 1, 1, "model.22.cv2.1.0"); nvinfer1::IElementWiseLayer* conv22_cv2_1_1 = convBnSiLU(network, weightMap, 
*conv22_cv2_1_0->getOutput(0), base_in_channel, 3, 1, 1, "model.22.cv2.1.1"); nvinfer1::IConvolutionLayer* conv22_cv2_1_2 = network->addConvolutionNd(*conv22_cv2_1_1->getOutput(0), 64, nvinfer1::DimsHW{1, 1}, weightMap["model.22.cv2.1.2.weight"], weightMap["model.22.cv2.1.2.bias"]); conv22_cv2_1_2->setStrideNd(nvinfer1::DimsHW{1, 1}); conv22_cv2_1_2->setPaddingNd(nvinfer1::DimsHW{0, 0}); nvinfer1::IElementWiseLayer* conv22_cv3_1_0 = convBnSiLU(network, weightMap, *conv18->getOutput(0), base_out_channel, 3, 1, 1, "model.22.cv3.1.0"); nvinfer1::IElementWiseLayer* conv22_cv3_1_1 = convBnSiLU(network, weightMap, *conv22_cv3_1_0->getOutput(0), base_out_channel, 3, 1, 1, "model.22.cv3.1.1"); nvinfer1::IConvolutionLayer* conv22_cv3_1_2 = network->addConvolutionNd(*conv22_cv3_1_1->getOutput(0), kObbNumClass, nvinfer1::DimsHW{1, 1}, weightMap["model.22.cv3.1.2.weight"], weightMap["model.22.cv3.1.2.bias"]); conv22_cv3_1_2->setStrideNd(nvinfer1::DimsHW{1, 1}); conv22_cv3_1_2->setPaddingNd(nvinfer1::DimsHW{0, 0}); nvinfer1::ITensor* inputTensor22_1[] = {conv22_cv2_1_2->getOutput(0), conv22_cv3_1_2->getOutput(0)}; nvinfer1::IConcatenationLayer* cat22_1 = network->addConcatenation(inputTensor22_1, 2); // output2 nvinfer1::IElementWiseLayer* conv22_cv2_2_0 = convBnSiLU(network, weightMap, *conv21->getOutput(0), base_in_channel, 3, 1, 1, "model.22.cv2.2.0"); nvinfer1::IElementWiseLayer* conv22_cv2_2_1 = convBnSiLU(network, weightMap, *conv22_cv2_2_0->getOutput(0), base_in_channel, 3, 1, 1, "model.22.cv2.2.1"); nvinfer1::IConvolutionLayer* conv22_cv2_2_2 = network->addConvolution(*conv22_cv2_2_1->getOutput(0), 64, nvinfer1::DimsHW{1, 1}, weightMap["model.22.cv2.2.2.weight"], weightMap["model.22.cv2.2.2.bias"]); nvinfer1::IElementWiseLayer* conv22_cv3_2_0 = convBnSiLU(network, weightMap, *conv21->getOutput(0), base_out_channel, 3, 1, 1, "model.22.cv3.2.0"); nvinfer1::IElementWiseLayer* conv22_cv3_2_1 = convBnSiLU(network, weightMap, *conv22_cv3_2_0->getOutput(0), base_out_channel, 
3, 1, 1, "model.22.cv3.2.1"); nvinfer1::IConvolutionLayer* conv22_cv3_2_2 = network->addConvolution(*conv22_cv3_2_1->getOutput(0), kObbNumClass, nvinfer1::DimsHW{1, 1}, weightMap["model.22.cv3.2.2.weight"], weightMap["model.22.cv3.2.2.bias"]); nvinfer1::ITensor* inputTensor22_2[] = {conv22_cv2_2_2->getOutput(0), conv22_cv3_2_2->getOutput(0)}; nvinfer1::IConcatenationLayer* cat22_2 = network->addConcatenation(inputTensor22_2, 2); /******************************************************************************************************* ********************************************* YOLOV8 DETECT ****************************************** *******************************************************************************************************/ nvinfer1::IElementWiseLayer* conv_layers[] = {conv3, conv5, conv7}; int strides[sizeof(conv_layers) / sizeof(conv_layers[0])]; calculateStrides(conv_layers, sizeof(conv_layers) / sizeof(conv_layers[0]), kInputH, strides); int stridesLength = sizeof(strides) / sizeof(int); nvinfer1::IShuffleLayer* shuffle22_0 = network->addShuffle(*cat22_0->getOutput(0)); shuffle22_0->setReshapeDimensions( nvinfer1::Dims2{64 + kObbNumClass, (kInputH / strides[0]) * (kInputW / strides[0])}); nvinfer1::ISliceLayer* split22_0_0 = network->addSlice( *shuffle22_0->getOutput(0), nvinfer1::Dims2{0, 0}, nvinfer1::Dims2{64, (kInputH / strides[0]) * (kInputW / strides[0])}, nvinfer1::Dims2{1, 1}); nvinfer1::ISliceLayer* split22_0_1 = network->addSlice( *shuffle22_0->getOutput(0), nvinfer1::Dims2{64, 0}, nvinfer1::Dims2{kObbNumClass, (kInputH / strides[0]) * (kInputW / strides[0])}, nvinfer1::Dims2{1, 1}); nvinfer1::IShuffleLayer* dfl22_0 = DFL(network, weightMap, *split22_0_0->getOutput(0), 4, (kInputH / strides[0]) * (kInputW / strides[0]), 1, 1, 0, "model.22.dfl.conv.weight"); nvinfer1::IShuffleLayer* shuffle22_1 = network->addShuffle(*cat22_1->getOutput(0)); shuffle22_1->setReshapeDimensions( nvinfer1::Dims2{64 + kObbNumClass, (kInputH / strides[1]) * 
(kInputW / strides[1])}); nvinfer1::ISliceLayer* split22_1_0 = network->addSlice( *shuffle22_1->getOutput(0), nvinfer1::Dims2{0, 0}, nvinfer1::Dims2{64, (kInputH / strides[1]) * (kInputW / strides[1])}, nvinfer1::Dims2{1, 1}); nvinfer1::ISliceLayer* split22_1_1 = network->addSlice( *shuffle22_1->getOutput(0), nvinfer1::Dims2{64, 0}, nvinfer1::Dims2{kObbNumClass, (kInputH / strides[1]) * (kInputW / strides[1])}, nvinfer1::Dims2{1, 1}); nvinfer1::IShuffleLayer* dfl22_1 = DFL(network, weightMap, *split22_1_0->getOutput(0), 4, (kInputH / strides[1]) * (kInputW / strides[1]), 1, 1, 0, "model.22.dfl.conv.weight"); nvinfer1::IShuffleLayer* shuffle22_2 = network->addShuffle(*cat22_2->getOutput(0)); shuffle22_2->setReshapeDimensions( nvinfer1::Dims2{64 + kObbNumClass, (kInputH / strides[2]) * (kInputW / strides[2])}); nvinfer1::ISliceLayer* split22_2_0 = network->addSlice( *shuffle22_2->getOutput(0), nvinfer1::Dims2{0, 0}, nvinfer1::Dims2{64, (kInputH / strides[2]) * (kInputW / strides[2])}, nvinfer1::Dims2{1, 1}); nvinfer1::ISliceLayer* split22_2_1 = network->addSlice( *shuffle22_2->getOutput(0), nvinfer1::Dims2{64, 0}, nvinfer1::Dims2{kObbNumClass, (kInputH / strides[2]) * (kInputW / strides[2])}, nvinfer1::Dims2{1, 1}); nvinfer1::IShuffleLayer* dfl22_2 = DFL(network, weightMap, *split22_2_0->getOutput(0), 4, (kInputH / strides[2]) * (kInputW / strides[2]), 1, 1, 0, "model.22.dfl.conv.weight"); // det0 auto shuffle_conv15 = cv4_conv_combined(network, weightMap, *conv15->getOutput(0), "model.22.cv4.0", (kInputH / strides[0]) * (kInputW / strides[0]), gw, "obb"); nvinfer1::ITensor* inputTensor22_dfl_0[] = {dfl22_0->getOutput(0), split22_0_1->getOutput(0), shuffle_conv15->getOutput(0)}; nvinfer1::IConcatenationLayer* cat22_dfl_0 = network->addConcatenation(inputTensor22_dfl_0, 3); // det1 auto shuffle_conv18 = cv4_conv_combined(network, weightMap, *conv18->getOutput(0), "model.22.cv4.1", (kInputH / strides[1]) * (kInputW / strides[1]), gw, "obb"); nvinfer1::ITensor* 
inputTensor22_dfl_1[] = {dfl22_1->getOutput(0), split22_1_1->getOutput(0), shuffle_conv18->getOutput(0)}; nvinfer1::IConcatenationLayer* cat22_dfl_1 = network->addConcatenation(inputTensor22_dfl_1, 3); // det2 auto shuffle_conv21 = cv4_conv_combined(network, weightMap, *conv21->getOutput(0), "model.22.cv4.2", (kInputH / strides[2]) * (kInputW / strides[2]), gw, "obb"); nvinfer1::ITensor* inputTensor22_dfl_2[] = {dfl22_2->getOutput(0), split22_2_1->getOutput(0), shuffle_conv21->getOutput(0)}; nvinfer1::IConcatenationLayer* cat22_dfl_2 = network->addConcatenation(inputTensor22_dfl_2, 3); nvinfer1::IPluginV2Layer* yolo = addYoLoLayer(network, std::vector{cat22_dfl_0, cat22_dfl_1, cat22_dfl_2}, strides, stridesLength, kObbNumClass, false, false, true); yolo->getOutput(0)->setName(kOutputTensorName); network->markOutput(*yolo->getOutput(0)); builder->setMaxBatchSize(kBatchSize); config->setMaxWorkspaceSize(16 * (1 << 20)); #if defined(USE_FP16) config->setFlag(nvinfer1::BuilderFlag::kFP16); #elif defined(USE_INT8) std::cout << "Your platform support int8: " << (builder->platformHasFastInt8() ? "true" : "false") << std::endl; assert(builder->platformHasFastInt8()); config->setFlag(nvinfer1::BuilderFlag::kINT8); auto* calibrator = new Int8EntropyCalibrator2(1, kInputW, kInputH, kInputQuantizationFolder, "int8calib.table", kInputTensorName); config->setInt8Calibrator(calibrator); #endif std::cout << "Building engine, please wait for a while..." << std::endl; nvinfer1::IHostMemory* serialized_model = builder->buildSerializedNetwork(*network, *config); std::cout << "Build engine successfully!" 
<< std::endl; delete network; for (auto& mem : weightMap) { free((void*)(mem.second.values)); } return serialized_model; } ================================================ FILE: yolov8/src/postprocess.cpp ================================================ #include "postprocess.h" #include #include // Include this header for printing #include "utils.h" cv::Rect get_rect(cv::Mat& img, float bbox[4]) { float l, r, t, b; float r_w = kInputW / (img.cols * 1.0); float r_h = kInputH / (img.rows * 1.0); if (r_h > r_w) { l = bbox[0]; r = bbox[2]; t = bbox[1] - (kInputH - r_w * img.rows) / 2; b = bbox[3] - (kInputH - r_w * img.rows) / 2; l = l / r_w; r = r / r_w; t = t / r_w; b = b / r_w; } else { l = bbox[0] - (kInputW - r_h * img.cols) / 2; r = bbox[2] - (kInputW - r_h * img.cols) / 2; t = bbox[1]; b = bbox[3]; l = l / r_h; r = r / r_h; t = t / r_h; b = b / r_h; } l = std::max(0.0f, l); t = std::max(0.0f, t); int width = std::max(0, std::min(int(round(r - l)), img.cols - int(round(l)))); int height = std::max(0, std::min(int(round(b - t)), img.rows - int(round(t)))); return cv::Rect(int(round(l)), int(round(t)), width, height); } cv::Rect get_rect_adapt_landmark(cv::Mat& img, float bbox[4], float lmk[kNumberOfPoints * 3]) { float l, r, t, b; float r_w = kInputW / (img.cols * 1.0); float r_h = kInputH / (img.rows * 1.0); if (r_h > r_w) { l = bbox[0] / r_w; r = bbox[2] / r_w; t = (bbox[1] - (kInputH - r_w * img.rows) / 2) / r_w; b = (bbox[3] - (kInputH - r_w * img.rows) / 2) / r_w; for (int i = 0; i < kNumberOfPoints * 3; i += 3) { lmk[i] /= r_w; lmk[i + 1] = (lmk[i + 1] - (kInputH - r_w * img.rows) / 2) / r_w; // lmk[i + 2] } } else { l = (bbox[0] - (kInputW - r_h * img.cols) / 2) / r_h; r = (bbox[2] - (kInputW - r_h * img.cols) / 2) / r_h; t = bbox[1] / r_h; b = bbox[3] / r_h; for (int i = 0; i < kNumberOfPoints * 3; i += 3) { lmk[i] = (lmk[i] - (kInputW - r_h * img.cols) / 2) / r_h; lmk[i + 1] /= r_h; // lmk[i + 2] } } l = std::max(0.0f, l); t = std::max(0.0f, t); int width 
/**
 * Intersection-over-union of two axis-aligned boxes.
 *
 * Both boxes are read as [min0, min1, max0, max1], i.e. indices 0/2 bound one
 * axis and indices 1/3 bound the other (the same [left, top, right, bottom]
 * order used by get_rect).
 *
 * @param lbox First box.
 * @param rbox Second box.
 * @return Intersection area divided by union area, or 0 when the boxes do not
 *         overlap or when the union area is not positive (degenerate boxes).
 */
static float iou(float lbox[4], float rbox[4]) {
    const float inter_lo0 = (std::max)(lbox[0], rbox[0]);
    const float inter_hi0 = (std::min)(lbox[2], rbox[2]);
    const float inter_lo1 = (std::max)(lbox[1], rbox[1]);
    const float inter_hi1 = (std::min)(lbox[3], rbox[3]);

    // Disjoint along either axis: no intersection.
    if (inter_lo1 > inter_hi1 || inter_lo0 > inter_hi0)
        return 0.0f;

    const float inter_area = (inter_hi0 - inter_lo0) * (inter_hi1 - inter_lo1);
    const float union_area = (lbox[2] - lbox[0]) * (lbox[3] - lbox[1]) +
                             (rbox[2] - rbox[0]) * (rbox[3] - rbox[1]) - inter_area;

    // Zero-area boxes would otherwise produce 0/0 = NaN, which then silently
    // fails every threshold comparison in the callers.
    if (union_area <= 0.0f)
        return 0.0f;

    return inter_area / union_area;
}
// Split a decoded detection buffer into one detection list per image.
//
// res_batch       : output; resized to batch_size, entry i is filled by
//                   process_decode_ptr_host.
// decode_ptr_host : host buffer; its first float is read as the box count.
// batch_size      : number of images to process.
// bbox_element    : number of floats per box record, forwarded unchanged.
// img_batch       : source images; element i is forwarded (const removed) to
//                   process_decode_ptr_host.
//
// NOTE(review): template arguments (on std::vector, static_cast, const_cast)
// appear to have been stripped from this copy of the file; restore them before
// compiling.
// NOTE(review): the per-image pointer is advanced by `i * count` floats, where
// `count` is the box count read from the start of the buffer, whereas
// process_decode_ptr_host indexes records as `1 + k * bbox_element`. For i > 0
// these do not obviously agree - confirm the buffer layout, or that this path
// is only used with batch_size == 1.
void batch_process(std::vector>& res_batch, const float* decode_ptr_host, int batch_size, int bbox_element,
                   const std::vector& img_batch) {
    res_batch.resize(batch_size);
    // The leading float holds the number of boxes; cap it at the configured maximum.
    int count = static_cast(*decode_ptr_host);
    count = std::min(count, kMaxNumOutputBbox);
    for (int i = 0; i < batch_size; i++) {
        // process_decode_ptr_host takes a non-const cv::Mat&, hence the const_cast.
        auto& img = const_cast(img_batch[i]);
        process_decode_ptr_host(res_batch[i], &decode_ptr_host[i * count], bbox_element, img, count);
    }
}
std::to_string((int)res[j].class_id), cv::Point(r.x, r.y - 1), cv::FONT_HERSHEY_PLAIN, 1.2, cv::Scalar(0xFF, 0xFF, 0xFF), 2); for (int k = 0; k < kNumberOfPoints * 3; k += 3) { if (res[j].keypoints[k + 2] > 0.5) { cv::circle(img, cv::Point((int)res[j].keypoints[k], (int)res[j].keypoints[k + 1]), 3, cv::Scalar(0, 0x27, 0xC1), -1); } } for (const auto& bone : skeleton_pairs) { int kp1_idx = bone.first * 3; int kp2_idx = bone.second * 3; if (res[j].keypoints[kp1_idx + 2] > 0.5 && res[j].keypoints[kp2_idx + 2] > 0.5) { cv::Point p1((int)res[j].keypoints[kp1_idx], (int)res[j].keypoints[kp1_idx + 1]); cv::Point p2((int)res[j].keypoints[kp2_idx], (int)res[j].keypoints[kp2_idx + 1]); cv::line(img, p1, p2, cv::Scalar(0, 0x27, 0xC1), 2); } } } } } cv::Mat scale_mask(cv::Mat mask, cv::Mat img) { int x, y, w, h; float r_w = kInputW / (img.cols * 1.0); float r_h = kInputH / (img.rows * 1.0); if (r_h > r_w) { w = kInputW; h = r_w * img.rows; x = 0; y = (kInputH - h) / 2; } else { w = r_h * img.cols; h = kInputH; x = (kInputW - w) / 2; y = 0; } cv::Rect r(x, y, w, h); cv::Mat res; cv::resize(mask(r), res, img.size()); return res; } void draw_mask_bbox(cv::Mat& img, std::vector& dets, std::vector& masks, std::unordered_map& labels_map) { static std::vector colors = {0xFF3838, 0xFF9D97, 0xFF701F, 0xFFB21D, 0xCFD231, 0x48F90A, 0x92CC17, 0x3DDB86, 0x1A9334, 0x00D4BB, 0x2C99A8, 0x00C2FF, 0x344593, 0x6473FF, 0x0018EC, 0x8438FF, 0x520085, 0xCB38FF, 0xFF95C8, 0xFF37C7}; for (size_t i = 0; i < dets.size(); i++) { cv::Mat img_mask = scale_mask(masks[i], img); auto color = colors[(int)dets[i].class_id % colors.size()]; auto bgr = cv::Scalar(color & 0xFF, color >> 8 & 0xFF, color >> 16 & 0xFF); cv::Rect r = get_rect(img, dets[i].bbox); for (int x = r.x; x < r.x + r.width; x++) { for (int y = r.y; y < r.y + r.height; y++) { float val = img_mask.at(y, x); if (val <= 0.5) continue; img.at(y, x)[0] = img.at(y, x)[0] / 2 + bgr[0] / 2; img.at(y, x)[1] = img.at(y, x)[1] / 2 + bgr[1] / 2; img.at(y, 
x)[2] = img.at(y, x)[2] / 2 + bgr[2] / 2; } } cv::rectangle(img, r, bgr, 2); // Get the size of the text cv::Size textSize = cv::getTextSize(labels_map[(int)dets[i].class_id] + " " + to_string_with_precision(dets[i].conf), cv::FONT_HERSHEY_PLAIN, 1.2, 2, NULL); // Set the top left corner of the rectangle cv::Point topLeft(r.x, r.y - textSize.height); // Set the bottom right corner of the rectangle cv::Point bottomRight(r.x + textSize.width, r.y + textSize.height); // Set the thickness of the rectangle lines int lineThickness = 2; // Draw the rectangle on the image cv::rectangle(img, topLeft, bottomRight, bgr, -1); cv::putText(img, labels_map[(int)dets[i].class_id] + " " + to_string_with_precision(dets[i].conf), cv::Point(r.x, r.y + 4), cv::FONT_HERSHEY_PLAIN, 1.2, cv::Scalar::all(0xFF), 2); } } void process_decode_ptr_host_obb(std::vector& res, const float* decode_ptr_host, int bbox_element, cv::Mat& img, int count) { Detection det; for (int i = 0; i < count; i++) { int basic_pos = 1 + i * bbox_element; int keep_flag = decode_ptr_host[basic_pos + 6]; if (keep_flag == 1) { det.bbox[0] = decode_ptr_host[basic_pos + 0]; det.bbox[1] = decode_ptr_host[basic_pos + 1]; det.bbox[2] = decode_ptr_host[basic_pos + 2]; det.bbox[3] = decode_ptr_host[basic_pos + 3]; det.conf = decode_ptr_host[basic_pos + 4]; det.class_id = decode_ptr_host[basic_pos + 5]; det.angle = decode_ptr_host[basic_pos + 7]; res.push_back(det); } } } void batch_process_obb(std::vector>& res_batch, const float* decode_ptr_host, int batch_size, int bbox_element, const std::vector& img_batch) { res_batch.resize(batch_size); int count = static_cast(*decode_ptr_host); count = std::min(count, kMaxNumOutputBbox); for (int i = 0; i < batch_size; i++) { auto& img = const_cast(img_batch[i]); process_decode_ptr_host_obb(res_batch[i], &decode_ptr_host[i * count], bbox_element, img, count); } } std::tuple convariance_matrix(Detection res) { float w = res.bbox[2]; float h = res.bbox[3]; float a = w * w / 12.0; float b = 
h * h / 12.0; float c = res.angle; float cos_r = std::cos(c); float sin_r = std::sin(c); float cos_r2 = cos_r * cos_r; float sin_r2 = sin_r * sin_r; float a_val = a * cos_r2 + b * sin_r2; float b_val = a * sin_r2 + b * cos_r2; float c_val = (a - b) * cos_r * sin_r; return std::make_tuple(a_val, b_val, c_val); } static float probiou(const Detection& res1, const Detection& res2, float eps = 1e-7) { // Calculate the prob iou between oriented bounding boxes, https://arxiv.org/pdf/2106.06072v1.pdf. float a1, b1, c1, a2, b2, c2; std::tuple matrix1 = {a1, b1, c1}; std::tuple matrix2 = {a2, b2, c2}; matrix1 = convariance_matrix(res1); matrix2 = convariance_matrix(res2); a1 = std::get<0>(matrix1); b1 = std::get<1>(matrix1); c1 = std::get<2>(matrix1); a2 = std::get<0>(matrix2); b2 = std::get<1>(matrix2); c2 = std::get<2>(matrix2); float x1 = res1.bbox[0], y1 = res1.bbox[1]; float x2 = res2.bbox[0], y2 = res2.bbox[1]; float t1 = ((a1 + a2) * std::pow(y1 - y2, 2) + (b1 + b2) * std::pow(x1 - x2, 2)) / ((a1 + a2) * (b1 + b2) - std::pow(c1 + c2, 2) + eps); float t2 = ((c1 + c2) * (x2 - x1) * (y1 - y2)) / ((a1 + a2) * (b1 + b2) - std::pow(c1 + c2, 2) + eps); float t3 = std::log( ((a1 + a2) * (b1 + b2) - std::pow(c1 + c2, 2)) / (4 * std::sqrt(std::max(a1 * b1 - c1 * c1, 0.0f)) * std::sqrt(std::max(a2 * b2 - c2 * c2, 0.0f)) + eps) + eps); float bd = 0.25f * t1 + 0.5f * t2 + 0.5f * t3; bd = std::max(std::min(bd, 100.0f), eps); float hd = std::sqrt(1.0 - std::exp(-bd) + eps); return 1 - hd; } void nms_obb(std::vector& res, float* output, float conf_thresh, float nms_thresh) { int det_size = sizeof(Detection) / sizeof(float); std::map> m; for (int i = 0; i < output[0]; i++) { if (output[1 + det_size * i + 4] <= conf_thresh) continue; Detection det; memcpy(&det, &output[1 + det_size * i], det_size * sizeof(float)); if (m.count(det.class_id) == 0) m.emplace(det.class_id, std::vector()); m[det.class_id].push_back(det); } for (auto it = m.begin(); it != m.end(); it++) { auto& dets = 
it->second; std::sort(dets.begin(), dets.end(), cmp); for (size_t m = 0; m < dets.size(); ++m) { auto& item = dets[m]; res.push_back(item); for (size_t n = m + 1; n < dets.size(); ++n) { if (probiou(item, dets[n]) >= nms_thresh) { dets.erase(dets.begin() + n); --n; } } } } } void batch_nms_obb(std::vector>& res_batch, float* output, int batch_size, int output_size, float conf_thresh, float nms_thresh) { res_batch.resize(batch_size); for (int i = 0; i < batch_size; i++) { nms_obb(res_batch[i], &output[i * output_size], conf_thresh, nms_thresh); } } static std::vector get_corner(cv::Mat& img, const Detection& box) { float cos_value, sin_value; // Calculate center point and width/height float x1 = box.bbox[0]; float y1 = box.bbox[1]; float w = box.bbox[2]; float h = box.bbox[3]; float angle = box.angle * 180.0f / CV_PI; // Convert radians to degrees // Print original angle std::cout << "Original angle: " << angle << std::endl; // Swap width and height if height is greater than or equal to width if (h >= w) { std::swap(w, h); angle = fmod(angle + 90.0f, 180.0f); // Adjust angle to be within [0, 180) } // Ensure the angle is between 0 and 180 degrees if (angle < 0) { angle += 360.0f; // Convert to positive value } if (angle > 180.0f) { angle -= 180.0f; // Subtract 180 from angles greater than 180 } // Print adjusted angle std::cout << "Adjusted angle: " << angle << std::endl; // Convert to normal angle value float normal_angle = fmod(angle, 180.0f); if (normal_angle < 0) { normal_angle += 180.0f; // Ensure it's a positive value } // Print normal angle value std::cout << "Normal angle: " << normal_angle << std::endl; cos_value = std::cos(angle * CV_PI / 180.0f); // Convert to radians sin_value = std::sin(angle * CV_PI / 180.0f); // Calculate each corner point float l = x1 - w / 2; // Left boundary float r = x1 + w / 2; // Right boundary float t = y1 - h / 2; // Top boundary float b = y1 + h / 2; // Bottom boundary // Use get_rect function to scale the coordinates float 
bbox[4] = {l, t, r, b}; cv::Rect rect = get_rect(img, bbox); float x_ = (rect.x + rect.x + rect.width) / 2; // Center x float y_ = (rect.y + rect.y + rect.height) / 2; // Center y float width = rect.width; // Width float height = rect.height; // Height // Calculate each corner point std::vector corner_points(4); float vec1x = width / 2 * cos_value; float vec1y = width / 2 * sin_value; float vec2x = -height / 2 * sin_value; float vec2y = height / 2 * cos_value; corner_points[0] = cv::Point(int(round(x_ + vec1x + vec2x)), int(round(y_ + vec1y + vec2y))); // Top-left corner corner_points[1] = cv::Point(int(round(x_ + vec1x - vec2x)), int(round(y_ + vec1y - vec2y))); // Top-right corner corner_points[2] = cv::Point(int(round(x_ - vec1x - vec2x)), int(round(y_ - vec1y - vec2y))); // Bottom-right corner corner_points[3] = cv::Point(int(round(x_ - vec1x + vec2x)), int(round(y_ - vec1y + vec2y))); // Bottom-left corner // Check and adjust corner points to ensure the rectangle is parallel to image boundaries for (auto& point : corner_points) { point.x = std::max(0, std::min(point.x, img.cols - 1)); point.y = std::max(0, std::min(point.y, img.rows - 1)); } return corner_points; } void draw_bbox_obb(std::vector& img_batch, std::vector>& res_batch) { static std::vector colors = {0xFF3838, 0xFF9D97, 0xFF701F, 0xFFB21D, 0xCFD231, 0x48F90A, 0x92CC17, 0x3DDB86, 0x1A9334, 0x00D4BB, 0x2C99A8, 0x00C2FF, 0x344593, 0x6473FF, 0x0018EC, 0x8438FF, 0x520085, 0xCB38FF, 0xFF95C8, 0xFF37C7}; for (size_t i = 0; i < img_batch.size(); i++) { auto& res = res_batch[i]; auto& img = img_batch[i]; for (auto& obj : res) { auto color = colors[(int)obj.class_id % colors.size()]; auto bgr = cv::Scalar(color & 0xFF, color >> 8 & 0xFF, color >> 16 & 0xFF); auto corner_points = get_corner(img, obj); cv::polylines(img, std::vector>{corner_points}, true, bgr, 1); auto text = (std::to_string((int)(obj.class_id)) + ":" + to_string_with_precision(obj.conf)); cv::Size textsize = cv::getTextSize(text, 0, 0.3, 1, 
// Copy confident oriented-box candidates from the raw prediction buffer into
// the compact output array.
//
// predict              : predict[0] is the candidate count; candidate k starts at
//                        predict + 1 + k * (sizeof(Detection) / sizeof(float)).
// num_bboxes           : unused here; kept for signature compatibility.
// confidence_threshold : candidates with confidence below this are dropped.
// parray               : parray[0] is a running counter of accepted candidates,
//                        followed by records of bbox_element floats each:
//                        [cx, cy, w, h, confidence, label, keep flag, angle].
// max_objects          : maximum number of records that may be written.
//
// One thread handles one candidate.
static __global__ void decode_kernel_obb(float* predict, int num_bboxes, float confidence_threshold, float* parray,
                                         int max_objects) {
    float count = predict[0];
    int position = (blockDim.x * blockIdx.x + threadIdx.x);
    if (position >= count)
        return;

    float* pitem = predict + 1 + position * (sizeof(Detection) / sizeof(float));

    // Test the confidence BEFORE reserving an output slot. Reserving first let
    // rejected candidates consume slots, which left gaps in the output and could
    // push valid candidates past max_objects.
    float confidence = pitem[4];
    if (confidence < confidence_threshold)
        return;

    // parray[0] is a float counter; atomicAdd returns the value before the add.
    int index = atomicAdd(parray, 1);
    if (index >= max_objects)
        return;

    //[center_x center_y w h conf class_id mask[32] keypoints[51] angle]
    float cx = pitem[0];
    float cy = pitem[1];
    float width = pitem[2];
    float height = pitem[3];
    float label = pitem[5];
    float angle = pitem[89];  // 4 + 1 + 1 + 32 + 51 floats precede the angle

    float* pout_item = parray + 1 + index * bbox_element;
    *pout_item++ = cx;
    *pout_item++ = cy;
    *pout_item++ = width;
    *pout_item++ = height;
    *pout_item++ = confidence;
    *pout_item++ = label;
    *pout_item++ = 1;  // 1 = keep, 0 = ignore
    *pout_item++ = angle;
}
// Intersection-over-union of two axis-aligned boxes, each given as
// (left, top, right, bottom). Returns 0 when the boxes do not overlap.
static __device__ float box_iou(float aleft, float atop, float aright, float abottom, float bleft, float btop,
                                float bright, float bbottom) {
    // Overlap extent along each axis, clamped at zero for disjoint boxes.
    const float inter_w = max(min(aright, bright) - max(aleft, bleft), 0.0f);
    const float inter_h = max(min(abottom, bbottom) - max(atop, btop), 0.0f);
    const float inter_area = inter_w * inter_h;
    if (inter_area == 0.0f)
        return 0.0f;

    const float area_a = max(0.0f, aright - aleft) * max(0.0f, abottom - atop);
    const float area_b = max(0.0f, bright - bleft) * max(0.0f, bbottom - btop);
    return inter_area / (area_a + area_b - inter_area);
}
// Probabilistic IoU between two oriented boxes
// (https://arxiv.org/pdf/2106.06072v1.pdf).
//
// Each box is given as centre (cx, cy), size (w, h) and rotation r; the size
// and rotation are converted to covariance terms by convariance_matrix().
// Returns 1 - sqrt(1 - exp(-bd) + eps), with bd clamped to [eps, 100].
static __device__ float box_probiou(float cx1, float cy1, float w1, float h1, float r1, float cx2, float cy2, float w2,
                                    float h2, float r2, float eps = 1e-7) {
    float a1, b1, c1, a2, b2, c2;
    convariance_matrix(w1, h1, r1, a1, b1, c1);
    convariance_matrix(w2, h2, r2, a2, b2, c2);

    // Summed covariance terms and their determinant-like combination.
    const float sum_a = a1 + a2;
    const float sum_b = b1 + b2;
    const float sum_c = c1 + c2;
    const float det_sum = sum_a * sum_b - powf(sum_c, 2);

    const float t1 = (sum_a * powf(cy1 - cy2, 2) + sum_b * powf(cx1 - cx2, 2)) / (det_sum + eps);
    const float t2 = (sum_c * (cx2 - cx1) * (cy1 - cy2)) / (det_sum + eps);
    const float t3 = logf(det_sum / (4 * sqrtf(fmaxf(a1 * b1 - c1 * c1, 0.0f)) *
                                         sqrtf(fmaxf(a2 * b2 - c2 * c2, 0.0f)) +
                                     eps) +
                          eps);

    float bd = 0.25f * t1 + 0.5f * t2 + 0.5f * t3;
    bd = fmaxf(fminf(bd, 100.0f), eps);
    const float hd = sqrtf(1.0f - expf(-bd) + eps);
    return 1 - hd;
}
// Launches decode_kernel with one thread per predicted box on `stream`.
// Does nothing when there are no boxes: a zero-sized grid is not a valid launch.
void cuda_decode(float* predict, int num_bboxes, float confidence_threshold, float* parray, int max_objects,
                 cudaStream_t stream) {
    if (num_bboxes <= 0)
        return;
    int block = 256;
    int grid = (num_bboxes + block - 1) / block;  // integer ceil-div
    decode_kernel<<<grid, block, 0, stream>>>((float*)predict, num_bboxes, confidence_threshold, parray,
                                              max_objects);
}

// Launches nms_kernel with one thread per output slot on `stream`.
// max_objects <= 0 previously produced a block size of 0 and a division by zero.
void cuda_nms(float* parray, float nms_threshold, int max_objects, cudaStream_t stream) {
    if (max_objects <= 0)
        return;
    int block = max_objects < 256 ? max_objects : 256;
    int grid = (max_objects + block - 1) / block;
    nms_kernel<<<grid, block, 0, stream>>>(parray, max_objects, nms_threshold);
}

// Oriented-box counterpart of cuda_decode.
void cuda_decode_obb(float* predict, int num_bboxes, float confidence_threshold, float* parray, int max_objects,
                     cudaStream_t stream) {
    if (num_bboxes <= 0)
        return;
    int block = 256;
    int grid = (num_bboxes + block - 1) / block;
    decode_kernel_obb<<<grid, block, 0, stream>>>((float*)predict, num_bboxes, confidence_threshold, parray,
                                                  max_objects);
}

// Oriented-box counterpart of cuda_nms.
void cuda_nms_obb(float* parray, float nms_threshold, int max_objects, cudaStream_t stream) {
    if (max_objects <= 0)
        return;
    int block = max_objects < 256 ? max_objects : 256;
    int grid = (max_objects + block - 1) / block;
    nms_kernel_obb<<<grid, block, 0, stream>>>(parray, max_objects, nms_threshold);
}
// Number of bytes allocated for each staging buffer by cuda_preprocess_init.
static size_t img_buffer_capacity = 0;

// Letterbox warp: one thread per destination pixel (`edge` = dst_width * dst_height).
// Maps the destination pixel back into the interleaved 3-channel source through d2s,
// samples it bilinearly (pixels outside the source use const_value_st), swaps channel 0
// and 2, scales to [0, 1] and writes the result in planar layout.
__global__ void warpaffine_kernel(uint8_t* src, int src_line_size, int src_width, int src_height, float* dst,
                                  int dst_width, int dst_height, uint8_t const_value_st, AffineMatrix d2s, int edge) {
    int position = blockDim.x * blockIdx.x + threadIdx.x;
    if (position >= edge)
        return;

    float m_x1 = d2s.value[0];
    float m_y1 = d2s.value[1];
    float m_z1 = d2s.value[2];
    float m_x2 = d2s.value[3];
    float m_y2 = d2s.value[4];
    float m_z2 = d2s.value[5];

    int dx = position % dst_width;
    int dy = position / dst_width;
    float src_x = m_x1 * dx + m_y1 * dy + m_z1 + 0.5f;
    float src_y = m_x2 * dx + m_y2 * dy + m_z2 + 0.5f;
    float c0, c1, c2;

    if (src_x <= -1 || src_x >= src_width || src_y <= -1 || src_y >= src_height) {
        // out of range
        c0 = const_value_st;
        c1 = const_value_st;
        c2 = const_value_st;
    } else {
        int y_low = floorf(src_y);
        int x_low = floorf(src_x);
        int y_high = y_low + 1;
        int x_high = x_low + 1;

        uint8_t const_value[] = {const_value_st, const_value_st, const_value_st};
        float ly = src_y - y_low;
        float lx = src_x - x_low;
        float hy = 1 - ly;
        float hx = 1 - lx;
        float w1 = hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx;
        // Neighbours that fall outside the image keep pointing at the padding colour.
        uint8_t* v1 = const_value;
        uint8_t* v2 = const_value;
        uint8_t* v3 = const_value;
        uint8_t* v4 = const_value;

        if (y_low >= 0) {
            if (x_low >= 0)
                v1 = src + y_low * src_line_size + x_low * 3;
            if (x_high < src_width)
                v2 = src + y_low * src_line_size + x_high * 3;
        }

        if (y_high < src_height) {
            if (x_low >= 0)
                v3 = src + y_high * src_line_size + x_low * 3;
            if (x_high < src_width)
                v4 = src + y_high * src_line_size + x_high * 3;
        }

        c0 = w1 * v1[0] + w2 * v2[0] + w3 * v3[0] + w4 * v4[0];
        c1 = w1 * v1[1] + w2 * v2[1] + w3 * v3[1] + w4 * v4[1];
        c2 = w1 * v1[2] + w2 * v2[2] + w3 * v3[2] + w4 * v4[2];
    }

    // bgr to rgb
    float t = c2;
    c2 = c0;
    c0 = t;

    // normalization
    c0 = c0 / 255.0f;
    c1 = c1 / 255.0f;
    c2 = c2 / 255.0f;

    // rgbrgbrgb to rrrgggbbb
    int area = dst_width * dst_height;
    float* pdst_c0 = dst + dy * dst_width + dx;
    float* pdst_c1 = pdst_c0 + area;
    float* pdst_c2 = pdst_c1 + area;
    *pdst_c0 = c0;
    *pdst_c1 = c1;
    *pdst_c2 = c2;
}

// Uploads one tightly packed 3-channel 8-bit image and letterboxes it into `dst`
// (3 * dst_width * dst_height floats) on `stream`. The upload and the kernel are
// asynchronous; the caller must synchronise before reusing the staging buffers.
//
// A null/empty image or one larger than the staging buffers is rejected instead of
// overrunning the pinned buffer. The failure is routed through CUDA_CHECK so it uses the
// file's existing error path (macro defined in cuda_utils.h, not visible here --
// NOTE(review): confirm it accepts a plain cudaError_t expression).
void cuda_preprocess(uint8_t* src, int src_width, int src_height, float* dst, int dst_width, int dst_height,
                     cudaStream_t stream) {
    if (src == nullptr || src_width <= 0 || src_height <= 0) {
        CUDA_CHECK(cudaErrorInvalidValue);
        return;
    }
    size_t img_size = static_cast<size_t>(src_width) * static_cast<size_t>(src_height) * 3;
    if (img_size > img_buffer_capacity) {
        CUDA_CHECK(cudaErrorInvalidValue);
        return;
    }

    // copy data to pinned memory
    memcpy(img_buffer_host, src, img_size);
    // copy data to device memory
    CUDA_CHECK(cudaMemcpyAsync(img_buffer_device, img_buffer_host, img_size, cudaMemcpyHostToDevice, stream));

    // Source -> destination: uniform scale that fits the image, centred in the target.
    AffineMatrix s2d, d2s;
    float scale = std::min(dst_height / (float)src_height, dst_width / (float)src_width);

    s2d.value[0] = scale;
    s2d.value[1] = 0;
    s2d.value[2] = -scale * src_width * 0.5 + dst_width * 0.5;
    s2d.value[3] = 0;
    s2d.value[4] = scale;
    s2d.value[5] = -scale * src_height * 0.5 + dst_height * 0.5;

    // The kernel walks destination pixels, so it needs the inverse mapping.
    cv::Mat m2x3_s2d(2, 3, CV_32F, s2d.value);
    cv::Mat m2x3_d2s(2, 3, CV_32F, d2s.value);
    cv::invertAffineTransform(m2x3_s2d, m2x3_d2s);

    memcpy(d2s.value, m2x3_d2s.ptr<float>(0), sizeof(d2s.value));

    int jobs = dst_height * dst_width;
    int threads = 256;
    int blocks = (jobs + threads - 1) / threads;
    if (blocks <= 0)
        return;  // empty destination: nothing to launch

    warpaffine_kernel<<<blocks, threads, 0, stream>>>(img_buffer_device, src_width * 3, src_width, src_height, dst,
                                                      dst_width, dst_height, 128, d2s, jobs);
}

// Preprocesses every image of the batch into consecutive 3 * dst_width * dst_height slots
// of `dst`. The stream is synchronised after each image because all images share the same
// pair of staging buffers.
void cuda_batch_preprocess(std::vector<cv::Mat>& img_batch, float* dst, int dst_width, int dst_height,
                           cudaStream_t stream) {
    int dst_size = dst_width * dst_height * 3;
    for (size_t i = 0; i < img_batch.size(); i++) {
        cuda_preprocess(img_batch[i].ptr(), img_batch[i].cols, img_batch[i].rows, &dst[dst_size * i], dst_width,
                        dst_height, stream);
        CUDA_CHECK(cudaStreamSynchronize(stream));
    }
}

// Allocates the pinned host and device staging buffers for images of up to
// max_image_size pixels (3 bytes per pixel).
void cuda_preprocess_init(int max_image_size) {
    size_t bytes = max_image_size > 0 ? static_cast<size_t>(max_image_size) * 3 : 0;
    // prepare input data in pinned memory
    CUDA_CHECK(cudaMallocHost((void**)&img_buffer_host, bytes));
    // prepare input data in device memory
    CUDA_CHECK(cudaMalloc((void**)&img_buffer_device, bytes));
    img_buffer_capacity = bytes;
}

// Releases the staging buffers and resets the bookkeeping so a stale pointer is never reused.
void cuda_preprocess_destroy() {
    CUDA_CHECK(cudaFree(img_buffer_device));
    CUDA_CHECK(cudaFreeHost(img_buffer_host));
    img_buffer_device = nullptr;
    img_buffer_host = nullptr;
    img_buffer_capacity = 0;
}
assert(serialized_engine); std::ofstream p(engine_name, std::ios::binary); if (!p) { std::cout << "could not open plan output file" << std::endl; assert(false); } p.write(reinterpret_cast(serialized_engine->data()), serialized_engine->size()); delete serialized_engine; delete config; delete builder; } void deserialize_engine(std::string& engine_name, IRuntime** runtime, ICudaEngine** engine, IExecutionContext** context) { std::ifstream file(engine_name, std::ios::binary); if (!file.good()) { std::cerr << "read " << engine_name << " error!" << std::endl; assert(false); } size_t size = 0; file.seekg(0, file.end); size = file.tellg(); file.seekg(0, file.beg); char* serialized_engine = new char[size]; assert(serialized_engine); file.read(serialized_engine, size); file.close(); *runtime = createInferRuntime(gLogger); assert(*runtime); *engine = (*runtime)->deserializeCudaEngine(serialized_engine, size); assert(*engine); *context = (*engine)->createExecutionContext(); assert(*context); delete[] serialized_engine; } void prepare_buffer(ICudaEngine* engine, float** input_buffer_device, float** output_buffer_device, float** output_buffer_host, float** decode_ptr_host, float** decode_ptr_device, std::string cuda_post_process) { assert(engine->getNbBindings() == 2); // In order to bind the buffers, we need to know the names of the input and // output tensors. 
Note that indices are guaranteed to be less than // IEngine::getNbBindings() const int inputIndex = engine->getBindingIndex(kInputTensorName); const int outputIndex = engine->getBindingIndex(kOutputTensorName); assert(inputIndex == 0); assert(outputIndex == 1); // Create GPU buffers on device CUDA_CHECK(cudaMalloc((void**)input_buffer_device, kBatchSize * 3 * kInputH * kInputW * sizeof(float))); CUDA_CHECK(cudaMalloc((void**)output_buffer_device, kBatchSize * kOutputSize * sizeof(float))); if (cuda_post_process == "c") { *output_buffer_host = new float[kBatchSize * kOutputSize]; } else if (cuda_post_process == "g") { if (kBatchSize > 1) { std::cerr << "Do not yet support GPU post processing for multiple batches" << std::endl; exit(0); } // Allocate memory for decode_ptr_host and copy to device *decode_ptr_host = new float[1 + kMaxNumOutputBbox * bbox_element]; CUDA_CHECK(cudaMalloc((void**)decode_ptr_device, sizeof(float) * (1 + kMaxNumOutputBbox * bbox_element))); } } void infer(IExecutionContext& context, cudaStream_t& stream, void** buffers, float* output, int batchsize, float* decode_ptr_host, float* decode_ptr_device, int model_bboxes, std::string cuda_post_process) { // infer on the batch asynchronously, and DMA output back to host auto start = std::chrono::system_clock::now(); context.enqueue(batchsize, buffers, stream, nullptr); if (cuda_post_process == "c") { CUDA_CHECK(cudaMemcpyAsync(output, buffers[1], batchsize * kOutputSize * sizeof(float), cudaMemcpyDeviceToHost, stream)); auto end = std::chrono::system_clock::now(); std::cout << "inference time: " << std::chrono::duration_cast(end - start).count() << "ms" << std::endl; } else if (cuda_post_process == "g") { CUDA_CHECK( cudaMemsetAsync(decode_ptr_device, 0, sizeof(float) * (1 + kMaxNumOutputBbox * bbox_element), stream)); cuda_decode((float*)buffers[1], model_bboxes, kConfThresh, decode_ptr_device, kMaxNumOutputBbox, stream); cuda_nms(decode_ptr_device, kNmsThresh, kMaxNumOutputBbox, stream); // cuda 
// Parses the command line.
//   -s <wts> <engine> <sub_type>   build mode; sub_type is n/s/m/l/x, optionally followed by '6'
//   -d <engine> <img_dir> <c|g>    inference mode with CPU or GPU post-processing
// In build mode the depth/width multipliers and the channel cap are derived from the first
// character of sub_type, and is_p is set to 6 for the two-character "*6" variants.
// Returns false when the arguments do not match either form. Outputs may already be
// partially written in that case.
bool parse_args(int argc, char** argv, std::string& wts, std::string& engine, int& is_p, std::string& img_dir,
                std::string& sub_type, std::string& cuda_post_process, float& gd, float& gw, int& max_channels) {
    if (argc < 4)
        return false;

    const std::string mode(argv[1]);
    if (mode == "-s" && (argc == 5 || argc == 7)) {
        wts = std::string(argv[2]);
        engine = std::string(argv[3]);
        // Write through the out-parameter. A local of the same name used to shadow it,
        // so the caller never saw the selected sub type.
        sub_type = std::string(argv[4]);

        if (sub_type[0] == 'n') {
            gd = 0.33;
            gw = 0.25;
            max_channels = 1024;
        } else if (sub_type[0] == 's') {
            gd = 0.33;
            gw = 0.50;
            max_channels = 1024;
        } else if (sub_type[0] == 'm') {
            gd = 0.67;
            gw = 0.75;
            max_channels = 576;
        } else if (sub_type[0] == 'l') {
            gd = 1.0;
            gw = 1.0;
            max_channels = 512;
        } else if (sub_type[0] == 'x') {
            gd = 1.33;
            gw = 1.25;
            max_channels = 640;
        } else {
            return false;
        }

        if (sub_type.size() == 2 && sub_type[1] == '6') {
            is_p = 6;
        }
    } else if (mode == "-d" && argc == 5) {
        engine = std::string(argv[2]);
        img_dir = std::string(argv[3]);
        cuda_post_process = std::string(argv[4]);
    } else {
        return false;
    }
    return true;
}
<< std::endl; std::cerr << "./yolov8_5u_det -s [.wts] [.engine] " "[n/s/m/l/x//n6/s6/m6/l6/x6] // serialize model to " "plan file" << std::endl; std::cerr << "./yolov8_5u_det -d [.engine] ../samples [c/g]// deserialize " "plan file and run inference" << std::endl; return -1; } // Create a model using the API directly and serialize it to a file if (!wts_name.empty()) { serialize_engine(wts_name, engine_name, is_p, sub_type, gd, gw, max_channels); return 0; } // Deserialize the engine from file IRuntime* runtime = nullptr; ICudaEngine* engine = nullptr; IExecutionContext* context = nullptr; deserialize_engine(engine_name, &runtime, &engine, &context); cudaStream_t stream; CUDA_CHECK(cudaStreamCreate(&stream)); cuda_preprocess_init(kMaxInputImageSize); auto out_dims = engine->getBindingDimensions(1); model_bboxes = out_dims.d[0]; // Prepare cpu and gpu buffers float* device_buffers[2]; float* output_buffer_host = nullptr; float* decode_ptr_host = nullptr; float* decode_ptr_device = nullptr; // Read images from directory std::vector file_names; if (read_files_in_dir(img_dir.c_str(), file_names) < 0) { std::cerr << "read_files_in_dir failed." 
<< std::endl; return -1; } prepare_buffer(engine, &device_buffers[0], &device_buffers[1], &output_buffer_host, &decode_ptr_host, &decode_ptr_device, cuda_post_process); // batch predict for (size_t i = 0; i < file_names.size(); i += kBatchSize) { // Get a batch of images std::vector img_batch; std::vector img_name_batch; for (size_t j = i; j < i + kBatchSize && j < file_names.size(); j++) { cv::Mat img = cv::imread(img_dir + "/" + file_names[j]); img_batch.push_back(img); img_name_batch.push_back(file_names[j]); } // Preprocess cuda_batch_preprocess(img_batch, device_buffers[0], kInputW, kInputH, stream); // Run inference infer(*context, stream, (void**)device_buffers, output_buffer_host, kBatchSize, decode_ptr_host, decode_ptr_device, model_bboxes, cuda_post_process); std::vector> res_batch; if (cuda_post_process == "c") { // NMS batch_nms(res_batch, output_buffer_host, img_batch.size(), kOutputSize, kConfThresh, kNmsThresh); } else if (cuda_post_process == "g") { // Process gpu decode and nms results batch_process(res_batch, decode_ptr_host, img_batch.size(), bbox_element, img_batch); } // Draw bounding boxes draw_bbox(img_batch, res_batch); // Save images for (size_t j = 0; j < img_batch.size(); j++) { cv::imwrite("_" + img_name_batch[j], img_batch[j]); } } // Release stream and buffers cudaStreamDestroy(stream); CUDA_CHECK(cudaFree(device_buffers[0])); CUDA_CHECK(cudaFree(device_buffers[1])); CUDA_CHECK(cudaFree(decode_ptr_device)); delete[] decode_ptr_host; delete[] output_buffer_host; cuda_preprocess_destroy(); // Destroy the engine delete context; delete engine; delete runtime; // Print histogram of the output distribution // std::cout << "\nOutput:\n\n"; // for (unsigned int i = 0; i < kOutputSize; i++) //{ // std::cout << prob[i] << ", "; // if (i % 10 == 0) std::cout << std::endl; //} // std::cout << std::endl; return 0; } ================================================ FILE: yolov8/yolov8_5u_det_trt.py ================================================ 
CONF_THRESH = 0.5
IOU_THRESHOLD = 0.4
# Per-detection record width is DET_NUM + SEG_NUM + POSE_NUM + OBB_NUM values.
POSE_NUM = 17 * 3
DET_NUM = 6
SEG_NUM = 32
OBB_NUM = 1


def get_img_path_batches(batch_size, img_dir):
    """
    description: Collect every file below img_dir (recursively) and split the paths into
                 consecutive batches. The last batch may be shorter than batch_size.
    param:
        batch_size: int, number of paths per batch, must be >= 1
        img_dir:    str, directory that is walked with os.walk
    return:
        a list of lists of file paths
    raise:
        ValueError if batch_size is smaller than 1
    """
    if batch_size < 1:
        raise ValueError("batch_size must be >= 1, got {}".format(batch_size))
    paths = [
        os.path.join(root, name)
        for root, _dirs, files in os.walk(img_dir)
        for name in files
    ]
    return [paths[i:i + batch_size] for i in range(0, len(paths), batch_size)]


def plot_one_box(x, img, color=None, label=None, line_thickness=None):
    """
    description: Plots one bounding box on image img,
                 this function comes from YoLov8 project.
    param:
        x:      a box likes [x1,y1,x2,y2]
        img:    a opencv image object
        color:  color to draw rectangle, such as (0,255,0); random when omitted
        label:  str, optional caption drawn in a filled box above the top-left corner
        line_thickness: int, derived from the image size when omitted
    return:
        no return
    """
    tl = (
        line_thickness or round(0.002 * (img.shape[0] + img.shape[1]) / 2) + 1
    )  # line/font thickness
    color = color or [random.randint(0, 255) for _ in range(3)]
    c1, c2 = (int(x[0]), int(x[1])), (int(x[2]), int(x[3]))
    cv2.rectangle(img, c1, c2, color, thickness=tl, lineType=cv2.LINE_AA)
    if label:
        tf = max(tl - 1, 1)  # font thickness
        t_size = cv2.getTextSize(label, 0, fontScale=tl / 3, thickness=tf)[0]
        c2 = c1[0] + t_size[0], c1[1] - t_size[1] - 3
        cv2.rectangle(img, c1, c2, color, -1, cv2.LINE_AA)  # filled
        cv2.putText(
            img,
            label,
            (c1[0], c1[1] - 2),
            0,
            tl / 3,
            [225, 255, 255],
            thickness=tf,
            lineType=cv2.LINE_AA,
        )


class YoLov8TRT(object):
    """
    description: A YOLOv8 class that warps TensorRT ops, preprocess and postprocess ops.
    """

    def __init__(self, engine_file_path):
        """
        description: Create a CUDA context on device 0, deserialize the engine and allocate
                     one page-locked host buffer plus one device buffer per binding.
        param:
            engine_file_path: str, path of the serialized TensorRT engine
        """
        # Create a Context on this device,
        self.ctx = cuda.Device(0).make_context()
        stream = cuda.Stream()
        TRT_LOGGER = trt.Logger(trt.Logger.INFO)
        runtime = trt.Runtime(TRT_LOGGER)

        # Deserialize the engine from file
        with open(engine_file_path, "rb") as f:
            engine = runtime.deserialize_cuda_engine(f.read())
        context = engine.create_execution_context()

        host_inputs = []
        cuda_inputs = []
        host_outputs = []
        cuda_outputs = []
        bindings = []

        for binding in engine:
            print('bingding:', binding, engine.get_binding_shape(binding))
            size = trt.volume(engine.get_binding_shape(binding)) * engine.max_batch_size
            dtype = trt.nptype(engine.get_binding_dtype(binding))
            # Allocate host and device buffers
            host_mem = cuda.pagelocked_empty(size, dtype)
            cuda_mem = cuda.mem_alloc(host_mem.nbytes)
            # Append the device buffer to device bindings.
            bindings.append(int(cuda_mem))
            # Append to the appropriate list.
            if engine.binding_is_input(binding):
                # The last two dimensions of the input binding are height and width.
                self.input_w = engine.get_binding_shape(binding)[-1]
                self.input_h = engine.get_binding_shape(binding)[-2]
                host_inputs.append(host_mem)
                cuda_inputs.append(cuda_mem)
            else:
                host_outputs.append(host_mem)
                cuda_outputs.append(cuda_mem)

        # Store
        self.stream = stream
        self.context = context
        self.engine = engine
        self.host_inputs = host_inputs
        self.cuda_inputs = cuda_inputs
        self.host_outputs = host_outputs
        self.cuda_outputs = cuda_outputs
        self.bindings = bindings
        self.batch_size = engine.max_batch_size
        self.det_output_length = host_outputs[0].shape[0]

    def infer(self, raw_image_generator):
        """
        description: Preprocess the images, run the engine once and draw the detections
                     on the original images.
        param:
            raw_image_generator: iterable of BGR images, at most batch_size of them
        return:
            batch_image_raw: list of the original images with boxes drawn on them
            use_time:        seconds spent on upload, inference and download
        """
        # Make self the active context, pushing it on top of the context stack.
        self.ctx.push()
        try:
            # Restore
            stream = self.stream
            context = self.context
            host_inputs = self.host_inputs
            cuda_inputs = self.cuda_inputs
            host_outputs = self.host_outputs
            cuda_outputs = self.cuda_outputs
            bindings = self.bindings
            # Do image preprocess
            batch_image_raw = []
            batch_origin_h = []
            batch_origin_w = []
            # Zero-filled so that unused slots of a short final batch hold defined values.
            batch_input_image = np.zeros(shape=[self.batch_size, 3, self.input_h, self.input_w])
            for i, image_raw in enumerate(raw_image_generator):
                input_image, image_raw, origin_h, origin_w = self.preprocess_image(image_raw)
                batch_image_raw.append(image_raw)
                batch_origin_h.append(origin_h)
                batch_origin_w.append(origin_w)
                np.copyto(batch_input_image[i], input_image)
            batch_input_image = np.ascontiguousarray(batch_input_image)

            # Copy input image to host buffer
            np.copyto(host_inputs[0], batch_input_image.ravel())
            start = time.time()
            # Transfer input data  to the GPU.
            cuda.memcpy_htod_async(cuda_inputs[0], host_inputs[0], stream)
            # Run inference.
            context.execute_async(batch_size=self.batch_size, bindings=bindings, stream_handle=stream.handle)
            # Transfer predictions back from the GPU.
            cuda.memcpy_dtoh_async(host_outputs[0], cuda_outputs[0], stream)
            # Synchronize the stream
            stream.synchronize()
            end = time.time()
        finally:
            # Remove any context from the top of the context stack, deactivating it,
            # even when preprocessing or inference raised.
            self.ctx.pop()

        output = host_outputs[0]
        # Do postprocess. Only the images actually supplied are processed: the final batch
        # of a directory may hold fewer than batch_size images.
        for i in range(len(batch_image_raw)):
            result_boxes, result_scores, result_classid = self.post_process(
                output[i * self.det_output_length: (i + 1) * self.det_output_length],
                batch_origin_h[i], batch_origin_w[i]
            )
            # Draw rectangles and labels on the original image
            # (`categories` is the module-level label list set up by the script entry point).
            for j in range(len(result_boxes)):
                box = result_boxes[j]
                plot_one_box(
                    box,
                    batch_image_raw[i],
                    label="{}:{:.2f}".format(
                        categories[int(result_classid[j])], result_scores[j]
                    ),
                )
        return batch_image_raw, end - start

    def destroy(self):
        """
        description: Release the CUDA context created in __init__.
        """
        # Remove any context from the top of the context stack, deactivating it.
        self.ctx.pop()

    def get_raw_image(self, image_path_batch):
        """
        description: Read an image from image path
        """
        for img_path in image_path_batch:
            yield cv2.imread(img_path)

    def get_raw_image_zeros(self, image_path_batch=None):
        """
        description: Ready data for warmup: batch_size black images of the network input size
        """
        for _ in range(self.batch_size):
            yield np.zeros([self.input_h, self.input_w, 3], dtype=np.uint8)

    def preprocess_image(self, raw_bgr_image):
        """
        description: Convert BGR image to RGB,
                     resize and pad it to target size, normalize to [0,1],
                     transform to NCHW format.
        param:
            raw_bgr_image: HxWx3 BGR image as returned by cv2.imread
        return:
            image:  the processed image, shape (1, 3, input_h, input_w), float32
            image_raw: the original image
            h: original height
            w: original width
        raise:
            ValueError if raw_bgr_image is None (cv2.imread could not read the file)
        """
        if raw_bgr_image is None:
            raise ValueError("image could not be read (cv2.imread returned None)")
        image_raw = raw_bgr_image
        h, w, c = image_raw.shape
        image = cv2.cvtColor(image_raw, cv2.COLOR_BGR2RGB)
        # Calculate widht and height and paddings
        r_w = self.input_w / w
        r_h = self.input_h / h
        if r_h > r_w:
            # Width is the limiting side: pad top and bottom.
            tw = self.input_w
            th = int(r_w * h)
            tx1 = tx2 = 0
            ty1 = int((self.input_h - th) / 2)
            ty2 = self.input_h - th - ty1
        else:
            # Height is the limiting side: pad left and right.
            tw = int(r_h * w)
            th = self.input_h
            tx1 = int((self.input_w - tw) / 2)
            tx2 = self.input_w - tw - tx1
            ty1 = ty2 = 0
        # Resize the image with long side while maintaining ratio
        image = cv2.resize(image, (tw, th))
        # Pad the short side with (128,128,128)
        image = cv2.copyMakeBorder(
            image, ty1, ty2, tx1, tx2, cv2.BORDER_CONSTANT, None, (128, 128, 128)
        )
        image = image.astype(np.float32)
        # Normalize to [0,1]
        image /= 255.0
        # HWC to CHW format:
        image = np.transpose(image, [2, 0, 1])
        # CHW to NCHW format
        image = np.expand_dims(image, axis=0)
        # Convert the image to row-major order, also known as "C order":
        image = np.ascontiguousarray(image)
        return image, image_raw, h, w

    def xywh2xyxy(self, origin_h, origin_w, x):
        """
        description:    Map nx4 corner boxes [x1, y1, x2, y2] from the letterboxed network
                        input back to the original image: the padding offset is removed from
                        the padded axis and all coordinates are divided by the resize ratio.
                        Despite its name, no centre/size conversion is performed.
        param:
            origin_h:   height of original image
            origin_w:   width of original image
            x:          A boxes numpy, each row is a box [x1, y1, x2, y2] in input coordinates
        return:
            y:          A boxes numpy, each row is a box [x1, y1, x2, y2] in original coordinates
        """
        y = np.zeros_like(x)
        r_w = self.input_w / origin_w
        r_h = self.input_h / origin_h
        if r_h > r_w:
            # Padding was added above and below the image.
            y[:, 0] = x[:, 0]
            y[:, 2] = x[:, 2]
            y[:, 1] = x[:, 1] - (self.input_h - r_w * origin_h) / 2
            y[:, 3] = x[:, 3] - (self.input_h - r_w * origin_h) / 2
            y /= r_w
        else:
            # Padding was added left and right of the image.
            y[:, 0] = x[:, 0] - (self.input_w - r_h * origin_w) / 2
            y[:, 2] = x[:, 2] - (self.input_w - r_h * origin_w) / 2
            y[:, 1] = x[:, 1]
            y[:, 3] = x[:, 3]
            y /= r_h

        return y

    def post_process(self, output, origin_h, origin_w):
        """
        description: postprocess the prediction
        param:
            output:     A flat numpy array: the number of boxes followed by fixed-width
                        detection records whose first six values are
                        x1, y1, x2, y2, conf, cls_id
            origin_h:   height of original image
            origin_w:   width of original image
        return:
            result_boxes: finally boxes, a boxes numpy, each row is a box [x1, y1, x2, y2]
            result_scores: finally scores, a numpy, each element is the score correspoing to box
            result_classid: finally classid, a numpy, each element is the classid correspoing to box
        """
        num_values_per_detection = DET_NUM + SEG_NUM + POSE_NUM + OBB_NUM
        # Get the num of boxes detected
        num = int(output[0])
        # Reshape to a two dimentional ndarray
        pred = np.reshape(output[1:], (-1, num_values_per_detection))[:num, :]
        # Do nms
        boxes = self.non_max_suppression(pred, origin_h, origin_w, conf_thres=CONF_THRESH, nms_thres=IOU_THRESHOLD)
        result_boxes = boxes[:, :4] if len(boxes) else np.array([])
        result_scores = boxes[:, 4] if len(boxes) else np.array([])
        result_classid = boxes[:, 5] if len(boxes) else np.array([])
        return result_boxes, result_scores, result_classid

    def bbox_iou(self, box1, box2, x1y1x2y2=True):
        """
        description: compute the IoU of two bounding boxes (pixel-inclusive, i.e. widths
                     and heights are counted as x2 - x1 + 1)
        param:
            box1: A box coordinate (can be (x1, y1, x2, y2) or (x, y, w, h))
            box2: A box coordinate (can be (x1, y1, x2, y2) or (x, y, w, h))
            x1y1x2y2: select the coordinate format
        return:
            iou: computed iou
        """
        if not x1y1x2y2:
            # Transform from center and width to exact coordinates
            b1_x1, b1_x2 = box1[:, 0] - box1[:, 2] / 2, box1[:, 0] + box1[:, 2] / 2
            b1_y1, b1_y2 = box1[:, 1] - box1[:, 3] / 2, box1[:, 1] + box1[:, 3] / 2
            b2_x1, b2_x2 = box2[:, 0] - box2[:, 2] / 2, box2[:, 0] + box2[:, 2] / 2
            b2_y1, b2_y2 = box2[:, 1] - box2[:, 3] / 2, box2[:, 1] + box2[:, 3] / 2
        else:
            # Get the coordinates of bounding boxes
            b1_x1, b1_y1, b1_x2, b1_y2 = box1[:, 0], box1[:, 1], box1[:, 2], box1[:, 3]
            b2_x1, b2_y1, b2_x2, b2_y2 = box2[:, 0], box2[:, 1], box2[:, 2], box2[:, 3]

        # Get the coordinates of the intersection rectangle
        inter_rect_x1 = np.maximum(b1_x1, b2_x1)
        inter_rect_y1 = np.maximum(b1_y1, b2_y1)
        inter_rect_x2 = np.minimum(b1_x2, b2_x2)
        inter_rect_y2 = np.minimum(b1_y2, b2_y2)
        # Intersection area
        inter_area = (np.clip(inter_rect_x2 - inter_rect_x1 + 1, 0, None)
                      * np.clip(inter_rect_y2 - inter_rect_y1 + 1, 0, None))
        # Union Area
        b1_area = (b1_x2 - b1_x1 + 1) * (b1_y2 - b1_y1 + 1)
        b2_area = (b2_x2 - b2_x1 + 1) * (b2_y2 - b2_y1 + 1)

        iou = inter_area / (b1_area + b2_area - inter_area + 1e-16)

        return iou

    def non_max_suppression(self, prediction, origin_h, origin_w, conf_thres=0.5, nms_thres=0.4):
        """
        description: Removes detections with lower object confidence score than 'conf_thres' and performs
        Non-Maximum Suppression to further filter detections.
        param:
            prediction: detections, each row starts with (x1, y1, x2, y2, conf, cls_id)
                        and may carry further per-detection values after them
            origin_h: original image height
            origin_w: original image width
            conf_thres: a confidence threshold to filter detections
            nms_thres: a iou threshold to filter detections
        return:
            boxes: output after nms, rows in the same layout as prediction
        """
        # Get the boxes that score > CONF_THRESH
        boxes = prediction[prediction[:, 4] >= conf_thres]
        # Map the boxes from network-input coordinates back to the original image
        boxes[:, :4] = self.xywh2xyxy(origin_h, origin_w, boxes[:, :4])
        # clip the coordinates
        boxes[:, 0] = np.clip(boxes[:, 0], 0, origin_w - 1)
        boxes[:, 2] = np.clip(boxes[:, 2], 0, origin_w - 1)
        boxes[:, 1] = np.clip(boxes[:, 1], 0, origin_h - 1)
        boxes[:, 3] = np.clip(boxes[:, 3], 0, origin_h - 1)
        # Object confidence
        confs = boxes[:, 4]
        # Sort by the confs
        boxes = boxes[np.argsort(-confs)]
        # Perform non-maximum suppression
        keep_boxes = []
        while boxes.shape[0]:
            large_overlap = self.bbox_iou(np.expand_dims(boxes[0, :4], 0), boxes[:, :4]) > nms_thres
            # The class id lives in column 5. The last column of the wide record is not the
            # label, so indexing with -1 made the suppression class-agnostic.
            label_match = boxes[0, 5] == boxes[:, 5]
            # Indices of boxes with lower confidence scores, large IOUs and matching labels
            invalid = large_overlap & label_match
            keep_boxes += [boxes[0]]
            boxes = boxes[~invalid]
        boxes = np.stack(keep_boxes, 0) if len(keep_boxes) else np.array([])
        return boxes


class inferThread(threading.Thread):
    """
    description: Thread that runs inference on one batch of image paths and saves the
                 annotated images into output/.
    """

    def __init__(self, yolov8_wrapper, image_path_batch):
        threading.Thread.__init__(self)
        self.yolov8_wrapper = yolov8_wrapper
        self.image_path_batch = image_path_batch

    def run(self):
        batch_image_raw, use_time = self.yolov8_wrapper.infer(self.yolov8_wrapper.get_raw_image(self.image_path_batch))
        for i, img_path in enumerate(self.image_path_batch):
            parent, filename = os.path.split(img_path)
            save_name = os.path.join('output', filename)
            # Save image
            cv2.imwrite(save_name, batch_image_raw[i])
        print('input->{}, time->{:.2f}ms, saving into output/'.format(self.image_path_batch, use_time * 1000))
class warmUpThread(threading.Thread):
    """
    description: Thread that runs one inference on all-zero images to warm the engine up.
    """

    def __init__(self, yolov8_wrapper):
        threading.Thread.__init__(self)
        self.yolov8_wrapper = yolov8_wrapper

    def run(self):
        wrapper = self.yolov8_wrapper
        batch_image_raw, use_time = wrapper.infer(wrapper.get_raw_image_zeros())
        print('warm_up->{}, time->{:.2f}ms'.format(batch_image_raw[0].shape, use_time * 1000))


if __name__ == "__main__":
    # load custom plugin and engine; both paths can be overridden on the command line
    PLUGIN_LIBRARY = "./build/libmyplugins.so"
    engine_file_path = "yolov5xu.engine"

    if len(sys.argv) > 1:
        engine_file_path = sys.argv[1]
    if len(sys.argv) > 2:
        PLUGIN_LIBRARY = sys.argv[2]

    ctypes.CDLL(PLUGIN_LIBRARY)

    # load coco labels

    categories = ["person", "bicycle", "car", "motorcycle", "airplane", "bus", "train", "truck", "boat",
                  "traffic light",
                  "fire hydrant", "stop sign", "parking meter", "bench", "bird", "cat", "dog", "horse", "sheep", "cow",
                  "elephant", "bear", "zebra", "giraffe", "backpack", "umbrella", "handbag", "tie", "suitcase",
                  "frisbee",
                  "skis", "snowboard", "sports ball", "kite", "baseball bat", "baseball glove", "skateboard",
                  "surfboard",
                  "tennis racket", "bottle", "wine glass", "cup", "fork", "knife", "spoon", "bowl", "banana", "apple",
                  "sandwich", "orange", "broccoli", "carrot", "hot dog", "pizza", "donut", "cake", "chair", "couch",
                  "potted plant", "bed", "dining table", "toilet", "tv", "laptop", "mouse", "remote", "keyboard",
                  "cell phone",
                  "microwave", "oven", "toaster", "sink", "refrigerator", "book", "clock", "vase", "scissors",
                  "teddy bear",
                  "hair drier", "toothbrush"]

    # start from an empty output directory
    if os.path.exists('output/'):
        shutil.rmtree('output/')
    os.makedirs('output/')
    # a YoLov8TRT instance
    yolov8_wrapper = YoLov8TRT(engine_file_path)
    try:
        print('batch size is', yolov8_wrapper.batch_size)

        image_dir = "images/"
        image_path_batches = get_img_path_batches(yolov8_wrapper.batch_size, image_dir)

        for _ in range(10):
            # create a new thread to do warm_up
            warm_up = warmUpThread(yolov8_wrapper)
            warm_up.start()
            warm_up.join()
        for batch in image_path_batches:
            # create a new thread to do inference
            worker = inferThread(yolov8_wrapper, batch)
            worker.start()
            worker.join()
    finally:
        # destroy the instance
        yolov8_wrapper.destroy()
thread1.start() thread1.join() finally: # destroy the instance yolov8_wrapper.destroy() ================================================ FILE: yolov8/yolov8_cls.cpp ================================================ #include "calibrator.h" #include "config.h" #include "cuda_utils.h" #include "logging.h" #include "model.h" #include "utils.h" #include #include #include #include #include using namespace nvinfer1; static Logger gLogger; const static int kOutputSize = kClsNumClass; void batch_preprocess(std::vector& imgs, float* output, int dst_width = 224, int dst_height = 224) { for (size_t b = 0; b < imgs.size(); b++) { int h = imgs[b].rows; int w = imgs[b].cols; int m = std::min(h, w); int top = (h - m) / 2; int left = (w - m) / 2; cv::Mat img = imgs[b](cv::Rect(left, top, m, m)); cv::resize(img, img, cv::Size(dst_width, dst_height), 0, 0, cv::INTER_LINEAR); cv::cvtColor(img, img, cv::COLOR_BGR2RGB); img.convertTo(img, CV_32F, 1 / 255.0); std::vector channels(3); cv::split(img, channels); // CHW format for (int c = 0; c < 3; ++c) { int i = 0; for (int row = 0; row < dst_height; ++row) { for (int col = 0; col < dst_width; ++col) { output[b * 3 * dst_height * dst_width + c * dst_height * dst_width + i] = channels[c].at(row, col); ++i; } } } } } std::vector softmax(float* prob, int n) { std::vector res; float sum = 0.0f; float t; for (int i = 0; i < n; i++) { t = expf(prob[i]); res.push_back(t); sum += t; } for (int i = 0; i < n; i++) { res[i] /= sum; } return res; } std::vector topk(const std::vector& vec, int k) { std::vector topk_index; std::vector vec_index(vec.size()); std::iota(vec_index.begin(), vec_index.end(), 0); std::sort(vec_index.begin(), vec_index.end(), [&vec](size_t index_1, size_t index_2) { return vec[index_1] > vec[index_2]; }); int k_num = std::min(vec.size(), k); for (int i = 0; i < k_num; ++i) { topk_index.push_back(vec_index[i]); } return topk_index; } std::vector read_classes(std::string file_name) { std::vector classes; std::ifstream 
ifs(file_name, std::ios::in); if (!ifs.is_open()) { std::cerr << file_name << " is not found, pls refer to README and download it." << std::endl; assert(0); } std::string s; while (std::getline(ifs, s)) { classes.push_back(s); } ifs.close(); return classes; } bool parse_args(int argc, char** argv, std::string& wts, std::string& engine, float& gd, float& gw, std::string& img_dir) { if (argc < 4) return false; if (std::string(argv[1]) == "-s" && (argc == 5)) { wts = std::string(argv[2]); engine = std::string(argv[3]); auto net = std::string(argv[4]); if (net[0] == 'n') { gd = 0.33; gw = 0.25; } else if (net[0] == 's') { gd = 0.33; gw = 0.50; } else if (net[0] == 'm') { gd = 0.67; gw = 0.75; } else if (net[0] == 'l') { gd = 1.0; gw = 1.0; } else if (net[0] == 'x') { gd = 1.0; gw = 1.25; } else { return false; } } else if (std::string(argv[1]) == "-d" && argc == 4) { engine = std::string(argv[2]); img_dir = std::string(argv[3]); } else { return false; } return true; } void prepare_buffers(ICudaEngine* engine, float** gpu_input_buffer, float** gpu_output_buffer, float** cpu_input_buffer, float** output_buffer_host) { assert(engine->getNbBindings() == 2); // In order to bind the buffers, we need to know the names of the input and output tensors. 
// Note that indices are guaranteed to be less than IEngine::getNbBindings() const int inputIndex = engine->getBindingIndex(kInputTensorName); const int outputIndex = engine->getBindingIndex(kOutputTensorName); assert(inputIndex == 0); assert(outputIndex == 1); // Create GPU buffers on device CUDA_CHECK(cudaMalloc((void**)gpu_input_buffer, kBatchSize * 3 * kClsInputH * kClsInputW * sizeof(float))); CUDA_CHECK(cudaMalloc((void**)gpu_output_buffer, kBatchSize * kOutputSize * sizeof(float))); *cpu_input_buffer = new float[kBatchSize * 3 * kClsInputH * kClsInputW]; *output_buffer_host = new float[kBatchSize * kOutputSize]; } void infer(IExecutionContext& context, cudaStream_t& stream, void** buffers, float* input, float* output, int batchSize) { CUDA_CHECK(cudaMemcpyAsync(buffers[0], input, batchSize * 3 * kClsInputH * kClsInputW * sizeof(float), cudaMemcpyHostToDevice, stream)); context.enqueue(batchSize, buffers, stream, nullptr); CUDA_CHECK(cudaMemcpyAsync(output, buffers[1], batchSize * kOutputSize * sizeof(float), cudaMemcpyDeviceToHost, stream)); cudaStreamSynchronize(stream); } void serialize_engine(unsigned int max_batchsize, float& gd, float& gw, std::string& wts_name, std::string& engine_name) { // Create builder IBuilder* builder = createInferBuilder(gLogger); IBuilderConfig* config = builder->createBuilderConfig(); // Create model to populate the network, then set the outputs and create an engine IHostMemory* serialized_engine = nullptr; //engine = buildEngineYolov8Cls(max_batchsize, builder, config, DataType::kFLOAT, gd, gw, wts_name); serialized_engine = buildEngineYolov8Cls(builder, config, DataType::kFLOAT, wts_name, gd, gw); assert(serialized_engine); // Save engine to file std::ofstream p(engine_name, std::ios::binary); if (!p) { std::cerr << "Could not open plan output file" << std::endl; assert(false); } p.write(reinterpret_cast(serialized_engine->data()), serialized_engine->size()); // Close everything down delete serialized_engine; delete config; 
delete builder; } void deserialize_engine(std::string& engine_name, IRuntime** runtime, ICudaEngine** engine, IExecutionContext** context) { std::ifstream file(engine_name, std::ios::binary); if (!file.good()) { std::cerr << "read " << engine_name << " error!" << std::endl; assert(false); } size_t size = 0; file.seekg(0, file.end); size = file.tellg(); file.seekg(0, file.beg); char* serialized_engine = new char[size]; assert(serialized_engine); file.read(serialized_engine, size); file.close(); *runtime = createInferRuntime(gLogger); assert(*runtime); *engine = (*runtime)->deserializeCudaEngine(serialized_engine, size); assert(*engine); *context = (*engine)->createExecutionContext(); assert(*context); delete[] serialized_engine; } int main(int argc, char** argv) { cudaSetDevice(kGpuId); std::string wts_name = ""; std::string engine_name = ""; float gd = 0.0f, gw = 0.0f; std::string img_dir; if (!parse_args(argc, argv, wts_name, engine_name, gd, gw, img_dir)) { std::cerr << "arguments not right!" 
<< std::endl; std::cerr << "./yolov8_cls -s [.wts] [.engine] [n/s/m/l/x or c gd gw] // serialize model to plan file" << std::endl; std::cerr << "./yolov8_cls -d [.engine] ../samples // deserialize plan file and run inference" << std::endl; return -1; } // Create a model using the API directly and serialize it to a file if (!wts_name.empty()) { serialize_engine(kBatchSize, gd, gw, wts_name, engine_name); return 0; } // Deserialize the engine from file IRuntime* runtime = nullptr; ICudaEngine* engine = nullptr; IExecutionContext* context = nullptr; deserialize_engine(engine_name, &runtime, &engine, &context); cudaStream_t stream; CUDA_CHECK(cudaStreamCreate(&stream)); // Prepare cpu and gpu buffers float* device_buffers[2]; float* cpu_input_buffer = nullptr; float* output_buffer_host = nullptr; prepare_buffers(engine, &device_buffers[0], &device_buffers[1], &cpu_input_buffer, &output_buffer_host); // Read images from directory std::vector file_names; if (read_files_in_dir(img_dir.c_str(), file_names) < 0) { std::cerr << "read_files_in_dir failed." 
<< std::endl; return -1; } // Read imagenet labels auto classes = read_classes("imagenet_classes.txt"); // batch predict for (size_t i = 0; i < file_names.size(); i += kBatchSize) { // Get a batch of images std::vector img_batch; std::vector img_name_batch; for (size_t j = i; j < i + kBatchSize && j < file_names.size(); j++) { cv::Mat img = cv::imread(img_dir + "/" + file_names[j]); img_batch.push_back(img); img_name_batch.push_back(file_names[j]); } // Preprocess batch_preprocess(img_batch, cpu_input_buffer); // Run inference auto start = std::chrono::system_clock::now(); infer(*context, stream, (void**)device_buffers, cpu_input_buffer, output_buffer_host, kBatchSize); auto end = std::chrono::system_clock::now(); std::cout << "inference time: " << std::chrono::duration_cast(end - start).count() << "ms" << std::endl; // Postprocess and get top-k result for (size_t b = 0; b < img_name_batch.size(); b++) { float* p = &output_buffer_host[b * kOutputSize]; auto res = softmax(p, kOutputSize); auto topk_idx = topk(res, 3); std::cout << img_name_batch[b] << std::endl; for (auto idx : topk_idx) { std::cout << " " << classes[idx] << " " << res[idx] << std::endl; } } } // Release stream and buffers cudaStreamDestroy(stream); CUDA_CHECK(cudaFree(device_buffers[0])); CUDA_CHECK(cudaFree(device_buffers[1])); delete[] cpu_input_buffer; delete[] output_buffer_host; // Destroy the engine delete context; delete engine; delete runtime; return 0; } ================================================ FILE: yolov8/yolov8_cls_trt.py ================================================ """ An example that uses TensorRT's Python api to make inferences. 
def get_img_path_batches(batch_size, img_dir):
    """
    description: Collect every file found under img_dir (recursively, via
                 os.walk) and group the paths into batches.
    param:
        batch_size: int, maximum number of paths per batch, must be >= 1
        img_dir:    str, directory to scan
    return:
        a list of batches, each batch being a list of file paths; every batch
        holds batch_size paths except possibly the last one. An empty
        directory yields an empty list.
    raise:
        ValueError if batch_size is smaller than 1 (it would otherwise
        silently produce empty or ill-sized batches).
    """
    if batch_size < 1:
        raise ValueError(
            "batch_size must be >= 1, got {}".format(batch_size))

    # Paths are kept in the order os.walk reports them.
    paths = [
        os.path.join(root, name)
        for root, _dirs, files in os.walk(img_dir)
        for name in files
    ]
    return [paths[i:i + batch_size] for i in range(0, len(paths), batch_size)]
def infer(self, raw_image_generator):
    """
    description: Preprocess a batch of images, run the engine on it and
                 write the predicted class name onto each image.
    param:
        raw_image_generator: iterable yielding at most self.batch_size BGR
                             images (numpy arrays)
    return:
        batch_image_raw: list of the input images, annotated in place
        use_time:        float, seconds spent on host->device copy,
                         inference and device->host copy
    """
    # Make self the active context, pushing it on top of the context stack.
    self.ctx.push()
    try:
        # Restore
        stream = self.stream
        context = self.context
        host_inputs = self.host_inputs
        cuda_inputs = self.cuda_inputs
        host_outputs = self.host_outputs
        cuda_outputs = self.cuda_outputs
        bindings = self.bindings

        # Do image preprocess. The buffer is zero-filled so that slots left
        # unused by a short (last) batch hold defined values.
        batch_image_raw = []
        batch_input_image = np.zeros(
            shape=[self.batch_size, 3, self.input_h, self.input_w])
        for i, image_raw in enumerate(raw_image_generator):
            batch_image_raw.append(image_raw)
            input_image = self.preprocess_cls_image(image_raw)
            np.copyto(batch_input_image[i], input_image)
        batch_input_image = np.ascontiguousarray(batch_input_image)

        # Copy input image to host buffer
        np.copyto(host_inputs[0], batch_input_image.ravel())
        start = time.time()
        # Transfer input data to the GPU.
        cuda.memcpy_htod_async(cuda_inputs[0], host_inputs[0], stream)
        # Run inference.
        context.execute_async(batch_size=self.batch_size,
                              bindings=bindings, stream_handle=stream.handle)
        # Transfer predictions back from the GPU.
        cuda.memcpy_dtoh_async(host_outputs[0], cuda_outputs[0], stream)
        # Synchronize the stream
        stream.synchronize()
        end = time.time()
    finally:
        # Always deactivate the context, even when preprocessing or the
        # inference raised, so the context stack stays balanced.
        self.ctx.pop()

    # postprocess_cls returns one entry per batch row, so it is called once
    # and entry i belongs to image i.
    classes_ls, predicted_conf_ls, _ = self.postprocess_cls(host_outputs[0])
    # Iterate over the images actually supplied: the last batch of a
    # directory may contain fewer than self.batch_size images.
    for i, image in enumerate(batch_image_raw):
        cv2.putText(image, str(classes_ls[i]), (10, 50),
                    cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 1, cv2.LINE_AA)
        print(classes_ls[i], predicted_conf_ls[i])
    return batch_image_raw, end - start
def postprocess_cls(self, output_data):
    """
    description: Turn the raw network output of a batch into the top-1
                 class name, confidence and class id of every batch row.
    param:
        output_data: numpy array with self.batch_size * num_classes raw
                     scores; it is reshaped to (self.batch_size, -1)
    return:
        classes_ls:        list of class names looked up in the module-level
                           `classes` list, one per batch row
        predicted_conf_ls: list of softmax confidences, one per batch row
        category_id_ls:    list of class indices, one per batch row
    """
    classes_ls = []
    predicted_conf_ls = []
    category_id_ls = []

    logits = torch.Tensor(output_data.reshape(self.batch_size, -1))
    probs = torch.nn.functional.softmax(logits, dim=1)

    # Only the best entry of each row is used below. Capping k at the number
    # of classes keeps models with fewer than three classes from failing in
    # torch.topk.
    k = min(3, probs.shape[1])
    score, index = torch.topk(probs, k)

    for row in range(index.shape[0]):
        category_id = index[row][0].item()
        category_id_ls.append(category_id)
        predicted_conf_ls.append(score[row][0].item())
        classes_ls.append(classes[category_id])
    return classes_ls, predicted_conf_ls, category_id_ls
class warmUpThread(threading.Thread):
    """
    description: Thread that runs one inference pass on the wrapper's
                 zero-filled warm-up images and prints the elapsed time.
    """

    def __init__(self, yolov8_wrapper):
        super().__init__()
        # Object providing infer() and get_raw_image_zeros().
        self.yolov8_wrapper = yolov8_wrapper

    def run(self):
        wrapper = self.yolov8_wrapper
        warmup_images = wrapper.get_raw_image_zeros()
        batch_image_raw, use_time = wrapper.infer(warmup_images)
        first_shape = batch_image_raw[0].shape
        elapsed_ms = use_time * 1000
        print('warm_up->{}, time->{:.2f}ms'.format(first_shape, elapsed_ms))
builder->createBuilderConfig(); IHostMemory* serialized_engine = nullptr; if (is_p == 6) { serialized_engine = buildEngineYolov8DetP6(builder, config, DataType::kFLOAT, wts_name, gd, gw, max_channels); } else if (is_p == 2) { serialized_engine = buildEngineYolov8DetP2(builder, config, DataType::kFLOAT, wts_name, gd, gw, max_channels); } else { serialized_engine = buildEngineYolov8Det(builder, config, DataType::kFLOAT, wts_name, gd, gw, max_channels); } assert(serialized_engine); std::ofstream p(engine_name, std::ios::binary); if (!p) { std::cout << "could not open plan output file" << std::endl; assert(false); } p.write(reinterpret_cast(serialized_engine->data()), serialized_engine->size()); delete serialized_engine; delete config; delete builder; } void deserialize_engine(std::string& engine_name, IRuntime** runtime, ICudaEngine** engine, IExecutionContext** context) { std::ifstream file(engine_name, std::ios::binary); if (!file.good()) { std::cerr << "read " << engine_name << " error!" << std::endl; assert(false); } size_t size = 0; file.seekg(0, file.end); size = file.tellg(); file.seekg(0, file.beg); char* serialized_engine = new char[size]; assert(serialized_engine); file.read(serialized_engine, size); file.close(); *runtime = createInferRuntime(gLogger); assert(*runtime); *engine = (*runtime)->deserializeCudaEngine(serialized_engine, size); assert(*engine); *context = (*engine)->createExecutionContext(); assert(*context); delete[] serialized_engine; } void prepare_buffer(ICudaEngine* engine, float** input_buffer_device, float** output_buffer_device, float** output_buffer_host, float** decode_ptr_host, float** decode_ptr_device, std::string cuda_post_process) { assert(engine->getNbBindings() == 2); // In order to bind the buffers, we need to know the names of the input and output tensors. 
// Note that indices are guaranteed to be less than IEngine::getNbBindings() const int inputIndex = engine->getBindingIndex(kInputTensorName); const int outputIndex = engine->getBindingIndex(kOutputTensorName); assert(inputIndex == 0); assert(outputIndex == 1); // Create GPU buffers on device CUDA_CHECK(cudaMalloc((void**)input_buffer_device, kBatchSize * 3 * kInputH * kInputW * sizeof(float))); CUDA_CHECK(cudaMalloc((void**)output_buffer_device, kBatchSize * kOutputSize * sizeof(float))); if (cuda_post_process == "c") { *output_buffer_host = new float[kBatchSize * kOutputSize]; } else if (cuda_post_process == "g") { if (kBatchSize > 1) { std::cerr << "Do not yet support GPU post processing for multiple batches" << std::endl; exit(0); } // Allocate memory for decode_ptr_host and copy to device *decode_ptr_host = new float[1 + kMaxNumOutputBbox * bbox_element]; CUDA_CHECK(cudaMalloc((void**)decode_ptr_device, sizeof(float) * (1 + kMaxNumOutputBbox * bbox_element))); } } void infer(IExecutionContext& context, cudaStream_t& stream, void** buffers, float* output, int batchsize, float* decode_ptr_host, float* decode_ptr_device, int model_bboxes, std::string cuda_post_process) { // infer on the batch asynchronously, and DMA output back to host auto start = std::chrono::system_clock::now(); context.enqueue(batchsize, buffers, stream, nullptr); if (cuda_post_process == "c") { CUDA_CHECK(cudaMemcpyAsync(output, buffers[1], batchsize * kOutputSize * sizeof(float), cudaMemcpyDeviceToHost, stream)); auto end = std::chrono::system_clock::now(); std::cout << "inference time: " << std::chrono::duration_cast(end - start).count() << "ms" << std::endl; } else if (cuda_post_process == "g") { CUDA_CHECK( cudaMemsetAsync(decode_ptr_device, 0, sizeof(float) * (1 + kMaxNumOutputBbox * bbox_element), stream)); cuda_decode((float*)buffers[1], model_bboxes, kConfThresh, decode_ptr_device, kMaxNumOutputBbox, stream); cuda_nms(decode_ptr_device, kNmsThresh, kMaxNumOutputBbox, stream); //cuda 
// Parse the command line of the detection sample.
//
//   -s <wts> <engine> <sub_type>    build mode. sub_type starts with one of
//                                   n/s/m/l/x and may be followed by '2' or
//                                   '6' to select the P2 / P6 variant.
//                                   Fills wts, engine, sub_type, gd, gw,
//                                   max_channels and (for P2/P6) is_p.
//   -d <engine> <image dir> <c|g>   inference mode. Fills engine, img_dir and
//                                   cuda_post_process ("c" = CPU, "g" = GPU
//                                   post-processing).
//
// Returns true when the arguments match one of the two forms, false
// otherwise. Build mode also accepts 7 arguments, as before; the two extra
// arguments are not interpreted.
bool parse_args(int argc, char** argv, std::string& wts, std::string& engine, int& is_p, std::string& img_dir,
                std::string& sub_type, std::string& cuda_post_process, float& gd, float& gw, int& max_channels) {
    if (argc < 4)
        return false;

    const std::string mode(argv[1]);
    if (mode == "-s" && (argc == 5 || argc == 7)) {
        wts = std::string(argv[2]);
        engine = std::string(argv[3]);
        // Assign the caller's sub_type. A local variable of the same name
        // used to shadow the reference, so the caller never saw the value.
        sub_type = std::string(argv[4]);

        // Depth multiplier, width multiplier and channel cap per model size.
        switch (sub_type[0]) {
            case 'n':
                gd = 0.33;
                gw = 0.25;
                max_channels = 1024;
                break;
            case 's':
                gd = 0.33;
                gw = 0.50;
                max_channels = 1024;
                break;
            case 'm':
                gd = 0.67;
                gw = 0.75;
                max_channels = 576;
                break;
            case 'l':
                gd = 1.0;
                gw = 1.0;
                max_channels = 512;
                break;
            case 'x':
                gd = 1.0;
                gw = 1.25;
                max_channels = 640;
                break;
            default:
                return false;
        }

        if (sub_type.size() == 2 && sub_type[1] == '6') {
            is_p = 6;
        } else if (sub_type.size() == 2 && sub_type[1] == '2') {
            is_p = 2;
        }
    } else if (mode == "-d" && argc == 5) {
        engine = std::string(argv[2]);
        img_dir = std::string(argv[3]);
        cuda_post_process = std::string(argv[4]);
        // Any other value would leave the post-processing buffers
        // unallocated and produce images without detections.
        if (cuda_post_process != "c" && cuda_post_process != "g") {
            return false;
        }
    } else {
        return false;
    }
    return true;
}
<< std::endl; std::cerr << "./yolov8 -s [.wts] [.engine] [n/s/m/l/x/n2/s2/m2/l2/x2/n6/s6/m6/l6/x6] // serialize model to " "plan file" << std::endl; std::cerr << "./yolov8 -d [.engine] ../samples [c/g]// deserialize plan file and run inference" << std::endl; return -1; } // Create a model using the API directly and serialize it to a file if (!wts_name.empty()) { serialize_engine(wts_name, engine_name, is_p, sub_type, gd, gw, max_channels); return 0; } // Deserialize the engine from file IRuntime* runtime = nullptr; ICudaEngine* engine = nullptr; IExecutionContext* context = nullptr; deserialize_engine(engine_name, &runtime, &engine, &context); cudaStream_t stream; CUDA_CHECK(cudaStreamCreate(&stream)); cuda_preprocess_init(kMaxInputImageSize); auto out_dims = engine->getBindingDimensions(1); model_bboxes = out_dims.d[0]; // Prepare cpu and gpu buffers float* device_buffers[2]; float* output_buffer_host = nullptr; float* decode_ptr_host = nullptr; float* decode_ptr_device = nullptr; // Read images from directory std::vector file_names; if (read_files_in_dir(img_dir.c_str(), file_names) < 0) { std::cerr << "read_files_in_dir failed." 
<< std::endl; return -1; } prepare_buffer(engine, &device_buffers[0], &device_buffers[1], &output_buffer_host, &decode_ptr_host, &decode_ptr_device, cuda_post_process); // batch predict for (size_t i = 0; i < file_names.size(); i += kBatchSize) { // Get a batch of images std::vector img_batch; std::vector img_name_batch; for (size_t j = i; j < i + kBatchSize && j < file_names.size(); j++) { cv::Mat img = cv::imread(img_dir + "/" + file_names[j]); img_batch.push_back(img); img_name_batch.push_back(file_names[j]); } // Preprocess cuda_batch_preprocess(img_batch, device_buffers[0], kInputW, kInputH, stream); // Run inference infer(*context, stream, (void**)device_buffers, output_buffer_host, kBatchSize, decode_ptr_host, decode_ptr_device, model_bboxes, cuda_post_process); std::vector> res_batch; if (cuda_post_process == "c") { // NMS batch_nms(res_batch, output_buffer_host, img_batch.size(), kOutputSize, kConfThresh, kNmsThresh); } else if (cuda_post_process == "g") { //Process gpu decode and nms results batch_process(res_batch, decode_ptr_host, img_batch.size(), bbox_element, img_batch); } // Draw bounding boxes draw_bbox(img_batch, res_batch); // Save images for (size_t j = 0; j < img_batch.size(); j++) { cv::imwrite("_" + img_name_batch[j], img_batch[j]); } } // Release stream and buffers cudaStreamDestroy(stream); CUDA_CHECK(cudaFree(device_buffers[0])); CUDA_CHECK(cudaFree(device_buffers[1])); CUDA_CHECK(cudaFree(decode_ptr_device)); delete[] decode_ptr_host; delete[] output_buffer_host; cuda_preprocess_destroy(); // Destroy the engine delete context; delete engine; delete runtime; // Print histogram of the output distribution //std::cout << "\nOutput:\n\n"; //for (unsigned int i = 0; i < kOutputSize; i++) //{ // std::cout << prob[i] << ", "; // if (i % 10 == 0) std::cout << std::endl; //} //std::cout << std::endl; return 0; } ================================================ FILE: yolov8/yolov8_det_trt.py ================================================ """ An 
def plot_one_box(x, img, color=None, label=None, line_thickness=None):
    """
    description: Draw one bounding box, and optionally a filled label tag
                 above its top-left corner, onto an image in place.
    param:
        x:              a box like [x1, y1, x2, y2]
        img:            an opencv image object, modified in place
        color:          color of the rectangle, such as (0, 255, 0); a
                        random color is picked when omitted
        label:          str, text to draw; no tag is drawn when empty
        line_thickness: int, line thickness; derived from the image size
                        when omitted
    return:
        no return
    """
    # Line/font thickness: explicit value, or scaled with the image size.
    thickness = line_thickness
    if not thickness:
        thickness = round(0.002 * (img.shape[0] + img.shape[1]) / 2) + 1
    if not color:
        color = [random.randint(0, 255) for _ in range(3)]

    top_left = (int(x[0]), int(x[1]))
    bottom_right = (int(x[2]), int(x[3]))
    cv2.rectangle(img, top_left, bottom_right, color,
                  thickness=thickness, lineType=cv2.LINE_AA)
    if not label:
        return

    font_thickness = max(thickness - 1, 1)
    font_scale = thickness / 3
    text_size = cv2.getTextSize(
        label, 0, fontScale=font_scale, thickness=font_thickness)[0]
    # Opposite corner of the filled tag that sits on top of the box.
    tag_corner = top_left[0] + text_size[0], top_left[1] - text_size[1] - 3
    cv2.rectangle(img, top_left, tag_corner, color, -1, cv2.LINE_AA)
    cv2.putText(
        img,
        label,
        (top_left[0], top_left[1] - 2),
        0,
        font_scale,
        [225, 255, 255],
        thickness=font_thickness,
        lineType=cv2.LINE_AA,
    )
def infer(self, raw_image_generator):
    """
    description: Preprocess a batch of images, run the engine on it and
                 draw the detections onto the original images.
    param:
        raw_image_generator: iterable yielding at most self.batch_size BGR
                             images (numpy arrays)
    return:
        batch_image_raw: list of the input images, annotated in place
        use_time:        float, seconds spent on host->device copy,
                         inference and device->host copy
    """
    # Make self the active context, pushing it on top of the context stack.
    self.ctx.push()
    try:
        # Restore
        stream = self.stream
        context = self.context
        host_inputs = self.host_inputs
        cuda_inputs = self.cuda_inputs
        host_outputs = self.host_outputs
        cuda_outputs = self.cuda_outputs
        bindings = self.bindings

        # Do image preprocess. The buffer is zero-filled so that slots left
        # unused by a short (last) batch hold defined values.
        batch_image_raw = []
        batch_origin_h = []
        batch_origin_w = []
        batch_input_image = np.zeros(shape=[self.batch_size, 3, self.input_h, self.input_w])
        for i, image_raw in enumerate(raw_image_generator):
            input_image, image_raw, origin_h, origin_w = self.preprocess_image(image_raw)
            batch_image_raw.append(image_raw)
            batch_origin_h.append(origin_h)
            batch_origin_w.append(origin_w)
            np.copyto(batch_input_image[i], input_image)
        batch_input_image = np.ascontiguousarray(batch_input_image)

        # Copy input image to host buffer
        np.copyto(host_inputs[0], batch_input_image.ravel())
        start = time.time()
        # Transfer input data to the GPU.
        cuda.memcpy_htod_async(cuda_inputs[0], host_inputs[0], stream)
        # Run inference.
        context.execute_async(batch_size=self.batch_size, bindings=bindings, stream_handle=stream.handle)
        # Transfer predictions back from the GPU.
        cuda.memcpy_dtoh_async(host_outputs[0], cuda_outputs[0], stream)
        # Synchronize the stream
        stream.synchronize()
        end = time.time()
    finally:
        # Always deactivate the context, even when preprocessing or the
        # inference raised, so the context stack stays balanced.
        self.ctx.pop()

    output = host_outputs[0]
    # The host output buffer holds the results of the whole batch, so one
    # image owns an equal share of it. (self.det_output_length is the length
    # of the whole buffer and only equals this value when batch_size is 1.)
    per_image_len = output.shape[0] // self.batch_size

    # Do postprocess for the images actually supplied: the last batch of a
    # directory may contain fewer than self.batch_size images.
    for i, image in enumerate(batch_image_raw):
        result_boxes, result_scores, result_classid = self.post_process(
            output[i * per_image_len: (i + 1) * per_image_len], batch_origin_h[i], batch_origin_w[i]
        )
        # Draw rectangles and labels on the original image
        for j in range(len(result_boxes)):
            box = result_boxes[j]
            plot_one_box(
                box,
                image,
                label="{}:{:.2f}".format(
                    categories[int(result_classid[j])], result_scores[j]
                ),
            )
    return batch_image_raw, end - start
def xywh2xyxy(self, origin_h, origin_w, x):
    """
    description: Map boxes from the padded network input back to the
                 original image: remove the letterbox padding and undo the
                 resize scale. Despite the name, the four columns are only
                 shifted and scaled pairwise (0 and 2 as x values, 1 and 3 as
                 y values); no center/size to corner conversion happens here.
    param:
        origin_h: height of original image
        origin_w: width of original image
        x:        A boxes numpy, each row is a box with four values in
                  network-input coordinates
    return:
        y:        A boxes numpy of the same shape in original-image
                  coordinates. Floating-point input keeps its dtype; any
                  other dtype is returned as float64.
    """
    x = np.asarray(x)
    # The result is fractional in general. Allocating it with an integer
    # dtype would truncate the padding offsets and make the in-place
    # division below raise a casting error.
    out_dtype = x.dtype if np.issubdtype(x.dtype, np.floating) else np.float64
    y = np.zeros_like(x, dtype=out_dtype)

    r_w = self.input_w / origin_w
    r_h = self.input_h / origin_h
    if r_h > r_w:
        # Width was the limiting side: padding was added above and below.
        pad_y = (self.input_h - r_w * origin_h) / 2
        y[:, 0] = x[:, 0]
        y[:, 2] = x[:, 2]
        y[:, 1] = x[:, 1] - pad_y
        y[:, 3] = x[:, 3] - pad_y
        y /= r_w
    else:
        # Height was the limiting side: padding was added left and right.
        pad_x = (self.input_w - r_h * origin_w) / 2
        y[:, 0] = x[:, 0] - pad_x
        y[:, 2] = x[:, 2] - pad_x
        y[:, 1] = x[:, 1]
        y[:, 3] = x[:, 3]
        y /= r_h

    return y
def non_max_suppression(self, prediction, origin_h, origin_w, conf_thres=0.5, nms_thres=0.4):
    """
    description: Drop detections whose confidence is below 'conf_thres', map the boxes
                 back onto the original image and run per-class Non-Maximum Suppression.
    param:
        prediction: 2-D numpy array, one detection per row; columns 0-3 are the box,
                    column 4 the confidence and column 5 the class id. Rows may carry
                    additional trailing columns, which are passed through untouched.
        origin_h:   original image height
        origin_w:   original image width
        conf_thres: a confidence threshold to filter detections
        nms_thres:  an iou threshold to filter detections
    return:
        boxes: the kept rows (same column layout as 'prediction') sorted by descending
               confidence, or an empty numpy array when nothing is kept
    """
    # Boolean-mask indexing returns a copy, so the writes below never touch 'prediction'.
    boxes = prediction[prediction[:, 4] >= conf_thres]
    # Undo the letterbox transform so the coordinates refer to the original image.
    boxes[:, :4] = self.xywh2xyxy(origin_h, origin_w, boxes[:, :4])
    # Clip the coordinates to the image.
    boxes[:, 0] = np.clip(boxes[:, 0], 0, origin_w - 1)
    boxes[:, 2] = np.clip(boxes[:, 2], 0, origin_w - 1)
    boxes[:, 1] = np.clip(boxes[:, 1], 0, origin_h - 1)
    boxes[:, 3] = np.clip(boxes[:, 3], 0, origin_h - 1)
    # Highest confidence first.
    boxes = boxes[np.argsort(-boxes[:, 4])]

    keep_boxes = []
    while boxes.shape[0]:
        head = boxes[0]
        large_overlap = self.bbox_iou(np.expand_dims(head[:4], 0), boxes[:, :4]) > nms_thres
        # The class id lives in column 5. Rows can be wider than six values, so the
        # last column must not be used as the label.
        label_match = head[5] == boxes[:, 5]
        invalid = large_overlap & label_match
        keep_boxes.append(head)
        # Always remove the head itself: relying on its self-IoU exceeding the
        # threshold would loop forever for nms_thres >= 1 or a degenerate box.
        boxes = boxes[1:][~invalid[1:]]
    return np.stack(keep_boxes, 0) if keep_boxes else np.array([])
def run(self):
    """
    description: Run one inference pass on the frames produced by
                 get_raw_image_zeros() and print the first frame's shape together
                 with the elapsed time in milliseconds.
    """
    wrapper = self.yolov8_wrapper
    warm_up_frames = wrapper.get_raw_image_zeros()
    batch_image_raw, use_time = wrapper.infer(warm_up_frames)
    first_shape = batch_image_raw[0].shape
    elapsed_ms = use_time * 1000
    print('warm_up->{}, time->{:.2f}ms'.format(first_shape, elapsed_ms))
thread1.start() thread1.join() finally: # destroy the instance yolov8_wrapper.destroy() ================================================ FILE: yolov8/yolov8_obb.cpp ================================================ #include #include #include #include "cuda_utils.h" #include "logging.h" #include "model.h" #include "postprocess.h" #include "preprocess.h" #include "utils.h" Logger gLogger; using namespace nvinfer1; const int kOutputSize = kMaxNumOutputBbox * sizeof(Detection) / sizeof(float) + 1; void serialize_engine(std::string& wts_name, std::string& engine_name, int& is_p, std::string& sub_type, float& gd, float& gw, int& max_channels) { IBuilder* builder = createInferBuilder(gLogger); IBuilderConfig* config = builder->createBuilderConfig(); IHostMemory* serialized_engine = nullptr; if (is_p == 6) { std::cout << "p6 is not supported right now" << std::endl; } else if (is_p == 2) { std::cout << "p2 is not supported right now" << std::endl; } else { serialized_engine = buildEngineYolov8Obb(builder, config, DataType::kFLOAT, wts_name, gd, gw, max_channels); } assert(serialized_engine); std::ofstream p(engine_name, std::ios::binary); if (!p) { std::cout << "could not open plan output file" << std::endl; assert(false); } p.write(reinterpret_cast(serialized_engine->data()), serialized_engine->size()); delete serialized_engine; delete config; delete builder; } void deserialize_engine(std::string& engine_name, IRuntime** runtime, ICudaEngine** engine, IExecutionContext** context) { std::ifstream file(engine_name, std::ios::binary); if (!file.good()) { std::cerr << "read " << engine_name << " error!" 
<< std::endl; assert(false); } size_t size = 0; file.seekg(0, file.end); size = file.tellg(); file.seekg(0, file.beg); char* serialized_engine = new char[size]; assert(serialized_engine); file.read(serialized_engine, size); file.close(); *runtime = createInferRuntime(gLogger); assert(*runtime); *engine = (*runtime)->deserializeCudaEngine(serialized_engine, size); assert(*engine); *context = (*engine)->createExecutionContext(); assert(*context); delete[] serialized_engine; } void prepare_buffer(ICudaEngine* engine, float** input_buffer_device, float** output_buffer_device, float** output_buffer_host, float** decode_ptr_host, float** decode_ptr_device, std::string cuda_post_process) { assert(engine->getNbBindings() == 2); // In order to bind the buffers, we need to know the names of the input and output tensors. // Note that indices are guaranteed to be less than IEngine::getNbBindings() const int inputIndex = engine->getBindingIndex(kInputTensorName); const int outputIndex = engine->getBindingIndex(kOutputTensorName); assert(inputIndex == 0); assert(outputIndex == 1); // Create GPU buffers on device CUDA_CHECK(cudaMalloc((void**)input_buffer_device, kBatchSize * 3 * kInputH * kInputW * sizeof(float))); CUDA_CHECK(cudaMalloc((void**)output_buffer_device, kBatchSize * kOutputSize * sizeof(float))); if (cuda_post_process == "c") { *output_buffer_host = new float[kBatchSize * kOutputSize]; } else if (cuda_post_process == "g") { if (kBatchSize > 1) { std::cerr << "Do not yet support GPU post processing for multiple batches" << std::endl; exit(0); } // Allocate memory for decode_ptr_host and copy to device *decode_ptr_host = new float[1 + kMaxNumOutputBbox * bbox_element]; CUDA_CHECK(cudaMalloc((void**)decode_ptr_device, sizeof(float) * (1 + kMaxNumOutputBbox * bbox_element))); } } void infer(IExecutionContext& context, cudaStream_t& stream, void** buffers, float* output, int batchsize, float* decode_ptr_host, float* decode_ptr_device, int model_bboxes, std::string 
// Parses the command line into the caller's variables.
//
//   -s <wts> <engine> <sub_type>      serialize: fills `wts`, `engine`, `sub_type`,
//                                     the scaling factors `gd`/`gw`, `max_channels`
//                                     and `is_p` (6 or 2 when the sub-type carries
//                                     that suffix, otherwise left untouched).
//                                     argc may be 5 or 7; arguments past the
//                                     sub-type are not read.
//   -d <engine> <img_dir> <c|g>       deserialize: fills `engine`, `img_dir` and
//                                     `cuda_post_process`.
//
// Returns false for any other argument shape or an unknown sub-type letter.
bool parse_args(int argc, char** argv, std::string& wts, std::string& engine, int& is_p, std::string& img_dir,
                std::string& sub_type, std::string& cuda_post_process, float& gd, float& gw, int& max_channels) {
    if (argc < 4)
        return false;

    const std::string mode(argv[1]);
    if (mode == "-s" && (argc == 5 || argc == 7)) {
        wts = std::string(argv[2]);
        engine = std::string(argv[3]);
        // Write into the out-parameter. A local of the same name would shadow it and
        // leave the caller's sub_type empty.
        sub_type = std::string(argv[4]);
        if (sub_type.empty())
            return false;

        if (sub_type[0] == 'n') {
            gd = 0.33;
            gw = 0.25;
            max_channels = 1024;
        } else if (sub_type[0] == 's') {
            gd = 0.33;
            gw = 0.50;
            max_channels = 1024;
        } else if (sub_type[0] == 'm') {
            gd = 0.67;
            gw = 0.75;
            max_channels = 576;
        } else if (sub_type[0] == 'l') {
            gd = 1.0;
            gw = 1.0;
            max_channels = 512;
        } else if (sub_type[0] == 'x') {
            gd = 1.0;
            gw = 1.25;
            max_channels = 640;
        } else {
            return false;
        }

        if (sub_type.size() == 2 && sub_type[1] == '6') {
            is_p = 6;
        } else if (sub_type.size() == 2 && sub_type[1] == '2') {
            is_p = 2;
        }
    } else if (mode == "-d" && argc == 5) {
        engine = std::string(argv[2]);
        img_dir = std::string(argv[3]);
        cuda_post_process = std::string(argv[4]);
    } else {
        return false;
    }
    return true;
}
<< std::endl; return -1; } prepare_buffer(engine, &device_buffers[0], &device_buffers[1], &output_buffer_host, &decode_ptr_host, &decode_ptr_device, cuda_post_process); // batch predict for (size_t i = 0; i < file_names.size(); i += kBatchSize) { // Get a batch of images std::vector img_batch; std::vector img_name_batch; for (size_t j = i; j < i + kBatchSize && j < file_names.size(); j++) { cv::Mat img = cv::imread(img_dir + "/" + file_names[j]); img_batch.push_back(img); img_name_batch.push_back(file_names[j]); } // Preprocess cuda_batch_preprocess(img_batch, device_buffers[0], kInputW, kInputH, stream); // Run inference infer(*context, stream, (void**)device_buffers, output_buffer_host, kBatchSize, decode_ptr_host, decode_ptr_device, model_bboxes, cuda_post_process); std::vector> res_batch; if (cuda_post_process == "c") { // NMS batch_nms_obb(res_batch, output_buffer_host, img_batch.size(), kOutputSize, kConfThresh, kNmsThresh); } else if (cuda_post_process == "g") { //Process gpu decode and nms results batch_process_obb(res_batch, decode_ptr_host, img_batch.size(), bbox_element, img_batch); } // Draw bounding boxes draw_bbox_obb(img_batch, res_batch); // Save images for (size_t j = 0; j < img_batch.size(); j++) { cv::imwrite("_" + img_name_batch[j], img_batch[j]); } } // Release stream and buffers cudaStreamDestroy(stream); CUDA_CHECK(cudaFree(device_buffers[0])); CUDA_CHECK(cudaFree(device_buffers[1])); CUDA_CHECK(cudaFree(decode_ptr_device)); delete[] decode_ptr_host; delete[] output_buffer_host; cuda_preprocess_destroy(); // Destroy the engine delete context; delete engine; delete runtime; // Print histogram of the output distribution //std::cout << "\nOutput:\n\n"; //for (unsigned int i = 0; i < kOutputSize; i++) //{ // std::cout << prob[i] << ", "; // if (i % 10 == 0) std::cout << std::endl; //} //std::cout << std::endl; return 0; } ================================================ FILE: yolov8/yolov8_obb_trt.py 
def get_img_path_batches(batch_size, img_dir):
    """
    description: Walk img_dir recursively and group the paths of all files found
                 into consecutive batches.
    param:
        batch_size: number of paths per batch
        img_dir:    directory to walk
    return:
        a list of lists of paths; every batch holds batch_size entries except
        possibly the last one, and no trailing empty batch is produced
    """
    all_paths = (
        os.path.join(dirpath, filename)
        for dirpath, _subdirs, filenames in os.walk(img_dir)
        for filename in filenames
    )
    batches = []
    current = []
    for path in all_paths:
        # Close the running batch right before it would overflow.
        if len(current) == batch_size:
            batches.append(current)
            current = []
        current.append(path)
    if current:
        batches.append(current)
    return batches
def get_rect_obb(img, bbox, input_w=None, input_h=None):
    """
    description: Map an axis-aligned box from letterboxed network-input coordinates
                 back onto the original image and return it as an integer rectangle.
    param:
        img:     OpenCV image (numpy array); only img.shape[0] (height) and
                 img.shape[1] (width) are read
        bbox:    [left, top, right, bottom] in network-input coordinates
        input_w: network input width; defaults to the module constant INPUT_W
        input_h: network input height; defaults to the module constant INPUT_H
    return:
        [x, y, width, height] in original-image pixels; x and y are clamped to be
        non-negative and width/height are limited so the rectangle ends inside the
        image
    """
    # Resolved at call time so engines with a different input size can pass it in.
    if input_w is None:
        input_w = INPUT_W
    if input_h is None:
        input_h = INPUT_H

    img_h, img_w = img.shape[0], img.shape[1]
    l_x, t_y, r_x, b_y = bbox
    r_w = input_w / img_w
    r_h = input_h / img_h

    if r_h > r_w:
        # Width was the limiting side: padding was added above and below.
        pad = (input_h - r_w * img_h) / 2
        t_y = t_y - pad
        b_y = b_y - pad
        scale = r_w
    else:
        # Height was the limiting side: padding was added left and right.
        pad = (input_w - r_h * img_w) / 2
        l_x = l_x - pad
        r_x = r_x - pad
        scale = r_h

    l_x = l_x / scale
    r_x = r_x / scale
    t_y = t_y / scale
    b_y = b_y / scale

    l_x = max(0.0, l_x)
    t_y = max(0.0, t_y)
    width = max(0, min(int(round(r_x - l_x)), img_w - int(round(l_x))))
    height = max(0, min(int(round(b_y - t_y)), img_h - int(round(t_y))))
    return [int(round(l_x)), int(round(t_y)), width, height]
description: Plots one rotated bounding box on image img param: x: a box in [cx, cy, w, h, angle] format img: an opencv image object color: color to draw rectangle label: str line_thickness: int """ tl = line_thickness or round(0.002 * (img.shape[0] + img.shape[1]) / 2) + 1 # Get four corner points corners = get_corner(img, x) corners = corners.astype(int) # Draw the rotated rectangle cv2.polylines(img, [corners], isClosed=True, color=color, thickness=tl, lineType=cv2.LINE_AA) if label: tf = max(tl - 1, 1) # font thickness # Use first corner point for label placement p1 = tuple(corners[0]) w, h = cv2.getTextSize(label, 0, fontScale=tl / 3, thickness=tf)[0] outside = p1[1] - h >= 3 p2 = (p1[0] + w, p1[1] - h - 3 if outside else p1[1] + h + 3) cv2.rectangle(img, p1, p2, color, -1, cv2.LINE_AA) # filled cv2.putText( img, label, (p1[0], p1[1] - 2 if outside else p1[1] + h + 2), 0, tl / 3, [225, 255, 255], thickness=tf, lineType=cv2.LINE_AA ) class YoLov8TRT(object): """ description: A YOLOv8 class that warps TensorRT ops, preprocess and postprocess ops. """ def __init__(self, engine_file_path): # Create a Context on this device, self.ctx = cuda.Device(0).make_context() stream = cuda.Stream() TRT_LOGGER = trt.Logger(trt.Logger.INFO) runtime = trt.Runtime(TRT_LOGGER) # Deserialize the engine from file with open(engine_file_path, "rb") as f: engine = runtime.deserialize_cuda_engine(f.read()) context = engine.create_execution_context() host_inputs = [] cuda_inputs = [] host_outputs = [] cuda_outputs = [] bindings = [] for binding in engine: print('bingding:', binding, engine.get_binding_shape(binding)) size = trt.volume(engine.get_binding_shape(binding)) * engine.max_batch_size dtype = trt.nptype(engine.get_binding_dtype(binding)) # Allocate host and device buffers host_mem = cuda.pagelocked_empty(size, dtype) cuda_mem = cuda.mem_alloc(host_mem.nbytes) # Append the device buffer to device bindings. bindings.append(int(cuda_mem)) # Append to the appropriate list. 
def infer(self, raw_image_generator):
    """
    description: Preprocess a batch of images, run the engine on it, post-process
                 the result and draw the detections onto the original images.
    param:
        raw_image_generator: iterable yielding BGR images; it may yield fewer than
                             batch_size images (last batch of a directory)
    return:
        batch_image_raw: list of the original images with the detections drawn on
        use_time:        seconds spent on host->device copy, execution and
                         device->host copy
    """
    # Make self the active context, pushing it on top of the context stack.
    self.ctx.push()
    try:
        # Restore
        stream = self.stream
        context = self.context
        host_inputs = self.host_inputs
        cuda_inputs = self.cuda_inputs
        host_outputs = self.host_outputs
        cuda_outputs = self.cuda_outputs
        bindings = self.bindings

        # Do image preprocess
        batch_image_raw = []
        batch_origin_h = []
        batch_origin_w = []
        # Zero-filled so the unused slots of a partial batch hold defined values.
        batch_input_image = np.zeros(shape=[self.batch_size, 3, self.input_h, self.input_w])
        for i, image_raw in enumerate(raw_image_generator):
            input_image, image_raw, origin_h, origin_w = self.preprocess_image(image_raw)
            batch_image_raw.append(image_raw)
            batch_origin_h.append(origin_h)
            batch_origin_w.append(origin_w)
            np.copyto(batch_input_image[i], input_image)
        batch_input_image = np.ascontiguousarray(batch_input_image)

        # Copy input image to host buffer
        np.copyto(host_inputs[0], batch_input_image.ravel())
        start = time.time()
        # Transfer input data to the GPU.
        cuda.memcpy_htod_async(cuda_inputs[0], host_inputs[0], stream)
        # Run inference.
        context.execute_async(batch_size=self.batch_size, bindings=bindings, stream_handle=stream.handle)
        # Transfer predictions back from the GPU.
        cuda.memcpy_dtoh_async(host_outputs[0], cuda_outputs[0], stream)
        # Synchronize the stream
        stream.synchronize()
        end = time.time()
    finally:
        # Always deactivate the context, even when preprocessing or execution
        # raised, so the context stack stays balanced.
        self.ctx.pop()

    # The host output holds one det_output_length-sized slice per batch slot.
    output = host_outputs[0]
    # Only the slots that actually received an image are post-processed; looping
    # over batch_size would index past the end of the per-image lists.
    for i in range(len(batch_image_raw)):
        keep = self.post_process(
            output[i * self.det_output_length: (i + 1) * self.det_output_length],
            batch_origin_h[i], batch_origin_w[i]
        )
        # Draw rotated boxes and labels on the original image
        for det in keep:
            # Seed with the class id so every class always gets the same colour.
            np.random.seed(int(det.class_id))
            color = [np.random.randint(0, 255) for _ in range(3)]
            plot_one_box(
                det,
                batch_image_raw[i],
                label="{}:{:.2f}".format(
                    categories[int(det.class_id)], det.score
                ),
                color=color,
                line_thickness=1
            )
    return batch_image_raw, end - start
def xywh2xyxy(self, origin_h, origin_w, x):
    """
    description: Map nx4 boxes from the letterboxed network input back onto the
                 original image: the padding offset is subtracted on the padded
                 axis and all four columns are divided by the resize ratio.
                 Columns 0 and 2 are treated as x values, columns 1 and 3 as
                 y values.
    param:
        origin_h:   height of original image
        origin_w:   width of original image
        x:          A boxes numpy, each row holds four coordinates in
                    network-input pixels
    return:
        y:          A boxes numpy of the same shape and dtype as x, in
                    original-image pixels
    """
    ratio_w = self.input_w / origin_w
    ratio_h = self.input_h / origin_h
    mapped = np.zeros_like(x)
    x_cols, y_cols = [0, 2], [1, 3]

    if ratio_h > ratio_w:
        # Image was resized by ratio_w; the remaining height was padded evenly.
        pad_y = (self.input_h - ratio_w * origin_h) / 2
        mapped[:, x_cols] = x[:, x_cols]
        mapped[:, y_cols] = x[:, y_cols] - pad_y
        mapped /= ratio_w
    else:
        # Image was resized by ratio_h; the remaining width was padded evenly.
        pad_x = (self.input_w - ratio_h * origin_w) / 2
        mapped[:, x_cols] = x[:, x_cols] - pad_x
        mapped[:, y_cols] = x[:, y_cols]
        mapped /= ratio_h
    return mapped
def post_process(self, output, origin_h, origin_w):
    """
    description: Decode the flat engine output of one image and run per-class
                 NMS on the rotated boxes using probiou.
    param:
        output:     1-D numpy array; output[0] holds the number of detections and
                    the remaining values are the detection rows laid out back to
                    back. In each row columns 0-3 are the box, column 4 the
                    confidence, column 5 the class id and the last column the
                    rotation angle.
        origin_h:   height of original image (not read here; boxes are left in
                    the coordinates the engine produced)
        origin_w:   width of original image (not read here)
    return:
        keep: list of Detection objects that survive NMS; an empty list when no
              row reaches CONF_THRESH
    """
    num_values_per_detection = DET_NUM + SEG_NUM + POSE_NUM + OBB_NUM
    # The angle occupies the trailing OBB_NUM column of every row; deriving the
    # index keeps it correct if any of the layout constants changes.
    angle_index = num_values_per_detection - OBB_NUM
    # Get the num of boxes detected
    num = int(output[0])
    # Reshape to a two dimensional ndarray and drop rows past the reported count
    pred = np.reshape(output[1:], (-1, num_values_per_detection))[:num, :]

    # Filter by confidence threshold
    pred = pred[pred[:, 4] >= CONF_THRESH]
    if len(pred) == 0:
        return []

    # Group the detections by class id; NMS is performed inside each class only.
    per_class = {}
    for row in pred:
        class_id = int(row[5])
        per_class.setdefault(class_id, []).append(
            Detection(row[:4], row[4], class_id, row[angle_index])
        )

    res = []
    for dets in per_class.values():
        dets = sorted(dets, key=lambda det: det.score, reverse=True)
        for m, item in enumerate(dets):
            # A score of 0.0 marks a detection suppressed by a stronger one.
            if item.score == 0.0:
                continue
            res.append(item)
            for other in dets[m + 1:]:
                if other.score == 0.0:
                    continue
                if self.probiou(item, other) > IOU_THRESHOLD:
                    other.score = 0.0

    # Strict comparison kept on purpose: a score exactly equal to CONF_THRESH
    # passes the first filter but is dropped here.
    return [det for det in res if det.score > CONF_THRESH]
if __name__ == "__main__":
    # Command line: [engine file] [plugin library]; both are optional.
    engine_file_path = sys.argv[1] if len(sys.argv) > 1 else "yolov8n-obb.engine"
    PLUGIN_LIBRARY = sys.argv[2] if len(sys.argv) > 2 else "./build/libmyplugins.so"
    # Load the custom plugin library before the engine is deserialized.
    ctypes.CDLL(PLUGIN_LIBRARY)

    # load DOTAV 1.5 labels
    categories = ["plane", "ship", "storage tank", "baseball diamond", "tennis court",
                  "basketball court", "ground track field", "harbor", "bridge", "large vehicle",
                  "small vehicle", "helicopter", "roundabout", "soccer ball field",
                  "swimming pool", "container crane"]

    # Start from an empty output directory.
    if os.path.exists('output/'):
        shutil.rmtree('output/')
    os.makedirs('output/')

    # a YoLov8TRT instance
    yolov8_wrapper = YoLov8TRT(engine_file_path)
    try:
        print('batch size is', yolov8_wrapper.batch_size)

        image_dir = "images/"
        image_path_batches = get_img_path_batches(yolov8_wrapper.batch_size, image_dir)

        # Ten warm-up passes, each in its own thread that is joined before the next starts.
        for _ in range(10):
            warm_up_worker = warmUpThread(yolov8_wrapper)
            warm_up_worker.start()
            warm_up_worker.join()

        # One thread per batch, again run strictly one after another.
        for batch in image_path_batches:
            infer_worker = inferThread(yolov8_wrapper, batch)
            infer_worker.start()
            infer_worker.join()
    finally:
        # destroy the instance
        yolov8_wrapper.destroy()
float& gd, float& gw, int& max_channels) { IBuilder* builder = createInferBuilder(gLogger); IBuilderConfig* config = builder->createBuilderConfig(); IHostMemory* serialized_engine = nullptr; if (is_p == 6) { serialized_engine = buildEngineYolov8PoseP6(builder, config, DataType::kFLOAT, wts_name, gd, gw, max_channels); } else if (is_p == 2) { std::cout << "p2 is not supported right now" << std::endl; } else { serialized_engine = buildEngineYolov8Pose(builder, config, DataType::kFLOAT, wts_name, gd, gw, max_channels); } assert(serialized_engine); std::ofstream p(engine_name, std::ios::binary); if (!p) { std::cout << "could not open plan output file" << std::endl; assert(false); } p.write(reinterpret_cast(serialized_engine->data()), serialized_engine->size()); delete serialized_engine; delete config; delete builder; } void deserialize_engine(std::string& engine_name, IRuntime** runtime, ICudaEngine** engine, IExecutionContext** context) { std::ifstream file(engine_name, std::ios::binary); if (!file.good()) { std::cerr << "read " << engine_name << " error!" << std::endl; assert(false); } size_t size = 0; file.seekg(0, file.end); size = file.tellg(); file.seekg(0, file.beg); char* serialized_engine = new char[size]; assert(serialized_engine); file.read(serialized_engine, size); file.close(); *runtime = createInferRuntime(gLogger); assert(*runtime); *engine = (*runtime)->deserializeCudaEngine(serialized_engine, size); assert(*engine); *context = (*engine)->createExecutionContext(); assert(*context); delete[] serialized_engine; } void prepare_buffer(ICudaEngine* engine, float** input_buffer_device, float** output_buffer_device, float** output_buffer_host, float** decode_ptr_host, float** decode_ptr_device, std::string cuda_post_process) { assert(engine->getNbBindings() == 2); // In order to bind the buffers, we need to know the names of the input and output tensors. 
// Note that indices are guaranteed to be less than IEngine::getNbBindings() const int inputIndex = engine->getBindingIndex(kInputTensorName); const int outputIndex = engine->getBindingIndex(kOutputTensorName); assert(inputIndex == 0); assert(outputIndex == 1); // Create GPU buffers on device CUDA_CHECK(cudaMalloc((void**)input_buffer_device, kBatchSize * 3 * kInputH * kInputW * sizeof(float))); CUDA_CHECK(cudaMalloc((void**)output_buffer_device, kBatchSize * kOutputSize * sizeof(float))); if (cuda_post_process == "c") { *output_buffer_host = new float[kBatchSize * kOutputSize]; } else if (cuda_post_process == "g") { if (kBatchSize > 1) { std::cerr << "Do not yet support GPU post processing for multiple batches" << std::endl; exit(0); } // Allocate memory for decode_ptr_host and copy to device *decode_ptr_host = new float[1 + kMaxNumOutputBbox * bbox_element]; CUDA_CHECK(cudaMalloc((void**)decode_ptr_device, sizeof(float) * (1 + kMaxNumOutputBbox * bbox_element))); } } void infer(IExecutionContext& context, cudaStream_t& stream, void** buffers, float* output, int batchsize, float* decode_ptr_host, float* decode_ptr_device, int model_bboxes, std::string cuda_post_process) { // infer on the batch asynchronously, and DMA output back to host auto start = std::chrono::system_clock::now(); context.enqueue(batchsize, buffers, stream, nullptr); if (cuda_post_process == "c") { CUDA_CHECK(cudaMemcpyAsync(output, buffers[1], batchsize * kOutputSize * sizeof(float), cudaMemcpyDeviceToHost, stream)); auto end = std::chrono::system_clock::now(); std::cout << "inference time: " << std::chrono::duration_cast(end - start).count() << "ms" << std::endl; } else if (cuda_post_process == "g") { CUDA_CHECK( cudaMemsetAsync(decode_ptr_device, 0, sizeof(float) * (1 + kMaxNumOutputBbox * bbox_element), stream)); cuda_decode((float*)buffers[1], model_bboxes, kConfThresh, decode_ptr_device, kMaxNumOutputBbox, stream); cuda_nms(decode_ptr_device, kNmsThresh, kMaxNumOutputBbox, stream); //cuda 
nms CUDA_CHECK(cudaMemcpyAsync(decode_ptr_host, decode_ptr_device, sizeof(float) * (1 + kMaxNumOutputBbox * bbox_element), cudaMemcpyDeviceToHost, stream)); auto end = std::chrono::system_clock::now(); std::cout << "inference and gpu postprocess time: " << std::chrono::duration_cast(end - start).count() << "ms" << std::endl; } CUDA_CHECK(cudaStreamSynchronize(stream)); } bool parse_args(int argc, char** argv, std::string& wts, std::string& engine, int& is_p, std::string& img_dir, std::string& sub_type, std::string& cuda_post_process, float& gd, float& gw, int& max_channels) { if (argc < 4) return false; if (std::string(argv[1]) == "-s" && (argc == 5 || argc == 7)) { wts = std::string(argv[2]); engine = std::string(argv[3]); auto sub_type = std::string(argv[4]); if (sub_type[0] == 'n') { gd = 0.33; gw = 0.25; max_channels = 1024; } else if (sub_type[0] == 's') { gd = 0.33; gw = 0.50; max_channels = 1024; } else if (sub_type[0] == 'm') { gd = 0.67; gw = 0.75; max_channels = 576; } else if (sub_type[0] == 'l') { gd = 1.0; gw = 1.0; max_channels = 512; } else if (sub_type[0] == 'x') { gd = 1.0; gw = 1.25; max_channels = 640; } else { return false; } if (sub_type.size() == 2 && sub_type[1] == '6') { is_p = 6; } else if (sub_type.size() == 2 && sub_type[1] == '2') { is_p = 2; } } else if (std::string(argv[1]) == "-d" && argc == 5) { engine = std::string(argv[2]); img_dir = std::string(argv[3]); cuda_post_process = std::string(argv[4]); } else { return false; } return true; } int main(int argc, char** argv) { cudaSetDevice(kGpuId); std::string wts_name = ""; std::string engine_name = ""; std::string img_dir; std::string sub_type = ""; std::string cuda_post_process = ""; int model_bboxes; int is_p = 0; float gd = 0.0f, gw = 0.0f; int max_channels = 0; if (!parse_args(argc, argv, wts_name, engine_name, is_p, img_dir, sub_type, cuda_post_process, gd, gw, max_channels)) { std::cerr << "Arguments not right!" 
<< std::endl; std::cerr << "./yolov8 -s [.wts] [.engine] [n/s/m/l/x/n2/s2/m2/l2/x2/n6/s6/m6/l6/x6] // serialize model to " "plan file" << std::endl; std::cerr << "./yolov8 -d [.engine] ../samples [c/g]// deserialize plan file and run inference" << std::endl; return -1; } // Create a model using the API directly and serialize it to a file if (!wts_name.empty()) { serialize_engine(wts_name, engine_name, is_p, sub_type, gd, gw, max_channels); return 0; } // Deserialize the engine from file IRuntime* runtime = nullptr; ICudaEngine* engine = nullptr; IExecutionContext* context = nullptr; deserialize_engine(engine_name, &runtime, &engine, &context); cudaStream_t stream; CUDA_CHECK(cudaStreamCreate(&stream)); cuda_preprocess_init(kMaxInputImageSize); auto out_dims = engine->getBindingDimensions(1); model_bboxes = out_dims.d[0]; // Prepare cpu and gpu buffers float* device_buffers[2]; float* output_buffer_host = nullptr; float* decode_ptr_host = nullptr; float* decode_ptr_device = nullptr; // Read images from directory std::vector file_names; if (read_files_in_dir(img_dir.c_str(), file_names) < 0) { std::cerr << "read_files_in_dir failed." 
<< std::endl; return -1; } prepare_buffer(engine, &device_buffers[0], &device_buffers[1], &output_buffer_host, &decode_ptr_host, &decode_ptr_device, cuda_post_process); // batch predict for (size_t i = 0; i < file_names.size(); i += kBatchSize) { // Get a batch of images std::vector img_batch; std::vector img_name_batch; for (size_t j = i; j < i + kBatchSize && j < file_names.size(); j++) { cv::Mat img = cv::imread(img_dir + "/" + file_names[j]); img_batch.push_back(img); img_name_batch.push_back(file_names[j]); } // Preprocess cuda_batch_preprocess(img_batch, device_buffers[0], kInputW, kInputH, stream); // Run inference infer(*context, stream, (void**)device_buffers, output_buffer_host, kBatchSize, decode_ptr_host, decode_ptr_device, model_bboxes, cuda_post_process); std::vector> res_batch; if (cuda_post_process == "c") { // NMS batch_nms(res_batch, output_buffer_host, img_batch.size(), kOutputSize, kConfThresh, kNmsThresh); } else if (cuda_post_process == "g") { // Process gpu decode and nms results // todo pose in gpu std::cerr << "pose_postprocess is not support in gpu right now" << std::endl; } // Draw bounding boxes draw_bbox_keypoints_line(img_batch, res_batch); // Save images for (size_t j = 0; j < img_batch.size(); j++) { cv::imwrite("_" + img_name_batch[j], img_batch[j]); } } // Release stream and buffers cudaStreamDestroy(stream); CUDA_CHECK(cudaFree(device_buffers[0])); CUDA_CHECK(cudaFree(device_buffers[1])); CUDA_CHECK(cudaFree(decode_ptr_device)); delete[] decode_ptr_host; delete[] output_buffer_host; cuda_preprocess_destroy(); // Destroy the engine delete context; delete engine; delete runtime; // Print histogram of the output distribution //std::cout << "\nOutput:\n\n"; //for (unsigned int i = 0; i < kOutputSize; i++) //{ // std::cout << prob[i] << ", "; // if (i % 10 == 0) std::cout << std::endl; //} //std::cout << std::endl; return 0; } ================================================ FILE: yolov8/yolov8_pose_trt.py 
# Detection thresholds used by post-processing.
CONF_THRESH = 0.5
IOU_THRESHOLD = 0.4
# Per-detection layout produced by the plugin, in floats:
# DET_NUM box/score/class values, SEG_NUM mask values, POSE_NUM keypoint
# values (17 keypoints x (x, y, confidence)) and OBB_NUM trailing value.
POSE_NUM = 17 * 3
DET_NUM = 6
SEG_NUM = 32
OBB_NUM = 1

# Pairs of keypoint indices that are joined by a line when drawing.
keypoint_pairs = [
    (0, 1), (0, 2), (0, 5), (0, 6), (1, 2),
    (1, 3), (2, 4), (5, 6), (5, 7), (5, 11),
    (6, 8), (6, 12), (7, 9), (8, 10), (11, 12),
    (11, 13), (12, 14), (13, 15), (14, 16)
]


def get_img_path_batches(batch_size, img_dir):
    """
    description: Walk img_dir recursively and group every file path found
                 into consecutive lists of batch_size entries.
    param:
        batch_size: int, number of paths per batch
        img_dir:    str, root directory handed to os.walk
    return:
        list of lists of str; the last batch may hold fewer than batch_size paths
    """
    all_paths = (
        os.path.join(folder, filename)
        for folder, _subdirs, filenames in os.walk(img_dir)
        for filename in filenames
    )
    batches = []
    current = []
    for path in all_paths:
        if len(current) == batch_size:
            batches.append(current)
            current = []
        current.append(path)
    if current:
        batches.append(current)
    return batches


def plot_one_box(x, img, color=None, label=None, line_thickness=None):
    """
    description: Draw one bounding box, and optionally a filled label tag, on img in place.
    param:
        x:      a box like [x1,y1,x2,y2]
        img:    an opencv image object
        color:  color to draw rectangle, such as (0,255,0); random when omitted
        label:  str, text drawn above the top-left corner when given
        line_thickness: int; derived from the image size when omitted
    return:
        no return
    """
    if line_thickness:
        tl = line_thickness
    else:
        tl = round(0.002 * (img.shape[0] + img.shape[1]) / 2) + 1
    if not color:
        color = [random.randint(0, 255) for _ in range(3)]
    top_left = (int(x[0]), int(x[1]))
    bottom_right = (int(x[2]), int(x[3]))
    cv2.rectangle(img, top_left, bottom_right, color, thickness=tl, lineType=cv2.LINE_AA)
    if not label:
        return
    tf = max(tl - 1, 1)  # font thickness
    text_w, text_h = cv2.getTextSize(label, 0, fontScale=tl / 3, thickness=tf)[0]
    tag_corner = top_left[0] + text_w, top_left[1] - text_h - 3
    cv2.rectangle(img, top_left, tag_corner, color, -1, cv2.LINE_AA)  # filled
    cv2.putText(
        img,
        label,
        (top_left[0], top_left[1] - 2),
        0,
        tl / 3,
        [225, 255, 255],
        thickness=tf,
        lineType=cv2.LINE_AA,
    )


class YoLov8TRT(object):
    """
    description: A YOLOv8 pose class that wraps TensorRT ops, preprocess and postprocess ops.
    """

    def __init__(self, engine_file_path):
        """
        description: Create a CUDA context, deserialize the engine and allocate
                     one page-locked host buffer and one device buffer per binding.
        param:
            engine_file_path: str, path of the serialized TensorRT engine
        """
        # Create a Context on this device,
        self.ctx = cuda.Device(0).make_context()
        stream = cuda.Stream()
        TRT_LOGGER = trt.Logger(trt.Logger.INFO)
        runtime = trt.Runtime(TRT_LOGGER)

        # Deserialize the engine from file
        with open(engine_file_path, "rb") as f:
            engine = runtime.deserialize_cuda_engine(f.read())
        context = engine.create_execution_context()

        host_inputs = []
        cuda_inputs = []
        host_outputs = []
        cuda_outputs = []
        bindings = []

        for binding in engine:
            print('bingding:', binding, engine.get_binding_shape(binding))
            # Buffers are sized for a full batch: per-sample volume x max batch size.
            size = trt.volume(engine.get_binding_shape(binding)) * engine.max_batch_size
            dtype = trt.nptype(engine.get_binding_dtype(binding))
            # Allocate host and device buffers
            host_mem = cuda.pagelocked_empty(size, dtype)
            cuda_mem = cuda.mem_alloc(host_mem.nbytes)
            # Append the device buffer to device bindings.
            bindings.append(int(cuda_mem))
            # Append to the appropriate list.
            if engine.binding_is_input(binding):
                self.input_w = engine.get_binding_shape(binding)[-1]
                self.input_h = engine.get_binding_shape(binding)[-2]
                host_inputs.append(host_mem)
                cuda_inputs.append(cuda_mem)
            else:
                host_outputs.append(host_mem)
                cuda_outputs.append(cuda_mem)

        # Store
        self.stream = stream
        self.context = context
        self.host_inputs = host_inputs
        self.cuda_inputs = cuda_inputs
        self.host_outputs = host_outputs
        self.cuda_outputs = cuda_outputs
        self.bindings = bindings
        self.batch_size = engine.max_batch_size
        # Length of ONE image's slice of the output buffer. The buffer itself was
        # allocated as volume * max_batch_size above, so divide that factor out;
        # using the full length made every slice after the first one empty.
        self.det_output_size = host_outputs[0].shape[0] // engine.max_batch_size

    def infer(self, raw_image_generator):
        """
        description: Preprocess the supplied images, run the engine once and draw
                     boxes, labels and skeleton lines on the original images.
        param:
            raw_image_generator: iterable of BGR images, at most batch_size items
        return:
            batch_image_raw: list of the annotated original images
            elapsed:         float, seconds spent on upload, execution and download
        """
        # Make self the active context, pushing it on top of the context stack.
        self.ctx.push()
        try:
            # Restore
            stream = self.stream
            context = self.context
            host_inputs = self.host_inputs
            cuda_inputs = self.cuda_inputs
            host_outputs = self.host_outputs
            cuda_outputs = self.cuda_outputs
            bindings = self.bindings
            # Do image preprocess
            batch_image_raw = []
            batch_origin_h = []
            batch_origin_w = []
            # Zero-filled so the unused slots of a short final batch hold defined data.
            batch_input_image = np.zeros(shape=[self.batch_size, 3, self.input_h, self.input_w])
            for i, image_raw in enumerate(raw_image_generator):
                input_image, image_raw, origin_h, origin_w = self.preprocess_image(image_raw)
                batch_image_raw.append(image_raw)
                batch_origin_h.append(origin_h)
                batch_origin_w.append(origin_w)
                np.copyto(batch_input_image[i], input_image)
            batch_input_image = np.ascontiguousarray(batch_input_image)

            # Copy input image to host buffer
            np.copyto(host_inputs[0], batch_input_image.ravel())
            start = time.time()
            # Transfer input data  to the GPU.
            cuda.memcpy_htod_async(cuda_inputs[0], host_inputs[0], stream)
            # Run inference.
            context.execute_async(batch_size=self.batch_size, bindings=bindings, stream_handle=stream.handle)
            # Transfer predictions back from the GPU.
            cuda.memcpy_dtoh_async(host_outputs[0], cuda_outputs[0], stream)
            # Synchronize the stream
            stream.synchronize()
            end = time.time()
        finally:
            # Remove any context from the top of the context stack, deactivating it,
            # even when preprocessing or inference raised.
            self.ctx.pop()

        output = host_outputs[0]
        # Do postprocess only for the images that were actually supplied; the last
        # batch of a directory can be shorter than batch_size.
        for i in range(len(batch_image_raw)):
            result_boxes, result_scores, result_classid, keypoints = self.post_process(
                output[i * (self.det_output_size): (i + 1) * (self.det_output_size)],
                batch_origin_h[i], batch_origin_w[i]
            )

            # Draw rectangles and labels on the original image
            for j in range(len(result_boxes)):
                box = result_boxes[j]
                # NOTE(review): `categories` is a module-level name assigned in the
                # __main__ section of this file.
                plot_one_box(
                    box,
                    batch_image_raw[i],
                    label="{}:{:.2f}".format(
                        categories[int(result_classid[j])], result_scores[j]
                    ),
                )

                num_keypoints = len(keypoints[j]) // 3
                points = []
                for k in range(num_keypoints):
                    x = keypoints[j][k * 3]
                    y = keypoints[j][k * 3 + 1]
                    confidence = keypoints[j][k * 3 + 2]
                    if confidence > 0:
                        points.append((int(x), int(y)))
                    else:
                        points.append(None)

                # Draw a line for every keypoint pair whose two ends are both present
                for pair in keypoint_pairs:
                    partA, partB = pair
                    if points[partA] and points[partB]:
                        cv2.line(batch_image_raw[i], points[partA], points[partB], (0, 255, 0), 2)
        return batch_image_raw, end - start

    def destroy(self):
        """
        description: Pop the CUDA context created in __init__ off the context stack.
        """
        # Remove any context from the top of the context stack, deactivating it.
        self.ctx.pop()

    def get_raw_image(self, image_path_batch):
        """
        description: Read an image from image path
        param:
            image_path_batch: iterable of str
        return:
            generator yielding whatever cv2.imread returns for each path
        """
        for img_path in image_path_batch:
            yield cv2.imread(img_path)

    def get_raw_image_zeros(self, image_path_batch=None):
        """
        description: Ready data for warmup: batch_size all-zero uint8 images of
                     shape (input_h, input_w, 3). image_path_batch is ignored.
        """
        for _ in range(self.batch_size):
            yield np.zeros([self.input_h, self.input_w, 3], dtype=np.uint8)

    def preprocess_image(self, raw_bgr_image):
        """
        description: Convert BGR image to RGB,
                     resize and pad it to target size, normalize to [0,1],
                     transform to NCHW format.
        param:
            raw_bgr_image: BGR image array of shape (h, w, c)
        return:
            image:  the processed image, float32, shape (1, 3, input_h, input_w)
            image_raw: the original image
            h: original height
            w: original width
        """
        image_raw = raw_bgr_image
        h, w, c = image_raw.shape
        image = cv2.cvtColor(image_raw, cv2.COLOR_BGR2RGB)
        # Calculate width and height and paddings
        r_w = self.input_w / w
        r_h = self.input_h / h
        if r_h > r_w:
            # Width is the limiting side: pad top and bottom.
            tw = self.input_w
            th = int(r_w * h)
            tx1 = tx2 = 0
            ty1 = int((self.input_h - th) / 2)
            ty2 = self.input_h - th - ty1
        else:
            # Height is the limiting side: pad left and right.
            tw = int(r_h * w)
            th = self.input_h
            tx1 = int((self.input_w - tw) / 2)
            tx2 = self.input_w - tw - tx1
            ty1 = ty2 = 0
        # Resize the image with long side while maintaining ratio
        image = cv2.resize(image, (tw, th))
        # Pad the short side with (128,128,128)
        image = cv2.copyMakeBorder(
            image, ty1, ty2, tx1, tx2, cv2.BORDER_CONSTANT, None, (128, 128, 128)
        )
        image = image.astype(np.float32)
        # Normalize to [0,1]
        image /= 255.0
        # HWC to CHW format:
        image = np.transpose(image, [2, 0, 1])
        # CHW to NCHW format
        image = np.expand_dims(image, axis=0)
        # Convert the image to row-major order, also known as "C order":
        image = np.ascontiguousarray(image)
        return image, image_raw, h, w

    def xywh2xyxy_with_keypoints(self, origin_h, origin_w, boxes, keypoints):
        """
        description: Map box and keypoint coordinates from the padded network input
                     back to the original image by removing the padding offset and
                     dividing by the resize ratio. Box columns 0/2 are treated as x
                     and 1/3 as y; keypoints are (x, y, confidence) triples whose
                     confidence is copied unchanged.
        param:
            origin_h:  original image height
            origin_w:  original image width
            boxes:     array of shape (n, 4)
            keypoints: array of shape (n, 3 * k)
        return:
            box_array, keypoint_array: arrays shaped like the inputs
        """
        box_array = np.zeros_like(boxes)
        keypoint_array = np.zeros_like(keypoints)
        r_w = self.input_w / origin_w
        r_h = self.input_h / origin_h
        if r_h > r_w:
            # Image was scaled by r_w and padded vertically.
            scale = r_w
            pad_x = 0
            pad_y = (self.input_h - r_w * origin_h) / 2
        else:
            # Image was scaled by r_h and padded horizontally.
            scale = r_h
            pad_x = (self.input_w - r_h * origin_w) / 2
            pad_y = 0
        for i, box in enumerate(boxes):
            lmk = keypoints[i]
            box_array[i, 0] = (box[0] - pad_x) / scale
            box_array[i, 2] = (box[2] - pad_x) / scale
            box_array[i, 1] = (box[1] - pad_y) / scale
            box_array[i, 3] = (box[3] - pad_y) / scale
            for j in range(0, len(lmk), 3):
                keypoint_array[i, j] = (lmk[j] - pad_x) / scale
                keypoint_array[i, j + 1] = (lmk[j + 1] - pad_y) / scale
                keypoint_array[i, j + 2] = lmk[j + 2]
        return box_array, keypoint_array

    def post_process(self, output, origin_h, origin_w):
        """
        description: Post-process one image's raw output into boxes, scores,
                     class ids and pose keypoints.
        param:
            output:     1-D array; element 0 is the number of detections, the rest
                        is a sequence of rows of
                        DET_NUM + SEG_NUM + POSE_NUM + OBB_NUM values each
            origin_h:   Height of original image
            origin_w:   Width of original image
        return:
            result_boxes:     array (m, 4), columns 0-3 of the kept rows
            result_scores:    array (m,), column 4
            result_classid:   array (m,), column 5
            result_keypoints: array (m, POSE_NUM), the POSE_NUM columns before the last one
            All four are empty arrays when nothing survives filtering.
        """
        num_values_per_detection = DET_NUM + SEG_NUM + POSE_NUM + OBB_NUM
        # Get the number of boxes detected
        num = int(output[0])
        # Reshape to a two-dimensional ndarray with the full detection shape
        pred = np.reshape(output[1:], (-1, num_values_per_detection))[:num, :]

        # Perform non-maximum suppression to filter the detections
        boxes = self.non_max_suppression(
            pred[:, :num_values_per_detection], origin_h, origin_w,
            conf_thres=CONF_THRESH, nms_thres=IOU_THRESHOLD)

        if not len(boxes):
            return np.array([]), np.array([]), np.array([]), np.array([])

        result_boxes = boxes[:, :4]
        result_scores = boxes[:, 4]
        result_classid = boxes[:, 5]
        result_keypoints = boxes[:, -POSE_NUM-1:-1]
        return result_boxes, result_scores, result_classid, result_keypoints

    def bbox_iou(self, box1, box2, x1y1x2y2=True):
        """
        description: compute the IoU of two sets of bounding boxes (broadcast
                     row-wise), counting box extents inclusively (+1 per side).
        param:
            box1: 2-D array of boxes, (x1, y1, x2, y2) or (x, y, w, h) per row
            box2: 2-D array of boxes in the same format
            x1y1x2y2: True for corner format, False for center/size format
        return:
            iou: computed iou
        """
        if not x1y1x2y2:
            # Transform from center and width to exact coordinates
            b1_x1, b1_x2 = box1[:, 0] - box1[:, 2] / 2, box1[:, 0] + box1[:, 2] / 2
            b1_y1, b1_y2 = box1[:, 1] - box1[:, 3] / 2, box1[:, 1] + box1[:, 3] / 2
            b2_x1, b2_x2 = box2[:, 0] - box2[:, 2] / 2, box2[:, 0] + box2[:, 2] / 2
            b2_y1, b2_y2 = box2[:, 1] - box2[:, 3] / 2, box2[:, 1] + box2[:, 3] / 2
        else:
            # Get the coordinates of bounding boxes
            b1_x1, b1_y1, b1_x2, b1_y2 = box1[:, 0], box1[:, 1], box1[:, 2], box1[:, 3]
            b2_x1, b2_y1, b2_x2, b2_y2 = box2[:, 0], box2[:, 1], box2[:, 2], box2[:, 3]

        # Intersection rectangle; a negative extent is clipped to zero area.
        inter_w = np.clip(np.minimum(b1_x2, b2_x2) - np.maximum(b1_x1, b2_x1) + 1, 0, None)
        inter_h = np.clip(np.minimum(b1_y2, b2_y2) - np.maximum(b1_y1, b2_y1) + 1, 0, None)
        inter_area = inter_w * inter_h
        # Union Area
        b1_area = (b1_x2 - b1_x1 + 1) * (b1_y2 - b1_y1 + 1)
        b2_area = (b2_x2 - b2_x1 + 1) * (b2_y2 - b2_y1 + 1)

        # The epsilon keeps the division defined for zero-area unions.
        iou = inter_area / (b1_area + b2_area - inter_area + 1e-16)

        return iou

    def non_max_suppression(self, prediction, origin_h, origin_w, conf_thres=0.5, nms_thres=0.4):
        """
        description: Drop rows whose confidence (column 4) is below conf_thres, map
                     the remaining boxes and keypoints back to the original image,
                     clip boxes to the image and run per-class Non-Maximum Suppression.
        param:
            prediction: 2-D array of detection rows; columns 0-3 box, 4 confidence,
                        5 class id, keypoints in the POSE_NUM columns before the last
            origin_h: original image height
            origin_w: original image width
            conf_thres: a confidence threshold to filter detections
            nms_thres: a iou threshold to filter detections
        return:
            boxes: kept rows stacked in descending confidence order, or an empty
                   array when nothing is kept
        """
        # Get the boxes that score > CONF_THRESH
        boxes = prediction[prediction[:, 4] >= conf_thres]
        res_array = np.copy(boxes)
        kpt_cols = slice(-POSE_NUM - 1, -1)
        res_box, res_keypoints = self.xywh2xyxy_with_keypoints(
            origin_h, origin_w, np.copy(boxes[:, :4]), np.copy(boxes[:, kpt_cols]))
        res_array[:, :4] = res_box
        res_array[:, kpt_cols] = res_keypoints
        # clip the coordinates
        res_array[:, 0] = np.clip(res_array[:, 0], 0, origin_w - 1)
        res_array[:, 2] = np.clip(res_array[:, 2], 0, origin_w - 1)
        res_array[:, 1] = np.clip(res_array[:, 1], 0, origin_h - 1)
        res_array[:, 3] = np.clip(res_array[:, 3], 0, origin_h - 1)
        # Sort by the confs
        res_array = res_array[np.argsort(-res_array[:, 4])]
        # Perform non-maximum suppression
        keep_res_array = []
        while res_array.shape[0]:
            large_overlap = self.bbox_iou(np.expand_dims(res_array[0, :4], 0), res_array[:, :4]) > nms_thres
            label_match = res_array[0, 5] == res_array[:, 5]
            invalid = large_overlap & label_match
            # The head is always consumed. Relying on it to overlap itself above
            # the threshold loops forever for nms_thres >= 1 or NaN coordinates.
            invalid[0] = True
            keep_res_array.append(res_array[0])
            res_array = res_array[~invalid]

        res_array = np.stack(keep_res_array, 0) if len(keep_res_array) else np.array([])
        return res_array


class inferThread(threading.Thread):
    """Thread that runs one inference batch and saves the annotated images under output/."""

    def __init__(self, yolov8_wrapper, image_path_batch):
        threading.Thread.__init__(self)
        self.yolov8_wrapper = yolov8_wrapper
        self.image_path_batch = image_path_batch

    def run(self):
        batch_image_raw, use_time = self.yolov8_wrapper.infer(self.yolov8_wrapper.get_raw_image(self.image_path_batch))
        for i, img_path in enumerate(self.image_path_batch):
            parent, filename = os.path.split(img_path)
            save_name = os.path.join('output', filename)
            # Save image
            cv2.imwrite(save_name, batch_image_raw[i])
        print('input->{}, time->{:.2f}ms, saving into output/'.format(self.image_path_batch, use_time * 1000))


class warmUpThread(threading.Thread):
    """Thread that runs one inference on all-zero images and prints the elapsed time."""

    def __init__(self, yolov8_wrapper):
        threading.Thread.__init__(self)
        self.yolov8_wrapper = yolov8_wrapper

    def run(self):
        batch_image_raw, use_time = self.yolov8_wrapper.infer(self.yolov8_wrapper.get_raw_image_zeros())
        print('warm_up->{}, time->{:.2f}ms'.format(batch_image_raw[0].shape, use_time * 1000))


if __name__ == "__main__":
    # load custom plugin and engine
    PLUGIN_LIBRARY = "./build/libmyplugins.so"
    engine_file_path = "yolov8n-pose.engine"

    if len(sys.argv) > 1:
        engine_file_path = sys.argv[1]
    if len(sys.argv) > 2:
        PLUGIN_LIBRARY = sys.argv[2]

    ctypes.CDLL(PLUGIN_LIBRARY)

    # load coco labels
    categories = ["person"]

    # Start from an empty output directory on every run.
    if os.path.exists('output/'):
        shutil.rmtree('output/')
    os.makedirs('output/')
    # a YoLov8TRT instance
    yolov8_wrapper = YoLov8TRT(engine_file_path)
    try:
        print('batch size is', yolov8_wrapper.batch_size)

        image_dir = "images/"
        image_path_batches = get_img_path_batches(yolov8_wrapper.batch_size, image_dir)

        for i in range(10):
            # create a new thread to do warm_up
            thread1 = warmUpThread(yolov8_wrapper)
            thread1.start()
            thread1.join()
        for batch in image_path_batches:
            # create a new thread to do inference
            thread1 = inferThread(yolov8_wrapper, batch)
            thread1.start()
            thread1.join()
    finally:
        # destroy the instance
        yolov8_wrapper.destroy()
4); static cv::Rect get_downscale_rect(float bbox[4], float scale) { float left = bbox[0]; float top = bbox[1]; float right = bbox[0] + bbox[2]; float bottom = bbox[1] + bbox[3]; left = left < 0 ? 0 : left; top = top < 0 ? 0 : top; right = right > kInputW ? kInputW : right; bottom = bottom > kInputH ? kInputH : bottom; left /= scale; top /= scale; right /= scale; bottom /= scale; return cv::Rect(int(left), int(top), int(right - left), int(bottom - top)); } std::vector process_mask(const float* proto, int proto_size, std::vector& dets) { std::vector masks; for (size_t i = 0; i < dets.size(); i++) { cv::Mat mask_mat = cv::Mat::zeros(kInputH / 4, kInputW / 4, CV_32FC1); auto r = get_downscale_rect(dets[i].bbox, 4); for (int x = r.x; x < r.x + r.width; x++) { for (int y = r.y; y < r.y + r.height; y++) { float e = 0.0f; for (int j = 0; j < 32; j++) { e += dets[i].mask[j] * proto[j * proto_size / 32 + y * mask_mat.cols + x]; } e = 1.0f / (1.0f + expf(-e)); mask_mat.at(y, x) = e; } } cv::resize(mask_mat, mask_mat, cv::Size(kInputW, kInputH)); masks.push_back(mask_mat); } return masks; } void serialize_engine(std::string& wts_name, std::string& engine_name, std::string& sub_type, float& gd, float& gw, int& max_channels) { IBuilder* builder = createInferBuilder(gLogger); IBuilderConfig* config = builder->createBuilderConfig(); IHostMemory* serialized_engine = nullptr; serialized_engine = buildEngineYolov8Seg(builder, config, DataType::kFLOAT, wts_name, gd, gw, max_channels); assert(serialized_engine); std::ofstream p(engine_name, std::ios::binary); if (!p) { std::cout << "could not open plan output file" << std::endl; assert(false); } p.write(reinterpret_cast(serialized_engine->data()), serialized_engine->size()); delete serialized_engine; delete config; delete builder; } void deserialize_engine(std::string& engine_name, IRuntime** runtime, ICudaEngine** engine, IExecutionContext** context) { std::ifstream file(engine_name, std::ios::binary); if (!file.good()) { std::cerr 
<< "read " << engine_name << " error!" << std::endl; assert(false); } size_t size = 0; file.seekg(0, file.end); size = file.tellg(); file.seekg(0, file.beg); char* serialized_engine = new char[size]; assert(serialized_engine); file.read(serialized_engine, size); file.close(); *runtime = createInferRuntime(gLogger); assert(*runtime); *engine = (*runtime)->deserializeCudaEngine(serialized_engine, size); assert(*engine); *context = (*engine)->createExecutionContext(); assert(*context); delete[] serialized_engine; } void prepare_buffer(ICudaEngine* engine, float** input_buffer_device, float** output_buffer_device, float** output_seg_buffer_device, float** output_buffer_host, float** output_seg_buffer_host, float** decode_ptr_host, float** decode_ptr_device, std::string cuda_post_process) { assert(engine->getNbBindings() == 3); // In order to bind the buffers, we need to know the names of the input and output tensors. // Note that indices are guaranteed to be less than IEngine::getNbBindings() const int inputIndex = engine->getBindingIndex(kInputTensorName); const int outputIndex = engine->getBindingIndex(kOutputTensorName); const int outputIndex_seg = engine->getBindingIndex("proto"); assert(inputIndex == 0); assert(outputIndex == 1); assert(outputIndex_seg == 2); // Create GPU buffers on device CUDA_CHECK(cudaMalloc((void**)input_buffer_device, kBatchSize * 3 * kInputH * kInputW * sizeof(float))); CUDA_CHECK(cudaMalloc((void**)output_buffer_device, kBatchSize * kOutputSize * sizeof(float))); CUDA_CHECK(cudaMalloc((void**)output_seg_buffer_device, kBatchSize * kOutputSegSize * sizeof(float))); if (cuda_post_process == "c") { *output_buffer_host = new float[kBatchSize * kOutputSize]; *output_seg_buffer_host = new float[kBatchSize * kOutputSegSize]; } else if (cuda_post_process == "g") { if (kBatchSize > 1) { std::cerr << "Do not yet support GPU post processing for multiple batches" << std::endl; exit(0); } // Allocate memory for decode_ptr_host and copy to device 
*decode_ptr_host = new float[1 + kMaxNumOutputBbox * bbox_element]; CUDA_CHECK(cudaMalloc((void**)decode_ptr_device, sizeof(float) * (1 + kMaxNumOutputBbox * bbox_element))); } } void infer(IExecutionContext& context, cudaStream_t& stream, void** buffers, float* output, float* output_seg, int batchsize, float* decode_ptr_host, float* decode_ptr_device, int model_bboxes, std::string cuda_post_process) { // infer on the batch asynchronously, and DMA output back to host auto start = std::chrono::system_clock::now(); context.enqueue(batchsize, buffers, stream, nullptr); if (cuda_post_process == "c") { std::cout << "kOutputSize:" << kOutputSize << std::endl; CUDA_CHECK(cudaMemcpyAsync(output, buffers[1], batchsize * kOutputSize * sizeof(float), cudaMemcpyDeviceToHost, stream)); std::cout << "kOutputSegSize:" << kOutputSegSize << std::endl; CUDA_CHECK(cudaMemcpyAsync(output_seg, buffers[2], batchsize * kOutputSegSize * sizeof(float), cudaMemcpyDeviceToHost, stream)); auto end = std::chrono::system_clock::now(); std::cout << "inference time: " << std::chrono::duration_cast(end - start).count() << "ms" << std::endl; } else if (cuda_post_process == "g") { CUDA_CHECK( cudaMemsetAsync(decode_ptr_device, 0, sizeof(float) * (1 + kMaxNumOutputBbox * bbox_element), stream)); cuda_decode((float*)buffers[1], model_bboxes, kConfThresh, decode_ptr_device, kMaxNumOutputBbox, stream); cuda_nms(decode_ptr_device, kNmsThresh, kMaxNumOutputBbox, stream); //cuda nms CUDA_CHECK(cudaMemcpyAsync(decode_ptr_host, decode_ptr_device, sizeof(float) * (1 + kMaxNumOutputBbox * bbox_element), cudaMemcpyDeviceToHost, stream)); auto end = std::chrono::system_clock::now(); std::cout << "inference and gpu postprocess time: " << std::chrono::duration_cast(end - start).count() << "ms" << std::endl; } CUDA_CHECK(cudaStreamSynchronize(stream)); } bool parse_args(int argc, char** argv, std::string& wts, std::string& engine, std::string& img_dir, std::string& sub_type, std::string& cuda_post_process, 
std::string& labels_filename, float& gd, float& gw, int& max_channels) { if (argc < 4) return false; if (std::string(argv[1]) == "-s" && argc == 5) { wts = std::string(argv[2]); engine = std::string(argv[3]); sub_type = std::string(argv[4]); if (sub_type == "n") { gd = 0.33; gw = 0.25; max_channels = 1024; } else if (sub_type == "s") { gd = 0.33; gw = 0.50; max_channels = 1024; } else if (sub_type == "m") { gd = 0.67; gw = 0.75; max_channels = 576; } else if (sub_type == "l") { gd = 1.0; gw = 1.0; max_channels = 512; } else if (sub_type == "x") { gd = 1.0; gw = 1.25; max_channels = 640; } else { return false; } } else if (std::string(argv[1]) == "-d" && argc == 6) { engine = std::string(argv[2]); img_dir = std::string(argv[3]); cuda_post_process = std::string(argv[4]); labels_filename = std::string(argv[5]); } else { return false; } return true; } int main(int argc, char** argv) { cudaSetDevice(kGpuId); std::string wts_name = ""; std::string engine_name = ""; std::string img_dir; std::string sub_type = ""; std::string cuda_post_process = ""; std::string labels_filename = "../coco.txt"; int model_bboxes; float gd = 0.0f, gw = 0.0f; int max_channels = 0; if (!parse_args(argc, argv, wts_name, engine_name, img_dir, sub_type, cuda_post_process, labels_filename, gd, gw, max_channels)) { std::cerr << "Arguments not right!" 
<< std::endl; std::cerr << "./yolov8 -s [.wts] [.engine] [n/s/m/l/x] // serialize model to plan file" << std::endl; std::cerr << "./yolov8 -d [.engine] ../samples [c/g] coco_file// deserialize plan file and run inference" << std::endl; return -1; } // Create a model using the API directly and serialize it to a file if (!wts_name.empty()) { serialize_engine(wts_name, engine_name, sub_type, gd, gw, max_channels); return 0; } // Deserialize the engine from file IRuntime* runtime = nullptr; ICudaEngine* engine = nullptr; IExecutionContext* context = nullptr; deserialize_engine(engine_name, &runtime, &engine, &context); cudaStream_t stream; CUDA_CHECK(cudaStreamCreate(&stream)); cuda_preprocess_init(kMaxInputImageSize); auto out_dims = engine->getBindingDimensions(1); model_bboxes = out_dims.d[0]; // Prepare cpu and gpu buffers float* device_buffers[3]; float* output_buffer_host = nullptr; float* output_seg_buffer_host = nullptr; float* decode_ptr_host = nullptr; float* decode_ptr_device = nullptr; // Read images from directory std::vector file_names; if (read_files_in_dir(img_dir.c_str(), file_names) < 0) { std::cerr << "read_files_in_dir failed." 
<< std::endl; return -1; } std::unordered_map labels_map; read_labels(labels_filename, labels_map); assert(kNumClass == labels_map.size()); prepare_buffer(engine, &device_buffers[0], &device_buffers[1], &device_buffers[2], &output_buffer_host, &output_seg_buffer_host, &decode_ptr_host, &decode_ptr_device, cuda_post_process); // // batch predict for (size_t i = 0; i < file_names.size(); i += kBatchSize) { // Get a batch of images std::vector img_batch; std::vector img_name_batch; for (size_t j = i; j < i + kBatchSize && j < file_names.size(); j++) { cv::Mat img = cv::imread(img_dir + "/" + file_names[j]); img_batch.push_back(img); img_name_batch.push_back(file_names[j]); } // Preprocess cuda_batch_preprocess(img_batch, device_buffers[0], kInputW, kInputH, stream); // Run inference infer(*context, stream, (void**)device_buffers, output_buffer_host, output_seg_buffer_host, kBatchSize, decode_ptr_host, decode_ptr_device, model_bboxes, cuda_post_process); std::vector> res_batch; if (cuda_post_process == "c") { // NMS batch_nms(res_batch, output_buffer_host, img_batch.size(), kOutputSize, kConfThresh, kNmsThresh); for (size_t b = 0; b < img_batch.size(); b++) { auto& res = res_batch[b]; cv::Mat img = img_batch[b]; auto masks = process_mask(&output_seg_buffer_host[b * kOutputSegSize], kOutputSegSize, res); draw_mask_bbox(img, res, masks, labels_map); cv::imwrite("_" + img_name_batch[b], img); } } else if (cuda_post_process == "g") { // Process gpu decode and nms results // batch_process(res_batch, decode_ptr_host, img_batch.size(), bbox_element, img_batch); // todo seg in gpu std::cerr << "seg_postprocess is not support in gpu right now" << std::endl; } } // Release stream and buffers cudaStreamDestroy(stream); CUDA_CHECK(cudaFree(device_buffers[0])); CUDA_CHECK(cudaFree(device_buffers[1])); CUDA_CHECK(cudaFree(device_buffers[2])); CUDA_CHECK(cudaFree(decode_ptr_device)); delete[] decode_ptr_host; delete[] output_buffer_host; delete[] output_seg_buffer_host; 
def get_img_path_batches(batch_size, img_dir):
    """
    description: Walk img_dir recursively and group every file path found
                 into consecutive batches.
    param:
        batch_size: int, number of paths a batch holds before a new one is started
        img_dir:    str, root directory to scan
    return:
        a list of batches, each batch being a list of file paths; the last
        batch may hold fewer than batch_size paths
    """
    # Flatten the directory walk into a single lazy stream of full paths.
    all_paths = (
        os.path.join(folder, filename)
        for folder, _subdirs, filenames in os.walk(img_dir)
        for filename in filenames
    )

    batches = []
    current = []
    for path in all_paths:
        # Close the running batch once it is full, before adding the next path.
        if len(current) == batch_size:
            batches.append(current)
            current = []
        current.append(path)

    # Keep the trailing, possibly shorter, batch.
    if current:
        batches.append(current)
    return batches
class YoLov8TRT(object):
    """
    description: A YOLOv8 segmentation class that wraps a TensorRT engine together with
                 its preprocess and postprocess ops (NMS, mask decoding and drawing).
    """

    def __init__(self, engine_file_path):
        """
        description: Create a CUDA context on device 0, deserialize the engine and allocate
                     one page-locked host buffer plus one device buffer per binding.
        param:
            engine_file_path: str, path of the serialized TensorRT engine
        """
        # Create a Context on this device,
        self.ctx = cuda.Device(0).make_context()
        stream = cuda.Stream()
        TRT_LOGGER = trt.Logger(trt.Logger.INFO)
        runtime = trt.Runtime(TRT_LOGGER)

        # Deserialize the engine from file
        with open(engine_file_path, "rb") as f:
            engine = runtime.deserialize_cuda_engine(f.read())
        context = engine.create_execution_context()

        host_inputs = []
        cuda_inputs = []
        host_outputs = []
        cuda_outputs = []
        bindings = []

        for binding in engine:
            print('bingding:', binding, engine.get_binding_shape(binding))
            size = trt.volume(engine.get_binding_shape(binding)) * engine.max_batch_size
            dtype = trt.nptype(engine.get_binding_dtype(binding))
            # Allocate host and device buffers
            host_mem = cuda.pagelocked_empty(size, dtype)
            cuda_mem = cuda.mem_alloc(host_mem.nbytes)
            # Append the device buffer to device bindings.
            bindings.append(int(cuda_mem))
            # Append to the appropriate list.
            if engine.binding_is_input(binding):
                self.input_w = engine.get_binding_shape(binding)[-1]
                self.input_h = engine.get_binding_shape(binding)[-2]
                host_inputs.append(host_mem)
                cuda_inputs.append(cuda_mem)
            else:
                host_outputs.append(host_mem)
                cuda_outputs.append(cuda_mem)

        # Store
        self.stream = stream
        self.context = context
        self.engine = engine
        self.host_inputs = host_inputs
        self.cuda_inputs = cuda_inputs
        self.host_outputs = host_outputs
        self.cuda_outputs = cuda_outputs
        self.bindings = bindings
        self.batch_size = engine.max_batch_size

        # Data length. Every buffer above was sized as volume * max_batch_size, so divide
        # by the batch size to obtain the per-image length that infer() uses as a stride.
        # (Identical to the raw buffer length when the batch size is 1.)
        self.det_output_length = host_outputs[0].shape[0] // self.batch_size
        self.seg_output_length = host_outputs[1].shape[0] // self.batch_size
        self.seg_w = int(self.input_w / 4)
        self.seg_h = int(self.input_h / 4)
        # Number of prototype channels = per-image length / (mask height * mask width).
        # Using seg_h here (not seg_w twice) keeps this right for non-square inputs.
        self.seg_c = int(self.seg_output_length / (self.seg_h * self.seg_w))
        self.det_row_output_length = self.seg_c + DET_NUM + POSE_NUM + OBB_NUM

        # Draw mask
        self.colors_obj = Colors()

    def infer(self, raw_image_generator):
        """
        description: Preprocess a batch of images, run the engine and draw masks, boxes
                     and labels onto the original images.
        param:
            raw_image_generator: iterable yielding BGR images (at most batch_size of them)
        return:
            batch_image_raw: list of the original images with the results drawn in place
            use_time:        seconds spent on host->device copy, inference and device->host copy
        note:
            label text is looked up in the module-level ``categories`` list.
        """
        # Make self the active context, pushing it on top of the context stack.
        self.ctx.push()
        try:
            # Restore
            stream = self.stream
            context = self.context
            host_inputs = self.host_inputs
            cuda_inputs = self.cuda_inputs
            host_outputs = self.host_outputs
            cuda_outputs = self.cuda_outputs
            bindings = self.bindings
            # Do image preprocess
            batch_image_raw = []
            batch_origin_h = []
            batch_origin_w = []
            batch_input_image = np.empty(shape=[self.batch_size, 3, self.input_h, self.input_w])
            for i, image_raw in enumerate(raw_image_generator):
                input_image, image_raw, origin_h, origin_w = self.preprocess_image(image_raw)
                batch_image_raw.append(image_raw)
                batch_origin_h.append(origin_h)
                batch_origin_w.append(origin_w)
                np.copyto(batch_input_image[i], input_image)
            batch_input_image = np.ascontiguousarray(batch_input_image)

            # Copy input image to host buffer
            np.copyto(host_inputs[0], batch_input_image.ravel())
            start = time.time()
            # Transfer input data to the GPU.
            cuda.memcpy_htod_async(cuda_inputs[0], host_inputs[0], stream)
            # Run inference.
            context.execute_async(batch_size=self.batch_size, bindings=bindings, stream_handle=stream.handle)
            # Transfer predictions back from the GPU.
            cuda.memcpy_dtoh_async(host_outputs[0], cuda_outputs[0], stream)
            cuda.memcpy_dtoh_async(host_outputs[1], cuda_outputs[1], stream)
            # Synchronize the stream
            stream.synchronize()
            end = time.time()
        finally:
            # Remove any context from the top of the context stack, deactivating it.
            # Done in ``finally`` so a failure above cannot leave the context pushed.
            self.ctx.pop()

        output = host_outputs[0]
        output_proto_mask = host_outputs[1]
        det_len = self.det_output_length
        seg_len = self.seg_output_length
        # Do postprocess. Only iterate over the images actually supplied: the last
        # batch of a directory may hold fewer than batch_size images.
        for i, image_raw in enumerate(batch_image_raw):
            result_boxes, result_scores, result_classid, result_proto_coef = self.post_process(
                output[i * det_len: (i + 1) * det_len], batch_origin_h[i], batch_origin_w[i]
            )

            if result_proto_coef.shape[0] == 0:
                continue
            # Each image has its own prototype masks; take this image's slice.
            result_masks = self.process_mask(
                output_proto_mask[i * seg_len: (i + 1) * seg_len],
                result_proto_coef, result_boxes, batch_origin_h[i], batch_origin_w[i])

            self.draw_mask(result_masks, colors_=[self.colors_obj(x, True) for x in result_classid],
                           im_src=image_raw)

            # Draw rectangles and labels on the original image
            for j in range(len(result_boxes)):
                box = result_boxes[j]
                plot_one_box(
                    box,
                    image_raw,
                    label="{}:{:.2f}".format(
                        categories[int(result_classid[j])], result_scores[j]
                    ),
                )
        return batch_image_raw, end - start

    def destroy(self):
        # Remove any context from the top of the context stack, deactivating it.
        self.ctx.pop()

    def get_raw_image(self, image_path_batch):
        """
        description: Read an image from image path
        """
        for img_path in image_path_batch:
            yield cv2.imread(img_path)

    def get_raw_image_zeros(self, image_path_batch=None):
        """
        description: Ready data for warmup
        """
        for _ in range(self.batch_size):
            yield np.zeros([self.input_h, self.input_w, 3], dtype=np.uint8)

    def preprocess_image(self, raw_bgr_image):
        """
        description: Convert BGR image to RGB,
                     resize and pad it to target size, normalize to [0,1],
                     transform to NCHW format.
        param:
            raw_bgr_image: a BGR image array of shape (h, w, c)
        return:
            image:  the processed image
            image_raw: the original image
            h: original height
            w: original width
        """
        image_raw = raw_bgr_image
        h, w, c = image_raw.shape
        image = cv2.cvtColor(image_raw, cv2.COLOR_BGR2RGB)
        # Calculate width and height and paddings
        r_w = self.input_w / w
        r_h = self.input_h / h
        if r_h > r_w:
            tw = self.input_w
            th = int(r_w * h)
            tx1 = tx2 = 0
            ty1 = int((self.input_h - th) / 2)
            ty2 = self.input_h - th - ty1
        else:
            tw = int(r_h * w)
            th = self.input_h
            tx1 = int((self.input_w - tw) / 2)
            tx2 = self.input_w - tw - tx1
            ty1 = ty2 = 0
        # Resize the image with long side while maintaining ratio
        image = cv2.resize(image, (tw, th))
        # Pad the short side with (128,128,128)
        image = cv2.copyMakeBorder(
            image, ty1, ty2, tx1, tx2, cv2.BORDER_CONSTANT, None, (128, 128, 128)
        )
        image = image.astype(np.float32)
        # Normalize to [0,1]
        image /= 255.0
        # HWC to CHW format:
        image = np.transpose(image, [2, 0, 1])
        # CHW to NCHW format
        image = np.expand_dims(image, axis=0)
        # Convert the image to row-major order, also known as "C order":
        image = np.ascontiguousarray(image)
        return image, image_raw, h, w

    def xywh2xyxy(self, origin_h, origin_w, x):
        """
        description:    Map nx4 boxes from the padded network-input frame back to the
                        original image frame: the padding offset is subtracted and the
                        result is divided by the resize ratio.
                        Despite the method name, no center/size conversion happens here:
                        columns 0 and 2 are handled as x coordinates and columns 1 and 3
                        as y coordinates.
        param:
            origin_h:   height of original image
            origin_w:   width of original image
            x:          A boxes numpy, each row is a box in network-input coordinates
        return:
            y:          A boxes numpy, each row is the box in original image coordinates
        """
        y = np.zeros_like(x)
        r_w = self.input_w / origin_w
        r_h = self.input_h / origin_h
        if r_h > r_w:
            # Width-limited resize: padding was added above and below.
            y[:, 0] = x[:, 0]
            y[:, 2] = x[:, 2]
            y[:, 1] = x[:, 1] - (self.input_h - r_w * origin_h) / 2
            y[:, 3] = x[:, 3] - (self.input_h - r_w * origin_h) / 2
            y /= r_w
        else:
            # Height-limited resize: padding was added left and right.
            y[:, 0] = x[:, 0] - (self.input_w - r_h * origin_w) / 2
            y[:, 2] = x[:, 2] - (self.input_w - r_h * origin_w) / 2
            y[:, 1] = x[:, 1]
            y[:, 3] = x[:, 3]
            y /= r_h

        return y

    def post_process(self, output, origin_h, origin_w):
        """
        description: postprocess the prediction
        param:
            output:     A flat numpy array for one image: the number of boxes followed by
                        rows of det_row_output_length values each
            origin_h:   height of original image
            origin_w:   width of original image
        return:
            result_boxes: finally boxes, a boxes numpy, each row is a box [x1, y1, x2, y2]
            result_scores: finally scores, a numpy, each element is the score corresponding to box
            result_classid: finally classid, a numpy, each element is the classid corresponding to box
            result_proto_coef: mask coefficients, columns DET_NUM..DET_NUM+SEG_NUM of each kept row
        """
        # Get the num of boxes detected
        num = int(output[0])
        # Reshape to a two dimensional ndarray
        pred = np.reshape(output[1:], (-1, self.det_row_output_length))[:num, :]

        # Do nms
        boxes = self.non_max_suppression(pred, origin_h, origin_w, conf_thres=CONF_THRESH, nms_thres=IOU_THRESHOLD)
        result_boxes = boxes[:, :4] if len(boxes) else np.array([])
        result_scores = boxes[:, 4] if len(boxes) else np.array([])
        result_classid = boxes[:, 5] if len(boxes) else np.array([])
        result_proto_coef = boxes[:, DET_NUM:int(DET_NUM + SEG_NUM)] if len(boxes) else np.array([])
        return result_boxes, result_scores, result_classid, result_proto_coef

    def bbox_iou(self, box1, box2, x1y1x2y2=True):
        """
        description: compute the IoU of two bounding boxes
        param:
            box1: A box coordinate (can be (x1, y1, x2, y2) or (x, y, w, h))
            box2: A box coordinate (can be (x1, y1, x2, y2) or (x, y, w, h))
            x1y1x2y2: select the coordinate format
        return:
            iou: computed iou
        """
        if not x1y1x2y2:
            # Transform from center and width to exact coordinates
            b1_x1, b1_x2 = box1[:, 0] - box1[:, 2] / 2, box1[:, 0] + box1[:, 2] / 2
            b1_y1, b1_y2 = box1[:, 1] - box1[:, 3] / 2, box1[:, 1] + box1[:, 3] / 2
            b2_x1, b2_x2 = box2[:, 0] - box2[:, 2] / 2, box2[:, 0] + box2[:, 2] / 2
            b2_y1, b2_y2 = box2[:, 1] - box2[:, 3] / 2, box2[:, 1] + box2[:, 3] / 2
        else:
            # Get the coordinates of bounding boxes
            b1_x1, b1_y1, b1_x2, b1_y2 = box1[:, 0], box1[:, 1], box1[:, 2], box1[:, 3]
            b2_x1, b2_y1, b2_x2, b2_y2 = box2[:, 0], box2[:, 1], box2[:, 2], box2[:, 3]

        # Get the coordinates of the intersection rectangle
        inter_rect_x1 = np.maximum(b1_x1, b2_x1)
        inter_rect_y1 = np.maximum(b1_y1, b2_y1)
        inter_rect_x2 = np.minimum(b1_x2, b2_x2)
        inter_rect_y2 = np.minimum(b1_y2, b2_y2)
        # Intersection area (the +1 treats coordinates as inclusive pixel indices)
        inter_area = (np.clip(inter_rect_x2 - inter_rect_x1 + 1, 0, None)
                      * np.clip(inter_rect_y2 - inter_rect_y1 + 1, 0, None))
        # Union Area
        b1_area = (b1_x2 - b1_x1 + 1) * (b1_y2 - b1_y1 + 1)
        b2_area = (b2_x2 - b2_x1 + 1) * (b2_y2 - b2_y1 + 1)

        # The small epsilon avoids a division by zero.
        iou = inter_area / (b1_area + b2_area - inter_area + 1e-16)

        return iou

    def non_max_suppression(self, prediction, origin_h, origin_w, conf_thres=0.5, nms_thres=0.4):
        """
        description: Removes detections with lower object confidence score than 'conf_thres' and performs
        Non-Maximum Suppression to further filter detections.
        param:
            prediction: detections, one row per box; columns 0-3 are the box, 4 the
                        confidence, 5 the class id, any further columns are kept untouched
            origin_h: original image height
            origin_w: original image width
            conf_thres: a confidence threshold to filter detections
            nms_thres: a iou threshold to filter detections
        return:
            boxes: output after nms, same row layout as the input with the box columns
                   mapped to original image coordinates; an empty array when nothing is kept
        """
        # Get the boxes that score >= conf_thres
        boxes = prediction[prediction[:, 4] >= conf_thres]
        # Map the box columns back to original image coordinates
        boxes[:, :4] = self.xywh2xyxy(origin_h, origin_w, boxes[:, :4])
        # clip the coordinates
        boxes[:, 0] = np.clip(boxes[:, 0], 0, origin_w - 1)
        boxes[:, 2] = np.clip(boxes[:, 2], 0, origin_w - 1)
        boxes[:, 1] = np.clip(boxes[:, 1], 0, origin_h - 1)
        boxes[:, 3] = np.clip(boxes[:, 3], 0, origin_h - 1)
        # Object confidence
        confs = boxes[:, 4]
        # Sort by the confs
        boxes = boxes[np.argsort(-confs)]
        # Perform non-maximum suppression
        keep_boxes = []
        while boxes.shape[0]:
            large_overlap = self.bbox_iou(np.expand_dims(boxes[0, :4], 0), boxes[:, :4]) > nms_thres
            label_match = boxes[0, 5] == boxes[:, 5]
            # Indices of boxes with lower confidence scores, large IOUs and matching labels
            invalid = large_overlap & label_match
            keep_boxes += [boxes[0]]
            boxes = boxes[~invalid]
        boxes = np.stack(keep_boxes, 0) if len(keep_boxes) else np.array([])
        return boxes

    def sigmoid(self, x):
        """
        description: Element-wise logistic function 1 / (1 + exp(-x))
        """
        return 1 / (1 + np.exp(-x))

    def scale_mask(self, mask, ih, iw):
        """
        description: Resize a prototype-resolution mask to the network input size, cut away
                     the padded border and resize the remainder to the original image size.
        param:
            mask: a single 2-D mask
            ih:   rows of original image
            iw:   cols of original image
        return:
            crop: the mask resized to (ih, iw)
        """
        mask = cv2.resize(mask, (self.input_w, self.input_h))
        r_w = self.input_w / (iw * 1.0)
        r_h = self.input_h / (ih * 1.0)
        if r_h > r_w:
            w = self.input_w
            h = int(r_w * ih)
            x = 0
            y = int((self.input_h - h) / 2)
        else:
            w = int(r_h * iw)
            h = self.input_h
            x = int((self.input_w - w) / 2)
            y = 0
        crop = mask[y:y + h, x:x + w]
        crop = cv2.resize(crop, (iw, ih))
        return crop

    def process_mask(self, output_proto_mask, result_proto_coef, result_boxes, ih, iw):
        """
        description: Mask pred by yolov8 instance segmentation ,
        param:
            output_proto_mask: prototype masks of one image, seg_c * seg_h * seg_w values
                               e.g. (32, 160, 160) for 640x640 input
            result_proto_coef: prototype mask coefficients (n, 32), n represents n results
            result_boxes     : boxes of the n results in original image coordinates
            ih: rows of original image
            iw: cols of original image
        return:
            mask_result: (n, ih, iw)
        """
        result_proto_masks = output_proto_mask.reshape(self.seg_c, self.seg_h, self.seg_w)
        c, mh, mw = result_proto_masks.shape
        # Linear combination of the prototypes per detection, squashed to (0, 1).
        masks = self.sigmoid((result_proto_coef @ result_proto_masks.astype(np.float32).reshape(c, -1))).reshape(-1, mh, mw)

        mask_result = []
        for mask, box in zip(masks, result_boxes):
            mask_s = np.zeros((ih, iw))
            crop_mask = self.scale_mask(mask, ih, iw)
            x1 = int(box[0])
            y1 = int(box[1])
            x2 = int(box[2])
            y2 = int(box[3])
            # Only the part of the mask inside the detection box is kept and binarized.
            crop = crop_mask[y1:y2, x1:x2]
            crop = np.where(crop >= 0.5, 1, 0)
            crop = crop.astype(np.uint8)
            mask_s[y1:y2, x1:x2] = crop
            mask_result.append(mask_s)
        mask_result = np.array(mask_result)
        return mask_result

    def draw_mask(self, masks, colors_, im_src, alpha=0.5):
        """
        description: Draw mask on image ,
        param:
            masks  : result_mask, (n, h, w)
            colors_: color to draw mask, one color triple per mask
            im_src : original image, modified in place
            alpha  : scale between original image and mask
        return:
            no return
        """
        if len(masks) == 0:
            return
        masks = np.asarray(masks, dtype=np.uint8)
        masks = np.ascontiguousarray(masks.transpose(1, 2, 0))
        masks = np.asarray(masks, dtype=np.float32)
        colors_ = np.asarray(colors_, dtype=np.float32)
        # s is 1 wherever at least one mask covers the pixel, else 0.
        s = masks.sum(2, keepdims=True).clip(0, 1)
        masks = (masks @ colors_).clip(0, 255)
        im_src[:] = masks * alpha + im_src * (1 - s * alpha)
# Build script for the YOLOv9 TensorRT demo: a plugin shared library
# (myplugins) and the yolov9 executable.
#
# The toolchain locations below are only defaults; override them on the
# command line without editing this file, e.g.
#   cmake -DTENSORRT_PATH=/opt/TensorRT -DCMAKE_BUILD_TYPE=Release ..
cmake_minimum_required(VERSION 3.10)

project(TRTCreater)

add_definitions(-w)
add_definitions(-std=c++11)
add_definitions(-DAPI_EXPORTS)
set(CMAKE_CXX_STANDARD 11)

# Debug stays the default build type, but an explicit -DCMAKE_BUILD_TYPE=...
# is no longer overwritten.
if(NOT CMAKE_BUILD_TYPE)
  set(CMAKE_BUILD_TYPE Debug)
endif()
set(CMAKE_CUDA_ARCHITECTURES 75 86 89)

message(STATUS "operation system is ${CMAKE_SYSTEM}")
if(CMAKE_SYSTEM_NAME MATCHES "Linux")
  message(STATUS "current platform: Linux ")
  if(NOT DEFINED CUDA_COMPILER_PATH)
    set(CUDA_COMPILER_PATH "/usr/local/cuda/bin/nvcc")
  endif()
  if(NOT DEFINED TENSORRT_PATH)
    set(TENSORRT_PATH "/home/benol/Package/TensorRT-8.6.1.6")
  endif()
  include_directories(/usr/local/cuda/include)
  link_directories(/usr/local/cuda/lib64)
  link_directories(/usr/local/cuda/lib)
elseif(CMAKE_SYSTEM_NAME MATCHES "Windows")
  message(STATUS "current platform: Windows")
  if(NOT DEFINED CUDA_COMPILER_PATH)
    set(CUDA_COMPILER_PATH "C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v12.2/bin/nvcc.exe")
  endif()
  if(NOT DEFINED TENSORRT_PATH)
    set(TENSORRT_PATH "D:\\Program Files\\TensorRT-8.6.1.6")
  endif()
  if(NOT DEFINED OpenCV_DIR)
    set(OpenCV_DIR "D:\\Program Files\\opencv\\build")
  endif()
  include_directories(${PROJECT_SOURCE_DIR}/windows)
  find_package(CUDA REQUIRED)
  include_directories(${CUDA_INCLUDE_DIRS})
  link_directories(${CUDA_LIBRARIES})
else()
  # Plain fallback for every system that is neither Linux nor Windows; the
  # processor type is only printed, it is not tested.
  message(STATUS "other platform: ${CMAKE_SYSTEM_PROCESSOR}")
  include_directories(/usr/local/cuda/targets/aarch64-linux/include)
  link_directories(/usr/local/cuda/targets/aarch64-linux/lib)
endif()

# Only point CMake at an explicit nvcc when one was configured above or given
# by the user; otherwise let enable_language(CUDA) locate the compiler itself.
if(CUDA_COMPILER_PATH)
  set(CMAKE_CUDA_COMPILER ${CUDA_COMPILER_PATH})
endif()
enable_language(CUDA)

# tensorrt
include_directories(${TENSORRT_PATH}/include)
link_directories(${TENSORRT_PATH}/lib)

find_package(OpenCV)
include_directories(${OpenCV_INCLUDE_DIRS})

include_directories(${PROJECT_SOURCE_DIR}/include/)
include_directories(${PROJECT_SOURCE_DIR}/plugin/)
file(GLOB_RECURSE SRCS ${PROJECT_SOURCE_DIR}/src/*.cpp ${PROJECT_SOURCE_DIR}/src/*.cu)
file(GLOB_RECURSE PLUGIN_SRCS ${PROJECT_SOURCE_DIR}/plugin/*.cu)

# Plugin layers are built as a shared library so the Python script can load them too.
add_library(myplugins SHARED ${PLUGIN_SRCS})
target_link_libraries(myplugins nvinfer cudart)

add_executable(yolov9 demo.cpp ${SRCS})
target_link_libraries(yolov9 nvinfer cudart myplugins ${OpenCV_LIBS})
## Contributors

## Progress

- [x] YOLOv9-t
- [x] YOLOv9-t-convert(gelan)
- [x] YOLOv9-s
- [x] YOLOv9-s-convert(gelan)
- [x] YOLOv9-m
- [x] YOLOv9-m-convert(gelan)
- [x] YOLOv9-c
- [x] YOLOv9-c-convert(gelan)
- [x] YOLOv9-e
- [x] YOLOv9-e-convert(gelan)

## Requirements

- TensorRT 8.0+
- OpenCV 3.4.0+

## Speed Test

The speed test was run on a desktop with an R7-5700G CPU and an RTX 4060Ti GPU. The input size is 640x640. FP32, FP16 and INT8 models were tested. The reported time covers inference only; pre-processing and post-processing are not included. Each value is the average over 1000 inference runs.

| Framework | Model | FP32 | FP16 | INT8 |
| --- | --- | --- | --- | --- |
| tensorrt | YOLOv5-n | -ms | 0.58ms | -ms |
| tensorrt | YOLOv5-s | -ms | 0.90ms | -ms |
| tensorrt | YOLOv5-m | -ms | 1.9ms | -ms |
| tensorrt | YOLOv5-l | -ms | 2.8ms | -ms |
| tensorrt | YOLOv5-x | -ms | 5.1ms | -ms |
| tensorrt | YOLOv9-t-convert | -ms | 1.37ms | -ms |
| tensorrt | YOLOv9-s | -ms | 1.78ms | -ms |
| tensorrt | YOLOv9-s-convert | -ms | 1.78ms | -ms |
| tensorrt | YOLOv9-m | -ms | 3.1ms | -ms |
| tensorrt | YOLOv9-m-convert | -ms | 2.8ms | -ms |
| tensorrt | YOLOv9-c | 13.5ms | 4.6ms | 3.0ms |
| tensorrt | YOLOv9-e | 8.3ms | 3.2ms | 2.15ms |

**GELAN will be updated later.**

YOLOv9-e is faster than YOLOv9-c in TensorRT because YOLOv9-e needs fewer layers at inference time.

```
YOLOv9-c: [[31, 34, 37, 16, 19, 22], 1, DualDDetect, [nc]] # [A3, A4, A5, P3, P4, P5]
YOLOv9-e: [[35, 32, 29, 42, 45, 48], 1, DualDDetect, [nc]]
```

In DualDDetect, A3, A4, A5, P3, P4 and P5 are outputs of the backbone. Only the first three are used to compute the final result, so YOLOv9-c has to run 37 layers at inference time while YOLOv9-e has to run only 35.

## How to Run, yolov9 as example

1.
generate .wts from pytorch with .pt, or download .wts from model zoo

```
// download https://github.com/WongKinYiu/yolov9
cp {tensorrtx}/yolov9/gen_wts.py {yolov9}/yolov9
cd {yolov9}/yolov9
python gen_wts.py
// a file 'yolov9.wts' will be generated.
```

2. build tensorrtx/yolov9 and run

```
cd {tensorrtx}/yolov9/
// update kNumClass in config.h if your model is trained on a custom dataset
mkdir build
cd build
cp {yolov9}/yolov9/yolov9.wts {tensorrtx}/yolov9/build
cmake ..
make
sudo ./yolov9 -s [.wts] [.engine] [t/s/m/c/e/gt/gs/gm/gc/ge]  // serialize model to plan file
sudo ./yolov9 -d [.engine] [image folder]  // deserialize and run inference, the images in [image folder] will be processed.
// For example yolov9
sudo ./yolov9 -s yolov9-c.wts yolov9-c.engine c
sudo ./yolov9 -d yolov9-c.engine ../images
```

3. check the generated images, e.g. _zidane.jpg and _bus.jpg

4. optional, load and run the tensorrt model in python

```
// install python-tensorrt, pycuda, etc.
// ensure the yolov9.engine and libmyplugins.so have been built
python yolov9_trt.py
```

# INT8 Quantization

1. Prepare calibration images; you can randomly select about 1000 images from your training set. For coco, you can also download my calibration images `coco_calib` from [GoogleDrive](https://drive.google.com/drive/folders/1s7jE9DtOngZMzJC1uL307J2MiaGwdRSI?usp=sharing) or [BaiduPan](https://pan.baidu.com/s/1GOm_-JobpyLMAqZWCDUhKg) pwd: a9wh

2. unzip it in yolov9/build

3. set the macro `USE_INT8` in config.h and change the path of calibration images in config.h, such as 'gCalibTablePath="./coco_calib/";'

4. serialize the model and test

## More Information See the readme in [home page.](https://github.com/wang-xinyu/tensorrtx) ================================================ FILE: yolov9/demo.cpp ================================================ #include #include #include "config.h" #include "cuda_utils.h" #include "logging.h" #include "model.h" #include "postprocess.h" #include "preprocess.h" #include "utils.h" using namespace nvinfer1; const static int kOutputSize = kMaxNumOutputBbox * sizeof(Detection) / sizeof(float) + 1; static Logger gLogger; void serialize_engine(unsigned int max_batchsize, std::string& wts_name, std::string& sub_type, std::string& engine_name) { // Create builder IBuilder* builder = createInferBuilder(gLogger); IBuilderConfig* config = builder->createBuilderConfig(); // Create model to populate the network, then set the outputs and create an engine IHostMemory* serialized_engine = nullptr; if (sub_type == "t") { serialized_engine = build_engine_yolov9_t(max_batchsize, builder, config, DataType::kFLOAT, wts_name, false); } else if (sub_type == "s") { serialized_engine = build_engine_yolov9_s(max_batchsize, builder, config, DataType::kFLOAT, wts_name, false); } else if (sub_type == "m") { serialized_engine = build_engine_yolov9_m(max_batchsize, builder, config, DataType::kFLOAT, wts_name, false); } else if (sub_type == "c") { serialized_engine = build_engine_yolov9_c(max_batchsize, builder, config, DataType::kFLOAT, wts_name); } else if (sub_type == "e") { serialized_engine = build_engine_yolov9_e(max_batchsize, builder, config, DataType::kFLOAT, wts_name); } else if (sub_type == "gt") { serialized_engine = build_engine_yolov9_t(max_batchsize, builder, config, DataType::kFLOAT, wts_name, true); } else if (sub_type == "gs") { serialized_engine = build_engine_yolov9_s(max_batchsize, builder, config, DataType::kFLOAT, wts_name, true); } else if (sub_type == "gm") { serialized_engine = build_engine_yolov9_m(max_batchsize, builder, config, DataType::kFLOAT, wts_name, true); } else 
if (sub_type == "gc") { serialized_engine = build_engine_gelan_c(max_batchsize, builder, config, DataType::kFLOAT, wts_name); } else if (sub_type == "ge") { serialized_engine = build_engine_gelan_e(max_batchsize, builder, config, DataType::kFLOAT, wts_name); } else { return; } assert(serialized_engine != nullptr); std::ofstream p(engine_name, std::ios::binary); if (!p) { std::cerr << "could not open plan output file" << std::endl; assert(false); } p.write(reinterpret_cast(serialized_engine->data()), serialized_engine->size()); delete config; delete serialized_engine; delete builder; } void deserialize_engine(std::string& engine_name, IRuntime** runtime, ICudaEngine** engine, IExecutionContext** context) { std::ifstream file(engine_name, std::ios::binary); if (!file.good()) { std::cerr << "read " << engine_name << " error!" << std::endl; assert(false); } size_t size = 0; file.seekg(0, file.end); size = file.tellg(); file.seekg(0, file.beg); char* serialized_engine = new char[size]; assert(serialized_engine); file.read(serialized_engine, size); file.close(); *runtime = createInferRuntime(gLogger); assert(*runtime); *engine = (*runtime)->deserializeCudaEngine(serialized_engine, size); assert(*engine); *context = (*engine)->createExecutionContext(); assert(*context); delete[] serialized_engine; } void prepare_buffer(ICudaEngine* engine, float** input_buffer_device, float** output_buffer_device, float** output_buffer_host) { assert(engine->getNbBindings() == 2); // In order to bind the buffers, we need to know the names of the input and output tensors. 
// Note that indices are guaranteed to be less than IEngine::getNbBindings() const int inputIndex = engine->getBindingIndex(kInputTensorName); const int outputIndex = engine->getBindingIndex(kOutputTensorName); assert(inputIndex == 0); assert(outputIndex == 1); // Create GPU buffers on device CUDA_CHECK(cudaMalloc((void**)input_buffer_device, kBatchSize * 3 * kInputH * kInputW * sizeof(float))); CUDA_CHECK(cudaMalloc((void**)output_buffer_device, kBatchSize * kOutputSize * sizeof(float))); *output_buffer_host = new float[kBatchSize * kOutputSize]; } void infer(IExecutionContext& context, cudaStream_t& stream, void** buffers, float* output, int batchSize) { // infer on the batch asynchronously, and DMA output back to host context.enqueue(batchSize, buffers, stream, nullptr); CUDA_CHECK(cudaMemcpyAsync(output, buffers[1], batchSize * kOutputSize * sizeof(float), cudaMemcpyDeviceToHost, stream)); CUDA_CHECK(cudaStreamSynchronize(stream)); } bool parse_args(int argc, char** argv, std::string& wts, std::string& engine, std::string& img_dir, std::string& sub_type) { if (argc < 4) return false; if (std::string(argv[1]) == "-s" && argc == 5) { wts = std::string(argv[2]); engine = std::string(argv[3]); sub_type = std::string(argv[4]); } else if (std::string(argv[1]) == "-d" && argc == 4) { engine = std::string(argv[2]); img_dir = std::string(argv[3]); } else { return false; } return true; } int main(int argc, char** argv) { cudaSetDevice(kGpuId); std::string wts_name = ""; std::string engine_name = "../yolov9-m-converted.engine"; std::string img_dir = "../images"; std::string sub_type = "m"; // speed test or inference const int speed_test_iter = 1000; // const int speed_test_iter = 1; // if (!parse_args(argc, argv, wts_name, engine_name, img_dir, sub_type)) { // std::cerr << "Arguments not right!" 
<< std::endl; // std::cerr << "./yolov9 -s [.wts] [.engine] [s/m/c/e/gt/gs/gm/gc/ge] // serialize model to plan file" << std::endl; // std::cerr << "./yolov9 -d [.engine] ../samples // deserialize plan file and run inference" << std::endl; // return -1; // } // Create a model using the API directly and serialize it to a file if (!wts_name.empty()) { serialize_engine(kBatchSize, wts_name, sub_type, engine_name); return 0; } // Deserialize the engine from file IRuntime* runtime = nullptr; ICudaEngine* engine = nullptr; IExecutionContext* context = nullptr; deserialize_engine(engine_name, &runtime, &engine, &context); cudaStream_t stream; CUDA_CHECK(cudaStreamCreate(&stream)); cuda_preprocess_init(kMaxInputImageSize); // Prepare cpu and gpu buffers float* device_buffers[2]; float* output_buffer_host = nullptr; prepare_buffer(engine, &device_buffers[0], &device_buffers[1], &output_buffer_host); // Read images from directory std::vector file_names; if (read_files_in_dir(img_dir.c_str(), file_names) < 0) { std::cerr << "read_files_in_dir failed." 
<< std::endl; return -1; } // batch predict for (size_t i = 0; i < file_names.size(); i += kBatchSize) { // Get a batch of images std::vector img_batch; std::vector img_name_batch; for (size_t j = i; j < i + kBatchSize && j < file_names.size(); j++) { cv::Mat img = cv::imread(img_dir + "/" + file_names[j]); img_batch.push_back(img); img_name_batch.push_back(file_names[j]); } // Preprocess cuda_batch_preprocess(img_batch, device_buffers[0], kInputW, kInputH, stream); // Run inference auto start = std::chrono::system_clock::now(); for (int j = 0; j < speed_test_iter; j++) { infer(*context, stream, (void**)device_buffers, output_buffer_host, kBatchSize); } // infer(*context, stream, (void**)device_buffers, output_buffer_host, kBatchSize); auto end = std::chrono::system_clock::now(); std::cout << "inference time: " << std::chrono::duration_cast(end - start).count() / 1000.0 / speed_test_iter << "ms" << std::endl; // NMS std::vector> res_batch; batch_nms(res_batch, output_buffer_host, img_batch.size(), kOutputSize, kConfThresh, kNmsThresh); // Draw bounding boxes draw_bbox(img_batch, res_batch); // Save images for (size_t j = 0; j < img_batch.size(); j++) { cv::imwrite("_" + img_name_batch[j], img_batch[j]); } } // Release stream and buffers cudaStreamDestroy(stream); CUDA_CHECK(cudaFree(device_buffers[0])); CUDA_CHECK(cudaFree(device_buffers[1])); delete[] output_buffer_host; cuda_preprocess_destroy(); // Destroy the engine delete context; delete engine; delete runtime; // Print histogram of the output distribution //std::cout << "\nOutput:\n\n"; //for (unsigned int i = 0; i < kOutputSize; i++) //{ // std::cout << prob[i] << ", "; // if (i % 10 == 0) std::cout << std::endl; //} //std::cout << std::endl; return 0; } ================================================ FILE: yolov9/gen_wts.py ================================================ import sys # noqa: F401 import argparse import os import struct import torch from utils.torch_utils import select_device def 
parse_args(): parser = argparse.ArgumentParser(description='Convert .pt file to .wts') parser.add_argument('-w', '--weights', default='yolov9-e.pt', help='Input weights (.pt) file path (required)') parser.add_argument( '-o', '--output', help='Output (.wts) file path (optional)') parser.add_argument( '-t', '--type', type=str, default='detect', choices=['detect', 'cls', 'seg'], help='determines the model is detection/classification') args = parser.parse_args() if not os.path.isfile(args.weights): raise SystemExit('Invalid input file') if not args.output: args.output = os.path.splitext(args.weights)[0] + '.wts' elif os.path.isdir(args.output): args.output = os.path.join( args.output, os.path.splitext(os.path.basename(args.weights))[0] + '.wts') return args.weights, args.output, args.type pt_file, wts_file, m_type = parse_args() print(f'Generating .wts for {m_type} model') # Load model print(f'Loading {pt_file}') device = select_device('cpu') model = torch.load(pt_file, map_location=device, weights_only=False) # Load FP32 weights model = model['ema' if model.get('ema') else 'model'].float() if m_type in ['detect', 'seg']: # update anchor_grid info anchor_grid = model.model[-1].anchors * model.model[-1].stride[..., None, None] # model.model[-1].anchor_grid = anchor_grid # delattr(model.model[-1], 'anchor_grid') # model.model[-1] is detect layer # The parameters are saved in the OrderDict through the "register_buffer" method, and then saved to the weight. 
model.model[-1].register_buffer("anchor_grid", anchor_grid) # model.model[-1].register_buffer("strides", model.model[-1].stride) model.to(device).eval() # print(model.model) # 将model.model保存到txt中 with open('model.txt', 'w') as f: f.write(str(model.model)) f.close() print(f'Writing into {wts_file}') with open(wts_file, 'w') as f: f.write('{}\n'.format(len(model.state_dict().keys()))) for k, v in model.state_dict().items(): vr = v.reshape(-1).cpu().numpy() f.write('{} {} '.format(k, len(vr))) for vv in vr: f.write(' ') f.write(struct.pack('>f', float(vv)).hex()) f.write('\n') wts_file_key = wts_file.replace('.wts', '_key.txt') print(f'Writing into {wts_file_key}') with open(wts_file_key, 'w') as f: f.write('{}\n'.format(len(model.state_dict().keys()))) for k, v in model.state_dict().items(): vr = v.reshape(-1).cpu().numpy() f.write('{} {} '.format(k, len(vr))) f.write('\n') ================================================ FILE: yolov9/include/block.h ================================================ #include "config.h" #include "yololayer.h" #include #include #include #include #include #include using namespace nvinfer1; // TensorRT weight files have a simple space delimited format: // [type] [size] void PrintDim(const ILayer* layer, std::string log = ""); std::map loadWeights(const std::string file); int get_width(int x, float gw, int divisor = 8); int get_depth(int x, float gd); ILayer* Proto(INetworkDefinition* network, std::map& weightMap, ITensor& input, int c_, int c2, std::string lname); std::vector> getAnchors(std::map& weightMap, std::string lname); // ---------------------------------------------------------------- nvinfer1::ILayer* convBnSiLU(nvinfer1::INetworkDefinition* network, std::map& weightMap, nvinfer1::ITensor& input, int ch, int k, int s, int p, std::string lname, int g = 1); ILayer* ELAN1(INetworkDefinition* network, std::map& weightMap, ITensor& input, int c1, int c2, int c3, int c4, std::string lname); ILayer* RepNCSPELAN4(INetworkDefinition* 
network, std::map& weightMap, ITensor& input, int c1, int c2, int c3, int c4, int c5, std::string lname); ILayer* ADown(INetworkDefinition* network, std::map& weightMap, ITensor& input, int c2, std::string lname); ILayer* AConv(INetworkDefinition* network, std::map& weightMap, ITensor& input, int c2, std::string lname); std::vector CBLinear(INetworkDefinition* network, std::map& weightMap, ITensor& input, std::vector c2s, int k, int s, int p, int g, std::string lname); ILayer* CBFuse(INetworkDefinition* network, std::vector> input, std::vector idx, std::vector strides); ILayer* SPPELAN(INetworkDefinition* network, std::map& weightMap, ITensor& input, int c1, int c2, int c3, std::string lname); std::vector DualDDetect(INetworkDefinition* network, std::map& weightMap, std::vector dets, int cls, std::vector ch, std::string lname); nvinfer1::IPluginV2Layer* addYoLoLayer(nvinfer1::INetworkDefinition* network, std::vector dets, bool is_segmentation); nvinfer1::IShuffleLayer* DFL(nvinfer1::INetworkDefinition* network, std::map weightMap, nvinfer1::ITensor& input, int ch, int k, int s, int p, std::string lname); nvinfer1::ILayer* convBnNoAct(nvinfer1::INetworkDefinition* network, std::map& weightMap, nvinfer1::ITensor& input, int ch, int k, int s, int p, std::string lname, int g); std::vector DDetect(INetworkDefinition* network, std::map& weightMap, std::vector dets, int cls, std::vector ch, std::string lname); ================================================ FILE: yolov9/include/calibrator.h ================================================ #pragma once #include "macros.h" #include #include //! \class Int8EntropyCalibrator2 //! //! \brief Implements Entropy calibrator 2. //! CalibrationAlgoType is kENTROPY_CALIBRATION_2. //! 
class Int8EntropyCalibrator2 : public nvinfer1::IInt8EntropyCalibrator2 { public: Int8EntropyCalibrator2(int batchsize, int input_w, int input_h, const char* img_dir, const char* calib_table_name, const char* input_blob_name, bool read_cache = true); virtual ~Int8EntropyCalibrator2(); int getBatchSize() const TRT_NOEXCEPT override; bool getBatch(void* bindings[], const char* names[], int nbBindings) TRT_NOEXCEPT override; const void* readCalibrationCache(size_t& length) TRT_NOEXCEPT override; void writeCalibrationCache(const void* cache, size_t length) TRT_NOEXCEPT override; private: int batchsize_; int input_w_; int input_h_; int img_idx_; std::string img_dir_; std::vector img_files_; size_t input_count_; std::string calib_table_name_; const char* input_blob_name_; bool read_cache_; void* device_input_; std::vector calib_cache_; }; ================================================ FILE: yolov9/include/config.h ================================================ #pragma once /* -------------------------------------------------------- * These configs are related to tensorrt model, if these are changed, * please re-compile and re-serialize the tensorrt model. * --------------------------------------------------------*/ // For INT8, you need prepare the calibration dataset, please refer to // https://github.com/wang-xinyu/tensorrtx/tree/master/yolov5#int8-quantization #define USE_FP16 // set USE_INT8 or USE_FP16 or USE_FP32 #ifdef USE_INT8 const static char* gCalibTablePath = "./calib"; #endif // These are used to define input/output tensor names, // you can set them to whatever you want. 
const static char* kInputTensorName = "images"; const static char* kOutputTensorName = "output"; // Detection model and Segmentation model' number of classes constexpr static int kNumClass = 80; // Classfication model's number of classes constexpr static int kClsNumClass = 1000; constexpr static int kBatchSize = 1; // Yolo's input width and height must by divisible by 32 constexpr static int kInputH = 640; constexpr static int kInputW = 640; // Classfication model's input shape constexpr static int kClsInputH = 224; constexpr static int kClsInputW = 224; // Maximum number of output bounding boxes from yololayer plugin. // That is maximum number of output bounding boxes before NMS. constexpr static int kMaxNumOutputBbox = 2000; constexpr static int kNumAnchor = 3; // The bboxes whose confidence is lower than kIgnoreThresh will be ignored in yololayer plugin. constexpr static float kIgnoreThresh = 0.05f; /* -------------------------------------------------------- * These configs are NOT related to tensorrt model, if these are changed, * please re-compile, but no need to re-serialize the tensorrt model. 
* --------------------------------------------------------*/ // NMS overlapping thresh and final detection confidence thresh const static float kNmsThresh = 0.45f; const static float kConfThresh = 0.1f; const static int kGpuId = 0; // If your image size is larger than 4096 * 3112, please increase this value const static int kMaxInputImageSize = 4096 * 3112; ================================================ FILE: yolov9/include/cuda_utils.h ================================================ #ifndef TRTX_CUDA_UTILS_H_ #define TRTX_CUDA_UTILS_H_ #include #ifndef CUDA_CHECK #define CUDA_CHECK(callstr)\ {\ cudaError_t error_code = callstr;\ if (error_code != cudaSuccess) {\ std::cerr << "CUDA error " << error_code << " at " << __FILE__ << ":" << __LINE__;\ assert(0);\ }\ } #endif // CUDA_CHECK #endif // TRTX_CUDA_UTILS_H_ ================================================ FILE: yolov9/include/logging.h ================================================ /* * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
*/ #ifndef TENSORRT_LOGGING_H #define TENSORRT_LOGGING_H #include "NvInferRuntimeCommon.h" #include #include #include #include #include #include #include #include "macros.h" using Severity = nvinfer1::ILogger::Severity; class LogStreamConsumerBuffer : public std::stringbuf { public: LogStreamConsumerBuffer(std::ostream& stream, const std::string& prefix, bool shouldLog) : mOutput(stream) , mPrefix(prefix) , mShouldLog(shouldLog) { } LogStreamConsumerBuffer(LogStreamConsumerBuffer&& other) : mOutput(other.mOutput) { } ~LogStreamConsumerBuffer() { // std::streambuf::pbase() gives a pointer to the beginning of the buffered part of the output sequence // std::streambuf::pptr() gives a pointer to the current position of the output sequence // if the pointer to the beginning is not equal to the pointer to the current position, // call putOutput() to log the output to the stream if (pbase() != pptr()) { putOutput(); } } // synchronizes the stream buffer and returns 0 on success // synchronizing the stream buffer consists of inserting the buffer contents into the stream, // resetting the buffer and flushing the stream virtual int sync() { putOutput(); return 0; } void putOutput() { if (mShouldLog) { // prepend timestamp std::time_t timestamp = std::time(nullptr); tm* tm_local = std::localtime(×tamp); std::cout << "["; std::cout << std::setw(2) << std::setfill('0') << 1 + tm_local->tm_mon << "/"; std::cout << std::setw(2) << std::setfill('0') << tm_local->tm_mday << "/"; std::cout << std::setw(4) << std::setfill('0') << 1900 + tm_local->tm_year << "-"; std::cout << std::setw(2) << std::setfill('0') << tm_local->tm_hour << ":"; std::cout << std::setw(2) << std::setfill('0') << tm_local->tm_min << ":"; std::cout << std::setw(2) << std::setfill('0') << tm_local->tm_sec << "] "; // std::stringbuf::str() gets the string contents of the buffer // insert the buffer contents pre-appended by the appropriate prefix into the stream mOutput << mPrefix << str(); // set the buffer to 
empty str(""); // flush the stream mOutput.flush(); } } void setShouldLog(bool shouldLog) { mShouldLog = shouldLog; } private: std::ostream& mOutput; std::string mPrefix; bool mShouldLog; }; //! //! \class LogStreamConsumerBase //! \brief Convenience object used to initialize LogStreamConsumerBuffer before std::ostream in LogStreamConsumer //! class LogStreamConsumerBase { public: LogStreamConsumerBase(std::ostream& stream, const std::string& prefix, bool shouldLog) : mBuffer(stream, prefix, shouldLog) { } protected: LogStreamConsumerBuffer mBuffer; }; //! //! \class LogStreamConsumer //! \brief Convenience object used to facilitate use of C++ stream syntax when logging messages. //! Order of base classes is LogStreamConsumerBase and then std::ostream. //! This is because the LogStreamConsumerBase class is used to initialize the LogStreamConsumerBuffer member field //! in LogStreamConsumer and then the address of the buffer is passed to std::ostream. //! This is necessary to prevent the address of an uninitialized buffer from being passed to std::ostream. //! Please do not change the order of the parent classes. //! class LogStreamConsumer : protected LogStreamConsumerBase, public std::ostream { public: //! \brief Creates a LogStreamConsumer which logs messages with level severity. //! Reportable severity determines if the messages are severe enough to be logged. 
LogStreamConsumer(Severity reportableSeverity, Severity severity) : LogStreamConsumerBase(severityOstream(severity), severityPrefix(severity), severity <= reportableSeverity) , std::ostream(&mBuffer) // links the stream buffer with the stream , mShouldLog(severity <= reportableSeverity) , mSeverity(severity) { } LogStreamConsumer(LogStreamConsumer&& other) : LogStreamConsumerBase(severityOstream(other.mSeverity), severityPrefix(other.mSeverity), other.mShouldLog) , std::ostream(&mBuffer) // links the stream buffer with the stream , mShouldLog(other.mShouldLog) , mSeverity(other.mSeverity) { } void setReportableSeverity(Severity reportableSeverity) { mShouldLog = mSeverity <= reportableSeverity; mBuffer.setShouldLog(mShouldLog); } private: static std::ostream& severityOstream(Severity severity) { return severity >= Severity::kINFO ? std::cout : std::cerr; } static std::string severityPrefix(Severity severity) { switch (severity) { case Severity::kINTERNAL_ERROR: return "[F] "; case Severity::kERROR: return "[E] "; case Severity::kWARNING: return "[W] "; case Severity::kINFO: return "[I] "; case Severity::kVERBOSE: return "[V] "; default: assert(0); return ""; } } bool mShouldLog; Severity mSeverity; }; //! \class Logger //! //! \brief Class which manages logging of TensorRT tools and samples //! //! \details This class provides a common interface for TensorRT tools and samples to log information to the console, //! and supports logging two types of messages: //! //! - Debugging messages with an associated severity (info, warning, error, or internal error/fatal) //! - Test pass/fail messages //! //! The advantage of having all samples use this class for logging as opposed to emitting directly to stdout/stderr is //! that the logic for controlling the verbosity and formatting of sample output is centralized in one location. //! //! In the future, this class could be extended to support dumping test results to a file in some standard format //! 
(for example, JUnit XML), and providing additional metadata (e.g. timing the duration of a test run). //! //! TODO: For backwards compatibility with existing samples, this class inherits directly from the nvinfer1::ILogger //! interface, which is problematic since there isn't a clean separation between messages coming from the TensorRT //! library and messages coming from the sample. //! //! In the future (once all samples are updated to use Logger::getTRTLogger() to access the ILogger) we can refactor the //! class to eliminate the inheritance and instead make the nvinfer1::ILogger implementation a member of the Logger //! object. class Logger : public nvinfer1::ILogger { public: Logger(Severity severity = Severity::kWARNING) : mReportableSeverity(severity) { } //! //! \enum TestResult //! \brief Represents the state of a given test //! enum class TestResult { kRUNNING, //!< The test is running kPASSED, //!< The test passed kFAILED, //!< The test failed kWAIVED //!< The test was waived }; //! //! \brief Forward-compatible method for retrieving the nvinfer::ILogger associated with this Logger //! \return The nvinfer1::ILogger associated with this Logger //! //! TODO Once all samples are updated to use this method to register the logger with TensorRT, //! we can eliminate the inheritance of Logger from ILogger //! nvinfer1::ILogger& getTRTLogger() { return *this; } //! //! \brief Implementation of the nvinfer1::ILogger::log() virtual method //! //! Note samples should not be calling this function directly; it will eventually go away once we eliminate the //! inheritance from nvinfer1::ILogger //! void log(Severity severity, const char* msg) TRT_NOEXCEPT override { LogStreamConsumer(mReportableSeverity, severity) << "[TRT] " << std::string(msg) << std::endl; } //! //! \brief Method for controlling the verbosity of logging output //! //! \param severity The logger will only emit messages that have severity of this level or higher. //! 
void setReportableSeverity(Severity severity) { mReportableSeverity = severity; } //! //! \brief Opaque handle that holds logging information for a particular test //! //! This object is an opaque handle to information used by the Logger to print test results. //! The sample must call Logger::defineTest() in order to obtain a TestAtom that can be used //! with Logger::reportTest{Start,End}(). //! class TestAtom { public: TestAtom(TestAtom&&) = default; private: friend class Logger; TestAtom(bool started, const std::string& name, const std::string& cmdline) : mStarted(started) , mName(name) , mCmdline(cmdline) { } bool mStarted; std::string mName; std::string mCmdline; }; //! //! \brief Define a test for logging //! //! \param[in] name The name of the test. This should be a string starting with //! "TensorRT" and containing dot-separated strings containing //! the characters [A-Za-z0-9_]. //! For example, "TensorRT.sample_googlenet" //! \param[in] cmdline The command line used to reproduce the test // //! \return a TestAtom that can be used in Logger::reportTest{Start,End}(). //! static TestAtom defineTest(const std::string& name, const std::string& cmdline) { return TestAtom(false, name, cmdline); } //! //! \brief A convenience overloaded version of defineTest() that accepts an array of command-line arguments //! as input //! //! \param[in] name The name of the test //! \param[in] argc The number of command-line arguments //! \param[in] argv The array of command-line arguments (given as C strings) //! //! \return a TestAtom that can be used in Logger::reportTest{Start,End}(). static TestAtom defineTest(const std::string& name, int argc, char const* const* argv) { auto cmdline = genCmdlineString(argc, argv); return defineTest(name, cmdline); } //! //! \brief Report that a test has started. //! //! \pre reportTestStart() has not been called yet for the given testAtom //! //! \param[in] testAtom The handle to the test that has started //! 
static void reportTestStart(TestAtom& testAtom) { reportTestResult(testAtom, TestResult::kRUNNING); assert(!testAtom.mStarted); testAtom.mStarted = true; } //! //! \brief Report that a test has ended. //! //! \pre reportTestStart() has been called for the given testAtom //! //! \param[in] testAtom The handle to the test that has ended //! \param[in] result The result of the test. Should be one of TestResult::kPASSED, //! TestResult::kFAILED, TestResult::kWAIVED //! static void reportTestEnd(const TestAtom& testAtom, TestResult result) { assert(result != TestResult::kRUNNING); assert(testAtom.mStarted); reportTestResult(testAtom, result); } static int reportPass(const TestAtom& testAtom) { reportTestEnd(testAtom, TestResult::kPASSED); return EXIT_SUCCESS; } static int reportFail(const TestAtom& testAtom) { reportTestEnd(testAtom, TestResult::kFAILED); return EXIT_FAILURE; } static int reportWaive(const TestAtom& testAtom) { reportTestEnd(testAtom, TestResult::kWAIVED); return EXIT_SUCCESS; } static int reportTest(const TestAtom& testAtom, bool pass) { return pass ? reportPass(testAtom) : reportFail(testAtom); } Severity getReportableSeverity() const { return mReportableSeverity; } private: //! //! \brief returns an appropriate string for prefixing a log message with the given severity //! static const char* severityPrefix(Severity severity) { switch (severity) { case Severity::kINTERNAL_ERROR: return "[F] "; case Severity::kERROR: return "[E] "; case Severity::kWARNING: return "[W] "; case Severity::kINFO: return "[I] "; case Severity::kVERBOSE: return "[V] "; default: assert(0); return ""; } } //! //! \brief returns an appropriate string for prefixing a test result message with the given result //! 
static const char* testResultString(TestResult result) { switch (result) { case TestResult::kRUNNING: return "RUNNING"; case TestResult::kPASSED: return "PASSED"; case TestResult::kFAILED: return "FAILED"; case TestResult::kWAIVED: return "WAIVED"; default: assert(0); return ""; } } //! //! \brief returns an appropriate output stream (cout or cerr) to use with the given severity //! static std::ostream& severityOstream(Severity severity) { return severity >= Severity::kINFO ? std::cout : std::cerr; } //! //! \brief method that implements logging test results //! static void reportTestResult(const TestAtom& testAtom, TestResult result) { severityOstream(Severity::kINFO) << "&&&& " << testResultString(result) << " " << testAtom.mName << " # " << testAtom.mCmdline << std::endl; } //! //! \brief generate a command line string from the given (argc, argv) values //! static std::string genCmdlineString(int argc, char const* const* argv) { std::stringstream ss; for (int i = 0; i < argc; i++) { if (i > 0) ss << " "; ss << argv[i]; } return ss.str(); } Severity mReportableSeverity; }; namespace { //! //! \brief produces a LogStreamConsumer object that can be used to log messages of severity kVERBOSE //! //! Example usage: //! //! LOG_VERBOSE(logger) << "hello world" << std::endl; //! inline LogStreamConsumer LOG_VERBOSE(const Logger& logger) { return LogStreamConsumer(logger.getReportableSeverity(), Severity::kVERBOSE); } //! //! \brief produces a LogStreamConsumer object that can be used to log messages of severity kINFO //! //! Example usage: //! //! LOG_INFO(logger) << "hello world" << std::endl; //! inline LogStreamConsumer LOG_INFO(const Logger& logger) { return LogStreamConsumer(logger.getReportableSeverity(), Severity::kINFO); } //! //! \brief produces a LogStreamConsumer object that can be used to log messages of severity kWARNING //! //! Example usage: //! //! LOG_WARN(logger) << "hello world" << std::endl; //! 
inline LogStreamConsumer LOG_WARN(const Logger& logger) { return LogStreamConsumer(logger.getReportableSeverity(), Severity::kWARNING); } //! //! \brief produces a LogStreamConsumer object that can be used to log messages of severity kERROR //! //! Example usage: //! //! LOG_ERROR(logger) << "hello world" << std::endl; //! inline LogStreamConsumer LOG_ERROR(const Logger& logger) { return LogStreamConsumer(logger.getReportableSeverity(), Severity::kERROR); } //! //! \brief produces a LogStreamConsumer object that can be used to log messages of severity kINTERNAL_ERROR // ("fatal" severity) //! //! Example usage: //! //! LOG_FATAL(logger) << "hello world" << std::endl; //! inline LogStreamConsumer LOG_FATAL(const Logger& logger) { return LogStreamConsumer(logger.getReportableSeverity(), Severity::kINTERNAL_ERROR); } } // anonymous namespace #endif // TENSORRT_LOGGING_H ================================================ FILE: yolov9/include/macros.h ================================================ #ifndef __MACROS_H #define __MACROS_H #include #ifdef API_EXPORTS #if defined(_MSC_VER) #define API __declspec(dllexport) #else #define API __attribute__((visibility("default"))) #endif #else #if defined(_MSC_VER) #define API __declspec(dllimport) #else #define API #endif #endif // API_EXPORTS #if NV_TENSORRT_MAJOR >= 8 #define TRT_NOEXCEPT noexcept #define TRT_CONST_ENQUEUE const #else #define TRT_NOEXCEPT #define TRT_CONST_ENQUEUE #endif #endif // __MACROS_H ================================================ FILE: yolov9/include/model.h ================================================ #pragma once #include #include // yolov9 nvinfer1::IHostMemory* build_engine_yolov9_t(unsigned int maxBatchSize, nvinfer1::IBuilder* builder, nvinfer1::IBuilderConfig* config, nvinfer1::DataType dt, std::string& wts_name, bool isConvert = false); nvinfer1::IHostMemory* build_engine_yolov9_s(unsigned int maxBatchSize, nvinfer1::IBuilder* builder, nvinfer1::IBuilderConfig* config, 
nvinfer1::DataType dt, std::string& wts_name, bool isConvert = false); nvinfer1::IHostMemory* build_engine_yolov9_m(unsigned int maxBatchSize, nvinfer1::IBuilder* builder, nvinfer1::IBuilderConfig* config, nvinfer1::DataType dt, std::string& wts_name, bool isConvert = false); nvinfer1::IHostMemory* build_engine_yolov9_c(unsigned int maxBatchSize, nvinfer1::IBuilder* builder, nvinfer1::IBuilderConfig* config, nvinfer1::DataType dt, std::string& wts_name); nvinfer1::IHostMemory* build_engine_yolov9_e(unsigned int maxBatchSize, nvinfer1::IBuilder* builder, nvinfer1::IBuilderConfig* config, nvinfer1::DataType dt, std::string& wts_name); // gelan nvinfer1::IHostMemory* build_engine_gelan_t(unsigned int maxBatchSize, nvinfer1::IBuilder* builder, nvinfer1::IBuilderConfig* config, nvinfer1::DataType dt, std::string& wts_name); nvinfer1::IHostMemory* build_engine_gelan_m(unsigned int maxBatchSize, nvinfer1::IBuilder* builder, nvinfer1::IBuilderConfig* config, nvinfer1::DataType dt, std::string& wts_name); nvinfer1::IHostMemory* build_engine_gelan_c(unsigned int maxBatchSize, nvinfer1::IBuilder* builder, nvinfer1::IBuilderConfig* config, nvinfer1::DataType dt, std::string& wts_name); nvinfer1::IHostMemory* build_engine_gelan_e(unsigned int maxBatchSize, nvinfer1::IBuilder* builder, nvinfer1::IBuilderConfig* config, nvinfer1::DataType dt, std::string& wts_name); ================================================ FILE: yolov9/include/postprocess.h ================================================ #pragma once #include "types.h" #include #include cv::Rect get_rect(cv::Mat& img, float bbox[4]); void nms(std::vector& res, float *output, float conf_thresh, float nms_thresh = 0.5); void batch_nms(std::vector>& batch_res, float *output, int batch_size, int output_size, float conf_thresh, float nms_thresh = 0.5); void draw_bbox(std::vector& img_batch, std::vector>& res_batch); std::vector process_mask(const float* proto, int proto_size, std::vector& dets); void draw_mask_bbox(cv::Mat& 
img, std::vector& dets, std::vector& masks, std::unordered_map& labels_map); // cuda NMS void cuda_decode(float* predict, int num_bboxes, float confidence_threshold,float* parray,int max_objects, cudaStream_t stream); void cuda_nms(float* parray, float nms_threshold, int max_objects, cudaStream_t stream); void batch_process(std::vector> &res_batch, const float* decode_ptr_host, int batch_size, int bbox_element, const std::vector& img_batch); ================================================ FILE: yolov9/include/preprocess.h ================================================ #pragma once #include #include #include void cuda_preprocess_init(int max_image_size); void cuda_preprocess_destroy(); void cuda_preprocess(uint8_t* src, int src_width, int src_height, float* dst, int dst_width, int dst_height, cudaStream_t stream); void cuda_batch_preprocess(std::vector& img_batch, float* dst, int dst_width, int dst_height, cudaStream_t stream); ================================================ FILE: yolov9/include/types.h ================================================ #pragma once #include "config.h" struct YoloKernel { int width; int height; float anchors[kNumAnchor * 2]; }; struct alignas(float) Detection { float bbox[4]; // center_x center_y w h float conf; // bbox_conf * cls_conf float class_id; float mask[32]; }; const int bbox_element = 7; // center_x, center_y, w, h, conf, cls, obj ================================================ FILE: yolov9/include/utils.h ================================================ #pragma once #include #include #include #include #include #include #include static inline int read_files_in_dir(const char* p_dir_name, std::vector& file_names) { DIR* p_dir = opendir(p_dir_name); if (p_dir == nullptr) { return -1; } struct dirent* p_file = nullptr; while ((p_file = readdir(p_dir)) != nullptr) { if (strcmp(p_file->d_name, ".") != 0 && strcmp(p_file->d_name, "..") != 0) { //std::string cur_file_name(p_dir_name); //cur_file_name += "/"; //cur_file_name += 
p_file->d_name; std::string cur_file_name(p_file->d_name); file_names.push_back(cur_file_name); } } closedir(p_dir); return 0; } // Function to trim leading and trailing whitespace from a string static inline std::string trim_leading_whitespace(const std::string& str) { size_t first = str.find_first_not_of(' '); if (std::string::npos == first) { return str; } size_t last = str.find_last_not_of(' '); return str.substr(first, (last - first + 1)); } // Src: https://stackoverflow.com/questions/16605967 static inline std::string to_string_with_precision(const float a_value, const int n = 2) { std::ostringstream out; out.precision(n); out << std::fixed << a_value; return out.str(); } static inline int read_labels(const std::string labels_filename, std::unordered_map& labels_map) { std::ifstream file(labels_filename); // Read each line of the file std::string line; int index = 0; while (std::getline(file, line)) { // Strip the line of any leading or trailing whitespace line = trim_leading_whitespace(line); // Add the stripped line to the labels_map, using the loop index as the key labels_map[index] = line; index++; } // Close the file file.close(); return 0; } ================================================ FILE: yolov9/plugin/yololayer.cu ================================================ #include "yololayer.h" #include "types.h" #include #include #include "cuda_utils.h" #include #include namespace Tn { template void write(char*& buffer, const T& val) { *reinterpret_cast(buffer) = val; buffer += sizeof(T); } template void read(const char*& buffer, T& val) { val = *reinterpret_cast(buffer); buffer += sizeof(T); } } // namespace Tn namespace nvinfer1 { YoloLayerPlugin::YoloLayerPlugin(int classCount, int netWidth, int netHeight, int maxOut, bool is_segmentation) { mClassCount = classCount; mYoloV8NetWidth = netWidth; mYoloV8netHeight = netHeight; mMaxOutObject = maxOut; is_segmentation_ = is_segmentation; } YoloLayerPlugin::~YoloLayerPlugin() {} 
// Rebuild a plugin from a buffer produced by serialize().
// Fields are read in exactly the order serialize() writes them; the final
// assert checks that the whole buffer (`length` bytes) was consumed.
YoloLayerPlugin::YoloLayerPlugin(const void* data, size_t length) {
    using namespace Tn;
    const char* d = reinterpret_cast(data), * a = d;
    read(d, mClassCount);
    read(d, mThreadCount);
    read(d, mYoloV8NetWidth);
    read(d, mYoloV8netHeight);
    read(d, mMaxOutObject);
    read(d, is_segmentation_);

    assert(d == a + length);
}

// Write the plugin state into `buffer`.
// The field order here must stay in sync with the deserializing constructor,
// and the byte count with getSerializationSize().
void YoloLayerPlugin::serialize(void* buffer) const TRT_NOEXCEPT {
    using namespace Tn;
    char* d = static_cast(buffer), * a = d;
    write(d, mClassCount);
    write(d, mThreadCount);
    write(d, mYoloV8NetWidth);
    write(d, mYoloV8netHeight);
    write(d, mMaxOutObject);
    write(d, is_segmentation_);

    assert(d == a + getSerializationSize());
}

// Number of bytes serialize() writes: the sum of the sizes of all serialized members.
size_t YoloLayerPlugin::getSerializationSize() const TRT_NOEXCEPT {
    return sizeof(mClassCount) + sizeof(mThreadCount) + sizeof(mYoloV8netHeight) + sizeof(mYoloV8NetWidth) + sizeof(mMaxOutObject) + sizeof(is_segmentation_);
}

// Nothing to set up; always reports success.
int YoloLayerPlugin::initialize() TRT_NOEXCEPT {
    return 0;
}

// Output shape is (N + 1, 1, 1) floats, where N is the number of floats
// occupied by mMaxOutObject Detection structs. The extra leading float is the
// detection counter that the kernel increments with atomicAdd.
nvinfer1::Dims YoloLayerPlugin::getOutputDimensions(int index, const nvinfer1::Dims* inputs, int nbInputDims) TRT_NOEXCEPT {
    int total_size = mMaxOutObject * sizeof(Detection) / sizeof(float);
    return nvinfer1::Dims3(total_size + 1, 1, 1);
}

// Stores the pointer only; the string is not copied.
void YoloLayerPlugin::setPluginNamespace(const char* pluginNamespace) TRT_NOEXCEPT {
    mPluginNamespace = pluginNamespace;
}

// Returns the pointer last passed to setPluginNamespace().
const char* YoloLayerPlugin::getPluginNamespace() const TRT_NOEXCEPT {
    return mPluginNamespace;
}

// The single output is always float32, regardless of the input types.
nvinfer1::DataType YoloLayerPlugin::getOutputDataType(int index, const nvinfer1::DataType* inputTypes, int nbInputs) const TRT_NOEXCEPT {
    return nvinfer1::DataType::kFLOAT;
}

// The output is never broadcast across the batch.
bool YoloLayerPlugin::isOutputBroadcastAcrossBatch(int outputIndex, const bool* inputIsBroadcasted, int nbInputs) const TRT_NOEXCEPT {
    return false;
}

// No input can be broadcast across the batch.
bool YoloLayerPlugin::canBroadcastInputAcrossBatch(int inputIndex) const TRT_NOEXCEPT {
    return false;
}

// Intentionally empty: no configuration is derived from the tensor descriptors.
void YoloLayerPlugin::configurePlugin(nvinfer1::PluginTensorDesc const* in, int nbInput, nvinfer1::PluginTensorDesc const* out, int nbOutput) TRT_NOEXCEPT {};

void
YoloLayerPlugin::attachToContext(cudnnContext* cudnnContext, cublasContext* cublasContext, IGpuAllocator* gpuAllocator) TRT_NOEXCEPT {}; void YoloLayerPlugin::detachFromContext() TRT_NOEXCEPT {} const char* YoloLayerPlugin::getPluginType() const TRT_NOEXCEPT { return "YoloLayer_TRT"; } const char* YoloLayerPlugin::getPluginVersion() const TRT_NOEXCEPT { return "1"; } void YoloLayerPlugin::destroy() TRT_NOEXCEPT { delete this; } nvinfer1::IPluginV2IOExt* YoloLayerPlugin::clone() const TRT_NOEXCEPT { YoloLayerPlugin* p = new YoloLayerPlugin(mClassCount, mYoloV8NetWidth, mYoloV8netHeight, mMaxOutObject, is_segmentation_); p->setPluginNamespace(mPluginNamespace); return p; } int YoloLayerPlugin::enqueue(int batchSize, const void* TRT_CONST_ENQUEUE* inputs, void* const* outputs, void* workspace, cudaStream_t stream) TRT_NOEXCEPT { forwardGpu((const float* const*)inputs, (float*)outputs[0], stream, mYoloV8netHeight, mYoloV8NetWidth, batchSize); return 0; } __device__ float Logist(float data) { return 1.0f / (1.0f + expf(-data)); }; __global__ void CalDetection(const float* input, float* output, int numElements, int maxoutobject, const int grid_h, int grid_w, const int stride, int classes, int outputElem, bool is_segmentation) { int idx = threadIdx.x + blockDim.x * blockIdx.x; if (idx >= numElements) return; int total_grid = grid_h * grid_w; int info_len = 4 + classes; if (is_segmentation) info_len += 32; int batchIdx = idx / total_grid; int elemIdx = idx % total_grid; const float* curInput = input + batchIdx * total_grid * info_len; int outputIdx = batchIdx * outputElem; int class_id = 0; float max_cls_prob = 0.0; for (int i = 4; i < 4 + classes; i++) { float p = Logist(curInput[elemIdx + i * total_grid]); if (p > max_cls_prob) { max_cls_prob = p; class_id = i - 4; } } if (max_cls_prob < 0.1) return; int count = (int)atomicAdd(output + outputIdx, 1); if (count >= maxoutobject) return; char* data = (char*)(output + outputIdx) + sizeof(float) + count * sizeof(Detection); 
Detection* det = (Detection*)(data); int row = elemIdx / grid_w; int col = elemIdx % grid_w; det->conf = max_cls_prob; det->class_id = class_id; det->bbox[0] = (col + 0.5f - curInput[elemIdx + 0 * total_grid]) * stride; det->bbox[1] = (row + 0.5f - curInput[elemIdx + 1 * total_grid]) * stride; det->bbox[2] = (col + 0.5f + curInput[elemIdx + 2 * total_grid]) * stride; det->bbox[3] = (row + 0.5f + curInput[elemIdx + 3 * total_grid]) * stride; for (int k = 0; is_segmentation && k < 32; k++) { det->mask[k] = curInput[elemIdx + (k + 4 + classes) * total_grid]; } } void YoloLayerPlugin::forwardGpu(const float* const* inputs, float* output, cudaStream_t stream, int mYoloV8netHeight,int mYoloV8NetWidth, int batchSize) { int outputElem = 1 + mMaxOutObject * sizeof(Detection) / sizeof(float); cudaMemsetAsync(output, 0, sizeof(float), stream); for (int idx = 0; idx < batchSize; ++idx) { CUDA_CHECK(cudaMemsetAsync(output + idx * outputElem, 0, sizeof(float), stream)); } int numElem = 0; int grids[3][2] = { {mYoloV8netHeight / 8, mYoloV8NetWidth / 8}, {mYoloV8netHeight / 16, mYoloV8NetWidth / 16}, {mYoloV8netHeight / 32, mYoloV8NetWidth / 32} }; int strides[] = { 8, 16, 32 }; for (unsigned int i = 0; i < 3; i++) { int grid_h = grids[i][0]; int grid_w = grids[i][1]; int stride = strides[i]; numElem = grid_h * grid_w * batchSize; if (numElem < mThreadCount) mThreadCount = numElem; CalDetection << <(numElem + mThreadCount - 1) / mThreadCount, mThreadCount, 0, stream >> > (inputs[i], output, numElem, mMaxOutObject, grid_h, grid_w, stride, mClassCount, outputElem, is_segmentation_); } } PluginFieldCollection YoloPluginCreator::mFC{}; std::vector YoloPluginCreator::mPluginAttributes; YoloPluginCreator::YoloPluginCreator() { mPluginAttributes.clear(); mFC.nbFields = mPluginAttributes.size(); mFC.fields = mPluginAttributes.data(); } const char* YoloPluginCreator::getPluginName() const TRT_NOEXCEPT { return "YoloLayer_TRT"; } const char* YoloPluginCreator::getPluginVersion() const 
TRT_NOEXCEPT { return "1"; } const PluginFieldCollection* YoloPluginCreator::getFieldNames() TRT_NOEXCEPT { return &mFC; } IPluginV2IOExt* YoloPluginCreator::createPlugin(const char* name, const PluginFieldCollection* fc) TRT_NOEXCEPT { assert(fc->nbFields == 1); assert(strcmp(fc->fields[0].name, "netinfo") == 0); int* p_netinfo = (int*)(fc->fields[0].data); int class_count = p_netinfo[0]; int input_w = p_netinfo[1]; int input_h = p_netinfo[2]; int max_output_object_count = p_netinfo[3]; bool is_segmentation = p_netinfo[4]; YoloLayerPlugin* obj = new YoloLayerPlugin(class_count, input_w, input_h, max_output_object_count, is_segmentation); obj->setPluginNamespace(mNamespace.c_str()); return obj; } IPluginV2IOExt* YoloPluginCreator::deserializePlugin(const char* name, const void* serialData, size_t serialLength) TRT_NOEXCEPT { // This object will be deleted when the network is destroyed, which will // call YoloLayerPlugin::destroy() YoloLayerPlugin* obj = new YoloLayerPlugin(serialData, serialLength); obj->setPluginNamespace(mNamespace.c_str()); return obj; } } // namespace nvinfer1 ================================================ FILE: yolov9/plugin/yololayer.h ================================================ #pragma once #include "macros.h" #include "NvInfer.h" #include #include #include "macros.h" namespace nvinfer1 { class API YoloLayerPlugin : public IPluginV2IOExt { public: YoloLayerPlugin(int classCount, int netWdith, int netHeight, int maxOut, bool is_segmentation); YoloLayerPlugin(const void* data, size_t length); ~YoloLayerPlugin(); int getNbOutputs() const TRT_NOEXCEPT override { return 1; } nvinfer1::Dims getOutputDimensions(int index, const nvinfer1::Dims* inputs, int nbInputDims) TRT_NOEXCEPT override; int initialize() TRT_NOEXCEPT override; virtual void terminate() TRT_NOEXCEPT override {} virtual size_t getWorkspaceSize(int maxBatchSize) const TRT_NOEXCEPT override { return 0; } virtual int enqueue(int batchSize, const void* const* inputs, void* 
TRT_CONST_ENQUEUE* outputs, void* workspace, cudaStream_t stream) TRT_NOEXCEPT override; virtual size_t getSerializationSize() const TRT_NOEXCEPT override; virtual void serialize(void* buffer) const TRT_NOEXCEPT override; bool supportsFormatCombination(int pos, const PluginTensorDesc* inOut, int nbInputs, int nbOutputs) const TRT_NOEXCEPT override { return inOut[pos].format == TensorFormat::kLINEAR && inOut[pos].type == DataType::kFLOAT; } const char* getPluginType() const TRT_NOEXCEPT override; const char* getPluginVersion() const TRT_NOEXCEPT override; void destroy() TRT_NOEXCEPT override; IPluginV2IOExt* clone() const TRT_NOEXCEPT override; void setPluginNamespace(const char* pluginNamespace) TRT_NOEXCEPT override; const char* getPluginNamespace() const TRT_NOEXCEPT override; nvinfer1::DataType getOutputDataType(int32_t index, nvinfer1::DataType const* inputTypes, int32_t nbInputs) const TRT_NOEXCEPT; bool isOutputBroadcastAcrossBatch(int outputIndex, const bool* inputIsBroadcasted, int nbInputs) const TRT_NOEXCEPT override; bool canBroadcastInputAcrossBatch(int inputIndex) const TRT_NOEXCEPT override; void attachToContext(cudnnContext* cudnnContext, cublasContext* cublasContext, IGpuAllocator* gpuAllocator) TRT_NOEXCEPT override; void configurePlugin(PluginTensorDesc const* in, int32_t nbInput, PluginTensorDesc const* out, int32_t nbOutput) TRT_NOEXCEPT override; void detachFromContext() TRT_NOEXCEPT override; private: void forwardGpu(const float* const* inputs, float* output, cudaStream_t stream, int mYoloV8netHeight, int mYoloV8NetWidth, int batchSize); int mThreadCount = 256; const char* mPluginNamespace; int mClassCount; int mYoloV8NetWidth; int mYoloV8netHeight; int mMaxOutObject; bool is_segmentation_; }; class API YoloPluginCreator : public IPluginCreator { public: YoloPluginCreator(); ~YoloPluginCreator() override = default; const char* getPluginName() const TRT_NOEXCEPT override; const char* getPluginVersion() const TRT_NOEXCEPT override; const 
nvinfer1::PluginFieldCollection* getFieldNames() TRT_NOEXCEPT override; nvinfer1::IPluginV2IOExt* createPlugin(const char* name, const nvinfer1::PluginFieldCollection* fc) TRT_NOEXCEPT override; nvinfer1::IPluginV2IOExt* deserializePlugin(const char* name, const void* serialData, size_t serialLength) TRT_NOEXCEPT override; void setPluginNamespace(const char* libNamespace) TRT_NOEXCEPT override { mNamespace = libNamespace; } const char* getPluginNamespace() const TRT_NOEXCEPT override { return mNamespace.c_str(); } private: std::string mNamespace; static PluginFieldCollection mFC; static std::vector mPluginAttributes; }; REGISTER_TENSORRT_PLUGIN(YoloPluginCreator); } // namespace nvinfer1 ================================================ FILE: yolov9/src/block.cpp ================================================ #include "block.h" #include "calibrator.h" #include "config.h" #include "yololayer.h" #include #include #include #include #include #include #include #include #include using namespace nvinfer1; // TensorRT weight files have a simple space delimited format: // [type] [size] void PrintDim(const ILayer* layer, std::string log) { Dims dim = layer->getOutput(0)->getDimensions(); std::cout << log << ": " << "\t\t\t\t"; for (int i = 0; i < dim.nbDims; i++) { std::cout << dim.d[i] << " "; } std::cout << std::endl; } std::map loadWeights(const std::string file) { std::cout << "Loading weights: " << file << std::endl; std::map weightMap; // Open weights file std::ifstream input(file); assert(input.is_open() && "Unable to load weight file. 
please check if the .wts file path is right!!!!!!"); // Read number of weight blobs int32_t count; input >> count; assert(count > 0 && "Invalid weight map file."); while (count--) { Weights wt{DataType::kFLOAT, nullptr, 0}; uint32_t size; // Read name and type of blob std::string name; input >> name >> std::dec >> size; wt.type = DataType::kFLOAT; // Load blob uint32_t* val = reinterpret_cast(malloc(sizeof(val) * size)); for (uint32_t x = 0, y = size; x < y; ++x) { input >> std::hex >> val[x]; } wt.values = val; wt.count = size; weightMap[name] = wt; } return weightMap; } int get_width(int x, float gw, int divisor) { return int(ceil((x * gw) / divisor)) * divisor; } int get_depth(int x, float gd) { if (x == 1) return 1; int r = round(x * gd); if (x * gd - int(x * gd) == 0.5 && (int(x * gd) % 2) == 0) { --r; } return std::max(r, 1); } static nvinfer1::IScaleLayer* addBatchNorm2d(nvinfer1::INetworkDefinition* network, std::map weightMap, nvinfer1::ITensor& input, std::string lname, float eps) { float* gamma = (float*)weightMap[lname + ".weight"].values; float* beta = (float*)weightMap[lname + ".bias"].values; float* mean = (float*)weightMap[lname + ".running_mean"].values; float* var = (float*)weightMap[lname + ".running_var"].values; int len = weightMap[lname + ".running_var"].count; float* scval = reinterpret_cast(malloc(sizeof(float) * len)); for (int i = 0; i < len; i++) { scval[i] = gamma[i] / sqrt(var[i] + eps); } nvinfer1::Weights scale{nvinfer1::DataType::kFLOAT, scval, len}; float* shval = reinterpret_cast(malloc(sizeof(float) * len)); for (int i = 0; i < len; i++) { shval[i] = beta[i] - mean[i] * gamma[i] / sqrt(var[i] + eps); } nvinfer1::Weights shift{nvinfer1::DataType::kFLOAT, shval, len}; float* pval = reinterpret_cast(malloc(sizeof(float) * len)); for (int i = 0; i < len; i++) { pval[i] = 1.0; } nvinfer1::Weights power{nvinfer1::DataType::kFLOAT, pval, len}; weightMap[lname + ".scale"] = scale; weightMap[lname + ".shift"] = shift; weightMap[lname + 
".power"] = power; nvinfer1::IScaleLayer* output = network->addScale(input, nvinfer1::ScaleMode::kCHANNEL, shift, scale, power); assert(output); return output; } nvinfer1::ILayer* convBnSiLU(nvinfer1::INetworkDefinition* network, std::map& weightMap, nvinfer1::ITensor& input, int ch, int k, int s, int p, std::string lname, int g) { nvinfer1::Weights bias_empty{nvinfer1::DataType::kFLOAT, nullptr, 0}; nvinfer1::IConvolutionLayer* conv = network->addConvolutionNd(input, ch, nvinfer1::DimsHW{k, k}, weightMap[lname + ".conv.weight"], bias_empty); assert(conv); conv->setStrideNd(nvinfer1::DimsHW{s, s}); conv->setPaddingNd(nvinfer1::DimsHW{p, p}); conv->setNbGroups(g); nvinfer1::IScaleLayer* bn = addBatchNorm2d(network, weightMap, *conv->getOutput(0), lname + ".bn", 1e-3); nvinfer1::IActivationLayer* sigmoid = network->addActivation(*bn->getOutput(0), nvinfer1::ActivationType::kSIGMOID); assert(sigmoid); auto ew = network->addElementWise(*bn->getOutput(0), *sigmoid->getOutput(0), nvinfer1::ElementWiseOperation::kPROD); assert(ew); return ew; } nvinfer1::ILayer* convBnNoAct(nvinfer1::INetworkDefinition* network, std::map& weightMap, nvinfer1::ITensor& input, int ch, int k, int s, int p, std::string lname, int g) { nvinfer1::Weights bias_empty{nvinfer1::DataType::kFLOAT, nullptr, 0}; nvinfer1::IConvolutionLayer* conv = network->addConvolutionNd(input, ch, nvinfer1::DimsHW{k, k}, weightMap[lname + ".conv.weight"], bias_empty); assert(conv); conv->setStrideNd(nvinfer1::DimsHW{s, s}); conv->setPaddingNd(nvinfer1::DimsHW{p, p}); conv->setNbGroups(g); nvinfer1::IScaleLayer* bn = addBatchNorm2d(network, weightMap, *conv->getOutput(0), lname + ".bn", 1e-3); return bn; } std::vector> getAnchors(std::map& weightMap, std::string lname) { std::vector> anchors; Weights wts = weightMap[lname + ".anchor_grid"]; int anchor_len = kNumAnchor * 2; for (int i = 0; i < wts.count / anchor_len; i++) { auto* p = (const float*)wts.values + i * anchor_len; std::vector anchor(p, p + anchor_len); 
anchors.push_back(anchor); } return anchors; } ILayer* RepConvN(INetworkDefinition* network, std::map& weightMap, ITensor& input, int c1, int c2, int k, int s, int p, int g, int d, bool act, bool bn, bool deploy, std::string lname) { assert(k == 3 && p == 1); ILayer* conv1 = convBnNoAct(network, weightMap, input, c2, k, s, p, lname + ".conv1", g); ILayer* conv2 = convBnNoAct(network, weightMap, input, c2, 1, s, p - k / 2, lname + ".conv2", g); ILayer* ew0 = network->addElementWise(*conv1->getOutput(0), *conv2->getOutput(0), ElementWiseOperation::kSUM); nvinfer1::IActivationLayer* sigmoid = network->addActivation(*ew0->getOutput(0), nvinfer1::ActivationType::kSIGMOID); assert(sigmoid); auto ew = network->addElementWise(*ew0->getOutput(0), *sigmoid->getOutput(0), nvinfer1::ElementWiseOperation::kPROD); assert(ew); return ew; } ILayer* RepNBottleneck(INetworkDefinition* network, std::map& weightMap, ITensor& input, int c1, int c2, bool shortcut, int k, int g, float e, std::string lname) { int c_ = int(c2 * e); assert(k == 3 && "RepVGG only support kernel size 3"); auto cv1 = RepConvN(network, weightMap, input, c1, c_, k, 1, 1, g, 1, true, false, false, lname + ".cv1"); auto cv2 = convBnSiLU(network, weightMap, *cv1->getOutput(0), c2, k, 1, 1, lname + ".cv2", g); if (shortcut && c1 == c2) { auto ew = network->addElementWise(input, *cv2->getOutput(0), ElementWiseOperation::kSUM); return ew; } return cv2; } ILayer* RepNCSP(INetworkDefinition* network, std::map& weightMap, ITensor& input, int c1, int c2, int n, bool shortcut, int g, float e, std::string lname) { int c_ = int(c2 * e); auto cv1 = convBnSiLU(network, weightMap, input, c_, 1, 1, 0, lname + ".cv1", 1); ILayer* m = cv1; for (int i = 0; i < n; i++) { m = RepNBottleneck(network, weightMap, *m->getOutput(0), c_, c_, shortcut, 3, g, 1.0, lname + ".m." 
+ std::to_string(i)); } // auto m_0 = RepNBottleneck(network, weightMap, *cv1->getOutput(0), c_, c_, shortcut, 3, g, 1.0, lname + ".m.0"); // auto m_1 = RepNBottleneck(network, weightMap, *m_0->getOutput(0), c_, c_, shortcut, 3, g, 1.0, lname + ".m.1"); auto cv2 = convBnSiLU(network, weightMap, input, c_, 1, 1, 0, lname + ".cv2", 1); ITensor* inputTensors[] = {m->getOutput(0), cv2->getOutput(0)}; auto cat = network->addConcatenation(inputTensors, 2); auto cv3 = convBnSiLU(network, weightMap, *cat->getOutput(0), c2, 1, 1, 0, lname + ".cv3", 1); return cv3; } ILayer* ELAN1(INetworkDefinition* network, std::map& weightMap, ITensor& input, int c1, int c2, int c3, int c4, std::string lname) { auto cv1 = convBnSiLU(network, weightMap, input, c3, 1, 1, 0, lname + ".cv1", 1); // chunk(2, 1) nvinfer1::Dims d = cv1->getOutput(0)->getDimensions(); nvinfer1::ISliceLayer* split1 = network->addSlice(*cv1->getOutput(0), nvinfer1::Dims3{0, 0, 0}, nvinfer1::Dims3{d.d[0] / 2, d.d[1], d.d[2]}, nvinfer1::Dims3{1, 1, 1}); nvinfer1::ISliceLayer* split2 = network->addSlice(*cv1->getOutput(0), nvinfer1::Dims3{d.d[0] / 2, 0, 0}, nvinfer1::Dims3{d.d[0] / 2, d.d[1], d.d[2]}, nvinfer1::Dims3{1, 1, 1}); auto cv2 = convBnSiLU(network, weightMap, *split2->getOutput(0), c4, 3, 1, 1, lname + ".cv2", 1); auto cv3 = convBnSiLU(network, weightMap, *cv2->getOutput(0), c4, 3, 1, 1, lname + ".cv3", 1); ITensor* inputTensors[] = {split1->getOutput(0), split2->getOutput(0), cv2->getOutput(0), cv3->getOutput(0)}; auto cat = network->addConcatenation(inputTensors, 4); auto cv4 = convBnSiLU(network, weightMap, *cat->getOutput(0), c2, 1, 1, 0, lname + ".cv4", 1); return cv4; } ILayer* RepNCSPELAN4(INetworkDefinition* network, std::map& weightMap, ITensor& input, int c1, int c2, int c3, int c4, int c5, std::string lname) { auto cv1 = convBnSiLU(network, weightMap, input, c3, 1, 1, 0, lname + ".cv1", 1); // chunk(2, 1) nvinfer1::Dims d = cv1->getOutput(0)->getDimensions(); nvinfer1::ISliceLayer* split1 = 
network->addSlice(*cv1->getOutput(0), nvinfer1::Dims3{0, 0, 0}, nvinfer1::Dims3{d.d[0] / 2, d.d[1], d.d[2]}, nvinfer1::Dims3{1, 1, 1}); nvinfer1::ISliceLayer* split2 = network->addSlice(*cv1->getOutput(0), nvinfer1::Dims3{d.d[0] / 2, 0, 0}, nvinfer1::Dims3{d.d[0] / 2, d.d[1], d.d[2]}, nvinfer1::Dims3{1, 1, 1}); auto cv2_0 = RepNCSP(network, weightMap, *split2->getOutput(0), c3 / 2, c4, c5, true, 1, 0.5, lname + ".cv2.0"); auto cv2_1 = convBnSiLU(network, weightMap, *cv2_0->getOutput(0), c4, 3, 1, 1, lname + ".cv2.1", 1); auto cv3_0 = RepNCSP(network, weightMap, *cv2_1->getOutput(0), c4, c4, c5, true, 1, 0.5, lname + ".cv3.0"); auto cv3_1 = convBnSiLU(network, weightMap, *cv3_0->getOutput(0), c4, 3, 1, 1, lname + ".cv3.1", 1); ITensor* inputTensors[] = {split1->getOutput(0), split2->getOutput(0), cv2_1->getOutput(0), cv3_1->getOutput(0)}; auto cat = network->addConcatenation(inputTensors, 4); auto cv4 = convBnSiLU(network, weightMap, *cat->getOutput(0), c2, 1, 1, 0, lname + ".cv4", 1); return cv4; } ILayer* AConv(INetworkDefinition* network, std::map& weightMap, ITensor& input, int c2, std::string lname) { auto pool = network->addPoolingNd(input, PoolingType::kAVERAGE, DimsHW{2, 2}); pool->setStrideNd(DimsHW{1, 1}); pool->setPaddingNd(DimsHW{0, 0}); auto cv1 = convBnSiLU(network, weightMap, *pool->getOutput(0), c2, 3, 2, 1, lname + ".cv1", 1); return cv1; } ILayer* ADown(INetworkDefinition* network, std::map& weightMap, ITensor& input, int c2, std::string lname) { int c_ = c2 / 2; auto pool = network->addPoolingNd(input, PoolingType::kAVERAGE, DimsHW{2, 2}); pool->setStrideNd(DimsHW{1, 1}); pool->setPaddingNd(DimsHW{0, 0}); nvinfer1::Dims d = pool->getOutput(0)->getDimensions(); nvinfer1::ISliceLayer* split1 = network->addSlice(*pool->getOutput(0), nvinfer1::Dims3{0, 0, 0}, nvinfer1::Dims3{d.d[0] / 2, d.d[1], d.d[2]}, nvinfer1::Dims3{1, 1, 1}); nvinfer1::ISliceLayer* split2 = network->addSlice(*pool->getOutput(0), nvinfer1::Dims3{d.d[0] / 2, 0, 0}, 
nvinfer1::Dims3{d.d[0] / 2, d.d[1], d.d[2]}, nvinfer1::Dims3{1, 1, 1}); // auto chunklayer = layer_split(1, pool->getOutput(0), network); auto cv1 = convBnSiLU(network, weightMap, *split1->getOutput(0), c_, 3, 2, 1, lname + ".cv1", 1); auto pool2 = network->addPoolingNd(*split2->getOutput(0), PoolingType::kMAX, DimsHW{3, 3}); pool2->setStrideNd(DimsHW{2, 2}); pool2->setPaddingNd(DimsHW{1, 1}); auto cv2 = convBnSiLU(network, weightMap, *pool2->getOutput(0), c_, 1, 1, 0, lname + ".cv2", 1); ITensor* inputTensors[] = {cv1->getOutput(0), cv2->getOutput(0)}; auto cat = network->addConcatenation(inputTensors, 2); return cat; } std::vector CBLinear(INetworkDefinition* network, std::map& weightMap, ITensor& input, std::vector c2s, int k, int s, int p, int g, std::string lname) { IConvolutionLayer* conv1 = network->addConvolutionNd(input, std::accumulate(c2s.begin(), c2s.end(), 0), DimsHW{k, k}, weightMap[lname + ".conv.weight"], weightMap[lname + ".conv.bias"]); assert(conv1); conv1->setName((lname + ".conv").c_str()); conv1->setStrideNd(DimsHW{s, s}); conv1->setPaddingNd(DimsHW{p, p}); int h = input.getDimensions().d[1]; int w = input.getDimensions().d[2]; std::vector slices(c2s.size()); int start = 0; for (int i = 0; i < c2s.size(); i++) { slices[i] = network->addSlice(*conv1->getOutput(0), Dims3{start, 0, 0}, Dims3{c2s[i], h, w}, Dims3{1, 1, 1}); start += c2s[i]; } return slices; } ILayer* CBFuse(INetworkDefinition* network, std::vector> input, std::vector idx, std::vector strides) { ILayer** res = new ILayer*[input.size()]; res[input.size() - 1] = input[input.size() - 1][0]; for (int i = input.size() - 2; i >= 0; i--) { auto upsample = network->addResize(*input[i][idx[i]]->getOutput(0)); upsample->setResizeMode(ResizeMode::kNEAREST); const float scales[] = {1, strides[i] / strides[strides.size() - 1], strides[i] / strides[strides.size() - 1]}; upsample->setScales(scales, 3); res[i] = upsample; } for (int i = 1; i < input.size(); i++) { auto ew = 
network->addElementWise(*res[0]->getOutput(0), *res[i]->getOutput(0), ElementWiseOperation::kSUM); res[0] = ew; } return res[0]; } ILayer* SP(INetworkDefinition* network, std::map& weightMap, ITensor& input, int k, int s) { int p = k / 2; auto pool = network->addPoolingNd(input, PoolingType::kMAX, DimsHW{k, k}); pool->setPaddingNd(DimsHW{p, p}); pool->setStrideNd(DimsHW{s, s}); return pool; } ILayer* SPPELAN(INetworkDefinition* network, std::map& weightMap, ITensor& input, int c1, int c2, int c3, std::string lname) { auto cv1 = convBnSiLU(network, weightMap, input, c3, 1, 1, 0, lname + ".cv1", 1); auto cv2 = SP(network, weightMap, *cv1->getOutput(0), 5, 1); auto cv3 = SP(network, weightMap, *cv2->getOutput(0), 5, 1); auto cv4 = SP(network, weightMap, *cv3->getOutput(0), 5, 1); ITensor* inputTensors[] = {cv1->getOutput(0), cv2->getOutput(0), cv3->getOutput(0), cv4->getOutput(0)}; auto cat = network->addConcatenation(inputTensors, 4); auto cv5 = convBnSiLU(network, weightMap, *cat->getOutput(0), c2, 1, 1, 0, lname + ".cv5", 1); return cv5; } ILayer* DetectBbox_Conv(INetworkDefinition* network, std::map& weightMap, ITensor& input, int c2, int reg_max, std::string lname) { auto cv_0 = convBnSiLU(network, weightMap, input, c2, 3, 1, 1, lname + ".0", 1); auto cv_1 = convBnSiLU(network, weightMap, *cv_0->getOutput(0), c2, 3, 1, 1, lname + ".1", 4); auto cv_2 = network->addConvolutionNd(*cv_1->getOutput(0), reg_max * 4, DimsHW{1, 1}, weightMap[lname + ".2.weight"], weightMap[lname + ".2.bias"]); cv_2->setName((lname + ".conv").c_str()); cv_2->setStrideNd(DimsHW{1, 1}); cv_2->setPaddingNd(DimsHW{0, 0}); cv_2->setNbGroups(4); return cv_2; } ILayer* DetectCls_Conv(INetworkDefinition* network, std::map& weightMap, ITensor& input, int c2, int cls, std::string lname) { auto cv_0 = convBnSiLU(network, weightMap, input, c2, 3, 1, 1, lname + ".0", 1); auto cv_1 = convBnSiLU(network, weightMap, *cv_0->getOutput(0), c2, 3, 1, 1, lname + ".1", 1); auto cv_2 = 
network->addConvolutionNd(*cv_1->getOutput(0), cls, DimsHW{1, 1}, weightMap[lname + ".2.weight"], weightMap[lname + ".2.bias"]); cv_2->setName((lname + ".conv").c_str()); cv_2->setStrideNd(DimsHW{1, 1}); cv_2->setPaddingNd(DimsHW{0, 0}); return cv_2; } nvinfer1::IShuffleLayer* DFL(nvinfer1::INetworkDefinition* network, std::map weightMap, nvinfer1::ITensor& input, int ch, int k, int s, int p, std::string lname) { auto dim = input.getDimensions(); int c = dim.d[0]; int grid = dim.d[1] * dim.d[2]; int split_num = c / ch; nvinfer1::IShuffleLayer* shuffle1 = network->addShuffle(input); shuffle1->setReshapeDimensions(nvinfer1::Dims3{split_num, ch, grid}); shuffle1->setSecondTranspose(nvinfer1::Permutation{1, 0, 2}); nvinfer1::ISoftMaxLayer* softmax = network->addSoftMax(*shuffle1->getOutput(0)); nvinfer1::Weights bias_empty{nvinfer1::DataType::kFLOAT, nullptr, 0}; nvinfer1::IConvolutionLayer* conv = network->addConvolutionNd(*softmax->getOutput(0), 1, nvinfer1::DimsHW{1, 1}, weightMap[lname + ".conv.weight"], bias_empty); conv->setStrideNd(nvinfer1::DimsHW{s, s}); conv->setPaddingNd(nvinfer1::DimsHW{p, p}); nvinfer1::IShuffleLayer* shuffle2 = network->addShuffle(*conv->getOutput(0)); shuffle2->setReshapeDimensions(nvinfer1::Dims2{4, grid}); return shuffle2; } nvinfer1::IPluginV2Layer* addYoLoLayer(nvinfer1::INetworkDefinition* network, std::vector dets, bool is_segmentation) { auto creator = getPluginRegistry()->getPluginCreator("YoloLayer_TRT", "1"); nvinfer1::PluginField plugin_fields[1]; int netinfo[5] = {kNumClass, kInputW, kInputH, kMaxNumOutputBbox, is_segmentation}; plugin_fields[0].data = netinfo; plugin_fields[0].length = 5; plugin_fields[0].name = "netinfo"; plugin_fields[0].type = nvinfer1::PluginFieldType::kFLOAT32; nvinfer1::PluginFieldCollection plugin_data; plugin_data.nbFields = 1; plugin_data.fields = plugin_fields; nvinfer1::IPluginV2* plugin_obj = creator->createPlugin("yololayer", &plugin_data); std::vector input_tensors; for (auto det : dets) { 
input_tensors.push_back(det->getOutput(0)); } auto yolo = network->addPluginV2(&input_tensors[0], input_tensors.size(), *plugin_obj); return yolo; } std::vector DualDDetect(INetworkDefinition* network, std::map& weightMap, std::vector dets, int cls, std::vector ch, std::string lname) { int c2 = std::max(int(ch[0] / 4), int(16 * 4)); int c3 = std::max(ch[0], std::min(cls * 2, 128)); int reg_max = 16; std::vector bboxlayers; std::vector clslayers; for (int i = 0; i < dets.size(); i++) { // Conv(x, c2, 3), Conv(c2, c2, 3, g=4), nn.Conv2d(c2, 4 * self.reg_max, 1, groups=4) bboxlayers.push_back(DetectBbox_Conv(network, weightMap, *dets[i]->getOutput(0), c2, reg_max, lname + ".cv2." + std::to_string(i))); // Conv(x, c2, 3), Conv(c2, c2, 3), nn.Conv2d(c2, self.nc, 1) auto cls_layer = DetectCls_Conv(network, weightMap, *dets[i]->getOutput(0), c3, cls, lname + ".cv3." + std::to_string(i)); auto dim = cls_layer->getOutput(0)->getDimensions(); nvinfer1::IShuffleLayer* shuffle = network->addShuffle(*cls_layer->getOutput(0)); shuffle->setReshapeDimensions(nvinfer1::Dims2{kNumClass, dim.d[1] * dim.d[2]}); clslayers.push_back(shuffle); } std::vector ret; for (int i = 0; i < dets.size(); i++) { // softmax 16*4, w, h => 16, 4, w, h auto loc = DFL(network, weightMap, *bboxlayers[i]->getOutput(0), 16, 1, 1, 0, lname + ".dfl"); nvinfer1::ITensor* inputTensor[] = {loc->getOutput(0), clslayers[i]->getOutput(0)}; ret.push_back(network->addConcatenation(inputTensor, 2)); } return ret; } std::vector DDetect(INetworkDefinition* network, std::map& weightMap, std::vector dets, int cls, std::vector ch, std::string lname) { int c2 = std::max(int(ch[0] / 4), int(16 * 4)); // max((ch[0], min((self.nc * 2, 128)))) // int c3 = std::max(ch[0], std::min(cls * 2, 128)); int c3 = std::max(ch[0], std::min(cls, 128)); int reg_max = 16; std::vector bboxlayers; std::vector clslayers; for (int i = 0; i < dets.size(); i++) { // Conv(x, c2, 3), Conv(c2, c2, 3, g=4), nn.Conv2d(c2, 4 * self.reg_max, 1, 
groups=4) bboxlayers.push_back(DetectBbox_Conv(network, weightMap, *dets[i]->getOutput(0), c2, reg_max, lname + ".cv2." + std::to_string(i))); // Conv(x, c2, 3), Conv(c2, c2, 3), nn.Conv2d(c2, self.nc, 1) auto cls_layer = DetectCls_Conv(network, weightMap, *dets[i]->getOutput(0), c3, cls, lname + ".cv3." + std::to_string(i)); auto dim = cls_layer->getOutput(0)->getDimensions(); nvinfer1::IShuffleLayer* shuffle = network->addShuffle(*cls_layer->getOutput(0)); shuffle->setReshapeDimensions(nvinfer1::Dims2{kNumClass, dim.d[1] * dim.d[2]}); clslayers.push_back(shuffle); } std::vector ret; for (int i = 0; i < dets.size(); i++) { // softmax 16*4, w, h => 16, 4, w, h auto loc = DFL(network, weightMap, *bboxlayers[i]->getOutput(0), 16, 1, 1, 0, lname + ".dfl"); nvinfer1::ITensor* inputTensor[] = {loc->getOutput(0), clslayers[i]->getOutput(0)}; ret.push_back(network->addConcatenation(inputTensor, 2)); } return ret; } ================================================ FILE: yolov9/src/calibrator.cpp ================================================ #include "calibrator.h" #include "cuda_utils.h" #include "utils.h" #include #include #include #include #include static cv::Mat preprocess_img(cv::Mat& img, int input_w, int input_h) { int w, h, x, y; float r_w = input_w / (img.cols * 1.0); float r_h = input_h / (img.rows * 1.0); if (r_h > r_w) { w = input_w; h = r_w * img.rows; x = 0; y = (input_h - h) / 2; } else { w = r_h * img.cols; h = input_h; x = (input_w - w) / 2; y = 0; } cv::Mat re(h, w, CV_8UC3); cv::resize(img, re, re.size(), 0, 0, cv::INTER_LINEAR); cv::Mat out(input_h, input_w, CV_8UC3, cv::Scalar(128, 128, 128)); re.copyTo(out(cv::Rect(x, y, re.cols, re.rows))); return out; } Int8EntropyCalibrator2::Int8EntropyCalibrator2(int batchsize, int input_w, int input_h, const char* img_dir, const char* calib_table_name, const char* input_blob_name, bool read_cache) : batchsize_(batchsize), input_w_(input_w), input_h_(input_h), img_idx_(0), img_dir_(img_dir), 
calib_table_name_(calib_table_name), input_blob_name_(input_blob_name), read_cache_(read_cache) { input_count_ = 3 * input_w * input_h * batchsize; CUDA_CHECK(cudaMalloc(&device_input_, input_count_ * sizeof(float))); read_files_in_dir(img_dir, img_files_); }

// Frees the device buffer held in device_input_.
Int8EntropyCalibrator2::~Int8EntropyCalibrator2() {
    CUDA_CHECK(cudaFree(device_input_));
}

// Returns the batch size given at construction.
int Int8EntropyCalibrator2::getBatchSize() const TRT_NOEXCEPT {
    return batchsize_;
}

// Loads the next batchsize_ images, preprocesses them and uploads them to the device.
//
// bindings   - receives the device buffer pointer in slot 0
// names      - names[0] is asserted to equal the configured input blob name
// nbBindings - not used
//
// Returns false when fewer than batchsize_ images remain or an image cannot be
// read; otherwise advances the image cursor by one batch and returns true.
bool Int8EntropyCalibrator2::getBatch(void* bindings[], const char* names[], int nbBindings) TRT_NOEXCEPT {
    const int batch_end = img_idx_ + batchsize_;
    if (batch_end > (int)img_files_.size()) {
        return false;
    }

    std::vector input_imgs_;
    for (int idx = img_idx_; idx < batch_end; ++idx) {
        std::cout << img_files_[idx] << " " << idx << std::endl;
        cv::Mat raw = cv::imread(img_dir_ + img_files_[idx]);
        if (raw.empty()) {
            std::cerr << "Fatal error: image cannot open!" << std::endl;
            return false;
        }
        input_imgs_.push_back(preprocess_img(raw, input_w_, input_h_));
    }
    img_idx_ = batch_end;

    // Scale by 1/255 and swap the R and B channels; no mean subtraction, no cropping.
    cv::Mat blob = cv::dnn::blobFromImages(input_imgs_, 1.0 / 255.0, cv::Size(input_w_, input_h_),
                                           cv::Scalar(0, 0, 0), true, false);
    CUDA_CHECK(cudaMemcpy(device_input_, blob.ptr(0), input_count_ * sizeof(float), cudaMemcpyHostToDevice));

    assert(!strcmp(names[0], input_blob_name_));
    bindings[0] = device_input_;
    return true;
}

const void* Int8EntropyCalibrator2::readCalibrationCache(size_t& length) TRT_NOEXCEPT { std::cout << "reading calib cache: " << calib_table_name_ << std::endl; calib_cache_.clear(); std::ifstream input(calib_table_name_, std::ios::binary); input >> std::noskipws; if (read_cache_ && input.good()) { std::copy(std::istream_iterator(input), std::istream_iterator(), std::back_inserter(calib_cache_)); } length = calib_cache_.size(); return length ?
calib_cache_.data() : nullptr; } void Int8EntropyCalibrator2::writeCalibrationCache(const void* cache, size_t length) TRT_NOEXCEPT { std::cout << "writing calib cache: " << calib_table_name_ << " size: " << length << std::endl; std::ofstream output(calib_table_name_, std::ios::binary); output.write(reinterpret_cast(cache), length); } ================================================ FILE: yolov9/src/model.cpp ================================================ #include "model.h" #include #include #include #include #include #include #include "block.h" #include "calibrator.h" #include "config.h" #include "yololayer.h" using namespace nvinfer1; #ifdef USE_INT8 void Calibrator(IBuilder* builder, IBuilderConfig* config) { std::cout << "Your platform support int8: " << (builder->platformHasFastInt8() ? "true" : "false") << std::endl; assert(builder->platformHasFastInt8()); config->setFlag(BuilderFlag::kINT8); Int8EntropyCalibrator2* calibrator = new Int8EntropyCalibrator2(1, kInputW, kInputH, gCalibTablePath, "int8calib.table", kInputTensorName); config->setInt8Calibrator(calibrator); } #endif IHostMemory* build_engine_yolov9_t(unsigned int maxBatchSize, IBuilder* builder, IBuilderConfig* config, DataType dt, std::string& wts_name, bool isConvert) { /* ------ Create the builder ------ */ INetworkDefinition* network = builder->createNetworkV2(0U); ITensor* data = network->addInput(kInputTensorName, dt, Dims3{3, kInputH, kInputW}); assert(data); std::map weightMap = loadWeights(wts_name); // # conv down auto conv_1 = convBnSiLU(network, weightMap, *data, 16, 3, 2, 1, "model.0", 1); // # conv down auto conv_2 = convBnSiLU(network, weightMap, *conv_1->getOutput(0), 32, 3, 2, 1, "model.1"); // # elan-1 block auto repncspelan_3 = ELAN1(network, weightMap, *conv_2->getOutput(0), 32, 32, 32, 16, "model.2"); // # avg-conv down // [-1, 1, ADown, [256]], # 4-P3/8 auto adown_4 = AConv(network, weightMap, *repncspelan_3->getOutput(0), 64, "model.3"); // # elan-2 block // [-1, 1, 
RepNCSPELAN4, [512, 256, 128, 1]], # 5 auto repncspelan_5 = RepNCSPELAN4(network, weightMap, *adown_4->getOutput(0), 64, 64, 64, 32, 3, "model.4"); // # avg-conv down // [-1, 1, ADown, [512]], # 6-P4/16 auto adown_6 = AConv(network, weightMap, *repncspelan_5->getOutput(0), 96, "model.5"); // # elan-2 block // [-1, 1, RepNCSPELAN4, [512, 512, 256, 1]], # 7 auto repncspelan_7 = RepNCSPELAN4(network, weightMap, *adown_6->getOutput(0), 96, 96, 96, 48, 3, "model.6"); // # avg-conv down // [-1, 1, ADown, [512]], # 8-P5/32 auto adown_8 = AConv(network, weightMap, *repncspelan_7->getOutput(0), 128, "model.7"); // # elan-2 block // [-1, 1, RepNCSPELAN4, [512, 512, 256, 1]], # 9 auto repncspelan_9 = RepNCSPELAN4(network, weightMap, *adown_8->getOutput(0), 128, 128, 128, 64, 3, "model.8"); // # elan-spp block // [-1, 1, SPPELAN, [512, 256]], # 10 auto sppelan_10 = SPPELAN(network, weightMap, *repncspelan_9->getOutput(0), 128, 128, 64, "model.9"); // # up-concat merge // [-1, 1, nn.Upsample, [None, 2, 'nearest']], auto upsample_11 = network->addResize(*sppelan_10->getOutput(0)); upsample_11->setResizeMode(ResizeMode::kNEAREST); const float scales_11[] = {1.0, 2.0, 2.0}; upsample_11->setScales(scales_11, 3); // [[-1, 7], 1, Concat, [1]], # cat backbone P4 ITensor* input_tensor_12[] = {upsample_11->getOutput(0), repncspelan_7->getOutput(0)}; auto cat_12 = network->addConcatenation(input_tensor_12, 2); // # elan-2 block auto repncspelan_13 = RepNCSPELAN4(network, weightMap, *cat_12->getOutput(0), 288, 96, 96, 48, 3, "model.12"); // # up-concat merge // [-1, 1, nn.Upsample, [None, 2, 'nearest']], auto upsample_14 = network->addResize(*repncspelan_13->getOutput(0)); upsample_14->setResizeMode(ResizeMode::kNEAREST); const float scales_14[] = {1.0, 2.0, 2.0}; upsample_14->setScales(scales_14, 3); // [[-1, 5], 1, Concat, [1]], # cat backbone P3 ITensor* input_tensor_15[] = {upsample_14->getOutput(0), repncspelan_5->getOutput(0)}; auto cat_15 = 
network->addConcatenation(input_tensor_15, 2); // # elan-2 block // [-1, 1, RepNCSPELAN4, [256, 256, 128, 1]], # 16 (P3/8-small) auto repncspelan_16 = RepNCSPELAN4(network, weightMap, *cat_15->getOutput(0), 192, 64, 64, 32, 3, "model.15"); // # avg-conv-down merge // [-1, 1, ADown, [256]], auto adown_17 = AConv(network, weightMap, *repncspelan_16->getOutput(0), 48, "model.16"); // [[-1, 13], 1, Concat, [1]], # cat head P4 ITensor* input_tensor_18[] = {adown_17->getOutput(0), repncspelan_13->getOutput(0)}; auto cat_18 = network->addConcatenation(input_tensor_18, 2); // # elan-2 block // [-1, 1, RepNCSPELAN4, [512, 512, 256, 1]], # 19 (P4/16-medium) auto repncspelan_19 = RepNCSPELAN4(network, weightMap, *cat_18->getOutput(0), 144, 96, 96, 48, 3, "model.18"); // # avg-conv-down merge // [-1, 1, ADown, [512]], auto adown_20 = AConv(network, weightMap, *repncspelan_19->getOutput(0), 64, "model.19"); // [[-1, 10], 1, Concat, [1]], # cat head P5 ITensor* input_tensor_21[] = {adown_20->getOutput(0), sppelan_10->getOutput(0)}; auto cat_21 = network->addConcatenation(input_tensor_21, 2); // # elan-2 block // [-1, 1, RepNCSPELAN4, [512, 512, 256, 1]], # 22 (P5/32-large) auto repncspelan_22 = RepNCSPELAN4(network, weightMap, *cat_21->getOutput(0), 256, 128, 128, 64, 3, "model.21"); std::vector head; if (!isConvert) { // # elan-spp block auto sppelan_23 = SPPELAN(network, weightMap, *repncspelan_9->getOutput(0), 512, 128, 64, "model.22"); // # up-concat merge auto upsample_24 = network->addResize(*sppelan_23->getOutput(0)); upsample_24->setResizeMode(ResizeMode::kNEAREST); const float scales_24[] = {1.0, 2.0, 2.0}; upsample_24->setScales(scales_24, 3); // [[-1, 6], 1, Concat, [1]], # cat backbone P4 ITensor* input_tensor_25[] = {upsample_24->getOutput(0), repncspelan_7->getOutput(0)}; auto cat_25 = network->addConcatenation(input_tensor_25, 2); // # elan-2 block auto repncspelan_26 = RepNCSPELAN4(network, weightMap, *cat_25->getOutput(0), 384, 96, 96, 48, 3, "model.25"); // # 
up-concat merge auto upsample_27 = network->addResize(*repncspelan_26->getOutput(0)); upsample_27->setResizeMode(ResizeMode::kNEAREST); const float scales_27[] = {1.0, 2.0, 2.0}; upsample_27->setScales(scales_27, 3); // [[-1, 4], 1, Concat, [1]], # cat backbone P3 ITensor* input_tensor_28[] = {upsample_27->getOutput(0), repncspelan_5->getOutput(0)}; auto cat_28 = network->addConcatenation(input_tensor_28, 2); // # elan-2 block auto repncspelan_29 = RepNCSPELAN4(network, weightMap, *cat_28->getOutput(0), 256, 64, 64, 32, 3, "model.28"); head = DualDDetect(network, weightMap, std::vector{repncspelan_16, repncspelan_19, repncspelan_22}, kNumClass, {64, 96, 128}, "model.29"); } else { head = DDetect(network, weightMap, std::vector{repncspelan_16, repncspelan_19, repncspelan_22}, kNumClass, {64, 96, 128}, "model.22"); } nvinfer1::IPluginV2Layer* yolo = addYoLoLayer(network, head, false); yolo->getOutput(0)->setName(kOutputTensorName); network->markOutput(*yolo->getOutput(0)); builder->setMaxBatchSize(kBatchSize); config->setMaxWorkspaceSize(16 * (1 << 20)); #if defined(USE_FP16) config->setFlag(nvinfer1::BuilderFlag::kFP16); #elif defined(USE_INT8) std::cout << "Your platform support int8: " << (builder->platformHasFastInt8() ? "true" : "false") << std::endl; assert(builder->platformHasFastInt8()); config->setFlag(nvinfer1::BuilderFlag::kINT8); auto* calibrator = new Int8EntropyCalibrator2(1, kInputW, kInputH, gCalibTablePath, "int8calib.table", kInputTensorName); config->setInt8Calibrator(calibrator); #endif std::cout << "Building engine, please wait for a while..." << std::endl; IHostMemory* serialized_model = builder->buildSerializedNetwork(*network, *config); std::cout << "Build engine successfully!" 
<< std::endl; delete network; // Release host memory for (auto& mem : weightMap) { free((void*)(mem.second.values)); } return serialized_model; } IHostMemory* build_engine_yolov9_s(unsigned int maxBatchSize, IBuilder* builder, IBuilderConfig* config, DataType dt, std::string& wts_name, bool isConvert) { /* ------ Create the builder ------ */ INetworkDefinition* network = builder->createNetworkV2(0U); ITensor* data = network->addInput(kInputTensorName, dt, Dims3{3, kInputH, kInputW}); assert(data); std::map weightMap = loadWeights(wts_name); // # conv down auto conv_1 = convBnSiLU(network, weightMap, *data, 32, 3, 2, 1, "model.0", 1); // # conv down auto conv_2 = convBnSiLU(network, weightMap, *conv_1->getOutput(0), 64, 3, 2, 1, "model.1"); // # elan-1 block auto repncspelan_3 = ELAN1(network, weightMap, *conv_2->getOutput(0), 32, 64, 64, 32, "model.2"); // # avg-conv down auto adown_4 = AConv(network, weightMap, *repncspelan_3->getOutput(0), 128, "model.3"); // # elan-2 block auto repncspelan_5 = RepNCSPELAN4(network, weightMap, *adown_4->getOutput(0), 128, 128, 128, 64, 3, "model.4"); // # avg-conv down auto adown_6 = AConv(network, weightMap, *repncspelan_5->getOutput(0), 192, "model.5"); // # elan-2 block auto repncspelan_7 = RepNCSPELAN4(network, weightMap, *adown_6->getOutput(0), 192, 192, 192, 96, 3, "model.6"); // # avg-conv down auto adown_8 = AConv(network, weightMap, *repncspelan_7->getOutput(0), 256, "model.7"); // # elan-2 block auto repncspelan_9 = RepNCSPELAN4(network, weightMap, *adown_8->getOutput(0), 256, 256, 256, 128, 3, "model.8"); // # elan-spp block auto sppelan_10 = SPPELAN(network, weightMap, *repncspelan_9->getOutput(0), 512, 256, 128, "model.9"); // # up-concat merge auto upsample_11 = network->addResize(*sppelan_10->getOutput(0)); upsample_11->setResizeMode(ResizeMode::kNEAREST); const float scales_11[] = {1.0, 2.0, 2.0}; upsample_11->setScales(scales_11, 3); // [[-1, 7], 1, Concat, [1]], # cat backbone P4 ITensor* input_tensor_12[] = 
{upsample_11->getOutput(0), repncspelan_7->getOutput(0)}; auto cat_12 = network->addConcatenation(input_tensor_12, 2); // # elan-2 block // [-1, 1, RepNCSPELAN4, [512, 512, 256, 1]], # 13 auto repncspelan_13 = RepNCSPELAN4(network, weightMap, *cat_12->getOutput(0), 192, 192, 192, 96, 3, "model.12"); // # up-concat merge // [-1, 1, nn.Upsample, [None, 2, 'nearest']], auto upsample_14 = network->addResize(*repncspelan_13->getOutput(0)); upsample_14->setResizeMode(ResizeMode::kNEAREST); const float scales_14[] = {1.0, 2.0, 2.0}; upsample_14->setScales(scales_14, 3); // [[-1, 5], 1, Concat, [1]], # cat backbone P3 ITensor* input_tensor_15[] = {upsample_14->getOutput(0), repncspelan_5->getOutput(0)}; auto cat_15 = network->addConcatenation(input_tensor_15, 2); // # elan-2 block // [-1, 1, RepNCSPELAN4, [256, 256, 128, 1]], # 16 (P3/8-small) auto repncspelan_16 = RepNCSPELAN4(network, weightMap, *cat_15->getOutput(0), 128, 128, 128, 64, 3, "model.15"); // # avg-conv-down merge // [-1, 1, ADown, [256]], auto adown_17 = AConv(network, weightMap, *repncspelan_16->getOutput(0), 96, "model.16"); // [[-1, 13], 1, Concat, [1]], # cat head P4 ITensor* input_tensor_18[] = {adown_17->getOutput(0), repncspelan_13->getOutput(0)}; auto cat_18 = network->addConcatenation(input_tensor_18, 2); // # elan-2 block // [-1, 1, RepNCSPELAN4, [512, 512, 256, 1]], # 19 (P4/16-medium) auto repncspelan_19 = RepNCSPELAN4(network, weightMap, *cat_18->getOutput(0), 768, 192, 192, 96, 3, "model.18"); // # avg-conv-down merge // [-1, 1, ADown, [512]], auto adown_20 = AConv(network, weightMap, *repncspelan_19->getOutput(0), 128, "model.19"); // [[-1, 10], 1, Concat, [1]], # cat head P5 ITensor* input_tensor_21[] = {adown_20->getOutput(0), sppelan_10->getOutput(0)}; auto cat_21 = network->addConcatenation(input_tensor_21, 2); // # elan-2 block auto repncspelan_22 = RepNCSPELAN4(network, weightMap, *cat_21->getOutput(0), 1024, 256, 256, 128, 1, "model.21"); std::vector head; if (!isConvert) { // # 
elan-spp block auto sppelan_23 = SPPELAN(network, weightMap, *repncspelan_9->getOutput(0), 512, 256, 128, "model.22"); // # up-concat merge auto upsample_24 = network->addResize(*sppelan_23->getOutput(0)); upsample_24->setResizeMode(ResizeMode::kNEAREST); const float scales_24[] = {1.0, 2.0, 2.0}; upsample_24->setScales(scales_24, 3); // [[-1, 6], 1, Concat, [1]], # cat backbone P4 ITensor* input_tensor_25[] = {upsample_24->getOutput(0), repncspelan_7->getOutput(0)}; auto cat_25 = network->addConcatenation(input_tensor_25, 2); // # elan-2 block auto repncspelan_26 = RepNCSPELAN4(network, weightMap, *cat_25->getOutput(0), 384, 192, 192, 96, 3, "model.25"); // # up-concat merge auto upsample_27 = network->addResize(*repncspelan_26->getOutput(0)); upsample_27->setResizeMode(ResizeMode::kNEAREST); const float scales_27[] = {1.0, 2.0, 2.0}; upsample_27->setScales(scales_27, 3); // [[-1, 4], 1, Concat, [1]], # cat backbone P3 ITensor* input_tensor_28[] = {upsample_27->getOutput(0), repncspelan_5->getOutput(0)}; auto cat_28 = network->addConcatenation(input_tensor_28, 2); // # elan-2 block auto repncspelan_29 = RepNCSPELAN4(network, weightMap, *cat_28->getOutput(0), 256, 128, 128, 64, 3, "model.28"); head = DualDDetect(network, weightMap, std::vector{repncspelan_16, repncspelan_19, repncspelan_22}, kNumClass, {128, 192, 256}, "model.29"); } else { head = DDetect(network, weightMap, std::vector{repncspelan_16, repncspelan_19, repncspelan_22}, kNumClass, {128, 192, 256}, "model.22"); } nvinfer1::IPluginV2Layer* yolo = addYoLoLayer(network, head, false); yolo->getOutput(0)->setName(kOutputTensorName); network->markOutput(*yolo->getOutput(0)); builder->setMaxBatchSize(kBatchSize); config->setMaxWorkspaceSize(16 * (1 << 20)); #if defined(USE_FP16) config->setFlag(nvinfer1::BuilderFlag::kFP16); #elif defined(USE_INT8) std::cout << "Your platform support int8: " << (builder->platformHasFastInt8() ? 
"true" : "false") << std::endl; assert(builder->platformHasFastInt8()); config->setFlag(nvinfer1::BuilderFlag::kINT8); auto* calibrator = new Int8EntropyCalibrator2(1, kInputW, kInputH, gCalibTablePath, "int8calib.table", kInputTensorName); config->setInt8Calibrator(calibrator); #endif std::cout << "Building engine, please wait for a while..." << std::endl; IHostMemory* serialized_model = builder->buildSerializedNetwork(*network, *config); std::cout << "Build engine successfully!" << std::endl; delete network; // Release host memory for (auto& mem : weightMap) { free((void*)(mem.second.values)); } return serialized_model; } IHostMemory* build_engine_yolov9_m(unsigned int maxBatchSize, IBuilder* builder, IBuilderConfig* config, DataType dt, std::string& wts_name, bool isConvert) { /* ------ Create the builder ------ */ INetworkDefinition* network = builder->createNetworkV2(0U); ITensor* data = network->addInput(kInputTensorName, dt, Dims3{3, kInputH, kInputW}); assert(data); std::map weightMap = loadWeights(wts_name); int begin = isConvert ? 0 : 1; // # conv down // [-1, 1, Conv, [64, 3, 2]], # 1-P1/2 auto conv_1 = convBnSiLU(network, weightMap, *data, 32, 3, 2, 1, "model." + std::to_string(begin), 1); begin += 1; // # conv down // [-1, 1, Conv, [128, 3, 2]], # 2-P2/4 auto conv_2 = convBnSiLU(network, weightMap, *conv_1->getOutput(0), 64, 3, 2, 1, "model." + std::to_string(begin)); begin += 1; // # elan-1 block // [-1, 1, RepNCSPELAN4, [256, 128, 64, 1]], # 3 auto repncspelan_3 = RepNCSPELAN4(network, weightMap, *conv_2->getOutput(0), 128, 128, 128, 64, 1, "model." + std::to_string(begin)); begin += 1; // # avg-conv down // [-1, 1, ADown, [256]], # 4-P3/8 auto adown_4 = AConv(network, weightMap, *repncspelan_3->getOutput(0), 240, "model." + std::to_string(begin)); begin += 1; // # elan-2 block // [-1, 1, RepNCSPELAN4, [512, 256, 128, 1]], # 5 auto repncspelan_5 = RepNCSPELAN4(network, weightMap, *adown_4->getOutput(0), 256, 240, 240, 120, 1, "model." 
+ std::to_string(begin)); begin += 1; // # avg-conv down // [-1, 1, ADown, [512]], # 6-P4/16 auto adown_6 = AConv(network, weightMap, *repncspelan_5->getOutput(0), 360, "model." + std::to_string(begin)); begin += 1; // # elan-2 block // [-1, 1, RepNCSPELAN4, [512, 512, 256, 1]], # 7 auto repncspelan_7 = RepNCSPELAN4(network, weightMap, *adown_6->getOutput(0), 512, 360, 360, 180, 1, "model." + std::to_string(begin)); begin += 1; // # avg-conv down // [-1, 1, ADown, [512]], # 8-P5/32 auto adown_8 = AConv(network, weightMap, *repncspelan_7->getOutput(0), 480, "model." + std::to_string(begin)); begin += 1; // # elan-2 block // [-1, 1, RepNCSPELAN4, [512, 512, 256, 1]], # 9 auto repncspelan_9 = RepNCSPELAN4(network, weightMap, *adown_8->getOutput(0), 512, 480, 480, 240, 1, "model." + std::to_string(begin)); begin += 1; // # elan-spp block // [-1, 1, SPPELAN, [512, 256]], # 10 auto sppelan_10 = SPPELAN(network, weightMap, *repncspelan_9->getOutput(0), 512, 480, 240, "model." + std::to_string(begin)); begin += 3; // # up-concat merge // [-1, 1, nn.Upsample, [None, 2, 'nearest']], auto upsample_11 = network->addResize(*sppelan_10->getOutput(0)); upsample_11->setResizeMode(ResizeMode::kNEAREST); const float scales_11[] = {1.0, 2.0, 2.0}; upsample_11->setScales(scales_11, 3); // [[-1, 7], 1, Concat, [1]], # cat backbone P4 ITensor* input_tensor_12[] = {upsample_11->getOutput(0), repncspelan_7->getOutput(0)}; auto cat_12 = network->addConcatenation(input_tensor_12, 2); // # elan-2 block // [-1, 1, RepNCSPELAN4, [512, 512, 256, 1]], # 13 auto repncspelan_13 = RepNCSPELAN4(network, weightMap, *cat_12->getOutput(0), 1536, 360, 360, 180, 1, "model." 
+ std::to_string(begin)); begin += 3; // # up-concat merge // [-1, 1, nn.Upsample, [None, 2, 'nearest']], auto upsample_14 = network->addResize(*repncspelan_13->getOutput(0)); upsample_14->setResizeMode(ResizeMode::kNEAREST); const float scales_14[] = {1.0, 2.0, 2.0}; upsample_14->setScales(scales_14, 3); // [[-1, 5], 1, Concat, [1]], # cat backbone P3 ITensor* input_tensor_15[] = {upsample_14->getOutput(0), repncspelan_5->getOutput(0)}; auto cat_15 = network->addConcatenation(input_tensor_15, 2); // # elan-2 block // [-1, 1, RepNCSPELAN4, [256, 256, 128, 1]], # 16 (P3/8-small) auto repncspelan_16 = RepNCSPELAN4(network, weightMap, *cat_15->getOutput(0), 1024, 240, 240, 120, 1, "model." + std::to_string(begin)); begin += 1; // # avg-conv-down merge // [-1, 1, ADown, [256]], auto adown_17 = AConv(network, weightMap, *repncspelan_16->getOutput(0), 184, "model." + std::to_string(begin)); begin += 2; // [[-1, 13], 1, Concat, [1]], # cat head P4 ITensor* input_tensor_18[] = {adown_17->getOutput(0), repncspelan_13->getOutput(0)}; auto cat_18 = network->addConcatenation(input_tensor_18, 2); // # elan-2 block // [-1, 1, RepNCSPELAN4, [512, 512, 256, 1]], # 19 (P4/16-medium) auto repncspelan_19 = RepNCSPELAN4(network, weightMap, *cat_18->getOutput(0), 768, 360, 360, 180, 1, "model." + std::to_string(begin)); begin += 1; // # avg-conv-down merge // [-1, 1, ADown, [512]], auto adown_20 = AConv(network, weightMap, *repncspelan_19->getOutput(0), 240, "model." + std::to_string(begin)); begin += 2; // [[-1, 10], 1, Concat, [1]], # cat head P5 ITensor* input_tensor_21[] = {adown_20->getOutput(0), sppelan_10->getOutput(0)}; auto cat_21 = network->addConcatenation(input_tensor_21, 2); // # elan-2 block // [-1, 1, RepNCSPELAN4, [512, 512, 256, 1]], # 22 (P5/32-large) auto repncspelan_22 = RepNCSPELAN4(network, weightMap, *cat_21->getOutput(0), 1024, 480, 480, 240, 1, "model." 
+ std::to_string(begin)); begin += 1; std::vector head; if (!isConvert) { // # routing // [5, 1, CBLinear, [[256]]], # 23 auto cblinear_23 = CBLinear(network, weightMap, *repncspelan_5->getOutput(0), {240}, 1, 1, 0, 1, "model." + std::to_string(begin)); begin += 1; // [7, 1, CBLinear, [[256, 512]]], # 24 auto cblinear_24 = CBLinear(network, weightMap, *repncspelan_7->getOutput(0), {240, 360}, 1, 1, 0, 1, "model." + std::to_string(begin)); begin += 1; // [9, 1, CBLinear, [[256, 512, 512]]], # 25 auto cblinear_25 = CBLinear(network, weightMap, *repncspelan_9->getOutput(0), {240, 360, 480}, 1, 1, 0, 1, "model." + std::to_string(begin)); begin += 1; // # conv down // [0, 1, Conv, [64, 3, 2]], # 26-P1/2 auto conv_26 = convBnSiLU(network, weightMap, *data, 32, 3, 2, 1, "model." + std::to_string(begin), 1); begin += 1; // # conv down // [-1, 1, Conv, [128, 3, 2]], # 27-P2/4 auto conv_27 = convBnSiLU(network, weightMap, *conv_26->getOutput(0), 64, 3, 2, 1, "model." + std::to_string(begin)); begin += 1; // # elan-1 block // [-1, 1, RepNCSPELAN4, [256, 128, 64, 1]], # 28 auto repncspelan_28 = RepNCSPELAN4(network, weightMap, *conv_27->getOutput(0), 128, 128, 128, 64, 1, "model." + std::to_string(begin)); begin += 1; // # avg-conv down fuse // [-1, 1, ADown, [256]], # 29-P3/8 auto adown_29 = AConv(network, weightMap, *repncspelan_28->getOutput(0), 240, "model." + std::to_string(begin)); begin += 2; // [[23, 24, 25, -1], 1, CBFuse, [[0, 0, 0]]], # 30 auto cbfuse = CBFuse(network, {cblinear_23, cblinear_24, cblinear_25, std::vector{adown_29}}, {0, 0, 0, 0}, {8, 16, 32, 8}); // # elan-2 block // [-1, 1, RepNCSPELAN4, [512, 256, 128, 1]], # 31 auto repncspelan_31 = RepNCSPELAN4(network, weightMap, *cbfuse->getOutput(0), 256, 240, 240, 120, 1, "model." + std::to_string(begin)); begin += 1; // # avg-conv down fuse // [-1, 1, ADown, [512]], # 32-P4/16 auto adown_32 = AConv(network, weightMap, *repncspelan_31->getOutput(0), 360, "model." 
+ std::to_string(begin)); begin += 2; // [[24, 25, -1], 1, CBFuse, [[1, 1]]], # 33 auto cbfuse_33 = CBFuse(network, {cblinear_24, cblinear_25, std::vector{adown_32}}, {1, 1, 0}, {16, 32, 16}); // # elan-2 block // [-1, 1, RepNCSPELAN4, [512, 512, 256, 1]], # 34 auto repncspelan_34 = RepNCSPELAN4(network, weightMap, *cbfuse_33->getOutput(0), 512, 360, 360, 180, 1, "model." + std::to_string(begin)); begin += 1; // # avg-conv down fuse // [-1, 1, ADown, [512]], # 35-P5/32 auto adown_35 = AConv(network, weightMap, *repncspelan_34->getOutput(0), 480, "model." + std::to_string(begin)); begin += 2; // [[25, -1], 1, CBFuse, [[2]]], # 36 auto cbfuse_36 = CBFuse(network, {cblinear_25, std::vector{adown_35}}, {2, 0}, {32, 32}); // # elan-2 block // [-1, 1, RepNCSPELAN4, [512, 512, 256, 1]], # 37 auto repncspelan_37 = RepNCSPELAN4(network, weightMap, *cbfuse_36->getOutput(0), 512, 480, 480, 240, 1, "model." + std::to_string(begin)); begin += 1; // # detection head // # detect // [[31, 34, 37, 16, 19, 22], 1, DualDDetect, [nc]], # DualDDetect(A3, A4, A5, P3, P4, P5) head = DualDDetect(network, weightMap, std::vector{repncspelan_31, repncspelan_34, repncspelan_37}, kNumClass, {240, 360, 480}, "model." + std::to_string(begin)); } else { // # detection head // # detect // [[16, 19, 22], 1, DDetect, [nc]], # DDetect(P3, P4, P5) head = DDetect(network, weightMap, std::vector{repncspelan_16, repncspelan_19, repncspelan_22}, kNumClass, {240, 360, 480}, "model." + std::to_string(begin)); } nvinfer1::IPluginV2Layer* yolo = addYoLoLayer(network, head, false); yolo->getOutput(0)->setName(kOutputTensorName); network->markOutput(*yolo->getOutput(0)); builder->setMaxBatchSize(kBatchSize); config->setMaxWorkspaceSize(16 * (1 << 20)); #if defined(USE_FP16) config->setFlag(nvinfer1::BuilderFlag::kFP16); #elif defined(USE_INT8) std::cout << "Your platform support int8: " << (builder->platformHasFastInt8() ? 
"true" : "false") << std::endl; assert(builder->platformHasFastInt8()); config->setFlag(nvinfer1::BuilderFlag::kINT8); auto* calibrator = new Int8EntropyCalibrator2(1, kInputW, kInputH, gCalibTablePath, "int8calib.table", kInputTensorName); config->setInt8Calibrator(calibrator); #endif std::cout << "Building engine, please wait for a while..." << std::endl; IHostMemory* serialized_model = builder->buildSerializedNetwork(*network, *config); std::cout << "Build engine successfully!" << std::endl; delete network; // Release host memory for (auto& mem : weightMap) { free((void*)(mem.second.values)); } return serialized_model; } IHostMemory* build_engine_yolov9_c(unsigned int maxBatchSize, IBuilder* builder, IBuilderConfig* config, DataType dt, std::string& wts_name) { /* ------ Create the builder ------ */ INetworkDefinition* network = builder->createNetworkV2(0U); ITensor* data = network->addInput(kInputTensorName, dt, Dims3{3, kInputH, kInputW}); assert(data); std::map weightMap = loadWeights(wts_name); // # conv down // [-1, 1, Conv, [64, 3, 2]], # 1-P1/2 auto conv_1 = convBnSiLU(network, weightMap, *data, 64, 3, 2, 1, "model.1", 1); // # conv down // [-1, 1, Conv, [128, 3, 2]], # 2-P2/4 auto conv_2 = convBnSiLU(network, weightMap, *conv_1->getOutput(0), 128, 3, 2, 1, "model.2"); // # elan-1 block // [-1, 1, RepNCSPELAN4, [256, 128, 64, 1]], # 3 auto repncspelan_3 = RepNCSPELAN4(network, weightMap, *conv_2->getOutput(0), 128, 256, 128, 64, 1, "model.3"); // # avg-conv down // [-1, 1, ADown, [256]], # 4-P3/8 auto adown_4 = ADown(network, weightMap, *repncspelan_3->getOutput(0), 256, "model.4"); // # elan-2 block // [-1, 1, RepNCSPELAN4, [512, 256, 128, 1]], # 5 auto repncspelan_5 = RepNCSPELAN4(network, weightMap, *adown_4->getOutput(0), 256, 512, 256, 128, 1, "model.5"); // # avg-conv down // [-1, 1, ADown, [512]], # 6-P4/16 auto adown_6 = ADown(network, weightMap, *repncspelan_5->getOutput(0), 512, "model.6"); // # elan-2 block // [-1, 1, RepNCSPELAN4, [512, 512, 
256, 1]], # 7 auto repncspelan_7 = RepNCSPELAN4(network, weightMap, *adown_6->getOutput(0), 512, 512, 512, 256, 1, "model.7"); // # avg-conv down // [-1, 1, ADown, [512]], # 8-P5/32 auto adown_8 = ADown(network, weightMap, *repncspelan_7->getOutput(0), 512, "model.8"); // # elan-2 block // [-1, 1, RepNCSPELAN4, [512, 512, 256, 1]], # 9 auto repncspelan_9 = RepNCSPELAN4(network, weightMap, *adown_8->getOutput(0), 512, 512, 512, 256, 1, "model.9"); // # elan-spp block // [-1, 1, SPPELAN, [512, 256]], # 10 auto sppelan_10 = SPPELAN(network, weightMap, *repncspelan_9->getOutput(0), 512, 512, 256, "model.10"); // # up-concat merge // [-1, 1, nn.Upsample, [None, 2, 'nearest']], auto upsample_11 = network->addResize(*sppelan_10->getOutput(0)); upsample_11->setResizeMode(ResizeMode::kNEAREST); const float scales_11[] = {1.0, 2.0, 2.0}; upsample_11->setScales(scales_11, 3); // [[-1, 7], 1, Concat, [1]], # cat backbone P4 ITensor* input_tensor_12[] = {upsample_11->getOutput(0), repncspelan_7->getOutput(0)}; auto cat_12 = network->addConcatenation(input_tensor_12, 2); // # elan-2 block // [-1, 1, RepNCSPELAN4, [512, 512, 256, 1]], # 13 auto repncspelan_13 = RepNCSPELAN4(network, weightMap, *cat_12->getOutput(0), 1536, 512, 512, 256, 1, "model.13"); // # up-concat merge // [-1, 1, nn.Upsample, [None, 2, 'nearest']], auto upsample_14 = network->addResize(*repncspelan_13->getOutput(0)); upsample_14->setResizeMode(ResizeMode::kNEAREST); const float scales_14[] = {1.0, 2.0, 2.0}; upsample_14->setScales(scales_14, 3); // [[-1, 5], 1, Concat, [1]], # cat backbone P3 ITensor* input_tensor_15[] = {upsample_14->getOutput(0), repncspelan_5->getOutput(0)}; auto cat_15 = network->addConcatenation(input_tensor_15, 2); // # elan-2 block // [-1, 1, RepNCSPELAN4, [256, 256, 128, 1]], # 16 (P3/8-small) auto repncspelan_16 = RepNCSPELAN4(network, weightMap, *cat_15->getOutput(0), 1024, 256, 256, 128, 1, "model.16"); // # avg-conv-down merge // [-1, 1, ADown, [256]], auto adown_17 = 
ADown(network, weightMap, *repncspelan_16->getOutput(0), 256, "model.17"); // [[-1, 13], 1, Concat, [1]], # cat head P4 ITensor* input_tensor_18[] = {adown_17->getOutput(0), repncspelan_13->getOutput(0)}; auto cat_18 = network->addConcatenation(input_tensor_18, 2); // # elan-2 block // [-1, 1, RepNCSPELAN4, [512, 512, 256, 1]], # 19 (P4/16-medium) auto repncspelan_19 = RepNCSPELAN4(network, weightMap, *cat_18->getOutput(0), 768, 512, 512, 256, 1, "model.19"); // # avg-conv-down merge // [-1, 1, ADown, [512]], auto adown_20 = ADown(network, weightMap, *repncspelan_19->getOutput(0), 512, "model.20"); // [[-1, 10], 1, Concat, [1]], # cat head P5 ITensor* input_tensor_21[] = {adown_20->getOutput(0), sppelan_10->getOutput(0)}; auto cat_21 = network->addConcatenation(input_tensor_21, 2); // # elan-2 block // [-1, 1, RepNCSPELAN4, [512, 512, 256, 1]], # 22 (P5/32-large) auto repncspelan_22 = RepNCSPELAN4(network, weightMap, *cat_21->getOutput(0), 1024, 512, 512, 256, 1, "model.22"); // # multi-level reversible auxiliary branch // # routing // [5, 1, CBLinear, [[256]]], # 23 auto cblinear_23 = CBLinear(network, weightMap, *repncspelan_5->getOutput(0), {256}, 1, 1, 0, 1, "model.23"); // [7, 1, CBLinear, [[256, 512]]], # 24 auto cblinear_24 = CBLinear(network, weightMap, *repncspelan_7->getOutput(0), {256, 512}, 1, 1, 0, 1, "model.24"); // [9, 1, CBLinear, [[256, 512, 512]]], # 25 auto cblinear_25 = CBLinear(network, weightMap, *repncspelan_9->getOutput(0), {256, 512, 512}, 1, 1, 0, 1, "model.25"); // # conv down // [0, 1, Conv, [64, 3, 2]], # 26-P1/2 auto conv_26 = convBnSiLU(network, weightMap, *data, 64, 3, 2, 1, "model.26", 1); // # conv down // [-1, 1, Conv, [128, 3, 2]], # 27-P2/4 auto conv_27 = convBnSiLU(network, weightMap, *conv_26->getOutput(0), 128, 3, 2, 1, "model.27"); // # elan-1 block // [-1, 1, RepNCSPELAN4, [256, 128, 64, 1]], # 28 auto repncspelan_28 = RepNCSPELAN4(network, weightMap, *conv_27->getOutput(0), 128, 256, 128, 64, 1, "model.28"); // # avg-conv 
down fuse // [-1, 1, ADown, [256]], # 29-P3/8 auto adown_29 = ADown(network, weightMap, *repncspelan_28->getOutput(0), 256, "model.29"); // [[23, 24, 25, -1], 1, CBFuse, [[0, 0, 0]]], # 30 auto cbfuse = CBFuse(network, {cblinear_23, cblinear_24, cblinear_25, std::vector{adown_29}}, {0, 0, 0, 0}, {8, 16, 32, 8}); // # elan-2 block // [-1, 1, RepNCSPELAN4, [512, 256, 128, 1]], # 31 auto repncspelan_31 = RepNCSPELAN4(network, weightMap, *cbfuse->getOutput(0), 256, 512, 256, 128, 1, "model.31"); // # avg-conv down fuse // [-1, 1, ADown, [512]], # 32-P4/16 auto adown_32 = ADown(network, weightMap, *repncspelan_31->getOutput(0), 512, "model.32"); // [[24, 25, -1], 1, CBFuse, [[1, 1]]], # 33 auto cbfuse_33 = CBFuse(network, {cblinear_24, cblinear_25, std::vector{adown_32}}, {1, 1, 0}, {16, 32, 16}); // # elan-2 block // [-1, 1, RepNCSPELAN4, [512, 512, 256, 1]], # 34 auto repncspelan_34 = RepNCSPELAN4(network, weightMap, *cbfuse_33->getOutput(0), 512, 512, 512, 256, 1, "model.34"); // # avg-conv down fuse // [-1, 1, ADown, [512]], # 35-P5/32 auto adown_35 = ADown(network, weightMap, *repncspelan_34->getOutput(0), 512, "model.35"); // [[25, -1], 1, CBFuse, [[2]]], # 36 auto cbfuse_36 = CBFuse(network, {cblinear_25, std::vector{adown_35}}, {2, 0}, {32, 32}); // # elan-2 block // [-1, 1, RepNCSPELAN4, [512, 512, 256, 1]], # 37 auto repncspelan_37 = RepNCSPELAN4(network, weightMap, *cbfuse_36->getOutput(0), 512, 512, 512, 256, 1, "model.37"); // # detection head // # detect // [[31, 34, 37, 16, 19, 22], 1, DualDDetect, [nc]], # DualDDetect(A3, A4, A5, P3, P4, P5) auto dualddetect_38 = DualDDetect(network, weightMap, std::vector{repncspelan_31, repncspelan_34, repncspelan_37}, kNumClass, {512, 512, 512}, "model.38"); nvinfer1::IPluginV2Layer* yolo = addYoLoLayer(network, dualddetect_38, false); yolo->getOutput(0)->setName(kOutputTensorName); network->markOutput(*yolo->getOutput(0)); builder->setMaxBatchSize(kBatchSize); config->setMaxWorkspaceSize(16 * (1 << 20)); #if 
defined(USE_FP16) config->setFlag(nvinfer1::BuilderFlag::kFP16); #elif defined(USE_INT8) std::cout << "Your platform support int8: " << (builder->platformHasFastInt8() ? "true" : "false") << std::endl; assert(builder->platformHasFastInt8()); config->setFlag(nvinfer1::BuilderFlag::kINT8); auto* calibrator = new Int8EntropyCalibrator2(1, kInputW, kInputH, gCalibTablePath, "int8calib.table", kInputTensorName); config->setInt8Calibrator(calibrator); #endif std::cout << "Building engine, please wait for a while..." << std::endl; IHostMemory* serialized_model = builder->buildSerializedNetwork(*network, *config); std::cout << "Build engine successfully!" << std::endl; delete network; // Release host memory for (auto& mem : weightMap) { free((void*)(mem.second.values)); } return serialized_model; } IHostMemory* build_engine_yolov9_e(unsigned int maxBatchSize, IBuilder* builder, IBuilderConfig* config, DataType dt, std::string& wts_name) { /* ------ Create the builder ------ */ INetworkDefinition* network = builder->createNetworkV2(0U); ITensor* data = network->addInput(kInputTensorName, dt, Dims3{3, kInputH, kInputW}); assert(data); std::map weightMap = loadWeights(wts_name); /* ------backbone------ */ // [-1, 1, Conv, [64, 3, 2]], # 1-P1/2 auto conv_1 = convBnSiLU(network, weightMap, *data, 64, 3, 2, 1, "model.1", 1); assert(conv_1); // [-1, 1, Conv, [128, 3, 2]], # 2-P2/4 auto conv_2 = convBnSiLU(network, weightMap, *conv_1->getOutput(0), 128, 3, 2, 1, "model.2"); // csp-elan block // [-1, 1, RepNCSPELAN4, [256, 128, 64, 2]], # 3 auto repncspelan_3 = RepNCSPELAN4(network, weightMap, *conv_2->getOutput(0), 128, 256, 128, 64, 2, "model.3"); // avg-conv down // [-1, 1, ADown, [256]], # 4-P3/8 auto adown_4 = ADown(network, weightMap, *repncspelan_3->getOutput(0), 256, "model.4"); // csp-elan block // [-1, 1, RepNCSPELAN4, [512, 256, 128, 2]], # 5 auto repncspelan_5 = RepNCSPELAN4(network, weightMap, *adown_4->getOutput(0), 256, 512, 256, 128, 2, "model.5"); // avg-conv down 
// [-1, 1, ADown, [512]], # 6-P4/16 auto adown_6 = ADown(network, weightMap, *repncspelan_5->getOutput(0), 512, "model.6"); // csp-elan block // [-1, 1, RepNCSPELAN4, [1024, 512, 256, 2]], # 7 auto repncspelan_7 = RepNCSPELAN4(network, weightMap, *adown_6->getOutput(0), 512, 1024, 512, 256, 2, "model.7"); // avg-conv down // [-1, 1, ADown, [1024]], # 8-P5/32 auto adown_8 = ADown(network, weightMap, *repncspelan_7->getOutput(0), 1024, "model.8"); // csp-elan block // [-1, 1, RepNCSPELAN4, [1024, 512, 256, 2]], # 9 auto repncspelan_9 = RepNCSPELAN4(network, weightMap, *adown_8->getOutput(0), 512, 1024, 512, 256, 2, "model.9"); // [1, 1, CBLinear, [[64]]], # 10 auto cblinear_10 = CBLinear(network, weightMap, *conv_1->getOutput(0), {64}, 1, 1, 0, 1, "model.10"); // [3, 1, CBLinear, [[64, 128]]], # 11 auto cblinear_11 = CBLinear(network, weightMap, *repncspelan_3->getOutput(0), {64, 128}, 1, 1, 0, 1, "model.11"); // [5, 1, CBLinear, [[64, 128, 256]]], # 12 auto cblinear_12 = CBLinear(network, weightMap, *repncspelan_5->getOutput(0), {64, 128, 256}, 1, 1, 0, 1, "model.12"); // [7, 1, CBLinear, [[64, 128, 256, 512]]], # 13 auto cblinear_13 = CBLinear(network, weightMap, *repncspelan_7->getOutput(0), {64, 128, 256, 512}, 1, 1, 0, 1, "model.13"); // [9, 1, CBLinear, [[64, 128, 256, 512, 1024]]], # 14 auto cblinear_14 = CBLinear(network, weightMap, *repncspelan_9->getOutput(0), {64, 128, 256, 512, 1024}, 1, 1, 0, 1, "model.14"); // conv down // [0, 1, Conv, [64, 3, 2]], # 15-P1/2 auto conv_15 = convBnSiLU(network, weightMap, *data, 64, 3, 2, 1, "model.15", 1); // [[10, 11, 12, 13, 14, -1], 1, CBFuse, [[0, 0, 0, 0, 0]]], # 16 auto cbfuse_16 = CBFuse( network, {cblinear_10, cblinear_11, cblinear_12, cblinear_13, cblinear_14, std::vector{conv_15}}, {0, 0, 0, 0, 0, 0}, {2, 4, 8, 16, 32, 2}); // conv down // [-1, 1, Conv, [128, 3, 2]], # 17-P2/4 auto conv_17 = convBnSiLU(network, weightMap, *cbfuse_16->getOutput(0), 128, 3, 2, 1, "model.17"); // [[11, 12, 13, 14, -1], 1, CBFuse, 
[[1, 1, 1, 1]]], # 18 auto cbfuse_18 = CBFuse(network, {cblinear_11, cblinear_12, cblinear_13, cblinear_14, std::vector{conv_17}}, {1, 1, 1, 1, 0}, {4, 8, 16, 32, 4}); // csp-elan block // [-1, 1, RepNCSPELAN4, [256, 128, 64, 2]], # 19 auto repncspelan_19 = RepNCSPELAN4(network, weightMap, *cbfuse_18->getOutput(0), 128, 256, 128, 64, 2, "model.19"); // avg-conv down fuse // [-1, 1, ADown, [256]], # 20-P3/8 auto adown_20 = ADown(network, weightMap, *repncspelan_19->getOutput(0), 256, "model.20"); // [[12, 13, 14, -1], 1, CBFuse, [[2, 2, 2]]], # 21 auto cbfuse_21 = CBFuse(network, {cblinear_12, cblinear_13, cblinear_14, std::vector{adown_20}}, {2, 2, 2, 0}, {8, 16, 32, 8}); // csp-elan block // [-1, 1, RepNCSPELAN4, [512, 256, 128, 2]], # 22 auto repncspelan_22 = RepNCSPELAN4(network, weightMap, *cbfuse_21->getOutput(0), 256, 512, 256, 128, 2, "model.22"); // avg-conv down fuse // [-1, 1, ADown, [512]], # 23-P4/16 auto adown_23 = ADown(network, weightMap, *repncspelan_22->getOutput(0), 512, "model.23"); // [[13, 14, -1], 1, CBFuse, [[3, 3]]], # 24 auto cbfuse_24 = CBFuse(network, {cblinear_13, cblinear_14, std::vector{adown_23}}, {3, 3, 0}, {16, 32, 16}); // csp-elan block // [-1, 1, RepNCSPELAN4, [1024, 512, 256, 2]], # 25 auto repncspelan_25 = RepNCSPELAN4(network, weightMap, *cbfuse_24->getOutput(0), 512, 1024, 512, 256, 2, "model.25"); // avg-conv down fuse // [-1, 1, ADown, [1024]], # 26-P5/32 auto adown_26 = ADown(network, weightMap, *repncspelan_25->getOutput(0), 1024, "model.26"); // [[14, -1], 1, CBFuse, [[4]]], # 27 auto cbfuse_27 = CBFuse(network, {cblinear_14, std::vector{adown_26}}, {4, 0}, {32, 32}); // csp-elan block // [-1, 1, RepNCSPELAN4, [1024, 512, 256, 2]], # 28 auto repncspelan_28 = RepNCSPELAN4(network, weightMap, *cbfuse_27->getOutput(0), 512, 1024, 512, 256, 2, "model.28"); // elan-spp block // [9, 1, SPPELAN, [512, 256]], # 29 auto sppelan_29 = SPPELAN(network, weightMap, *repncspelan_9->getOutput(0), 1024, 512, 256, "model.29"); // # 
up-concat merge // [-1, 1, nn.Upsample, [None, 2, 'nearest']], auto upsample_30 = network->addResize(*sppelan_29->getOutput(0)); upsample_30->setResizeMode(ResizeMode::kNEAREST); const float scales_30[] = {1.0, 2.0, 2.0}; upsample_30->setScales(scales_30, 3); // [[-1, 7], 1, Concat, [1]], # cat backbone P4 ITensor* input_tensor_31[] = {upsample_30->getOutput(0), repncspelan_7->getOutput(0)}; auto cat_31 = network->addConcatenation(input_tensor_31, 2); // # csp-elan block // [-1, 1, RepNCSPELAN4, [512, 512, 256, 2]], # 32 auto repncspelan_32 = RepNCSPELAN4(network, weightMap, *cat_31->getOutput(0), 1536, 512, 512, 256, 2, "model.32"); // # up-concat merge // [-1, 1, nn.Upsample, [None, 2, 'nearest']], auto upsample_33 = network->addResize(*repncspelan_32->getOutput(0)); upsample_33->setResizeMode(ResizeMode::kNEAREST); const float scales_33[] = {1.0, 2.0, 2.0}; upsample_33->setScales(scales_33, 3); // [[-1, 5], 1, Concat, [1]], # cat backbone P3 ITensor* input_tensor_34[] = {upsample_33->getOutput(0), repncspelan_5->getOutput(0)}; auto cat_34 = network->addConcatenation(input_tensor_34, 2); // # csp-elan block // [-1, 1, RepNCSPELAN4, [256, 256, 128, 2]], # 35 auto repncspelan_35 = RepNCSPELAN4(network, weightMap, *cat_34->getOutput(0), 1024, 256, 256, 128, 2, "model.35"); // # elan-spp block // [28, 1, SPPELAN, [512, 256]], # 36 auto sppelan_36 = SPPELAN(network, weightMap, *repncspelan_28->getOutput(0), 1024, 512, 256, "model.36"); // # up-concat merge // [-1, 1, nn.Upsample, [None, 2, 'nearest']], auto upsample_37 = network->addResize(*sppelan_36->getOutput(0)); upsample_37->setResizeMode(ResizeMode::kNEAREST); const float scales_37[] = {1.0, 2.0, 2.0}; upsample_37->setScales(scales_37, 3); // [[-1, 25], 1, Concat, [1]], # cat backbone P4 ITensor* input_tensor_38[] = {upsample_37->getOutput(0), repncspelan_25->getOutput(0)}; auto cat_38 = network->addConcatenation(input_tensor_38, 2); // # csp-elan block // [-1, 1, RepNCSPELAN4, [512, 512, 256, 2]], # 39 auto 
repncspelan_39 = RepNCSPELAN4(network, weightMap, *cat_38->getOutput(0), 1536, 512, 512, 256, 2, "model.39"); // # up-concat merge // [-1, 1, nn.Upsample, [None, 2, 'nearest']], auto upsample_40 = network->addResize(*repncspelan_39->getOutput(0)); upsample_40->setResizeMode(ResizeMode::kNEAREST); const float scales_40[] = {1.0, 2.0, 2.0}; upsample_40->setScales(scales_40, 3); // [[-1, 22], 1, Concat, [1]], # cat backbone P3 ITensor* input_tensor_41[] = {upsample_40->getOutput(0), repncspelan_22->getOutput(0)}; auto cat_41 = network->addConcatenation(input_tensor_41, 2); // # csp-elan block // [-1, 1, RepNCSPELAN4, [256, 256, 128, 2]], # 42 (P3/8-small) auto repncspelan_42 = RepNCSPELAN4(network, weightMap, *cat_41->getOutput(0), 1024, 256, 256, 128, 2, "model.42"); // # avg-conv-down merge // [-1, 1, ADown, [256]], auto adown_43 = ADown(network, weightMap, *repncspelan_42->getOutput(0), 256, "model.43"); // [[-1, 39], 1, Concat, [1]], # cat head P4 ITensor* input_tensor_44[] = {adown_43->getOutput(0), repncspelan_39->getOutput(0)}; auto cat_44 = network->addConcatenation(input_tensor_44, 2); // # csp-elan block // [-1, 1, RepNCSPELAN4, [512, 512, 256, 2]], # 45 (P4/16-medium) auto repncspelan_45 = RepNCSPELAN4(network, weightMap, *cat_44->getOutput(0), 768, 512, 512, 256, 2, "model.45"); // # avg-conv-down merge // [-1, 1, ADown, [512]], auto adown_46 = ADown(network, weightMap, *repncspelan_45->getOutput(0), 512, "model.46"); // [[-1, 36], 1, Concat, [1]], # cat head P5 ITensor* input_tensor_47[] = {adown_46->getOutput(0), sppelan_36->getOutput(0)}; auto cat_47 = network->addConcatenation(input_tensor_47, 2); // # csp-elan block // [-1, 1, RepNCSPELAN4, [512, 1024, 512, 2]], # 48 (P5/32-large) auto repncspelan_48 = RepNCSPELAN4(network, weightMap, *cat_47->getOutput(0), 1024, 512, 1024, 512, 2, "model.48"); // auto DualDDetect_49 = DualDDetect(network, weightMap, std::vector{RepNCSPELAN_42, RepNCSPELAN_45, RepNCSPELAN_48}, kNumClass, {256, 512, 512}, "model.49"); 
auto dualddetect_49 = DualDDetect(network, weightMap, std::vector{repncspelan_35, repncspelan_32, sppelan_29}, kNumClass, {256, 512, 512}, "model.49"); nvinfer1::IPluginV2Layer* yolo = addYoLoLayer(network, dualddetect_49, false); yolo->getOutput(0)->setName(kOutputTensorName); network->markOutput(*yolo->getOutput(0)); builder->setMaxBatchSize(kBatchSize); config->setMaxWorkspaceSize(16 * (1 << 20)); #if defined(USE_FP16) config->setFlag(nvinfer1::BuilderFlag::kFP16); #elif defined(USE_INT8) std::cout << "Your platform support int8: " << (builder->platformHasFastInt8() ? "true" : "false") << std::endl; assert(builder->platformHasFastInt8()); config->setFlag(nvinfer1::BuilderFlag::kINT8); auto* calibrator = new Int8EntropyCalibrator2(1, kInputW, kInputH, gCalibTablePath, "int8calib.table", kInputTensorName); config->setInt8Calibrator(calibrator); #endif std::cout << "Building engine, please wait for a while..." << std::endl; IHostMemory* serialized_model = builder->buildSerializedNetwork(*network, *config); std::cout << "Build engine successfully!" 
<< std::endl; delete network; // Release host memory for (auto& mem : weightMap) { free((void*)(mem.second.values)); } return serialized_model; } IHostMemory* build_engine_gelan_e(unsigned int maxBatchSize, IBuilder* builder, IBuilderConfig* config, DataType dt, std::string& wts_name) { /* ------ Create the builder ------ */ INetworkDefinition* network = builder->createNetworkV2(0U); ITensor* data = network->addInput(kInputTensorName, dt, Dims3{3, kInputH, kInputW}); assert(data); std::map weightMap = loadWeights(wts_name); /* ------backbone------ */ // [-1, 1, Conv, [64, 3, 2]], # 1-P1/2 auto conv_1 = convBnSiLU(network, weightMap, *data, 64, 3, 2, 1, "model.1", 1); assert(conv_1); // [-1, 1, Conv, [128, 3, 2]], # 2-P2/4 auto conv_2 = convBnSiLU(network, weightMap, *conv_1->getOutput(0), 128, 3, 2, 1, "model.2"); // csp-elan block // [-1, 1, RepNCSPELAN4, [256, 128, 64, 2]], # 3 auto repncspelan_3 = RepNCSPELAN4(network, weightMap, *conv_2->getOutput(0), 128, 256, 128, 64, 2, "model.3"); // avg-conv down // [-1, 1, ADown, [256]], # 4-P3/8 auto adown_4 = ADown(network, weightMap, *repncspelan_3->getOutput(0), 256, "model.4"); // csp-elan block // [-1, 1, RepNCSPELAN4, [512, 256, 128, 2]], # 5 auto repncspelan_5 = RepNCSPELAN4(network, weightMap, *adown_4->getOutput(0), 256, 512, 256, 128, 2, "model.5"); // avg-conv down // [-1, 1, ADown, [512]], # 6-P4/16 auto adown_6 = ADown(network, weightMap, *repncspelan_5->getOutput(0), 512, "model.6"); // csp-elan block // [-1, 1, RepNCSPELAN4, [1024, 512, 256, 2]], # 7 auto repncspelan_7 = RepNCSPELAN4(network, weightMap, *adown_6->getOutput(0), 512, 1024, 512, 256, 2, "model.7"); // avg-conv down // [-1, 1, ADown, [1024]], # 8-P5/32 auto adown_8 = ADown(network, weightMap, *repncspelan_7->getOutput(0), 1024, "model.8"); // csp-elan block // [-1, 1, RepNCSPELAN4, [1024, 512, 256, 2]], # 9 auto repncspelan_9 = RepNCSPELAN4(network, weightMap, *adown_8->getOutput(0), 512, 1024, 512, 256, 2, "model.9"); // [1, 1, CBLinear, 
[[64]]], # 10 auto cblinear_10 = CBLinear(network, weightMap, *conv_1->getOutput(0), {64}, 1, 1, 0, 1, "model.10"); // [3, 1, CBLinear, [[64, 128]]], # 11 auto cblinear_11 = CBLinear(network, weightMap, *repncspelan_3->getOutput(0), {64, 128}, 1, 1, 0, 1, "model.11"); // [5, 1, CBLinear, [[64, 128, 256]]], # 12 auto cblinear_12 = CBLinear(network, weightMap, *repncspelan_5->getOutput(0), {64, 128, 256}, 1, 1, 0, 1, "model.12"); // [7, 1, CBLinear, [[64, 128, 256, 512]]], # 13 auto cblinear_13 = CBLinear(network, weightMap, *repncspelan_7->getOutput(0), {64, 128, 256, 512}, 1, 1, 0, 1, "model.13"); // [9, 1, CBLinear, [[64, 128, 256, 512, 1024]]], # 14 auto cblinear_14 = CBLinear(network, weightMap, *repncspelan_9->getOutput(0), {64, 128, 256, 512, 1024}, 1, 1, 0, 1, "model.14"); // conv down // [0, 1, Conv, [64, 3, 2]], # 15-P1/2 auto conv_15 = convBnSiLU(network, weightMap, *data, 64, 3, 2, 1, "model.15", 1); // [[10, 11, 12, 13, 14, -1], 1, CBFuse, [[0, 0, 0, 0, 0]]], # 16 auto cbfuse_16 = CBFuse( network, {cblinear_10, cblinear_11, cblinear_12, cblinear_13, cblinear_14, std::vector{conv_15}}, {0, 0, 0, 0, 0, 0}, {2, 4, 8, 16, 32, 2}); // conv down // [-1, 1, Conv, [128, 3, 2]], # 17-P2/4 auto conv_17 = convBnSiLU(network, weightMap, *cbfuse_16->getOutput(0), 128, 3, 2, 1, "model.17"); // [[11, 12, 13, 14, -1], 1, CBFuse, [[1, 1, 1, 1]]], # 18 auto cbfuse_18 = CBFuse(network, {cblinear_11, cblinear_12, cblinear_13, cblinear_14, std::vector{conv_17}}, {1, 1, 1, 1, 0}, {4, 8, 16, 32, 4}); // csp-elan block // [-1, 1, RepNCSPELAN4, [256, 128, 64, 2]], # 19 auto repncspelan_19 = RepNCSPELAN4(network, weightMap, *cbfuse_18->getOutput(0), 128, 256, 128, 64, 2, "model.19"); // avg-conv down fuse // [-1, 1, ADown, [256]], # 20-P3/8 auto adown_20 = ADown(network, weightMap, *repncspelan_19->getOutput(0), 256, "model.20"); // [[12, 13, 14, -1], 1, CBFuse, [[2, 2, 2]]], # 21 auto cbfuse_21 = CBFuse(network, {cblinear_12, cblinear_13, cblinear_14, std::vector{adown_20}}, {2, 
2, 2, 0}, {8, 16, 32, 8}); // csp-elan block // [-1, 1, RepNCSPELAN4, [512, 256, 128, 2]], # 22 auto repncspelan_22 = RepNCSPELAN4(network, weightMap, *cbfuse_21->getOutput(0), 256, 512, 256, 128, 2, "model.22"); // avg-conv down fuse // [-1, 1, ADown, [512]], # 23-P4/16 auto adown_23 = ADown(network, weightMap, *repncspelan_22->getOutput(0), 512, "model.23"); // [[13, 14, -1], 1, CBFuse, [[3, 3]]], # 24 auto cbfuse_24 = CBFuse(network, {cblinear_13, cblinear_14, std::vector{adown_23}}, {3, 3, 0}, {16, 32, 16}); // csp-elan block // [-1, 1, RepNCSPELAN4, [1024, 512, 256, 2]], # 25 auto repncspelan_25 = RepNCSPELAN4(network, weightMap, *cbfuse_24->getOutput(0), 512, 1024, 512, 256, 2, "model.25"); // avg-conv down fuse // [-1, 1, ADown, [1024]], # 26-P5/32 auto adown_26 = ADown(network, weightMap, *repncspelan_25->getOutput(0), 1024, "model.26"); // [[14, -1], 1, CBFuse, [[4]]], # 27 auto cbfuse_27 = CBFuse(network, {cblinear_14, std::vector{adown_26}}, {4, 0}, {32, 32}); // csp-elan block // [-1, 1, RepNCSPELAN4, [1024, 512, 256, 2]], # 28 auto repncspelan_28 = RepNCSPELAN4(network, weightMap, *cbfuse_27->getOutput(0), 512, 1024, 512, 256, 2, "model.28"); // elan-spp block // [28, 1, SPPELAN, [512, 256]], # 29 auto sppelan_29 = SPPELAN(network, weightMap, *repncspelan_28->getOutput(0), 1024, 512, 256, "model.29"); // # up-concat merge // [-1, 1, nn.Upsample, [None, 2, 'nearest']], auto upsample_30 = network->addResize(*sppelan_29->getOutput(0)); upsample_30->setResizeMode(ResizeMode::kNEAREST); const float scales_30[] = {1.0, 2.0, 2.0}; upsample_30->setScales(scales_30, 3); // [[-1, 25], 1, Concat, [1]], # cat backbone P4 ITensor* input_tensor_31[] = {upsample_30->getOutput(0), repncspelan_25->getOutput(0)}; auto cat_31 = network->addConcatenation(input_tensor_31, 2); // # csp-elan block // [-1, 1, RepNCSPELAN4, [512, 512, 256, 2]], # 32 auto repncspelan_32 = RepNCSPELAN4(network, weightMap, *cat_31->getOutput(0), 1536, 512, 512, 256, 2, "model.32"); // # up-concat 
merge // [-1, 1, nn.Upsample, [None, 2, 'nearest']], auto upsample_33 = network->addResize(*repncspelan_32->getOutput(0)); upsample_33->setResizeMode(ResizeMode::kNEAREST); const float scales_33[] = {1.0, 2.0, 2.0}; upsample_33->setScales(scales_33, 3); // [[-1, 22], 1, Concat, [1]], # cat backbone P3 ITensor* input_tensor_34[] = {upsample_33->getOutput(0), repncspelan_22->getOutput(0)}; auto cat_34 = network->addConcatenation(input_tensor_34, 2); // # csp-elan block // [-1, 1, RepNCSPELAN4, [256, 256, 128, 2]], # 35 auto repncspelan_35 = RepNCSPELAN4(network, weightMap, *cat_34->getOutput(0), 1024, 256, 256, 128, 2, "model.35"); // # avg-conv-down merge // [-1, 1, ADown, [256]], auto adown_36 = ADown(network, weightMap, *repncspelan_35->getOutput(0), 256, "model.36"); // [[-1, 32], 1, Concat, [1]], # cat head P4 ITensor* input_tensor_37[] = {adown_36->getOutput(0), repncspelan_32->getOutput(0)}; auto cat_37 = network->addConcatenation(input_tensor_37, 2); // [-1, 1, RepNCSPELAN4, [512, 512, 256, 2]], # 38 (P4/16-medium) auto repncspelan_38 = RepNCSPELAN4(network, weightMap, *cat_37->getOutput(0), 768, 512, 512, 256, 2, "model.38"); // # avg-conv-down merge // [-1, 1, ADown, [512]], auto adown_39 = ADown(network, weightMap, *repncspelan_38->getOutput(0), 512, "model.39"); // [[-1, 29], 1, Concat, [1]], # cat head P5 ITensor* input_tensor_40[] = {adown_39->getOutput(0), sppelan_29->getOutput(0)}; auto cat_40 = network->addConcatenation(input_tensor_40, 2); // # elan-2 block // [-1, 1, RepNCSPELAN4, [512, 1024, 512, 2]], # 41 (P5/32-large) auto repncspelan_41 = RepNCSPELAN4(network, weightMap, *cat_40->getOutput(0), 1024, 512, 1024, 512, 2, "model.41"); auto ddetect_42 = DDetect(network, weightMap, std::vector{repncspelan_35, repncspelan_38, repncspelan_41}, kNumClass, {256, 512, 512}, "model.42"); nvinfer1::IPluginV2Layer* yolo = addYoLoLayer(network, ddetect_42, false); yolo->getOutput(0)->setName(kOutputTensorName); network->markOutput(*yolo->getOutput(0)); 
builder->setMaxBatchSize(kBatchSize); config->setMaxWorkspaceSize(16 * (1 << 20)); #if defined(USE_FP16) config->setFlag(nvinfer1::BuilderFlag::kFP16); #elif defined(USE_INT8) std::cout << "Your platform support int8: " << (builder->platformHasFastInt8() ? "true" : "false") << std::endl; assert(builder->platformHasFastInt8()); config->setFlag(nvinfer1::BuilderFlag::kINT8); auto* calibrator = new Int8EntropyCalibrator2(1, kInputW, kInputH, gCalibTablePath, "int8calib.table", kInputTensorName); config->setInt8Calibrator(calibrator); #endif std::cout << "Building engine, please wait for a while..." << std::endl; IHostMemory* serialized_model = builder->buildSerializedNetwork(*network, *config); std::cout << "Build engine successfully!" << std::endl; delete network; // Release host memory for (auto& mem : weightMap) { free((void*)(mem.second.values)); } return serialized_model; } IHostMemory* build_engine_gelan_c(unsigned int maxBatchSize, IBuilder* builder, IBuilderConfig* config, DataType dt, std::string& wts_name) { /* ------ Create the builder ------ */ INetworkDefinition* network = builder->createNetworkV2(0U); ITensor* data = network->addInput(kInputTensorName, dt, Dims3{3, kInputH, kInputW}); assert(data); std::map weightMap = loadWeights(wts_name); // # conv down // [-1, 1, Conv, [64, 3, 2]], # 1-P1/2 auto conv_1 = convBnSiLU(network, weightMap, *data, 64, 3, 2, 1, "model.0", 1); // # conv down // [-1, 1, Conv, [128, 3, 2]], # 2-P2/4 auto conv_2 = convBnSiLU(network, weightMap, *conv_1->getOutput(0), 128, 3, 2, 1, "model.1"); // # elan-1 block // [-1, 1, RepNCSPELAN4, [256, 128, 64, 1]], # 3 auto repncspelan_3 = RepNCSPELAN4(network, weightMap, *conv_2->getOutput(0), 128, 256, 128, 64, 1, "model.2"); // # avg-conv down // [-1, 1, ADown, [256]], # 4-P3/8 auto adown_4 = ADown(network, weightMap, *repncspelan_3->getOutput(0), 256, "model.3"); // # elan-2 block // [-1, 1, RepNCSPELAN4, [512, 256, 128, 1]], # 5 auto repncspelan_5 = RepNCSPELAN4(network, weightMap, 
*adown_4->getOutput(0), 256, 512, 256, 128, 1, "model.4"); // # avg-conv down // [-1, 1, ADown, [512]], # 6-P4/16 auto adown_6 = ADown(network, weightMap, *repncspelan_5->getOutput(0), 512, "model.5"); // # elan-2 block // [-1, 1, RepNCSPELAN4, [512, 512, 256, 1]], # 7 auto repncspelan_7 = RepNCSPELAN4(network, weightMap, *adown_6->getOutput(0), 512, 512, 512, 256, 1, "model.6"); // # avg-conv down // [-1, 1, ADown, [512]], # 8-P5/32 auto adown_8 = ADown(network, weightMap, *repncspelan_7->getOutput(0), 512, "model.7"); // # elan-2 block // [-1, 1, RepNCSPELAN4, [512, 512, 256, 1]], # 9 auto repncspelan_9 = RepNCSPELAN4(network, weightMap, *adown_8->getOutput(0), 512, 512, 512, 256, 1, "model.8"); // # elan-spp block // [-1, 1, SPPELAN, [512, 256]], # 10 auto sppelan_10 = SPPELAN(network, weightMap, *repncspelan_9->getOutput(0), 512, 512, 256, "model.9"); // # up-concat merge // [-1, 1, nn.Upsample, [None, 2, 'nearest']], auto upsample_11 = network->addResize(*sppelan_10->getOutput(0)); upsample_11->setResizeMode(ResizeMode::kNEAREST); const float scales_11[] = {1.0, 2.0, 2.0}; upsample_11->setScales(scales_11, 3); // [[-1, 7], 1, Concat, [1]], # cat backbone P4 ITensor* input_tensor_12[] = {upsample_11->getOutput(0), repncspelan_7->getOutput(0)}; auto cat_12 = network->addConcatenation(input_tensor_12, 2); // # elan-2 block // [-1, 1, RepNCSPELAN4, [512, 512, 256, 1]], # 13 auto repncspelan_13 = RepNCSPELAN4(network, weightMap, *cat_12->getOutput(0), 1536, 512, 512, 256, 1, "model.12"); // # up-concat merge // [-1, 1, nn.Upsample, [None, 2, 'nearest']], auto upsample_14 = network->addResize(*repncspelan_13->getOutput(0)); upsample_14->setResizeMode(ResizeMode::kNEAREST); const float scales_14[] = {1.0, 2.0, 2.0}; upsample_14->setScales(scales_14, 3); // [[-1, 5], 1, Concat, [1]], # cat backbone P3 ITensor* input_tensor_15[] = {upsample_14->getOutput(0), repncspelan_5->getOutput(0)}; auto cat_15 = network->addConcatenation(input_tensor_15, 2); // # elan-2 block // 
[-1, 1, RepNCSPELAN4, [256, 256, 128, 1]], # 16 (P3/8-small) auto repncspelan_16 = RepNCSPELAN4(network, weightMap, *cat_15->getOutput(0), 1024, 256, 256, 128, 1, "model.15"); // # avg-conv-down merge // [-1, 1, ADown, [256]], auto adown_17 = ADown(network, weightMap, *repncspelan_16->getOutput(0), 256, "model.16"); // [[-1, 13], 1, Concat, [1]], # cat head P4 ITensor* input_tensor_18[] = {adown_17->getOutput(0), repncspelan_13->getOutput(0)}; auto cat_18 = network->addConcatenation(input_tensor_18, 2); // # elan-2 block // [-1, 1, RepNCSPELAN4, [512, 512, 256, 1]], # 19 (P4/16-medium) auto repncspelan_19 = RepNCSPELAN4(network, weightMap, *cat_18->getOutput(0), 768, 512, 512, 256, 1, "model.18"); // # avg-conv-down merge // [-1, 1, ADown, [512]], auto adown_20 = ADown(network, weightMap, *repncspelan_19->getOutput(0), 512, "model.19"); // [[-1, 10], 1, Concat, [1]], # cat head P5 ITensor* input_tensor_21[] = {adown_20->getOutput(0), sppelan_10->getOutput(0)}; auto cat_21 = network->addConcatenation(input_tensor_21, 2); // # elan-2 block // [-1, 1, RepNCSPELAN4, [512, 512, 256, 1]], # 22 (P5/32-large) auto repncspelan_22 = RepNCSPELAN4(network, weightMap, *cat_21->getOutput(0), 1024, 512, 512, 256, 1, "model.21"); // # detection head // # detect // [[31, 34, 37, 16, 19, 22], 1, DualDDetect, [nc]], # DualDDetect(A3, A4, A5, P3, P4, P5) auto ddetect_23 = DDetect(network, weightMap, std::vector{repncspelan_16, repncspelan_19, repncspelan_22}, kNumClass, {256, 512, 512}, "model.22"); nvinfer1::IPluginV2Layer* yolo = addYoLoLayer(network, ddetect_23, false); yolo->getOutput(0)->setName(kOutputTensorName); network->markOutput(*yolo->getOutput(0)); builder->setMaxBatchSize(kBatchSize); config->setMaxWorkspaceSize(16 * (1 << 20)); #if defined(USE_FP16) config->setFlag(nvinfer1::BuilderFlag::kFP16); #elif defined(USE_INT8) std::cout << "Your platform support int8: " << (builder->platformHasFastInt8() ? 
"true" : "false") << std::endl; assert(builder->platformHasFastInt8()); config->setFlag(nvinfer1::BuilderFlag::kINT8); auto* calibrator = new Int8EntropyCalibrator2(1, kInputW, kInputH, gCalibTablePath, "int8calib.table", kInputTensorName); config->setInt8Calibrator(calibrator); #endif std::cout << "Building engine, please wait for a while..." << std::endl; IHostMemory* serialized_model = builder->buildSerializedNetwork(*network, *config); std::cout << "Build engine successfully!" << std::endl; delete network; // Release host memory for (auto& mem : weightMap) { free((void*)(mem.second.values)); } return serialized_model; } ================================================ FILE: yolov9/src/postprocess.cpp ================================================ #include "postprocess.h" #include "utils.h" cv::Rect get_rect(cv::Mat& img, float bbox[4]) { float l, r, t, b; float r_w = kInputW / (img.cols * 1.0); float r_h = kInputH / (img.rows * 1.0); if (r_h > r_w) { l = bbox[0] - bbox[2] / 2.f; r = bbox[0] + bbox[2] / 2.f; t = bbox[1] - bbox[3] / 2.f - (kInputH - r_w * img.rows) / 2; b = bbox[1] + bbox[3] / 2.f - (kInputH - r_w * img.rows) / 2; l = l / r_w; r = r / r_w; t = t / r_w; b = b / r_w; } else { l = bbox[0] - bbox[2] / 2.f - (kInputW - r_h * img.cols) / 2; r = bbox[0] + bbox[2] / 2.f - (kInputW - r_h * img.cols) / 2; t = bbox[1] - bbox[3] / 2.f; b = bbox[1] + bbox[3] / 2.f; l = l / r_h; r = r / r_h; t = t / r_h; b = b / r_h; } return cv::Rect(round(l), round(t), round(r - l), round(b - t)); } static float iou(float lbox[4], float rbox[4]) { float interBox[] = { (std::max)(lbox[0] - lbox[2] / 2.f, rbox[0] - rbox[2] / 2.f), //left (std::min)(lbox[0] + lbox[2] / 2.f, rbox[0] + rbox[2] / 2.f), //right (std::max)(lbox[1] - lbox[3] / 2.f, rbox[1] - rbox[3] / 2.f), //top (std::min)(lbox[1] + lbox[3] / 2.f, rbox[1] + rbox[3] / 2.f), //bottom }; if (interBox[2] > interBox[3] || interBox[0] > interBox[1]) return 0.0f; float interBoxS = (interBox[1] - interBox[0]) * 
(interBox[3] - interBox[2]); return interBoxS / (lbox[2] * lbox[3] + rbox[2] * rbox[3] - interBoxS); } static bool cmp(const Detection& a, const Detection& b) { return a.conf > b.conf; } void nms(std::vector& res, float* output, float conf_thresh, float nms_thresh) { int det_size = sizeof(Detection) / sizeof(float); std::map> m; for (int i = 0; i < output[0] && i < kMaxNumOutputBbox; i++) { if (output[1 + det_size * i + 4] <= conf_thresh) continue; Detection det; memcpy(&det, &output[1 + det_size * i], det_size * sizeof(float)); if (m.count(det.class_id) == 0) m.emplace(det.class_id, std::vector()); // x1x2y1y2 -> xywh float c_x = (det.bbox[0] + det.bbox[2]) / 2; float c_y = (det.bbox[1] + det.bbox[3]) / 2; float w = det.bbox[2] - det.bbox[0]; float h = det.bbox[3] - det.bbox[1]; det.bbox[0] = c_x; det.bbox[1] = c_y; det.bbox[2] = w; det.bbox[3] = h; m[det.class_id].push_back(det); } for (auto it = m.begin(); it != m.end(); it++) { auto& dets = it->second; std::sort(dets.begin(), dets.end(), cmp); for (size_t m = 0; m < dets.size(); ++m) { auto& item = dets[m]; res.push_back(item); for (size_t n = m + 1; n < dets.size(); ++n) { if (iou(item.bbox, dets[n].bbox) > nms_thresh) { dets.erase(dets.begin() + n); --n; } } } } } void batch_nms(std::vector>& res_batch, float* output, int batch_size, int output_size, float conf_thresh, float nms_thresh) { res_batch.resize(batch_size); for (int i = 0; i < batch_size; i++) { nms(res_batch[i], &output[i * output_size], conf_thresh, nms_thresh); } } void draw_bbox(std::vector& img_batch, std::vector>& res_batch) { for (size_t i = 0; i < img_batch.size(); i++) { auto& res = res_batch[i]; cv::Mat img = img_batch[i]; for (size_t j = 0; j < res.size(); j++) { cv::Rect r = get_rect(img, res[j].bbox); cv::rectangle(img, r, cv::Scalar(0x27, 0xC1, 0x36), 2); cv::putText(img, std::to_string((int)res[j].class_id), cv::Point(r.x, r.y - 1), cv::FONT_HERSHEY_PLAIN, 1.2, cv::Scalar(0xFF, 0xFF, 0xFF), 2); } } // draw num of objets to img for 
(size_t i = 0; i < img_batch.size(); i++) { cv::putText(img_batch[i], std::to_string(res_batch[i].size()), cv::Point(0, 20), cv::FONT_HERSHEY_PLAIN, 1.2, cv::Scalar(0xFF, 0xFF, 0xFF), 2); } } static cv::Rect get_downscale_rect(float bbox[4], float scale) { float left = bbox[0] - bbox[2] / 2; float top = bbox[1] - bbox[3] / 2; float right = bbox[0] + bbox[2] / 2; float bottom = bbox[1] + bbox[3] / 2; left /= scale; top /= scale; right /= scale; bottom /= scale; return cv::Rect(round(left), round(top), round(right - left), round(bottom - top)); } // std::vector process_mask(const float* proto, int proto_size, std::vector& dets) { // std::vector masks; // for (size_t i = 0; i < dets.size(); i++) { // cv::Mat mask_mat = cv::Mat::zeros(kInputH / 4, kInputW / 4, CV_32FC1); // auto r = get_downscale_rect(dets[i].bbox, 4); // for (int x = r.x; x < r.x + r.width; x++) { // for (int y = r.y; y < r.y + r.height; y++) { // float e = 0.0f; // for (int j = 0; j < 32; j++) { // e += dets[i].mask[j] * proto[j * proto_size / 32 + y * mask_mat.cols + x]; // } // e = 1.0f / (1.0f + expf(-e)); // mask_mat.at(y, x) = e; // } // } // cv::resize(mask_mat, mask_mat, cv::Size(kInputW, kInputH)); // masks.push_back(mask_mat); // } // return masks; // } cv::Mat scale_mask(cv::Mat mask, cv::Mat img) { int x, y, w, h; float r_w = kInputW / (img.cols * 1.0); float r_h = kInputH / (img.rows * 1.0); if (r_h > r_w) { w = kInputW; h = r_w * img.rows; x = 0; y = (kInputH - h) / 2; } else { w = r_h * img.cols; h = kInputH; x = (kInputW - w) / 2; y = 0; } cv::Rect r(x, y, w, h); cv::Mat res; cv::resize(mask(r), res, img.size()); return res; } void draw_mask_bbox(cv::Mat& img, std::vector& dets, std::vector& masks, std::unordered_map& labels_map) { static std::vector colors = {0xFF3838, 0xFF9D97, 0xFF701F, 0xFFB21D, 0xCFD231, 0x48F90A, 0x92CC17, 0x3DDB86, 0x1A9334, 0x00D4BB, 0x2C99A8, 0x00C2FF, 0x344593, 0x6473FF, 0x0018EC, 0x8438FF, 0x520085, 0xCB38FF, 0xFF95C8, 0xFF37C7}; for (size_t i = 0; i < 
dets.size(); i++) { cv::Mat img_mask = scale_mask(masks[i], img); auto color = colors[(int)dets[i].class_id % colors.size()]; auto bgr = cv::Scalar(color & 0xFF, color >> 8 & 0xFF, color >> 16 & 0xFF); cv::Rect r = get_rect(img, dets[i].bbox); for (int x = r.x; x < r.x + r.width; x++) { for (int y = r.y; y < r.y + r.height; y++) { float val = img_mask.at(y, x); if (val <= 0.5) continue; img.at(y, x)[0] = img.at(y, x)[0] / 2 + bgr[0] / 2; img.at(y, x)[1] = img.at(y, x)[1] / 2 + bgr[1] / 2; img.at(y, x)[2] = img.at(y, x)[2] / 2 + bgr[2] / 2; } } cv::rectangle(img, r, bgr, 2); // Get the size of the text cv::Size textSize = cv::getTextSize(labels_map[(int)dets[i].class_id] + " " + to_string_with_precision(dets[i].conf), cv::FONT_HERSHEY_PLAIN, 1.2, 2, NULL); // Set the top left corner of the rectangle cv::Point topLeft(r.x, r.y - textSize.height); // Set the bottom right corner of the rectangle cv::Point bottomRight(r.x + textSize.width, r.y + textSize.height); // Set the thickness of the rectangle lines int lineThickness = 2; // Draw the rectangle on the image cv::rectangle(img, topLeft, bottomRight, bgr, -1); cv::putText(img, labels_map[(int)dets[i].class_id] + " " + to_string_with_precision(dets[i].conf), cv::Point(r.x, r.y + 4), cv::FONT_HERSHEY_PLAIN, 1.2, cv::Scalar::all(0xFF), 2); } } void process_decode_ptr_host(std::vector& res, const float* decode_ptr_host, int bbox_element, cv::Mat& img, int count) { Detection det; for (int i = 0; i < count; i++) { int basic_pos = 1 + i * bbox_element; int keep_flag = decode_ptr_host[basic_pos + 6]; if (keep_flag == 1) { det.bbox[0] = decode_ptr_host[basic_pos + 0]; det.bbox[1] = decode_ptr_host[basic_pos + 1]; det.bbox[2] = decode_ptr_host[basic_pos + 2]; det.bbox[3] = decode_ptr_host[basic_pos + 3]; det.conf = decode_ptr_host[basic_pos + 4]; det.class_id = decode_ptr_host[basic_pos + 5]; res.push_back(det); } } } void batch_process(std::vector>& res_batch, const float* decode_ptr_host, int batch_size, int bbox_element, 
const std::vector& img_batch) { res_batch.resize(batch_size); int count = static_cast(*decode_ptr_host); count = count > kMaxNumOutputBbox ? kMaxNumOutputBbox : count; // std::min(count, kMaxNumOutputBbox); for (int i = 0; i < batch_size; i++) { auto& img = const_cast(img_batch[i]); process_decode_ptr_host(res_batch[i], &decode_ptr_host[i * count], bbox_element, img, count); } } ================================================ FILE: yolov9/src/postprocess.cu ================================================ // // Created by lindsay on 23-7-17. // #include "postprocess.h" static __global__ void decode_kernel(float* predict, int num_bboxes, float confidence_threshold, float* parray, int max_objects) { float count = predict[0]; int position = (blockDim.x * blockIdx.x + threadIdx.x); if (position >= count) return; float* pitem = predict + 1 + position * 6; int index = atomicAdd(parray, 1); if (index >= max_objects) return; float confidence = pitem[4]; if (confidence < confidence_threshold) return; float* pout_item = parray + 1 + index * bbox_element; float left = pitem[0]; float top = pitem[1]; float right = pitem[2]; float bottom = pitem[3]; float label = pitem[5]; *pout_item++ = left; *pout_item++ = top; *pout_item++ = right; *pout_item++ = bottom; *pout_item++ = confidence; *pout_item++ = label; *pout_item++ = 1; // 1 = keep, 0 = ignore } static __device__ float box_iou(float aleft, float atop, float aright, float abottom, float bleft, float btop, float bright, float bbottom) { float cleft = max(aleft, bleft); float ctop = max(atop, btop); float cright = min(aright, bright); float cbottom = min(abottom, bbottom); float c_area = max(cright - cleft, 0.0f) * max(cbottom - ctop, 0.0f); if (c_area == 0.0f) return 0.0f; float a_area = max(0.0f, aright - aleft) * max(0.0f, abottom - atop); float b_area = max(0.0f, bright - bleft) * max(0.0f, bbottom - btop); return c_area / (a_area + b_area - c_area); } static __global__ void nms_kernel(float* bboxes, int max_objects, float 
threshold) { int position = (blockDim.x * blockIdx.x + threadIdx.x); int count = min(static_cast(bboxes[0]), max_objects); // float count = 0.0f; if (position >= count) return; float* pcurrent = bboxes + 1 + position * bbox_element; for (int i = 1; i < count; ++i) { float* pitem = bboxes + 1 + i * bbox_element; if (i == position || pcurrent[5] != pitem[5]) continue; if (pitem[4] >= pcurrent[4]) { if (pitem[4] == pcurrent[4] && i < position) continue; float iou = box_iou(pcurrent[0], pcurrent[1], pcurrent[2], pcurrent[3], pitem[0], pitem[1], pitem[2], pitem[3]); if (iou > threshold) { pcurrent[6] = 0; return; } } } } // 置信度过滤 void cuda_decode(float* predict, int num_bboxes, float confidence_threshold, float* parray, int max_objects, cudaStream_t stream) { int block = 256; int grid = ceil(num_bboxes / (float)block); decode_kernel<<>>((float*)predict, num_bboxes, confidence_threshold, parray, max_objects); } void cuda_nms(float* parray, float nms_threshold, int max_objects, cudaStream_t stream) { int block = max_objects < 256 ? 
max_objects : 256;
    int grid = ceil(max_objects / (float)block);
    nms_kernel<<<grid, block, 0, stream>>>(parray, max_objects, nms_threshold);
}

================================================
FILE: yolov9/src/preprocess.cu
================================================
#include "preprocess.h"
#include "cuda_utils.h"

// Staging buffers shared by every call; allocated in cuda_preprocess_init().
static uint8_t* img_buffer_host = nullptr;    // pinned host memory
static uint8_t* img_buffer_device = nullptr;  // device memory

// Row-major 2x3 affine transform.
struct AffineMatrix {
    float value[6];
};

// Warp a 3-channel interleaved 8-bit image into a planar float image.
//
// One thread per destination pixel (`edge` = number of destination pixels). Each pixel is
// mapped into the source through `d2s`, sampled bilinearly (taps outside the source use
// `const_value_st`), has its first and third channels swapped, is divided by 255 and is
// written to three consecutive planes of dst_width * dst_height floats.
__global__ void warpaffine_kernel(uint8_t* src, int src_line_size, int src_width, int src_height, float* dst,
                                  int dst_width, int dst_height, uint8_t const_value_st, AffineMatrix d2s,
                                  int edge) {
    int position = blockDim.x * blockIdx.x + threadIdx.x;
    if (position >= edge)
        return;

    float m_x1 = d2s.value[0];
    float m_y1 = d2s.value[1];
    float m_z1 = d2s.value[2];
    float m_x2 = d2s.value[3];
    float m_y2 = d2s.value[4];
    float m_z2 = d2s.value[5];

    // Destination pixel handled by this thread and its position in the source image.
    int dx = position % dst_width;
    int dy = position / dst_width;
    float src_x = m_x1 * dx + m_y1 * dy + m_z1 + 0.5f;
    float src_y = m_x2 * dx + m_y2 * dy + m_z2 + 0.5f;
    float c0, c1, c2;

    if (src_x <= -1 || src_x >= src_width || src_y <= -1 || src_y >= src_height) {
        // out of range
        c0 = const_value_st;
        c1 = const_value_st;
        c2 = const_value_st;
    } else {
        // The four neighbouring source pixels used for bilinear interpolation.
        int y_low = floorf(src_y);
        int x_low = floorf(src_x);
        int y_high = y_low + 1;
        int x_high = x_low + 1;

        uint8_t const_value[] = {const_value_st, const_value_st, const_value_st};
        float ly = src_y - y_low;
        float lx = src_x - x_low;
        float hy = 1 - ly;
        float hx = 1 - lx;
        float w1 = hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx;
        // Taps default to the constant color and are redirected only when inside the source.
        uint8_t* v1 = const_value;
        uint8_t* v2 = const_value;
        uint8_t* v3 = const_value;
        uint8_t* v4 = const_value;

        if (y_low >= 0) {
            if (x_low >= 0)
                v1 = src + y_low * src_line_size + x_low * 3;

            if (x_high < src_width)
                v2 = src + y_low * src_line_size + x_high * 3;
        }

        if (y_high < src_height) {
            if (x_low >= 0)
                v3 = src + y_high * src_line_size + x_low * 3;

            if (x_high < src_width)
                v4 = src + y_high * src_line_size + x_high * 3;
        }

        c0 = w1 * v1[0] + w2 * v2[0] + w3 * v3[0] + w4 * v4[0];
        c1 = w1 * v1[1] + w2 * v2[1] + w3 * v3[1] + w4 * v4[1];
        c2 = w1 * v1[2] + w2 * v2[2] + w3 * v3[2] + w4 * v4[2];
    }

    // bgr to rgb
    float t = c2;
    c2 = c0;
    c0 = t;

    // normalization
    c0 = c0 / 255.0f;
    c1 = c1 / 255.0f;
    c2 = c2 / 255.0f;

    // rgbrgbrgb to rrrgggbbb
    int area = dst_width * dst_height;
    float* pdst_c0 = dst + dy * dst_width + dx;
    float* pdst_c1 = pdst_c0 + area;
    float* pdst_c2 = pdst_c1 + area;
    *pdst_c0 = c0;
    *pdst_c1 = c1;
    *pdst_c2 = c2;
}

// Upload one interleaved 3-channel 8-bit image and letterbox it into `dst` on `stream`.
//
// The image is staged through the shared buffers, so cuda_preprocess_init() must have been
// called first. The image is scaled uniformly to fit dst_width x dst_height and centered;
// uncovered pixels are filled with 128 before normalization.
// NOTE(review): src_width * src_height is not checked against the size passed to
// cuda_preprocess_init(); callers must not exceed it.
void cuda_preprocess(uint8_t* src, int src_width, int src_height, float* dst, int dst_width, int dst_height,
                     cudaStream_t stream) {
    int img_size = src_width * src_height * 3;
    // copy data to pinned memory
    memcpy(img_buffer_host, src, img_size);
    // copy data to device memory
    CUDA_CHECK(cudaMemcpyAsync(img_buffer_device, img_buffer_host, img_size, cudaMemcpyHostToDevice, stream));

    // Source-to-destination transform: uniform scale plus centering translation.
    AffineMatrix s2d, d2s;
    float scale = std::min(dst_height / (float)src_height, dst_width / (float)src_width);

    s2d.value[0] = scale;
    s2d.value[1] = 0;
    s2d.value[2] = -scale * src_width * 0.5 + dst_width * 0.5;
    s2d.value[3] = 0;
    s2d.value[4] = scale;
    s2d.value[5] = -scale * src_height * 0.5 + dst_height * 0.5;

    // The kernel walks destination pixels, so it needs the inverse transform.
    cv::Mat m2x3_s2d(2, 3, CV_32F, s2d.value);
    cv::Mat m2x3_d2s(2, 3, CV_32F, d2s.value);
    cv::invertAffineTransform(m2x3_s2d, m2x3_d2s);

    memcpy(d2s.value, m2x3_d2s.ptr(0), sizeof(d2s.value));

    int jobs = dst_height * dst_width;
    int threads = 256;
    int blocks = ceil(jobs / (float)threads);

    warpaffine_kernel<<<blocks, threads, 0, stream>>>(img_buffer_device, src_width * 3, src_width, src_height, dst,
                                                      dst_width, dst_height, 128, d2s, jobs);
}

// Preprocess every image of the batch into consecutive dst_width * dst_height * 3 float
// slots of `dst`.
void cuda_batch_preprocess(std::vector<cv::Mat>& img_batch, float* dst, int dst_width, int dst_height,
                           cudaStream_t stream) {
    int dst_size = dst_width * dst_height * 3;
    for (size_t i = 0; i < img_batch.size(); i++) {
        cuda_preprocess(img_batch[i].ptr(), img_batch[i].cols, img_batch[i].rows, &dst[dst_size * i], dst_width,
                        dst_height, stream);
        // The staging buffers are shared, so the upload and kernel of this image must
        // finish before the next image overwrites them.
        CUDA_CHECK(cudaStreamSynchronize(stream));
    }
}

// Allocate the shared staging buffers for images of up to `max_image_size` pixels
// (3 bytes per pixel).
void cuda_preprocess_init(int max_image_size) {
    // prepare input data in pinned memory
    CUDA_CHECK(cudaMallocHost((void**)&img_buffer_host, max_image_size * 3));
    // prepare input data in device memory
    CUDA_CHECK(cudaMalloc((void**)&img_buffer_device, max_image_size * 3));
}

// Release the staging buffers allocated by cuda_preprocess_init().
void cuda_preprocess_destroy() {
    CUDA_CHECK(cudaFree(img_buffer_device));
    CUDA_CHECK(cudaFreeHost(img_buffer_host));
    // Drop the stale addresses so a repeated destroy frees nothing twice.
    img_buffer_device = nullptr;
    img_buffer_host = nullptr;
}

================================================
FILE: yolov9/windows/dirent.h
================================================
/*
 * Dirent interface for Microsoft Visual Studio
 *
 * Copyright (C) 1998-2019 Toni Ronkko
 * This file is part of dirent. Dirent may be freely distributed
 * under the MIT license. For all details and documentation, see
 * https://github.com/tronkko/dirent
 */
#ifndef DIRENT_H
#define DIRENT_H

/* Hide warnings about unreferenced local functions */
#if defined(__clang__)
# pragma clang diagnostic ignored "-Wunused-function"
#elif defined(_MSC_VER)
# pragma warning(disable:4505)
#elif defined(__GNUC__)
# pragma GCC diagnostic ignored "-Wunused-function"
#endif

/*
 * Include windows.h without Windows Sockets 1.1 to prevent conflicts with
 * Windows Sockets 2.0.
*/ #ifndef WIN32_LEAN_AND_MEAN # define WIN32_LEAN_AND_MEAN #endif #include #include #include #include #include #include #include #include #include #include /* Indicates that d_type field is available in dirent structure */ #define _DIRENT_HAVE_D_TYPE /* Indicates that d_namlen field is available in dirent structure */ #define _DIRENT_HAVE_D_NAMLEN /* Entries missing from MSVC 6.0 */ #if !defined(FILE_ATTRIBUTE_DEVICE) # define FILE_ATTRIBUTE_DEVICE 0x40 #endif /* File type and permission flags for stat(), general mask */ #if !defined(S_IFMT) # define S_IFMT _S_IFMT #endif /* Directory bit */ #if !defined(S_IFDIR) # define S_IFDIR _S_IFDIR #endif /* Character device bit */ #if !defined(S_IFCHR) # define S_IFCHR _S_IFCHR #endif /* Pipe bit */ #if !defined(S_IFFIFO) # define S_IFFIFO _S_IFFIFO #endif /* Regular file bit */ #if !defined(S_IFREG) # define S_IFREG _S_IFREG #endif /* Read permission */ #if !defined(S_IREAD) # define S_IREAD _S_IREAD #endif /* Write permission */ #if !defined(S_IWRITE) # define S_IWRITE _S_IWRITE #endif /* Execute permission */ #if !defined(S_IEXEC) # define S_IEXEC _S_IEXEC #endif /* Pipe */ #if !defined(S_IFIFO) # define S_IFIFO _S_IFIFO #endif /* Block device */ #if !defined(S_IFBLK) # define S_IFBLK 0 #endif /* Link */ #if !defined(S_IFLNK) # define S_IFLNK 0 #endif /* Socket */ #if !defined(S_IFSOCK) # define S_IFSOCK 0 #endif /* Read user permission */ #if !defined(S_IRUSR) # define S_IRUSR S_IREAD #endif /* Write user permission */ #if !defined(S_IWUSR) # define S_IWUSR S_IWRITE #endif /* Execute user permission */ #if !defined(S_IXUSR) # define S_IXUSR 0 #endif /* Read group permission */ #if !defined(S_IRGRP) # define S_IRGRP 0 #endif /* Write group permission */ #if !defined(S_IWGRP) # define S_IWGRP 0 #endif /* Execute group permission */ #if !defined(S_IXGRP) # define S_IXGRP 0 #endif /* Read others permission */ #if !defined(S_IROTH) # define S_IROTH 0 #endif /* Write others permission */ #if !defined(S_IWOTH) # define 
S_IWOTH 0 #endif /* Execute others permission */ #if !defined(S_IXOTH) # define S_IXOTH 0 #endif /* Maximum length of file name */ #if !defined(PATH_MAX) # define PATH_MAX MAX_PATH #endif #if !defined(FILENAME_MAX) # define FILENAME_MAX MAX_PATH #endif #if !defined(NAME_MAX) # define NAME_MAX FILENAME_MAX #endif /* File type flags for d_type */ #define DT_UNKNOWN 0 #define DT_REG S_IFREG #define DT_DIR S_IFDIR #define DT_FIFO S_IFIFO #define DT_SOCK S_IFSOCK #define DT_CHR S_IFCHR #define DT_BLK S_IFBLK #define DT_LNK S_IFLNK /* Macros for converting between st_mode and d_type */ #define IFTODT(mode) ((mode) & S_IFMT) #define DTTOIF(type) (type) /* * File type macros. Note that block devices, sockets and links cannot be * distinguished on Windows and the macros S_ISBLK, S_ISSOCK and S_ISLNK are * only defined for compatibility. These macros should always return false * on Windows. */ #if !defined(S_ISFIFO) # define S_ISFIFO(mode) (((mode) & S_IFMT) == S_IFIFO) #endif #if !defined(S_ISDIR) # define S_ISDIR(mode) (((mode) & S_IFMT) == S_IFDIR) #endif #if !defined(S_ISREG) # define S_ISREG(mode) (((mode) & S_IFMT) == S_IFREG) #endif #if !defined(S_ISLNK) # define S_ISLNK(mode) (((mode) & S_IFMT) == S_IFLNK) #endif #if !defined(S_ISSOCK) # define S_ISSOCK(mode) (((mode) & S_IFMT) == S_IFSOCK) #endif #if !defined(S_ISCHR) # define S_ISCHR(mode) (((mode) & S_IFMT) == S_IFCHR) #endif #if !defined(S_ISBLK) # define S_ISBLK(mode) (((mode) & S_IFMT) == S_IFBLK) #endif /* Return the exact length of the file name without zero terminator */ #define _D_EXACT_NAMLEN(p) ((p)->d_namlen) /* Return the maximum size of a file name */ #define _D_ALLOC_NAMLEN(p) ((PATH_MAX)+1) #ifdef __cplusplus extern "C" { #endif /* Wide-character version */ struct _wdirent { /* Always zero */ long d_ino; /* File position within stream */ long d_off; /* Structure size */ unsigned short d_reclen; /* Length of name without \0 */ size_t d_namlen; /* File type */ int d_type; /* File name */ wchar_t 
d_name[PATH_MAX+1]; }; typedef struct _wdirent _wdirent; struct _WDIR { /* Current directory entry */ struct _wdirent ent; /* Private file data */ WIN32_FIND_DATAW data; /* True if data is valid */ int cached; /* Win32 search handle */ HANDLE handle; /* Initial directory name */ wchar_t *patt; }; typedef struct _WDIR _WDIR; /* Multi-byte character version */ struct dirent { /* Always zero */ long d_ino; /* File position within stream */ long d_off; /* Structure size */ unsigned short d_reclen; /* Length of name without \0 */ size_t d_namlen; /* File type */ int d_type; /* File name */ char d_name[PATH_MAX+1]; }; typedef struct dirent dirent; struct DIR { struct dirent ent; struct _WDIR *wdirp; }; typedef struct DIR DIR; /* Dirent functions */ static DIR *opendir (const char *dirname); static _WDIR *_wopendir (const wchar_t *dirname); static struct dirent *readdir (DIR *dirp); static struct _wdirent *_wreaddir (_WDIR *dirp); static int readdir_r( DIR *dirp, struct dirent *entry, struct dirent **result); static int _wreaddir_r( _WDIR *dirp, struct _wdirent *entry, struct _wdirent **result); static int closedir (DIR *dirp); static int _wclosedir (_WDIR *dirp); static void rewinddir (DIR* dirp); static void _wrewinddir (_WDIR* dirp); static int scandir (const char *dirname, struct dirent ***namelist, int (*filter)(const struct dirent*), int (*compare)(const struct dirent**, const struct dirent**)); static int alphasort (const struct dirent **a, const struct dirent **b); static int versionsort (const struct dirent **a, const struct dirent **b); /* For compatibility with Symbian */ #define wdirent _wdirent #define WDIR _WDIR #define wopendir _wopendir #define wreaddir _wreaddir #define wclosedir _wclosedir #define wrewinddir _wrewinddir /* Internal utility functions */ static WIN32_FIND_DATAW *dirent_first (_WDIR *dirp); static WIN32_FIND_DATAW *dirent_next (_WDIR *dirp); static int dirent_mbstowcs_s( size_t *pReturnValue, wchar_t *wcstr, size_t sizeInWords, const char 
*mbstr, size_t count); static int dirent_wcstombs_s( size_t *pReturnValue, char *mbstr, size_t sizeInBytes, const wchar_t *wcstr, size_t count); static void dirent_set_errno (int error); /* * Open directory stream DIRNAME for read and return a pointer to the * internal working area that is used to retrieve individual directory * entries. */ static _WDIR* _wopendir( const wchar_t *dirname) { _WDIR *dirp; #if WINAPI_FAMILY_PARTITION(WINAPI_PARTITION_DESKTOP) /* Desktop */ DWORD n; #else /* WinRT */ size_t n; #endif wchar_t *p; /* Must have directory name */ if (dirname == NULL || dirname[0] == '\0') { dirent_set_errno (ENOENT); return NULL; } /* Allocate new _WDIR structure */ dirp = (_WDIR*) malloc (sizeof (struct _WDIR)); if (!dirp) { return NULL; } /* Reset _WDIR structure */ dirp->handle = INVALID_HANDLE_VALUE; dirp->patt = NULL; dirp->cached = 0; /* * Compute the length of full path plus zero terminator * * Note that on WinRT there's no way to convert relative paths * into absolute paths, so just assume it is an absolute path. */ #if WINAPI_FAMILY_PARTITION(WINAPI_PARTITION_DESKTOP) /* Desktop */ n = GetFullPathNameW (dirname, 0, NULL, NULL); #else /* WinRT */ n = wcslen (dirname); #endif /* Allocate room for absolute directory name and search pattern */ dirp->patt = (wchar_t*) malloc (sizeof (wchar_t) * n + 16); if (dirp->patt == NULL) { goto exit_closedir; } /* * Convert relative directory name to an absolute one. This * allows rewinddir() to function correctly even when current * working directory is changed between opendir() and rewinddir(). * * Note that on WinRT there's no way to convert relative paths * into absolute paths, so just assume it is an absolute path. 
*/ #if WINAPI_FAMILY_PARTITION(WINAPI_PARTITION_DESKTOP) /* Desktop */ n = GetFullPathNameW (dirname, n, dirp->patt, NULL); if (n <= 0) { goto exit_closedir; } #else /* WinRT */ wcsncpy_s (dirp->patt, n+1, dirname, n); #endif /* Append search pattern \* to the directory name */ p = dirp->patt + n; switch (p[-1]) { case '\\': case '/': case ':': /* Directory ends in path separator, e.g. c:\temp\ */ /*NOP*/; break; default: /* Directory name doesn't end in path separator */ *p++ = '\\'; } *p++ = '*'; *p = '\0'; /* Open directory stream and retrieve the first entry */ if (!dirent_first (dirp)) { goto exit_closedir; } /* Success */ return dirp; /* Failure */ exit_closedir: _wclosedir (dirp); return NULL; } /* * Read next directory entry. * * Returns pointer to static directory entry which may be overwritten by * subsequent calls to _wreaddir(). */ static struct _wdirent* _wreaddir( _WDIR *dirp) { struct _wdirent *entry; /* * Read directory entry to buffer. We can safely ignore the return value * as entry will be set to NULL in case of error. */ (void) _wreaddir_r (dirp, &dirp->ent, &entry); /* Return pointer to statically allocated directory entry */ return entry; } /* * Read next directory entry. * * Returns zero on success. If end of directory stream is reached, then sets * result to NULL and returns zero. */ static int _wreaddir_r( _WDIR *dirp, struct _wdirent *entry, struct _wdirent **result) { WIN32_FIND_DATAW *datap; /* Read next directory entry */ datap = dirent_next (dirp); if (datap) { size_t n; DWORD attr; /* * Copy file name as wide-character string. If the file name is too * long to fit in to the destination buffer, then truncate file name * to PATH_MAX characters and zero-terminate the buffer. 
*/ n = 0; while (n < PATH_MAX && datap->cFileName[n] != 0) { entry->d_name[n] = datap->cFileName[n]; n++; } entry->d_name[n] = 0; /* Length of file name excluding zero terminator */ entry->d_namlen = n; /* File type */ attr = datap->dwFileAttributes; if ((attr & FILE_ATTRIBUTE_DEVICE) != 0) { entry->d_type = DT_CHR; } else if ((attr & FILE_ATTRIBUTE_DIRECTORY) != 0) { entry->d_type = DT_DIR; } else { entry->d_type = DT_REG; } /* Reset dummy fields */ entry->d_ino = 0; entry->d_off = 0; entry->d_reclen = sizeof (struct _wdirent); /* Set result address */ *result = entry; } else { /* Return NULL to indicate end of directory */ *result = NULL; } return /*OK*/0; } /* * Close directory stream opened by opendir() function. This invalidates the * DIR structure as well as any directory entry read previously by * _wreaddir(). */ static int _wclosedir( _WDIR *dirp) { int ok; if (dirp) { /* Release search handle */ if (dirp->handle != INVALID_HANDLE_VALUE) { FindClose (dirp->handle); } /* Release search pattern */ free (dirp->patt); /* Release directory structure */ free (dirp); ok = /*success*/0; } else { /* Invalid directory stream */ dirent_set_errno (EBADF); ok = /*failure*/-1; } return ok; } /* * Rewind directory stream such that _wreaddir() returns the very first * file name again. 
*/ static void _wrewinddir( _WDIR* dirp) { if (dirp) { /* Release existing search handle */ if (dirp->handle != INVALID_HANDLE_VALUE) { FindClose (dirp->handle); } /* Open new search handle */ dirent_first (dirp); } } /* Get first directory entry (internal) */ static WIN32_FIND_DATAW* dirent_first( _WDIR *dirp) { WIN32_FIND_DATAW *datap; DWORD error; /* Open directory and retrieve the first entry */ dirp->handle = FindFirstFileExW( dirp->patt, FindExInfoStandard, &dirp->data, FindExSearchNameMatch, NULL, 0); if (dirp->handle != INVALID_HANDLE_VALUE) { /* a directory entry is now waiting in memory */ datap = &dirp->data; dirp->cached = 1; } else { /* Failed to open directory: no directory entry in memory */ dirp->cached = 0; datap = NULL; /* Set error code */ error = GetLastError (); switch (error) { case ERROR_ACCESS_DENIED: /* No read access to directory */ dirent_set_errno (EACCES); break; case ERROR_DIRECTORY: /* Directory name is invalid */ dirent_set_errno (ENOTDIR); break; case ERROR_PATH_NOT_FOUND: default: /* Cannot find the file */ dirent_set_errno (ENOENT); } } return datap; } /* * Get next directory entry (internal). * * Returns */ static WIN32_FIND_DATAW* dirent_next( _WDIR *dirp) { WIN32_FIND_DATAW *p; /* Get next directory entry */ if (dirp->cached != 0) { /* A valid directory entry already in memory */ p = &dirp->data; dirp->cached = 0; } else if (dirp->handle != INVALID_HANDLE_VALUE) { /* Get the next directory entry from stream */ if (FindNextFileW (dirp->handle, &dirp->data) != FALSE) { /* Got a file */ p = &dirp->data; } else { /* The very last entry has been processed or an error occurred */ FindClose (dirp->handle); dirp->handle = INVALID_HANDLE_VALUE; p = NULL; } } else { /* End of directory stream reached */ p = NULL; } return p; } /* * Open directory stream using plain old C-string. 
*/ static DIR* opendir( const char *dirname) { struct DIR *dirp; /* Must have directory name */ if (dirname == NULL || dirname[0] == '\0') { dirent_set_errno (ENOENT); return NULL; } /* Allocate memory for DIR structure */ dirp = (DIR*) malloc (sizeof (struct DIR)); if (!dirp) { return NULL; } { int error; wchar_t wname[PATH_MAX + 1]; size_t n; /* Convert directory name to wide-character string */ error = dirent_mbstowcs_s( &n, wname, PATH_MAX + 1, dirname, PATH_MAX + 1); if (error) { /* * Cannot convert file name to wide-character string. This * occurs if the string contains invalid multi-byte sequences or * the output buffer is too small to contain the resulting * string. */ goto exit_free; } /* Open directory stream using wide-character name */ dirp->wdirp = _wopendir (wname); if (!dirp->wdirp) { goto exit_free; } } /* Success */ return dirp; /* Failure */ exit_free: free (dirp); return NULL; } /* * Read next directory entry. */ static struct dirent* readdir( DIR *dirp) { struct dirent *entry; /* * Read directory entry to buffer. We can safely ignore the return value * as entry will be set to NULL in case of error. */ (void) readdir_r (dirp, &dirp->ent, &entry); /* Return pointer to statically allocated directory entry */ return entry; } /* * Read next directory entry into called-allocated buffer. * * Returns zero on success. If the end of directory stream is reached, then * sets result to NULL and returns zero. */ static int readdir_r( DIR *dirp, struct dirent *entry, struct dirent **result) { WIN32_FIND_DATAW *datap; /* Read next directory entry */ datap = dirent_next (dirp->wdirp); if (datap) { size_t n; int error; /* Attempt to convert file name to multi-byte string */ error = dirent_wcstombs_s( &n, entry->d_name, PATH_MAX + 1, datap->cFileName, PATH_MAX + 1); /* * If the file name cannot be represented by a multi-byte string, * then attempt to use old 8+3 file name. 
This allows traditional * Unix-code to access some file names despite of unicode * characters, although file names may seem unfamiliar to the user. * * Be ware that the code below cannot come up with a short file * name unless the file system provides one. At least * VirtualBox shared folders fail to do this. */ if (error && datap->cAlternateFileName[0] != '\0') { error = dirent_wcstombs_s( &n, entry->d_name, PATH_MAX + 1, datap->cAlternateFileName, PATH_MAX + 1); } if (!error) { DWORD attr; /* Length of file name excluding zero terminator */ entry->d_namlen = n - 1; /* File attributes */ attr = datap->dwFileAttributes; if ((attr & FILE_ATTRIBUTE_DEVICE) != 0) { entry->d_type = DT_CHR; } else if ((attr & FILE_ATTRIBUTE_DIRECTORY) != 0) { entry->d_type = DT_DIR; } else { entry->d_type = DT_REG; } /* Reset dummy fields */ entry->d_ino = 0; entry->d_off = 0; entry->d_reclen = sizeof (struct dirent); } else { /* * Cannot convert file name to multi-byte string so construct * an erroneous directory entry and return that. Note that * we cannot return NULL as that would stop the processing * of directory entries completely. */ entry->d_name[0] = '?'; entry->d_name[1] = '\0'; entry->d_namlen = 1; entry->d_type = DT_UNKNOWN; entry->d_ino = 0; entry->d_off = -1; entry->d_reclen = 0; } /* Return pointer to directory entry */ *result = entry; } else { /* No more directory entries */ *result = NULL; } return /*OK*/0; } /* * Close directory stream. */ static int closedir( DIR *dirp) { int ok; if (dirp) { /* Close wide-character directory stream */ ok = _wclosedir (dirp->wdirp); dirp->wdirp = NULL; /* Release multi-byte character version */ free (dirp); } else { /* Invalid directory stream */ dirent_set_errno (EBADF); ok = /*failure*/-1; } return ok; } /* * Rewind directory stream to beginning. */ static void rewinddir( DIR* dirp) { /* Rewind wide-character string directory stream */ _wrewinddir (dirp->wdirp); } /* * Scan directory for entries. 
*/ static int scandir( const char *dirname, struct dirent ***namelist, int (*filter)(const struct dirent*), int (*compare)(const struct dirent**, const struct dirent**)) { struct dirent **files = NULL; size_t size = 0; size_t allocated = 0; const size_t init_size = 1; DIR *dir = NULL; struct dirent *entry; struct dirent *tmp = NULL; size_t i; int result = 0; /* Open directory stream */ dir = opendir (dirname); if (dir) { /* Read directory entries to memory */ while (1) { /* Enlarge pointer table to make room for another pointer */ if (size >= allocated) { void *p; size_t num_entries; /* Compute number of entries in the enlarged pointer table */ if (size < init_size) { /* Allocate initial pointer table */ num_entries = init_size; } else { /* Double the size */ num_entries = size * 2; } /* Allocate first pointer table or enlarge existing table */ p = realloc (files, sizeof (void*) * num_entries); if (p != NULL) { /* Got the memory */ files = (dirent**) p; allocated = num_entries; } else { /* Out of memory */ result = -1; break; } } /* Allocate room for temporary directory entry */ if (tmp == NULL) { tmp = (struct dirent*) malloc (sizeof (struct dirent)); if (tmp == NULL) { /* Cannot allocate temporary directory entry */ result = -1; break; } } /* Read directory entry to temporary area */ if (readdir_r (dir, tmp, &entry) == /*OK*/0) { /* Did we get an entry? */ if (entry != NULL) { int pass; /* Determine whether to include the entry in result */ if (filter) { /* Let the filter function decide */ pass = filter (tmp); } else { /* No filter function, include everything */ pass = 1; } if (pass) { /* Store the temporary entry to pointer table */ files[size++] = tmp; tmp = NULL; /* Keep up with the number of files */ result++; } } else { /* * End of directory stream reached => sort entries and * exit. 
*/ qsort (files, size, sizeof (void*), (int (*) (const void*, const void*)) compare); break; } } else { /* Error reading directory entry */ result = /*Error*/ -1; break; } } } else { /* Cannot open directory */ result = /*Error*/ -1; } /* Release temporary directory entry */ free (tmp); /* Release allocated memory on error */ if (result < 0) { for (i = 0; i < size; i++) { free (files[i]); } free (files); files = NULL; } /* Close directory stream */ if (dir) { closedir (dir); } /* Pass pointer table to caller */ if (namelist) { *namelist = files; } return result; } /* Alphabetical sorting */ static int alphasort( const struct dirent **a, const struct dirent **b) { return strcoll ((*a)->d_name, (*b)->d_name); } /* Sort versions */ static int versionsort( const struct dirent **a, const struct dirent **b) { /* FIXME: implement strverscmp and use that */ return alphasort (a, b); } /* Convert multi-byte string to wide character string */ static int dirent_mbstowcs_s( size_t *pReturnValue, wchar_t *wcstr, size_t sizeInWords, const char *mbstr, size_t count) { int error; #if defined(_MSC_VER) && _MSC_VER >= 1400 /* Microsoft Visual Studio 2005 or later */ error = mbstowcs_s (pReturnValue, wcstr, sizeInWords, mbstr, count); #else /* Older Visual Studio or non-Microsoft compiler */ size_t n; /* Convert to wide-character string (or count characters) */ n = mbstowcs (wcstr, mbstr, sizeInWords); if (!wcstr || n < count) { /* Zero-terminate output buffer */ if (wcstr && sizeInWords) { if (n >= sizeInWords) { n = sizeInWords - 1; } wcstr[n] = 0; } /* Length of resulting multi-byte string WITH zero terminator */ if (pReturnValue) { *pReturnValue = n + 1; } /* Success */ error = 0; } else { /* Could not convert string */ error = 1; } #endif return error; } /* Convert wide-character string to multi-byte string */ static int dirent_wcstombs_s( size_t *pReturnValue, char *mbstr, size_t sizeInBytes, /* max size of mbstr */ const wchar_t *wcstr, size_t count) { int error; #if 
defined(_MSC_VER) && _MSC_VER >= 1400 /* Microsoft Visual Studio 2005 or later */ error = wcstombs_s (pReturnValue, mbstr, sizeInBytes, wcstr, count); #else /* Older Visual Studio or non-Microsoft compiler */ size_t n; /* Convert to multi-byte string (or count the number of bytes needed) */ n = wcstombs (mbstr, wcstr, sizeInBytes); if (!mbstr || n < count) { /* Zero-terminate output buffer */ if (mbstr && sizeInBytes) { if (n >= sizeInBytes) { n = sizeInBytes - 1; } mbstr[n] = '\0'; } /* Length of resulting multi-bytes string WITH zero-terminator */ if (pReturnValue) { *pReturnValue = n + 1; } /* Success */ error = 0; } else { /* Cannot convert string */ error = 1; } #endif return error; } /* Set errno variable */ static void dirent_set_errno( int error) { #if defined(_MSC_VER) && _MSC_VER >= 1400 /* Microsoft Visual Studio 2005 and later */ _set_errno (error); #else /* Non-Microsoft compiler or older Microsoft compiler */ errno = error; #endif } #ifdef __cplusplus } #endif #endif /*DIRENT_H*/ ================================================ FILE: yolov9/yolov9_trt.py ================================================ """ An example that uses TensorRT's Python api to make inferences. """ import ctypes import os import shutil import random import sys import threading import time import cv2 import numpy as np import pycuda.autoinit # noqa: F401 import pycuda.driver as cuda import tensorrt as trt CONF_THRESH = 0.5 IOU_THRESHOLD = 0.4 def get_img_path_batches(batch_size, img_dir): ret = [] batch = [] for root, dirs, files in os.walk(img_dir): for name in files: if len(batch) == batch_size: ret.append(batch) batch = [] batch.append(os.path.join(root, name)) if len(batch) > 0: ret.append(batch) return ret def plot_one_box(x, img, color=None, label=None, line_thickness=None): """ description: Plots one bounding box on image img, this function comes from yolov9 project. 
param: x: a box likes [x1,y1,x2,y2] img: a opencv image object color: color to draw rectangle, such as (0,255,0) label: str line_thickness: int return: no return """ tl = ( line_thickness or round(0.002 * (img.shape[0] + img.shape[1]) / 2) + 1 ) # line/font thickness color = color or [random.randint(0, 255) for _ in range(3)] c1, c2 = (int(x[0]), int(x[1])), (int(x[2]), int(x[3])) cv2.rectangle(img, c1, c2, color, thickness=tl, lineType=cv2.LINE_AA) if label: tf = max(tl - 1, 1) # font thickness t_size = cv2.getTextSize(label, 0, fontScale=tl / 3, thickness=tf)[0] c2 = c1[0] + t_size[0], c1[1] - t_size[1] - 3 cv2.rectangle(img, c1, c2, color, -1, cv2.LINE_AA) # filled cv2.putText( img, label, (c1[0], c1[1] - 2), 0, tl / 3, [225, 255, 255], thickness=tf, lineType=cv2.LINE_AA, ) class yolov9TRT(object): """ description: A yolov9 class that warps TensorRT ops, preprocess and postprocess ops. """ def __init__(self, engine_file_path): # Create a Context on this device, self.ctx = cuda.Device(0).make_context() stream = cuda.Stream() TRT_LOGGER = trt.Logger(trt.Logger.INFO) runtime = trt.Runtime(TRT_LOGGER) # Deserialize the engine from file with open(engine_file_path, "rb") as f: engine = runtime.deserialize_cuda_engine(f.read()) context = engine.create_execution_context() host_inputs = [] cuda_inputs = [] host_outputs = [] cuda_outputs = [] bindings = [] for binding in engine: print('bingding:', binding, engine.get_binding_shape(binding)) size = trt.volume(engine.get_binding_shape(binding)) * engine.max_batch_size dtype = trt.nptype(engine.get_binding_dtype(binding)) # Allocate host and device buffers host_mem = cuda.pagelocked_empty(size, dtype) cuda_mem = cuda.mem_alloc(host_mem.nbytes) # Append the device buffer to device bindings. bindings.append(int(cuda_mem)) # Append to the appropriate list. 
if engine.binding_is_input(binding): self.input_w = engine.get_binding_shape(binding)[-1] self.input_h = engine.get_binding_shape(binding)[-2] host_inputs.append(host_mem) cuda_inputs.append(cuda_mem) else: host_outputs.append(host_mem) cuda_outputs.append(cuda_mem) # Store self.stream = stream self.context = context self.engine = engine self.host_inputs = host_inputs self.cuda_inputs = cuda_inputs self.host_outputs = host_outputs self.cuda_outputs = cuda_outputs self.bindings = bindings self.batch_size = engine.max_batch_size def infer(self, raw_image_generator): threading.Thread.__init__(self) # Make self the active context, pushing it on top of the context stack. self.ctx.push() # Restore stream = self.stream context = self.context host_inputs = self.host_inputs cuda_inputs = self.cuda_inputs host_outputs = self.host_outputs cuda_outputs = self.cuda_outputs bindings = self.bindings # Do image preprocess batch_image_raw = [] batch_origin_h = [] batch_origin_w = [] batch_input_image = np.empty(shape=[self.batch_size, 3, self.input_h, self.input_w]) for i, image_raw in enumerate(raw_image_generator): input_image, image_raw, origin_h, origin_w = self.preprocess_image(image_raw) batch_image_raw.append(image_raw) batch_origin_h.append(origin_h) batch_origin_w.append(origin_w) np.copyto(batch_input_image[i], input_image) batch_input_image = np.ascontiguousarray(batch_input_image) # Copy input image to host buffer np.copyto(host_inputs[0], batch_input_image.ravel()) start = time.time() # Transfer input data to the GPU. cuda.memcpy_htod_async(cuda_inputs[0], host_inputs[0], stream) # Run inference. context.execute_async(batch_size=self.batch_size, bindings=bindings, stream_handle=stream.handle) # Transfer predictions back from the GPU. cuda.memcpy_dtoh_async(host_outputs[0], cuda_outputs[0], stream) # Synchronize the stream stream.synchronize() end = time.time() # Remove any context from the top of the context stack, deactivating it. 
self.ctx.pop() # Here we use the first row of output in that batch_size = 1 output = host_outputs[0] # Do postprocess for i in range(self.batch_size): result_boxes, result_scores, result_classid = self.post_process( output[i * 38001: (i + 1) * 38001], batch_origin_h[i], batch_origin_w[i] ) # Draw rectangles and labels on the original image for j in range(len(result_boxes)): box = result_boxes[j] plot_one_box( box, batch_image_raw[i], label="{}:{:.2f}".format( categories[int(result_classid[j])], result_scores[j] ), ) return batch_image_raw, end - start def destroy(self): # Remove any context from the top of the context stack, deactivating it. self.ctx.pop() def get_raw_image(self, image_path_batch): """ description: Read an image from image path """ for img_path in image_path_batch: yield cv2.imread(img_path) def get_raw_image_zeros(self, image_path_batch=None): """ description: Ready data for warmup """ for _ in range(self.batch_size): yield np.zeros([self.input_h, self.input_w, 3], dtype=np.uint8) def preprocess_image(self, raw_bgr_image): """ description: Convert BGR image to RGB, resize and pad it to target size, normalize to [0,1], transform to NCHW format. 
param: input_image_path: str, image path return: image: the processed image image_raw: the original image h: original height w: original width """ image_raw = raw_bgr_image h, w, c = image_raw.shape image = cv2.cvtColor(image_raw, cv2.COLOR_BGR2RGB) # Calculate widht and height and paddings r_w = self.input_w / w r_h = self.input_h / h if r_h > r_w: tw = self.input_w th = int(r_w * h) tx1 = tx2 = 0 ty1 = int((self.input_h - th) / 2) ty2 = self.input_h - th - ty1 else: tw = int(r_h * w) th = self.input_h tx1 = int((self.input_w - tw) / 2) tx2 = self.input_w - tw - tx1 ty1 = ty2 = 0 # Resize the image with long side while maintaining ratio image = cv2.resize(image, (tw, th)) # Pad the short side with (128,128,128) image = cv2.copyMakeBorder( image, ty1, ty2, tx1, tx2, cv2.BORDER_CONSTANT, None, (128, 128, 128) ) image = image.astype(np.float32) # Normalize to [0,1] image /= 255.0 # HWC to CHW format: image = np.transpose(image, [2, 0, 1]) # CHW to NCHW format image = np.expand_dims(image, axis=0) # Convert the image to row-major order, also known as "C order": image = np.ascontiguousarray(image) return image, image_raw, h, w def xywh2xyxy(self, origin_h, origin_w, x): """ description: Convert nx4 boxes from [x, y, w, h] to [x1, y1, x2, y2] where xy1=top-left, xy2=bottom-right param: origin_h: height of original image origin_w: width of original image x: A boxes numpy, each row is a box [center_x, center_y, w, h] return: y: A boxes numpy, each row is a box [x1, y1, x2, y2] """ y = np.zeros_like(x) r_w = self.input_w / origin_w r_h = self.input_h / origin_h if r_h > r_w: y[:, 0] = x[:, 0] y[:, 2] = x[:, 2] y[:, 1] = x[:, 1] - (self.input_h - r_w * origin_h) / 2 y[:, 3] = x[:, 3] - (self.input_h - r_w * origin_h) / 2 y /= r_w else: y[:, 0] = x[:, 0] - (self.input_w - r_h * origin_w) / 2 y[:, 2] = x[:, 2] - (self.input_w - r_h * origin_w) / 2 y[:, 1] = x[:, 1] y[:, 3] = x[:, 3] y /= r_h return y def post_process(self, output, origin_h, origin_w): """ description: 
postprocess the prediction param: output: A numpy likes [num_boxes,cx,cy,w,h,conf,cls_id, cx,cy,w,h,conf,cls_id, ...] origin_h: height of original image origin_w: width of original image return: result_boxes: finally boxes, a boxes numpy, each row is a box [x1, y1, x2, y2] result_scores: finally scores, a numpy, each element is the score correspoing to box result_classid: finally classid, a numpy, each element is the classid correspoing to box """ # Get the num of boxes detected num = int(output[0]) # Reshape to a two dimentional ndarray pred = np.reshape(output[1:], (-1, 38))[:num, :] # Do nms boxes = self.non_max_suppression(pred, origin_h, origin_w, conf_thres=CONF_THRESH, nms_thres=IOU_THRESHOLD) result_boxes = boxes[:, :4] if len(boxes) else np.array([]) result_scores = boxes[:, 4] if len(boxes) else np.array([]) result_classid = boxes[:, 5] if len(boxes) else np.array([]) return result_boxes, result_scores, result_classid def bbox_iou(self, box1, box2, x1y1x2y2=True): """ description: compute the IoU of two bounding boxes param: box1: A box coordinate (can be (x1, y1, x2, y2) or (x, y, w, h)) box2: A box coordinate (can be (x1, y1, x2, y2) or (x, y, w, h)) x1y1x2y2: select the coordinate format return: iou: computed iou """ if not x1y1x2y2: # Transform from center and width to exact coordinates b1_x1, b1_x2 = box1[:, 0] - box1[:, 2] / 2, box1[:, 0] + box1[:, 2] / 2 b1_y1, b1_y2 = box1[:, 1] - box1[:, 3] / 2, box1[:, 1] + box1[:, 3] / 2 b2_x1, b2_x2 = box2[:, 0] - box2[:, 2] / 2, box2[:, 0] + box2[:, 2] / 2 b2_y1, b2_y2 = box2[:, 1] - box2[:, 3] / 2, box2[:, 1] + box2[:, 3] / 2 else: # Get the coordinates of bounding boxes b1_x1, b1_y1, b1_x2, b1_y2 = box1[:, 0], box1[:, 1], box1[:, 2], box1[:, 3] b2_x1, b2_y1, b2_x2, b2_y2 = box2[:, 0], box2[:, 1], box2[:, 2], box2[:, 3] # Get the coordinates of the intersection rectangle inter_rect_x1 = np.maximum(b1_x1, b2_x1) inter_rect_y1 = np.maximum(b1_y1, b2_y1) inter_rect_x2 = np.minimum(b1_x2, b2_x2) inter_rect_y2 = 
np.minimum(b1_y2, b2_y2) # Intersection area inter_area = np.clip(inter_rect_x2 - inter_rect_x1 + 1, 0, None) * \ np.clip(inter_rect_y2 - inter_rect_y1 + 1, 0, None) # Union Area b1_area = (b1_x2 - b1_x1 + 1) * (b1_y2 - b1_y1 + 1) b2_area = (b2_x2 - b2_x1 + 1) * (b2_y2 - b2_y1 + 1) iou = inter_area / (b1_area + b2_area - inter_area + 1e-16) return iou def non_max_suppression(self, prediction, origin_h, origin_w, conf_thres=0.5, nms_thres=0.4): """ description: Removes detections with lower object confidence score than 'conf_thres' and performs Non-Maximum Suppression to further filter detections. param: prediction: detections, (x1, y1, x2, y2, conf, cls_id) origin_h: original image height origin_w: original image width conf_thres: a confidence threshold to filter detections nms_thres: a iou threshold to filter detections return: boxes: output after nms with the shape (x1, y1, x2, y2, conf, cls_id) """ # Get the boxes that score > CONF_THRESH boxes = prediction[prediction[:, 4] >= conf_thres] # Trandform bbox from [center_x, center_y, w, h] to [x1, y1, x2, y2] boxes[:, :4] = self.xywh2xyxy(origin_h, origin_w, boxes[:, :4]) # clip the coordinates boxes[:, 0] = np.clip(boxes[:, 0], 0, origin_w - 1) boxes[:, 2] = np.clip(boxes[:, 2], 0, origin_w - 1) boxes[:, 1] = np.clip(boxes[:, 1], 0, origin_h - 1) boxes[:, 3] = np.clip(boxes[:, 3], 0, origin_h - 1) # Object confidence confs = boxes[:, 4] # Sort by the confs boxes = boxes[np.argsort(-confs)] # Perform non-maximum suppression keep_boxes = [] while boxes.shape[0]: large_overlap = self.bbox_iou(np.expand_dims(boxes[0, :4], 0), boxes[:, :4]) > nms_thres label_match = boxes[0, -1] == boxes[:, -1] # Indices of boxes with lower confidence scores, large IOUs and matching labels invalid = large_overlap & label_match keep_boxes += [boxes[0]] boxes = boxes[~invalid] boxes = np.stack(keep_boxes, 0) if len(keep_boxes) else np.array([]) return boxes class inferThread(threading.Thread): def __init__(self, yolov9_wrapper, 
if __name__ == "__main__":
    # Command line: [engine file] [plugin library]; both are optional.
    cli_args = sys.argv[1:]
    engine_file_path = cli_args[0] if cli_args else "yolov9-c.engine"
    PLUGIN_LIBRARY = cli_args[1] if len(cli_args) > 1 else "build/libmyplugins.so"
    # Load the custom plugin library before the engine is created.
    ctypes.CDLL(PLUGIN_LIBRARY)

    # COCO labels; kept at module level because infer() looks the name up globally.
    categories = [
        "person", "bicycle", "car", "motorcycle", "airplane", "bus", "train", "truck", "boat",
        "traffic light", "fire hydrant", "stop sign", "parking meter", "bench", "bird", "cat",
        "dog", "horse", "sheep", "cow", "elephant", "bear", "zebra", "giraffe", "backpack",
        "umbrella", "handbag", "tie", "suitcase", "frisbee", "skis", "snowboard", "sports ball",
        "kite", "baseball bat", "baseball glove", "skateboard", "surfboard", "tennis racket",
        "bottle", "wine glass", "cup", "fork", "knife", "spoon", "bowl", "banana", "apple",
        "sandwich", "orange", "broccoli", "carrot", "hot dog", "pizza", "donut", "cake", "chair",
        "couch", "potted plant", "bed", "dining table", "toilet", "tv", "laptop", "mouse",
        "remote", "keyboard", "cell phone", "microwave", "oven", "toaster", "sink",
        "refrigerator", "book", "clock", "vase", "scissors", "teddy bear", "hair drier",
        "toothbrush",
    ]

    # Start every run from an empty output directory.
    if os.path.exists('output/'):
        shutil.rmtree('output/')
    os.makedirs('output/')

    # a yolov9TRT instance
    yolov9_wrapper = yolov9TRT(engine_file_path)
    try:
        print('batch size is', yolov9_wrapper.batch_size)

        image_dir = "images/"
        image_path_batches = get_img_path_batches(yolov9_wrapper.batch_size, image_dir)

        # Ten warm-up passes, each in its own thread and run to completion.
        for _ in range(10):
            warm_up = warmUpThread(yolov9_wrapper)
            warm_up.start()
            warm_up.join()

        # One inference thread per batch, again run to completion before the next.
        for batch in image_path_batches:
            worker = inferThread(yolov9_wrapper, batch)
            worker.start()
            worker.join()
    finally:
        # destroy the instance
        yolov9_wrapper.destroy()